[
  {
    "path": ".clang-format",
    "content": "BasedOnStyle: Google\n\n# Modifications for Tesseract.\n\n# Only merge empty functions.\nAllowShortFunctionsOnASingleLine: Empty\n# Do not allow short if statements.\nAllowShortIfStatementsOnASingleLine: false\nIndentPPDirectives: AfterHash\n\n# Default style for some settings.\n\nAccessModifierOffset: -2\nAllowShortLoopsOnASingleLine: false\n# Enforce always the same pointer alignment.\nDerivePointerAlignment: false\nIncludeBlocks: Preserve\nPointerAlignment: Right\nSpacesBeforeTrailingComments: 1\n"
  },
  {
    "path": ".gitattributes",
    "content": "* text=auto\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/issue-bug.yml",
    "content": "name: Bug Report\ndescription: File a bug report\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        ### Attention\n        Before you submit an issue, please review [the guidelines for this repository](https://github.com/tesseract-ocr/tesseract/blob/main/CONTRIBUTING.md).\n\n        Have a question? Need help?\n        Please use [our forum](https://groups.google.com/g/tesseract-ocr).\n\n        Please follow these rules:\n        * Check that your Operating Systems is [supported](https://tesseract-ocr.github.io/tessdoc/supported-operating-systems.html).\n        * Don't open an issue for [Tesseract version which was released more than a year ago](https://tesseract-ocr.github.io/tessdoc/ReleaseNotes.html).\n        * Don't open an issue which involves 3rd party tools that use Tesseract as a library. Only report about an issue with the Tesseract command line tool or the C/C++ API.\n        * Please provide the input image.\n        * Also provide output files (txt and/or tsv, hocr, pdf). You can make a zip archive that will contain these files, so GitHub will let you upload them.\n        * Don't attach a screenshot of the command line and output. 
Instead, copy the text and paste it in your bug report.\n\n  - type: textarea\n    attributes:\n      label: Current Behavior\n  - type: textarea\n    attributes:\n      label: Expected Behavior\n  - type: textarea\n    attributes:\n      label: Suggested Fix\n  - type: textarea\n    attributes:\n      label: tesseract -v\n      description: Version info, compiled libraries, SIMD, OpenMP\n      placeholder: \"Please paste the output of the command: tesseract -v\"\n  - type: dropdown\n    id: os-linux\n    attributes:\n      label: Operating System\n      description:  Choose the OS where the bug occurs\n      multiple: true\n      options:\n        - Windows 11\n        - Windows 10\n        - macOS 26 Tahoe\n        - macOS 15 Sequoia\n        - macOS 14 Sonoma\n        - Ubuntu 24.04 Noble\n        - Ubuntu 22.04 Jammy\n        - Debian 13 Trixie\n        - Debian 12 Bookworm\n        - RHEL 10\n        - RHEL 9\n  - type: textarea\n    attributes:\n      label: Other Operating System\n      placeholder: Enter the name and version of the OS\n  - type: textarea\n    attributes:\n      label: uname -a\n      placeholder: \"Paste the output of the command: uname -a (if available in your system).\"\n\n  - type: textarea\n    attributes:\n      label: Compiler\n      placeholder: \"Enter compiler name and version (Examples: MSVC 2019 16.11, Clang 13.0.1, GCC 11.2, Xcode 14.1)\"\n  - type: textarea\n    attributes:\n      label: CPU\n      placeholder: \"Enter your CPU vendor name and model (Examples: Intel Core i7-11700K, AMD Ryzen 7 5800X, Apple Silicon M1)\"\n  - type: textarea\n    attributes:\n      label: Virtualization / Containers\n      placeholder: \"Enter the name and version of the VM / container which you use (Examples: Oracle VM VirtualBox 7.0.4,VMware Workstation 17.0, Hyper-V, Docker 20.10.22)\"\n  - type: textarea\n    attributes:\n      label: Other Information\n      placeholder: Add more details here.\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/issue-feature-request.yml",
    "content": "name: Feature Request\ndescription: File a feature request\nbody:\n  - type: textarea\n    attributes:\n      label: Your Feature Request\n      description: Please look first at the [open issues labeled as 'feature request'](https://github.com/tesseract-ocr/tesseract/labels/feature%20request).\n"
  },
  {
    "path": ".github/copilot-instructions.md",
    "content": "# Tesseract OCR - GitHub Copilot Instructions\n\n## Repository Overview\n\nTesseract is an open-source **OCR (Optical Character Recognition) engine** that recognizes text from images. This repository contains:\n\n- **libtesseract**: C++ OCR library with C API wrapper\n- **tesseract**: Command-line OCR program\n- **Training tools**: For creating custom language models\n\n**Key Facts:**\n- Primary language: **C++17** (requires C++17-compliant compiler)\n- Size: Large (~100MB+ with submodules)\n- License: Apache 2.0\n- Maintained by: Stefan Weil (lead), Zdenko Podobny (maintainer)\n\n## Build Systems\n\nTesseract supports **two build systems**. Both are actively maintained and tested in CI.\n\n### 1. Autotools (Traditional, POSIX Systems)\n\n**When to use:** Linux, macOS (command-line), MSYS2 on Windows\n\n**Build sequence:**\n```bash\n./autogen.sh                    # Generate configure script (only needed after git clone)\n./configure                      # Configure build (creates Makefiles)\nmake                            # Build library and CLI\nsudo make install               # Install to system\nsudo ldconfig                   # Update library cache (Linux only)\nmake training                   # Build training tools (optional)\nsudo make training-install      # Install training tools\n```\n\n**Important:**\n- ALWAYS run `./autogen.sh` first if building from git clone\n- Use `make -j N` for parallel builds (N = number of CPU cores)\n- Check `configure --help` for build options\n- To clean: `make clean` or `make distclean` (complete cleanup)\n\n### 2. CMake (Modern, Cross-platform)\n\n**When to use:** Windows (MSVC, MinGW), cross-platform, modern development\n\n**Build sequence:**\n```bash\nmkdir build                     # MUST use out-of-source build\ncd build\ncmake ..                        
# Configure (add options here)\nmake                            # Or: cmake --build .\nsudo make install               # Install to system\n```\n\n**Important CMake options:**\n- `BUILD_TRAINING_TOOLS=ON` - Enable training tools build\n- `CMAKE_BUILD_TYPE=Release` - Release build (default is RelWithDebInfo)\n- `GRAPHICS_DISABLED=ON` - Disable ScrollView (GUI debugger)\n- `ENABLE_NATIVE=OFF` - Disable CPU-specific optimizations (for portability)\n\n**CMake enforces out-of-source builds** - you cannot build in the source directory. If you get an error about this, remove `CMakeCache.txt` and build in a separate directory.\n\n## Dependencies\n\n### Core Required Dependencies\n\n- **Leptonica 1.74.2+** (REQUIRED) - Image I/O library\n  - Without this, build will fail\n  - Usually installed via package manager: `libleptonica-dev` (Ubuntu) or `leptonica` (Homebrew)\n\n- **C++17 compiler:**\n  - GCC 7+, Clang 5+, MSVC 2017+\n  - Verified compilers: gcc-11, gcc-12, gcc-14, clang-15, clang++\n\n### Training Tools Dependencies\n\nOnly needed if building training tools (`make training` or `-DBUILD_TRAINING_TOOLS=ON`):\n\n- pango-devel / libpango1.0-dev\n- cairo-devel\n- icu-devel\n\n### Optional Dependencies\n\n- **libarchive-dev**, **libcurl4-openssl-dev** - For advanced features\n- **OpenMP** - For parallel processing (enabled by default if available)\n- **cabextract** - For testing with CAB archives\n\n### Traineddata Files\n\nTesseract requires **traineddata files** to function. 
Minimum required:\n- `eng.traineddata` (English)\n- `osd.traineddata` (Orientation and Script Detection)\n\n**Installation:**\n```bash\n# Download individual files (to /usr/local/share/tessdata/ or your TESSDATA_PREFIX path)\ncd /usr/local/share/tessdata/  # Or wherever you want to install\nwget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata\nwget https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata\n\n# Or clone all languages (WARNING: 1.2+ GB)\ngit clone https://github.com/tesseract-ocr/tessdata.git\n```\n\n**Set environment variable:**\n```bash\nexport TESSDATA_PREFIX=/usr/local/share/tessdata/\n```\n\nVerify with: `tesseract --list-langs`\n\n## Testing\n\n### Running Unit Tests\n\n**With autotools:**\n```bash\n./autogen.sh\n./configure\nmake\nmake check                      # Runs all unit tests\n```\n\n**With CMake:**\n```bash\nmkdir build && cd build\ncmake ..\nmake\nctest                          # Or: cmake --build . --target test\n```\n\n**Important:**\n- Tests require `googletest` submodule: `git submodule update --init --recursive`\n- Tests require tessdata files (eng, osd minimum)\n- Test results in `test-suite.log` (autotools) or CTest output (CMake)\n\n### Running Tesseract CLI\n\nBasic test commands:\n```bash\n# After installation:\ntesseract --version\ntesseract --list-langs\ntesseract input.png output      # OCR image, creates output.txt\ntesseract input.png output pdf  # Create searchable PDF\n```\n\nTest files available in `test/testing/` (requires test submodule):\n- `phototest.tif` - English test image\n- `devatest.png` - Hindi/Devanagari test image (different format intentional)\n\n## Project Structure\n\n### Source Code Layout\n\n```\nsrc/\n├── api/               # Public C/C++ API (baseapi.h, capi.h)\n├── ccmain/            # Main OCR control logic\n├── lstm/              # LSTM neural network engine (Tesseract 4+)\n├── ccutil/, cutil/    # Core utilities, data structures\n├── classify/          # Character 
classifier\n├── dict/              # Dictionary and language model\n├── textord/           # Text line and word detection\n├── wordrec/           # Word recognition\n├── training/          # Training tools (lstmtraining, text2image, etc.)\n└── tesseract.cpp      # CLI main() entry point\n\ninclude/tesseract/     # Public header files\nunittest/              # Unit tests (requires googletest)\ntest/testing/          # Test images and data\ntessdata/              # Default location for traineddata files\ndoc/                   # Documentation\n```\n\n### Key Files\n\n- **src/api/baseapi.h** - Main C++ API class (`TessBaseAPI`)\n- **src/api/capi.h** - C wrapper API\n- **src/tesseract.cpp** - Command-line tool\n- **CMakeLists.txt**, **configure.ac**, **Makefile.am** - Build configuration\n- **VERSION** - Current version string\n\n### Configuration Files\n\n- **.clang-format** - Code formatting rules (Google-based style with Tesseract modifications)\n- **tesseract.pc.in** - pkg-config template\n- **.github/workflows/** - CI/CD definitions\n\n## CI/CD Workflows\n\n### Active Workflows\n\n1. **cmake.yml** - CMake builds on Ubuntu/macOS, 6 configurations\n2. **autotools.yml** - Autotools builds, comprehensive testing\n3. **unittest.yml** - Unit tests with sanitizers (ASAN, UBSAN)\n4. **codeql-analysis.yml** - Security static analysis\n5. 
**vcpkg.yml**, **msys2.yml**, **cmake-win64.yml** - Windows builds\n\n### Validation Requirements\n\nAll PRs trigger:\n- **Build tests** on multiple platforms (Ubuntu 22.04, 24.04, macOS 14, 15)\n- **Compiler tests** (GCC 11-14, Clang 15)\n- **Unit tests** with sanitizers\n- **CodeQL** security scan\n\n**Expect ~10-30 minutes** for full CI validation.\n\n### Common CI Failures\n\n- **Missing dependencies:** Check workflow files for required packages\n- **Test failures:** Often due to missing tessdata files\n- **Sanitizer errors:** Memory leaks, undefined behavior\n- **CodeQL alerts:** Security vulnerabilities in code\n\n## Common Build Issues & Workarounds\n\n### Issue: \"configure: error: Leptonica not found\"\n**Solution:** Install leptonica development package\n```bash\n# Ubuntu/Debian:\nsudo apt-get install libleptonica-dev\n# macOS:\nbrew install leptonica\n```\n\n### Issue: \"CMake Error: cannot build in source directory\"\n**Solution:** CMake requires out-of-source builds\n```bash\nrm -f CMakeCache.txt\nmkdir build && cd build && cmake ..\n```\n\n### Issue: \"make check\" fails with \"cannot find tessdata\"\n**Solution:** Set TESSDATA_PREFIX or download files\n```bash\nexport TESSDATA_PREFIX=/usr/local/share/tessdata/\n# Or copy files to /usr/local/share/tessdata/\n```\n\n### Issue: Submodule errors (googletest, test)\n**Solution:** Initialize submodules\n```bash\ngit submodule update --init --recursive\n```\n\n### Issue: Old Tesseract version conflicts\n**Solution:** Remove previous installation before building\n```bash\n# Find installed files:\nwhich tesseract\npkg-config --modversion tesseract\n# Uninstall old version, then rebuild\n```\n\n### Issue: Training tools not building\n**Solution:** Install pango, cairo, icu dependencies\n```bash\nsudo apt-get install libpango1.0-dev libcairo2-dev libicu-dev\n```\n\n## Validation Steps for Code Changes\n\nWhen making code changes, follow these steps:\n\n1. 
**Build the project** (choose one):\n   ```bash\n   # Autotools:\n   ./autogen.sh && ./configure && make\n   # CMake:\n   mkdir build && cd build && cmake .. && make\n   ```\n\n2. **Run unit tests**:\n   ```bash\n   # Autotools:\n   make check\n   # CMake:\n   ctest\n   ```\n\n3. **Test CLI manually**:\n   ```bash\n   tesseract test/testing/phototest.tif output\n   cat output.txt  # Verify OCR output\n   ```\n\n4. **Check for memory issues** (if modifying C++ code):\n   ```bash\n   # Build with sanitizers:\n   CXXFLAGS=\"-g -O2 -fsanitize=address,undefined\" ./configure\n   make && make check\n   ```\n\n5. **Run CodeQL** (security check):\n   - Will run automatically in CI\n   - Or use GitHub Code Scanning locally\n\n6. **Verify documentation** (if API changes):\n   - Update header comments in `include/tesseract/`\n   - Update relevant docs in `doc/`\n\n## Code Style & Conventions\n\n- **Formatting:** Use clang-format with `.clang-format` config (Google-based style with Tesseract modifications)\n- **Naming:** \n  - Classes: `CamelCase` (e.g., `TessBaseAPI`)\n  - Functions: `CamelCase` (e.g., `ProcessPage`)\n  - Variables: `snake_case` or `lower_case`\n- **Headers:** Use include guards, document public APIs\n- **Comments:** Focus on \"why\", not \"what\"\n- **Commits:** Use meaningful messages, reference issue numbers\n\n## Important Notes for AI Coding Agents\n\n1. **Always use out-of-source builds with CMake** - in-source builds are blocked\n2. **Check for Leptonica** before building - it's a hard requirement\n3. **Initialize git submodules** before running tests\n4. **Set TESSDATA_PREFIX** or tests will fail\n5. **Building takes time** - allow 2-5 minutes for full build\n6. **Testing takes time** - `make check` can take 5-10 minutes\n7. **Don't remove existing tests** - they're critical for preventing regressions\n8. **Check CI workflows** for platform-specific requirements\n9. **Sanitizer builds are slower** - 2-3x slower than normal builds\n10. 
**Training tools are optional** - only build if needed for the task\n\n## Useful Commands Reference\n\n```bash\n# Quick build and test (autotools):\n./autogen.sh && ./configure && make -j8 && make check\n\n# Quick build and test (CMake):\nmkdir build && cd build && cmake .. && make -j8 && ctest\n\n# Format code:\nfind src -name '*.cpp' -o -name '*.h' | xargs clang-format -i\n\n# Check test results:\ncat test-suite.log                    # autotools\nctest --output-on-failure             # CMake\n\n# Install only library (no training):\nmake install                          # After ./configure && make\n\n# Clean builds:\nmake clean                            # Partial clean\nmake distclean                        # Complete clean (autotools)\nrm -rf build                          # Complete clean (CMake)\n\n# Check installed version:\ntesseract --version\npkg-config --modversion tesseract\n\n# Debug OCR on specific image:\ntesseract input.png output -l eng --psm 6 -c debug_file=/dev/null\n```\n\n---\n\n**Trust these instructions.** Only search for additional information if these instructions are incomplete, outdated, or if you encounter an error not covered here. The workflows and build procedures are tested daily in CI and represent current best practices for this repository.\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "# To get started with Dependabot version updates, you'll need to specify which\n# package ecosystems to update and where the package manifests are located.\n# Please see the documentation for all configuration options:\n# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file\n\nversion: 2\nupdates:\n  - package-ecosystem: \"github-actions\" # See documentation for possible values\n    directory: \"/\" # Location of package manifests\n    schedule:\n      interval: \"weekly\"\n"
  },
  {
    "path": ".github/workflows/autotools-macos.yml",
    "content": "name: autotools-macos\n# autotools build of tesseract and training tools on macos homebrew and macports.\n# run command line tests, basicapitest and unittests. '--disable-openmp'\non:\n  #push:\n  schedule:\n    - cron: 0 20 * * *\n  workflow_dispatch:\njobs:\n\n  brew:\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { name: macos-latest-clang-autotools, os: macos-latest, cxx: clang++ }\n\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n\n    - name: Get fonts, tessdata and langdata required for unit tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           mv tessdata_unittest/* ../\n\n    - name: Install dependencies\n      run: |\n           brew install autoconf automake cabextract libtool\n           brew install leptonica libarchive pango\n           if ! brew list icu4c &>/dev/null; then\n             brew install icu4c\n           fi\n           if ! 
brew list curl &>/dev/null; then\n             brew install curl\n           fi\n\n    - name: Setup Tesseract\n      run: |\n           ./autogen.sh\n\n    - name: Configure Tesseract\n      run: |\n           ./configure '--disable-shared' '--disable-openmp' '--disable-doc' '--with-pic' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'\n\n    - name: Make and Install Tesseract\n      run: |\n           make -j 8\n           sudo make install install\n    - name: Make and Install Training Tools\n      run: |\n           make training -j 8\n           sudo make install training-install\n\n    - name: Make and run Unit Tests (clang)\n      if: startsWith(matrix.config.cxx, 'clang')\n      run: |\n           make check\n\n    - name: Make and run Unit Tests (unset LANG needed for g++-8, g++-9, g++-10 on macOS)\n      if: startsWith(matrix.config.cxx, 'g')\n      shell: bash\n      run: |\n           unset LANG LC_ALL LC_CTYPE\n           locale\n           make check\n\n    - name: Display Version for tesseract, lstmtraining, text2image\n      run: |\n           tesseract -v\n           lstmtraining -v\n           text2image -v\n      if: success() || failure()\n\n    - name: List languages in different test tessdata-dir\n      run: |\n           tesseract  --list-langs --tessdata-dir ../tessdata\n           tesseract  --list-langs --tessdata-dir ../tessdata_best\n           tesseract  --list-langs --tessdata-dir ../tessdata_fast\n\n    - name: Run Tesseract on test images in different languages\n      run: |\n           tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata\n           tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best\n           
tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ../tessdata\n\n    - name: Run Tesseract basicapitest\n      run: |\n           export \"PKG_CONFIG_PATH=/usr/local/lib/pkgconfig\"\n           cd test\n           ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp $(pkg-config --cflags --libs tesseract lept) -pthread -std=c++17 -framework accelerate\n           ./basicapitest\n\n    - name: Display Compiler Version\n      run: |\n           ${{ matrix.config.cxx }} --version\n           git log -3 --pretty=format:'%h %ad %s | %an'\n      if: always()\n\n    - name: Display Unit Tests Report\n      run: |\n           cat test-suite.log\n      if: always()\n\n# ============================================================================================\n\n  ports:\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { name: macos-latest-clang-autotools, os: macos-latest, cxx: clang++ }\n\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n\n    - name: Get fonts, tessdata and langdata required for tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           mv tessdata_unittest/* ../\n\n    - name: Install Macports\n      run: |\n        curl -sSLO https://raw.githubusercontent.com/GiovanniBussi/macports-ci/master/macports-ci; source ./macports-ci install\n        # --remove-brew does not remove the Homebrew entries in bin,\n        # so remove them now.\n        rm -v $(brew --prefix)/bin/*\n\n    - name: Install Dependencies\n      run: |\n           sudo port install autoconf automake libtool pkgconfig\n           sudo port install leptonica\n           sudo port install cairo pango\n           sudo port install icu +devel\n           sudo port install cabextract libarchive curl\n\n    - name: Setup Tesseract\n      
run: |\n           ./autogen.sh\n\n    - name: Configure Tesseract\n      run: |\n           ./configure  '--disable-shared' '--disable-openmp' '--disable-doc' '--with-pic' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'\n\n    - name: Make and Install Tesseract\n      run: |\n           make -j 8\n           sudo make install install\n\n    - name: Make and Install Training Tools\n      run: |\n           make training -j 8\n           sudo make install training-install\n\n    - name: Make and run Unit Tests (clang)\n      if: startsWith(matrix.config.cxx, 'clang')\n      run: |\n           make check\n\n    - name: Display Version for tesseract, lstmtraining, text2image\n      run: |\n           tesseract -v\n           lstmtraining -v\n           text2image -v\n      if: success() || failure()\n\n    - name: List languages in different test tessdata-dir\n      run: |\n           tesseract  --list-langs --tessdata-dir ../tessdata\n           tesseract  --list-langs --tessdata-dir ../tessdata_best\n           tesseract  --list-langs --tessdata-dir ../tessdata_fast\n\n    - name: Run Tesseract on test images in different languages\n      run: |\n           tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata\n           tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best\n           tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ../tessdata\n\n    - name: Run Tesseract basicapitest\n      run: |\n           export \"PKG_CONFIG_PATH=/usr/local/lib/pkgconfig\"\n           cd test\n           ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp -I/opt/local/include -L/opt/local/lib $(pkg-config --cflags 
--libs tesseract lept) -pthread -std=c++17 -framework Accelerate\n           ./basicapitest\n\n    - name: Display Compiler Version\n      run: |\n           ${{ matrix.config.cxx }} --version\n           git log -3 --pretty=format:'%h %ad %s | %an'\n      if: always()\n\n    - name: Display Unit Tests Report\n      run: |\n           cat test-suite.log\n      if: always()\n"
  },
  {
    "path": ".github/workflows/autotools-openmp.yml",
    "content": "name: autotools-openmp\n# autotools on Ubuntu - run benchmark test. '--enable-openmp' no training tools\non:\n  #push:\n  #schedule:\n  #  - cron: 0 20 * * *\n  workflow_dispatch:\njobs:\n\n  linux:\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { name: 24.04-openmp, os: ubuntu-24.04 }\n          - { name: 22.04-openmp, os: ubuntu-22.04 }\n\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n\n    - name: Download fonts, tessdata and langdata required for tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           mv tessdata_unittest/* ../\n\n    - name: Install dependencies\n      run: |\n           sudo apt-get update\n           sudo apt-get install autoconf libleptonica-dev -y\n           sudo apt-get install libpango1.0-dev -y\n           sudo apt-get install cabextract libarchive-dev -y\n           sudo apt-get install libcurl4-openssl-dev libcurl4 curl -y\n\n    - name: Setup Tesseract\n      run: |\n           ./autogen.sh\n\n    - name: Configure Tesseract\n      run: |\n           ./configure '--disable-shared' '--enable-openmp' '--disable-doc' 'CXX=g++' 'CXXFLAGS=-g -O2'\n           grep -i OpenMP config.log\n\n    - name: Make and Install Tesseract\n      run: |\n           make\n           sudo make install\n\n    - name: Setup for Tesseract benchmark using image from issue 263 fifteen times in a list file\n      run: |\n           wget -O i263_speed.jpg https://cloud.githubusercontent.com/assets/9968625/13674495/ac261db4-e6ab-11e5-9b4a-ad91d5b4ff87.jpg\n           printf 'i263_speed.jpg\\n%.0s' {1..15} > benchmarks.list\n\n    - name: Run Tesseract using image from issue 263 with tessdata_fast\n      run: |\n           lscpu\n           free\n           g++ --version\n           tesseract -v\n           time tesseract 
benchmarks.list - --tessdata-dir ../tessdata_fast > /dev/null 2>&1\n           echo \"tessdata_fast\"\n\n    - name: Run Tesseract using image from issue 263 with tessdata_fast and OpenMP Thread Limit\n      run: |\n           for lmt in {1..3}; do\n                time OMP_THREAD_LIMIT=$lmt tesseract benchmarks.list - --tessdata-dir ../tessdata_fast > /dev/null 2>&1 && echo \"OMP_THREAD_LIMIT=\" $lmt \"tessdata_fast\"\n           done\n\n    - name: Run Tesseract using image from issue 263 with tessdata_best and OpenMP Thread Limit\n      run: |\n           for lmt in {1..3}; do\n                time OMP_THREAD_LIMIT=$lmt tesseract benchmarks.list - --tessdata-dir ../tessdata_best > /dev/null 2>&1 && echo \"OMP_THREAD_LIMIT=\" $lmt \"tessdata_best\"\n           done\n\n    - name: Run Tesseract using image from issue 263 with tessdata and OpenMP Thread Limit\n      run: |\n           for lmt in {1..3}; do\n                time OMP_THREAD_LIMIT=$lmt tesseract benchmarks.list - --tessdata-dir ../tessdata > /dev/null 2>&1 && echo \"OMP_THREAD_LIMIT=\" $lmt \"tessdata\"\n           done\n"
  },
  {
    "path": ".github/workflows/autotools.yml",
    "content": "name: autotools\n# autotools build of tesseract and training tools on Ubuntu.\n# run command line tests, basicapitest and unittests. '--disable-openmp'\non:\n  #push:\n  schedule:\n    - cron: 0 20 * * *\njobs:\n\n  linux:\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { name: ubuntu-22.04-clang-15-autotools, os: ubuntu-22.04, cxx: clang++-15 } #installed\n\n          - { name: ubuntu-24.04-gcc-14-autotools, os: ubuntu-24.04, cxx: g++-14 } #installed\n          - { name: ubuntu-22.04-gcc-12-autotools, os: ubuntu-22.04, cxx: g++-12 } #installed\n          - { name: ubuntu-22.04-gcc-11-autotools, os: ubuntu-22.04, cxx: g++-11 } #installed\n\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n\n    - name: Download fonts, tessdata and langdata required for tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           mv tessdata_unittest/* ../\n\n    - name: Install Compiler\n      run: |\n           sudo apt-get update\n           sudo apt-get install -y ${{ matrix.config.cxx }}\n\n    - name: Install dependencies\n      run: |\n           sudo apt-get install autoconf libleptonica-dev -y\n           sudo apt-get install libpango1.0-dev -y\n           sudo apt-get install cabextract libarchive-dev -y\n           sudo apt-get install libcurl4-openssl-dev libcurl4 curl -y\n\n    - name: Setup Tesseract\n      run: |\n           ./autogen.sh\n\n    - name: Configure Tesseract\n      run: |\n           ./configure '--disable-shared' '--disable-openmp' '--disable-doc' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'\n\n    - name: Make and Install Tesseract\n      run: |\n           make -j 8\n           sudo make install install\n\n    - name: Make and Install Training Tools\n      run: |\n           make training -j 8\n           sudo make 
install training-install\n\n    - name: Make and run Unit Tests\n      run: |\n           make check\n\n    - name: Display Version for tesseract, lstmtraining, text2image\n      run: |\n           tesseract -v\n           lstmtraining -v\n           text2image -v\n      if: success() || failure()\n\n    - name: List languages in different test tessdata-dir\n      run: |\n           tesseract  --list-langs --tessdata-dir ../tessdata\n           tesseract  --list-langs --tessdata-dir ../tessdata_best\n           tesseract  --list-langs --tessdata-dir ../tessdata_fast\n\n    - name: Run Tesseract on test images in different languages\n      run: |\n           tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata\n           tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata\n           tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best\n           tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ../tessdata\n\n    - name: Run Tesseract basicapitest\n      run: |\n           export \"PKG_CONFIG_PATH=/usr/local/lib/pkgconfig\"\n           cd test\n           ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp -I/usr/local/include -L/usr/local/lib `pkg-config --cflags --libs tesseract lept ` -pthread -std=c++17\n           ./basicapitest\n\n    - name: Setup for Tesseract benchmark using image from issue 263 fifteen times in a list file\n      run: |\n           wget -O i263_speed.jpg https://cloud.githubusercontent.com/assets/9968625/13674495/ac261db4-e6ab-11e5-9b4a-ad91d5b4ff87.jpg\n           printf 'i263_speed.jpg\\n%.0s' {1..15} > benchmarks.list\n           lscpu\n           free\n           tesseract -v\n\n    - name: Run Tesseract using image from issue 263 with 
tessdata_fast\n      run: |\n           time tesseract benchmarks.list - --tessdata-dir ../tessdata_fast > /dev/null 2>&1\n           echo \"tessdata_fast - disable-openmp\"\n\n    - name: Run Tesseract using image from issue 263 with tessdata_best\n      run: |\n           time tesseract benchmarks.list - --tessdata-dir ../tessdata_best > /dev/null 2>&1\n           echo \"tessdata_best - disable-openmp\"\n\n    - name: Run Tesseract using image from issue 263 with tessdata\n      run: |\n           time tesseract benchmarks.list - --tessdata-dir ../tessdata > /dev/null 2>&1\n           echo \"tessdata - disable-openmp\"\n\n    - name: Display Compiler Version\n      run: |\n           ${{ matrix.config.cxx }} --version\n           git log -3 --pretty=format:'%h %ad %s | %an'\n      if: always()\n\n    - name: Display Unit Tests Report\n      run: |\n           cat test-suite.log\n      if: always()\n"
  },
  {
    "path": ".github/workflows/cifuzz.yml",
    "content": "name: CIFuzz\n# OSS-Fuzz CI\n# See https://google.github.io/oss-fuzz/getting-started/continuous-integration/\non:\n  pull_request:\n    branches:\n    - main\n    paths:\n    - '**.cpp'\n    - '**.h'\njobs:\n  Fuzzing:\n    runs-on: ubuntu-latest\n    steps:\n    - name: Build Fuzzers\n      id: build\n      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master\n      with:\n        oss-fuzz-project-name: 'tesseract-ocr'\n        language: c++\n        dry-run: false\n    - name: Run Fuzzers\n      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master\n      with:\n        oss-fuzz-project-name: 'tesseract-ocr'\n        fuzz-seconds: 600\n        dry-run: false\n    - name: Upload Crash\n      uses: actions/upload-artifact@v7\n      if: failure() && steps.build.outcome == 'success'\n      with:\n        name: artifacts\n        path: ./out/artifacts\n"
  },
  {
    "path": ".github/workflows/cmake-win64.yml",
    "content": "# Based on https://github.com/zdenop/tesserocr/actions/runs/691257659/workflow\n# Build Tesseract on Windows using cmake. No Training Tools.\nname: cmake-win64\non:\n  push:\n    paths:\n      - cmake/**\n      - '**/CMakeLists.txt'\n  pull_request:\n    types: [opened, reopened, synchronize]\n    paths:\n      - cmake/**\n      - '**/CMakeLists.txt'\n  schedule:\n    - cron: 0 5 * * *\n  workflow_dispatch:\n\nenv:\n  ILOC: d:/a/local\n  png_ver: 1651\n\njobs:\n  build:\n    name: cmake-win64\n    runs-on: windows-latest\n    steps:\n      - uses: ilammy/setup-nasm@v1\n      - uses: microsoft/setup-msbuild@v2\n      - name: \"Checkout ${{ github.ref }} ( ${{ github.sha }} )\"\n        uses: actions/checkout@v6\n        with:\n          submodules: recursive\n      - run: git fetch --prune --unshallow --tags\n\n      - name: Get the version\n        id: get_version\n        continue-on-error: true\n        run: |\n             $git_info=$(git describe --tags HEAD)\n             $stamp=$(date +'%Y-%m-%d_%H%M%S')\n             echo \"version=${git_info}\" >> $env:GITHUB_OUTPUT\n             echo \"stamp=${stamp}\" >> $env:GITHUB_OUTPUT\n\n      - name: Setup Installation Location\n        run: |\n             mkdir ${{env.ILOC}}\n\n      #- name: Uninstall Perl\n      #  run: |\n      #    choco uninstall strawberryperl\n\n      - name: Build and Install zlib-ng\n        shell: cmd\n        run: |\n             git clone --depth 1 https://github.com/zlib-ng/zlib-ng.git\n             cd zlib-ng\n             cmake -Bbuild -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DBUILD_SHARED_LIBS=OFF -DZLIB_COMPAT=ON -DZLIB_ENABLE_TESTS=OFF -DINSTALL_UTILS=OFF\n             cmake --build build --target install\n             cd ..\n\n      - name: Build and Install libpng\n        shell: cmd\n        run: |\n             curl -sSL -o lpng${{env.png_ver}}.zip https://download.sourceforge.net/libpng/lpng${{env.png_ver}}.zip\n             
unzip.exe  -qq lpng${{env.png_ver}}.zip\n             cd lpng${{env.png_ver}}\n             cmake -Bbuild -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DPNG_TESTS=OFF -DPNG_SHARED=OFF\n             cmake --build build --target install\n             cd ..\n\n      - name: Build and Install libjpeg\n        shell: cmd\n        run: |\n             git clone --depth 1 https://github.com/libjpeg-turbo/libjpeg-turbo.git\n             cd libjpeg-turbo\n             cmake -Bbuild -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DWITH_TURBOJPEG=OFF -DENABLE_SHARED=OFF\n             cmake --build build --target install\n             cd ..\n\n      - name: Build and Install jbigkit\n        shell: cmd\n        run: |\n             git clone --depth 1 https://github.com/zdenop/jbigkit.git\n             cd jbigkit\n             cmake -Bbuild -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DBUILD_PROGRAMS=OFF -DBUILD_TOOLS=OFF -DCMAKE_WARN_DEPRECATED=OFF\n             cmake --build build --target install\n             cd ..\n\n      - name: Build and Install libtiff\n        shell: cmd\n        run: |\n             git clone -c advice.detachedHead=false -b \"v4.7.1\" --depth 1 https://gitlab.com/libtiff/libtiff.git\n             cd libtiff\n             cmake -Bbuild -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -Dtiff-tools=OFF -Dtiff-tests=OFF -Dtiff-contrib=OFF -Dtiff-docs=OFF\n             cmake --build build --target install\n             cd ..\n\n      - name: Build and Install leptonica\n        shell: cmd\n        run: |\n             echo \"Building leptonica...\"\n             git clone --depth 1 https://github.com/DanBloomberg/leptonica.git\n             cd leptonica\n             cmake -Bbuild -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH=${{env.ILOC}} 
-DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DSW_BUILD=OFF -DBUILD_PROG=OFF -DBUILD_SHARED_LIBS=ON\n             cmake --build build --target install\n\n      - name: Remove not needed tools Before building tesseract\n        shell: cmd\n        run: >\n             rm -Rf ${{env.ILOC}}/bin/*.exe\n\n      - name: Build and Install tesseract\n        shell: cmd\n        run: |\n             cmake -Bbuild -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DSW_BUILD=OFF -DBUILD_SHARED_LIBS=ON -DENABLE_LTO=ON -DBUILD_TRAINING_TOOLS=OFF -DFAST_FLOAT=ON -DGRAPHICS_DISABLED=ON -DOPENMP_BUILD=OFF\n             cmake --build build --target install\n\n      - name: Upload Build Results\n        uses: actions/upload-artifact@v7\n        with:\n          name: tesseract-${{ steps.get_version.outputs.version }}-${{steps.get_version.outputs.stamp}}-VS2019_win64\n          path: ${{env.ILOC}}\n          retention-days: 5\n\n      - name: Display Tesseract Version and Test Command Line Usage\n        shell: cmd\n        run: |\n          curl -sSL https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata --output ${{env.ILOC}}/share/tessdata/eng.traineddata\n          curl -sSL https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata --output ${{env.ILOC}}/share/tessdata/osd.traineddata\n          echo \"Setting TESSDATA_PREFIX...\"\n          set TESSDATA_PREFIX=${{env.ILOC}}/share/tessdata\n          echo \"Setting PATH...\"\n          set PATH=${{env.ILOC}}/bin;%PATH%\n          echo \"Checking installed tesseract version...\"\n          tesseract -v\n          echo \"Checking installed langs\"\n          tesseract --list-langs\n          echo \"Checking OCR process\"\n          tesseract test/testing/phototest.tif -\n"
  },
  {
    "path": ".github/workflows/cmake.yml",
    "content": "name: cmake\n# cmake build of tesseract and training tools on ubuntu and macOS homebrew using Ninja.\n# test command line version of tesseract. run basicapitest.\non:\n  push:\n    paths:\n      - cmake/**\n      - '**/CMakeLists.txt'\n  pull_request:\n    paths:\n      - cmake/**\n      - '**/CMakeLists.txt'\n  schedule:\n    - cron: 0 21 * * *\n  workflow_dispatch:\n\njobs:\n  basictests:\n    name: ${{ matrix.config.name }}\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { name: macos-14-clang-15-cmake, os: macos-14, cxx: clang++ } # default\n          - { name: macos-15-gcc-14-cmake, os: macos-15, cxx: g++ } #installed\n          - { name: macos-15-clang-cmake, os: macos-15, cxx: clang++ } # default\n\n          - { name: ubuntu-22.04-clang-15-cmake, os: ubuntu-22.04, cxx: clang++-15 } #installed\n          - { name: ubuntu-24.04-gcc-14-cmake, os: ubuntu-24.04, cxx: g++-14 } #installed\n          - { name: ubuntu-22.04-gcc-12-cmake, os: ubuntu-22.04, cxx: g++-12 } #installed\n\n    steps:\n      - name: Install compilers on Linux\n        run: |\n             sudo apt-get update\n             sudo apt-get install ${{ matrix.config.cxx }} -y\n        if: runner.os == 'Linux'\n\n      - name: Install dependencies on Linux\n        run: |\n           sudo apt-get install autoconf libleptonica-dev -y\n           sudo apt-get install libarchive-dev libcurl4-openssl-dev -y\n           sudo apt-get install libpango1.0-dev -y\n           sudo apt-get install cabextract -y\n           sudo apt-get install ninja-build -y\n           cmake --version\n        if: runner.os == 'Linux'\n\n      - name: Install dependencies on macOS\n        run: |\n           brew install autoconf automake\n           brew install leptonica\n           # if ! brew list libarchive &>/dev/null; then\n           #   brew install libarchive\n           # fi\n           brew install pango\n           if ! 
brew list icu4c &>/dev/null; then\n             brew install icu4c\n           fi\n           if ! brew list curl &>/dev/null; then\n             brew install curl\n           fi\n           brew install cabextract\n           ninja --version\n           cmake --version\n           clang++ --version\n           g++ --version\n        if: runner.os == 'macOS'\n\n      - name: Checkout Source\n        uses: actions/checkout@v6\n        with:\n             submodules: recursive\n\n      - name: Configure Tesseract (Linux)\n        run: |\n             mkdir build\n             mkdir inst\n             cmake \\\n               -S . \\\n               -B build \\\n               -G Ninja \\\n               -DCMAKE_BUILD_TYPE=Release \\\n               -DOPENMP_BUILD=OFF \\\n               -DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} \\\n               -DCMAKE_INSTALL_PREFIX:PATH=inst\n        if: runner.os == 'Linux'\n\n      - name: Configure Tesseract (macOS)\n        shell: bash\n        run: |\n             set -e\n             mkdir build\n             mkdir inst\n             cmake \\\n               -S . 
\\\n               -B build \\\n               -G Ninja \\\n               -DCMAKE_BUILD_TYPE=Release \\\n               -DOPENMP_BUILD=OFF \\\n               -DENABLE_UNITY_BUILD=ON \\\n               -DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} \\\n               -DCMAKE_INSTALL_PREFIX:PATH=inst\n        if: runner.os == 'macOS'\n\n      - name: Build Tesseract\n        run: |\n             cmake --build build --config Release --target install\n\n      - name: Display Tesseract Version\n        run: |\n             build/inst/bin/tesseract -v\n\n      - name: Display Training Tools Version\n        run: |\n             build/inst/bin/lstmtraining -v\n             build/inst/bin/text2image -v\n\n      - name: Download fonts, tessdata and langdata required for tests\n        run: |\n             git clone https://github.com/egorpugin/tessdata tessdata_unittest\n             cp tessdata_unittest/fonts/* test/testing/\n             mv tessdata_unittest/* ../\n\n      - name: List languages in different tessdata-dir\n        run: |\n             build/inst/bin/tesseract  --list-langs --tessdata-dir ../tessdata\n             build/inst/bin/tesseract  --list-langs --tessdata-dir ../tessdata_best\n             build/inst/bin/tesseract  --list-langs --tessdata-dir ../tessdata_fast\n\n      - name: Run Tesseract on test images in different languages\n        run: |\n             build/inst/bin/tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata\n             build/inst/bin/tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata\n             build/inst/bin/tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata\n             build/inst/bin/tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata\n             build/inst/bin/tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best\n             build/inst/bin/tesseract test/testing/arabic.tif - -l ara --oem 
1 --psm 6  --tessdata-dir ../tessdata\n\n      - name: Build and run basicapitest (Linux)\n        run: |\n             export \"PKG_CONFIG_PATH=$GITHUB_WORKSPACE/build/inst/lib/pkgconfig/:$PKG_CONFIG_PATH\"\n             cd test\n             ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp \"-I$GITHUB_WORKSPACE/build/inst/include\" \"-L$GITHUB_WORKSPACE/build/inst/lib\" $(pkg-config --cflags --libs tesseract lept libarchive libcurl) -pthread -std=c++17\n             ./basicapitest\n        if: runner.os == 'Linux'\n\n      - name: Build and run basicapitest (macOS)\n        run: |\n             export \"PKG_CONFIG_PATH=$GITHUB_WORKSPACE/build/inst/lib/pkgconfig/:$(brew --prefix)/opt/libarchive/lib/pkgconfig:$(brew --prefix)/Library/Homebrew/os/mac/pkgconfig/11:$PKG_CONFIG_PATH\"\n             cd test\n             ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp \"-I$GITHUB_WORKSPACE/build/inst/include\" \"-L$GITHUB_WORKSPACE/build/inst/lib\" $(pkg-config --cflags --libs tesseract lept libcurl) -pthread -std=c++17\n             ./basicapitest\n        if: runner.os == 'macOS'\n\n      - name: Display Compiler Version\n        run: |\n             ${{ matrix.config.cxx }} --version\n             pwd\n             ls -la\n             # git log -3 --pretty=format:'%h %ad %s | %an'\n        if: always()\n"
  },
  {
    "path": ".github/workflows/codeql-analysis.yml",
    "content": "# For most projects, this workflow file will not need changing; you simply need\n# to commit it to your repository.\n#\n# You may wish to alter this file to override the set of languages analyzed,\n# or to provide custom queries or build logic.\n#\n# ******** NOTE ********\n# We have attempted to detect the languages in your repository. Please check\n# the `language` matrix defined below to confirm you have the correct set of\n# supported CodeQL languages.\n#\nname: \"CodeQL\"\n\non:\n  push:\n    branches: [ main ]\n    paths:\n      - '**.cpp'\n      - '**.h'\n      - '**/codeql-analysis.yml'\n      - 'm4/*.m4'\n      - 'Makefile.am'\n      - 'autogen.sh'\n      - 'configure.ac'\n  pull_request:\n    # The branches below must be a subset of the branches above\n    branches: [ main ]\n    paths:\n      - '**.cpp'\n      - '**.h'\n      - '**/codeql-analysis.yml'\n      - 'm4/*.m4'\n      - 'Makefile.am'\n      - 'autogen.sh'\n      - 'configure.ac'\n  schedule:\n    - cron: '34 23 * * 2'\n\njobs:\n  analyze:\n    name: Analyze\n    runs-on: ubuntu-latest\n    permissions:\n      actions: read\n      contents: read\n      security-events: write\n\n    strategy:\n      fail-fast: false\n      matrix:\n        language: [ 'cpp' ]\n        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]\n        # Learn more:\n        # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed\n\n    steps:\n    - name: Checkout repository\n      uses: actions/checkout@v6\n\n    - name: Install dependencies\n      run: |\n           sudo apt-get update\n           sudo apt-get install autoconf libleptonica-dev -y\n           sudo apt-get install libpango1.0-dev -y\n           sudo apt-get install cabextract libarchive-dev -y\n           sudo apt-get install libcurl4-openssl-dev libcurl4 curl -y\n\n    # Initializes the 
CodeQL tools for scanning.\n    - name: Initialize CodeQL\n      uses: github/codeql-action/init@v4\n      with:\n        languages: ${{ matrix.language }}\n        # If you wish to specify custom queries, you can do so here or in a config file.\n        # By default, queries listed here will override any specified in a config file.\n        # Prefix the list here with \"+\" to use these queries and those in the config file.\n        # queries: ./path/to/local/query, your-org/your-repo/queries@main\n\n    - name: Build\n      run: |\n       ./autogen.sh\n       ./configure\n       make all training\n\n    - name: Perform CodeQL Analysis\n      uses: github/codeql-action/analyze@v4\n"
  },
  {
    "path": ".github/workflows/installer-for-windows.yml",
    "content": "# GitHub actions - Create Tesseract installer for Windows\n\nname: Cross build for Windows\n\non:\n  # Trigger workflow in GitHub web frontend or from API.\n  workflow_dispatch:\n    inputs:\n      targets:\n        description: 'Target operating system'\n        required: true\n        default: 'Windows (64 bit)'\n        type: choice\n        options:\n          - 'Windows (64 bit)'\n\njobs:\n  build64:\n    runs-on: [ubuntu-24.04]\n    steps:\n    - uses: actions/checkout@v6\n    - name: Build Tesseract installer (64 bit)\n      run: nsis/build.sh x86_64\n    - uses: actions/upload-artifact@v7\n      with:\n        name: Tesseract Installer for Windows (64 bit)\n        path: dist\n"
  },
  {
    "path": ".github/workflows/msys2.yml",
    "content": "name: msys2\n# msys2 build for tesseract -head from main branch.\non:\n  #push:\n  schedule:\n    - cron: 0 17 * * *\njobs:\n  windows:\n    runs-on: windows-latest\n    strategy:\n      fail-fast: false\n      matrix:\n        include:\n          - msystem: MINGW64\n            mingw_package_prefix: mingw-w64-x86_64\n    defaults:\n      run:\n        shell: msys2 {0}\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n    - uses: msys2/setup-msys2@v2\n      with:\n        msystem: ${{ matrix.msystem }}\n        install: autoconf automake automake-wrapper git libtool make\n    - run: pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-gcc\n    - run: gcc --version\n\n    - name: Install dependencies\n      run: |\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-cairo\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-curl\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-gcc-libs\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-icu\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-leptonica\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-libarchive\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-pango\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-pkg-config\n           pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-zlib\n\n    - name: Setup Tesseract\n      run: |\n           ./autogen.sh\n\n    - name: Configure Tesseract\n      run: |\n           ./configure '--disable-shared' '--disable-openmp' '--disable-doc' 'CXXFLAGS=-g -O2'\n\n    - name: Build and install Tesseract\n      run: |\n           make\n           make install\n\n    - name: Make and install training tools\n      run: |\n           make training\n           make training-install\n\n    - name: Display version\n      run: |\n
           tesseract -v\n           text2image -v\n           lstmtraining -v\n\n    - name: Download fonts, tessdata and langdata required for tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           mv tessdata_unittest/* ../\n\n    - name: Run Tesseract on phototest.tif and devatest.png\n      run: |\n           tesseract test/testing/phototest.tif -  --tessdata-dir ../tessdata\n           tesseract test/testing/devatest.png - -l hin+eng  --tessdata-dir ../tessdata\n"
  },
  {
    "path": ".github/workflows/sw.yml",
    "content": "name: sw\n\non:\n  schedule:\n    # every 3rd day\n    - cron: 0 0 */3 * *\n\njobs:\n  build:\n    runs-on: ${{ matrix.os }}\n    container: ${{ matrix.container }}\n    strategy:\n      fail-fast: false\n      matrix:\n        os: [windows-2022, macos-latest]\n        include:\n          - os: ubuntu-22.04\n            container: fedora:latest\n\n    steps:\n    - name: packages\n      if: matrix.os == 'ubuntu-22.04'\n      run: sudo dnf -y install cmake gcc lld which flex bison clang clang-tools-extra git\n\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n    - uses: egorpugin/sw-action@master\n\n    - name: build\n      if: github.event_name != 'pull_request' && (matrix.os == 'windows-2022')\n      run: ./sw -static -shared -platform x86,x64 -config d,r build\n    - name: build-pr\n      if: github.event_name == 'pull_request' && (matrix.os == 'windows-2022')\n      run: ./sw build\n\n    - name: build\n      if: github.event_name != 'pull_request' && (matrix.os != 'windows-2022')\n      run: ./sw -static -shared -config d,r build -Dwith-tests=1\n    - name: build-pr\n      if: github.event_name == 'pull_request' && (matrix.os != 'windows-2022')\n      run: ./sw build -Dwith-tests=1\n\n    - name: download test data\n      run: git clone https://github.com/egorpugin/tessdata tessdata_unittest\n\n    - name: copy fonts\n      if: matrix.os != 'windows-2022'\n      run: cp tessdata_unittest/fonts/* test/testing/\n    - name: copy fonts\n      if: matrix.os == 'windows-2022'\n      run: Copy-Item -Path \"tessdata_unittest\\fonts\\*\" -Destination \"test\\testing\" -Recurse\n      shell: pwsh\n\n    - name: test\n      if: github.event_name != 'pull_request' && (matrix.os != 'windows-2022' && matrix.os != 'macos-latest')\n      run: ./sw -static -shared -config \"d,r\" test -Dwith-tests=1 \"-Dskip-tests=lstm,lstm_recode\"\n      continue-on-error: true\n    - name: test\n      if: github.event_name == 'pull_request' && 
(matrix.os != 'windows-2022')\n      run: ./sw test -Dwith-tests=1 \"-Dskip-tests=lstm,lstm_recode\"\n      continue-on-error: true\n\n    - name: test-nightly\n      if: matrix.os != 'windows-2022' && matrix.os != 'macos-latest' && github.event.schedule=='0 0 * * *'\n      run: ./sw -static -shared -config \"d,r\" test -Dwith-tests=1\n      continue-on-error: true\n\n    # windows and macos-latest tests hang here for some reason, investigate\n    #- name: test\n      #if: matrix.os == 'windows-2022' || matrix.os == 'macos-latest'\n      #run: ./sw test -Dwith-tests=1 \"-Dskip-tests=lstm,lstm_recode\"\n      #continue-on-error: true\n\n    - name: Upload Unit Test Results\n      if: always() && matrix.os != 'windows-2022'\n      uses: actions/upload-artifact@v7\n      with:\n        name: Test Results (${{ matrix.os }})\n        path: .sw/test/results.xml\n\n    - name: Publish Test Report\n      if: always() && matrix.os != 'windows-2022'\n      uses: mikepenz/action-junit-report@v6\n      with:\n        check_name: test (${{ matrix.os }})\n        report_paths: .sw/test/results.xml\n        github_token: ${{ secrets.GITHUB_TOKEN }}\n"
  },
  {
    "path": ".github/workflows/unittest-disablelegacy.yml",
    "content": "name: unittest-disablelegacy\n# autotools build on ubuntu, unittests with disabled legacy engine.\n# currently some unittests are failing with disabled legacy engine.\n\non:\n  #push:\n  schedule:\n    - cron: 0 10 * * *\n\njobs:\n  linux:\n    runs-on: ${{ matrix.os }}\n    timeout-minutes: 150\n    strategy:\n      fail-fast: false\n      matrix:\n        compiler: [ g++, clang++-18 ]\n        os: [ ubuntu-24.04 ]\n\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n\n    - name: Install dependencies\n      run: |\n           sudo apt-get update\n           sudo apt-get install autoconf libleptonica-dev libpango1.0-dev -y\n           sudo apt-get install cabextract -y\n           #sudo apt-get install libc++-7-dev libc++abi-7-dev -y\n\n    - name: Setup\n      run: |\n           ./autogen.sh\n\n    - name: Configure\n      run: |\n           ./configure '--disable-shared' '--disable-legacy' 'CXX=${{ matrix.compiler }}'\n\n    - name: Make and Install Tesseract\n      run: |\n           make\n           sudo make install\n\n    - name: Make and Install Training Tools\n      run: |\n           make training\n           sudo make training-install\n\n    - name: Display Version\n      run: |\n           ${{ matrix.compiler }} --version\n           tesseract -v\n           lstmtraining -v\n           text2image -v\n      if: success() || failure()\n\n    - name: Download fonts, tessdata and langdata required for tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           mv tessdata_unittest/* ../\n\n    - name: Run Tesseract on phototest.tif and devatest.png\n      run: |\n           tesseract test/testing/phototest.tif -  --tessdata-dir ../tessdata\n           tesseract test/testing/devatest.png - -l hin+eng  --tessdata-dir ../tessdata\n\n    - name: Make and run Unit Tests\n      run: |\n
           make check\n\n    - name: Display Unit Tests Report\n      run: |\n           git log -3\n           ${{ matrix.compiler }} --version\n           cat test-suite.log\n      if: always()\n"
  },
  {
    "path": ".github/workflows/unittest-macos.yml",
    "content": "name: unittest-macos\n# autotools build on homebrew. unittests with address sanitizers. with openmp.\non:\n  #push:\n  schedule:\n    - cron: 0 0 * * *\n\njobs:\n  sanitizers:\n    name: ${{ matrix.config.name }}\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { name: macos-arm-14-clang-unittest, os: macos-14, cxx: clang++ } # Apple silicon\n          - { name: macos-latest-clang-unittest, os: macos-latest, cxx: clang++ }\n          - { name: macos-latest-gcc-unittest, os: macos-latest, cxx: g++ }\n\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n\n    - name: Install dependencies (macOS Homebrew)\n      run: |\n           brew install autoconf automake cabextract libtool\n           brew install leptonica libarchive pango\n           if ! brew list icu4c &>/dev/null; then\n             brew install icu4c\n           fi\n           if ! brew list curl &>/dev/null; then\n             brew install curl\n           fi\n\n    - name: Setup\n      run: |\n           ./autogen.sh\n\n    - name: Configure (macOS Homebrew)\n      run: |\n           ./configure '--disable-shared' '--with-pic' \\\n                'CXX=${{ matrix.config.cxx }}' \\\n                'CXXFLAGS=-g -O2 -fsanitize=address,undefined'\n\n    - name: Make and Install Tesseract\n      run: |\n           make\n           sudo make install\n\n    - name: Make and Install Training Tools\n      run: |\n           make training\n           sudo make training-install\n\n    - name: Display Tesseract and Training Tools Version\n      run: |\n           tesseract -v\n           lstmtraining -v\n           text2image -v\n      if: success() || failure()\n\n    - name: Download fonts, tessdata and langdata required for tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           
mv tessdata_unittest/* ../\n\n    - name: Run Tesseract on phototest.tif and devatest.png\n      run: |\n           tesseract test/testing/phototest.tif -  --tessdata-dir ../tessdata\n           tesseract test/testing/devatest.png - -l hin+eng  --tessdata-dir ../tessdata\n\n    - name: Make and run Unit Tests\n      run: |\n           make check\n\n    - name: Display Unit Tests Report and compiler version\n      run: |\n           cat test-suite.log\n           ${{ matrix.config.cxx }} --version\n           git log -3 --pretty=format:'%h %ad %s | %an'\n      if: always()\n"
  },
  {
    "path": ".github/workflows/unittest.yml",
    "content": "name: unittest\n# autotools build on ubuntu. unittests with address sanitizers. with openmp.\n# ubuntu-20.04-gcc-unittest - CI runs out of diskspace.\non:\n  #push:\n  pull_request:\n    paths:\n      - '**.cpp'\n      - '**.h'\n      - '**Makefile.am'\n      - '/configure.ac'\n      - 'unittest/**.c'\n      - 'unittest/**.cc'\n  schedule:\n    - cron: 0 0 * * *\n  workflow_dispatch:\n\njobs:\n  sanitizers:\n    name: ${{ matrix.config.name }}\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { name: ubuntu-24.04-gcc-unittest, os: ubuntu-24.04, cxx: g++, cxxflags: '-g -O2 -fsanitize=address,undefined' }\n          - { name: ubuntu-22.04-clang-unittest, os: ubuntu-22.04, cxx: clang++, cxxflags: '-g -O2 -fsanitize=address,undefined -stdlib=libc++' }\n    steps:\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n\n    - name: Remove Homebrew, Android and .NET to provide more disk space\n      run: |\n           # https://github.com/actions/virtual-environments/issues/2606#issuecomment-772683150\n           sudo rm -rf /home/linuxbrew # will release Homebrew\n           sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android\n           sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET\n\n    - name: Install dependencies (Linux)\n      run: |\n           sudo apt-get update\n           sudo apt-get install autoconf libleptonica-dev libpango1.0-dev -y\n           sudo apt-get install cabextract -y\n\n    - name: Setup\n      run: |\n           ./autogen.sh\n\n    - name: Configure (Linux)\n      run: |\n           ./configure '--disable-shared' 'CXX=${{ matrix.config.cxx }}' \\\n               'CXXFLAGS=${{ matrix.config.cxxflags }}'\n\n    - name: Make and Install Tesseract\n      run: |\n           ${{ matrix.config.cxx }} --version\n           make\n           sudo make install\n\n    - name: 
Make and Install Training Tools\n      run: |\n           make training\n           sudo make training-install\n\n    - name: Display Tesseract and Training Tools Version\n      run: |\n           tesseract -v\n           lstmtraining -v\n           text2image -v\n      if: success() || failure()\n\n    - name: Download fonts, tessdata and langdata required for tests\n      run: |\n           git clone https://github.com/egorpugin/tessdata tessdata_unittest\n           cp tessdata_unittest/fonts/* test/testing/\n           mv tessdata_unittest/* ../\n\n    - name: Run Tesseract on phototest.tif and devatest.png\n      run: |\n           tesseract test/testing/phototest.tif -  --tessdata-dir ../tessdata\n           tesseract test/testing/devatest.png - -l hin+eng  --tessdata-dir ../tessdata\n\n    - name: Make and run Unit Tests\n      run: |\n           make check\n\n    - name: Display Unit Tests Report and Compiler Version\n      run: |\n           cat test-suite.log\n           ${{ matrix.config.cxx }} --version\n           git log -3 --pretty=format:'%h %ad %s | %an'\n      if: always()\n"
  },
  {
    "path": ".github/workflows/vcpkg.yml",
    "content": "name: vcpkg\n# build and test of tesseract on windows using vcpkg and cmake.\n# vcpkg with -head does not work. https://github.com/microsoft/vcpkg/issues/16019\non:\n  #push:\n  schedule:\n    - cron: 0 23 * * *\n  workflow_dispatch:\n\njobs:\n  build:\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        os: [windows-latest]\n\n    steps:\n      - name: Checkout Tesseract Source (--head from main branch)\n        uses: actions/checkout@v6\n        with:\n          submodules: recursive\n\n     # - name: Visual Studio Setup\n     #   shell: cmd\n     #   run: |\n     #        call \"C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat\"\n\n      - name: Install vcpkg\n        run: |\n             git clone https://github.com/microsoft/vcpkg\n             vcpkg/bootstrap-vcpkg.bat\n             vcpkg/vcpkg integrate install\n\n      - name: Build and Install Leptonica and image libraries using vcpkg\n        run: |\n             vcpkg/vcpkg install leptonica:x64-windows\n\n      - name: Configure and Build Tesseract (--head from main branch) with cmake\n        run: |\n             cmake . 
-B build -DCMAKE_BUILD_TYPE=Release -DSW_BUILD=OFF -DOPENMP_BUILD=OFF -DBUILD_TRAINING_TOOLS=OFF \"-DCMAKE_TOOLCHAIN_FILE=${env:GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake\"\n             cmake --build build --config Release --target install\n\n      - name: Display Tesseract Version\n        run: |\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe --version\n\n      - name: Create CMakeLists.txt file for basicapitest\n        shell: bash\n        run: |\n             cd test\n             cat << \"EOF\" > CMakeLists.txt\n             cmake_minimum_required(VERSION 3.19)\n             project( basicapitest )\n             find_package( Tesseract REQUIRED )\n             find_package( Leptonica REQUIRED )\n             include_directories(${Tesseract_INCLUDE_DIRS})\n             include_directories(${Leptonica_INCLUDE_DIRS})\n             add_executable( basicapitest testing/basicapitest.cpp )\n             target_link_libraries(basicapitest ${Leptonica_LIBRARIES})\n             target_link_libraries(basicapitest Tesseract::libtesseract)\n             add_library(libtesseract UNKNOWN IMPORTED)\n             set_property(TARGET libtesseract PROPERTY IMPORTED_LOCATION D:/a/tesseract/tesseract/build/Release/tesseract50.lib)\n             target_link_libraries(basicapitest Tesseract::libtesseract)\n             EOF\n             cat CMakeLists.txt\n\n      - name: Configure basicapitest\n        run: |\n             cd test\n             cmake . \"-DCMAKE_TOOLCHAIN_FILE=${env:GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake\"\n\n      - name: Build basicapitest\n        run: |\n             cd test\n             cmake --build .  
--config Release\n\n      - name: Download tessdata and image files used for tests\n        run: |\n             git clone https://github.com/egorpugin/tessdata tessdata_unittest\n             mv tessdata_unittest/* ../\n\n      - name: Run basicapitest\n        run: |\n             cd test\n             D:\\a\\tesseract\\tesseract\\test\\Release\\basicapitest.exe\n\n      - name: Run Tesseract CLI on test images in different languages\n        run: |\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  test\\testing\\phototest.tif - --oem 1  --tessdata-dir ..\\tessdata\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  test\\testing\\raaj.tif - -l hin --oem 1   --tessdata-dir ..\\tessdata\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  test\\testing\\viet.tif - -l vie --oem 1   --tessdata-dir ..\\tessdata\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  test\\testing\\hebrew.png - -l heb --oem 1   --tessdata-dir ..\\tessdata\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  test\\testing\\eurotext.tif - -l fra --oem 1 --tessdata-dir ..\\tessdata_best\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  test\\testing\\arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ..\\tessdata\n\n      - name: List languages in different test tessdata-dir\n        run: |\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  --list-langs --tessdata-dir ..\\tessdata\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  --list-langs --tessdata-dir ..\\tessdata_best\n             D:\\a\\tesseract\\tesseract\\build\\bin\\Release\\tesseract.exe  --list-langs --tessdata-dir ..\\tessdata_fast\n"
  },
  {
    "path": ".gitignore",
    "content": "*~\n# Windows\n*.user.*\n*.idea*\n*.log\n*.tlog\n*.cache\n*.obj\n*.sdf\n*.opensdf\n*.lastbuildstate\n*.unsuccessfulbuild\n*.suo\n*.res\n*.ipch\n*.manifest\n\n# Linux\n# ignore local configuration\nconfig.*\nconfig/*\nMakefile\nMakefile.in\n*.m4\n\n# ignore help scripts/files\nconfigure\nlibtool\nstamp-h1\ntesseract.pc\nconfig_auto.h\n/doc/html/*\n/doc/*.1\n/doc/*.5\n/doc/*.html\n/doc/*.xml\n\n# generated version file\n/include/tesseract/version.h\n\n# executables\n/tesseract\n/src/training/ambiguous_words\n/src/training/classifier_tester\n/src/training/cntraining\n/src/training/combine_tessdata\n/src/training/dawg2wordlist\n/src/training/merge_unicharsets\n/src/training/mftraining\n/src/training/set_unicharset_properties\n/src/training/shapeclustering\n/src/training/text2image\n/src/training/unicharset_extractor\n/src/training/wordlist2dawg\n\n*.patch\n\n# files generated by libtool\n/src/training/combine_lang_model\n/src/training/lstmeval\n/src/training/lstmtraining\n\n# ignore compilation files\nbuild/*\n/bin\n/cmake-*\n.deps\n.dirstamp\n/.libs\n*/.libs/*\n*/*/.deps/*\n*/*/.libs/*\n*.lo\n*.la\n*.o\n*.Plo\n*.a\n*.class\n*.jar\n__pycache__\n\n# tessdata\n*.traineddata\ntessdata_*\n\n# build dirs\n/build*\n/*.dll\n/*.lib\n/*.exe\n/*.lnk\n/win*\n.vs*\n.s*\n\n# files generated by \"make check\"\n/tests/.dirstamp\n/unittest/*.trs\n/unittest/tmp/*\n\n# test programs\n/unittest/*_test\n/unittest/primesbitvector\n/unittest/primesmap\n\n# generated files from unlvtests\ntimes.txt\n/unlvtests/results*\n\n# snap packaging specific rules\n/parts/\n/stage/\n/prime/\n/snap/.snapcraft/\n\n/*.snap\n/*_source.tar.bz2\n\n# CodeQL and build artifacts\n_codeql_detected_source_root\ninstall-sh\n"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"googletest\"]\n\tpath = unittest/third_party/googletest\n\turl = https://github.com/google/googletest.git\n[submodule \"test\"]\n\tpath = test\n\turl = https://github.com/tesseract-ocr/test.git\n"
  },
  {
    "path": ".mailmap",
    "content": "Amit Dovev <amitdev2222@gmail.com>\n\nEgor Pugin <egor.pugin@gmail.com>\n\nJeff Breidenbach <breidenbach@gmail.com> <jbreiden@google.com>\n\nJim O'Regan <joregan@gmail.com> <joregan@gmail.com@d0cd1f9f-072b-0410-8dd7-cf729c803f20>\nJim O'Regan <joregan@gmail.com> <joregan@d0cd1f9f-072b-0410-8dd7-cf729c803f20>\n\nRay Smith <rays@google.com> <theraysmith@gmail.com>\nRay Smith <rays@google.com> <rays@rays.lon.corp.google.com>\nRay Smith <rays@google.com> <rays@rays-glaptop.roam.corp.google.com>\nRay Smith <rays@google.com> <theraysmith@gmail.com@d0cd1f9f-072b-0410-8dd7-cf729c803f20>\nRay Smith <rays@google.com> <theraysmith@d0cd1f9f-072b-0410-8dd7-cf729c803f20>\n\nShree Devi Kumar <5095331+Shreeshrii@users.noreply.github.com>\n\nStefan Weil <sw@weilnetz.de> <sw@weil.de>\nStefan Weil <sw@weilnetz.de> <stefan@v2201612906741603.powersrv.de>\nStefan Weil <sw@weilnetz.de> <stefan.weil@bib.uni-mannheim.de>\nStefan Weil <sw@weilnetz.de> <stweil@ub-backup.bib.uni-mannheim.de>\nStefan Weil <sw@weilnetz.de> <stweil@ub-blade-02.bib.uni-mannheim.de>\n\nZdenko Podobný <zdenop@gmail.com> <zdenko.podobny@nbazp1.SPS>\nZdenko Podobný <zdenop@gmail.com> <zdenop@gmail.com@d0cd1f9f-072b-0410-8dd7-cf729c803f20>\nZdenko Podobný <zdenop@gmail.com> <zdenop@d0cd1f9f-072b-0410-8dd7-cf729c803f20>\n"
  },
  {
    "path": "AUTHORS",
    "content": "Ray Smith (lead developer) <theraysmith@gmail.com>\nAhmad Abdulkader\nRika Antonova\nNicholas Beato\nJeff Breidenbach\nSamuel Charron\nPhil Cheatle\nSimon Crouch\nDavid Eger\nSheelagh Huddleston\nDan Johnson\nRajesh Katikam\nThomas Kielbus\nDar-Shyang Lee\nZongyi (Joe) Liu\nRobert Moss\nChris Newton\nMichael Reimer\nMarius Renn\nRaquel Romano\nChristy Russon\nShobhit Saxena\nMark Seaman\nFaisal Shafait\nHiroshi Takenaka\nRanjith Unnikrishnan\nJoern Wanke\nPing Ping Xiu\nAndrew Ziem\nOscar Zuniga\n\nCommunity Contributors:\nZdenko Podobný (Maintainer)\nJim Regan (Maintainer)\nJames R Barlow\nStefan Brechtken\nThomas Breuel\nAmit Dovev\nMartin Ettl\nShree Devi Kumar\nNoah Metzger\nTom Morris\nTobias Müller\nEgor Pugin\nRobert Sachunsky\nRaf Schietekat\nSundar M. Vaidya\nRobin Watts\nStefan Weil\nNick White\nAlexander Zaitsev\n"
  },
  {
    "path": "CITATIONS.bib",
    "content": "@inproceedings{TableDetect,\n  author = {Faisal Shafait and Ray Smith},\n  booktitle = {Document Analysis Systems},\n  editor = {David S. Doermann and Venu Govindaraju and Daniel P. Lopresti and Premkumar Natarajan},\n  pages = {65--72},\n  publisher = {ACM},\n  series = {ACM International Conference Proceeding Series},\n  title = {Table detection in heterogeneous documents.},\n  url = {http://dblp.uni-trier.de/db/conf/das/das2010.html#ShafaitS10},\n  year = 2010,\n  isbn = {978-1-60558-773-8},\n  date = {2010-07-07}\n}\n\n@inproceedings{Multilingual,\n  author = {Ray Smith and Daria Antonova and Dar-Shyang Lee},\n  booktitle = {MOCR '09: Proceedings of the International Workshop on Multilingual OCR},\n  editor = {Venu Govindaraju and Premkumar Natarajan and Santanu Chaudhury and Daniel P. Lopresti},\n  pages = {1--8},\n  publisher = {ACM},\n  series = {ACM International Conference Proceeding Series},\n  title = {Adapting the Tesseract Open Source OCR Engine for Multilingual OCR.},\n  url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/35248.pdf},\n  year = 2009,\n  isbn = {978-1-60558-698-4},\n  date = {2009-07-25},\n  doi = {http://doi.acm.org/10.1145/1577802.1577804},\n  location = {Barcelona, Spain},\n}\n\n@inproceedings{ScriptDetect,\n  author = {Ranjith Unnikrishnan and Ray Smith},\n  title = {Combined Orientation and Script Detection using the Tesseract OCR Engine},\n  booktitle = {MOCR '09: Proceedings of the International Workshop on Multilingual OCR},\n  editor = {Venu Govindaraju and Premkumar Natarajan and Santanu Chaudhury and Daniel P. 
Lopresti},\n  url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/35506.pdf},\n  year = {2009},\n  isbn = {978-1-60558-698-4},\n  pages = {1--7},\n  location = {Barcelona, Spain},\n  doi = {http://doi.acm.org/10.1145/1577802.1577809},\n  publisher = {ACM},\n  address = {New York, NY, USA},\n}\n\n@inproceedings{PageLayout,\n  author = {Ray Smith},\n  title = {Hybrid Page Layout Analysis via Tab-Stop Detection},\n  booktitle = {ICDAR '09: Proceedings of the 2009 10th International Conference on Document Analysis and Recognition},\n  url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/35094.pdf},\n  year = {2009},\n  isbn = {978-0-7695-3725-2},\n  pages = {241--245},\n  doi = {http://dx.doi.org/10.1109/ICDAR.2009.257},\n  publisher = {IEEE Computer Society},\n  address = {Washington, DC, USA},\n}\n\n@inproceedings{TessOverview,\n  author = {Ray Smith},\n  title = {An Overview of the Tesseract OCR Engine},\n  booktitle = {ICDAR '07: Proceedings of the Ninth International Conference on Document Analysis and Recognition},\n  url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/33418.pdf},\n  year = {2007},\n  isbn = {0-7695-2822-8},\n  pages = {629--633},\n  publisher = {IEEE Computer Society},\n  address = {Washington, DC, USA},\n}\n\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "#\n# tesseract\n#\n\n# ##############################################################################\n#\n# cmake settings\n#\n# ##############################################################################\n\n# Require CMake 3.18 for modern features like precompiled headers, unity builds, and better target management\ncmake_minimum_required(VERSION 3.18 FATAL_ERROR)\n\n# In-source builds are disabled.\nif(\"${CMAKE_CURRENT_SOURCE_DIR}\" STREQUAL \"${CMAKE_CURRENT_BINARY_DIR}\")\n  message(\n    FATAL_ERROR\n      \"CMake generation is not possible within the source directory!\"\n      \"\\n Remove the CMakeCache.txt file and try again from another folder, \"\n      \"e.g.:\\n \"\n      \"\\n rm CMakeCache.txt\"\n      \"\\n mkdir build\"\n      \"\\n cd build\"\n      \"\\n cmake ..\")\nendif()\n\nset(CMAKE_MODULE_PATH \"${CMAKE_MODULE_PATH};${CMAKE_CURRENT_SOURCE_DIR}/cmake\")\n\nset(EXECUTABLE_OUTPUT_PATH \"${CMAKE_BINARY_DIR}/bin\")\nset(CMAKE_RUNTIME_OUTPUT_DIRECTORY \"${EXECUTABLE_OUTPUT_PATH}\")\n\n# Use solution folders.\nset_property(GLOBAL PROPERTY USE_FOLDERS ON)\nset_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER \"CMake Targets\")\n\nif(NOT ${CMAKE_VERSION} VERSION_LESS \"3.15.0\")\n  if(WIN32)\n    cmake_policy(SET CMP0091 NEW)\n    message(STATUS \"Setting policy CMP0091 to NEW\")\n  endif()\nendif()\n\n# ##############################################################################\n#\n# project settings\n#\n# ##############################################################################\n\nproject(tesseract C CXX)\n\n# Get version with components from VERSION file.\nfile(STRINGS \"VERSION\" VERSION_PLAIN)\nstring(REGEX REPLACE \"^([^.]*)\\\\..*\" \"\\\\1\" VERSION_MAJOR ${VERSION_PLAIN})\nstring(REGEX REPLACE \"^[^.]*\\\\.([^.]*)\\\\..*\" \"\\\\1\" VERSION_MINOR\n                     ${VERSION_PLAIN})\nstring(REGEX REPLACE \"^[^.]*\\\\.[^.]*\\\\.([0-9]*).*\" \"\\\\1\" VERSION_PATCH\n                     
${VERSION_PLAIN})\nif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.git)\n  execute_process(COMMAND git --git-dir ${CMAKE_CURRENT_SOURCE_DIR}/.git\n                          describe --abbrev=4 OUTPUT_VARIABLE GIT_REV)\n  string(REGEX REPLACE \"\\n$\" \"\" PACKAGE_VERSION \"${GIT_REV}\")\nendif()\nif(NOT PACKAGE_VERSION)\n  set(PACKAGE_VERSION ${VERSION_PLAIN})\nendif()\n\n# Provide also same macro names as autoconf (see configure.ac).\nset(GENERIC_MAJOR_VERSION ${VERSION_MAJOR})\nset(GENERIC_MINOR_VERSION ${VERSION_MINOR})\nset(GENERIC_MICRO_VERSION ${VERSION_PATCH})\n\nset(MINIMUM_LEPTONICA_VERSION 1.74)\n\n# ##############################################################################\n#\n# options\n#\n# ##############################################################################\n\nmessage(STATUS \"Configuring tesseract version ${PACKAGE_VERSION}...\")\n\nif(WIN32)\n  option(SW_BUILD \"Build with sw\" ON)\nelse()\n  option(SW_BUILD \"Build with sw\" OFF)\nendif()\n\n# Apple's toolchains dont populate CMAKE_SYSTEM_PROCESSOR when crosscompiling\n# instead it uses CMAKE_OSX_ARCHITECTURES which can contain multiple architectures\nif(APPLE AND NOT CMAKE_SYSTEM_PROCESSOR AND CMAKE_OSX_ARCHITECTURES)\n  list(LENGTH CMAKE_OSX_ARCHITECTURES NO_OF_ARCH)\n  if(NO_OF_ARCH GREATER 1)\n    message(FATAL_ERROR \"Apple's universal build for ${NO_OF_ARCH} architectures not supported. 
CMAKE_OSX_ARCHITECTURES: ${CMAKE_OSX_ARCHITECTURES}\")\n  endif()\n  set(CMAKE_SYSTEM_PROCESSOR \"${CMAKE_OSX_ARCHITECTURES}\" CACHE STRING \"Target processor\" FORCE)\n  message(STATUS \"CMAKE_SYSTEM_PROCESSOR set to '${CMAKE_SYSTEM_PROCESSOR}' from CMAKE_OSX_ARCHITECTURES\")\nendif()\n\noption(OPENMP_BUILD \"Build with openmp support\" OFF) # see issue #1662\noption(GRAPHICS_DISABLED \"Disable graphics (ScrollView)\" OFF)\noption(DISABLED_LEGACY_ENGINE \"Disable the legacy OCR engine\" OFF)\noption(ENABLE_LTO \"Enable link-time optimization\" OFF)\noption(FAST_FLOAT \"Enable float for LSTM\" ON)\noption(ENABLE_NATIVE\n       \"Enable optimization for host CPU (could break HW compatibility)\" OFF)\n# see\n# https://stackoverflow.com/questions/52653025/why-is-march-native-used-so-rarely\noption(BUILD_TRAINING_TOOLS \"Build training tools\" ON)\noption(BUILD_TESTS \"Build tests\" OFF)\noption(USE_SYSTEM_ICU \"Use system ICU\" OFF)\noption(DISABLE_TIFF \"Disable build with libtiff (if available)\" OFF)\noption(DISABLE_ARCHIVE \"Disable build with libarchive (if available)\" OFF)\noption(DISABLE_CURL \"Disable build with libcurl (if available)\" OFF)\noption(INSTALL_CONFIGS \"Install tesseract configs\" ON)\n\n# Build optimization options\noption(ENABLE_UNITY_BUILD \"Enable Unity/Jumbo builds for faster compilation\" OFF)\noption(ENABLE_PRECOMPILED_HEADERS \"Enable precompiled headers for faster compilation\" ON)\noption(ENABLE_CCACHE \"Enable ccache for faster incremental builds\" ON)\noption(ENABLE_NINJA_POOL \"Enable Ninja job pools to manage parallelism\" ON)\n\n\n# ##############################################################################\n#\n# compiler and linker\n#\n# ##############################################################################\n\nif(CMAKE_CXX_COMPILER_ID MATCHES \"Clang\")\n  set(CLANG 1)\nendif()\n\nif(NOT CMAKE_BUILD_TYPE)\n  message(STATUS \"Setting build type to 'Release' as none was specified.\")\n  set(CMAKE_BUILD_TYPE\n     
 Release\n      CACHE STRING \"Choose the type of build.\" FORCE)\n  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS \"Debug\" \"Release\")\nendif()\n\ninclude(CheckCXXCompilerFlag)\n\nset(CMAKE_CXX_STANDARD 17)\nif(\"cxx_std_20\" IN_LIST CMAKE_CXX_COMPILE_FEATURES)\n  set(CMAKE_CXX_STANDARD 20)\nendif()\nset(CMAKE_CXX_STANDARD_REQUIRED ON)\nif(NOT CMAKE_CXX_COMPILER_ID STREQUAL \"GNU\")\n  # cygwin gnu c++ needs to use -std=gnu++17 instead of -std=c++17\n  set(CMAKE_CXX_EXTENSIONS OFF)\nendif()\n\nif(BUILD_SHARED_LIBS)\n  set(CMAKE_CXX_VISIBILITY_PRESET hidden)\nendif()\n\n# LTO\ncmake_policy(SET CMP0069 NEW)\ninclude(CheckIPOSupported)\ncheck_ipo_supported(RESULT LTO_SUPPORTED OUTPUT error)\nif(LTO_SUPPORTED)\n  message(STATUS \"IPO / LTO supported\")\nelse()\n  message(STATUS \"IPO / LTO not supported: <${error}>\")\nendif()\n\nset(MARCH_NATIVE_OPT OFF)\nif(ENABLE_NATIVE)\n  check_cxx_compiler_flag(\"-march=native\" COMPILER_SUPPORTS_MARCH_NATIVE)\n  if(COMPILER_SUPPORTS_MARCH_NATIVE)\n    set(DOTPRODUCT_FLAGS \"${DOTPRODUCT_FLAGS} -march=native\")\n    if(NOT CLANG AND MSVC)\n      # clang-cl does not know this argument\n      set(DOTPRODUCT_FLAGS \"${DOTPRODUCT_FLAGS} -mtune=native\")\n    endif()\n    set(MARCH_NATIVE_OPT ON)\n  endif(COMPILER_SUPPORTS_MARCH_NATIVE)\nendif(ENABLE_NATIVE)\n\nmessage(STATUS \"CMAKE_SYSTEM_PROCESSOR=<${CMAKE_SYSTEM_PROCESSOR}>\")\n\nif(CMAKE_SYSTEM_PROCESSOR MATCHES \"x86|x86_64|AMD64|amd64|i386|i686\")\n\n  set(HAVE_NEON FALSE)\n  if(MSVC)\n    set(HAVE_AVX ON)\n    set(AVX_COMPILE_FLAGS \"/arch:AVX\")\n    add_definitions(\"-DHAVE_AVX\")\n\n    set(HAVE_AVX2 ON)\n    set(AVX2_COMPILE_FLAGS \"/arch:AVX2\")\n    add_definitions(\"-DHAVE_AVX2\")\n\n    set(HAVE_AVX512F ON)\n    set(AVX512F_COMPILE_FLAGS \"/arch:AVX512\")\n    add_definitions(\"-DHAVE_AVX512F\")\n\n    set(HAVE_FMA ON)\n    set(FMA_COMPILE_FLAGS \"-D__FMA__\")\n    add_definitions(\"-DHAVE_FMA\")\n\n    set(HAVE_SSE4_1 ON)\n    set(SSE4_1_COMPILE_FLAGS 
\"-D__SSE4_1__\")\n    add_definitions(\"-DHAVE_SSE4_1\")\n\n    set(DOTPRODUCT_FLAGS \"${DOTPRODUCT_FLAGS} -openmp:experimental\")\n    add_definitions(\"-DOPENMP_SIMD\")\n\n    # clang with MSVC compatibility\n    if(CLANG)\n      set(CMAKE_CXX_FLAGS\n          \"${CMAKE_CXX_FLAGS} -Wno-microsoft-unqualified-friend\")\n      if(HAVE_FMA)\n        set(FMA_COMPILE_FLAGS \"-mfma ${FMA_COMPILE_FLAGS}\")\n      endif(HAVE_FMA)\n      if(HAVE_SSE4_1)\n        set(SSE4_1_COMPILE_FLAGS \"-msse4.1 ${SSE4_1_COMPILE_FLAGS}\")\n      endif(HAVE_SSE4_1)\n    endif(CLANG)\n  else() # if not MSVC\n    check_cxx_compiler_flag(\"-mavx\" HAVE_AVX)\n    if(HAVE_AVX)\n      set(AVX_COMPILE_FLAGS \"-mavx\")\n      add_definitions(\"-DHAVE_AVX\")\n    endif(HAVE_AVX)\n\n    check_cxx_compiler_flag(\"-mavx2\" HAVE_AVX2)\n    if(HAVE_AVX2)\n      set(AVX2_COMPILE_FLAGS \"-mavx2\")\n      add_definitions(\"-DHAVE_AVX2\")\n    endif()\n\n    check_cxx_compiler_flag(\"-mavx512f\" HAVE_AVX512F)\n    if(HAVE_AVX512F)\n      set(AVX512F_COMPILE_FLAGS \"-mavx512f\")\n      add_definitions(\"-DHAVE_AVX512F\")\n    endif()\n\n    check_cxx_compiler_flag(\"-mfma\" HAVE_FMA)\n    if(HAVE_FMA)\n      set(FMA_COMPILE_FLAGS \"-mfma\")\n      add_definitions(\"-DHAVE_FMA\")\n    endif()\n\n    check_cxx_compiler_flag(\"-msse4.1\" HAVE_SSE4_1)\n    if(HAVE_SSE4_1)\n      set(SSE4_1_COMPILE_FLAGS \"-msse4.1\")\n      add_definitions(\"-DHAVE_SSE4_1\")\n    endif()\n\n    check_cxx_compiler_flag(\"-fopenmp-simd\" OPENMP_SIMD)\n    if(OPENMP_SIMD)\n      set(DOTPRODUCT_FLAGS \"${DOTPRODUCT_FLAGS} -fopenmp-simd\")\n      add_definitions(\"-DOPENMP_SIMD\")\n    endif(OPENMP_SIMD)\n  endif(MSVC)\n\nelseif(CMAKE_SYSTEM_PROCESSOR MATCHES \"arm64|aarch64.*|AARCH64.*\")\n\n  set(HAVE_AVX FALSE)\n  set(HAVE_AVX2 FALSE)\n  set(HAVE_AVX512F FALSE)\n  set(HAVE_FMA FALSE)\n  set(HAVE_SSE4_1 FALSE)\n  set(HAVE_NEON TRUE)\n\nelseif(CMAKE_SYSTEM_PROCESSOR MATCHES \"arm.*\")\n\n  set(HAVE_AVX FALSE)\n  set(HAVE_AVX2 
FALSE)\n  set(HAVE_AVX512F FALSE)\n  set(HAVE_FMA FALSE)\n  set(HAVE_SSE4_1 FALSE)\n\n  check_cxx_compiler_flag(\"-mfpu=neon\" HAVE_NEON)\n  if(HAVE_NEON)\n    set(NEON_COMPILE_FLAGS \"-mfpu=neon\")\n  endif(HAVE_NEON)\n\nelse()\n\n  set(HAVE_AVX FALSE)\n  set(HAVE_AVX2 FALSE)\n  set(HAVE_AVX512F FALSE)\n  set(HAVE_FMA FALSE)\n  set(HAVE_NEON FALSE)\n  set(HAVE_SSE4_1 FALSE)\n\nendif(CMAKE_SYSTEM_PROCESSOR MATCHES \"x86|x86_64|AMD64|amd64|i386|i686\")\n\nif(HAVE_NEON)\n  message(STATUS \"LTO build is not supported on arm/RBPi.\")\n  set(ENABLE_LTO FALSE)  # enable LTO cause fatal error on arm/RBPi\nendif()\n\n# Compiler specific environment\nif(CMAKE_COMPILER_IS_GNUCXX OR MINGW)\n  set(CMAKE_CXX_FLAGS_DEBUG\n      \"${CMAKE_CXX_FLAGS_DEBUG} -Wall -DDEBUG -pedantic -Og -Wno-unknown-pragmas\")\nelseif(MSVC)\n  add_definitions(-D_CRT_SECURE_NO_WARNINGS)\n  add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) # strdup\n  add_definitions(-D_USE_MATH_DEFINES) # Enable M_PI and other math constants\n  add_definitions(-DNOMINMAX) # Prevent min/max macro conflicts\n  set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} /utf-8\")\n  if(NOT CLANG)\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} /MP\")\n  endif()\n  # Hide some warnings for release target wd4244 'argument': conversion from\n  # 'uint64_t' to 'unsigned int', possible loss of data wd4251 needs to have\n  # dll-interface wd4267 return': conversion from 'size_t' to 'int', possible\n  # loss of data wd4275 non dll-interface class wd4305 ...truncation from\n  # 'double' to 'float'\n  set(CMAKE_CXX_FLAGS_RELEASE\n      \"${CMAKE_CXX_FLAGS_RELEASE} /wd4244 /wd4305 /wd4267 /wd4251 /wd4275 /wd4005\"\n  )\n  set(CMAKE_CXX_FLAGS_RELEASE \"${CMAKE_CXX_FLAGS_RELEASE} /wd4068\")\n  # Don't use /Wall because it generates too many warnings.\n  set(CMAKE_CXX_FLAGS_DEBUG \"${CMAKE_CXX_FLAGS_DEBUG} /W0 /bigobj\")\n  # MT flag\n  if(WIN32_MT_BUILD)\n    set(CMAKE_MSVC_RUNTIME_LIBRARY \"MultiThreaded$<$<CONFIG:Debug>:Debug>\")\n    message(STATUS 
\"Building with static CRT.\")\n  endif()\n  # Workaround: When building on VS 2022 17.10 or newer, but using an older runtime,\n  # mutexes can crash\n  # https://stackoverflow.com/questions/78598141/first-stdmutexlock-crashes-in-application-built-with-latest-visual-studio\n  add_definitions(-D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)\nendif()\nif(CLANG) # clang all platforms\n  set(CMAKE_CXX_FLAGS_RELEASE\n      \"${CMAKE_CXX_FLAGS_RELEASE} -Wno-unused-command-line-argument\")\n  set(CMAKE_CXX_FLAGS_DEBUG\n      \"${CMAKE_CXX_FLAGS_DEBUG} -Wall -DDEBUG -pedantic -O0\")\nendif()\n\nif(OPENMP_BUILD\n   AND MSVC\n   AND \"${MSVC_VERSION}\" LESS 1929)\n  set(OPENMP_BUILD OFF)\nendif()\nif(OPENMP_BUILD)\n  if(MSVC)  # supported from cmake 3.30\n    set(OpenMP_RUNTIME_MSVC \"llvm\")\n  endif(MSVC)\n  find_package(OpenMP)\n  # https://stackoverflow.com/questions/12399422\n  # how-to-set-linker-flags-for-openmp-in-cmakes-try-compile-function\n  if(NOT OpenMP_FOUND\n     AND CLANG\n     AND WIN32)\n    # workaround because find_package(OpenMP) does not work for clang-cl\n    # https://gitlab.kitware.com/cmake/cmake/issues/19404\n    check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE)\n    find_library(OpenMP_LIBRARY NAMES omp libomp.lib)\n    message(\">> OpenMP_LIBRARY: ${OpenMP_LIBRARY}\")\n    if(MSVC)\n      set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} /openmp\")\n    else()\n      set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -fopenmp\")\n    endif()\n    set(OpenMP_FOUND 1)\n    # OpenMP 3.1 is fully supported from Clang 3.8.0\n    add_definitions(-D_OPENMP=201107)\n  endif()\n  if(MSVC)\n    # Note: -openmp:llvm is available for X64 from MSVC 16.9 from MSVC 16.10\n    # Preview 2 there is support also for x86 and arm64\n    # https://devblogs.microsoft.com/cppblog/openmp-updates-and-fixes-for-cpp-in-visual-studio-2019-16-10/\n    if(\"${OpenMP_CXX_FLAGS}\" STREQUAL \"-openmp\")\n      set(OpenMP_CXX_FLAGS \"-openmp:llvm\")\n    endif()\n  endif()\n  if(OpenMP_FOUND)\n    
message(\">> OpenMP_FOUND ${OpenMP_FOUND} version: ${OpenMP_CXX_VERSION}\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}\")\n    if(NOT TARGET OpenMP::OpenMP_CXX)\n      add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE)\n    endif()\n  endif()\nendif()\n\nif(CYGWIN)\n  add_definitions(-D__CYGWIN__)\nelseif(UNIX)\n  if(NOT ANDROID)\n    set(LIB_pthread pthread)\n  endif()\nelseif(WIN32)\n  set(LIB_Ws2_32 Ws2_32)\nendif()\n\nadd_definitions(\"-DCMAKE_BUILD\")\n\n# ##############################################################################\n#\n# Build optimizations\n#\n# ##############################################################################\n\n# Setup ccache if available and enabled\nif(ENABLE_CCACHE)\n  find_program(CCACHE_PROGRAM ccache)\n  if(CCACHE_PROGRAM)\n    message(STATUS \"Found ccache: ${CCACHE_PROGRAM}\")\n    set(CMAKE_CXX_COMPILER_LAUNCHER \"${CCACHE_PROGRAM}\")\n    set(CMAKE_C_COMPILER_LAUNCHER \"${CCACHE_PROGRAM}\")\n    # Configure ccache for better performance\n    set(ENV{CCACHE_SLOPPINESS} \"pch_defines,time_macros\")\n    set(ENV{CCACHE_CPP2} \"true\")\n  else()\n    message(STATUS \"ccache not found, disabling ccache support\")\n    set(ENABLE_CCACHE OFF)\n  endif()\nendif()\n\n# Setup Ninja job pools for better resource management\nif(ENABLE_NINJA_POOL AND CMAKE_GENERATOR STREQUAL \"Ninja\")\n  include(ProcessorCount)\n  ProcessorCount(N)\n  if(N GREATER 1)\n    # Use 75% of available cores for compilation, rest for linking\n    math(EXPR COMPILE_JOBS \"${N} * 3 / 4\")\n    math(EXPR LINK_JOBS \"${N} - ${COMPILE_JOBS}\")\n    if(LINK_JOBS LESS 1)\n      set(LINK_JOBS 1)\n    endif()\n\n    set_property(GLOBAL PROPERTY JOB_POOLS \"compile=${COMPILE_JOBS};link=${LINK_JOBS}\")\n    set(CMAKE_JOB_POOL_COMPILE compile)\n    set(CMAKE_JOB_POOL_LINK link)\n    message(STATUS \"Ninja job pools: compile=${COMPILE_JOBS}, link=${LINK_JOBS}\")\n  endif()\nendif()\n\n# 
##############################################################################\n#\n# packages\n#\n# ##############################################################################\ninclude(CheckFunctions)\n\nif(SW_BUILD)\n  find_package(SW REQUIRED)\n  if(BUILD_SHARED_LIBS)\n    set(SW_BUILD_SHARED_LIBS 1)\n  else()\n    set(SW_BUILD_SHARED_LIBS 0)\n  endif()\n  sw_add_package(org.sw.demo.danbloomberg.leptonica\n                 org.sw.demo.libarchive.libarchive)\n  if(BUILD_TRAINING_TOOLS)\n    sw_add_package(org.sw.demo.gnome.pango.pangocairo\n                   org.sw.demo.unicode.icu.i18n)\n  endif()\n  sw_execute()\nelse()\n  find_package(PkgConfig)\n  if(APPLE)\n    if(DEFINED ENV{HOMEBREW_PREFIX})\n      set(HOMEBREW_PREFIX $ENV{HOMEBREW_PREFIX})\n      set(PKG_CONFIG_PATH \"${HOMEBREW_PREFIX}/opt/icu4c/lib/pkgconfig:${HOMEBREW_PREFIX}/opt/libarchive/lib/pkgconfig\")\n      set(ENV{PKG_CONFIG_PATH} \"${PKG_CONFIG_PATH}\")\n    endif()\n  endif()\n  # Check for required library. option -DLeptonica_DIR=path => cmake hint where\n  # to find leptonica\n  find_package(Leptonica ${MINIMUM_LEPTONICA_VERSION} CONFIG)\n  if(NOT Leptonica_FOUND AND PKG_CONFIG_EXECUTABLE)\n    pkg_check_modules(Leptonica lept>=${MINIMUM_LEPTONICA_VERSION})\n    link_directories(${Leptonica_LIBRARY_DIRS})\n  endif()\n  if(NOT Leptonica_FOUND)\n    message(FATAL_ERROR \"Cannot find required library Leptonica. Quitting!\")\n  else()\n    message(STATUS \"Found leptonica version: ${Leptonica_VERSION}\")\n  endif(NOT Leptonica_FOUND)\n  include_directories(${Leptonica_INCLUDE_DIRS})\n\n  check_leptonica_tiff_support()\n  if ((NOT LEPT_TIFF_RESULT EQUAL 0) AND LEPT_TIFF_COMPILE_SUCCESS)\n    message(NOTICE \"Leptonica was build without TIFF support! 
Disabling TIFF support...\")\n    set(DISABLE_TIFF ON)\n  elseif(NOT ${CMAKE_VERSION} VERSION_LESS \"3.25\")\n    message(STATUS \"Leptonica was build with TIFF support.\")\n  endif()\n\n  # Check for optional libraries.\n  if(DISABLE_TIFF)\n    set(HAVE_TIFFIO_H OFF)\n    message(STATUS \"TIFF support disabled.\")\n  else(DISABLE_TIFF)\n    find_package(TIFF) # for tesseract\n    if(NOT TIFF_FOUND AND PKG_CONFIG_EXECUTABLE)\n      # try PKG_CONFIG to find libtiff if cmake failed\n      pkg_check_modules(TIFF libtiff-4)\n    endif()\n    if(TIFF_FOUND)\n      set(HAVE_TIFFIO_H ON)\n      include_directories(${TIFF_INCLUDE_DIRS})\n    endif(TIFF_FOUND)\n  endif(DISABLE_TIFF)\n  if(DISABLE_ARCHIVE)\n    set(HAVE_LIBARCHIVE OFF)\n    message(STATUS \"LibArchive support disabled.\")\n  else(DISABLE_ARCHIVE)\n    find_package(LibArchive)\n    if(NOT LibArchive_FOUND AND PKG_CONFIG_EXECUTABLE)\n      # try PKG_CONFIG to find libarchive if cmake failed\n      pkg_check_modules(LibArchive libarchive)\n    endif()\n    if(LibArchive_FOUND)\n      set(HAVE_LIBARCHIVE ON)\n      include_directories(${LibArchive_INCLUDE_DIRS})\n    endif(LibArchive_FOUND)\n  endif(DISABLE_ARCHIVE)\n  if(DISABLE_CURL)\n    set(HAVE_LIBCURL OFF)\n    message(STATUS \"CURL support disabled.\")\n  else(DISABLE_CURL)\n    find_package(CURL)\n    if(NOT CURL_FOUND AND PKG_CONFIG_EXECUTABLE)\n      # try PKG_CONFIG to find libcurl if cmake failed\n      pkg_check_modules(CURL libcurl)\n    endif()\n    if(CURL_FOUND)\n      set(HAVE_LIBCURL ON)\n      include_directories(${CURL_INCLUDE_DIRS})\n    endif(CURL_FOUND)\n  endif(DISABLE_CURL)\nendif()\n\n# ##############################################################################\n#\n# configure\n#\n# ##############################################################################\n\nif(MSVC)\n  set(DOTPRODUCT_FLAGS \"${DOTPRODUCT_FLAGS} /fp:fast\")\nelse()\n  set(DOTPRODUCT_FLAGS \"${DOTPRODUCT_FLAGS} -O3 -ffast-math\")\nendif()\n\ninclude 
(GNUInstallDirs)\n\nset(AUTOCONFIG_SRC ${CMAKE_CURRENT_BINARY_DIR}/config_auto.h.in)\nset(AUTOCONFIG ${CMAKE_CURRENT_BINARY_DIR}/config_auto.h)\nadd_definitions(-DHAVE_CONFIG_H)\n\nif(GRAPHICS_DISABLED)\n  message(\"ScrollView debugging disabled.\")\nendif()\nset(CMAKE_REQUIRED_INCLUDES\n    ${CMAKE_REQUIRED_INCLUDES} \"${CMAKE_PREFIX_PATH}/include\"\n    ${CMAKE_INSTALL_INCLUDEDIR})\ninclude(Configure)\n\nconfigure_file(${AUTOCONFIG_SRC} ${AUTOCONFIG} @ONLY)\n\nset(INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR})\nset(LIBRARY_DIRS ${CMAKE_INSTALL_LIBDIR})\n\nconfigure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/tesseract/version.h.in\n               ${CMAKE_CURRENT_BINARY_DIR}/include/tesseract/version.h @ONLY)\n\ninclude(CMakePackageConfigHelpers)\ninclude(GenerateExportHeader)\n\n# show summary of configuration\nif(${CMAKE_BUILD_TYPE} MATCHES Debug)\n  set(COMPILER_FLAGS \"${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}\")\nelseif(${CMAKE_BUILD_TYPE} MATCHES Release)\n  set(COMPILER_FLAGS \"${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}\")\n  if(LTO_SUPPORTED AND ENABLE_LTO)\n    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)\n  else()\n    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)\n  endif() # LTO_SUPPORTED\nendif()\n\nif(CMAKE_SIZEOF_VOID_P EQUAL 8)\n  set(BUILD_ARCH \"64 bits\")\nelseif(CMAKE_SIZEOF_VOID_P EQUAL 4)\n  set(BUILD_ARCH \"32 bits\")\nendif()\n\nmessage(STATUS)\nmessage(STATUS \"General configuration for Tesseract ${PACKAGE_VERSION}\")\nmessage(STATUS \"--------------------------------------------------------\")\nmessage(STATUS \"Build type: ${CMAKE_BUILD_TYPE} ${BUILD_ARCH}\")\nmessage(STATUS \"Compiler: ${CMAKE_CXX_COMPILER_ID}\")\nmessage(STATUS \"Compiler version: ${CMAKE_CXX_COMPILER_VERSION}\")\nmessage(STATUS \"Used standard: C++${CMAKE_CXX_STANDARD}\")\nmessage(STATUS \"CXX compiler options: ${COMPILER_FLAGS}\")\nget_directory_property(DirCompDefs COMPILE_DEFINITIONS)\nmessage(STATUS \"Compile definitions = ${DirCompDefs}\")\nmessage(STATUS \"Linker 
options: ${CMAKE_EXE_LINKER_FLAGS} \"\n               \"${CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE_UP}}\")\nmessage(STATUS \"Install directory: ${CMAKE_INSTALL_PREFIX}\")\nmessage(STATUS \"HAVE_AVX: ${HAVE_AVX}\")\nmessage(STATUS \"HAVE_AVX2: ${HAVE_AVX2}\")\nmessage(STATUS \"HAVE_AVX512F: ${HAVE_AVX512F}\")\nmessage(STATUS \"HAVE_FMA: ${HAVE_FMA}\")\nmessage(STATUS \"HAVE_SSE4_1: ${HAVE_SSE4_1}\")\nmessage(STATUS \"MARCH_NATIVE_OPT: ${MARCH_NATIVE_OPT}\")\nmessage(STATUS \"HAVE_NEON: ${HAVE_NEON}\")\nmessage(STATUS \"Link-time optimization: ${CMAKE_INTERPROCEDURAL_OPTIMIZATION}\")\nmessage(STATUS \"--------------------------------------------------------\")\nmessage(STATUS \"Build with sw [SW_BUILD]: ${SW_BUILD}\")\nmessage(STATUS \"Build with openmp support [OPENMP_BUILD]: ${OPENMP_BUILD}\")\nmessage(STATUS \"Build with libarchive support [HAVE_LIBARCHIVE]: \"\n               \"${HAVE_LIBARCHIVE}\")\nmessage(STATUS \"Build with libcurl support [HAVE_LIBCURL]: ${HAVE_LIBCURL}\")\nmessage(STATUS \"Enable float for LSTM [FAST_FLOAT]: ${FAST_FLOAT}\")\nmessage(STATUS \"Enable optimization for host CPU (could break HW compatibility)\"\n               \" [ENABLE_NATIVE]: ${ENABLE_NATIVE}\")\nmessage(STATUS \"Disable graphics (ScrollView) [GRAPHICS_DISABLED]: \"\n               \"${GRAPHICS_DISABLED}\")\nmessage(STATUS \"Disable the legacy OCR engine [DISABLED_LEGACY_ENGINE]: \"\n               \"${DISABLED_LEGACY_ENGINE}\")\nmessage(STATUS \"Build training tools [BUILD_TRAINING_TOOLS]: \"\n               \"${BUILD_TRAINING_TOOLS}\")\nmessage(STATUS \"Build tests [BUILD_TESTS]: ${BUILD_TESTS}\")\nmessage(STATUS \"Use system ICU Library [USE_SYSTEM_ICU]: ${USE_SYSTEM_ICU}\")\nmessage(\n  STATUS \"Install tesseract configs [INSTALL_CONFIGS]: ${INSTALL_CONFIGS}\")\nmessage(STATUS \"--------------------------------------------------------\")\nmessage(STATUS \"Modern build optimizations:\")\nmessage(STATUS \"Unity build [ENABLE_UNITY_BUILD]: 
${ENABLE_UNITY_BUILD}\")\nmessage(STATUS \"Precompiled headers [ENABLE_PRECOMPILED_HEADERS]: ${ENABLE_PRECOMPILED_HEADERS}\")\nmessage(STATUS \"ccache [ENABLE_CCACHE]: ${ENABLE_CCACHE}\")\nif(CMAKE_GENERATOR STREQUAL \"Ninja\")\n  message(STATUS \"Ninja job pools [ENABLE_NINJA_POOL]: ${ENABLE_NINJA_POOL}\")\nelse()\n  message(STATUS \"Ninja job pools [ENABLE_NINJA_POOL]: Disabled (not using Ninja)\")\nendif()\nmessage(STATUS \"--------------------------------------------------------\")\nmessage(STATUS)\n\n# ##############################################################################\n#\n# build\n#\n# ##############################################################################\n\ninclude(BuildFunctions)\ninclude(SourceGroups)\n\nadd_definitions(-D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS=1)\n\ninclude_directories(${CMAKE_CURRENT_BINARY_DIR})\ninclude_directories(${CMAKE_CURRENT_BINARY_DIR}/include)\nif(ANDROID_TOOLCHAIN)\n  include_directories(${ANDROID_TOOLCHAIN}/sysroot/usr/include)\n  add_compile_definitions(__ANDROID_API_FUTURE__)\nendif()\n\n# ##############################################################################\n# LIBRARY tesseract\n# ##############################################################################\n\n# Include source file lists\ninclude(cmake/SourceLists.cmake)\n\n# Build the core source file list\nset(TESSERACT_SRC ${TESSERACT_SRC_CORE})\n\nif(DISABLED_LEGACY_ENGINE)\n  # prepend path to list of source files\n  function(prepend_path srcs path)\n    set(tmp \"\")\n    foreach(src IN LISTS ${srcs})\n      list(APPEND tmp ${path}/${src})\n    endforeach(src ${srcs})\n    set(${srcs}\n        ${tmp}\n        PARENT_SCOPE)\n  endfunction()\n\n  set(TESSERACT_SRC_LEGACY\n      src/ccmain/adaptions.cpp\n      src/ccmain/docqual.cpp\n      src/ccmain/equationdetect.cpp\n      src/ccmain/fixspace.cpp\n      src/ccmain/fixxht.cpp\n      src/ccmain/osdetect.cpp\n      src/ccmain/par_control.cpp\n      src/ccmain/recogtraining.cpp\n      
src/ccmain/superscript.cpp\n      src/ccmain/tessbox.cpp\n      src/ccmain/tfacepp.cpp\n      src/ccstruct/fontinfo.cpp\n      src/ccstruct/params_training_featdef.cpp\n      src/ccutil/ambigs.cpp\n      src/ccutil/bitvector.cpp\n      src/ccutil/indexmapbidi.cpp\n      src/classify/adaptive.cpp\n      src/classify/adaptmatch.cpp\n      src/classify/blobclass.cpp\n      src/classify/cluster.cpp\n      src/classify/clusttool.cpp\n      src/classify/cutoffs.cpp\n      src/classify/featdefs.cpp\n      src/classify/float2int.cpp\n      src/classify/fpoint.cpp\n      src/classify/intfeaturespace.cpp\n      src/classify/intfx.cpp\n      src/classify/intmatcher.cpp\n      src/classify/intproto.cpp\n      src/classify/kdtree.cpp\n      src/classify/mf.cpp\n      src/classify/mfoutline.cpp\n      src/classify/mfx.cpp\n      src/classify/normfeat.cpp\n      src/classify/normmatch.cpp\n      src/classify/ocrfeatures.cpp\n      src/classify/outfeat.cpp\n      src/classify/picofeat.cpp\n      src/classify/protos.cpp\n      src/classify/shapeclassifier.cpp\n      src/classify/shapetable.cpp\n      src/classify/tessclassifier.cpp\n      src/classify/trainingsample.cpp\n      src/dict/permdawg.cpp\n      src/dict/hyphen.cpp\n      src/wordrec/associate.cpp\n      src/wordrec/chop.cpp\n      src/wordrec/chopper.cpp\n      src/wordrec/drawfx.cpp\n      src/wordrec/findseam.cpp\n      src/wordrec/gradechop.cpp\n      src/wordrec/language_model.cpp\n      src/wordrec/lm_consistency.cpp\n      src/wordrec/lm_pain_points.cpp\n      src/wordrec/lm_state.cpp\n      src/wordrec/outlines.cpp\n      src/wordrec/params_model.cpp\n      src/wordrec/pieces.cpp\n      src/wordrec/plotedges.cpp\n      src/wordrec/render.cpp\n      src/wordrec/segsearch.cpp\n      src/wordrec/wordclass.cpp)\n  prepend_path(TESSERACT_SRC_LEGACY \"${CMAKE_CURRENT_SOURCE_DIR}\")\n  list(REMOVE_ITEM TESSERACT_SRC ${TESSERACT_SRC_LEGACY})\nendif(DISABLED_LEGACY_ENGINE)\n\n# Use architecture files from 
SourceLists.cmake\nset(arch_files ${TESSERACT_SRC_ARCH})\n\nif(DOTPRODUCT_FLAGS)\n  set_source_files_properties(src/arch/dotproduct.cpp\n                              PROPERTIES COMPILE_FLAGS ${DOTPRODUCT_FLAGS})\nendif(DOTPRODUCT_FLAGS)\nif(HAVE_AVX)\n  list(APPEND arch_files_opt src/arch/dotproductavx.cpp)\n  set_source_files_properties(src/arch/dotproductavx.cpp\n                              PROPERTIES COMPILE_FLAGS ${AVX_COMPILE_FLAGS})\nendif(HAVE_AVX)\nif(HAVE_AVX2)\n  list(APPEND arch_files_opt src/arch/intsimdmatrixavx2.cpp\n       src/arch/dotproductavx.cpp)\n  set_source_files_properties(src/arch/intsimdmatrixavx2.cpp\n                              PROPERTIES COMPILE_FLAGS ${AVX2_COMPILE_FLAGS})\nendif(HAVE_AVX2)\nif(HAVE_AVX512F)\n  list(APPEND arch_files_opt src/arch/dotproductavx512.cpp)\n  set_source_files_properties(src/arch/dotproductavx512.cpp\n                              PROPERTIES COMPILE_FLAGS ${AVX512F_COMPILE_FLAGS})\nendif(HAVE_AVX512F)\nif(HAVE_FMA)\n  list(APPEND arch_files_opt src/arch/dotproductfma.cpp)\n  set_source_files_properties(src/arch/dotproductfma.cpp\n                              PROPERTIES COMPILE_FLAGS ${FMA_COMPILE_FLAGS})\nendif(HAVE_FMA)\nif(HAVE_SSE4_1)\n  list(APPEND arch_files_opt src/arch/dotproductsse.cpp\n       src/arch/intsimdmatrixsse.cpp)\n  set_source_files_properties(\n    src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp\n    PROPERTIES COMPILE_FLAGS ${SSE4_1_COMPILE_FLAGS})\nendif(HAVE_SSE4_1)\nif(HAVE_NEON)\n  list(APPEND arch_files_opt src/arch/dotproductneon.cpp\n       src/arch/intsimdmatrixneon.cpp)\n  if(NEON_COMPILE_FLAGS)\n    set_source_files_properties(\n      src/arch/dotproductneon.cpp src/arch/intsimdmatrixneon.cpp\n      PROPERTIES COMPILE_FLAGS ${NEON_COMPILE_FLAGS})\n  endif()\nendif(HAVE_NEON)\n\n# Use explicit header file lists from SourceLists.cmake\nset(TESSERACT_HDR ${TESSERACT_HDR_INCLUDE} ${TESSERACT_HDR_INTERNAL})\n\nset(TESSERACT_SRC\n    ${TESSERACT_SRC}\n    
src/api/baseapi.cpp\n    src/api/capi.cpp\n    src/api/renderer.cpp\n    src/api/altorenderer.cpp\n    src/api/pagerenderer.cpp\n    src/api/hocrrenderer.cpp\n    src/api/lstmboxrenderer.cpp\n    src/api/pdfrenderer.cpp\n    src/api/wordstrboxrenderer.cpp)\n\nset(TESSERACT_CONFIGS\n    tessdata/configs/alto\n    tessdata/configs/ambigs.train\n    tessdata/configs/api_config\n    tessdata/configs/bazaar\n    tessdata/configs/bigram\n    tessdata/configs/box.train\n    tessdata/configs/box.train.stderr\n    tessdata/configs/digits\n    tessdata/configs/get.images\n    tessdata/configs/hocr\n    tessdata/configs/inter\n    tessdata/configs/kannada\n    tessdata/configs/linebox\n    tessdata/configs/logfile\n    tessdata/configs/lstm.train\n    tessdata/configs/lstmbox\n    tessdata/configs/lstmdebug\n    tessdata/configs/makebox\n    tessdata/configs/page\n    tessdata/configs/pdf\n    tessdata/configs/quiet\n    tessdata/configs/rebox\n    tessdata/configs/strokewidth\n    tessdata/configs/tsv\n    tessdata/configs/txt\n    tessdata/configs/unlv\n    tessdata/configs/wordstrbox)\n\nset(TESSERACT_TESSCONFIGS\n    tessdata/tessconfigs/batch tessdata/tessconfigs/batch.nochop\n    tessdata/tessconfigs/matdemo tessdata/tessconfigs/msdemo\n    tessdata/tessconfigs/nobatch tessdata/tessconfigs/segdemo)\n\nset(LIBTESSFILES ${TESSERACT_SRC} ${arch_files} ${arch_files_opt}\n                 ${TESSERACT_HDR})\n\nsource_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${LIBTESSFILES})\n\nadd_library(libtesseract ${LIBTESSFILES})\n\n# Apply modern optimizations to the main library\nif(ENABLE_UNITY_BUILD)\n  set_target_properties(libtesseract PROPERTIES UNITY_BUILD ON)\n  set_target_properties(libtesseract PROPERTIES UNITY_BUILD_BATCH_SIZE 16)\n  message(STATUS \"Unity build enabled for libtesseract with batch size 16\")\nendif()\n\n# Apply precompiled headers to reduce compilation time\nif(ENABLE_PRECOMPILED_HEADERS)\n  target_precompile_headers(libtesseract PRIVATE\n    <vector>\n   
 <string>\n    <memory>\n    <algorithm>\n    <iostream>\n    <cstdlib>\n    <cstring>\n    <cmath>\n  )\n\n  # Exclude architecture-specific files from PCH due to custom compiler flags\n  set(ARCH_FILES_NO_PCH\n    src/arch/dotproduct.cpp\n    src/arch/dotproductavx.cpp\n    src/arch/dotproductavx512.cpp\n    src/arch/dotproductfma.cpp\n    src/arch/dotproductsse.cpp\n    src/arch/dotproductneon.cpp\n    src/arch/intsimdmatrixavx2.cpp\n    src/arch/intsimdmatrixsse.cpp\n    src/arch/intsimdmatrixneon.cpp\n  )\n\n  foreach(file ${ARCH_FILES_NO_PCH})\n    if(EXISTS \"${CMAKE_CURRENT_SOURCE_DIR}/${file}\")\n      set_source_files_properties(\"${file}\" PROPERTIES SKIP_PRECOMPILE_HEADERS ON)\n    endif()\n  endforeach()\n\n  message(STATUS \"Precompiled headers enabled for libtesseract (excluding architecture-specific files)\")\nendif()\n\n# Configure build pools for Ninja\nif(ENABLE_NINJA_POOL AND CMAKE_GENERATOR STREQUAL \"Ninja\")\n  set_target_properties(libtesseract PROPERTIES JOB_POOL_COMPILE compile)\n  set_target_properties(libtesseract PROPERTIES JOB_POOL_LINK link)\nendif()\n\ntarget_include_directories(\n  libtesseract BEFORE\n  PRIVATE src\n  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/arch>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/ccmain>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/ccstruct>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/ccutil>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/classify>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/cutil>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/dict>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/lstm>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/textord>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/viewer>\n         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/wordrec>\n         
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/training>)\nif(BUILD_SHARED_LIBS)\n  target_compile_definitions(\n    libtesseract\n    PRIVATE -DTESS_EXPORTS\n    INTERFACE -DTESS_IMPORTS)\n  # generate_export_header          (libtesseract EXPORT_MACRO_NAME TESS_API)\nendif()\ntarget_link_libraries(libtesseract PRIVATE ${LIB_Ws2_32} ${LIB_pthread})\nif(OpenMP_CXX_FOUND)\n  target_link_libraries(libtesseract PUBLIC OpenMP::OpenMP_CXX)\nendif()\nif(LibArchive_FOUND)\n  target_link_libraries(libtesseract PUBLIC ${LibArchive_LIBRARIES})\nendif(LibArchive_FOUND)\nif(CURL_FOUND)\n  if(NOT CURL_LIBRARIES)\n    target_link_libraries(libtesseract PUBLIC CURL::libcurl)\n  else()\n    target_link_libraries(libtesseract PUBLIC ${CURL_LIBRARIES})\n  endif()\nendif(CURL_FOUND)\n\nset_target_properties(\n  libtesseract PROPERTIES VERSION\n                          ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH})\nset_target_properties(\n  libtesseract PROPERTIES SOVERSION\n                          ${VERSION_MAJOR}.${VERSION_MINOR})\n\nset_target_properties(\n  libtesseract\n  PROPERTIES\n    OUTPUT_NAME\n    tesseract$<$<BOOL:${WIN32}>:${VERSION_MAJOR}${VERSION_MINOR}$<$<CONFIG:DEBUG>:d>>\n)\n\nif(SW_BUILD)\n  target_link_libraries(libtesseract PUBLIC org.sw.demo.danbloomberg.leptonica\n                                            org.sw.demo.libarchive.libarchive)\n  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake\n       \"include(${CMAKE_CURRENT_BINARY_DIR}/cppan.cmake)\\n\")\n  export(\n    TARGETS libtesseract\n    APPEND\n    FILE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake\n    NAMESPACE Tesseract::)\nelse()\n  target_link_libraries(libtesseract PUBLIC ${Leptonica_LIBRARIES})\n  export(\n    TARGETS libtesseract\n    FILE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake\n    NAMESPACE Tesseract::)\nendif()\n\nif(WIN32\n   AND CLANG\n   AND OPENMP_BUILD)\n  # Workaround for \"libomp.lib is not automatically added on Windows\" see:\n  # 
http://lists.llvm.org/pipermail/openmp-dev/2015-August/000857.html\n  target_link_libraries(libtesseract PRIVATE ${OpenMP_LIBRARY})\nendif()\n\nif(ANDROID)\n  add_definitions(-DANDROID)\n  find_package(CpuFeaturesNdkCompat REQUIRED)\n  target_include_directories(\n    libtesseract\n    PRIVATE \"${CpuFeaturesNdkCompat_DIR}/../../../include/ndk_compat\")\n  target_link_libraries(libtesseract PRIVATE CpuFeatures::ndk_compat)\nendif()\n\n# ##############################################################################\n# EXECUTABLE tesseract\n# ##############################################################################\n\nadd_executable(tesseract src/tesseract.cpp)\ntarget_link_libraries(tesseract libtesseract)\nif(HAVE_TIFFIO_H AND WIN32)\n  target_link_libraries(tesseract ${TIFF_LIBRARIES})\nendif()\n\nif(OPENMP_BUILD AND UNIX)\n  target_link_libraries(tesseract pthread)\nendif()\n\n# ##############################################################################\n\nif(BUILD_TESTS\n   AND EXISTS\n       ${CMAKE_CURRENT_SOURCE_DIR}/unittest/third_party/googletest/CMakeLists.txt\n)\n  enable_testing()\n  add_subdirectory(unittest/third_party/googletest)\n  add_subdirectory(unittest)\nendif()\n\nif(BUILD_TRAINING_TOOLS)\n  add_subdirectory(src/training)\nendif()\n\nget_target_property(tesseract_NAME libtesseract NAME)\nget_target_property(tesseract_VERSION libtesseract VERSION)\nget_target_property(tesseract_OUTPUT_NAME libtesseract OUTPUT_NAME)\n\nconfigure_file(tesseract.pc.cmake ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc.in\n               @ONLY)\n# to resolve generator expression in OUTPUT_NAME\nfile(\n  GENERATE\n  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tesseract_$<CONFIG>.pc\n  INPUT ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc.in)\n\nconfigure_package_config_file(\n  cmake/templates/TesseractConfig.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/cmake/tesseract/TesseractConfig.cmake\n  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/tesseract\n  PATH_VARS INCLUDE_DIR 
LIBRARY_DIRS)\nwrite_basic_package_version_file(\n  ${CMAKE_CURRENT_BINARY_DIR}/cmake/tesseract/TesseractConfigVersion.cmake\n  VERSION ${PACKAGE_VERSION}\n  COMPATIBILITY SameMajorVersion)\n\ninstall(\n  FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract_$<CONFIG>.pc\n  DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig\n  RENAME tesseract.pc)\ninstall(TARGETS tesseract DESTINATION bin)\nif (MSVC)\n  install(FILES $<TARGET_PDB_FILE:${PROJECT_NAME}> DESTINATION bin OPTIONAL)\nendif()\ninstall(\n  TARGETS libtesseract\n  EXPORT TesseractTargets\n  RUNTIME DESTINATION bin\n  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}\n  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})\nif (MSVC AND BUILD_SHARED_LIBS)\n  install(FILES $<TARGET_PDB_FILE:libtesseract> DESTINATION bin OPTIONAL)\nendif()\ninstall(\n  EXPORT TesseractTargets\n  NAMESPACE Tesseract::\n  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/tesseract)\ninstall(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cmake\n        DESTINATION ${CMAKE_INSTALL_LIBDIR})\n\ninstall(\n  FILES include/tesseract/baseapi.h\n        include/tesseract/capi.h\n        include/tesseract/renderer.h\n        ${CMAKE_CURRENT_BINARY_DIR}/include/tesseract/version.h\n        include/tesseract/ltrresultiterator.h\n        include/tesseract/pageiterator.h\n        include/tesseract/resultiterator.h\n        include/tesseract/osdetect.h\n        include/tesseract/publictypes.h\n        include/tesseract/ocrclass.h\n        include/tesseract/export.h\n        include/tesseract/unichar.h\n        # ${CMAKE_CURRENT_BINARY_DIR}/src/endianness.h\n  DESTINATION include/tesseract)\n\nif(INSTALL_CONFIGS)\n  install(FILES ${TESSERACT_CONFIGS}\n          DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/tessdata/configs)\n  install(FILES ${TESSERACT_TESSCONFIGS}\n          DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/tessdata/tessconfigs)\nendif()\n\n# ##############################################################################\n# uninstall target\n# 
##############################################################################\nif(NOT TARGET uninstall)\n  configure_file(\n    \"${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/cmake_uninstall.cmake.in\"\n    \"${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake\" IMMEDIATE @ONLY)\n\n  add_custom_target(\n    uninstall\n    COMMENT \"Uninstall installed files\"\n    COMMAND ${CMAKE_COMMAND} -P\n            ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)\nendif()\n\n# ##############################################################################\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing\n\n**Please follow these rules and advice**.\n\n## Creating an Issue or Using the Forum\n\nIf you think you found a bug in Tesseract, please create an issue.\n\nUse the [user forum](https://groups.google.com/g/tesseract-ocr) instead of creating an issue if ...\n\n* You have problems using Tesseract and need some help.\n* You have problems installing the software.\n* You are not satisfied with the accuracy of the OCR, and want to ask how you can improve it. Note: You should first read the [ImproveQuality](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html) documentation.\n* You are trying to train Tesseract and you have a problem and/or want to ask a question about the training process. Note: You should first read the **official** guides [[1]](https://tesseract-ocr.github.io/tessdoc/) or [[2]](https://tesseract-ocr.github.io/tessdoc/tess5/TrainingTesseract-5.html) found in the project documentation.\n* You have a general question.\n\nAn issue should only be reported if the platform you are using is one of these:\n\n* Linux (but not a version that is more than 4 years old)\n* Windows (Windows 7 or newer version)\n* macOS (last 3 releases)\n\nFor older versions or other operating systems, use the Tesseract forum.\n\nWhen creating an issue, please report your operating system, including its specific version: \"Ubuntu 16.04\", \"Windows 10\", \"Mac OS X 10.11\" etc.\n\nSearch through open and closed issues to see if similar issue has been reported already (and sometimes also has been solved).\n\nSimilarly, before you post your question in the forum, search through past threads to see if similar question has been asked already.\n\nRead the [documentation](https://tesseract-ocr.github.io/tessdoc/) before you report your issue or ask a question in the forum.\n\nOnly report an issue in the latest official release. 
Optionally, try to check if the issue is not already solved in the latest snapshot in the git repository.\n\nMake sure you are able to replicate the problem with Tesseract command line program. For external programs that use Tesseract (including wrappers and your own program, if you are developer), report the issue to the developers of that software if it's possible. You can also try to find help in the Tesseract forum.\n\nEach version of Tesseract has its own language data you need to obtain. You **must** obtain and install trained data for English (eng) and osd. Verify that Tesseract knows about these two files (and other trained data you installed) with this command:\n`tesseract --list-langs`.\n\nPost example files to demonstrate the problem.\nBUT don't post files with private info (about yourself or others).\n\nWhen attaching a file to the issue report / forum ...\n\n* Do not post a file larger than 20 MB.\n* GitHub supports only few file name extensions like `.png` or `.txt`. If GitHub rejects your files, you can compress them using a program that can produce a zip archive and then load this zip file to GitHub.\n\nDo not attach programs or libraries to your issues/posts.\n\nFor large files or for programs, add a link to a location where they can be downloaded (your site, Git repo, Google Drive, Dropbox etc.)\n\nAttaching a multi-page TIFF image is useful only if you have problem with multi-page functionality, otherwise attach only one or a few single page images.\n\nCopy the error message from the console instead of sending a screenshot of it.\n\nUse the toolbar above the comment edit area to format your comment.\n\nAdd three backticks before and after a code sample or output of a command to format it (The `Insert code` button can help you doing it).\n\nIf your comment includes a code sample or output of a command that exceeds ~25 lines, post it as attached text file (`filename.txt`).\n\nUse `Preview` before you send your issue. 
Read it again before sending.\n\nNote that most of the people that respond to issues and answer questions are either other 'regular' users or **volunteer** developers. Please be nice to them :-)\n\nThe [tesseract developers](https://groups.google.com/g/tesseract-dev) forum should be used to discuss Tesseract development: bug fixes, enhancements, add-ons for Tesseract.\n\nSometimes you will not get a response to your issue or question. We apologize in advance! Please don't take it personally. There can be many reasons for this, including: time limits, no one knows the answer (at least not the ones that are available at that time) or just that\nyour question has been asked (and has been answered) many times before...\n\n## For Developers: Creating a Pull Request\n\nYou should always make sure your changes build and run successfully.\n\nFor that, your clone needs to have all submodules (`googletest`, `test`) included. To do so, either specify `--recurse-submodules` during the initial clone, or run `git submodule update --init --recursive NAME` for each `NAME` later. If `configure` already created those directories (blocking the clone), remove them first (or `make distclean`), then clone and reconfigure.\n\nHave a look at [the README](./README.md) and [testing README](https://github.com/tesseract-ocr/test/blob/main/README.md) and the [documentation](https://tesseract-ocr.github.io/tessdoc/Compiling-%E2%80%93-GitInstallation.html#unit-test-builds) on installation.\n\nIn short, after running `configure` from the build directory of your choice, to build the library and CLI, run `make`. To test it, run `make check`. To build the training tools, run `make training`.\n\nAs soon as your changes are building and tests are succeeding, you can publish them. If you have not already, please [fork](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) tesseract (somewhere) on GitHub, and push your changes to that fork (in a new branch). 
Then [submit as PR](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork).\n\nPlease also keep track of reports from CI (automated build status) and Coverity/CodeQL (quality scan). When the indicators show deterioration after your changes, further action may be required to improve them.\n"
  },
  {
    "path": "ChangeLog",
    "content": "The ChangeLog for all releases from 1.0 (2006-06-16) up to 5.0.0 (2024-11-10)\nis available here:\n\nhttps://github.com/tesseract-ocr/tesseract/blob/64eab6c457b2337dd690746a5fde5c222b40d5f8/ChangeLog\n\nSee https://github.com/tesseract-ocr/tesseract/releases for the latest release notes.\n\n"
  },
  {
    "path": "INSTALL",
    "content": "Copyright 1994, 1995, 1996, 1999, 2000, 2001, 2002 Free Software\nFoundation, Inc.\n\n   This file is free documentation; the Free Software Foundation gives\nunlimited permission to copy, distribute and modify it.\n\nBasic Installation\n==================\n\n   These are generic installation instructions. First you need to run\n`./autogen.sh', that creates `configure' script.\n\n   The `configure' shell script attempts to guess correct values for\nvarious system-dependent variables used during compilation.  It uses\nthose values to create a `Makefile' in each directory of the package.\nIt may also create one or more `.h' files containing system-dependent\ndefinitions.  Finally, it creates a shell script `config.status' that\nyou can run in the future to recreate the current configuration, and a\nfile `config.log' containing compiler output (useful mainly for\ndebugging `configure').\n\n   It can also use an optional file (typically called `config.cache'\nand enabled with `--cache-file=config.cache' or simply `-C') that saves\nthe results of its tests to speed up reconfiguring.  (Caching is\ndisabled by default to prevent problems with accidental use of stale\ncache files.)\n\n   If you need to do unusual things to compile the package, please try\nto figure out how `configure' could check whether to do them, and mail\ndiffs or instructions to the address given in the `README' so they can\nbe considered for the next release.  If you are using the cache, and at\nsome point `config.cache' contains results you don't want to keep, you\nmay remove or edit it.\n\n   The file `configure.ac' (or `configure.in') is used to create\n`configure' by a program called `autoconf'.  You only need\n`configure.ac' if you want to change it or regenerate `configure' using\na newer version of `autoconf'.\n\nThe simplest way to compile this package is:\n\n  1. 
`cd' to the directory containing the package's source code and type\n     `./configure' to configure the package for your system.  If you're\n     using `csh' on an old version of System V, you might need to type\n     `sh ./configure' instead to prevent `csh' from trying to execute\n     `configure' itself.\n\n     Running `configure' takes a while.  While running, it prints some\n     messages telling which features it is checking for.\n\n  2. Type `make' to compile the package.\n\n  3. Optionally, type `make check' to run any self-tests that come with\n     the package.\n\n  4. Type `make install' to install the programs and any data files and\n     documentation.\n\n  5. You can remove the program binaries and object files from the\n     source code directory by typing `make clean'.  To also remove the\n     files that `configure' created (so you can compile the package for\n     a different kind of computer), type `make distclean'.  There is\n     also a `make maintainer-clean' target, but that is intended mainly\n     for the package's developers.  If you use it, you may have to get\n     all sorts of other programs in order to regenerate files that came\n     with the distribution.\n\nCompilers and Options\n=====================\n\n   Some systems require unusual options for compilation or linking that\nthe `configure' script does not know about.  Run `./configure --help'\nfor details on some of the pertinent environment variables.\n\n   You can give `configure' initial values for configuration parameters\nby setting variables in the command line or in the environment.  Here\nis an example:\n\n     ./configure CC=c89 CFLAGS=-O2 LIBS=-lposix\n\n   *Note Defining Variables::, for more details.\n\nCompiling For Multiple Architectures\n====================================\n\n   You can compile the package for more than one kind of computer at the\nsame time, by placing the object files for each architecture in their\nown directory.  
To do this, you must use a version of `make' that\nsupports the `VPATH' variable, such as GNU `make'.  `cd' to the\ndirectory where you want the object files and executables to go and run\nthe `configure' script.  `configure' automatically checks for the\nsource code in the directory that `configure' is in and in `..'.\n\n   If you have to use a `make' that does not support the `VPATH'\nvariable, you have to compile the package for one architecture at a\ntime in the source code directory.  After you have installed the\npackage for one architecture, use `make distclean' before reconfiguring\nfor another architecture.\n\nInstallation Names\n==================\n\n   By default, `make install' will install the package's files in\n`/usr/local/bin', `/usr/local/man', etc.  You can specify an\ninstallation prefix other than `/usr/local' by giving `configure' the\noption `--prefix=PATH'.\n\n   You can specify separate installation prefixes for\narchitecture-specific files and architecture-independent files.  If you\ngive `configure' the option `--exec-prefix=PATH', the package will use\nPATH as the prefix for installing programs and libraries.\nDocumentation and other data files will still use the regular prefix.\n\n   In addition, if you use an unusual directory layout you can give\noptions like `--bindir=PATH' to specify different values for particular\nkinds of files.  
Run `configure --help' for a list of the directories\nyou can set and what kinds of files go in them.\n\n   If the package supports it, you can cause programs to be installed\nwith an extra prefix or suffix on their names by giving `configure' the\noption `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.\n\nOptional Features\n=================\n\n   Some packages pay attention to `--enable-FEATURE' options to\n`configure', where FEATURE indicates an optional part of the package.\nThey may also pay attention to `--with-PACKAGE' options, where PACKAGE\nis something like `gnu-as' or `x' (for the X Window System).  The\n`README' should mention any `--enable-' and `--with-' options that the\npackage recognizes.\n\n   For packages that use the X Window System, `configure' can usually\nfind the X include and library files automatically, but if it doesn't,\nyou can use the `configure' options `--x-includes=DIR' and\n`--x-libraries=DIR' to specify their locations.\n\nSpecifying the System Type\n==========================\n\n   There may be some features `configure' cannot figure out\nautomatically, but needs to determine by the type of machine the package\nwill run on.  Usually, assuming the package is built to be run on the\n_same_ architectures, `configure' can figure that out, but if it prints\na message saying it cannot guess the machine type, give it the\n`--build=TYPE' option.  TYPE can either be a short name for the system\ntype, such as `sun4', or a canonical name which has the form:\n\n     CPU-COMPANY-SYSTEM\n\nwhere SYSTEM can have one of these forms:\n\n     OS KERNEL-OS\n\n   See the file `config.sub' for the possible values of each field.  
If\n`config.sub' isn't included in this package, then this package doesn't\nneed to know the machine type.\n\n   If you are _building_ compiler tools for cross-compiling, you should\nuse the `--target=TYPE' option to select the type of system they will\nproduce code for.\n\n   If you want to _use_ a cross compiler, that generates code for a\nplatform different from the build platform, you should specify the\n\"host\" platform (i.e., that on which the generated programs will\neventually be run) with `--host=TYPE'.\n\nSharing Defaults\n================\n\n   If you want to set default values for `configure' scripts to share,\nyou can create a site shell script called `config.site' that gives\ndefault values for variables like `CC', `cache_file', and `prefix'.\n`configure' looks for `PREFIX/share/config.site' if it exists, then\n`PREFIX/etc/config.site' if it exists.  Or, you can set the\n`CONFIG_SITE' environment variable to the location of the site script.\nA warning: not all `configure' scripts look for a site script.\n\nDefining Variables\n==================\n\n   Variables not defined in a site shell script can be set in the\nenvironment passed to `configure'.  However, some packages may run\nconfigure again during the build, and the customized values of these\nvariables may be lost.  In order to avoid this problem, you should set\nthem in the `configure' command line, using `VAR=value'.  
For example:\n\n     ./configure CC=/usr/local2/bin/gcc\n\nwill cause the specified gcc to be used as the C compiler (unless it is\noverridden in the site shell script).\n\n`configure' Invocation\n======================\n\n   `configure' recognizes the following options to control how it\noperates.\n\n`--help'\n`-h'\n     Print a summary of the options to `configure', and exit.\n\n`--version'\n`-V'\n     Print the version of Autoconf used to generate the `configure'\n     script, and exit.\n\n`--cache-file=FILE'\n     Enable the cache: use and save the results of the tests in FILE,\n     traditionally `config.cache'.  FILE defaults to `/dev/null' to\n     disable caching.\n\n`--config-cache'\n`-C'\n     Alias for `--cache-file=config.cache'.\n\n`--quiet'\n`--silent'\n`-q'\n     Do not print messages saying which checks are being made.  To\n     suppress all normal output, redirect it to `/dev/null' (any error\n     messages will still be shown).\n\n`--srcdir=DIR'\n     Look for the package's source code in directory DIR.  Usually\n     `configure' can determine that directory automatically.\n\n`configure' also accepts some other, not widely useful, options.  Run\n`configure --help' for more details.\n"
  },
  {
    "path": "INSTALL.GIT.md",
    "content": "## autotools (LINUX/UNIX , msys...)\n\nIf you have cloned Tesseract from GitHub, you must generate\nthe configure script.\n\nIf you have a tesseract 4.0x installation in your system, please remove it\nbefore the new build.\n\nYou need Leptonica 1.74.2 (minimum) for Tesseract 4.0x.\n\nKnown dependencies for training tools (excluding leptonica):\n\n* compiler with c++17 support\n* automake\n* pkg-config\n* pango-devel\n* cairo-devel\n* icu-devel\n\nSo, the steps for making Tesseract are:\n\n    ./autogen.sh\n    ./configure\n    make\n    sudo make install\n    sudo ldconfig\n    make training\n    sudo make training-install\n\nYou need to install at least English language and OSD traineddata files to\n`TESSDATA_PREFIX` directory.\n\nYou can retrieve a single file with tools like [wget](https://www.gnu.org/software/wget/), [curl](https://curl.haxx.se/), [GithubDownloader](https://github.com/intezer/GithubDownloader) or browser.\n\nAll language data files can be retrieved from git repository (useful only for packagers!).\n(Repository is huge - more than 1.2 GB. 
You do NOT need to download traineddata files for\nall languages).\n\n    git clone https://github.com/tesseract-ocr/tessdata.git tesseract-ocr.tessdata\n\nYou need an Internet connection and [curl](https://curl.haxx.se/) to compile `ScrollView.jar`\nbecause the build will automatically download\n[piccolo2d-core-3.0.1.jar](https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0.1/piccolo2d-core-3.0.1.jar) and\n[piccolo2d-extras-3.0.1.jar](https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0.1/piccolo2d-extras-3.0.1.jar) and\n[jaxb-api-2.3.1.jar](http://search.maven.org/remotecontent?filepath=javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar) and place them in `tesseract/java`.\n\nJust run:\n\n    make ScrollView.jar\n\nand follow the instructions on [Viewer Debugging](https://tesseract-ocr.github.io/tessdoc/ViewerDebugging.html).\n\n## cmake\n\nThere is an alternative build system based on the multiplatform [cmake](https://cmake.org/)\n\n### LINUX\n\n    mkdir build\n    cd build && cmake .. && make\n    sudo make install\n\n### WINDOWS\n\nSee the [documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on this.\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "Makefile.am",
    "content": "## run autogen.sh to create Makefile.in from this file\nACLOCAL_AMFLAGS = -I m4\n\n.PHONY: doc html install-langs ScrollView.jar install-jars pdf training\n\nCLEANFILES =\n\nSUBDIRS = . tessdata\nif MINGW\nSUBDIRS += nsis\nendif\n\nEXTRA_DIST = README.md LICENSE\nEXTRA_DIST += aclocal.m4 config configure.ac autogen.sh\nEXTRA_DIST += tesseract.pc.in doc\nif !GRAPHICS_DISABLED\nEXTRA_DIST += java\nendif\nEXTRA_DIST += CMakeLists.txt tesseract.pc.cmake cmake VERSION\n\nDIST_SUBDIRS = $(SUBDIRS)\n\nEXTRA_PROGRAMS =\n\nuninstall-hook:\n\trm -rf $(DESTDIR)$(pkgincludedir)\n\ndist-hook:\n# added using EXTRA_DIST. $(distdir)/tessdata would in\n# theory suffice.\n\trm -rf `find $(distdir) -name .deps -type d`\n\t-rm -f $(distdir)/*/Makefile $(distdir)/*/*/Makefile\n\trm -f `find $(distdir) -name '*~'`\n\trm -rf $(distdir)/doc/html/* $(distdir)/doc/*.log\n\nif !GRAPHICS_DISABLED\nScrollView.jar:\n\t@cd \"$(top_builddir)/java\" && $(MAKE) $@\n\ninstall-jars:\n\t@cd \"$(top_builddir)/java\" && $(MAKE) $@\nendif\n\ndoc:\n\t-srcdir=\"$(top_srcdir)\" builddir=\"$(top_builddir)\" \\\n\tversion=\"@PACKAGE_VERSION@\" name=\"@PACKAGE_NAME@\" \\\n\tdoxygen $(top_srcdir)/doc/Doxyfile\n\ndoc-pack: doc\n\t-chmod a+r $(top_builddir)/doc/html/*\n\t @tar --create --directory=$(top_builddir)/doc/html --verbose --file=- . 
| gzip -c -9 > $(top_builddir)/@PACKAGE_NAME@-@PACKAGE_VERSION@-doc-html.tar.gz;\n\ndoc-clean:\n\trm -rf $(top_builddir)/doc/html/*\n\nif MINGW\nwinsetup: training ScrollView.jar\n\t@cd \"$(top_builddir)/nsis\" && $(MAKE) winsetup\nendif\n\npkgconfigdir = $(libdir)/pkgconfig\npkgconfig_DATA = tesseract.pc\n\npkginclude_HEADERS = $(top_builddir)/include/tesseract/version.h\npkginclude_HEADERS += include/tesseract/baseapi.h\npkginclude_HEADERS += include/tesseract/capi.h\npkginclude_HEADERS += include/tesseract/export.h\npkginclude_HEADERS += include/tesseract/ltrresultiterator.h\npkginclude_HEADERS += include/tesseract/ocrclass.h\npkginclude_HEADERS += include/tesseract/osdetect.h\npkginclude_HEADERS += include/tesseract/pageiterator.h\npkginclude_HEADERS += include/tesseract/publictypes.h\npkginclude_HEADERS += include/tesseract/renderer.h\npkginclude_HEADERS += include/tesseract/resultiterator.h\npkginclude_HEADERS += include/tesseract/unichar.h\n\n# Rules for all subdirectories.\n\nnoinst_HEADERS =\nnoinst_LTLIBRARIES =\n\nAM_CPPFLAGS += -I$(top_srcdir)/include\nAM_CPPFLAGS += -I$(top_builddir)/include\nif VISIBILITY\nAM_CPPFLAGS += -DTESS_EXPORTS\nAM_CPPFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden -fPIC\nendif\n\nAM_CXXFLAGS = $(OPENMP_CXXFLAGS)\n\n# Rules for src/api.\n\nlibtesseract_la_CPPFLAGS = $(AM_CPPFLAGS)\nlibtesseract_la_CPPFLAGS += -DTESS_COMMON_TRAINING_API=\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/arch\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/ccmain\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/ccstruct\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/classify\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/cutil\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/dict\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/lstm\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/textord\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/training/common\nlibtesseract_la_CPPFLAGS 
+= -I$(top_srcdir)/src/viewer\nlibtesseract_la_CPPFLAGS += -I$(top_srcdir)/src/wordrec\nlibtesseract_la_CPPFLAGS += $(libcurl_CFLAGS)\n\nlib_LTLIBRARIES = libtesseract.la\nlibtesseract_la_LDFLAGS = $(LEPTONICA_LIBS)\nlibtesseract_la_LDFLAGS += $(libarchive_LIBS)\nlibtesseract_la_LDFLAGS += $(libcurl_LIBS)\nif T_WIN\nlibtesseract_la_LDFLAGS += -no-undefined -lws2_32\nelse\nlibtesseract_la_LDFLAGS += $(NOUNDEFINED)\nendif\nlibtesseract_la_LDFLAGS += -version-info $(GENERIC_LIBRARY_VERSION)\n\nlibtesseract_la_SOURCES = src/api/baseapi.cpp\nlibtesseract_la_SOURCES += src/api/altorenderer.cpp\nlibtesseract_la_SOURCES += src/api/pagerenderer.cpp\nlibtesseract_la_SOURCES += src/api/capi.cpp\nlibtesseract_la_SOURCES += src/api/hocrrenderer.cpp\nlibtesseract_la_SOURCES += src/api/lstmboxrenderer.cpp\nlibtesseract_la_SOURCES += src/api/pdfrenderer.cpp\nlibtesseract_la_SOURCES += src/api/renderer.cpp\nlibtesseract_la_SOURCES += src/api/wordstrboxrenderer.cpp\n\nlibtesseract_la_LIBADD = libtesseract_ccutil.la\nlibtesseract_la_LIBADD += libtesseract_lstm.la\nlibtesseract_la_LIBADD += libtesseract_native.la\n\n# Rules for src/arch.\n\nnoinst_HEADERS += src/arch/dotproduct.h\nnoinst_HEADERS += src/arch/intsimdmatrix.h\nnoinst_HEADERS += src/arch/simddetect.h\n\nnoinst_LTLIBRARIES += libtesseract_native.la\n\nlibtesseract_native_la_CXXFLAGS = -O3 -ffast-math\nif OPENMP_SIMD\nlibtesseract_native_la_CXXFLAGS += -fopenmp-simd -DOPENMP_SIMD\nendif\nlibtesseract_native_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_native_la_SOURCES = src/arch/dotproduct.cpp\n\nif HAVE_AVX\nlibtesseract_avx_la_CXXFLAGS = -mavx\nlibtesseract_avx_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_avx_la_SOURCES = src/arch/dotproductavx.cpp\nlibtesseract_la_LIBADD += libtesseract_avx.la\nnoinst_LTLIBRARIES += libtesseract_avx.la\nendif\n\nif HAVE_AVX2\nlibtesseract_avx2_la_CXXFLAGS = -mavx2\nlibtesseract_avx2_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_avx2_la_SOURCES = 
src/arch/intsimdmatrixavx2.cpp\nlibtesseract_la_LIBADD += libtesseract_avx2.la\nnoinst_LTLIBRARIES += libtesseract_avx2.la\nendif\n\nif HAVE_AVX512F\nlibtesseract_avx512_la_CXXFLAGS = -mavx512f\nlibtesseract_avx512_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_avx512_la_SOURCES = src/arch/dotproductavx512.cpp\nlibtesseract_la_LIBADD += libtesseract_avx512.la\nnoinst_LTLIBRARIES += libtesseract_avx512.la\nendif\n\nif HAVE_FMA\nlibtesseract_fma_la_CXXFLAGS = -mfma\nlibtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_fma_la_SOURCES = src/arch/dotproductfma.cpp\nlibtesseract_la_LIBADD += libtesseract_fma.la\nnoinst_LTLIBRARIES += libtesseract_fma.la\nendif\n\nif HAVE_SSE4_1\nlibtesseract_sse_la_CXXFLAGS = -msse4.1\nlibtesseract_sse_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_sse_la_SOURCES = src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp\nlibtesseract_la_LIBADD += libtesseract_sse.la\nnoinst_LTLIBRARIES += libtesseract_sse.la\nendif\n\nif HAVE_NEON\nlibtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS)\nlibtesseract_neon_la_CXXFLAGS += -O3\nif OPENMP_SIMD\nlibtesseract_neon_la_CXXFLAGS += -fopenmp-simd -DOPENMP_SIMD\nendif\nlibtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp\nlibtesseract_neon_la_SOURCES += src/arch/dotproductneon.cpp\nlibtesseract_la_LIBADD += libtesseract_neon.la\nnoinst_LTLIBRARIES += libtesseract_neon.la\nendif\n\nif HAVE_RVV\nlibtesseract_rvv_la_CXXFLAGS = $(RVV_CXXFLAGS)\nlibtesseract_rvv_la_CXXFLAGS += -O3\nlibtesseract_rvv_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_rvv_la_SOURCES = src/arch/intsimdmatrixrvv.cpp\nlibtesseract_la_LIBADD += libtesseract_rvv.la\nnoinst_LTLIBRARIES += libtesseract_rvv.la\nendif\n\nlibtesseract_la_SOURCES += src/arch/intsimdmatrix.cpp\nlibtesseract_la_SOURCES += src/arch/simddetect.cpp\n\n# Rules for src/ccmain.\n\nnoinst_HEADERS += src/ccmain/control.h\nnoinst_HEADERS += 
src/ccmain/mutableiterator.h\nnoinst_HEADERS += src/ccmain/output.h\nnoinst_HEADERS += src/ccmain/paragraphs.h\nnoinst_HEADERS += src/ccmain/paragraphs_internal.h\nnoinst_HEADERS += src/ccmain/paramsd.h\nnoinst_HEADERS += src/ccmain/pgedit.h\nnoinst_HEADERS += src/ccmain/tesseractclass.h\nnoinst_HEADERS += src/ccmain/tessvars.h\nnoinst_HEADERS += src/ccmain/thresholder.h\nnoinst_HEADERS += src/ccmain/werdit.h\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += src/ccmain/docqual.h\nnoinst_HEADERS += src/ccmain/equationdetect.h\nnoinst_HEADERS += src/ccmain/fixspace.h\nnoinst_HEADERS += src/ccmain/reject.h\nendif\n\nlibtesseract_la_SOURCES += src/ccmain/applybox.cpp\nlibtesseract_la_SOURCES += src/ccmain/control.cpp\nlibtesseract_la_SOURCES += src/ccmain/linerec.cpp\nlibtesseract_la_SOURCES += src/ccmain/ltrresultiterator.cpp\nlibtesseract_la_SOURCES += src/ccmain/mutableiterator.cpp\nlibtesseract_la_SOURCES += src/ccmain/output.cpp\nlibtesseract_la_SOURCES += src/ccmain/pageiterator.cpp\nlibtesseract_la_SOURCES += src/ccmain/pagesegmain.cpp\nlibtesseract_la_SOURCES += src/ccmain/pagewalk.cpp\nlibtesseract_la_SOURCES += src/ccmain/paragraphs.cpp\nif !GRAPHICS_DISABLED\nlibtesseract_la_SOURCES += src/ccmain/paramsd.cpp\nlibtesseract_la_SOURCES += src/ccmain/pgedit.cpp\nendif\nlibtesseract_la_SOURCES += src/ccmain/reject.cpp\nlibtesseract_la_SOURCES += src/ccmain/resultiterator.cpp\nlibtesseract_la_SOURCES += src/ccmain/tessedit.cpp\nlibtesseract_la_SOURCES += src/ccmain/tesseractclass.cpp\nlibtesseract_la_SOURCES += src/ccmain/tessvars.cpp\nlibtesseract_la_SOURCES += src/ccmain/thresholder.cpp\nlibtesseract_la_SOURCES += src/ccmain/werdit.cpp\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_la_SOURCES += src/ccmain/adaptions.cpp\nlibtesseract_la_SOURCES += src/ccmain/docqual.cpp\nlibtesseract_la_SOURCES += src/ccmain/equationdetect.cpp\nlibtesseract_la_SOURCES += src/ccmain/fixspace.cpp\nlibtesseract_la_SOURCES += src/ccmain/fixxht.cpp\nlibtesseract_la_SOURCES += 
src/ccmain/osdetect.cpp\nlibtesseract_la_SOURCES += src/ccmain/par_control.cpp\nlibtesseract_la_SOURCES += src/ccmain/recogtraining.cpp\nlibtesseract_la_SOURCES += src/ccmain/superscript.cpp\nlibtesseract_la_SOURCES += src/ccmain/tessbox.cpp\nlibtesseract_la_SOURCES += src/ccmain/tfacepp.cpp\nendif\n\n# Rules for src/ccstruct.\n\nnoinst_HEADERS += src/ccstruct/blamer.h\nnoinst_HEADERS += src/ccstruct/blobbox.h\nnoinst_HEADERS += src/ccstruct/blobs.h\nnoinst_HEADERS += src/ccstruct/blread.h\nnoinst_HEADERS += src/ccstruct/boxread.h\nnoinst_HEADERS += src/ccstruct/boxword.h\nnoinst_HEADERS += src/ccstruct/ccstruct.h\nnoinst_HEADERS += src/ccstruct/coutln.h\nnoinst_HEADERS += src/ccstruct/crakedge.h\nnoinst_HEADERS += src/ccstruct/debugpixa.h\nnoinst_HEADERS += src/ccstruct/detlinefit.h\nnoinst_HEADERS += src/ccstruct/dppoint.h\nnoinst_HEADERS += src/ccstruct/image.h\nnoinst_HEADERS += src/ccstruct/imagedata.h\nnoinst_HEADERS += src/ccstruct/linlsq.h\nnoinst_HEADERS += src/ccstruct/matrix.h\nnoinst_HEADERS += src/ccstruct/mod128.h\nnoinst_HEADERS += src/ccstruct/normalis.h\nnoinst_HEADERS += src/ccstruct/ocrblock.h\nnoinst_HEADERS += src/ccstruct/ocrpara.h\nnoinst_HEADERS += src/ccstruct/ocrrow.h\nnoinst_HEADERS += src/ccstruct/otsuthr.h\nnoinst_HEADERS += src/ccstruct/pageres.h\nnoinst_HEADERS += src/ccstruct/pdblock.h\nnoinst_HEADERS += src/ccstruct/points.h\nnoinst_HEADERS += src/ccstruct/polyaprx.h\nnoinst_HEADERS += src/ccstruct/polyblk.h\nnoinst_HEADERS += src/ccstruct/quadlsq.h\nnoinst_HEADERS += src/ccstruct/quadratc.h\nnoinst_HEADERS += src/ccstruct/quspline.h\nnoinst_HEADERS += src/ccstruct/ratngs.h\nnoinst_HEADERS += src/ccstruct/rect.h\nnoinst_HEADERS += src/ccstruct/rejctmap.h\nnoinst_HEADERS += src/ccstruct/seam.h\nnoinst_HEADERS += src/ccstruct/split.h\nnoinst_HEADERS += src/ccstruct/statistc.h\nnoinst_HEADERS += src/ccstruct/stepblob.h\nnoinst_HEADERS += src/ccstruct/werd.h\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += 
src/ccstruct/fontinfo.h\nnoinst_HEADERS += src/ccstruct/params_training_featdef.h\nendif\n\nlibtesseract_la_SOURCES += src/ccstruct/blamer.cpp\nlibtesseract_la_SOURCES += src/ccstruct/blobbox.cpp\nlibtesseract_la_SOURCES += src/ccstruct/blobs.cpp\nlibtesseract_la_SOURCES += src/ccstruct/blread.cpp\nlibtesseract_la_SOURCES += src/ccstruct/boxread.cpp\nlibtesseract_la_SOURCES += src/ccstruct/boxword.cpp\nlibtesseract_la_SOURCES += src/ccstruct/ccstruct.cpp\nlibtesseract_la_SOURCES += src/ccstruct/coutln.cpp\nlibtesseract_la_SOURCES += src/ccstruct/detlinefit.cpp\nlibtesseract_la_SOURCES += src/ccstruct/dppoint.cpp\nlibtesseract_la_SOURCES += src/ccstruct/image.cpp\nlibtesseract_la_SOURCES += src/ccstruct/imagedata.cpp\nlibtesseract_la_SOURCES += src/ccstruct/linlsq.cpp\nlibtesseract_la_SOURCES += src/ccstruct/matrix.cpp\nlibtesseract_la_SOURCES += src/ccstruct/mod128.cpp\nlibtesseract_la_SOURCES += src/ccstruct/normalis.cpp\nlibtesseract_la_SOURCES += src/ccstruct/ocrblock.cpp\nlibtesseract_la_SOURCES += src/ccstruct/ocrpara.cpp\nlibtesseract_la_SOURCES += src/ccstruct/ocrrow.cpp\nlibtesseract_la_SOURCES += src/ccstruct/otsuthr.cpp\nlibtesseract_la_SOURCES += src/ccstruct/pageres.cpp\nlibtesseract_la_SOURCES += src/ccstruct/pdblock.cpp\nlibtesseract_la_SOURCES += src/ccstruct/points.cpp\nlibtesseract_la_SOURCES += src/ccstruct/polyaprx.cpp\nlibtesseract_la_SOURCES += src/ccstruct/polyblk.cpp\nlibtesseract_la_SOURCES += src/ccstruct/quadlsq.cpp\nlibtesseract_la_SOURCES += src/ccstruct/quspline.cpp\nlibtesseract_la_SOURCES += src/ccstruct/ratngs.cpp\nlibtesseract_la_SOURCES += src/ccstruct/rect.cpp\nlibtesseract_la_SOURCES += src/ccstruct/rejctmap.cpp\nlibtesseract_la_SOURCES += src/ccstruct/seam.cpp\nlibtesseract_la_SOURCES += src/ccstruct/split.cpp\nlibtesseract_la_SOURCES += src/ccstruct/statistc.cpp\nlibtesseract_la_SOURCES += src/ccstruct/stepblob.cpp\nlibtesseract_la_SOURCES += src/ccstruct/werd.cpp\n\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_la_SOURCES += 
src/ccstruct/fontinfo.cpp\nlibtesseract_la_SOURCES += src/ccstruct/params_training_featdef.cpp\nendif\n\n# Rules for src/ccutil\n\nlibtesseract_ccutil_la_CPPFLAGS = $(AM_CPPFLAGS)\nlibtesseract_ccutil_la_CPPFLAGS += $(libarchive_CFLAGS)\nif !NO_TESSDATA_PREFIX\nlibtesseract_ccutil_la_CPPFLAGS += -DTESSDATA_PREFIX='\"@datadir@\"'\nendif\n\nnoinst_HEADERS += src/ccutil/ccutil.h\nnoinst_HEADERS += src/ccutil/clst.h\nnoinst_HEADERS += src/ccutil/elst2.h\nnoinst_HEADERS += src/ccutil/elst.h\nnoinst_HEADERS += src/ccutil/errcode.h\nnoinst_HEADERS += src/ccutil/fileerr.h\nnoinst_HEADERS += src/ccutil/genericheap.h\nnoinst_HEADERS += src/ccutil/genericvector.h\nnoinst_HEADERS += src/ccutil/helpers.h\nnoinst_HEADERS += src/ccutil/host.h\nnoinst_HEADERS += src/ccutil/kdpair.h\nnoinst_HEADERS += src/ccutil/lsterr.h\nnoinst_HEADERS += src/ccutil/object_cache.h\nnoinst_HEADERS += src/ccutil/params.h\nnoinst_HEADERS += src/ccutil/qrsequence.h\nnoinst_HEADERS += src/ccutil/sorthelper.h\nnoinst_HEADERS += src/ccutil/scanutils.h\nnoinst_HEADERS += src/ccutil/serialis.h\nnoinst_HEADERS += src/ccutil/tessdatamanager.h\nnoinst_HEADERS += src/ccutil/tprintf.h\nnoinst_HEADERS += src/ccutil/unicharcompress.h\nnoinst_HEADERS += src/ccutil/unicharmap.h\nnoinst_HEADERS += src/ccutil/unicharset.h\nnoinst_HEADERS += src/ccutil/unicity_table.h\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += src/ccutil/ambigs.h\nnoinst_HEADERS += src/ccutil/bitvector.h\nnoinst_HEADERS += src/ccutil/indexmapbidi.h\nnoinst_HEADERS += src/ccutil/universalambigs.h\nendif\n\nnoinst_LTLIBRARIES += libtesseract_ccutil.la\n\nlibtesseract_ccutil_la_SOURCES = src/ccutil/ccutil.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/errcode.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/serialis.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/scanutils.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/tessdatamanager.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/tprintf.cpp\nlibtesseract_ccutil_la_SOURCES += 
src/ccutil/unichar.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/unicharcompress.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/unicharmap.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/unicharset.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/params.cpp\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_ccutil_la_SOURCES += src/ccutil/ambigs.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/bitvector.cpp\nlibtesseract_ccutil_la_SOURCES += src/ccutil/indexmapbidi.cpp\nendif\n\n# Rules for src/classify.\n\nnoinst_HEADERS += src/classify/classify.h\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += src/classify/adaptive.h\nnoinst_HEADERS += src/classify/cluster.h\nnoinst_HEADERS += src/classify/clusttool.h\nnoinst_HEADERS += src/classify/featdefs.h\nnoinst_HEADERS += src/classify/float2int.h\nnoinst_HEADERS += src/classify/fpoint.h\nnoinst_HEADERS += src/classify/intfeaturespace.h\nnoinst_HEADERS += src/classify/intfx.h\nnoinst_HEADERS += src/classify/intmatcher.h\nnoinst_HEADERS += src/classify/intproto.h\nnoinst_HEADERS += src/classify/kdtree.h\nnoinst_HEADERS += src/classify/mf.h\nnoinst_HEADERS += src/classify/mfdefs.h\nnoinst_HEADERS += src/classify/mfoutline.h\nnoinst_HEADERS += src/classify/mfx.h\nnoinst_HEADERS += src/classify/normfeat.h\nnoinst_HEADERS += src/classify/normmatch.h\nnoinst_HEADERS += src/classify/ocrfeatures.h\nnoinst_HEADERS += src/classify/outfeat.h\nnoinst_HEADERS += src/classify/picofeat.h\nnoinst_HEADERS += src/classify/protos.h\nnoinst_HEADERS += src/classify/shapeclassifier.h\nnoinst_HEADERS += src/classify/shapetable.h\nnoinst_HEADERS += src/classify/tessclassifier.h\nnoinst_HEADERS += src/classify/trainingsample.h\nendif\n\nlibtesseract_la_SOURCES += src/classify/classify.cpp\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_la_SOURCES += src/classify/adaptive.cpp\nlibtesseract_la_SOURCES += src/classify/adaptmatch.cpp\nlibtesseract_la_SOURCES += src/classify/blobclass.cpp\nlibtesseract_la_SOURCES += 
src/classify/cluster.cpp\nlibtesseract_la_SOURCES += src/classify/clusttool.cpp\nlibtesseract_la_SOURCES += src/classify/cutoffs.cpp\nlibtesseract_la_SOURCES += src/classify/featdefs.cpp\nlibtesseract_la_SOURCES += src/classify/float2int.cpp\nlibtesseract_la_SOURCES += src/classify/fpoint.cpp\nlibtesseract_la_SOURCES += src/classify/intfeaturespace.cpp\nlibtesseract_la_SOURCES += src/classify/intfx.cpp\nlibtesseract_la_SOURCES += src/classify/intmatcher.cpp\nlibtesseract_la_SOURCES += src/classify/intproto.cpp\nlibtesseract_la_SOURCES += src/classify/kdtree.cpp\nlibtesseract_la_SOURCES += src/classify/mf.cpp\nlibtesseract_la_SOURCES += src/classify/mfoutline.cpp\nlibtesseract_la_SOURCES += src/classify/mfx.cpp\nlibtesseract_la_SOURCES += src/classify/normfeat.cpp\nlibtesseract_la_SOURCES += src/classify/normmatch.cpp\nlibtesseract_la_SOURCES += src/classify/ocrfeatures.cpp\nlibtesseract_la_SOURCES += src/classify/outfeat.cpp\nlibtesseract_la_SOURCES += src/classify/picofeat.cpp\nlibtesseract_la_SOURCES += src/classify/protos.cpp\nlibtesseract_la_SOURCES += src/classify/shapeclassifier.cpp\nlibtesseract_la_SOURCES += src/classify/shapetable.cpp\nlibtesseract_la_SOURCES += src/classify/tessclassifier.cpp\nlibtesseract_la_SOURCES += src/classify/trainingsample.cpp\nendif\n\n# Rules for src/cutil.\n\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += src/cutil/bitvec.h\nnoinst_HEADERS += src/cutil/oldlist.h\nendif\n\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_la_SOURCES += src/cutil/oldlist.cpp\nendif\n\n# Rules for src/dict.\n\nnoinst_HEADERS += src/dict/dawg.h\nnoinst_HEADERS += src/dict/dawg_cache.h\nnoinst_HEADERS += src/dict/dict.h\nnoinst_HEADERS += src/dict/matchdefs.h\nnoinst_HEADERS += src/dict/stopper.h\nnoinst_HEADERS += src/dict/trie.h\n\nlibtesseract_la_SOURCES += src/dict/context.cpp\nlibtesseract_la_SOURCES += src/dict/dawg.cpp\nlibtesseract_la_SOURCES += src/dict/dawg_cache.cpp\nlibtesseract_la_SOURCES += src/dict/dict.cpp\nlibtesseract_la_SOURCES += 
src/dict/stopper.cpp\nlibtesseract_la_SOURCES += src/dict/trie.cpp\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_la_SOURCES += src/dict/hyphen.cpp\nlibtesseract_la_SOURCES += src/dict/permdawg.cpp\nendif\n\n# Rules for src/lstm.\n\nlibtesseract_lstm_la_CPPFLAGS = $(AM_CPPFLAGS)\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/arch\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/ccstruct\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/ccutil\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/classify\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/cutil\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/dict\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/lstm\nlibtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/viewer\nif !NO_TESSDATA_PREFIX\nlibtesseract_lstm_la_CPPFLAGS += -DTESSDATA_PREFIX='\"@datadir@\"'\nendif\n\nnoinst_HEADERS += src/lstm/convolve.h\nnoinst_HEADERS += src/lstm/fullyconnected.h\nnoinst_HEADERS += src/lstm/functions.h\nnoinst_HEADERS += src/lstm/input.h\nnoinst_HEADERS += src/lstm/lstm.h\nnoinst_HEADERS += src/lstm/lstmrecognizer.h\nnoinst_HEADERS += src/lstm/maxpool.h\nnoinst_HEADERS += src/lstm/network.h\nnoinst_HEADERS += src/lstm/networkio.h\nnoinst_HEADERS += src/lstm/networkscratch.h\nnoinst_HEADERS += src/lstm/parallel.h\nnoinst_HEADERS += src/lstm/plumbing.h\nnoinst_HEADERS += src/lstm/recodebeam.h\nnoinst_HEADERS += src/lstm/reconfig.h\nnoinst_HEADERS += src/lstm/reversed.h\nnoinst_HEADERS += src/lstm/series.h\nnoinst_HEADERS += src/lstm/static_shape.h\nnoinst_HEADERS += src/lstm/stridemap.h\nnoinst_HEADERS += src/lstm/weightmatrix.h\n\nnoinst_LTLIBRARIES += libtesseract_lstm.la\n\nlibtesseract_lstm_la_SOURCES = src/lstm/convolve.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/fullyconnected.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/functions.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/input.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/lstm.cpp\nlibtesseract_lstm_la_SOURCES += 
src/lstm/lstmrecognizer.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/maxpool.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/network.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/networkio.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/parallel.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/plumbing.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/recodebeam.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/reconfig.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/reversed.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/series.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/stridemap.cpp\nlibtesseract_lstm_la_SOURCES += src/lstm/weightmatrix.cpp\n\n# Rules for src/textord.\n\nnoinst_HEADERS += src/textord/alignedblob.h\nnoinst_HEADERS += src/textord/baselinedetect.h\nnoinst_HEADERS += src/textord/bbgrid.h\nnoinst_HEADERS += src/textord/blkocc.h\nnoinst_HEADERS += src/textord/blobgrid.h\nnoinst_HEADERS += src/textord/ccnontextdetect.h\nnoinst_HEADERS += src/textord/cjkpitch.h\nnoinst_HEADERS += src/textord/colfind.h\nnoinst_HEADERS += src/textord/colpartition.h\nnoinst_HEADERS += src/textord/colpartitionset.h\nnoinst_HEADERS += src/textord/colpartitiongrid.h\nnoinst_HEADERS += src/textord/devanagari_processing.h\nnoinst_HEADERS += src/textord/drawtord.h\nnoinst_HEADERS += src/textord/edgblob.h\nnoinst_HEADERS += src/textord/edgloop.h\nnoinst_HEADERS += src/textord/fpchop.h\nnoinst_HEADERS += src/textord/gap_map.h\nnoinst_HEADERS += src/textord/imagefind.h\nnoinst_HEADERS += src/textord/linefind.h\nnoinst_HEADERS += src/textord/makerow.h\nnoinst_HEADERS += src/textord/oldbasel.h\nnoinst_HEADERS += src/textord/pithsync.h\nnoinst_HEADERS += src/textord/pitsync1.h\nnoinst_HEADERS += src/textord/scanedg.h\nnoinst_HEADERS += src/textord/sortflts.h\nnoinst_HEADERS += src/textord/strokewidth.h\nnoinst_HEADERS += src/textord/tabfind.h\nnoinst_HEADERS += src/textord/tablefind.h\nnoinst_HEADERS += src/textord/tabvector.h\nnoinst_HEADERS += src/textord/tablerecog.h\nnoinst_HEADERS += 
src/textord/textlineprojection.h\nnoinst_HEADERS += src/textord/textord.h\nnoinst_HEADERS += src/textord/topitch.h\nnoinst_HEADERS += src/textord/tordmain.h\nnoinst_HEADERS += src/textord/tovars.h\nnoinst_HEADERS += src/textord/underlin.h\nnoinst_HEADERS += src/textord/wordseg.h\nnoinst_HEADERS += src/textord/workingpartset.h\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += src/textord/equationdetectbase.h\nendif\n\nlibtesseract_la_SOURCES += src/textord/alignedblob.cpp\nlibtesseract_la_SOURCES += src/textord/baselinedetect.cpp\nlibtesseract_la_SOURCES += src/textord/bbgrid.cpp\nlibtesseract_la_SOURCES += src/textord/blkocc.cpp\nlibtesseract_la_SOURCES += src/textord/blobgrid.cpp\nlibtesseract_la_SOURCES += src/textord/ccnontextdetect.cpp\nlibtesseract_la_SOURCES += src/textord/cjkpitch.cpp\nlibtesseract_la_SOURCES += src/textord/colfind.cpp\nlibtesseract_la_SOURCES += src/textord/colpartition.cpp\nlibtesseract_la_SOURCES += src/textord/colpartitionset.cpp\nlibtesseract_la_SOURCES += src/textord/colpartitiongrid.cpp\nlibtesseract_la_SOURCES += src/textord/devanagari_processing.cpp\nlibtesseract_la_SOURCES += src/textord/drawtord.cpp\nlibtesseract_la_SOURCES += src/textord/edgblob.cpp\nlibtesseract_la_SOURCES += src/textord/edgloop.cpp\nlibtesseract_la_SOURCES += src/textord/fpchop.cpp\nlibtesseract_la_SOURCES += src/textord/gap_map.cpp\nlibtesseract_la_SOURCES += src/textord/imagefind.cpp\nlibtesseract_la_SOURCES += src/textord/linefind.cpp\nlibtesseract_la_SOURCES += src/textord/makerow.cpp\nlibtesseract_la_SOURCES += src/textord/oldbasel.cpp\nlibtesseract_la_SOURCES += src/textord/pithsync.cpp\nlibtesseract_la_SOURCES += src/textord/pitsync1.cpp\nlibtesseract_la_SOURCES += src/textord/scanedg.cpp\nlibtesseract_la_SOURCES += src/textord/sortflts.cpp\nlibtesseract_la_SOURCES += src/textord/strokewidth.cpp\nlibtesseract_la_SOURCES += src/textord/tabfind.cpp\nlibtesseract_la_SOURCES += src/textord/tablefind.cpp\nlibtesseract_la_SOURCES += 
src/textord/tabvector.cpp\nlibtesseract_la_SOURCES += src/textord/tablerecog.cpp\nlibtesseract_la_SOURCES += src/textord/textlineprojection.cpp\nlibtesseract_la_SOURCES += src/textord/textord.cpp\nlibtesseract_la_SOURCES += src/textord/topitch.cpp\nlibtesseract_la_SOURCES += src/textord/tordmain.cpp\nlibtesseract_la_SOURCES += src/textord/tospace.cpp\nlibtesseract_la_SOURCES += src/textord/tovars.cpp\nlibtesseract_la_SOURCES += src/textord/underlin.cpp\nlibtesseract_la_SOURCES += src/textord/wordseg.cpp\nlibtesseract_la_SOURCES += src/textord/workingpartset.cpp\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_la_SOURCES += src/textord/equationdetectbase.cpp\nendif\n\n# Rules for src/viewer.\n\nif !GRAPHICS_DISABLED\nnoinst_HEADERS += src/viewer/scrollview.h\nnoinst_HEADERS += src/viewer/svmnode.h\nnoinst_HEADERS += src/viewer/svutil.h\n\nlibtesseract_la_SOURCES += src/viewer/scrollview.cpp\nlibtesseract_la_SOURCES += src/viewer/svmnode.cpp\nlibtesseract_la_SOURCES += src/viewer/svutil.cpp\n\nEXTRA_PROGRAMS += svpaint\nsvpaint_CPPFLAGS = $(AM_CPPFLAGS)\nsvpaint_CPPFLAGS += -I$(top_srcdir)/src/ccstruct\nsvpaint_CPPFLAGS += -I$(top_srcdir)/src/viewer\nsvpaint_SOURCES = src/svpaint.cpp\nsvpaint_LDADD = libtesseract.la\nendif\n\n# Rules for src/wordrec.\n\nnoinst_HEADERS += src/wordrec/wordrec.h\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += src/wordrec/associate.h\nnoinst_HEADERS += src/wordrec/chop.h\nnoinst_HEADERS += src/wordrec/drawfx.h\nnoinst_HEADERS += src/wordrec/findseam.h\nnoinst_HEADERS += src/wordrec/language_model.h\nnoinst_HEADERS += src/wordrec/lm_consistency.h\nnoinst_HEADERS += src/wordrec/lm_pain_points.h\nnoinst_HEADERS += src/wordrec/lm_state.h\nnoinst_HEADERS += src/wordrec/outlines.h\nnoinst_HEADERS += src/wordrec/params_model.h\nnoinst_HEADERS += src/wordrec/plotedges.h\nnoinst_HEADERS += src/wordrec/render.h\nendif\n\nlibtesseract_la_SOURCES += src/wordrec/tface.cpp\nlibtesseract_la_SOURCES += src/wordrec/wordrec.cpp\nif 
!DISABLED_LEGACY_ENGINE\nlibtesseract_la_SOURCES += src/wordrec/associate.cpp\nlibtesseract_la_SOURCES += src/wordrec/chop.cpp\nlibtesseract_la_SOURCES += src/wordrec/chopper.cpp\nlibtesseract_la_SOURCES += src/wordrec/drawfx.cpp\nlibtesseract_la_SOURCES += src/wordrec/findseam.cpp\nlibtesseract_la_SOURCES += src/wordrec/gradechop.cpp\nlibtesseract_la_SOURCES += src/wordrec/language_model.cpp\nlibtesseract_la_SOURCES += src/wordrec/lm_consistency.cpp\nlibtesseract_la_SOURCES += src/wordrec/lm_pain_points.cpp\nlibtesseract_la_SOURCES += src/wordrec/lm_state.cpp\nlibtesseract_la_SOURCES += src/wordrec/outlines.cpp\nlibtesseract_la_SOURCES += src/wordrec/params_model.cpp\nlibtesseract_la_SOURCES += src/wordrec/pieces.cpp\nif !GRAPHICS_DISABLED\nlibtesseract_la_SOURCES += src/wordrec/plotedges.cpp\nendif\nlibtesseract_la_SOURCES += src/wordrec/render.cpp\nlibtesseract_la_SOURCES += src/wordrec/segsearch.cpp\nlibtesseract_la_SOURCES += src/wordrec/wordclass.cpp\nendif\n\n# Rules for tesseract executable.\n\nbin_PROGRAMS = tesseract\ntesseract_SOURCES = src/tesseract.cpp\ntesseract_CPPFLAGS = $(AM_CPPFLAGS)\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/arch\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/ccmain\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/ccstruct\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/ccutil\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/classify\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/cutil\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/dict\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/textord\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/viewer\ntesseract_CPPFLAGS += -I$(top_srcdir)/src/wordrec\n\ntesseract_LDFLAGS = $(OPENMP_CXXFLAGS)\n\ntesseract_LDADD = libtesseract.la\ntesseract_LDADD += $(LEPTONICA_LIBS)\ntesseract_LDADD += $(libarchive_LIBS)\ntesseract_LDADD += $(libcurl_LIBS)\n\nif T_WIN\ntesseract_LDADD += -ltiff\ntesseract_LDADD += -lws2_32\nendif\nif ADD_RT\ntesseract_LDADD += -lrt\nendif\n\n# Rules for training tools.\n\nif ENABLE_TRAINING\n\ntraining: 
$(trainingtools) | $(PROGRAMS)\n\ntraining-install: $(trainingtools)\n\tmkdir -p $(DESTDIR)$(bindir)\n\t$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install \\\n\t$(INSTALL) $(INSTALL_STRIP_FLAG) $(trainingtools) $(DESTDIR)$(bindir)\n\ntraining-uninstall:\n\n# Some unit tests use code from training.\ncheck: libtesseract_training.la\n\n# dawg_test runs dawg2wordlist and wordlist2dawg.\ncheck: dawg2wordlist wordlist2dawg\n\nelse\n\ntraining:\n\t@echo \"Need to reconfigure project, so there are no errors\"\n\nendif\n\nCLEANFILES += $(EXTRA_PROGRAMS)\n\ntraining_CPPFLAGS = $(AM_CPPFLAGS)\ntraining_CPPFLAGS += -DPANGO_ENABLE_ENGINE\ntraining_CPPFLAGS += -DTESS_COMMON_TRAINING_API=\ntraining_CPPFLAGS += -DTESS_PANGO_TRAINING_API=\ntraining_CPPFLAGS += -DTESS_UNICHARSET_TRAINING_API=\ntraining_CPPFLAGS += -I$(top_srcdir)/src/training\ntraining_CPPFLAGS += -I$(top_srcdir)/src/training/common\ntraining_CPPFLAGS += -I$(top_srcdir)/src/training/pango\ntraining_CPPFLAGS += -I$(top_srcdir)/src/training/unicharset\ntraining_CPPFLAGS += -I$(top_srcdir)/src/api\ntraining_CPPFLAGS += -I$(top_srcdir)/src/ccmain\ntraining_CPPFLAGS += -I$(top_srcdir)/src/ccutil\ntraining_CPPFLAGS += -I$(top_srcdir)/src/ccstruct\ntraining_CPPFLAGS += -I$(top_srcdir)/src/lstm\ntraining_CPPFLAGS += -I$(top_srcdir)/src/arch\ntraining_CPPFLAGS += -I$(top_srcdir)/src/viewer\ntraining_CPPFLAGS += -I$(top_srcdir)/src/textord\ntraining_CPPFLAGS += -I$(top_srcdir)/src/dict\ntraining_CPPFLAGS += -I$(top_srcdir)/src/classify\ntraining_CPPFLAGS += -I$(top_srcdir)/src/wordrec\ntraining_CPPFLAGS += -I$(top_srcdir)/src/cutil\ntraining_CPPFLAGS += $(ICU_UC_CFLAGS) $(ICU_I18N_CFLAGS)\ntraining_CPPFLAGS += $(pango_CFLAGS)\ntraining_CPPFLAGS += $(cairo_CFLAGS)\n\nif DISABLED_LEGACY_ENGINE\ntraining_CPPFLAGS += -DDISABLED_LEGACY_ENGINE\nendif\n\n# TODO: training programs cannot be linked to shared library created\n# with -fvisibility\nif VISIBILITY\nAM_LDFLAGS += -all-static\nendif\n\nnoinst_HEADERS += 
src/training/pango/boxchar.h\nnoinst_HEADERS += src/training/common/commandlineflags.h\nnoinst_HEADERS += src/training/common/commontraining.h\nnoinst_HEADERS += src/training/common/ctc.h\nnoinst_HEADERS += src/training/common/networkbuilder.h\nnoinst_HEADERS += src/training/degradeimage.h\nnoinst_HEADERS += src/training/pango/ligature_table.h\nnoinst_HEADERS += src/training/pango/pango_font_info.h\nnoinst_HEADERS += src/training/pango/stringrenderer.h\nnoinst_HEADERS += src/training/pango/tlog.h\nnoinst_HEADERS += src/training/unicharset/icuerrorcode.h\nnoinst_HEADERS += src/training/unicharset/fileio.h\nnoinst_HEADERS += src/training/unicharset/lang_model_helpers.h\nnoinst_HEADERS += src/training/unicharset/lstmtester.h\nnoinst_HEADERS += src/training/unicharset/lstmtrainer.h\nnoinst_HEADERS += src/training/unicharset/normstrngs.h\nnoinst_HEADERS += src/training/unicharset/unicharset_training_utils.h\nnoinst_HEADERS += src/training/unicharset/validate_grapheme.h\nnoinst_HEADERS += src/training/unicharset/validate_indic.h\nnoinst_HEADERS += src/training/unicharset/validate_javanese.h\nnoinst_HEADERS += src/training/unicharset/validate_khmer.h\nnoinst_HEADERS += src/training/unicharset/validate_myanmar.h\nnoinst_HEADERS += src/training/unicharset/validator.h\nif !DISABLED_LEGACY_ENGINE\nnoinst_HEADERS += src/training/common/errorcounter.h\nnoinst_HEADERS += src/training/common/intfeaturedist.h\nnoinst_HEADERS += src/training/common/intfeaturemap.h\nnoinst_HEADERS += src/training/common/mastertrainer.h\nnoinst_HEADERS += src/training/common/sampleiterator.h\nnoinst_HEADERS += src/training/common/trainingsampleset.h\nnoinst_HEADERS += src/training/mergenf.h\nendif\n\nCLEANFILES += libtesseract_training.la\n\nEXTRA_LTLIBRARIES = libtesseract_training.la\n\nlibtesseract_training_la_CPPFLAGS = $(training_CPPFLAGS)\nlibtesseract_training_la_SOURCES = src/training/pango/boxchar.cpp\nlibtesseract_training_la_SOURCES += 
src/training/common/commandlineflags.cpp\nlibtesseract_training_la_SOURCES += src/training/common/commontraining.cpp\nlibtesseract_training_la_SOURCES += src/training/common/ctc.cpp\nlibtesseract_training_la_SOURCES += src/training/common/networkbuilder.cpp\nlibtesseract_training_la_SOURCES += src/training/degradeimage.cpp\nlibtesseract_training_la_SOURCES += src/training/pango/ligature_table.cpp\nlibtesseract_training_la_SOURCES += src/training/pango/pango_font_info.cpp\nlibtesseract_training_la_SOURCES += src/training/pango/stringrenderer.cpp\nlibtesseract_training_la_SOURCES += src/training/pango/tlog.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/icuerrorcode.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/fileio.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/lang_model_helpers.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/lstmtester.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/lstmtrainer.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/normstrngs.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/unicharset_training_utils.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/validate_grapheme.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/validate_indic.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/validate_javanese.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/validate_khmer.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/validate_myanmar.cpp\nlibtesseract_training_la_SOURCES += src/training/unicharset/validator.cpp\nif !DISABLED_LEGACY_ENGINE\nlibtesseract_training_la_SOURCES += src/training/common/errorcounter.cpp\nlibtesseract_training_la_SOURCES += src/training/common/intfeaturedist.cpp\nlibtesseract_training_la_SOURCES += src/training/common/intfeaturemap.cpp\nlibtesseract_training_la_SOURCES += 
src/training/common/mastertrainer.cpp\nlibtesseract_training_la_SOURCES += src/training/common/sampleiterator.cpp\nlibtesseract_training_la_SOURCES += src/training/common/trainingsampleset.cpp\nendif\n\ntrainingtools = combine_lang_model$(EXEEXT)\ntrainingtools += combine_tessdata$(EXEEXT)\ntrainingtools += dawg2wordlist$(EXEEXT)\ntrainingtools += lstmeval$(EXEEXT)\ntrainingtools += lstmtraining$(EXEEXT)\ntrainingtools += merge_unicharsets$(EXEEXT)\ntrainingtools += set_unicharset_properties$(EXEEXT)\ntrainingtools += text2image$(EXEEXT)\ntrainingtools += unicharset_extractor$(EXEEXT)\ntrainingtools += wordlist2dawg$(EXEEXT)\nif !DISABLED_LEGACY_ENGINE\ntrainingtools += ambiguous_words$(EXEEXT)\ntrainingtools += classifier_tester$(EXEEXT)\ntrainingtools += cntraining$(EXEEXT)\ntrainingtools += mftraining$(EXEEXT)\ntrainingtools += shapeclustering$(EXEEXT)\nendif\n\n$(trainingtools): libtesseract.la\n\nEXTRA_PROGRAMS += $(trainingtools)\n\nextralib = libtesseract.la\nextralib += $(libarchive_LIBS)\nextralib += $(LEPTONICA_LIBS)\nif T_WIN\nextralib += -lws2_32\nendif\n\nif !DISABLED_LEGACY_ENGINE\nambiguous_words_CPPFLAGS = $(training_CPPFLAGS)\nambiguous_words_SOURCES = src/training/ambiguous_words.cpp\nambiguous_words_LDADD = libtesseract_training.la\nambiguous_words_LDADD += $(extralib)\n\nclassifier_tester_CPPFLAGS = $(training_CPPFLAGS)\nclassifier_tester_SOURCES = src/training/classifier_tester.cpp\nclassifier_tester_LDADD = libtesseract_training.la\nclassifier_tester_LDADD += $(extralib)\n\ncntraining_CPPFLAGS = $(training_CPPFLAGS)\ncntraining_SOURCES = src/training/cntraining.cpp\ncntraining_LDADD = libtesseract_training.la\ncntraining_LDADD += $(extralib)\n\nmftraining_CPPFLAGS = $(training_CPPFLAGS)\nmftraining_SOURCES = src/training/mftraining.cpp src/training/mergenf.cpp\nmftraining_LDADD = libtesseract_training.la\nmftraining_LDADD += $(ICU_UC_LIBS)\nmftraining_LDADD += $(extralib)\n\nshapeclustering_CPPFLAGS = 
$(training_CPPFLAGS)\nshapeclustering_SOURCES = src/training/shapeclustering.cpp\nshapeclustering_LDADD = libtesseract_training.la\nshapeclustering_LDADD += $(extralib)\nendif\n\ncombine_lang_model_CPPFLAGS = $(training_CPPFLAGS)\ncombine_lang_model_SOURCES = src/training/combine_lang_model.cpp\ncombine_lang_model_LDADD = libtesseract_training.la\ncombine_lang_model_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\ncombine_lang_model_LDADD += $(extralib)\n\ncombine_tessdata_CPPFLAGS = $(training_CPPFLAGS)\ncombine_tessdata_SOURCES = src/training/combine_tessdata.cpp\ncombine_tessdata_LDADD = $(extralib)\n\ndawg2wordlist_CPPFLAGS = $(training_CPPFLAGS)\ndawg2wordlist_SOURCES = src/training/dawg2wordlist.cpp\ndawg2wordlist_LDADD = $(extralib)\n\nlstmeval_CPPFLAGS = $(training_CPPFLAGS)\nlstmeval_SOURCES = src/training/lstmeval.cpp\nlstmeval_LDADD = libtesseract_training.la\nlstmeval_LDADD += $(ICU_UC_LIBS)\nlstmeval_LDADD += $(extralib)\n\nlstmtraining_CPPFLAGS = $(training_CPPFLAGS)\nlstmtraining_SOURCES = src/training/lstmtraining.cpp\nlstmtraining_LDADD = libtesseract_training.la\nlstmtraining_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\nlstmtraining_LDADD += $(extralib)\n\nmerge_unicharsets_CPPFLAGS = $(training_CPPFLAGS)\nmerge_unicharsets_SOURCES = src/training/merge_unicharsets.cpp\nmerge_unicharsets_LDADD = $(extralib)\n\nset_unicharset_properties_CPPFLAGS = $(training_CPPFLAGS)\nset_unicharset_properties_SOURCES = src/training/set_unicharset_properties.cpp\nset_unicharset_properties_LDADD = libtesseract_training.la\nset_unicharset_properties_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\nset_unicharset_properties_LDADD += $(extralib)\n\ntext2image_CPPFLAGS = $(training_CPPFLAGS)\ntext2image_SOURCES = src/training/text2image.cpp\ntext2image_LDADD = libtesseract_training.la\ntext2image_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\ntext2image_LDADD += $(extralib)\ntext2image_LDADD += $(ICU_UC_LIBS) $(cairo_LIBS)\ntext2image_LDADD += $(pango_LIBS) $(pangocairo_LIBS) 
$(pangoft2_LIBS)\n\nunicharset_extractor_CPPFLAGS = $(training_CPPFLAGS)\nunicharset_extractor_SOURCES = src/training/unicharset_extractor.cpp\nunicharset_extractor_LDADD = libtesseract_training.la\nunicharset_extractor_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\nunicharset_extractor_LDADD += $(extralib)\n\nwordlist2dawg_CPPFLAGS = $(training_CPPFLAGS)\nwordlist2dawg_SOURCES = src/training/wordlist2dawg.cpp\nwordlist2dawg_LDADD = $(extralib)\n\n# fuzzer-api is used for fuzzing tests.\n# They are run by OSS-Fuzz https://oss-fuzz.com/, but can also be run locally.\n# Note: -fsanitize=fuzzer currently requires the clang++ compiler.\n\n# LIB_FUZZING_ENGINE can be overridden by the caller.\n# This is used by OSS-Fuzz.\nLIB_FUZZING_ENGINE ?= -fsanitize=fuzzer\n\nfuzzer-api: libtesseract.la\nfuzzer-api: unittest/fuzzers/fuzzer-api.cpp\n\t$(CXX) $(CXXFLAGS) -g $(LIB_FUZZING_ENGINE) \\\n          -I $(top_srcdir)/include \\\n          -I $(builddir)/include \\\n          -I $(top_srcdir)/src/ccmain \\\n          -I $(top_srcdir)/src/ccstruct \\\n          -I $(top_srcdir)/src/ccutil \\\n          $(LEPTONICA_CFLAGS) \\\n          $(OPENMP_CXXFLAGS) \\\n          $< \\\n          $(builddir)/.libs/libtesseract.a \\\n          $(LEPTONICA_LIBS) \\\n          $(libarchive_LIBS) \\\n          $(libcurl_LIBS) \\\n          -o $@\n\nfuzzer-api-512x256: libtesseract.la\nfuzzer-api-512x256: unittest/fuzzers/fuzzer-api.cpp\n\t$(CXX) $(CXXFLAGS) -g $(LIB_FUZZING_ENGINE) \\\n          -DTESSERACT_FUZZER_WIDTH=512 \\\n          -DTESSERACT_FUZZER_HEIGHT=256 \\\n          -I $(top_srcdir)/include \\\n          -I $(builddir)/include \\\n          -I $(top_srcdir)/src/ccmain \\\n          -I $(top_srcdir)/src/ccstruct \\\n          -I $(top_srcdir)/src/ccutil \\\n          $(LEPTONICA_CFLAGS) \\\n          $(OPENMP_CXXFLAGS) \\\n          $< \\\n          $(builddir)/.libs/libtesseract.a \\\n          $(LEPTONICA_LIBS) \\\n          $(libarchive_LIBS) \\\n          $(libcurl_LIBS) \\\n     
     -o $@\n\nCLEANFILES += fuzzer-api fuzzer-api-512x256\n\nif ASCIIDOC\n\nman_MANS = doc/combine_lang_model.1\nman_MANS += doc/combine_tessdata.1\nman_MANS += doc/dawg2wordlist.1\nman_MANS += doc/lstmeval.1\nman_MANS += doc/lstmtraining.1\nman_MANS += doc/merge_unicharsets.1\nman_MANS += doc/set_unicharset_properties.1\nman_MANS += doc/tesseract.1\nman_MANS += doc/text2image.1\nman_MANS += doc/unicharset.5\nman_MANS += doc/unicharset_extractor.1\nman_MANS += doc/wordlist2dawg.1\n\nif !DISABLED_LEGACY_ENGINE\nman_MANS += doc/ambiguous_words.1\nman_MANS += doc/classifier_tester.1\nman_MANS += doc/cntraining.1\nman_MANS += doc/mftraining.1\nman_MANS += doc/shapeclustering.1\nman_MANS += doc/unicharambigs.5\nendif\n\nman_xslt = http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl\n\nEXTRA_DIST += $(man_MANS) doc/Doxyfile\n\nhtml: ${man_MANS:%=%.html}\npdf: ${man_MANS:%=%.pdf}\n\nSUFFIXES = .asc .html .pdf\n\n.asc:\nif HAVE_XML_CATALOG_FILES\n\tasciidoc -b docbook -d manpage -o - $< | \\\n\tXML_CATALOG_FILES=$(XML_CATALOG_FILES) xsltproc --nonet -o $@ $(man_xslt) -\nelse\n\tasciidoc -b docbook -d manpage -o - $< | \\\n\txsltproc --nonet -o $@ $(man_xslt) -\nendif\n\n.asc.html:\n\tasciidoc -b html5 -o $@ $<\n\n.asc.pdf:\n\tasciidoc -b docbook -d manpage -o $*.dbk $<\n\tdocbook2pdf -o doc $*.dbk\n\nMAINTAINERCLEANFILES = $(man_MANS) Doxyfile\n\nendif\n\n# Absolute path of directory 'langdata'.\nLANGDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/langdata_lstm\n\n# Absolute path of directory 'tessdata' with traineddata files\n# (must be on same level as top source directory).\nTESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. 
&& pwd)/tessdata\n\n# Absolute path of directory 'testing' with test images and ground truth texts\n# (using submodule test).\nTESTING_DIR=$(shell cd $(top_srcdir) && pwd)/test/testing\n# Absolute path of directory 'testdata' with test unicharset etc.\n# (using submodule test).\nTESTDATA_DIR=$(shell cd $(top_srcdir) && pwd)/test/testdata\n\n# Suppress some memory leaks reported by LeakSanitizer.\nexport LSAN_OPTIONS=suppressions=$(top_srcdir)/unittest/tesseract_leaksanitizer.supp\n\nunittest_CPPFLAGS = $(AM_CPPFLAGS)\nunittest_CPPFLAGS += -DTESSBIN_DIR=\"\\\"$(abs_top_builddir)\\\"\"\nunittest_CPPFLAGS += -DLANGDATA_DIR=\"\\\"$(LANGDATA_DIR)\\\"\"\nunittest_CPPFLAGS += -DTESSDATA_DIR=\"\\\"$(TESSDATA_DIR)\\\"\"\nunittest_CPPFLAGS += -DTESTING_DIR=\"\\\"$(TESTING_DIR)\\\"\"\nunittest_CPPFLAGS += -DTESTDATA_DIR=\"\\\"$(TESTDATA_DIR)\\\"\"\nunittest_CPPFLAGS += -DPANGO_ENABLE_ENGINE\nif DISABLED_LEGACY_ENGINE\nunittest_CPPFLAGS += -DDISABLED_LEGACY_ENGINE\nendif # DISABLED_LEGACY_ENGINE\nunittest_CPPFLAGS += -DTESS_COMMON_TRAINING_API=\nunittest_CPPFLAGS += -DTESS_PANGO_TRAINING_API=\nunittest_CPPFLAGS += -DTESS_UNICHARSET_TRAINING_API=\nunittest_CPPFLAGS += -I$(top_srcdir)/src/arch\nunittest_CPPFLAGS += -I$(top_srcdir)/src/ccmain\nunittest_CPPFLAGS += -I$(top_srcdir)/src/ccstruct\nunittest_CPPFLAGS += -I$(top_srcdir)/src/ccutil\nunittest_CPPFLAGS += -I$(top_srcdir)/src/classify\nunittest_CPPFLAGS += -I$(top_srcdir)/src/cutil\nunittest_CPPFLAGS += -I$(top_srcdir)/src/dict\nunittest_CPPFLAGS += -I$(top_srcdir)/src/display\nunittest_CPPFLAGS += -I$(top_srcdir)/src/lstm\nunittest_CPPFLAGS += -I$(top_srcdir)/src/textord\nunittest_CPPFLAGS += -I$(top_srcdir)/unittest/base\nunittest_CPPFLAGS += -I$(top_srcdir)/unittest/util\nunittest_CPPFLAGS += $(LEPTONICA_CFLAGS)\nif ENABLE_TRAINING\nunittest_CPPFLAGS += -I$(top_srcdir)/src/training\nunittest_CPPFLAGS += -I$(top_srcdir)/src/training/common\nunittest_CPPFLAGS += -I$(top_srcdir)/src/training/pango\nunittest_CPPFLAGS += 
-I$(top_srcdir)/src/training/unicharset\nunittest_CPPFLAGS += $(pangocairo_CFLAGS)\nendif # ENABLE_TRAINING\nunittest_CPPFLAGS += -I$(top_srcdir)/src/viewer\nunittest_CPPFLAGS += -I$(top_srcdir)/src/wordrec\nunittest_CPPFLAGS += -I$(top_srcdir)/unittest\n\n# Build googletest:\ncheck_LTLIBRARIES = libgtest.la libgtest_main.la libgmock.la libgmock_main.la\nlibgtest_la_SOURCES = unittest/third_party/googletest/googletest/src/gtest-all.cc\nlibgtest_la_CPPFLAGS = -I$(top_srcdir)/unittest/third_party/googletest/googletest/include\nlibgtest_la_CPPFLAGS += -I$(top_srcdir)/unittest/third_party/googletest/googletest\nlibgtest_la_CPPFLAGS += -pthread\nlibgtest_main_la_SOURCES = unittest/third_party/googletest/googletest/src/gtest_main.cc\nlibgtest_main_la_CPPFLAGS = $(libgtest_la_CPPFLAGS)\n\nGMOCK_INCLUDES = -I$(top_srcdir)/unittest/third_party/googletest/googlemock/include \\\n                 -I$(top_srcdir)/unittest/third_party/googletest/googlemock \\\n                 -I$(top_srcdir)/unittest/third_party/googletest/googletest/include \\\n                 -I$(top_srcdir)/unittest/third_party/googletest/googletest\n\nlibgmock_la_SOURCES = unittest/third_party/googletest/googlemock/src/gmock-all.cc\nlibgmock_la_CPPFLAGS = $(GMOCK_INCLUDES) \\\n                       -pthread\nlibgmock_main_la_SOURCES = unittest/third_party/googletest/googlemock/src/gmock_main.cc\nlibgmock_main_la_CPPFLAGS = $(GMOCK_INCLUDES) \\\n                            -pthread\n\n# Build unittests\nGTEST_LIBS =  libgtest.la libgtest_main.la -lpthread\nGMOCK_LIBS =  libgmock.la libgmock_main.la\nTESS_LIBS = $(GTEST_LIBS)\nTESS_LIBS += libtesseract.la $(libarchive_LIBS)\nTRAINING_LIBS = libtesseract_training.la\nTRAINING_LIBS += $(TESS_LIBS)\nunittest_CPPFLAGS += -isystem $(top_srcdir)/unittest/third_party/googletest/googletest/include\nunittest_CPPFLAGS += -isystem $(top_srcdir)/unittest/third_party/googletest/googlemock/include\n\ncheck_PROGRAMS = apiexample_test\nif ENABLE_TRAINING\nif 
!DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += applybox_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += baseapi_test\ncheck_PROGRAMS += baseapi_thread_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += bitvector_test\nendif # !DISABLED_LEGACY_ENGINE\nendif # ENABLE_TRAINING\ncheck_PROGRAMS += cleanapi_test\ncheck_PROGRAMS += colpartition_test\nif ENABLE_TRAINING\ncheck_PROGRAMS += commandlineflags_test\ncheck_PROGRAMS += dawg_test\nendif # ENABLE_TRAINING\ncheck_PROGRAMS += denorm_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += equationdetect_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += fileio_test\ncheck_PROGRAMS += heap_test\ncheck_PROGRAMS += imagedata_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += indexmapbidi_test\ncheck_PROGRAMS += intfeaturemap_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += intsimdmatrix_test\ncheck_PROGRAMS += lang_model_test\ncheck_PROGRAMS += layout_test\ncheck_PROGRAMS += ligature_table_test\ncheck_PROGRAMS += linlsq_test\ncheck_PROGRAMS += list_test\nif ENABLE_TRAINING\ncheck_PROGRAMS += lstm_recode_test\ncheck_PROGRAMS += lstm_squashed_test\ncheck_PROGRAMS += lstm_test\ncheck_PROGRAMS += lstmtrainer_test\nendif # ENABLE_TRAINING\ncheck_PROGRAMS += loadlang_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += mastertrainer_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += matrix_test\ncheck_PROGRAMS += networkio_test\nif ENABLE_TRAINING\ncheck_PROGRAMS += normstrngs_test\nendif # ENABLE_TRAINING\ncheck_PROGRAMS += nthitem_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += osd_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += pagesegmode_test\nif ENABLE_TRAINING\ncheck_PROGRAMS += pango_font_info_test\nendif # ENABLE_TRAINING\ncheck_PROGRAMS += paragraphs_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += params_model_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += progress_test\ncheck_PROGRAMS += qrsequence_test\ncheck_PROGRAMS += recodebeam_test\ncheck_PROGRAMS += 
rect_test\ncheck_PROGRAMS += resultiterator_test\ncheck_PROGRAMS += scanutils_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += shapetable_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += stats_test\ncheck_PROGRAMS += stridemap_test\ncheck_PROGRAMS += stringrenderer_test\ncheck_PROGRAMS += tablefind_test\ncheck_PROGRAMS += tablerecog_test\ncheck_PROGRAMS += tabvector_test\ncheck_PROGRAMS += tatweel_test\nif !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += textlineprojection_test\nendif # !DISABLED_LEGACY_ENGINE\ncheck_PROGRAMS += tfile_test\nif ENABLE_TRAINING\ncheck_PROGRAMS += unichar_test\ncheck_PROGRAMS += unicharcompress_test\ncheck_PROGRAMS += unicharset_test\ncheck_PROGRAMS += validate_grapheme_test\ncheck_PROGRAMS += validate_indic_test\ncheck_PROGRAMS += validate_khmer_test\ncheck_PROGRAMS += validate_myanmar_test\ncheck_PROGRAMS += validator_test\nendif # ENABLE_TRAINING\n\ncheck_PROGRAMS: libtesseract.la libtesseract_training.la\n\nTESTS = $(check_PROGRAMS)\n\n# List of source files needed to build the executable:\n\napiexample_test_SOURCES = unittest/apiexample_test.cc\napiexample_test_CPPFLAGS = $(unittest_CPPFLAGS)\napiexample_test_LDFLAGS = $(LEPTONICA_LIBS)\napiexample_test_LDADD = $(TESS_LIBS) $(LEPTONICA_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\napplybox_test_SOURCES = unittest/applybox_test.cc\napplybox_test_CPPFLAGS = $(unittest_CPPFLAGS)\napplybox_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\nbaseapi_test_SOURCES = unittest/baseapi_test.cc\nbaseapi_test_CPPFLAGS = $(unittest_CPPFLAGS)\nbaseapi_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\n\nbaseapi_thread_test_SOURCES = unittest/baseapi_thread_test.cc\nbaseapi_thread_test_CPPFLAGS = $(unittest_CPPFLAGS)\nbaseapi_thread_test_LDADD = $(TESS_LIBS) $(LEPTONICA_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\nbitvector_test_SOURCES = unittest/bitvector_test.cc\nbitvector_test_CPPFLAGS = $(unittest_CPPFLAGS)\nbitvector_test_LDADD = $(TRAINING_LIBS)\nendif # 
!DISABLED_LEGACY_ENGINE\n\ncleanapi_test_SOURCES = unittest/cleanapi_test.cc\ncleanapi_test_CPPFLAGS = $(unittest_CPPFLAGS)\ncleanapi_test_LDADD = $(TESS_LIBS)\n\ncolpartition_test_SOURCES = unittest/colpartition_test.cc\ncolpartition_test_CPPFLAGS = $(unittest_CPPFLAGS)\ncolpartition_test_LDADD = $(TESS_LIBS)\n\ncommandlineflags_test_SOURCES = unittest/commandlineflags_test.cc\ncommandlineflags_test_CPPFLAGS = $(unittest_CPPFLAGS)\ncommandlineflags_test_LDADD = $(TRAINING_LIBS) $(ICU_UC_LIBS)\n\ndawg_test_SOURCES = unittest/dawg_test.cc\ndawg_test_CPPFLAGS = $(unittest_CPPFLAGS)\ndawg_test_LDADD = $(TRAINING_LIBS)\n\ndenorm_test_SOURCES = unittest/denorm_test.cc\ndenorm_test_CPPFLAGS = $(unittest_CPPFLAGS)\ndenorm_test_LDADD = $(TESS_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\nequationdetect_test_SOURCES = unittest/equationdetect_test.cc\nequationdetect_test_CPPFLAGS = $(unittest_CPPFLAGS)\nequationdetect_test_LDADD = $(TESS_LIBS) $(LEPTONICA_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\nfileio_test_SOURCES = unittest/fileio_test.cc\nfileio_test_CPPFLAGS = $(unittest_CPPFLAGS)\nfileio_test_LDADD = $(TRAINING_LIBS)\n\nheap_test_SOURCES = unittest/heap_test.cc\nheap_test_CPPFLAGS = $(unittest_CPPFLAGS)\nheap_test_LDADD = $(TESS_LIBS)\n\nimagedata_test_SOURCES = unittest/imagedata_test.cc\nimagedata_test_CPPFLAGS = $(unittest_CPPFLAGS)\nimagedata_test_LDADD = $(TRAINING_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\nindexmapbidi_test_SOURCES = unittest/indexmapbidi_test.cc\nindexmapbidi_test_CPPFLAGS = $(unittest_CPPFLAGS)\nindexmapbidi_test_LDADD = $(TRAINING_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\nif !DISABLED_LEGACY_ENGINE\nintfeaturemap_test_SOURCES = unittest/intfeaturemap_test.cc\nintfeaturemap_test_CPPFLAGS = $(unittest_CPPFLAGS)\nintfeaturemap_test_LDADD = $(TRAINING_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\nintsimdmatrix_test_SOURCES = unittest/intsimdmatrix_test.cc\nintsimdmatrix_test_CPPFLAGS = $(unittest_CPPFLAGS)\nif HAVE_AVX2\nintsimdmatrix_test_CPPFLAGS += 
-DHAVE_AVX2\nendif\nif HAVE_SSE4_1\nintsimdmatrix_test_CPPFLAGS += -DHAVE_SSE4_1\nendif\nintsimdmatrix_test_LDADD = $(TESS_LIBS)\n\nlang_model_test_SOURCES = unittest/lang_model_test.cc\nlang_model_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlang_model_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nlayout_test_SOURCES = unittest/layout_test.cc\nlayout_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlayout_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\n\nligature_table_test_SOURCES = unittest/ligature_table_test.cc\nligature_table_test_CPPFLAGS = $(unittest_CPPFLAGS)\nligature_table_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\nligature_table_test_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\nligature_table_test_LDADD += $(pangocairo_LIBS) $(pangoft2_LIBS)\nligature_table_test_LDADD += $(cairo_LIBS) $(pango_LIBS)\n\nlinlsq_test_SOURCES = unittest/linlsq_test.cc\nlinlsq_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlinlsq_test_LDADD = $(TESS_LIBS)\n\nlist_test_SOURCES = unittest/list_test.cc\nlist_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlist_test_LDADD = $(TESS_LIBS)\n\nloadlang_test_SOURCES = unittest/loadlang_test.cc\nloadlang_test_CPPFLAGS = $(unittest_CPPFLAGS)\nloadlang_test_LDADD = $(TESS_LIBS) $(LEPTONICA_LIBS)\n\nlstm_recode_test_SOURCES = unittest/lstm_recode_test.cc\nlstm_recode_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlstm_recode_test_LDADD = $(TRAINING_LIBS)\n\nlstm_squashed_test_SOURCES = unittest/lstm_squashed_test.cc\nlstm_squashed_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlstm_squashed_test_LDADD = $(TRAINING_LIBS)\n\nlstm_test_SOURCES = unittest/lstm_test.cc\nlstm_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlstm_test_LDADD = $(TRAINING_LIBS)\n\nlstmtrainer_test_SOURCES = unittest/lstmtrainer_test.cc\nlstmtrainer_test_CPPFLAGS = $(unittest_CPPFLAGS)\nlstmtrainer_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\nmastertrainer_test_SOURCES = unittest/mastertrainer_test.cc\nmastertrainer_test_CPPFLAGS = 
$(unittest_CPPFLAGS)\nmastertrainer_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\nmatrix_test_SOURCES = unittest/matrix_test.cc\nmatrix_test_CPPFLAGS = $(unittest_CPPFLAGS)\nmatrix_test_LDADD = $(TESS_LIBS)\n\nnetworkio_test_SOURCES = unittest/networkio_test.cc\nnetworkio_test_CPPFLAGS = $(unittest_CPPFLAGS)\nnetworkio_test_LDADD = $(TESS_LIBS)\n\nnormstrngs_test_SOURCES = unittest/normstrngs_test.cc\nnormstrngs_test_CPPFLAGS = $(unittest_CPPFLAGS)\nnormstrngs_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nnthitem_test_SOURCES = unittest/nthitem_test.cc\nnthitem_test_CPPFLAGS = $(unittest_CPPFLAGS)\nnthitem_test_LDADD = $(TESS_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\nosd_test_SOURCES = unittest/osd_test.cc\nosd_test_CPPFLAGS = $(unittest_CPPFLAGS)\nosd_test_LDADD = $(TESS_LIBS) $(LEPTONICA_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\npagesegmode_test_SOURCES = unittest/pagesegmode_test.cc\npagesegmode_test_CPPFLAGS = $(unittest_CPPFLAGS)\npagesegmode_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\n\npango_font_info_test_SOURCES = unittest/pango_font_info_test.cc\npango_font_info_test_CPPFLAGS = $(unittest_CPPFLAGS)\npango_font_info_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\npango_font_info_test_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\npango_font_info_test_LDADD += $(pangocairo_LIBS)\npango_font_info_test_LDADD += $(pangoft2_LIBS)\n\nparagraphs_test_SOURCES = unittest/paragraphs_test.cc\nparagraphs_test_CPPFLAGS = $(unittest_CPPFLAGS)\nparagraphs_test_LDADD = $(TESS_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\nparams_model_test_SOURCES = unittest/params_model_test.cc\nparams_model_test_CPPFLAGS = $(unittest_CPPFLAGS)\nparams_model_test_LDADD = $(TRAINING_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\nprogress_test_SOURCES = unittest/progress_test.cc\nprogress_test_CPPFLAGS = $(unittest_CPPFLAGS)\nprogress_test_LDFLAGS = $(LEPTONICA_LIBS)\nprogress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) 
$(LEPTONICA_LIBS)\n\nqrsequence_test_SOURCES = unittest/qrsequence_test.cc\nqrsequence_test_CPPFLAGS = $(unittest_CPPFLAGS)\nqrsequence_test_LDADD = $(TESS_LIBS)\n\nrecodebeam_test_SOURCES = unittest/recodebeam_test.cc\nrecodebeam_test_CPPFLAGS = $(unittest_CPPFLAGS)\nrecodebeam_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nrect_test_SOURCES = unittest/rect_test.cc\nrect_test_CPPFLAGS = $(unittest_CPPFLAGS)\nrect_test_LDADD = $(TESS_LIBS)\n\nresultiterator_test_SOURCES = unittest/resultiterator_test.cc\nresultiterator_test_CPPFLAGS = $(unittest_CPPFLAGS)\nresultiterator_test_LDADD = $(TRAINING_LIBS)\nresultiterator_test_LDADD += $(LEPTONICA_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nscanutils_test_SOURCES = unittest/scanutils_test.cc\nscanutils_test_CPPFLAGS = $(unittest_CPPFLAGS)\nscanutils_test_LDADD = $(TRAINING_LIBS)\n\nif !DISABLED_LEGACY_ENGINE\nshapetable_test_SOURCES = unittest/shapetable_test.cc\nshapetable_test_CPPFLAGS = $(unittest_CPPFLAGS)\nshapetable_test_LDADD = $(TRAINING_LIBS)\nendif # !DISABLED_LEGACY_ENGINE\n\nstats_test_SOURCES = unittest/stats_test.cc\nstats_test_CPPFLAGS = $(unittest_CPPFLAGS)\nstats_test_LDADD = $(TESS_LIBS)\n\nstridemap_test_SOURCES = unittest/stridemap_test.cc\nstridemap_test_CPPFLAGS = $(unittest_CPPFLAGS)\nstridemap_test_LDADD = $(TESS_LIBS)\n\nstringrenderer_test_SOURCES = unittest/stringrenderer_test.cc\nstringrenderer_test_CPPFLAGS = $(unittest_CPPFLAGS)\nstringrenderer_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\nstringrenderer_test_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\nstringrenderer_test_LDADD += $(pangocairo_LIBS) $(pangoft2_LIBS)\nstringrenderer_test_LDADD += $(cairo_LIBS) $(pango_LIBS)\n\ntablefind_test_SOURCES = unittest/tablefind_test.cc\ntablefind_test_CPPFLAGS = $(unittest_CPPFLAGS)\ntablefind_test_LDADD = $(TESS_LIBS)\n\ntablerecog_test_SOURCES = unittest/tablerecog_test.cc\ntablerecog_test_CPPFLAGS = $(unittest_CPPFLAGS)\ntablerecog_test_LDADD = 
$(TESS_LIBS)\n\ntabvector_test_SOURCES = unittest/tabvector_test.cc\ntabvector_test_CPPFLAGS = $(unittest_CPPFLAGS)\ntabvector_test_LDADD = $(TESS_LIBS)\n\ntatweel_test_SOURCES = unittest/tatweel_test.cc\ntatweel_test_SOURCES += unittest/third_party/utf/rune.c\ntatweel_test_SOURCES += unittest/util/utf8/unicodetext.cc\ntatweel_test_SOURCES += unittest/util/utf8/unilib.cc\ntatweel_test_CPPFLAGS = $(unittest_CPPFLAGS)\ntatweel_test_LDADD = $(TRAINING_LIBS)\n\ntextlineprojection_test_SOURCES = unittest/textlineprojection_test.cc\ntextlineprojection_test_CPPFLAGS = $(unittest_CPPFLAGS)\ntextlineprojection_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS)\n\ntfile_test_SOURCES = unittest/tfile_test.cc\ntfile_test_CPPFLAGS = $(unittest_CPPFLAGS)\ntfile_test_LDADD = $(TESS_LIBS)\n\nunichar_test_SOURCES = unittest/unichar_test.cc\nunichar_test_CPPFLAGS = $(unittest_CPPFLAGS)\nunichar_test_LDADD = $(TRAINING_LIBS) $(ICU_UC_LIBS)\n\nunicharcompress_test_SOURCES = unittest/unicharcompress_test.cc\nunicharcompress_test_CPPFLAGS = $(unittest_CPPFLAGS)\nunicharcompress_test_LDADD = $(TRAINING_LIBS) $(ICU_UC_LIBS)\n\nunicharset_test_SOURCES = unittest/unicharset_test.cc\nunicharset_test_CPPFLAGS = $(unittest_CPPFLAGS)\nunicharset_test_LDADD = $(TRAINING_LIBS) $(ICU_UC_LIBS)\n\nvalidate_grapheme_test_SOURCES = unittest/validate_grapheme_test.cc\nvalidate_grapheme_test_CPPFLAGS = $(unittest_CPPFLAGS)\nvalidate_grapheme_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nvalidate_indic_test_SOURCES = unittest/validate_indic_test.cc\nvalidate_indic_test_CPPFLAGS = $(unittest_CPPFLAGS)\nvalidate_indic_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nvalidate_khmer_test_SOURCES = unittest/validate_khmer_test.cc\nvalidate_khmer_test_CPPFLAGS = $(unittest_CPPFLAGS)\nvalidate_khmer_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nvalidate_myanmar_test_SOURCES = unittest/validate_myanmar_test.cc\nvalidate_myanmar_test_CPPFLAGS = 
$(unittest_CPPFLAGS)\nvalidate_myanmar_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)\n\nvalidator_test_SOURCES = unittest/validator_test.cc\nvalidator_test_CPPFLAGS = $(unittest_CPPFLAGS)\nvalidator_test_LDADD = $(TRAINING_LIBS) $(ICU_UC_LIBS)\n\n# for windows\nif T_WIN\napiexample_test_LDADD += -lws2_32\nintsimdmatrix_test_LDADD += -lws2_32\nmatrix_test_LDADD += -lws2_32\nif !DISABLED_LEGACY_ENGINE\nosd_test_LDADD += -lws2_32\nendif # !DISABLED_LEGACY_ENGINE\nloadlang_test_LDADD += -lws2_32\nendif\n\nEXTRA_apiexample_test_DEPENDENCIES = $(abs_top_builddir)/test/testing/phototest.tif\nEXTRA_apiexample_test_DEPENDENCIES += $(abs_top_builddir)/test/testing/phototest.txt\n\n$(abs_top_builddir)/test/testing/phototest.tif:\n\tmkdir -p $(top_builddir)/test/testing\n\tln -s $(TESTING_DIR)/phototest.tif $(top_builddir)/test/testing/phototest.tif\n\n$(abs_top_builddir)/test/testing/phototest.txt:\n\tmkdir -p $(top_builddir)/test/testing\n\tln -s $(TESTING_DIR)/phototest.txt $(top_builddir)/test/testing/phototest.txt\n\n# Some tests require a local tmp directory.\n\n$(check_PROGRAMS): | tmp\n\ntmp:\n\tmkdir -p tmp\n\n# Some tests require a well defined set of the following font files.\n\nfonts = ae_Arab.ttf\nfonts += Arial_Bold_Italic.ttf\nfonts += DejaVuSans-ExtraLight.ttf\nfonts += Lohit-Hindi.ttf\nfonts += Times_New_Roman.ttf\nfonts += UnBatang.ttf\nfonts += Verdana.ttf\n\n# These tests depend on installed model files and fonts:\n#\n# apiexample_test baseapi_test lang_model_test layout_test\n# ligature_table_test loadlang_test lstm_recode_test lstm_squashed_test\n# lstm_test lstmtrainer_test mastertrainer_test osd_test\n# pagesegmode_test pango_font_info_test progress_test\n# recodebeam_test resultiterator_test stringrenderer_test\n# textlineprojection_test unicharcompress_test\n#\n# Instead of fine-tuned dependencies the following lines\n# simply require those dependencies for all tests.\n# That can be improved if necessary.\n\n$(check_PROGRAMS): | 
$(LANGDATA_DIR)\n$(check_PROGRAMS): | $(TESSDATA_DIR)\n$(check_PROGRAMS): | $(TESSDATA_BEST_DIR)\n$(check_PROGRAMS): | $(TESSDATA_FAST_DIR)\n$(check_PROGRAMS): | $(fonts:%=$(TESTING_DIR)/%)\n\n$(LANGDATA_DIR) $(TESSDATA_DIR) $(TESSDATA_BEST_DIR) $(TESSDATA_FAST_DIR):\n\t@echo \"Some unit tests require $@.\"\n\t@echo \"It can be installed manually by running this command:\"\n\t@echo \"  git clone https://github.com/tesseract-ocr/$$(basename $@).git $@\"\n\t@exit 1\n\n$(TESTING_DIR)/Arial_Bold_Italic.ttf:\n\tcurl -sSL -o Arial.exe https://sourceforge.net/projects/corefonts/files/the%20fonts/final/arial32.exe/download\n\tcabextract -F Arialbi.TTF -q Arial.exe\n\trm Arial.exe\n\tmv Arialbi.TTF $@\n\n$(TESTING_DIR)/DejaVuSans-ExtraLight.ttf:\n\tcurl -sSL http://sourceforge.net/projects/dejavu/files/dejavu/2.37/dejavu-fonts-ttf-2.37.tar.bz2 | \\\n\ttar -xjO dejavu-fonts-ttf-2.37/ttf/DejaVuSans-ExtraLight.ttf >$@\n\n$(TESTING_DIR)/Lohit-Hindi.ttf:\n\tcurl -sSL https://releases.pagure.org/lohit/lohit-hindi-ttf-2.4.3.tar.gz | \\\n\ttar -xzO lohit-hindi-ttf-2.4.3/Lohit-Hindi.ttf >$@\n\n$(TESTING_DIR)/Times_New_Roman.ttf:\n\tcurl -sSL -o Times.exe https://sourceforge.net/projects/corefonts/files/the%20fonts/final/times32.exe/download\n\tcabextract -F Times.TTF -q Times.exe\n\trm Times.exe\n\tmv Times.TTF $@\n\n$(TESTING_DIR)/UnBatang.ttf:\n\tcurl -sSL -o $@ https://salsa.debian.org/fonts-team/fonts-unfonts-core/-/raw/master/UnBatang.ttf\n\n$(TESTING_DIR)/Verdana.ttf:\n\tcurl -sSL -o Verdana.exe https://sourceforge.net/projects/corefonts/files/the%20fonts/final/verdan32.exe/download\n\tcabextract -F Verdana.TTF -q Verdana.exe\n\trm Verdana.exe\n\tmv Verdana.TTF $@\n\n$(TESTING_DIR)/ae_Arab.ttf:\n\tcurl -sSL -o $@ https://salsa.debian.org/fonts-team/fonts-arabeyes/-/raw/master/ae_Arab.ttf\n"
  },
  {
    "path": "README.md",
    "content": "# Tesseract OCR\n\n[![Coverity Scan Build Status](https://scan.coverity.com/projects/tesseract-ocr/badge.svg)](https://scan.coverity.com/projects/tesseract-ocr)\n[![CodeQL](https://github.com/tesseract-ocr/tesseract/workflows/CodeQL/badge.svg)](https://github.com/tesseract-ocr/tesseract/security/code-scanning)\n[![OSS-Fuzz](https://img.shields.io/badge/oss--fuzz-fuzzing-brightgreen)](https://issues.oss-fuzz.com/issues?q=is:open%20title:tesseract-ocr)\n\\\n[![GitHub license](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](https://raw.githubusercontent.com/tesseract-ocr/tesseract/main/LICENSE)\n[![Downloads](https://img.shields.io/badge/download-all%20releases-brightgreen.svg)](https://github.com/tesseract-ocr/tesseract/releases/)\n\n## Table of Contents\n\n* [Tesseract OCR](#tesseract-ocr)\n  * [About](#about)\n  * [Brief history](#brief-history)\n  * [Installing Tesseract](#installing-tesseract)\n  * [Running Tesseract](#running-tesseract)\n  * [For developers](#for-developers)\n  * [Support](#support)\n  * [License](#license)\n  * [Dependencies](#dependencies)\n  * [Latest Version of README](#latest-version-of-readme)\n\n## About\n\nThis package contains an **OCR engine** - `libtesseract` and a **command line program** - `tesseract`.\n\nTesseract 4 adds a new neural net (LSTM) based [OCR engine](https://en.wikipedia.org/wiki/Optical_character_recognition) which is focused on line recognition, but also still supports the legacy Tesseract OCR engine of Tesseract 3 which works by recognizing character patterns. Compatibility with Tesseract 3 is enabled by using the Legacy OCR Engine mode (--oem 0).\nIt also needs [traineddata](https://tesseract-ocr.github.io/tessdoc/Data-Files.html) files which support the legacy engine, for example those from the [tessdata](https://github.com/tesseract-ocr/tessdata) repository.\n\nStefan Weil is the current lead developer. Ray Smith was the lead developer until 2017. The maintainer is Zdenko Podobny. 
For a list of contributors see [AUTHORS](https://github.com/tesseract-ocr/tesseract/blob/main/AUTHORS)\nand GitHub's log of [contributors](https://github.com/tesseract-ocr/tesseract/graphs/contributors).\n\nTesseract has **unicode (UTF-8) support**, and can **recognize [more than 100 languages](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)** \"out of the box\".\n\nTesseract supports **[various image formats](https://tesseract-ocr.github.io/tessdoc/InputFormats)** including PNG, JPEG and TIFF.\n\nTesseract supports **various output formats**: plain text, hOCR (HTML), PDF, invisible-text-only PDF, TSV, ALTO and PAGE.\n\nYou should note that in many cases, in order to get better OCR results, you'll need to **[improve the quality](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html) of the image** you are giving Tesseract.\n\nThis project **does not include a GUI application**. If you need one, please see the [3rdParty](https://tesseract-ocr.github.io/tessdoc/User-Projects-%E2%80%93-3rdParty.html) documentation.\n\nTesseract **can be trained to recognize other languages**.\nSee [Tesseract Training](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html) for more information.\n\n## Brief history\n\nTesseract was originally developed at Hewlett-Packard Laboratories Bristol UK and at Hewlett-Packard Co, Greeley Colorado USA between 1985 and 1994, with some more changes made in 1996 to port to Windows, and some C++izing in 1998. In 2005 Tesseract was open sourced by HP. From 2006 until August 2017 it was developed by Google.\n\nMajor version 5 is the current stable version and started with release\n[5.0.0](https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0) on November 30, 2021. 
Newer minor versions and bugfix versions are available from\n[GitHub](https://github.com/tesseract-ocr/tesseract/releases/).\n\nLatest source code is available from [main branch on GitHub](https://github.com/tesseract-ocr/tesseract/tree/main).\nOpen issues can be found in [issue tracker](https://github.com/tesseract-ocr/tesseract/issues),\nand [planning documentation](https://tesseract-ocr.github.io/tessdoc/Planning.html).\n\nSee **[Release Notes](https://tesseract-ocr.github.io/tessdoc/ReleaseNotes.html)**\nand **[Change Log](https://github.com/tesseract-ocr/tesseract/blob/main/ChangeLog)** for more details of the releases.\n\n## Installing Tesseract\n\nYou can either [Install Tesseract via pre-built binary package](https://tesseract-ocr.github.io/tessdoc/Installation.html)\nor [build it from source](https://tesseract-ocr.github.io/tessdoc/Compiling.html).\n\nBefore building Tesseract from source, please check that your system has a compiler which is one of the [supported compilers](https://tesseract-ocr.github.io/tessdoc/supported-compilers.html).\n\n## Running Tesseract\n\nBasic **[command line usage](https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html)**:\n\n    tesseract imagename outputbase [-l lang] [--oem ocrenginemode] [--psm pagesegmode] [configfiles...]\n\nFor more information about the various command line options use `tesseract --help` or `man tesseract`.\n\nExamples can be found in the [documentation](https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#simplest-invocation-to-ocr-an-image).\n\n## For developers\n\nDevelopers can use `libtesseract` [C](https://github.com/tesseract-ocr/tesseract/blob/main/include/tesseract/capi.h) or\n[C++](https://github.com/tesseract-ocr/tesseract/blob/main/include/tesseract/baseapi.h) API to build their own application. 
If you need bindings to `libtesseract` for other programming languages, please see the\n[wrapper](https://tesseract-ocr.github.io/tessdoc/AddOns.html#tesseract-wrappers) section in the AddOns documentation.\n\nDocumentation of Tesseract generated from source code by doxygen can be found on [tesseract-ocr.github.io](https://tesseract-ocr.github.io/).\n\n## Support\n\nBefore you submit an issue, please review **[the guidelines for this repository](https://github.com/tesseract-ocr/tesseract/blob/main/CONTRIBUTING.md)**.\n\nFor support, first read the [documentation](https://tesseract-ocr.github.io/tessdoc/),\nparticularly the [FAQ](https://tesseract-ocr.github.io/tessdoc/FAQ.html) to see if your problem is addressed there.\nIf not, search the [Tesseract user forum](https://groups.google.com/g/tesseract-ocr), the [Tesseract developer forum](https://groups.google.com/g/tesseract-dev) and [past issues](https://github.com/tesseract-ocr/tesseract/issues), and if you still can't find what you need, ask for support in the mailing-lists.\n\nMailing-lists:\n\n* [tesseract-ocr](https://groups.google.com/g/tesseract-ocr) - For tesseract users.\n* [tesseract-dev](https://groups.google.com/g/tesseract-dev) - For tesseract developers.\n\nPlease report an issue only for a **bug**, not for asking questions.\n\n## License\n\n    The code in this repository is licensed under the Apache License, Version 2.0 (the \"License\");\n    you may not use this file except in compliance with the License.\n    You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n    Unless required by applicable law or agreed to in writing, software\n    distributed under the License is distributed on an \"AS IS\" BASIS,\n    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n    See the License for the specific language governing permissions and\n    limitations under the License.\n\n**NOTE**: This software depends on other packages that may be 
licensed under different open source licenses.\n\nTesseract uses [Leptonica library](http://leptonica.com/) which essentially\nuses a [BSD 2-clause license](http://leptonica.com/about-the-license.html).\n\n## Dependencies\n\nTesseract uses [Leptonica library](https://github.com/DanBloomberg/leptonica)\nfor opening input images (e.g. not documents like pdf).\nIt is suggested to use leptonica with built-in support for [zlib](https://zlib.net),\n[png](https://sourceforge.net/projects/libpng) and\n[tiff](http://www.simplesystems.org/libtiff) (for multipage tiff).\n\n## Latest Version of README\n\nFor the latest online version of the README.md see:\n\n<https://github.com/tesseract-ocr/tesseract/blob/main/README.md>\n"
  },
  {
    "path": "VERSION",
    "content": "5.5.2\n"
  },
  {
    "path": "appveyor.yml",
    "content": "environment:\n  matrix:\n    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022\n      platform: Win64\n\nconfiguration:\n  - Release\n\ncache:\n  - c:/Users/appveyor/.sw -> appveyor.yml\n\nonly_commits:\n  files:\n    - appveyor.yml\n    - '**.cpp'\n    - '**.h'\n    - 'unittest/**.c'\n    - 'unittest/**.cc'\n\nbefore_build:\n  - git submodule update --init --recursive\n  - curl -fsS -L -o dl.zip https://software-network.org/client/sw-master-windows_x86_64-client.zip\n  - 7z x dl.zip\n  - set PATH=%PATH%;%cd%\n\nbuild_script:\n  - sw -version\n  # -show-output - show command output\n  # debug build causes long builds (> 1h), appveyor drops them\n  - sw -platform %platform% -config r build -Dwith-tests=1\n  # test\n  - git clone https://github.com/egorpugin/tessdata tessdata_unittest\n  - ps: Copy-Item -Path \"tessdata_unittest\\fonts\\*\" -Destination \"test\\testing\" -Recurse\n  - sw -platform %platform% -config r test -Dwith-tests=1 -Dskip-tests=lstm,lstm_recode\n\nafter_build:\n  - 7z a tesseract.zip %APPVEYOR_BUILD_FOLDER%\\.sw\\out\\**\\*.exe %APPVEYOR_BUILD_FOLDER%\\.sw\\out\\**\\*.dll\n  #- 7z a tesseract.zip %APPVEYOR_BUILD_FOLDER%\\.sw\\Windows_*_Shared_Release_MSVC_*\\*.exe %APPVEYOR_BUILD_FOLDER%\\.sw\\Windows_*_Shared_Release_MSVC_*\\*.dll\n\non_finish:\n  # gather tests\n  - ps: $wc = New-Object 'System.Net.WebClient'\n  - ps: $wc.UploadFile(\"https://ci.appveyor.com/api/testresults/junit/$($env:APPVEYOR_JOB_ID)\", (Resolve-Path .\\.sw\\test\\results.xml))\n\nartifacts:\n  - path: tesseract.zip\n    name: tesseract-$(APPVEYOR_BUILD_VERSION)\n\n"
  },
  {
    "path": "autogen.sh",
    "content": "#!/bin/sh\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# http://www.apache.org/licenses/LICENSE-2.0\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# This is a simple script which is meant to help developers\n# better deal with the GNU autotools, specifically:\n#\n#   aclocal\n#   libtoolize\n#   autoconf\n#   autoheader\n#   automake\n#\n# The whole thing is quite complex...\n#\n# The idea is to run this collection of tools on a single platform,\n# typically the main development platform, running a recent version of\n# autoconf. In theory, if we had these tools on each platform where we\n# ever expected to port the software, we would never need to checkin\n# more than a few autotools configuration files. However, the whole\n# idea is to generate a configure script and associated files in a way\n# that is portable across platforms, so we *have* to check in a whole\n# bunch of files generated by all these tools.\n\n# The real source files are:\n#\n# acinclude.m4 (used by aclocal)\n# configure.ac (main autoconf file)\n# Makefile.am, */Makefile.am (automake config files)\n#\n# All the rest is auto-generated.\n\nif [ \"$1\" = \"clean\" ]; then\n    echo \"Cleaning...\"\n    rm configure aclocal.m4\n    rm m4/l*\n    rm config/*\n    rmdir config\n    find . 
-iname \"Makefile.in\" -type f -exec rm '{}' +\nfi\n\nbail_out()\n{\n    echo\n    echo \"  Something went wrong, bailing out!\"\n    echo\n    exit 1\n}\n\n# Prevent any errors that might result from failing to properly invoke\n# `libtoolize` or `glibtoolize,` whichever is present on your system,\n# from occurring by testing for its existence and capturing the absolute path to\n# its location for caching purposes prior to using it later on in 'Step 2:'\nif command -v libtoolize >/dev/null 2>&1; then\n  LIBTOOLIZE=\"$(command -v libtoolize)\"\nelif command -v glibtoolize >/dev/null 2>&1; then\n  LIBTOOLIZE=\"$(command -v glibtoolize)\"\nelse\n  echo \"Unable to find a valid copy of libtoolize or glibtoolize in your PATH!\"\n  bail_out\nfi\n\n# --- Step 1: Generate aclocal.m4 from:\n#             . acinclude.m4\n#             . config/*.m4 (these files are referenced in acinclude.m4)\n\nmkdir -p config\n\necho \"Running aclocal\"\naclocal -I config || bail_out\n\n# --- Step 2:\n\necho \"Running $LIBTOOLIZE\"\n$LIBTOOLIZE -f -c || bail_out\n$LIBTOOLIZE --automake || bail_out\n\n# Run aclocal a 2nd time because glibtoolize created additional m4 files.\necho \"Running aclocal\"\naclocal -I config || bail_out\n\n# --- Step 3: Generate configure and include/miaconfig.h from:\n#             . configure.ac\n#\n\necho \"Running autoconf\"\nautoconf || bail_out\n\nif grep -q PKG_CHECK_MODULES configure; then\n  # The generated configure is invalid because pkg-config is unavailable.\n  rm configure\n  echo \"Missing pkg-config. Check the build requirements.\"\n  bail_out\nfi\n\n# --- Step 4: Generate config.h.in from:\n#             . 
configure.ac (look for AM_CONFIG_HEADER tag or AC_CONFIG_HEADER tag)\n\necho \"Running autoheader\"\nautoheader -f || bail_out\n\n# --- Step 5: Generate Makefile.in, src/Makefile.in, and a whole bunch of\n#             files in config (config.guess, config.sub, depcomp,\n#             install-sh, missing, mkinstalldirs) plus COPYING and\n#             INSTALL from:\n#             . Makefile.am\n#             . src/Makefile.am\n#\n# Using --add-missing --copy makes sure that, if these files are missing,\n# they are copied from the system so they can be used in a distribution.\n\necho \"Running automake --add-missing --copy\"\nautomake --add-missing --copy --warnings=all || bail_out\n\necho \"\"\necho \"All done.\"\necho \"To build the software now, do something like:\"\necho \"\"\necho \"$ ./configure [--enable-debug] [...other options]\"\n"
  },
  {
    "path": "cmake/BuildFunctions.cmake",
    "content": "# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# http://www.apache.org/licenses/LICENSE-2.0\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n################################################################################\n#\n# macros and functions\n#\n################################################################################\n\n########################################\n# FUNCTION project_group\n########################################\nfunction(project_group target name)\n    set_target_properties(${target} PROPERTIES FOLDER ${name})\nendfunction(project_group)\n\n################################################################################\n"
  },
  {
    "path": "cmake/BuildOptimizations.cmake",
    "content": "# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# http://www.apache.org/licenses/LICENSE-2.0\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n################################################################################\n#\n# Build Optimizations Module\n#\n# This module provides functions to apply modern CMake build optimizations\n# to targets for faster and incremental builds.\n#\n################################################################################\n\n#\n# Function: apply_modern_optimizations\n# Apply build optimizations to a target\n#\n# Parameters:\n#   target_name - Name of the target to optimize\n#   PCH_HEADERS - Optional list of headers for precompiled headers\n#\nfunction(apply_modern_optimizations target_name)\n    # Parse arguments\n    set(oneValueArgs )\n    set(multiValueArgs PCH_HEADERS)\n    cmake_parse_arguments(ARG \"\" \"${oneValueArgs}\" \"${multiValueArgs}\" ${ARGN})\n\n    # Apply Unity Build if enabled\n    if(ENABLE_UNITY_BUILD)\n        set_target_properties(${target_name} PROPERTIES UNITY_BUILD ON)\n        # Use smaller batch sizes for libraries with many files\n        get_target_property(target_type ${target_name} TYPE)\n        if(target_type STREQUAL \"STATIC_LIBRARY\" OR target_type STREQUAL \"SHARED_LIBRARY\")\n            set_target_properties(${target_name} PROPERTIES UNITY_BUILD_BATCH_SIZE 16)\n        else()\n            set_target_properties(${target_name} PROPERTIES UNITY_BUILD_BATCH_SIZE 8)\n        endif()\n        message(STATUS \"Unity build enabled for ${target_name}\")\n    endif()\n\n    # 
Apply Precompiled Headers if enabled and headers provided\n    if(ENABLE_PRECOMPILED_HEADERS)\n        if(ARG_PCH_HEADERS)\n            target_precompile_headers(${target_name} PRIVATE ${ARG_PCH_HEADERS})\n            message(STATUS \"Precompiled headers enabled for ${target_name}\")\n        else()\n            # Use common standard library headers as default\n            target_precompile_headers(${target_name} PRIVATE\n                <vector>\n                <string>\n                <memory>\n                <algorithm>\n                <iostream>\n                <cstdlib>\n                <cstring>\n                <cmath>\n            )\n            message(STATUS \"Default precompiled headers enabled for ${target_name}\")\n        endif()\n    endif()\n\n    # Configure build pools for Ninja\n    if(ENABLE_NINJA_POOL AND CMAKE_GENERATOR STREQUAL \"Ninja\")\n        set_target_properties(${target_name} PROPERTIES JOB_POOL_COMPILE compile)\n        set_target_properties(${target_name} PROPERTIES JOB_POOL_LINK link)\n    endif()\n\n    # Apply compiler-specific optimizations\n    if(MSVC)\n        # Enable parallel compilation for MSVC if not already enabled\n        get_target_property(target_compile_options ${target_name} COMPILE_OPTIONS)\n        if(NOT target_compile_options MATCHES \"/MP\")\n            target_compile_options(${target_name} PRIVATE \"/MP\")\n        endif()\n\n        # Enable function-level linking for better optimization\n        target_compile_options(${target_name} PRIVATE \"/Gy\")\n\n        # Enable intrinsic functions for better performance\n        target_compile_options(${target_name} PRIVATE \"/Oi\")\n    elseif(CMAKE_CXX_COMPILER_ID MATCHES \"GNU|Clang\")\n        # Enable split debug info for faster incremental builds\n        if(CMAKE_BUILD_TYPE MATCHES Debug)\n            target_compile_options(${target_name} PRIVATE \"-gsplit-dwarf\")\n        endif()\n\n        # Enable function sections for better dead code 
elimination\n        target_compile_options(${target_name} PRIVATE \"-ffunction-sections\" \"-fdata-sections\")\n    endif()\nendfunction()\n\n#\n# Function: apply_training_optimizations\n# Apply optimizations specific to training tools\n#\nfunction(apply_training_optimizations target_name)\n    apply_modern_optimizations(${target_name}\n        PCH_HEADERS\n            <vector>\n            <string>\n            <memory>\n            <iostream>\n            <fstream>\n            <cstdlib>\n            <cstring>\n    )\n\n    # Training tools usually build faster, so smaller unity batches are fine\n    if(ENABLE_UNITY_BUILD)\n        set_target_properties(${target_name} PROPERTIES UNITY_BUILD_BATCH_SIZE 4)\n    endif()\nendfunction()\n\n#\n# Function: apply_test_optimizations\n# Apply optimizations specific to test targets\n#\nfunction(apply_test_optimizations target_name)\n    # Tests often have different compilation patterns\n    if(ENABLE_PRECOMPILED_HEADERS)\n        target_precompile_headers(${target_name} PRIVATE\n            <gtest/gtest.h>\n            <vector>\n            <string>\n            <memory>\n            <iostream>\n        )\n        message(STATUS \"Test precompiled headers enabled for ${target_name}\")\n    endif()\n\n    # Tests benefit from unity builds but smaller batches\n    if(ENABLE_UNITY_BUILD)\n        set_target_properties(${target_name} PROPERTIES UNITY_BUILD ON)\n        set_target_properties(${target_name} PROPERTIES UNITY_BUILD_BATCH_SIZE 8)\n        message(STATUS \"Unity build enabled for test ${target_name}\")\n    endif()\n\n    # Configure Ninja pools\n    if(ENABLE_NINJA_POOL AND CMAKE_GENERATOR STREQUAL \"Ninja\")\n        set_target_properties(${target_name} PROPERTIES JOB_POOL_COMPILE compile)\n        set_target_properties(${target_name} PROPERTIES JOB_POOL_LINK link)\n    endif()\nendfunction()\n"
  },
  {
    "path": "cmake/CheckFunctions.cmake",
    "content": "# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n# applicable law or agreed to in writing, software distributed under the License\n# is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied. See the License for the specific language\n# governing permissions and limitations under the License.\n# ##############################################################################\n#\n# macros and functions\n#\n# ##############################################################################\n\n# ##############################################################################\n# FUNCTION check_leptonica_tiff_support\n# ##############################################################################\nfunction(check_leptonica_tiff_support)\n  # check if leptonica was build with tiff support set result to\n  # LEPT_TIFF_RESULT\n  set(TIFF_TEST\n  \"#include \\\"allheaders.h\\\"\\n\"\n  \"int main() {\\n\"\n  \"  l_uint8 *data = NULL;\\n\"\n  \"  size_t size = 0;\\n\"\n  \"  PIX* pix = pixCreate(3, 3, 4);\\n\"\n  \"  l_int32 ret_val = pixWriteMemTiff(&data, &size, pix, IFF_TIFF_G3);\\n\"\n  \"  pixDestroy(&pix);\\n\"\n  \"  lept_free(data);\\n\"\n  \"  return ret_val;}\\n\")\n  if(${CMAKE_VERSION} VERSION_LESS \"3.25\")\n    message(STATUS \"Testing TIFF support in Leptonica is available with CMake >= 3.25 (you have ${CMAKE_VERSION}))\")\n  else()\n    set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})\n    try_run(\n      LEPT_TIFF_RESULT\n      LEPT_TIFF_COMPILE_SUCCESS\n      SOURCE_FROM_CONTENT tiff_test.cpp \"${TIFF_TEST}\"\n      CMAKE_FLAGS \"-DINCLUDE_DIRECTORIES=${Leptonica_INCLUDE_DIRS}\"\n      LINK_LIBRARIES ${Leptonica_LIBRARIES}\n      COMPILE_OUTPUT_VARIABLE\n      COMPILE_OUTPUT)\n    if(NOT 
LEPT_TIFF_COMPILE_SUCCESS)\n      message(STATUS \"COMPILE_OUTPUT: ${COMPILE_OUTPUT}\")\n      message(STATUS \"Leptonica_INCLUDE_DIRS: ${Leptonica_INCLUDE_DIRS}\")\n      message(STATUS \"Leptonica_LIBRARIES: ${Leptonica_LIBRARIES}\")\n      message(STATUS \"LEPT_TIFF_RESULT: ${LEPT_TIFF_RESULT}\")\n      message(STATUS \"LEPT_TIFF_COMPILE_SUCCESS: ${LEPT_TIFF_COMPILE_SUCCESS}\")\n      message(WARNING \"Failed to compile test\")\n    endif()\n  endif()\nendfunction(check_leptonica_tiff_support)\n\n# ##############################################################################\n"
  },
  {
    "path": "cmake/Configure.cmake",
    "content": "# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# http://www.apache.org/licenses/LICENSE-2.0\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n################################################################################\n#\n# configure\n#\n################################################################################\n\n########################################\n# FUNCTION check_includes\n########################################\nfunction(check_includes files)\n    foreach(F ${${files}})\n        set(name ${F})\n        string(REPLACE \"-\" \"_\" name ${name})\n        string(REPLACE \".\" \"_\" name ${name})\n        string(REPLACE \"/\" \"_\" name ${name})\n        string(TOUPPER ${name} name)\n        check_include_files(${F} HAVE_${name})\n        file(APPEND ${AUTOCONFIG_SRC} \"/* Define to 1 if you have the <${F}> header file. */\\n\")\n        file(APPEND ${AUTOCONFIG_SRC} \"#cmakedefine HAVE_${name} 1\\n\")\n        file(APPEND ${AUTOCONFIG_SRC} \"\\n\")\n    endforeach()\nendfunction(check_includes)\n\n########################################\n# FUNCTION check_functions\n########################################\nfunction(check_functions functions)\n    foreach(F ${${functions}})\n        set(name ${F})\n        string(TOUPPER ${name} name)\n        check_function_exists(${F} HAVE_${name})\n        file(APPEND ${AUTOCONFIG_SRC} \"/* Define to 1 if you have the `${F}' function. 
*/\\n\")\n        file(APPEND ${AUTOCONFIG_SRC} \"#cmakedefine HAVE_${name} 1\\n\")\n        file(APPEND ${AUTOCONFIG_SRC} \"\\n\")\n    endforeach()\nendfunction(check_functions)\n\n########################################\n# FUNCTION check_types\n########################################\nfunction(check_types types)\n    foreach(T ${${types}})\n        set(name ${T})\n        string(REPLACE \" \" \"_\" name ${name})\n        string(REPLACE \"-\" \"_\" name ${name})\n        string(REPLACE \".\" \"_\" name ${name})\n        string(REPLACE \"/\" \"_\" name ${name})\n        string(TOUPPER ${name} name)\n        check_type_size(${T} HAVE_${name})\n        file(APPEND ${AUTOCONFIG_SRC} \"/* Define to 1 if the system has the type `${T}'. */\\n\")\n        file(APPEND ${AUTOCONFIG_SRC} \"#cmakedefine HAVE_${name} 1\\n\")\n        file(APPEND ${AUTOCONFIG_SRC} \"\\n\")\n    endforeach()\nendfunction(check_types)\n\n########################################\n\nfile(WRITE ${AUTOCONFIG_SRC})\n\ninclude(CheckCSourceCompiles)\ninclude(CheckCSourceRuns)\ninclude(CheckCXXSourceCompiles)\ninclude(CheckCXXSourceRuns)\ninclude(CheckFunctionExists)\ninclude(CheckIncludeFiles)\ninclude(CheckLibraryExists)\ninclude(CheckPrototypeDefinition)\ninclude(CheckStructHasMember)\ninclude(CheckSymbolExists)\ninclude(CheckTypeSize)\ninclude(TestBigEndian)\n\nset(include_files_list\n    dlfcn.h\n    inttypes.h\n    memory.h\n    stdint.h\n    stdlib.h\n    string.h\n    sys/stat.h\n    sys/types.h\n    unistd.h\n\n    cairo/cairo-version.h\n    pango-1.0/pango/pango-features.h\n    unicode/uchar.h\n)\n# check_includes(include_files_list)\n\nset(types_list\n    \"long long int\"\n    wchar_t\n)\n# check_types(types_list)\n\nlist(APPEND CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)\nlist(APPEND CMAKE_REQUIRED_LIBRARIES -lm)\nset(functions_list\n    feenableexcept\n)\ncheck_functions(functions_list)\n\nfile(APPEND ${AUTOCONFIG_SRC} \"\n/* Version number */\n#cmakedefine PACKAGE_VERSION 
\\\"${PACKAGE_VERSION}\\\"\n#cmakedefine GRAPHICS_DISABLED ${GRAPHICS_DISABLED}\n#cmakedefine FAST_FLOAT ${FAST_FLOAT}\n#cmakedefine DISABLED_LEGACY_ENGINE ${DISABLED_LEGACY_ENGINE}\n#cmakedefine HAVE_TIFFIO_H ${HAVE_TIFFIO_H}\n#cmakedefine HAVE_NEON ${HAVE_NEON}\n#cmakedefine HAVE_LIBARCHIVE ${HAVE_LIBARCHIVE}\n#cmakedefine HAVE_LIBCURL ${HAVE_LIBCURL}\n\")\n\nif(TESSDATA_PREFIX)\n file(APPEND ${AUTOCONFIG_SRC} \"\n#cmakedefine TESSDATA_PREFIX \\\"${TESSDATA_PREFIX}\\\"\n\")\nendif()\n\n########################################\n\n################################################################################\n"
  },
  {
    "path": "cmake/SourceGroups.cmake",
    "content": "# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# http://www.apache.org/licenses/LICENSE-2.0\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#include(SourceGroups)\n\nset(SSRC ${CMAKE_SOURCE_DIR})\nset(BSRC ${CMAKE_BINARY_DIR})\n\nset(_CPP \".*\\\\.cpp\")\nset(CPP \"${_CPP}$\")\n\nset(_H \".*\\\\.h\")\nset(H \"${_H}$\")\n\nset(H_CPP \"(${H}|${CPP})\")\n\nsource_group(\"Resource files\" \".*\\\\.(rc|ico)\")\n\nsource_group(\"api\"          \"${SSRC}/api/${H_CPP}\")\nsource_group(\"arch\"         \"${SSRC}/arch/${H_CPP}\")\nsource_group(\"ccmain\"       \"${SSRC}/ccmain/${H_CPP}\")\nsource_group(\"ccstruct\"     \"${SSRC}/ccstruct/${H_CPP}\")\nsource_group(\"ccutil\"       \"${SSRC}/ccutil/${H_CPP}\")\nsource_group(\"classify\"     \"${SSRC}/classify/${H_CPP}\")\nsource_group(\"cutil\"        \"${SSRC}/cutil/${H_CPP}\")\nsource_group(\"dict\"         \"${SSRC}/dict/${H_CPP}\")\nsource_group(\"lstm\"         \"${SSRC}/lstm/${H_CPP}\")\nsource_group(\"textord\"      \"${SSRC}/textord/${H_CPP}\")\nsource_group(\"viewer\"       \"${SSRC}/viewer/${H_CPP}\")\nsource_group(\"wordrec\"      \"${SSRC}/wordrec/${H_CPP}\")\n"
  },
  {
    "path": "cmake/SourceLists.cmake",
    "content": "# Source file lists for tesseract\n# This file contains all source files organized by module\n\n# API module sources\nset(TESSERACT_SRC_API\n    src/api/altorenderer.cpp\n    src/api/baseapi.cpp\n    src/api/capi.cpp\n    src/api/hocrrenderer.cpp\n    src/api/lstmboxrenderer.cpp\n    src/api/pagerenderer.cpp\n    src/api/pdfrenderer.cpp\n    src/api/renderer.cpp\n    src/api/wordstrboxrenderer.cpp\n)\n\n# Architecture-specific sources\nset(TESSERACT_SRC_ARCH\n    src/arch/dotproduct.cpp\n    src/arch/simddetect.cpp\n    src/arch/intsimdmatrix.cpp\n)\n\n# Optional architecture-specific sources (conditionally added)\nset(TESSERACT_SRC_ARCH_AVX\n    src/arch/dotproductavx.cpp\n)\n\nset(TESSERACT_SRC_ARCH_AVX2\n    src/arch/intsimdmatrixavx2.cpp\n    src/arch/dotproductavx.cpp\n)\n\nset(TESSERACT_SRC_ARCH_AVX512F\n    src/arch/dotproductavx512.cpp\n)\n\nset(TESSERACT_SRC_ARCH_FMA\n    src/arch/dotproductfma.cpp\n)\n\nset(TESSERACT_SRC_ARCH_SSE41\n    src/arch/dotproductsse.cpp\n    src/arch/intsimdmatrixsse.cpp\n)\n\nset(TESSERACT_SRC_ARCH_NEON\n    src/arch/dotproductneon.cpp\n    src/arch/intsimdmatrixneon.cpp\n)\n\n# CCMain module sources\nset(TESSERACT_SRC_CCMAIN\n    src/ccmain/adaptions.cpp\n    src/ccmain/applybox.cpp\n    src/ccmain/control.cpp\n    src/ccmain/docqual.cpp\n    src/ccmain/equationdetect.cpp\n    src/ccmain/fixspace.cpp\n    src/ccmain/fixxht.cpp\n    src/ccmain/linerec.cpp\n    src/ccmain/ltrresultiterator.cpp\n    src/ccmain/mutableiterator.cpp\n    src/ccmain/osdetect.cpp\n    src/ccmain/output.cpp\n    src/ccmain/pageiterator.cpp\n    src/ccmain/pagesegmain.cpp\n    src/ccmain/pagewalk.cpp\n    src/ccmain/par_control.cpp\n    src/ccmain/paragraphs.cpp\n    src/ccmain/paramsd.cpp\n    src/ccmain/pgedit.cpp\n    src/ccmain/recogtraining.cpp\n    src/ccmain/reject.cpp\n    src/ccmain/resultiterator.cpp\n    src/ccmain/superscript.cpp\n    src/ccmain/tessbox.cpp\n    src/ccmain/tessedit.cpp\n    src/ccmain/tesseractclass.cpp\n    
src/ccmain/tessvars.cpp\n    src/ccmain/tfacepp.cpp\n    src/ccmain/thresholder.cpp\n    src/ccmain/werdit.cpp\n)\n\n# CCStruct module sources\nset(TESSERACT_SRC_CCSTRUCT\n    src/ccstruct/blamer.cpp\n    src/ccstruct/blobbox.cpp\n    src/ccstruct/blobs.cpp\n    src/ccstruct/blread.cpp\n    src/ccstruct/boxread.cpp\n    src/ccstruct/boxword.cpp\n    src/ccstruct/ccstruct.cpp\n    src/ccstruct/coutln.cpp\n    src/ccstruct/detlinefit.cpp\n    src/ccstruct/dppoint.cpp\n    src/ccstruct/fontinfo.cpp\n    src/ccstruct/image.cpp\n    src/ccstruct/imagedata.cpp\n    src/ccstruct/linlsq.cpp\n    src/ccstruct/matrix.cpp\n    src/ccstruct/mod128.cpp\n    src/ccstruct/normalis.cpp\n    src/ccstruct/ocrblock.cpp\n    src/ccstruct/ocrpara.cpp\n    src/ccstruct/ocrrow.cpp\n    src/ccstruct/otsuthr.cpp\n    src/ccstruct/pageres.cpp\n    src/ccstruct/params_training_featdef.cpp\n    src/ccstruct/pdblock.cpp\n    src/ccstruct/points.cpp\n    src/ccstruct/polyaprx.cpp\n    src/ccstruct/polyblk.cpp\n    src/ccstruct/quadlsq.cpp\n    src/ccstruct/quspline.cpp\n    src/ccstruct/ratngs.cpp\n    src/ccstruct/rect.cpp\n    src/ccstruct/rejctmap.cpp\n    src/ccstruct/seam.cpp\n    src/ccstruct/split.cpp\n    src/ccstruct/statistc.cpp\n    src/ccstruct/stepblob.cpp\n    src/ccstruct/werd.cpp\n)\n\n# CCUtil module sources\nset(TESSERACT_SRC_CCUTIL\n    src/ccutil/ambigs.cpp\n    src/ccutil/bitvector.cpp\n    src/ccutil/ccutil.cpp\n    src/ccutil/errcode.cpp\n    src/ccutil/indexmapbidi.cpp\n    src/ccutil/params.cpp\n    src/ccutil/scanutils.cpp\n    src/ccutil/serialis.cpp\n    src/ccutil/tessdatamanager.cpp\n    src/ccutil/tprintf.cpp\n    src/ccutil/unichar.cpp\n    src/ccutil/unicharcompress.cpp\n    src/ccutil/unicharmap.cpp\n    src/ccutil/unicharset.cpp\n)\n\n# Classify module sources\nset(TESSERACT_SRC_CLASSIFY\n    src/classify/adaptive.cpp\n    src/classify/adaptmatch.cpp\n    src/classify/blobclass.cpp\n    src/classify/classify.cpp\n    src/classify/cluster.cpp\n    
src/classify/clusttool.cpp\n    src/classify/cutoffs.cpp\n    src/classify/featdefs.cpp\n    src/classify/float2int.cpp\n    src/classify/fpoint.cpp\n    src/classify/intfeaturespace.cpp\n    src/classify/intfx.cpp\n    src/classify/intmatcher.cpp\n    src/classify/intproto.cpp\n    src/classify/kdtree.cpp\n    src/classify/mf.cpp\n    src/classify/mfoutline.cpp\n    src/classify/mfx.cpp\n    src/classify/normfeat.cpp\n    src/classify/normmatch.cpp\n    src/classify/ocrfeatures.cpp\n    src/classify/outfeat.cpp\n    src/classify/picofeat.cpp\n    src/classify/protos.cpp\n    src/classify/shapeclassifier.cpp\n    src/classify/shapetable.cpp\n    src/classify/tessclassifier.cpp\n    src/classify/trainingsample.cpp\n)\n\n# CUtil module sources\nset(TESSERACT_SRC_CUTIL\n    src/cutil/oldlist.cpp\n)\n\n# Dict module sources\nset(TESSERACT_SRC_DICT\n    src/dict/context.cpp\n    src/dict/dawg.cpp\n    src/dict/dawg_cache.cpp\n    src/dict/dict.cpp\n    src/dict/hyphen.cpp\n    src/dict/permdawg.cpp\n    src/dict/stopper.cpp\n    src/dict/trie.cpp\n)\n\n# LSTM module sources\nset(TESSERACT_SRC_LSTM\n    src/lstm/convolve.cpp\n    src/lstm/fullyconnected.cpp\n    src/lstm/functions.cpp\n    src/lstm/input.cpp\n    src/lstm/lstm.cpp\n    src/lstm/lstmrecognizer.cpp\n    src/lstm/maxpool.cpp\n    src/lstm/network.cpp\n    src/lstm/networkio.cpp\n    src/lstm/parallel.cpp\n    src/lstm/plumbing.cpp\n    src/lstm/recodebeam.cpp\n    src/lstm/reconfig.cpp\n    src/lstm/reversed.cpp\n    src/lstm/series.cpp\n    src/lstm/stridemap.cpp\n    src/lstm/weightmatrix.cpp\n)\n\n# TextOrd module sources\nset(TESSERACT_SRC_TEXTORD\n    src/textord/alignedblob.cpp\n    src/textord/baselinedetect.cpp\n    src/textord/bbgrid.cpp\n    src/textord/blkocc.cpp\n    src/textord/blobgrid.cpp\n    src/textord/ccnontextdetect.cpp\n    src/textord/cjkpitch.cpp\n    src/textord/colfind.cpp\n    src/textord/colpartition.cpp\n    src/textord/colpartitiongrid.cpp\n    src/textord/colpartitionset.cpp\n  
  src/textord/devanagari_processing.cpp\n    src/textord/drawtord.cpp\n    src/textord/edgblob.cpp\n    src/textord/edgloop.cpp\n    src/textord/equationdetectbase.cpp\n    src/textord/fpchop.cpp\n    src/textord/gap_map.cpp\n    src/textord/imagefind.cpp\n    src/textord/linefind.cpp\n    src/textord/makerow.cpp\n    src/textord/oldbasel.cpp\n    src/textord/pithsync.cpp\n    src/textord/pitsync1.cpp\n    src/textord/scanedg.cpp\n    src/textord/sortflts.cpp\n    src/textord/strokewidth.cpp\n    src/textord/tabfind.cpp\n    src/textord/tablefind.cpp\n    src/textord/tablerecog.cpp\n    src/textord/tabvector.cpp\n    src/textord/textlineprojection.cpp\n    src/textord/textord.cpp\n    src/textord/topitch.cpp\n    src/textord/tordmain.cpp\n    src/textord/tospace.cpp\n    src/textord/tovars.cpp\n    src/textord/underlin.cpp\n    src/textord/wordseg.cpp\n    src/textord/workingpartset.cpp\n)\n\n# Viewer module sources\nset(TESSERACT_SRC_VIEWER\n    src/viewer/scrollview.cpp\n    src/viewer/svmnode.cpp\n    src/viewer/svutil.cpp\n)\n\n# WordRec module sources\nset(TESSERACT_SRC_WORDREC\n    src/wordrec/associate.cpp\n    src/wordrec/chop.cpp\n    src/wordrec/chopper.cpp\n    src/wordrec/drawfx.cpp\n    src/wordrec/findseam.cpp\n    src/wordrec/gradechop.cpp\n    src/wordrec/language_model.cpp\n    src/wordrec/lm_consistency.cpp\n    src/wordrec/lm_pain_points.cpp\n    src/wordrec/lm_state.cpp\n    src/wordrec/outlines.cpp\n    src/wordrec/params_model.cpp\n    src/wordrec/pieces.cpp\n    src/wordrec/plotedges.cpp\n    src/wordrec/render.cpp\n    src/wordrec/segsearch.cpp\n    src/wordrec/tface.cpp\n    src/wordrec/wordclass.cpp\n    src/wordrec/wordrec.cpp\n)\n\n# Legacy engine sources (excluded when DISABLED_LEGACY_ENGINE is ON)\nset(TESSERACT_SRC_LEGACY\n    src/ccmain/adaptions.cpp\n    src/ccmain/docqual.cpp\n    src/ccmain/equationdetect.cpp\n    src/ccmain/fixspace.cpp\n    src/ccmain/fixxht.cpp\n    src/ccmain/osdetect.cpp\n    src/ccmain/par_control.cpp\n    
src/ccmain/recogtraining.cpp\n    src/ccmain/superscript.cpp\n    src/ccmain/tessbox.cpp\n    src/ccmain/tfacepp.cpp\n    src/ccstruct/fontinfo.cpp\n    src/ccstruct/params_training_featdef.cpp\n    src/ccutil/ambigs.cpp\n    src/ccutil/bitvector.cpp\n    src/ccutil/indexmapbidi.cpp\n    src/classify/adaptive.cpp\n    src/classify/adaptmatch.cpp\n    src/classify/blobclass.cpp\n    src/classify/cluster.cpp\n    src/classify/clusttool.cpp\n    src/classify/cutoffs.cpp\n    src/classify/featdefs.cpp\n    src/classify/float2int.cpp\n    src/classify/fpoint.cpp\n    src/classify/intfeaturespace.cpp\n    src/classify/intfx.cpp\n    src/classify/intmatcher.cpp\n    src/classify/intproto.cpp\n    src/classify/kdtree.cpp\n    src/classify/mf.cpp\n    src/classify/mfoutline.cpp\n    src/classify/mfx.cpp\n    src/classify/normfeat.cpp\n    src/classify/normmatch.cpp\n    src/classify/ocrfeatures.cpp\n    src/classify/outfeat.cpp\n    src/classify/picofeat.cpp\n    src/classify/protos.cpp\n    src/classify/shapeclassifier.cpp\n    src/classify/shapetable.cpp\n    src/classify/tessclassifier.cpp\n    src/classify/trainingsample.cpp\n    src/dict/permdawg.cpp\n    src/dict/hyphen.cpp\n    src/wordrec/associate.cpp\n    src/wordrec/chop.cpp\n    src/wordrec/chopper.cpp\n    src/wordrec/drawfx.cpp\n    src/wordrec/findseam.cpp\n    src/wordrec/gradechop.cpp\n    src/wordrec/language_model.cpp\n    src/wordrec/lm_consistency.cpp\n    src/wordrec/lm_pain_points.cpp\n    src/wordrec/lm_state.cpp\n    src/wordrec/outlines.cpp\n    src/wordrec/params_model.cpp\n    src/wordrec/pieces.cpp\n    src/wordrec/plotedges.cpp\n    src/wordrec/render.cpp\n    src/wordrec/segsearch.cpp\n    src/wordrec/wordclass.cpp\n)\n\n# Header files\nset(TESSERACT_HDR_INCLUDE\n    include/tesseract/baseapi.h\n    include/tesseract/capi.h\n    include/tesseract/export.h\n    include/tesseract/ltrresultiterator.h\n    include/tesseract/ocrclass.h\n    include/tesseract/osdetect.h\n    
include/tesseract/pageiterator.h\n    include/tesseract/publictypes.h\n    include/tesseract/renderer.h\n    include/tesseract/resultiterator.h\n    include/tesseract/unichar.h\n)\n\n# Internal header files\nset(TESSERACT_HDR_INTERNAL\n    src/api/pdf_ttf.h\n    src/arch/dotproduct.h\n    src/arch/intsimdmatrix.h\n    src/arch/simddetect.h\n    src/ccmain/control.h\n    src/ccmain/docqual.h\n    src/ccmain/equationdetect.h\n    src/ccmain/fixspace.h\n    src/ccmain/mutableiterator.h\n    src/ccmain/output.h\n    src/ccmain/paragraphs.h\n    src/ccmain/paragraphs_internal.h\n    src/ccmain/paramsd.h\n    src/ccmain/pgedit.h\n    src/ccmain/reject.h\n    src/ccmain/tesseractclass.h\n    src/ccmain/tessvars.h\n    src/ccmain/thresholder.h\n    src/ccmain/werdit.h\n    src/ccstruct/blamer.h\n    src/ccstruct/blobbox.h\n    src/ccstruct/blobs.h\n    src/ccstruct/blread.h\n    src/ccstruct/boxread.h\n    src/ccstruct/boxword.h\n    src/ccstruct/ccstruct.h\n    src/ccstruct/coutln.h\n    src/ccstruct/crakedge.h\n    src/ccstruct/debugpixa.h\n    src/ccstruct/detlinefit.h\n    src/ccstruct/dppoint.h\n    src/ccstruct/fontinfo.h\n    src/ccstruct/image.h\n    src/ccstruct/imagedata.h\n    src/ccstruct/linlsq.h\n    src/ccstruct/matrix.h\n    src/ccstruct/mod128.h\n    src/ccstruct/normalis.h\n    src/ccstruct/ocrblock.h\n    src/ccstruct/ocrpara.h\n    src/ccstruct/ocrrow.h\n    src/ccstruct/otsuthr.h\n    src/ccstruct/pageres.h\n    src/ccstruct/params_training_featdef.h\n    src/ccstruct/pdblock.h\n    src/ccstruct/points.h\n    src/ccstruct/polyaprx.h\n    src/ccstruct/polyblk.h\n    src/ccstruct/quadlsq.h\n    src/ccstruct/quadratc.h\n    src/ccstruct/quspline.h\n    src/ccstruct/ratngs.h\n    src/ccstruct/rect.h\n    src/ccstruct/rejctmap.h\n    src/ccstruct/seam.h\n    src/ccstruct/split.h\n    src/ccstruct/statistc.h\n    src/ccstruct/stepblob.h\n    src/ccstruct/werd.h\n    src/ccutil/ambigs.h\n    src/ccutil/bitvector.h\n    src/ccutil/ccutil.h\n    
src/ccutil/clst.h\n    src/ccutil/elst.h\n    src/ccutil/elst2.h\n    src/ccutil/errcode.h\n    src/ccutil/fileerr.h\n    src/ccutil/genericvector.h\n    src/ccutil/genericheap.h\n    src/ccutil/helpers.h\n    src/ccutil/host.h\n    src/ccutil/indexmapbidi.h\n    src/ccutil/kdpair.h\n    src/ccutil/lsterr.h\n    src/ccutil/object_cache.h\n    src/ccutil/params.h\n    src/ccutil/qrsequence.h\n    src/ccutil/scanutils.h\n    src/ccutil/serialis.h\n    src/ccutil/sorthelper.h\n    src/ccutil/tessdatamanager.h\n    src/ccutil/tesserrstream.h\n    src/ccutil/tesstypes.h\n    src/ccutil/tprintf.h\n    src/ccutil/unicity_table.h\n    src/ccutil/unicharcompress.h\n    src/ccutil/unicharmap.h\n    src/ccutil/unicharset.h\n    src/ccutil/universalambigs.h\n    src/classify/adaptive.h\n    src/classify/classify.h\n    src/classify/cluster.h\n    src/classify/clusttool.h\n    src/classify/featdefs.h\n    src/classify/float2int.h\n    src/classify/fpoint.h\n    src/classify/intfeaturespace.h\n    src/classify/intfx.h\n    src/classify/intmatcher.h\n    src/classify/intproto.h\n    src/classify/kdtree.h\n    src/classify/mf.h\n    src/classify/mfdefs.h\n    src/classify/mfoutline.h\n    src/classify/mfx.h\n    src/classify/normfeat.h\n    src/classify/normmatch.h\n    src/classify/ocrfeatures.h\n    src/classify/outfeat.h\n    src/classify/picofeat.h\n    src/classify/protos.h\n    src/classify/shapeclassifier.h\n    src/classify/shapetable.h\n    src/classify/tessclassifier.h\n    src/classify/trainingsample.h\n    src/cutil/bitvec.h\n    src/cutil/oldlist.h\n    src/dict/dawg.h\n    src/dict/dawg_cache.h\n    src/dict/dict.h\n    src/dict/matchdefs.h\n    src/dict/stopper.h\n    src/dict/trie.h\n    src/lstm/convolve.h\n    src/lstm/fullyconnected.h\n    src/lstm/functions.h\n    src/lstm/input.h\n    src/lstm/lstm.h\n    src/lstm/lstmrecognizer.h\n    src/lstm/maxpool.h\n    src/lstm/network.h\n    src/lstm/networkio.h\n    src/lstm/networkscratch.h\n    src/lstm/parallel.h\n 
   src/lstm/plumbing.h\n    src/lstm/recodebeam.h\n    src/lstm/reconfig.h\n    src/lstm/reversed.h\n    src/lstm/series.h\n    src/lstm/static_shape.h\n    src/lstm/stridemap.h\n    src/lstm/weightmatrix.h\n    src/textord/alignedblob.h\n    src/textord/baselinedetect.h\n    src/textord/bbgrid.h\n    src/textord/blkocc.h\n    src/textord/blobgrid.h\n    src/textord/ccnontextdetect.h\n    src/textord/cjkpitch.h\n    src/textord/colfind.h\n    src/textord/colpartition.h\n    src/textord/colpartitiongrid.h\n    src/textord/colpartitionset.h\n    src/textord/devanagari_processing.h\n    src/textord/drawtord.h\n    src/textord/edgblob.h\n    src/textord/edgloop.h\n    src/textord/equationdetectbase.h\n    src/textord/fpchop.h\n    src/textord/gap_map.h\n    src/textord/imagefind.h\n    src/textord/linefind.h\n    src/textord/makerow.h\n    src/textord/oldbasel.h\n    src/textord/pithsync.h\n    src/textord/pitsync1.h\n    src/textord/scanedg.h\n    src/textord/sortflts.h\n    src/textord/strokewidth.h\n    src/textord/tabfind.h\n    src/textord/tablefind.h\n    src/textord/tablerecog.h\n    src/textord/tabvector.h\n    src/textord/textlineprojection.h\n    src/textord/textord.h\n    src/textord/topitch.h\n    src/textord/tordmain.h\n    src/textord/tovars.h\n    src/textord/underlin.h\n    src/textord/wordseg.h\n    src/textord/workingpartset.h\n    src/viewer/scrollview.h\n    src/viewer/svmnode.h\n    src/viewer/svutil.h\n    src/wordrec/associate.h\n    src/wordrec/chop.h\n    src/wordrec/drawfx.h\n    src/wordrec/findseam.h\n    src/wordrec/language_model.h\n    src/wordrec/lm_consistency.h\n    src/wordrec/lm_pain_points.h\n    src/wordrec/lm_state.h\n    src/wordrec/outlines.h\n    src/wordrec/params_model.h\n    src/wordrec/plotedges.h\n    src/wordrec/render.h\n    src/wordrec/wordrec.h\n)\n\n# Combine all core source files\nset(TESSERACT_SRC_CORE\n    ${TESSERACT_SRC_API}\n    ${TESSERACT_SRC_CCMAIN}\n    ${TESSERACT_SRC_CCSTRUCT}\n    
${TESSERACT_SRC_CCUTIL}\n    ${TESSERACT_SRC_CLASSIFY}\n    ${TESSERACT_SRC_CUTIL}\n    ${TESSERACT_SRC_DICT}\n    ${TESSERACT_SRC_LSTM}\n    ${TESSERACT_SRC_TEXTORD}\n    ${TESSERACT_SRC_VIEWER}\n    ${TESSERACT_SRC_WORDREC}\n)\n"
  },
  {
    "path": "cmake/templates/TesseractConfig.cmake.in",
    "content": "# ===================================================================================\n#  The Tesseract CMake configuration file\n#\n#             ** File generated automatically, do not modify **\n#\n#  Usage from an external project:\n#    In your CMakeLists.txt, add these lines:\n#\n#    find_package(Tesseract REQUIRED)\n#    target_link_libraries(MY_TARGET_NAME Tesseract::libtesseract)\n#\n#    This file will define the following variables:\n#      - Tesseract_LIBRARIES             : The list of all imported targets.\n#      - Tesseract_INCLUDE_DIRS          : The Tesseract include directories.\n#      - Tesseract_LIBRARY_DIRS          : The Tesseract library directories.\n#      - Tesseract_VERSION               : The version of this Tesseract build: \"@VERSION_PLAIN@\"\n#      - Tesseract_VERSION_MAJOR         : Major version part of Tesseract_VERSION: \"@VERSION_MAJOR@\"\n#      - Tesseract_VERSION_MINOR         : Minor version part of Tesseract_VERSION: \"@VERSION_MINOR@\"\n#      - Tesseract_VERSION_PATCH         : Patch version part of Tesseract_VERSION: \"@VERSION_PATCH@\"\n#\n# ===================================================================================\n\ninclude(CMakeFindDependencyMacro)\nfind_dependency(Leptonica)\n\ninclude(${CMAKE_CURRENT_LIST_DIR}/TesseractTargets.cmake)\n\n@PACKAGE_INIT@\n\nSET(Tesseract_VERSION           @VERSION_PLAIN@)\nSET(Tesseract_VERSION_MAJOR     @VERSION_MAJOR@)\nSET(Tesseract_VERSION_MINOR     @VERSION_MINOR@)\nSET(Tesseract_VERSION_PATCH     @VERSION_PATCH@)\n\nset_and_check(Tesseract_INCLUDE_DIRS \"@PACKAGE_INCLUDE_DIR@\")\nset_and_check(Tesseract_LIBRARY_DIRS \"@PACKAGE_LIBRARY_DIRS@\")\nset(Tesseract_LIBRARIES @tesseract_OUTPUT_NAME@)\n\ncheck_required_components(Tesseract)\n"
  },
  {
    "path": "cmake/templates/cmake_uninstall.cmake.in",
    "content": "# https://gitlab.kitware.com/cmake/community/wikis/FAQ#can-i-do-make-uninstall-with-cmake\nif(NOT EXISTS \"@CMAKE_BINARY_DIR@/install_manifest.txt\")\n  message(FATAL_ERROR \"Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt\")\nendif(NOT EXISTS \"@CMAKE_BINARY_DIR@/install_manifest.txt\")\n\nfile(READ \"@CMAKE_BINARY_DIR@/install_manifest.txt\" files)\nstring(REGEX REPLACE \"\\n\" \";\" files \"${files}\")\nforeach(file ${files})\n  message(STATUS \"Uninstalling $ENV{DESTDIR}${file}\")\n  if(IS_SYMLINK \"$ENV{DESTDIR}${file}\" OR EXISTS \"$ENV{DESTDIR}${file}\")\n    exec_program(\n      \"@CMAKE_COMMAND@\" ARGS \"-E remove \\\"$ENV{DESTDIR}${file}\\\"\"\n      OUTPUT_VARIABLE rm_out\n      RETURN_VALUE rm_retval\n      )\n    if(NOT \"${rm_retval}\" STREQUAL 0)\n      message(FATAL_ERROR \"Problem when removing $ENV{DESTDIR}${file}\")\n    endif(NOT \"${rm_retval}\" STREQUAL 0)\n  else(IS_SYMLINK \"$ENV{DESTDIR}${file}\" OR EXISTS \"$ENV{DESTDIR}${file}\")\n    message(STATUS \"File $ENV{DESTDIR}${file} does not exist.\")\n  endif(IS_SYMLINK \"$ENV{DESTDIR}${file}\" OR EXISTS \"$ENV{DESTDIR}${file}\")\nendforeach(file)\n"
  },
  {
    "path": "configure.ac",
    "content": "# -*-Shell-script-*-\n#\n# Copyright (c) Luc Vincent\n\n# ----------------------------------------\n# Initialization\n# ----------------------------------------\nAC_PREREQ([2.69])\nAC_INIT([tesseract],\n        [m4_esyscmd_s([test -d .git && git describe --abbrev=4 2>/dev/null || cat VERSION])],\n        [https://github.com/tesseract-ocr/tesseract/issues],,\n        [https://github.com/tesseract-ocr/tesseract/])\n\n# Store command like options for CXXFLAGS\nOLD_CXXFLAGS=$CXXFLAGS\nAC_PROG_CXX([g++ clang++])\n# reset compiler flags to initial flags\nAC_LANG([C++])\nAC_LANG_COMPILER_REQUIRE\nCXXFLAGS=${CXXFLAGS:-\"\"}\nAC_CONFIG_MACRO_DIR([m4])\nAC_CONFIG_AUX_DIR([config])\nAC_CONFIG_SRCDIR([src/tesseract.cpp])\nAC_PREFIX_DEFAULT([/usr/local])\n\n# Automake configuration. Do not require README file (we use README.md).\nAM_INIT_AUTOMAKE([foreign subdir-objects nostdinc])\n\n# Define date of package, etc. Could be useful in auto-generated\n# documentation.\nPACKAGE_YEAR=2025\nPACKAGE_DATE=\"12/26\"\n\nabs_top_srcdir=`AS_DIRNAME([$0])`\n\nAC_DEFINE_UNQUOTED([PACKAGE_NAME], [\"${PACKAGE_NAME}\"], [Name of package])\nAC_DEFINE_UNQUOTED([PACKAGE_VERSION], [\"${PACKAGE_VERSION}\"], [Version number])\nAC_DEFINE_UNQUOTED([PACKAGE_YEAR], [\"$PACKAGE_YEAR\"], [Official year for this release])\nAC_DEFINE_UNQUOTED([PACKAGE_DATE], [\"$PACKAGE_DATE\"], [Official date of release])\n\nAC_SUBST([PACKAGE_NAME])\nAC_SUBST([PACKAGE_VERSION])\nAC_SUBST([PACKAGE_YEAR])\nAC_SUBST([PACKAGE_DATE])\n\nGENERIC_LIBRARY_NAME=tesseract\n\n# Release versioning. 
Get versions from PACKAGE_VERSION.\nAX_SPLIT_VERSION\nGENERIC_MAJOR_VERSION=$(echo \"$AX_MAJOR_VERSION\" | $SED 's/^[[^0-9]]*//')\nGENERIC_MINOR_VERSION=$AX_MINOR_VERSION\nGENERIC_MICRO_VERSION=`echo \"$AX_POINT_VERSION\" | $SED 's/^\\([[0-9]][[0-9]]*\\).*/\\1/'`\n\n# API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION)\nGENERIC_API_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION\nGENERIC_LIBRARY_VERSION=$GENERIC_MAJOR_VERSION:$GENERIC_MINOR_VERSION\nAC_SUBST([GENERIC_API_VERSION])\nAC_SUBST([GENERIC_MAJOR_VERSION])\nAC_SUBST([GENERIC_MINOR_VERSION])\nAC_SUBST([GENERIC_MICRO_VERSION])\n\nAC_SUBST([GENERIC_LIBRARY_VERSION])\nPACKAGE=$GENERIC_LIBRARY_NAME\nAC_SUBST([GENERIC_LIBRARY_NAME])\n\nGENERIC_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION.$GENERIC_MICRO_VERSION\nGENERIC_RELEASE=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION\nAC_SUBST([GENERIC_RELEASE])\nAC_SUBST([GENERIC_VERSION])\n\nAC_CONFIG_HEADERS([include/config_auto.h:config/config.h.in])\n\n# default conditional\nAM_CONDITIONAL([T_WIN], false)\nAM_CONDITIONAL([MINGW], false)\nAM_CONDITIONAL([GRAPHICS_DISABLED], false)\nAC_SUBST([AM_CPPFLAGS])\n\n# Be less noisy by default.\n# Can be overridden with `configure --disable-silent-rules` or with `make V=1`.\nAM_SILENT_RULES([yes])\n\n#############################\n#\n# Platform specific setup\n#\n#############################\nAC_CANONICAL_HOST\ncase \"${host_os}\" in\n    mingw*)\n        AC_DEFINE_UNQUOTED([MINGW], 1, [This is a MinGW system])\n        AM_CONDITIONAL([T_WIN], true)\n        AM_CONDITIONAL([MINGW], true)\n        AM_CONDITIONAL([ADD_RT], false)\n        AC_SUBST([AM_LDFLAGS], ['-no-undefined'])\n        ;;\n    cygwin*)\n        AM_CONDITIONAL([ADD_RT], false)\n        AC_SUBST([NOUNDEFINED], ['-no-undefined'])\n        ;;\n    solaris*)\n        LIBS=\"$LIBS -lsocket -lnsl -lrt -lxnet\"\n        AM_CONDITIONAL([ADD_RT], true)\n        ;;\n    *darwin*)\n        AM_CONDITIONAL([ADD_RT], false)\n        ;;\n    
*android*|openbsd*)\n        AM_CONDITIONAL([ADD_RT], false)\n        ;;\n    powerpc-*-darwin*)\n        ;;\n    *)\n        # default\n        AM_CONDITIONAL([ADD_RT], true)\n        ;;\nesac\n\nWERROR=-Werror\n# The test code used by AX_CHECK_COMPILE_FLAG uses an empty statement\n# and unused macros which must not raise a compiler error, but it must\n# be an error if flags like -avx are ignored on ARM and other\n# architectures because they are unsupported.\nAX_CHECK_COMPILE_FLAG([-Werror=unused-command-line-argument], [WERROR=-Werror=unused-command-line-argument])\n\n## Checks for supported compiler options.\n\nAM_CONDITIONAL([HAVE_AVX], false)\nAM_CONDITIONAL([HAVE_AVX2], false)\nAM_CONDITIONAL([HAVE_AVX512F], false)\nAM_CONDITIONAL([HAVE_FMA], false)\nAM_CONDITIONAL([HAVE_SSE4_1], false)\nAM_CONDITIONAL([HAVE_NEON], false)\nAM_CONDITIONAL([HAVE_RVV], false)\n\ncase \"${host_cpu}\" in\n\n  amd64|*86*)\n\n    AX_CHECK_COMPILE_FLAG([-mavx], [avx=true], [avx=false], [$WERROR])\n    AM_CONDITIONAL([HAVE_AVX], ${avx})\n    if $avx; then\n      AC_DEFINE([HAVE_AVX], [1], [Enable AVX instructions])\n    fi\n\n    AX_CHECK_COMPILE_FLAG([-mavx2], [avx2=true], [avx2=false], [$WERROR])\n    AM_CONDITIONAL([HAVE_AVX2], $avx2)\n    if $avx2; then\n      AC_DEFINE([HAVE_AVX2], [1], [Enable AVX2 instructions])\n    fi\n\n    AX_CHECK_COMPILE_FLAG([-mavx512f], [avx512f=true], [avx512f=false], [$WERROR])\n    AM_CONDITIONAL([HAVE_AVX512F], $avx512f)\n    if $avx512f; then\n      AC_DEFINE([HAVE_AVX512F], [1], [Enable AVX512F instructions])\n    fi\n\n    AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])\n    AM_CONDITIONAL([HAVE_FMA], $fma)\n    if $fma; then\n      AC_DEFINE([HAVE_FMA], [1], [Enable FMA instructions])\n    fi\n\n    AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false], [$WERROR])\n    AM_CONDITIONAL([HAVE_SSE4_1], $sse41)\n    if $sse41; then\n      AC_DEFINE([HAVE_SSE4_1], [1], [Enable SSE 4.1 instructions])\n    fi\n\n    ;;\n\n  
aarch64*|arm64)\n\n    # ARMv8 always has NEON and does not need special compiler flags.\n    AM_CONDITIONAL([HAVE_NEON], true)\n    AC_DEFINE([HAVE_NEON], [1], [Enable NEON instructions])\n    ;;\n\n  arm*)\n\n    AX_CHECK_COMPILE_FLAG([-mfpu=neon], [neon=true], [neon=false], [$WERROR])\n    AM_CONDITIONAL([HAVE_NEON], $neon)\n    if $neon; then\n      AC_DEFINE([HAVE_NEON], [1], [Enable NEON instructions])\n      NEON_CXXFLAGS=\"-mfpu=neon\"\n      AC_SUBST([NEON_CXXFLAGS])\n      check_for_neon=1\n    fi\n\n    ;;\n\n  riscv*)\n\n    AX_CHECK_COMPILE_FLAG([-march=rv64gcv], [rvv=true], [rvv=false], [$WERROR])\n    AM_CONDITIONAL([HAVE_RVV], [$rvv])\n    if $rvv; then\n      AC_DEFINE([HAVE_RVV], [1], [Enable RVV instructions])\n      check_for_rvv=1\n    fi\n    ;;\n\n  *)\n\n    AC_MSG_WARN([No compiler options for $host_cpu])\n\nesac\n\n# check whether feenableexcept is supported. some C libraries (e.g. uclibc) don't.\nAC_CHECK_FUNCS([feenableexcept])\n\n# additional checks for NEON targets\nif test x$check_for_neon = x1; then\n  AC_MSG_NOTICE([checking how to detect NEON availability])\n  AC_CHECK_FUNCS([getauxval elf_aux_info android_getCpuFamily])\n\n  if test $ac_cv_func_getauxval = no && test $ac_cv_func_elf_aux_info = no && test $ac_cv_func_android_getCpuFamily = no; then\n      AC_MSG_WARN([NEON is available, but we don't know how to check for it.  Will not be able to use NEON.])\n  fi\nfi\n\n# additional checks for RVV targets\nif test x$check_for_rvv = x1; then\n  AC_MSG_NOTICE([checking how to detect RVV availability])\n  AC_CHECK_FUNCS([getauxval elf_aux_info])\n\n  if test $ac_cv_func_getauxval = no && test $ac_cv_func_elf_aux_info = no; then\n      AC_MSG_WARN([RVV is available, but we don't know how to check for it.  
Will not be able to use RVV.])\n  fi\nfi\n\nAX_CHECK_COMPILE_FLAG([-fopenmp-simd], [openmp_simd=true], [openmp_simd=false], [$WERROR])\nAM_CONDITIONAL([OPENMP_SIMD], $openmp_simd)\n\nAC_ARG_WITH([extra-includes],\n            [AS_HELP_STRING([--with-extra-includes=DIR],\n                       [Define an additional directory for include files])],\n            [if test -d \"$withval\" ; then\n               CFLAGS=\"$CFLAGS -I$withval\"\n             else\n               AC_MSG_ERROR([Cannot stat directory $withval])\n             fi])\n\nAC_ARG_WITH([extra-libraries],\n            [AS_HELP_STRING([--with-extra-libraries=DIR],\n                       [Define an additional directory for library files])],\n            [if test -d \"$withval\" ; then\n              LDFLAGS=\"$LDFLAGS -L$withval\"\n             else\n               AC_MSG_ERROR([Cannot stat directory $withval])\n             fi])\n\nAC_MSG_CHECKING([--enable-float32 argument])\nAC_ARG_ENABLE([float32],\n\t      AS_HELP_STRING([--disable-float32], [disable float and enable double for LSTM]))\nAC_MSG_RESULT([$enable_float32])\nif test \"$enable_float32\" != \"no\"; then\n  AC_DEFINE([FAST_FLOAT], [1], [Enable float for LSTM])\nfi\n\nAC_MSG_CHECKING([--enable-graphics argument])\nAC_ARG_ENABLE([graphics],\n  AS_HELP_STRING([--disable-graphics], [disable graphics (ScrollView)]))\nAC_MSG_RESULT([$enable_graphics])\nif test \"$enable_graphics\" = \"no\"; then\n  AC_DEFINE([GRAPHICS_DISABLED], [], [Disable graphics])\n  AM_CONDITIONAL([GRAPHICS_DISABLED], true)\nfi\n\nAC_MSG_CHECKING([--enable-legacy argument])\nAC_ARG_ENABLE([legacy],\n  AS_HELP_STRING([--disable-legacy], [disable the legacy OCR engine]))\nAC_MSG_RESULT([$enable_legacy])\nAM_CONDITIONAL([DISABLED_LEGACY_ENGINE], test \"$enable_legacy\" = \"no\")\nif test \"$enable_legacy\" = \"no\"; then\n  AC_DEFINE([DISABLED_LEGACY_ENGINE], [1], [Disable legacy OCR engine])\nfi\n\n# check whether to build OpenMP support\nAC_OPENMP\n\nhave_tiff=false\n# Note 
that the first usage of AC_CHECK_HEADERS must be unconditional.\nAC_CHECK_HEADERS([tiffio.h], [have_tiff=true], [have_tiff=false])\n\n# Configure arguments which allow disabling some optional libraries.\nAC_ARG_WITH([archive],\n            AS_HELP_STRING([--with-archive],\n                           [Build with libarchive which supports compressed model files @<:@default=check@:>@]),\n            [], [with_archive=check])\nAC_ARG_WITH([curl],\n            AS_HELP_STRING([--with-curl],\n                           [Build with libcurl which supports processing an image URL @<:@default=check@:>@]),\n            [], [with_curl=check])\n\n# https://lists.apple.com/archives/unix-porting/2009/Jan/msg00026.html\nm4_define([MY_CHECK_FRAMEWORK],\n  [AC_CACHE_CHECK([if -framework $1 works],[my_cv_framework_$1],\n     [save_LIBS=\"$LIBS\"\n     LIBS=\"$LIBS -framework $1\"\n     AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])],\n             [my_cv_framework_$1=yes],\n            [my_cv_framework_$1=no])\n     LIBS=\"$save_LIBS\"\n    ])\n   if test \"$my_cv_framework_$1\"=\"yes\"; then\n     AC_DEFINE(AS_TR_CPP([HAVE_FRAMEWORK_$1]), 1,\n            [Define if you have the $1 framework])\n     AS_TR_CPP([FRAMEWORK_$1])=\"-framework $1\"\n     AC_SUBST(AS_TR_CPP([FRAMEWORK_$1]))\n   fi]\n)\n\ncase \"${host_os}\" in\n  *darwin* | *-macos10*)\n    MY_CHECK_FRAMEWORK([Accelerate])\n    if test $my_cv_framework_Accelerate = yes; then\n      AM_CPPFLAGS=\"-DHAVE_FRAMEWORK_ACCELERATE $AM_CPPFLAGS\"\n      AM_LDFLAGS=\"$AM_LDFLAGS -framework Accelerate\"\n    fi\n    ;;\n  *)\n    # default\n    ;;\nesac\n\n# check whether to build tesseract with -fvisibility=hidden -fvisibility-inlines-hidden\n# http://gcc.gnu.org/wiki/Visibility\n# https://groups.google.com/g/tesseract-dev/c/l2ZFrpgYkSc/m/_cdYSRDSXuUJ\nAC_MSG_CHECKING([--enable-visibility argument])\nAC_ARG_ENABLE([visibility],\n  AS_HELP_STRING([--enable-visibility],\n                 [enable experimental build with -fvisibility 
[default=no]]))\nAC_MSG_RESULT([$enable_visibility])\nAM_CONDITIONAL([VISIBILITY], [test \"$enable_visibility\" = \"yes\"])\n\n# Check if tessdata-prefix is disabled\nAC_MSG_CHECKING([whether to use tessdata-prefix])\nAC_ARG_ENABLE([tessdata-prefix],\n    [AS_HELP_STRING([--disable-tessdata-prefix],\n            [don't set TESSDATA-PREFIX during compile])],\n    [tessdata_prefix=\"no\"], [tessdata_prefix=\"yes\"])\nAC_MSG_RESULT([$tessdata_prefix])\nAM_CONDITIONAL([NO_TESSDATA_PREFIX], [test \"$tessdata_prefix\" = \"no\"])\n\n\n# Detect Clang compiler\nAC_MSG_CHECKING([if compiling with clang])\nAC_COMPILE_IFELSE(\n[AC_LANG_PROGRAM([], [[\n#ifndef __clang__\n       not clang\n#endif\n]])],\n[CLANG=yes], [CLANG=no])\nAC_MSG_RESULT([$CLANG])\n\n# Check whether to enable debugging\nAC_MSG_CHECKING([whether to enable debugging])\nAC_ARG_ENABLE([debug],\n  AS_HELP_STRING([--enable-debug], [turn on debugging [default=no]]))\nAC_MSG_RESULT([$enable_debug])\nif test x\"$enable_debug\" = x\"yes\"; then\n    CXXFLAGS=${CXXFLAGS:-\"-O2\"}\n    AM_CPPFLAGS=\"$AM_CPPFLAGS -g -Wall -DDEBUG -pedantic\"\n    AM_CXXFLAGS=\"$AM_CXXFLAGS -g -Wall -DDEBUG -pedantic\"\n    if test \"x$CLANG\" = \"xyes\"; then\n        # https://clang.llvm.org/docs/CommandGuide/clang.html\n        # clang treats -Og as -O1\n        AM_CPPFLAGS=\"$AM_CPPFLAGS -O0\"\n        AM_CXXFLAGS=\"$AM_CXXFLAGS -O0\"\n    else\n        AM_CPPFLAGS=\"$AM_CPPFLAGS -Og\"\n        AM_CXXFLAGS=\"$AM_CXXFLAGS -Og\"\n    fi\nelse\n    AM_CXXFLAGS=\"$AM_CXXFLAGS -O2 -DNDEBUG\"\n    AM_CPPFLAGS=\"$AM_CPPFLAGS -O2 -DNDEBUG\"\nfi\n\n# ----------------------------------------\n# Init libtool\n# ----------------------------------------\n\nLT_INIT\n\n\n# ----------------------------------------\n# C++ related options\n# ----------------------------------------\ndnl **********************\ndnl Turn on C++17 or newer\ndnl **********************\n\nCPLUSPLUS=\nAX_CHECK_COMPILE_FLAG([-std=c++17], [CPLUSPLUS=17], [], 
[$WERROR])\nAX_CHECK_COMPILE_FLAG([-std=c++20], [CPLUSPLUS=20], [], [$WERROR])\n\nif test -z \"$CPLUSPLUS\"; then\n  AC_MSG_ERROR([Your compiler does not have the necessary C++17 support! Cannot proceed.])\nfi\n\n# Set C++17 or newer support based on platform/compiler\ncase \"${host_os}\" in\n  cygwin*)\n    CXXFLAGS=\"$CXXFLAGS -std=gnu++$CPLUSPLUS\"\n    ;;\n  *-darwin* | *-macos10*)\n    CXXFLAGS=\"$CXXFLAGS -std=c++$CPLUSPLUS\"\n    if test \"x$CLANG\" = \"xyes\"; then\n      LDFLAGS=\"$LDFLAGS -stdlib=libc++\"\n    fi\n    ;;\n  *)\n    # default\n    CXXFLAGS=\"$CXXFLAGS -std=c++$CPLUSPLUS\"\n    ;;\nesac\n\n\n# ----------------------------------------\n# Check for libraries\n# ----------------------------------------\n\nAC_SEARCH_LIBS([pthread_create], [pthread])\n\n# Set PKG_CONFIG_PATH for macOS with Homebrew unless it is already set.\nAC_CHECK_PROG([have_brew], brew, true, false)\nif $have_brew; then\n  brew_prefix=$(brew --prefix)\n  if test -z \"$PKG_CONFIG_PATH\"; then\n    PKG_CONFIG_PATH=$brew_prefix/opt/icu4c/lib/pkgconfig:$brew_prefix/opt/libarchive/lib/pkgconfig\n    export PKG_CONFIG_PATH\n  fi\nfi\n\n# ----------------------------------------\n# Check for programs needed to build documentation.\n# ----------------------------------------\n\nAM_CONDITIONAL([ASCIIDOC], false)\nAM_CONDITIONAL([HAVE_XML_CATALOG_FILES], false)\nAC_ARG_ENABLE([doc],\n              AS_HELP_STRING([--disable-doc], [disable build of documentation])\n              [],\n              [: m4_divert_text([DEFAULTS], [enable_doc=check])])\nAS_IF([test \"$enable_doc\" != \"no\"], [\n  AC_CHECK_PROG([have_asciidoc], asciidoc, true, false)\n  AC_CHECK_PROG([have_xsltproc], xsltproc, true, false)\n  # macOS with Homebrew requires the environment variable\n  # XML_CATALOG_FILES for xsltproc.\n  if $have_asciidoc && $have_xsltproc; then\n    AM_CONDITIONAL([ASCIIDOC], true)\n    XML_CATALOG_FILES=\n    if $have_brew; then\n      catalog_file=$brew_prefix/etc/xml/catalog\n      if 
test -f $catalog_file; then\n        AM_CONDITIONAL([HAVE_XML_CATALOG_FILES], true)\n        XML_CATALOG_FILES=file:$catalog_file\n      else\n        AC_MSG_WARN([Missing file $catalog_file.])\n      fi\n    fi\n    AC_SUBST([XML_CATALOG_FILES])\n  else\n    AS_IF([test \"x$enable_doc\" != xcheck], [\n      AC_MSG_FAILURE(\n        [--enable-doc was given, but test for asciidoc and xsltproc failed])\n    ])\n  fi\n])\n\n# ----------------------------------------\n# Checks for typedefs, structures, and compiler characteristics.\n# ----------------------------------------\n\nAC_CHECK_TYPES([wchar_t],,, [#include \"wchar.h\"])\nAC_CHECK_TYPES([long long int])\n\n# ----------------------------------------\n# Test auxiliary packages\n# ----------------------------------------\n\nAM_CONDITIONAL([HAVE_LIBCURL], false)\nAS_IF([test \"x$with_curl\" != xno], [\n  PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false])\n  AM_CONDITIONAL([HAVE_LIBCURL], $have_libcurl)\n  if $have_libcurl; then\n    AC_DEFINE([HAVE_LIBCURL], [1], [Enable libcurl])\n  else\n    AS_IF([test \"x$with_curl\" != xcheck], [\n      AC_MSG_FAILURE(\n        [--with-curl was given, but test for libcurl failed])\n    ])\n  fi\n])\n\nPKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false])\nif $have_lept; then\n  CPPFLAGS=\"$CPPFLAGS $LEPTONICA_CFLAGS\"\nelse\n  AC_MSG_ERROR([Leptonica 1.74 or higher is required. 
Try to install libleptonica-dev package.])\nfi\n\nAM_CONDITIONAL([HAVE_LIBARCHIVE], false)\nAS_IF([test \"x$with_archive\" != xno], [\n  PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])\n  AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])\n  if $have_libarchive; then\n    AC_DEFINE([HAVE_LIBARCHIVE], [1], [Enable libarchive])\n    CPPFLAGS=\"$CPPFLAGS $libarchive_CFLAGS\"\n  else\n    AS_IF([test \"x$with_archive\" != xcheck], [\n      AC_MSG_FAILURE(\n        [--with-archive was given, but test for libarchive failed])\n    ])\n  fi\n])\n\nAM_CONDITIONAL([ENABLE_TRAINING], true)\n\n# Check availability of ICU packages.\nPKG_CHECK_MODULES([ICU_UC], [icu-uc >= 52.1], [have_icu_uc=true], [have_icu_uc=false])\nPKG_CHECK_MODULES([ICU_I18N], [icu-i18n >= 52.1], [have_icu_i18n=true], [have_icu_i18n=false])\nif !($have_icu_uc && $have_icu_i18n); then\n  AC_MSG_WARN([icu 52.1 or higher is required, but was not found.])\n  AC_MSG_WARN([Training tools WILL NOT be built.])\n  AC_MSG_WARN([Try to install libicu-dev package.])\n  AM_CONDITIONAL([ENABLE_TRAINING], false)\nfi\n\n# Check location of pango headers\nPKG_CHECK_MODULES([pango], [pango >= 1.38.0], [have_pango=true], [have_pango=false])\nif !($have_pango); then\n        AC_MSG_WARN([pango 1.38.0 or higher is required, but was not found.])\n        AC_MSG_WARN([Training tools WILL NOT be built.])\n        AC_MSG_WARN([Try to install libpango1.0-dev package.])\n        AM_CONDITIONAL([ENABLE_TRAINING], false)\nfi\n\n# Check location of cairo headers\nPKG_CHECK_MODULES([cairo], [cairo], [have_cairo=true], [have_cairo=false])\nif !($have_cairo); then\n        AC_MSG_WARN([Training tools WILL NOT be built because of missing cairo library.])\n        AC_MSG_WARN([Try to install libcairo-dev?? 
package.])\n        AM_CONDITIONAL([ENABLE_TRAINING], false)\nfi\n\nPKG_CHECK_MODULES([pangocairo], [pangocairo], [], [false])\nPKG_CHECK_MODULES([pangoft2], [pangoft2], [], [false])\n\n# ----------------------------------------\n# Final Tasks and Output\n# ----------------------------------------\n\n# Output files\nAC_CONFIG_FILES([include/tesseract/version.h])\nAC_CONFIG_FILES([Makefile tesseract.pc])\nAC_CONFIG_FILES([tessdata/Makefile])\nAC_CONFIG_FILES([tessdata/configs/Makefile])\nAC_CONFIG_FILES([tessdata/tessconfigs/Makefile])\nAC_CONFIG_FILES([java/Makefile])\nAC_CONFIG_FILES([java/com/Makefile])\nAC_CONFIG_FILES([java/com/google/Makefile])\nAC_CONFIG_FILES([java/com/google/scrollview/Makefile])\nAC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])\nAC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])\nAC_CONFIG_FILES([nsis/Makefile])\nAC_OUTPUT\n\n# Final message\necho \"\"\necho \"Configuration is done.\"\necho \"You can now build and install $PACKAGE_NAME by running:\"\necho \"\"\necho \"$ make\"\necho \"$ sudo make install\"\necho \"$ sudo ldconfig\"\necho \"\"\n\nAM_COND_IF([ASCIIDOC], [\n  echo \"This will also build the documentation.\"\n], [\n  AS_IF([test \"$enable_doc\" = \"no\"], [\n    echo \"Documentation will not be built because it was disabled.\"\n  ], [\n    echo \"Documentation will not be built because asciidoc or xsltproc is missing.\"\n  ])\n])\n\n# echo \"$ sudo make install LANGS=\\\"eng ara deu\\\"\"\n# echo \"  Or:\"\n# echo \"$ sudo make install-langs\"\necho \"\"\n\nAM_COND_IF([ENABLE_TRAINING],\n  [\n   echo \"Training tools can be built and installed with:\"\n   echo \"\"\n   echo \"$ make training\"\n   echo \"$ sudo make training-install\"\n   echo \"\"],\n  [\n   echo \"You cannot build training tools because of missing dependency.\"\n   echo \"Check configure output for details.\"\n   echo \"\"]\n)\n\n# ----------------------------------------\n# CONFIG Template\n# ----------------------------------------\n\n# 
Fence added in configuration file\nAH_TOP([\n#ifndef CONFIG_AUTO_H\n#define CONFIG_AUTO_H\n/* config_auto.h: begin */\n])\n\n# Stuff added at bottom of file\nAH_BOTTOM([\n\n/* Miscellaneous defines */\n#define AUTOCONF 1\n\n/* Not used yet\n#ifndef NO_GETTEXT\n#define USING_GETTEXT\n#endif\n*/\n\n/* config_auto.h: end */\n#endif\n])\n"
  },
  {
    "path": "doc/Doxyfile",
    "content": "# Doxyfile 1.8.16\n\n# This file describes the settings to be used by the documentation system\n# doxygen (www.doxygen.org) for a project.\n#\n# All text after a double hash (##) is considered a comment and is placed in\n# front of the TAG it is preceding.\n#\n# All text after a single hash (#) is considered a comment and will be ignored.\n# The format is:\n# TAG = value [value, ...]\n# For lists, items can also be appended using:\n# TAG += value [value, ...]\n# Values that contain spaces should be placed between quotes (\\\" \\\").\n\n#---------------------------------------------------------------------------\n# Project related configuration options\n#---------------------------------------------------------------------------\n\n# This tag specifies the encoding used for all characters in the configuration\n# file that follow. The default is UTF-8 which is also the encoding used for all\n# text before the first occurrence of this tag. Doxygen uses libiconv (or the\n# iconv built into libc) for the transcoding. See\n# https://www.gnu.org/software/libiconv/ for the list of possible encodings.\n# The default value is: UTF-8.\n\nDOXYFILE_ENCODING      = UTF-8\n\n# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by\n# double-quotes, unless you are using Doxywizard) that should identify the\n# project for which the documentation is generated. This name is used in the\n# title of most generated pages and in a few other places.\n# The default value is: My Project.\n\nPROJECT_NAME           = $(name)\n\n# The PROJECT_NUMBER tag can be used to enter a project or revision number. This\n# could be handy for archiving the generated documentation or if some version\n# control system is used.\n\nPROJECT_NUMBER         = $(version)\n\n# Using the PROJECT_BRIEF tag one can provide an optional one line description\n# for a project that appears at the top of each page and should give viewer a\n# quick idea about the purpose of the project. 
Keep the description short.\n\nPROJECT_BRIEF          =\n\n# With the PROJECT_LOGO tag one can specify a logo or an icon that is included\n# in the documentation. The maximum height of the logo should not exceed 55\n# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy\n# the logo to the output directory.\n\nPROJECT_LOGO           =\n\n# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path\n# into which the generated documentation will be written. If a relative path is\n# entered, it will be relative to the location where doxygen was started. If\n# left blank the current directory will be used.\n\nOUTPUT_DIRECTORY       = doc/\n\n# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-\n# directories (in 2 levels) under the output directory of each output format and\n# will distribute the generated files over these directories. Enabling this\n# option can be useful when feeding doxygen a huge amount of source files, where\n# putting all generated files in the same directory would otherwise causes\n# performance problems for the file system.\n# The default value is: NO.\n\nCREATE_SUBDIRS         = NO\n\n# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII\n# characters to appear in the names of generated files. If set to NO, non-ASCII\n# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode\n# U+3044.\n# The default value is: NO.\n\nALLOW_UNICODE_NAMES    = NO\n\n# The OUTPUT_LANGUAGE tag is used to specify the language in which all\n# documentation generated by doxygen is written. 
Doxygen will use this\n# information to generate all constant output in the proper language.\n# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,\n# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),\n# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,\n# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),\n# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,\n# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,\n# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,\n# Ukrainian and Vietnamese.\n# The default value is: English.\n\nOUTPUT_LANGUAGE        = English\n\n# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all\n# documentation generated by doxygen is written. Doxygen will use this\n# information to generate all generated output in the proper direction.\n# Possible values are: None, LTR, RTL and Context.\n# The default value is: None.\n\nOUTPUT_TEXT_DIRECTION  = None\n\n# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member\n# descriptions after the members that are listed in the file and class\n# documentation (similar to Javadoc). Set to NO to disable this.\n# The default value is: YES.\n\nBRIEF_MEMBER_DESC      = YES\n\n# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief\n# description of a member or function before the detailed description\n#\n# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the\n# brief descriptions will be completely suppressed.\n# The default value is: YES.\n\nREPEAT_BRIEF           = YES\n\n# This tag implements a quasi-intelligent brief description abbreviator that is\n# used to form the text in various listings. 
Each string in this list, if found\n# as the leading text of the brief description, will be stripped from the text\n# and the result, after processing the whole list, is used as the annotated\n# text. Otherwise, the brief description is used as-is. If left blank, the\n# following values are used ($name is automatically replaced with the name of\n# the entity):The $name class, The $name widget, The $name file, is, provides,\n# specifies, contains, represents, a, an and the.\n\nABBREVIATE_BRIEF       = \"The $name class\" \\\n                         \"The $name widget\" \\\n                         \"The $name file\" \\\n                         is \\\n                         provides \\\n                         specifies \\\n                         contains \\\n                         represents \\\n                         a \\\n                         an \\\n                         the\n\n# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then\n# doxygen will generate a detailed section even if there is only a brief\n# description.\n# The default value is: NO.\n\nALWAYS_DETAILED_SEC    = NO\n\n# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all\n# inherited members of a class in the documentation of that class as if those\n# members were ordinary class members. Constructors, destructors and assignment\n# operators of the base classes will not be shown.\n# The default value is: NO.\n\nINLINE_INHERITED_MEMB  = NO\n\n# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path\n# before files name in the file list and in the header files. If set to NO the\n# shortest path that makes the file name unique will be used\n# The default value is: YES.\n\nFULL_PATH_NAMES        = YES\n\n# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.\n# Stripping is only done if one of the specified strings matches the left-hand\n# part of the path. 
The tag can be used to show relative paths in the file list.\n# If left blank the directory from which doxygen is run is used as the path to\n# strip.\n#\n# Note that you can specify absolute paths here, but also relative paths, which\n# will be relative from the directory where doxygen is started.\n# This tag requires that the tag FULL_PATH_NAMES is set to YES.\n\nSTRIP_FROM_PATH        = $(builddir)\n\n# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the\n# path mentioned in the documentation of a class, which tells the reader which\n# header file to include in order to use a class. If left blank only the name of\n# the header file containing the class definition is used. Otherwise one should\n# specify the list of include paths that are normally passed to the compiler\n# using the -I flag.\n\nSTRIP_FROM_INC_PATH    =\n\n# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but\n# less readable) file names. This can be useful is your file systems doesn't\n# support long names like on DOS, Mac, or CD-ROM.\n# The default value is: NO.\n\nSHORT_NAMES            = YES\n\n# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the\n# first line (until the first dot) of a Javadoc-style comment as the brief\n# description. If set to NO, the Javadoc-style will behave just like regular Qt-\n# style comments (thus requiring an explicit @brief command for a brief\n# description.)\n# The default value is: NO.\n\nJAVADOC_AUTOBRIEF      = NO\n\n# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line\n# such as\n# /***************\n# as being the beginning of a Javadoc-style comment \"banner\". 
If set to NO, the\n# Javadoc-style will behave just like regular comments and it will not be\n# interpreted by doxygen.\n# The default value is: NO.\n\nJAVADOC_BANNER         = NO\n\n# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first\n# line (until the first dot) of a Qt-style comment as the brief description. If\n# set to NO, the Qt-style will behave just like regular Qt-style comments (thus\n# requiring an explicit \\brief command for a brief description.)\n# The default value is: NO.\n\nQT_AUTOBRIEF           = NO\n\n# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a\n# multi-line C++ special comment block (i.e. a block of //! or /// comments) as\n# a brief description. This used to be the default behavior. The new default is\n# to treat a multi-line C++ comment block as a detailed description. Set this\n# tag to YES if you prefer the old behavior instead.\n#\n# Note that setting this tag to YES also means that rational rose comments are\n# not recognized any more.\n# The default value is: NO.\n\nMULTILINE_CPP_IS_BRIEF = NO\n\n# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the\n# documentation from any documented member that it re-implements.\n# The default value is: YES.\n\nINHERIT_DOCS           = YES\n\n# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new\n# page for each member. If set to NO, the documentation of a member will be part\n# of the file/class/namespace that contains it.\n# The default value is: NO.\n\nSEPARATE_MEMBER_PAGES  = NO\n\n# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen\n# uses this value to replace tabs by spaces in code fragments.\n# Minimum value: 1, maximum value: 16, default value: 4.\n\nTAB_SIZE               = 8\n\n# This tag can be used to specify a number of aliases that act as commands in\n# the documentation. 
An alias has the form:\n# name=value\n# For example adding\n# \"sideeffect=@par Side Effects:\\n\"\n# will allow you to put the command \\sideeffect (or @sideeffect) in the\n# documentation, which will result in a user-defined paragraph with heading\n# \"Side Effects:\". You can put \\n's in the value part of an alias to insert\n# newlines (in the resulting output). You can put ^^ in the value part of an\n# alias to insert a newline as if a physical newline was in the original file.\n# When you need a literal { or } or , in the value part of an alias you have to\n# escape them by means of a backslash (\\), this can lead to conflicts with the\n# commands \\{ and \\} for these it is advised to use the version @{ and @} or use\n# a double escape (\\\\{ and \\\\})\n\nALIASES                =\n\n# This tag can be used to specify a number of word-keyword mappings (TCL only).\n# A mapping has the form \"name=value\". For example adding \"class=itcl::class\"\n# will allow you to use the command class in the itcl::class meaning.\n\nTCL_SUBST              =\n\n# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources\n# only. Doxygen will then generate output that is more tailored for C. For\n# instance, some of the names that are used will be different. The list of all\n# members will be omitted, etc.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_FOR_C  = NO\n\n# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or\n# Python sources only. Doxygen will then generate output that is more tailored\n# for that language. For instance, namespaces will be presented as packages,\n# qualified scopes will look different, etc.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_JAVA   = NO\n\n# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran\n# sources. 
Doxygen will then generate output that is tailored for Fortran.\n# The default value is: NO.\n\nOPTIMIZE_FOR_FORTRAN   = NO\n\n# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL\n# sources. Doxygen will then generate output that is tailored for VHDL.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_VHDL   = NO\n\n# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice\n# sources only. Doxygen will then generate output that is more tailored for that\n# language. For instance, namespaces will be presented as modules, types will be\n# separated into more groups, etc.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_SLICE  = NO\n\n# Doxygen selects the parser to use depending on the extension of the files it\n# parses. With this tag you can assign which parser to use for a given\n# extension. Doxygen has a built-in mapping, but you can override or extend it\n# using this tag. The format is ext=language, where ext is a file extension, and\n# language is one of the parsers supported by doxygen: IDL, Java, Javascript,\n# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,\n# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:\n# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser\n# tries to guess whether the code is fixed or free formatted code, this is the\n# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat\n# .inc files as Fortran files (default is PHP), and .f files as C (default is\n# Fortran), use: inc=Fortran f=C.\n#\n# Note: For files without extension you can use no_extension as a placeholder.\n#\n# Note that for custom extensions you also need to set FILE_PATTERNS otherwise\n# the files are not read by doxygen.\n\nEXTENSION_MAPPING      =\n\n# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments\n# according to the Markdown format, which allows for more readable\n# documentation. 
See https://daringfireball.net/projects/markdown/ for details.\n# The output of markdown processing is further processed by doxygen, so you can\n# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in\n# case of backward compatibilities issues.\n# The default value is: YES.\n\nMARKDOWN_SUPPORT       = YES\n\n# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up\n# to that level are automatically included in the table of contents, even if\n# they do not have an id attribute.\n# Note: This feature currently applies only to Markdown headings.\n# Minimum value: 0, maximum value: 99, default value: 5.\n# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.\n\nTOC_INCLUDE_HEADINGS   = 5\n\n# When enabled doxygen tries to link words that correspond to documented\n# classes, or namespaces to their corresponding documentation. Such a link can\n# be prevented in individual cases by putting a % sign in front of the word or\n# globally by setting AUTOLINK_SUPPORT to NO.\n# The default value is: YES.\n\nAUTOLINK_SUPPORT       = YES\n\n# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want\n# to include (a tag file for) the STL sources as input, then you should set this\n# tag to YES in order to let doxygen match functions declarations and\n# definitions whose arguments contain STL classes (e.g. func(std::string);\n# versus func(std::string) {}). This also make the inheritance and collaboration\n# diagrams that involve STL classes more complete and accurate.\n# The default value is: NO.\n\nBUILTIN_STL_SUPPORT    = YES\n\n# If you use Microsoft's C++/CLI language, you should set this option to YES to\n# enable parsing support.\n# The default value is: NO.\n\nCPP_CLI_SUPPORT        = NO\n\n# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:\n# https://www.riverbankcomputing.com/software/sip/intro) sources only. 
Doxygen\n# will parse them like normal C++ but will assume all classes use public instead\n# of private inheritance when no explicit protection keyword is present.\n# The default value is: NO.\n\nSIP_SUPPORT            = NO\n\n# For Microsoft's IDL there are propget and propput attributes to indicate\n# getter and setter methods for a property. Setting this option to YES will make\n# doxygen to replace the get and set methods by a property in the documentation.\n# This will only work if the methods are indeed getting or setting a simple\n# type. If this is not the case, or you want to show the methods anyway, you\n# should set this option to NO.\n# The default value is: YES.\n\nIDL_PROPERTY_SUPPORT   = YES\n\n# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC\n# tag is set to YES then doxygen will reuse the documentation of the first\n# member in the group (if any) for the other members of the group. By default\n# all members of a group must be documented explicitly.\n# The default value is: NO.\n\nDISTRIBUTE_GROUP_DOC   = NO\n\n# If one adds a struct or class to a group and this option is enabled, then also\n# any nested class or struct is added to the same group. By default this option\n# is disabled and one has to add nested compounds explicitly via \\ingroup.\n# The default value is: NO.\n\nGROUP_NESTED_COMPOUNDS = NO\n\n# Set the SUBGROUPING tag to YES to allow class member groups of the same type\n# (for instance a group of public functions) to be put as a subgroup of that\n# type (e.g. under the Public Functions section). Set it to NO to prevent\n# subgrouping. Alternatively, this can be done per class using the\n# \\nosubgrouping command.\n# The default value is: YES.\n\nSUBGROUPING            = YES\n\n# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions\n# are shown inside the group in which they are included (e.g. 
using \\ingroup)\n# instead of on a separate page (for HTML and Man pages) or section (for LaTeX\n# and RTF).\n#\n# Note that this feature does not work in combination with\n# SEPARATE_MEMBER_PAGES.\n# The default value is: NO.\n\nINLINE_GROUPED_CLASSES = NO\n\n# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions\n# with only public data fields or simple typedef fields will be shown inline in\n# the documentation of the scope in which they are defined (i.e. file,\n# namespace, or group documentation), provided this scope is documented. If set\n# to NO, structs, classes, and unions are shown on a separate page (for HTML and\n# Man pages) or section (for LaTeX and RTF).\n# The default value is: NO.\n\nINLINE_SIMPLE_STRUCTS  = NO\n\n# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or\n# enum is documented as struct, union, or enum with the name of the typedef. So\n# typedef struct TypeS {} TypeT, will appear in the documentation as a struct\n# with name TypeT. When disabled the typedef will appear as a member of a file,\n# namespace, or class. And the struct will be named TypeS. This can typically be\n# useful for C code in case the coding convention dictates that all compound\n# types are typedef'ed and only the typedef is referenced, never the tag name.\n# The default value is: NO.\n\nTYPEDEF_HIDES_STRUCT   = NO\n\n# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This\n# cache is used to resolve symbols given their name and scope. Since this can be\n# an expensive process and often the same symbol appears multiple times in the\n# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small\n# doxygen will become slower. If the cache is too large, memory is wasted. The\n# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range\n# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536\n# symbols. 
At the end of a run doxygen will report the cache usage and suggest\n# the optimal cache size from a speed point of view.\n# Minimum value: 0, maximum value: 9, default value: 0.\n\nLOOKUP_CACHE_SIZE      = 0\n\n#---------------------------------------------------------------------------\n# Build related configuration options\n#---------------------------------------------------------------------------\n\n# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in\n# documentation are documented, even if no documentation was available. Private\n# class members and static file members will be hidden unless the\n# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.\n# Note: This will also disable the warnings about undocumented members that are\n# normally produced when WARNINGS is set to YES.\n# The default value is: NO.\n\nEXTRACT_ALL            = YES\n\n# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will\n# be included in the documentation.\n# The default value is: NO.\n\nEXTRACT_PRIVATE        = NO\n\n# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual\n# methods of a class will be included in the documentation.\n# The default value is: NO.\n\nEXTRACT_PRIV_VIRTUAL   = NO\n\n# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal\n# scope will be included in the documentation.\n# The default value is: NO.\n\nEXTRACT_PACKAGE        = NO\n\n# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be\n# included in the documentation.\n# The default value is: NO.\n\nEXTRACT_STATIC         = NO\n\n# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined\n# locally in source files will be included in the documentation. If set to NO,\n# only classes defined in header files are included. 
Does not have any effect\n# for Java sources.\n# The default value is: YES.\n\nEXTRACT_LOCAL_CLASSES  = YES\n\n# This flag is only useful for Objective-C code. If set to YES, local methods,\n# which are defined in the implementation section but not in the interface are\n# included in the documentation. If set to NO, only methods in the interface are\n# included.\n# The default value is: NO.\n\nEXTRACT_LOCAL_METHODS  = NO\n\n# If this flag is set to YES, the members of anonymous namespaces will be\n# extracted and appear in the documentation as a namespace called\n# 'anonymous_namespace{file}', where file will be replaced with the base name of\n# the file that contains the anonymous namespace. By default anonymous namespace\n# are hidden.\n# The default value is: NO.\n\nEXTRACT_ANON_NSPACES   = NO\n\n# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all\n# undocumented members inside documented classes or files. If set to NO these\n# members will be included in the various overviews, but no documentation\n# section is generated. This option has no effect if EXTRACT_ALL is enabled.\n# The default value is: NO.\n\nHIDE_UNDOC_MEMBERS     = NO\n\n# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all\n# undocumented classes that are normally visible in the class hierarchy. If set\n# to NO, these classes will be included in the various overviews. This option\n# has no effect if EXTRACT_ALL is enabled.\n# The default value is: NO.\n\nHIDE_UNDOC_CLASSES     = NO\n\n# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend\n# (class|struct|union) declarations. If set to NO, these declarations will be\n# included in the documentation.\n# The default value is: NO.\n\nHIDE_FRIEND_COMPOUNDS  = NO\n\n# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any\n# documentation blocks found inside the body of a function. 
If set to NO, these\n# blocks will be appended to the function's detailed documentation block.\n# The default value is: NO.\n\nHIDE_IN_BODY_DOCS      = NO\n\n# The INTERNAL_DOCS tag determines if documentation that is typed after a\n# \internal command is included. If the tag is set to NO then the documentation\n# will be excluded. Set it to YES to include the internal documentation.\n# The default value is: NO.\n\nINTERNAL_DOCS          = NO\n\n# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file\n# names in lower-case letters. If set to YES, upper-case letters are also\n# allowed. This is useful if you have classes or files whose names only differ\n# in case and if your file system supports case sensitive file names. Windows\n# (including Cygwin) and Mac users are advised to set this option to NO.\n# The default value is: system dependent.\n\nCASE_SENSE_NAMES       = NO\n\n# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with\n# their full class and namespace scopes in the documentation. If set to YES, the\n# scope will be hidden.\n# The default value is: NO.\n\nHIDE_SCOPE_NAMES       = NO\n\n# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will\n# append additional text to a page's title, such as Class Reference. 
If set to\n# YES the compound reference will be hidden.\n# The default value is: NO.\n\nHIDE_COMPOUND_REFERENCE= NO\n\n# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of\n# the files that are included by a file in the documentation of that file.\n# The default value is: YES.\n\nSHOW_INCLUDE_FILES     = YES\n\n# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each\n# grouped member an include statement to the documentation, telling the reader\n# which file to include in order to use the member.\n# The default value is: NO.\n\nSHOW_GROUPED_MEMB_INC  = NO\n\n# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include\n# files with double quotes in the documentation rather than with sharp brackets.\n# The default value is: NO.\n\nFORCE_LOCAL_INCLUDES   = NO\n\n# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the\n# documentation for inline members.\n# The default value is: YES.\n\nINLINE_INFO            = YES\n\n# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the\n# (detailed) documentation of file and class members alphabetically by member\n# name. If set to NO, the members will appear in declaration order.\n# The default value is: YES.\n\nSORT_MEMBER_DOCS       = YES\n\n# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief\n# descriptions of file, namespace and class members alphabetically by member\n# name. If set to NO, the members will appear in declaration order. Note that\n# this will also influence the order of the classes in the class list.\n# The default value is: NO.\n\nSORT_BRIEF_DOCS        = NO\n\n# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the\n# (brief and detailed) documentation of class members so that constructors and\n# destructors are listed first. 
If set to NO the constructors will appear in the\n# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.\n# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief\n# member documentation.\n# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting\n# detailed member documentation.\n# The default value is: NO.\n\nSORT_MEMBERS_CTORS_1ST = NO\n\n# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy\n# of group names into alphabetical order. If set to NO the group names will\n# appear in their defined order.\n# The default value is: NO.\n\nSORT_GROUP_NAMES       = NO\n\n# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by\n# fully-qualified names, including namespaces. If set to NO, the class list will\n# be sorted only by class name, not including the namespace part.\n# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.\n# Note: This option applies only to the class list, not to the alphabetical\n# list.\n# The default value is: NO.\n\nSORT_BY_SCOPE_NAME     = NO\n\n# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper\n# type resolution of all parameters of a function it will reject a match between\n# the prototype and the implementation of a member function even if there is\n# only one candidate or it is obvious which candidate to choose by doing a\n# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still\n# accept a match between prototype and implementation in such cases.\n# The default value is: NO.\n\nSTRICT_PROTO_MATCHING  = NO\n\n# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo\n# list. This list is created by putting \\todo commands in the documentation.\n# The default value is: YES.\n\nGENERATE_TODOLIST      = YES\n\n# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test\n# list. 
This list is created by putting \\test commands in the documentation.\n# The default value is: YES.\n\nGENERATE_TESTLIST      = YES\n\n# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug\n# list. This list is created by putting \\bug commands in the documentation.\n# The default value is: YES.\n\nGENERATE_BUGLIST       = YES\n\n# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)\n# the deprecated list. This list is created by putting \\deprecated commands in\n# the documentation.\n# The default value is: YES.\n\nGENERATE_DEPRECATEDLIST= YES\n\n# The ENABLED_SECTIONS tag can be used to enable conditional documentation\n# sections, marked by \\if <section_label> ... \\endif and \\cond <section_label>\n# ... \\endcond blocks.\n\nENABLED_SECTIONS       =\n\n# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the\n# initial value of a variable or macro / define can have for it to appear in the\n# documentation. If the initializer consists of more lines than specified here\n# it will be hidden. Use a value of 0 to hide initializers completely. The\n# appearance of the value of individual variables and macros / defines can be\n# controlled using \\showinitializer or \\hideinitializer command in the\n# documentation regardless of this setting.\n# Minimum value: 0, maximum value: 10000, default value: 30.\n\nMAX_INITIALIZER_LINES  = 30\n\n# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at\n# the bottom of the documentation of classes and structs. If set to YES, the\n# list will mention the files that were used to generate the documentation.\n# The default value is: YES.\n\nSHOW_USED_FILES        = YES\n\n# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
This\n# will remove the Files entry from the Quick Index and from the Folder Tree View\n# (if specified).\n# The default value is: YES.\n\nSHOW_FILES             = YES\n\n# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces\n# page. This will remove the Namespaces entry from the Quick Index and from the\n# Folder Tree View (if specified).\n# The default value is: YES.\n\nSHOW_NAMESPACES        = YES\n\n# The FILE_VERSION_FILTER tag can be used to specify a program or script that\n# doxygen should invoke to get the current version for each file (typically from\n# the version control system). Doxygen will invoke the program by executing (via\n# popen()) the command command input-file, where command is the value of the\n# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided\n# by doxygen. Whatever the program writes to standard output is used as the file\n# version. For an example see the documentation.\n\nFILE_VERSION_FILTER    =\n\n# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed\n# by doxygen. The layout file controls the global structure of the generated\n# output files in an output format independent way. To create the layout file\n# that represents doxygen's defaults, run doxygen with the -l option. You can\n# optionally specify a file name after the option, if omitted DoxygenLayout.xml\n# will be used as the name of the layout file.\n#\n# Note that if you run doxygen from a directory containing a file called\n# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE\n# tag is left empty.\n\nLAYOUT_FILE            =\n\n# The CITE_BIB_FILES tag can be used to specify one or more bib files containing\n# the reference definitions. This must be a list of .bib files. The .bib\n# extension is automatically appended if omitted. This requires the bibtex tool\n# to be installed. 
See also https://en.wikipedia.org/wiki/BibTeX for more info.\n# For LaTeX the style of the bibliography can be controlled using\n# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the\n# search path. See also \\cite for info how to create references.\n\nCITE_BIB_FILES         =\n\n#---------------------------------------------------------------------------\n# Configuration options related to warning and progress messages\n#---------------------------------------------------------------------------\n\n# The QUIET tag can be used to turn on/off the messages that are generated to\n# standard output by doxygen. If QUIET is set to YES this implies that the\n# messages are off.\n# The default value is: NO.\n\nQUIET                  = NO\n\n# The WARNINGS tag can be used to turn on/off the warning messages that are\n# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES\n# this implies that the warnings are on.\n#\n# Tip: Turn warnings on while writing the documentation.\n# The default value is: YES.\n\nWARNINGS               = YES\n\n# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate\n# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag\n# will automatically be disabled.\n# The default value is: YES.\n\nWARN_IF_UNDOCUMENTED   = YES\n\n# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for\n# potential errors in the documentation, such as not documenting some parameters\n# in a documented function, or documenting parameters that don't exist or using\n# markup commands wrongly.\n# The default value is: YES.\n\nWARN_IF_DOC_ERROR      = YES\n\n# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that\n# are documented, but have no documentation for their parameters or return\n# value. If set to NO, doxygen will only warn about wrong or incomplete\n# parameter documentation, but not about the absence of documentation. 
If\n# EXTRACT_ALL is set to YES then this flag will automatically be disabled.\n# The default value is: NO.\n\nWARN_NO_PARAMDOC       = YES\n\n# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when\n# a warning is encountered.\n# The default value is: NO.\n\nWARN_AS_ERROR          = NO\n\n# The WARN_FORMAT tag determines the format of the warning messages that doxygen\n# can produce. The string should contain the $file, $line, and $text tags, which\n# will be replaced by the file and line number from which the warning originated\n# and the warning text. Optionally the format may contain $version, which will\n# be replaced by the version of the file (if it could be obtained via\n# FILE_VERSION_FILTER)\n# The default value is: $file:$line: $text.\n\nWARN_FORMAT            = \"$file:$line: $text\"\n\n# The WARN_LOGFILE tag can be used to specify a file to which warning and error\n# messages should be written. If left blank the output is written to standard\n# error (stderr).\n\nWARN_LOGFILE           = $(builddir)/doc/DoxyWarn.log\n\n#---------------------------------------------------------------------------\n# Configuration options related to the input files\n#---------------------------------------------------------------------------\n\n# The INPUT tag is used to specify the files and/or directories that contain\n# documented source files. You may enter file names like myfile.cpp or\n# directories like /usr/src/myproject. Separate the files or directories with\n# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING\n# Note: If this tag is empty the current directory is searched.\n\nINPUT                  = $(srcdir)/include $(srcdir)/src $(srcdir)/unittest\n\n# This tag can be used to specify the character encoding of the source files\n# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses\n# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv\n# documentation (see: https://www.gnu.org/software/libiconv/) for the list of\n# possible encodings.\n# The default value is: UTF-8.\n\nINPUT_ENCODING         = UTF-8\n\n# If the value of the INPUT tag contains directories, you can use the\n# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and\n# *.h) to filter out the source-files in the directories.\n#\n# Note that for custom extensions or not directly supported extensions you also\n# need to set EXTENSION_MAPPING for the extension otherwise the files are not\n# read by doxygen.\n#\n# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,\n# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,\n# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,\n# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,\n# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.\n\nFILE_PATTERNS          = *.c \\\n                         *.cc \\\n                         *.cxx \\\n                         *.cpp \\\n                         *.c++ \\\n                         *.d \\\n                         *.java \\\n                         *.ii \\\n                         *.ixx \\\n                         *.ipp \\\n                         *.i++ \\\n                         *.inl \\\n                         *.h \\\n                         *.hh \\\n                         *.hxx \\\n                         *.hpp \\\n                         *.h++ \\\n                         *.idl \\\n                         *.odl \\\n                         *.cs \\\n                         *.php \\\n                         *.php3 \\\n                         *.inc \\\n                         *.m \\\n                         *.mm \\\n                         *.dox \\\n                         *.py \\\n                         *.f90 \\\n                         *.f \\\n                         *.vhd \\\n     
                    *.vhdl\n\n# The RECURSIVE tag can be used to specify whether or not subdirectories should\n# be searched for input files as well.\n# The default value is: NO.\n\nRECURSIVE              = YES\n\n# The EXCLUDE tag can be used to specify files and/or directories that should be\n# excluded from the INPUT source files. This way you can easily exclude a\n# subdirectory from a directory tree whose root is specified with the INPUT tag.\n#\n# Note that relative paths are relative to the directory from which doxygen is\n# run.\n\nEXCLUDE                =\n\n# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or\n# directories that are symbolic links (a Unix file system feature) are excluded\n# from the input.\n# The default value is: NO.\n\nEXCLUDE_SYMLINKS       = NO\n\n# If the value of the INPUT tag contains directories, you can use the\n# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude\n# certain files from those directories.\n#\n# Note that the wildcards are matched against the file with absolute path, so to\n# exclude all test directories for example use the pattern */test/*\n\nEXCLUDE_PATTERNS       = */.svn/*\n\n# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names\n# (namespaces, classes, functions, etc.) that should be excluded from the\n# output. The symbol name can be a fully qualified name, a word, or if the\n# wildcard * is used, a substring. 
Examples: ANamespace, AClass,\n# AClass::ANamespace, ANamespace::*Test\n#\n# Note that the wildcards are matched against the file with absolute path, so to\n# exclude all test directories use the pattern */test/*\n\nEXCLUDE_SYMBOLS        =\n\n# The EXAMPLE_PATH tag can be used to specify one or more files or directories\n# that contain example code fragments that are included (see the \\include\n# command).\n\nEXAMPLE_PATH           =\n\n# If the value of the EXAMPLE_PATH tag contains directories, you can use the\n# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and\n# *.h) to filter out the source-files in the directories. If left blank all\n# files are included.\n\nEXAMPLE_PATTERNS       = *\n\n# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be\n# searched for input files to be used with the \\include or \\dontinclude commands\n# irrespective of the value of the RECURSIVE tag.\n# The default value is: NO.\n\nEXAMPLE_RECURSIVE      = NO\n\n# The IMAGE_PATH tag can be used to specify one or more files or directories\n# that contain images that are to be included in the documentation (see the\n# \\image command).\n\nIMAGE_PATH             =\n\n# The INPUT_FILTER tag can be used to specify a program that doxygen should\n# invoke to filter for each input file. Doxygen will invoke the filter program\n# by executing (via popen()) the command:\n#\n# <filter> <input-file>\n#\n# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the\n# name of an input file. Doxygen will then use the output that the filter\n# program writes to standard output. If FILTER_PATTERNS is specified, this tag\n# will be ignored.\n#\n# Note that the filter must not add or remove lines; it is applied before the\n# code is scanned, but not when the output code is generated. 
If lines are added\n# or removed, the anchors will not be placed correctly.\n#\n# Note that for custom extensions or not directly supported extensions you also\n# need to set EXTENSION_MAPPING for the extension otherwise the files are not\n# properly processed by doxygen.\n\nINPUT_FILTER           =\n\n# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern\n# basis. Doxygen will compare the file name with each pattern and apply the\n# filter if there is a match. The filters are a list of the form: pattern=filter\n# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how\n# filters are used. If the FILTER_PATTERNS tag is empty or if none of the\n# patterns match the file name, INPUT_FILTER is applied.\n#\n# Note that for custom extensions or not directly supported extensions you also\n# need to set EXTENSION_MAPPING for the extension otherwise the files are not\n# properly processed by doxygen.\n\nFILTER_PATTERNS        =\n\n# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using\n# INPUT_FILTER) will also be used to filter the input files that are used for\n# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).\n# The default value is: NO.\n\nFILTER_SOURCE_FILES    = NO\n\n# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file\n# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and\n# it is also possible to disable source filtering for a specific pattern using\n# *.ext= (so without naming a filter).\n# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.\n\nFILTER_SOURCE_PATTERNS =\n\n# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that\n# is part of the input, its contents will be placed on the main page\n# (index.html). 
This can be useful if you have a project on for instance GitHub\n# and want to reuse the introduction page also for the doxygen output.\n\nUSE_MDFILE_AS_MAINPAGE =\n\n#---------------------------------------------------------------------------\n# Configuration options related to source browsing\n#---------------------------------------------------------------------------\n\n# If the SOURCE_BROWSER tag is set to YES then a list of source files will be\n# generated. Documented entities will be cross-referenced with these sources.\n#\n# Note: To get rid of all source code in the generated output, make sure that\n# also VERBATIM_HEADERS is set to NO.\n# The default value is: NO.\n\nSOURCE_BROWSER         = YES\n\n# Setting the INLINE_SOURCES tag to YES will include the body of functions,\n# classes and enums directly into the documentation.\n# The default value is: NO.\n\nINLINE_SOURCES         = YES\n\n# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any\n# special comment blocks from generated source code fragments. Normal C, C++ and\n# Fortran comments will always remain visible.\n# The default value is: YES.\n\nSTRIP_CODE_COMMENTS    = YES\n\n# If the REFERENCED_BY_RELATION tag is set to YES then for each documented\n# entity all documented functions referencing it will be listed.\n# The default value is: NO.\n\nREFERENCED_BY_RELATION = NO\n\n# If the REFERENCES_RELATION tag is set to YES then for each documented function\n# all documented entities called/used by that function will be listed.\n# The default value is: NO.\n\nREFERENCES_RELATION    = NO\n\n# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set\n# to YES then the hyperlinks from functions in REFERENCES_RELATION and\n# REFERENCED_BY_RELATION lists will link to the source code. 
Otherwise they will\n# link to the documentation.\n# The default value is: YES.\n\nREFERENCES_LINK_SOURCE = YES\n\n# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the\n# source code will show a tooltip with additional information such as prototype,\n# brief description and links to the definition and documentation. Since this\n# will make the HTML file larger and loading of large files a bit slower, you\n# can opt to disable this feature.\n# The default value is: YES.\n# This tag requires that the tag SOURCE_BROWSER is set to YES.\n\nSOURCE_TOOLTIPS        = YES\n\n# If the USE_HTAGS tag is set to YES then the references to source code will\n# point to the HTML generated by the htags(1) tool instead of doxygen built-in\n# source browser. The htags tool is part of GNU's global source tagging system\n# (see https://www.gnu.org/software/global/global.html). You will need version\n# 4.8.6 or higher.\n#\n# To use it do the following:\n# - Install the latest version of global\n# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file\n# - Make sure the INPUT points to the root of the source tree\n# - Run doxygen as normal\n#\n# Doxygen will invoke htags (and that will in turn invoke gtags), so these\n# tools must be available from the command line (i.e. in the search path).\n#\n# The result: instead of the source browser generated by doxygen, the links to\n# source code will now point to the output of htags.\n# The default value is: NO.\n# This tag requires that the tag SOURCE_BROWSER is set to YES.\n\nUSE_HTAGS              = NO\n\n# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a\n# verbatim copy of the header file for each class for which an include is\n# specified. 
Set to NO to disable this.\n# See also: Section \\class.\n# The default value is: YES.\n\nVERBATIM_HEADERS       = YES\n\n# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the\n# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the\n# cost of reduced performance. This can be particularly helpful with template\n# rich C++ code for which doxygen's built-in parser lacks the necessary type\n# information.\n# Note: The availability of this option depends on whether or not doxygen was\n# generated with the -Duse_libclang=ON option for CMake.\n# The default value is: NO.\n\nCLANG_ASSISTED_PARSING = NO\n\n# If clang assisted parsing is enabled you can provide the compiler with command\n# line options that you would normally use when invoking the compiler. Note that\n# the include paths will already be set by doxygen for the files and directories\n# specified with INPUT and INCLUDE_PATH.\n# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.\n\nCLANG_OPTIONS          =\n\n# If clang assisted parsing is enabled you can provide the clang parser with the\n# path to the compilation database (see:\n# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files\n# were built. This is equivalent to specifying the \"-p\" option to a clang tool,\n# such as clang-check. These options will then be passed to the parser.\n# Note: The availability of this option depends on whether or not doxygen was\n# generated with the -Duse_libclang=ON option for CMake.\n\nCLANG_DATABASE_PATH    =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the alphabetical class index\n#---------------------------------------------------------------------------\n\n# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all\n# compounds will be generated. 
Enable this if the project contains a lot of\n# classes, structs, unions or interfaces.\n# The default value is: YES.\n\nALPHABETICAL_INDEX     = NO\n\n# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in\n# which the alphabetical index list will be split.\n# Minimum value: 1, maximum value: 20, default value: 5.\n# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.\n\nCOLS_IN_ALPHA_INDEX    = 5\n\n# In case all classes in a project start with a common prefix, all classes will\n# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag\n# can be used to specify a prefix (or a list of prefixes) that should be ignored\n# while generating the index headers.\n# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.\n\nIGNORE_PREFIX          =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the HTML output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output\n# The default value is: YES.\n\nGENERATE_HTML          = YES\n\n# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: html.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_OUTPUT            = html\n\n# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each\n# generated HTML page (for example: .htm, .php, .asp).\n# The default value is: .html.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_FILE_EXTENSION    = .html\n\n# The HTML_HEADER tag can be used to specify a user-defined HTML header file for\n# each generated HTML page. 
If the tag is left blank doxygen will generate a\n# standard header.\n#\n# To get valid HTML the header file that includes any scripts and style sheets\n# that doxygen needs, which is dependent on the configuration options used (e.g.\n# the setting GENERATE_TREEVIEW). It is highly recommended to start with a\n# default header using\n# doxygen -w html new_header.html new_footer.html new_stylesheet.css\n# YourConfigFile\n# and then modify the file new_header.html. See also section \"Doxygen usage\"\n# for information on how to generate the default header that doxygen normally\n# uses.\n# Note: The header is subject to change so you typically have to regenerate the\n# default header when upgrading to a newer version of doxygen. For a description\n# of the possible markers and block names see the documentation.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_HEADER            =\n\n# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each\n# generated HTML page. If the tag is left blank doxygen will generate a standard\n# footer. See HTML_HEADER for more information on how to generate a default\n# footer and what special commands can be used inside the footer. See also\n# section \"Doxygen usage\" for information on how to generate the default footer\n# that doxygen normally uses.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_FOOTER            =\n\n# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style\n# sheet that is used by each HTML page. It can be used to fine-tune the look of\n# the HTML output. 
If left blank doxygen will generate a default style sheet.\n# See also section \"Doxygen usage\" for information on how to generate the style\n# sheet that doxygen normally uses.\n# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as\n# it is more robust and this tag (HTML_STYLESHEET) will in the future become\n# obsolete.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_STYLESHEET        =\n\n# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined\n# cascading style sheets that are included after the standard style sheets\n# created by doxygen. Using this option one can overrule certain style aspects.\n# This is preferred over using HTML_STYLESHEET since it does not replace the\n# standard style sheet and is therefore more robust against future updates.\n# Doxygen will copy the style sheet files to the output directory.\n# Note: The order of the extra style sheet files is of importance (e.g. the last\n# style sheet in the list overrules the setting of the previous ones in the\n# list). For an example see the documentation.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_EXTRA_STYLESHEET  =\n\n# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or\n# other source files which should be copied to the HTML output directory. Note\n# that these files will be copied to the base HTML output directory. Use the\n# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these\n# files. In the HTML_STYLESHEET file, use the file name only. Also note that the\n# files will be copied as-is; there are no commands or markers available.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_EXTRA_FILES       =\n\n# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen\n# will adjust the colors in the style sheet and background images according to\n# this color. 
Hue is specified as an angle on a colorwheel, see\n# https://en.wikipedia.org/wiki/Hue for more information. For instance the value\n# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300\n# purple, and 360 is red again.\n# Minimum value: 0, maximum value: 359, default value: 220.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_COLORSTYLE_HUE    = 220\n\n# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors\n# in the HTML output. For a value of 0 the output will use grayscales only. A\n# value of 255 will produce the most vivid colors.\n# Minimum value: 0, maximum value: 255, default value: 100.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_COLORSTYLE_SAT    = 100\n\n# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the\n# luminance component of the colors in the HTML output. Values below 100\n# gradually make the output lighter, whereas values above 100 make the output\n# darker. The value divided by 100 is the actual gamma applied, so 80 represents\n# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not\n# change the gamma.\n# Minimum value: 40, maximum value: 240, default value: 80.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_COLORSTYLE_GAMMA  = 80\n\n# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML\n# page will contain the date and time when the page was generated. Setting this\n# to YES can help to show when doxygen was last run and thus if the\n# documentation is up to date.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_TIMESTAMP         = YES\n\n# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML\n# documentation will contain a main index with vertical navigation menus that\n# are dynamically created via Javascript. 
If disabled, the navigation index will\n# consist of multiple levels of tabs that are statically embedded in every HTML\n# page. Disable this option to support browsers that do not have Javascript,\n# like the Qt help browser.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_DYNAMIC_MENUS     = YES\n\n# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML\n# documentation will contain sections that can be hidden and shown after the\n# page has loaded.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_DYNAMIC_SECTIONS  = NO\n\n# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries\n# shown in the various tree structured indices initially; the user can expand\n# and collapse entries dynamically later on. Doxygen will expand the tree to\n# such a level that at most the specified number of entries are visible (unless\n# a fully collapsed tree already exceeds this amount). So setting the number of\n# entries 1 will produce a full collapsed tree by default. 0 is a special value\n# representing an infinite number of entries and will result in a full expanded\n# tree by default.\n# Minimum value: 0, maximum value: 9999, default value: 100.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_INDEX_NUM_ENTRIES = 100\n\n# If the GENERATE_DOCSET tag is set to YES, additional index files will be\n# generated that can be used as input for Apple's Xcode 3 integrated development\n# environment (see: https://developer.apple.com/xcode/), introduced with OSX\n# 10.5 (Leopard). To create a documentation set, doxygen will generate a\n# Makefile in the HTML output directory. Running make will produce the docset in\n# that directory and running make install will install the docset in\n# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at\n# startup. 
See https://developer.apple.com/library/archive/featuredarticles/Doxy\n# genXcode/_index.html for more information.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_DOCSET        = NO\n\n# This tag determines the name of the docset feed. A documentation feed provides\n# an umbrella under which multiple documentation sets from a single provider\n# (such as a company or product suite) can be grouped.\n# The default value is: Doxygen generated docs.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_FEEDNAME        = \"Doxygen generated docs\"\n\n# This tag specifies a string that should uniquely identify the documentation\n# set bundle. This should be a reverse domain-name style string, e.g.\n# com.mycompany.MyDocSet. Doxygen will append .docset to the name.\n# The default value is: org.doxygen.Project.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_BUNDLE_ID       = org.doxygen.Project\n\n# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify\n# the documentation publisher. This should be a reverse domain-name style\n# string, e.g. com.mycompany.MyDocSet.documentation.\n# The default value is: org.doxygen.Publisher.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_PUBLISHER_ID    = org.doxygen.Publisher\n\n# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.\n# The default value is: Publisher.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_PUBLISHER_NAME  = Publisher\n\n# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three\n# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The\n# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop\n# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on\n# Windows.\n#\n# The HTML Help Workshop contains a compiler that can convert all HTML output\n# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML\n# files are now used as the Windows 98 help format, and will replace the old\n# Windows help format (.hlp) on all Windows platforms in the future. Compressed\n# HTML files also contain an index, a table of contents, and you can search for\n# words in the documentation. The HTML workshop also contains a viewer for\n# compressed HTML files.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_HTMLHELP      = NO\n\n# The CHM_FILE tag can be used to specify the file name of the resulting .chm\n# file. You can add a path in front of the file if the result should not be\n# written to the html output directory.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nCHM_FILE               =\n\n# The HHC_LOCATION tag can be used to specify the location (absolute path\n# including file name) of the HTML help compiler (hhc.exe). 
If non-empty,\n# doxygen will try to run the HTML help compiler on the generated index.hhp.\n# The file has to be specified with full path.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nHHC_LOCATION           =\n\n# The GENERATE_CHI flag controls if a separate .chi index file is generated\n# (YES) or that it should be included in the master .chm file (NO).\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nGENERATE_CHI           = NO\n\n# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)\n# and project file content.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nCHM_INDEX_ENCODING     =\n\n# The BINARY_TOC flag controls whether a binary table of contents is generated\n# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it\n# enables the Previous and Next buttons.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nBINARY_TOC             = NO\n\n# The TOC_EXPAND flag can be set to YES to add extra items for group members to\n# the table of contents of the HTML help documentation and to the tree view.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nTOC_EXPAND             = NO\n\n# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and\n# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that\n# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help\n# (.qch) of the generated HTML documentation.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_QHP           = NO\n\n# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify\n# the file name of the resulting .qch file. 
The path specified is relative to\n# the HTML output folder.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQCH_FILE               =\n\n# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help\n# Project output. For more information please see Qt Help Project / Namespace\n# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).\n# The default value is: org.doxygen.Project.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_NAMESPACE          = org.doxygen.Project\n\n# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt\n# Help Project output. For more information please see Qt Help Project / Virtual\n# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-\n# folders).\n# The default value is: doc.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_VIRTUAL_FOLDER     = doc\n\n# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom\n# filter to add. For more information please see Qt Help Project / Custom\n# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-\n# filters).\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_CUST_FILTER_NAME   =\n\n# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the\n# custom filter to add. For more information please see Qt Help Project / Custom\n# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-\n# filters).\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_CUST_FILTER_ATTRS  =\n\n# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this\n# project's filter section matches. 
Qt Help Project / Filter Attributes (see:\n# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_SECT_FILTER_ATTRS  =\n\n# The QHG_LOCATION tag can be used to specify the location of Qt's\n# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the\n# generated .qhp file.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHG_LOCATION           =\n\n# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be\n# generated, together with the HTML files, they form an Eclipse help plugin. To\n# install this plugin and make it available under the help contents menu in\n# Eclipse, the contents of the directory containing the HTML and XML files needs\n# to be copied into the plugins directory of eclipse. The name of the directory\n# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.\n# After copying Eclipse needs to be restarted before the help appears.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_ECLIPSEHELP   = NO\n\n# A unique identifier for the Eclipse help plugin. When installing the plugin\n# the directory name containing the HTML and XML files should also have this\n# name. Each documentation set should have its own identifier.\n# The default value is: org.doxygen.Project.\n# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.\n\nECLIPSE_DOC_ID         = org.doxygen.Project\n\n# If you want full control over the layout of the generated HTML pages it might\n# be necessary to disable the index and replace it with your own. The\n# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top\n# of each HTML page. A value of NO enables the index and the value YES disables\n# it. 
Since the tabs in the index contain the same information as the navigation\n# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nDISABLE_INDEX          = NO\n\n# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index\n# structure should be generated to display hierarchical information. If the tag\n# value is set to YES, a side panel will be generated containing a tree-like\n# index structure (just like the one that is generated for HTML Help). For this\n# to work a browser that supports JavaScript, DHTML, CSS and frames is required\n# (i.e. any modern browser). Windows users are probably better off using the\n# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can\n# further fine-tune the look of the index. As an example, the default style\n# sheet generated by doxygen has an example that shows how to put an image at\n# the root of the tree instead of the PROJECT_NAME. 
Since the tree basically has\n# the same information as the tab index, you could consider setting\n# DISABLE_INDEX to YES when enabling this option.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_TREEVIEW      = YES\n\n# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that\n# doxygen will group on one line in the generated HTML documentation.\n#\n# Note that a value of 0 will completely suppress the enum values from appearing\n# in the overview section.\n# Minimum value: 0, maximum value: 20, default value: 4.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nENUM_VALUES_PER_LINE   = 4\n\n# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used\n# to set the initial width (in pixels) of the frame in which the tree is shown.\n# Minimum value: 0, maximum value: 1500, default value: 250.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nTREEVIEW_WIDTH         = 250\n\n# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to\n# external symbols imported via tag files in a separate window.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nEXT_LINKS_IN_WINDOW    = NO\n\n# Use this tag to change the font size of LaTeX formulas included as images in\n# the HTML documentation. When you change the font size after a successful\n# doxygen run you need to manually remove any form_*.png images from the HTML\n# output directory to force them to be regenerated.\n# Minimum value: 8, maximum value: 50, default value: 10.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nFORMULA_FONTSIZE       = 10\n\n# Use the FORMULA_TRANSPARENT tag to determine whether or not the images\n# generated for formulas are transparent PNGs. 
Transparent PNGs are not\n# supported properly for IE 6.0, but are supported on all modern browsers.\n#\n# Note that when changing this option you need to delete any form_*.png files in\n# the HTML output directory before the changes have effect.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nFORMULA_TRANSPARENT    = YES\n\n# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see\n# https://www.mathjax.org) which uses client side Javascript for the rendering\n# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX\n# installed or if you want the formulas to look prettier in the HTML output.\n# When enabled you may also need to install MathJax separately and configure the\n# path to it using the MATHJAX_RELPATH option.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nUSE_MATHJAX            = NO\n\n# When MathJax is enabled you can set the default output format to be used for\n# the MathJax output. See the MathJax site (see:\n# http://docs.mathjax.org/en/latest/output.html) for more details.\n# Possible values are: HTML-CSS (which is slower, but has the best\n# compatibility), NativeMML (i.e. MathML) and SVG.\n# The default value is: HTML-CSS.\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_FORMAT         = HTML-CSS\n\n# When MathJax is enabled you need to specify the location relative to the HTML\n# output directory using the MATHJAX_RELPATH option. The destination directory\n# should contain the MathJax.js script. For instance, if the mathjax directory\n# is located at the same level as the HTML output directory, then\n# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax\n# Content Delivery Network so you can quickly see the result without installing\n# MathJax. 
However, it is strongly recommended to install a local copy of\n# MathJax from https://www.mathjax.org before deployment.\n# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_RELPATH        = http://www.mathjax.org/mathjax\n\n# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax\n# extension names that should be enabled during MathJax rendering. For example\n# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_EXTENSIONS     =\n\n# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces\n# of code that will be used on startup of the MathJax code. See the MathJax site\n# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an\n# example see the documentation.\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_CODEFILE       =\n\n# When the SEARCHENGINE tag is enabled doxygen will generate a search box for\n# the HTML output. The underlying search engine uses javascript and DHTML and\n# should work on any modern browser. Note that when using HTML help\n# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)\n# there is already a search function so this one should typically be disabled.\n# For large projects the javascript based search engine can be slow, then\n# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to\n# search using the keyboard; to jump to the search box use <access key> + S\n# (what the <access key> is depends on the OS and browser, but it is typically\n# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down\n# key> to jump into the search results window, the results can be navigated\n# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel\n# the search. 
The filter options can be selected when the cursor is inside the\n# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>\n# to select a filter and <Enter> or <escape> to activate or cancel the filter\n# option.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nSEARCHENGINE           = YES\n\n# When the SERVER_BASED_SEARCH tag is enabled the search engine will be\n# implemented using a web server instead of a web client using Javascript. There\n# are two flavors of web server based searching depending on the EXTERNAL_SEARCH\n# setting. When disabled, doxygen will generate a PHP script for searching and\n# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing\n# and searching needs to be provided by external tools. See the section\n# \"External Indexing and Searching\" for details.\n# The default value is: NO.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nSERVER_BASED_SEARCH    = NO\n\n# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP\n# script for searching. Instead the search results are written to an XML file\n# which needs to be processed by an external indexer. 
Doxygen will invoke an\n# external search engine pointed to by the SEARCHENGINE_URL option to obtain the\n# search results.\n#\n# Doxygen ships with an example indexer (doxyindexer) and search engine\n# (doxysearch.cgi) which are based on the open source search engine library\n# Xapian (see: https://xapian.org/).\n#\n# See the section \"External Indexing and Searching\" for details.\n# The default value is: NO.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nEXTERNAL_SEARCH        = NO\n\n# The SEARCHENGINE_URL should point to a search engine hosted by a web server\n# which will return the search results when EXTERNAL_SEARCH is enabled.\n#\n# Doxygen ships with an example indexer (doxyindexer) and search engine\n# (doxysearch.cgi) which are based on the open source search engine library\n# Xapian (see: https://xapian.org/). See the section \"External Indexing and\n# Searching\" for details.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nSEARCHENGINE_URL       =\n\n# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed\n# search data is written to a file for indexing by an external tool. With the\n# SEARCHDATA_FILE tag the name of this file can be specified.\n# The default file is: searchdata.xml.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nSEARCHDATA_FILE        = searchdata.xml\n\n# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the\n# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is\n# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple\n# projects and redirect the results back to the right project.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nEXTERNAL_SEARCH_ID     =\n\n# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen\n# projects other than the one defined by this configuration file, but that are\n# all added to the same external search index. 
Each project needs to have a\n# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id\n# to a relative location where the documentation can be found. The format is:\n# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nEXTRA_SEARCH_MAPPINGS  =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the LaTeX output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.\n# The default value is: YES.\n\nGENERATE_LATEX         = NO\n\n# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: latex.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_OUTPUT           = latex\n\n# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be\n# invoked.\n#\n# Note that when not enabling USE_PDFLATEX the default is latex when enabling\n# USE_PDFLATEX the default is pdflatex and when in the latter case latex is\n# chosen this is overwritten by pdflatex. 
For specific output languages the\n# default can have been set differently; this depends on the implementation of\n# the output language.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_CMD_NAME         = latex\n\n# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate\n# index for LaTeX.\n# Note: This tag is used in the Makefile / make.bat.\n# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file\n# (.tex).\n# The default file is: makeindex.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nMAKEINDEX_CMD_NAME     = makeindex\n\n# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to\n# generate index for LaTeX. In case there is no backslash (\\) as first character\n# it will be automatically added in the LaTeX code.\n# Note: This tag is used in the generated output file (.tex).\n# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.\n# The default value is: makeindex.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_MAKEINDEX_CMD    = makeindex\n\n# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX\n# documents. This may be useful for small projects and may help to save some\n# trees in general.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nCOMPACT_LATEX          = NO\n\n# The PAPER_TYPE tag can be used to set the paper type that is used by the\n# printer.\n# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x\n# 14 inches) and executive (7.25 x 10.5 inches).\n# The default value is: a4.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nPAPER_TYPE             = a4\n\n# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names\n# that should be included in the LaTeX output. 
The package can be specified just\n# by its name or with the correct syntax as to be used with the LaTeX\n# \\usepackage command. To get the times font for instance you can specify :\n# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}\n# To use the option intlimits with the amsmath package you can specify:\n# EXTRA_PACKAGES=[intlimits]{amsmath}\n# If left blank no extra packages will be included.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nEXTRA_PACKAGES         =\n\n# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the\n# generated LaTeX document. The header should contain everything until the first\n# chapter. If it is left blank doxygen will generate a standard header. See\n# section \"Doxygen usage\" for information on how to let doxygen write the\n# default header to a separate file.\n#\n# Note: Only use a user-defined header if you know what you are doing! The\n# following commands have a special meaning inside the header: $title,\n# $datetime, $date, $doxygenversion, $projectname, $projectnumber,\n# $projectbrief, $projectlogo. Doxygen will replace $title with the empty\n# string, for the replacement values of the other commands the user is referred\n# to HTML_HEADER.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_HEADER           =\n\n# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the\n# generated LaTeX document. The footer should contain everything after the last\n# chapter. If it is left blank doxygen will generate a standard footer. 
See\n# LATEX_HEADER for more information on how to generate a default footer and what\n# special commands can be used inside the footer.\n#\n# Note: Only use a user-defined footer if you know what you are doing!\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_FOOTER           =\n\n# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined\n# LaTeX style sheets that are included after the standard style sheets created\n# by doxygen. Using this option one can overrule certain style aspects. Doxygen\n# will copy the style sheet files to the output directory.\n# Note: The order of the extra style sheet files is of importance (e.g. the last\n# style sheet in the list overrules the setting of the previous ones in the\n# list).\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_EXTRA_STYLESHEET =\n\n# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or\n# other source files which should be copied to the LATEX_OUTPUT output\n# directory. Note that the files will be copied as-is; there are no commands or\n# markers available.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_EXTRA_FILES      =\n\n# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is\n# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will\n# contain links (just like the HTML output) instead of page references. This\n# makes the output suitable for online browsing using a PDF viewer.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nPDF_HYPERLINKS         = YES\n\n# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate\n# the PDF file directly from the LaTeX files. 
Set this option to YES, to get a\n# higher quality PDF documentation.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nUSE_PDFLATEX           = YES\n\n# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode\n# command to the generated LaTeX files. This will instruct LaTeX to keep running\n# if errors occur, instead of asking the user for help. This option is also used\n# when generating formulas in HTML.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_BATCHMODE        = NO\n\n# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the\n# index chapters (such as File Index, Compound Index, etc.) in the output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_HIDE_INDICES     = NO\n\n# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source\n# code with syntax highlighting in the LaTeX output.\n#\n# Note that which sources are shown also depends on other settings such as\n# SOURCE_BROWSER.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_SOURCE_CODE      = NO\n\n# The LATEX_BIB_STYLE tag can be used to specify the style to use for the\n# bibliography, e.g. plainnat, or ieeetr. See\n# https://en.wikipedia.org/wiki/BibTeX and \\cite for more info.\n# The default value is: plain.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_BIB_STYLE        = plain\n\n# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated\n# page will contain the date and time when the page was generated. 
Setting this\n# to NO can help when comparing the output of multiple runs.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_TIMESTAMP        = NO\n\n# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)\n# path from which the emoji images will be read. If a relative path is entered,\n# it will be relative to the LATEX_OUTPUT directory. If left blank the\n# LATEX_OUTPUT directory will be used.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_EMOJI_DIRECTORY  =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the RTF output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The\n# RTF output is optimized for Word 97 and may not look too pretty with other RTF\n# readers/editors.\n# The default value is: NO.\n\nGENERATE_RTF           = NO\n\n# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: rtf.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_OUTPUT             = rtf\n\n# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF\n# documents. This may be useful for small projects and may help to save some\n# trees in general.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nCOMPACT_RTF            = NO\n\n# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will\n# contain hyperlink fields. The RTF file will contain links (just like the HTML\n# output) instead of page references. 
This makes the output suitable for online\n# browsing using Word or some other Word compatible readers that support those\n# fields.\n#\n# Note: WordPad (write) and others do not support links.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_HYPERLINKS         = NO\n\n# Load stylesheet definitions from file. Syntax is similar to doxygen's\n# configuration file, i.e. a series of assignments. You only have to provide\n# replacements, missing definitions are set to their default value.\n#\n# See also section \"Doxygen usage\" for information on how to generate the\n# default style sheet that doxygen normally uses.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_STYLESHEET_FILE    =\n\n# Set optional variables used in the generation of an RTF document. Syntax is\n# similar to doxygen's configuration file. A template extensions file can be\n# generated using doxygen -e rtf extensionFile.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_EXTENSIONS_FILE    =\n\n# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code\n# with syntax highlighting in the RTF output.\n#\n# Note that which sources are shown also depends on other settings such as\n# SOURCE_BROWSER.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_SOURCE_CODE        = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the man page output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for\n# classes and files.\n# The default value is: NO.\n\nGENERATE_MAN           = NO\n\n# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it. 
A directory man3 will be created inside the directory specified by\n# MAN_OUTPUT.\n# The default directory is: man.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_OUTPUT             = man\n\n# The MAN_EXTENSION tag determines the extension that is added to the generated\n# man pages. In case the manual section does not start with a number, the number\n# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is\n# optional.\n# The default value is: .3.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_EXTENSION          = .3\n\n# The MAN_SUBDIR tag determines the name of the directory created within\n# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by\n# MAN_EXTENSION with the initial . removed.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_SUBDIR             =\n\n# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it\n# will generate one additional man file for each entity documented in the real\n# man page(s). These additional files only source the real man page, but without\n# them the man command would be unable to find the correct page.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_LINKS              = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the XML output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that\n# captures the structure of the code including all documentation.\n# The default value is: NO.\n\nGENERATE_XML           = NO\n\n# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: xml.\n# This tag requires that the tag GENERATE_XML is set to YES.\n\nXML_OUTPUT             = xml\n\n# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program\n# listings (including syntax highlighting and cross-referencing information) to\n# the XML output. Note that enabling this will significantly increase the size\n# of the XML output.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_XML is set to YES.\n\nXML_PROGRAMLISTING     = YES\n\n# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include\n# namespace members in file scope as well, matching the HTML output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_XML is set to YES.\n\nXML_NS_MEMB_FILE_SCOPE = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the DOCBOOK output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files\n# that can be used to generate PDF.\n# The default value is: NO.\n\nGENERATE_DOCBOOK       = NO\n\n# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.\n# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in\n# front of it.\n# The default directory is: docbook.\n# This tag requires that the tag GENERATE_DOCBOOK is set to YES.\n\nDOCBOOK_OUTPUT         = docbook\n\n# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the\n# program listings (including syntax highlighting and cross-referencing\n# information) to the DOCBOOK output. 
Note that enabling this will significantly\n# increase the size of the DOCBOOK output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_DOCBOOK is set to YES.\n\nDOCBOOK_PROGRAMLISTING = NO\n\n#---------------------------------------------------------------------------\n# Configuration options for the AutoGen Definitions output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an\n# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures\n# the structure of the code including all documentation. Note that this feature\n# is still experimental and incomplete at the moment.\n# The default value is: NO.\n\nGENERATE_AUTOGEN_DEF   = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the Perl module output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module\n# file that captures the structure of the code including all documentation.\n#\n# Note that this feature is still experimental and incomplete at the moment.\n# The default value is: NO.\n\nGENERATE_PERLMOD       = NO\n\n# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary\n# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI\n# output from the Perl module output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_PERLMOD is set to YES.\n\nPERLMOD_LATEX          = NO\n\n# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely\n# formatted so it can be parsed by a human reader. This is useful if you want to\n# understand what is going on. 
On the other hand, if this tag is set to NO, the\n# size of the Perl module output will be much smaller and Perl will parse it\n# just the same.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_PERLMOD is set to YES.\n\nPERLMOD_PRETTY         = YES\n\n# The names of the make variables in the generated doxyrules.make file are\n# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful\n# so different doxyrules.make files included by the same Makefile don't\n# overwrite each other's variables.\n# This tag requires that the tag GENERATE_PERLMOD is set to YES.\n\nPERLMOD_MAKEVAR_PREFIX =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the preprocessor\n#---------------------------------------------------------------------------\n\n# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all\n# C-preprocessor directives found in the sources and include files.\n# The default value is: YES.\n\nENABLE_PREPROCESSING   = YES\n\n# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names\n# in the source code. If set to NO, only conditional compilation will be\n# performed. 
Macro expansion can be done in a controlled way by setting\n# EXPAND_ONLY_PREDEF to YES.\n# The default value is: NO.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nMACRO_EXPANSION        = YES\n\n# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then\n# the macro expansion is limited to the macros specified with the PREDEFINED and\n# EXPAND_AS_DEFINED tags.\n# The default value is: NO.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nEXPAND_ONLY_PREDEF     = YES\n\n# If the SEARCH_INCLUDES tag is set to YES, the include files in the\n# INCLUDE_PATH will be searched if a #include is found.\n# The default value is: YES.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nSEARCH_INCLUDES        = YES\n\n# The INCLUDE_PATH tag can be used to specify one or more directories that\n# contain include files that are not input files but should be processed by the\n# preprocessor.\n# This tag requires that the tag SEARCH_INCLUDES is set to YES.\n\nINCLUDE_PATH           =\n\n# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard\n# patterns (like *.h and *.hpp) to filter out the header-files in the\n# directories. If left blank, the patterns specified with FILE_PATTERNS will be\n# used.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nINCLUDE_FILE_PATTERNS  =\n\n# The PREDEFINED tag can be used to specify one or more macro names that are\n# defined before the preprocessor is started (similar to the -D option of e.g.\n# gcc). The argument of the tag is a list of macros of the form: name or\n# name=definition (no spaces). If the definition and the \"=\" are omitted, \"=1\"\n# is assumed. 
To prevent a macro definition from being undefined via #undef or\n# recursively expanded use the := operator instead of the = operator.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nPREDEFINED             = \"BOOL_VAR_H(var,val,doc)=bool var = val; /**< doc */\" \\\n                         \"INT_VAR_H(var,val,doc)=int var = val; /**< doc */\" \\\n                         \"double_VAR_H(var,val,doc)=double var = val; /**< doc */\" \\\n                         \"STRING_VAR_H(var,val,doc)=char* var = val; /**< doc */\" \\\n                         \"BOOL_VAR(var,val,doc)=bool var = val; /**< doc */\" \\\n                         \"double_VAR(var,val,doc)=double var = val; /**< doc */\" \\\n                         \"STRING_VAR(var,val,doc)=char* var = val; /**< doc */\" \\\n                         \"INT_VAR(var,val,doc)=int var = val; /**< doc */\"\n\n# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this\n# tag can be used to specify a list of macro names that should be expanded. The\n# macro definition that is found in the sources will be used. Use the PREDEFINED\n# tag if you want to use a different macro definition that overrules the\n# definition found in the source code.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nEXPAND_AS_DEFINED      =\n\n# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will\n# remove all references to function-like macros that are alone on a line, have\n# an all uppercase name, and do not end with a semicolon. 
Such function macros\n# are typically used for boiler-plate code, and will confuse the parser if not\n# removed.\n# The default value is: YES.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nSKIP_FUNCTION_MACROS   = YES\n\n#---------------------------------------------------------------------------\n# Configuration options related to external references\n#---------------------------------------------------------------------------\n\n# The TAGFILES tag can be used to specify one or more tag files. For each tag\n# file the location of the external documentation should be added. The format of\n# a tag file without this location is as follows:\n# TAGFILES = file1 file2 ...\n# Adding location for the tag files is done as follows:\n# TAGFILES = file1=loc1 \"file2 = loc2\" ...\n# where loc1 and loc2 can be relative or absolute paths or URLs. See the\n# section \"Linking to external documentation\" for more information about the use\n# of tag files.\n# Note: Each tag file must have a unique name (where the name does NOT include\n# the path). If a tag file is not located in the directory in which doxygen is\n# run, you must also specify the path to the tagfile here.\n\nTAGFILES               =\n\n# When a file name is specified after GENERATE_TAGFILE, doxygen will create a\n# tag file that is based on the input files it reads. See section \"Linking to\n# external documentation\" for more information about the usage of tag files.\n\nGENERATE_TAGFILE       =\n\n# If the ALLEXTERNALS tag is set to YES, all external class will be listed in\n# the class index. If set to NO, only the inherited external classes will be\n# listed.\n# The default value is: NO.\n\nALLEXTERNALS           = NO\n\n# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed\n# in the modules index. 
If set to NO, only the current project's groups will be\n# listed.\n# The default value is: YES.\n\nEXTERNAL_GROUPS        = YES\n\n# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in\n# the related pages index. If set to NO, only the current project's pages will\n# be listed.\n# The default value is: YES.\n\nEXTERNAL_PAGES         = YES\n\n#---------------------------------------------------------------------------\n# Configuration options related to the dot tool\n#---------------------------------------------------------------------------\n\n# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram\n# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to\n# NO turns the diagrams off. Note that this option also works with HAVE_DOT\n# disabled, but it is recommended to install and use dot, since it yields more\n# powerful graphs.\n# The default value is: YES.\n\nCLASS_DIAGRAMS         = YES\n\n# You can include diagrams made with dia in doxygen documentation. Doxygen will\n# then run dia to produce the diagram and insert it in the documentation. The\n# DIA_PATH tag allows you to specify the directory where the dia binary resides.\n# If left empty dia is assumed to be found in the default search path.\n\nDIA_PATH               =\n\n# If set to YES the inheritance and collaboration graphs will hide inheritance\n# and usage relations if the target is undocumented or is not a class.\n# The default value is: YES.\n\nHIDE_UNDOC_RELATIONS   = YES\n\n# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is\n# available from the path. This tool is part of Graphviz (see:\n# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent\n# Bell Labs. 
The other options in this section have no effect if this option is\n# set to NO\n# The default value is: YES.\n\nHAVE_DOT               = NO\n\n# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed\n# to run in parallel. When set to 0 doxygen will base this on the number of\n# processors available in the system. You can set it explicitly to a value\n# larger than 0 to get control over the balance between CPU load and processing\n# speed.\n# Minimum value: 0, maximum value: 32, default value: 0.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_NUM_THREADS        = 0\n\n# When you want a differently looking font in the dot files that doxygen\n# generates you can specify the font name using DOT_FONTNAME. You need to make\n# sure dot is able to find the font, which can be done by putting it in a\n# standard location or by setting the DOTFONTPATH environment variable or by\n# setting DOT_FONTPATH to the directory containing the font.\n# The default value is: Helvetica.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_FONTNAME           =\n\n# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of\n# dot graphs.\n# Minimum value: 4, maximum value: 24, default value: 10.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_FONTSIZE           = 10\n\n# By default doxygen will tell dot to use the default font as specified with\n# DOT_FONTNAME. 
If you specify a different font using DOT_FONTNAME you can set\n# the path where dot can find it using this tag.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_FONTPATH           =\n\n# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for\n# each documented class showing the direct and indirect inheritance relations.\n# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCLASS_GRAPH            = YES\n\n# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a\n# graph for each documented class showing the direct and indirect implementation\n# dependencies (inheritance, containment, and class references variables) of the\n# class with other documented classes.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCOLLABORATION_GRAPH    = YES\n\n# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for\n# groups, showing the direct groups dependencies.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nGROUP_GRAPHS           = YES\n\n# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and\n# collaboration diagrams in a style similar to the OMG's Unified Modeling\n# Language.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nUML_LOOK               = NO\n\n# If the UML_LOOK tag is enabled, the fields and methods are shown inside the\n# class node. If there are many fields or methods and many nodes the graph may\n# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the\n# number of items for each type to make the size more manageable. Set this to 0\n# for no limit. Note that the threshold may be exceeded by 50% before the limit\n# is enforced. 
So when you set the threshold to 10, up to 15 fields may appear,\n# but if the number exceeds 15, the total amount of fields shown is limited to\n# 10.\n# Minimum value: 0, maximum value: 100, default value: 10.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nUML_LIMIT_NUM_FIELDS   = 10\n\n# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and\n# collaboration graphs will show the relations between templates and their\n# instances.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nTEMPLATE_RELATIONS     = NO\n\n# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to\n# YES then doxygen will generate a graph for each documented file showing the\n# direct and indirect include dependencies of the file with other documented\n# files.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nINCLUDE_GRAPH          = YES\n\n# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are\n# set to YES then doxygen will generate a graph for each documented file showing\n# the direct and indirect include dependencies of the file with other documented\n# files.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nINCLUDED_BY_GRAPH      = YES\n\n# If the CALL_GRAPH tag is set to YES then doxygen will generate a call\n# dependency graph for every global function or class method.\n#\n# Note that enabling this option will significantly increase the time of a run.\n# So in most cases it will be better to enable call graphs for selected\n# functions only using the \\callgraph command. 
Disabling a call graph can be\n# accomplished by means of the command \\hidecallgraph.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCALL_GRAPH             = NO\n\n# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller\n# dependency graph for every global function or class method.\n#\n# Note that enabling this option will significantly increase the time of a run.\n# So in most cases it will be better to enable caller graphs for selected\n# functions only using the \\callergraph command. Disabling a caller graph can be\n# accomplished by means of the command \\hidecallergraph.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCALLER_GRAPH           = NO\n\n# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical\n# hierarchy of all classes instead of a textual one.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nGRAPHICAL_HIERARCHY    = YES\n\n# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the\n# dependencies a directory has on other directories in a graphical way. The\n# dependency relations are determined by the #include relations between the\n# files in the directories.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDIRECTORY_GRAPH        = YES\n\n# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images\n# generated by dot. 
For an explanation of the image formats see the section\n# output formats in the documentation of the dot tool (Graphviz (see:\n# http://www.graphviz.org/)).\n# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order\n# to make the SVG files visible in IE 9+ (other browsers do not have this\n# requirement).\n# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,\n# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,\n# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,\n# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and\n# png:gdiplus:gdiplus.\n# The default value is: png.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_IMAGE_FORMAT       = png\n\n# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to\n# enable generation of interactive SVG images that allow zooming and panning.\n#\n# Note that this requires a modern browser other than Internet Explorer. Tested\n# and working are Firefox, Chrome, Safari, and Opera.\n# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make\n# the SVG files visible. Older versions of IE do not have SVG support.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nINTERACTIVE_SVG        = NO\n\n# The DOT_PATH tag can be used to specify the path where the dot tool can be\n# found. 
If left blank, it is assumed the dot tool can be found in the path.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_PATH               =\n\n# The DOTFILE_DIRS tag can be used to specify one or more directories that\n# contain dot files that are included in the documentation (see the \\dotfile\n# command).\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOTFILE_DIRS           =\n\n# The MSCFILE_DIRS tag can be used to specify one or more directories that\n# contain msc files that are included in the documentation (see the \\mscfile\n# command).\n\nMSCFILE_DIRS           =\n\n# The DIAFILE_DIRS tag can be used to specify one or more directories that\n# contain dia files that are included in the documentation (see the \\diafile\n# command).\n\nDIAFILE_DIRS           =\n\n# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the\n# path where java can find the plantuml.jar file. If left blank, it is assumed\n# PlantUML is not used or called during a preprocessing step. Doxygen will\n# generate a warning when it encounters a \\startuml command in this case and\n# will not generate output for the diagram.\n\nPLANTUML_JAR_PATH      =\n\n# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a\n# configuration file for plantuml.\n\nPLANTUML_CFG_FILE      =\n\n# When using plantuml, the specified paths are searched for files specified by\n# the !include statement in a plantuml block.\n\nPLANTUML_INCLUDE_PATH  =\n\n# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes\n# that will be shown in the graph. If the number of nodes in a graph becomes\n# larger than this value, doxygen will truncate the graph, which is visualized\n# by representing a node as a red box. Note that doxygen if the number of direct\n# children of the root node in a graph is already larger than\n# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. 
Also note that\n# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.\n# Minimum value: 0, maximum value: 10000, default value: 50.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_GRAPH_MAX_NODES    = 50\n\n# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs\n# generated by dot. A depth value of 3 means that only nodes reachable from the\n# root by following a path via at most 3 edges will be shown. Nodes that lay\n# further from the root node will be omitted. Note that setting this option to 1\n# or 2 may greatly reduce the computation time needed for large code bases. Also\n# note that the size of a graph can be further restricted by\n# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.\n# Minimum value: 0, maximum value: 1000, default value: 0.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nMAX_DOT_GRAPH_DEPTH    = 0\n\n# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent\n# background. This is disabled by default, because dot on Windows does not seem\n# to support this out of the box.\n#\n# Warning: Depending on the platform used, enabling this option may lead to\n# badly anti-aliased labels on the edges of a graph (i.e. they become hard to\n# read).\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_TRANSPARENT        = NO\n\n# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output\n# files in one run (i.e. multiple -o and -T options on the command line). 
This\n# makes dot run faster, but since only newer versions of dot (>1.8.10) support\n# this, this feature is disabled by default.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_MULTI_TARGETS      = NO\n\n# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page\n# explaining the meaning of the various boxes and arrows in the dot generated\n# graphs.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nGENERATE_LEGEND        = YES\n\n# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot\n# files that are used to generate the various graphs.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_CLEANUP            = YES\n"
  },
  {
    "path": "doc/ambiguous_words.1.asc",
    "content": "AMBIGUOUS_WORDS(1)\n==================\n:doctype: manpage\n\nNAME\n----\nambiguous_words - generate sets of words Tesseract is likely to find ambiguous\n\nSYNOPSIS\n--------\n*ambiguous_words* [-l lang] 'TESSDATADIR' 'WORDLIST' 'AMBIGUOUSFILE'\n\nDESCRIPTION\n-----------\nambiguous_words(1) runs Tesseract in a special mode, and for each word\nin the word list, produces a set of words which Tesseract thinks might be\nambiguous with it.   'TESSDATADIR' must be set to the absolute path of\na directory containing 'tessdata/lang.traineddata'.\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/classifier_tester.1.asc",
    "content": "CLASSIFIER_TESTER(1)\n====================\n\nNAME\n----\nclassifier_tester - for *legacy tesseract* engine.\n\nSYNOPSIS\n--------\n*classifier_tester* -U 'unicharset_file' -F 'font_properties_file' -X 'xheights_file'  -classifier 'x' -lang 'lang' [-output_trainer trainer] *.tr\n\nDESCRIPTION\n-----------\nclassifier_tester(1) runs Tesseract in a special mode.\nIt takes a list of .tr files and tests a character classifier\non data as formatted for training,\nbut it doesn't have to be the same as the training data.\n\nIN/OUT ARGUMENTS\n----------------\n\na list of .tr files\n\nOPTIONS\n-------\n-l 'lang'::\n\t(Input) three character language code; default value 'eng'.\n\n-classifier 'x'::\n\t(Input) One of \"pruner\", \"full\".\n\n\n-U 'unicharset'::\n\t(Input) The unicharset for the language.\n\n-F 'font_properties_file'::\n\t(Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1:\n\n\t*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*\n\n-X 'xheights_file'::\n\t(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]\n\n\t*font_name* *xheight*\n\n-output_trainer 'trainer'::\n\t(Output, Optional) Filename for output trainer.\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/cntraining.1.asc",
    "content": "CNTRAINING(1)\n=============\n\nNAME\n----\ncntraining - character normalization training for Tesseract\n\nSYNOPSIS\n--------\n*cntraining* [-D 'dir'] 'FILE'...\n\nDESCRIPTION\n-----------\ncntraining takes a list of .tr files, from which it generates the\n*normproto* data file (the character normalization sensitivity\nprototypes).\n\nOPTIONS\n--------\n-D 'dir'::\n\tDirectory to write output files to.\n\nSEE ALSO\n--------\ntesseract(1), shapeclustering(1), mftraining(1)\n\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nCOPYING\n-------\nCopyright (c) Hewlett-Packard Company, 1988\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/combine_lang_model.1.asc",
    "content": "COMBINE_LANG_MODEL(1)\n=====================\n:doctype: manpage\n\nNAME\n----\ncombine_lang_model - generate starter traineddata\n\nSYNOPSIS\n--------\n*combine_lang_model*  --input_unicharset 'filename' --script_dir 'dirname' --output_dir 'rootdir' --lang 'lang' [--lang_is_rtl] [pass_through_recoder] [--words file --puncs file --numbers file]\n\nDESCRIPTION\n-----------\ncombine_lang_model(1) generates a starter traineddata file that can be used to train an LSTM-based neural network model. It takes as input a unicharset and an optional set of wordlists. It eliminates the need to run set_unicharset_properties(1), wordlist2dawg(1), some non-existent binary to generate the recoder (unicode compressor), and finally combine_tessdata(1).\n\nOPTIONS\n-------\n'--lang lang'::\n\tThe language to use.\n\tTesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES)\n\n'--script_dir  PATH'::\n  Directory name for input script unicharsets. It should point to the location of langdata (github repo) directory.  (type:string default:)\n\n'--input_unicharset  FILE'::\n  Unicharset to complete and use in encoding. It can be a hand-created file with incomplete fields. Its basic and script properties will be set before it is used.  (type:string default:)\n\n'--lang_is_rtl  BOOL'::\n  True if language being processed is written right-to-left (eg Arabic/Hebrew). (type:bool default:false)\n\n'--pass_through_recoder BOOL'::\n  If true, the recoder is a simple pass-through of the unicharset. Otherwise, potentially a compression of it by encoding Hangul in Jamos, decomposing multi-unicode symbols into sequences of unicodes, and encoding Han using the data in the radical_table_data, which must be the content of the file: langdata/radical-stroke.txt. 
(type:bool default:false)\n\n'--version_str  STRING'::\n  An arbitrary version label to add to traineddata file  (type:string default:)\n\n'--words  FILE'::\n  (Optional) File listing words to use for the system dictionary  (type:string default:)\n\n'--numbers  FILE'::\n  (Optional) File listing number patterns  (type:string default:)\n\n'--puncs  FILE'::\n  (Optional) File listing punctuation patterns. The words/puncs/numbers lists may be all empty. If any are non-empty then puncs must be non-empty.  (type:string default:)\n\n'--output_dir   PATH'::\n  Root directory for output files. Output files will be written to <output_dir>/<lang>/<lang>.*  (type:string default:)\n\nHISTORY\n-------\ncombine_lang_model(1) was first made available for tesseract4.00.00alpha.\n\nRESOURCES\n---------\nMain web site: <https://github.com/tesseract-ocr> +\nInformation on training tesseract LSTM: <https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html>\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/combine_tessdata.1.asc",
    "content": "COMBINE_TESSDATA(1)\n===================\n\nNAME\n----\ncombine_tessdata - combine/extract/overwrite/list/compact Tesseract data\n\nSYNOPSIS\n--------\n*combine_tessdata* ['OPTION'] 'FILE'...\n\nDESCRIPTION\n-----------\ncombine_tessdata(1) is the main program to combine/extract/overwrite/list/compact\ntessdata components in [lang].traineddata files.\n\nTo combine all the individual tessdata components (unicharset, DAWGs,\nclassifier templates, ambiguities, language configs) located at, say,\n/home/$USER/temp/eng.* run:\n\n  combine_tessdata /home/$USER/temp/eng.\n\nThe result will be a combined tessdata file /home/$USER/temp/eng.traineddata\n\nSpecify option -e if you would like to extract individual components\nfrom a combined traineddata file. For example, to extract language config\nfile and the unicharset from tessdata/eng.traineddata run:\n\n  combine_tessdata -e tessdata/eng.traineddata \\\n    /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset\n\nThe desired config file and unicharset will be written to\n/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset\n\nSpecify option -o to overwrite individual components of the given\n[lang].traineddata file. For example, to overwrite language config\nand unichar ambiguities files in tessdata/eng.traineddata use:\n\n  combine_tessdata -o tessdata/eng.traineddata \\\n    /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs\n\nAs a result, tessdata/eng.traineddata will contain the new language config\nand unichar ambigs, plus all the original DAWGs, classifier templates, etc.\n\nNote: the file names of the files to extract to and to overwrite from should\nhave the appropriate file suffixes (extensions) indicating their tessdata\ncomponent type (.unicharset for the unicharset, .unicharambigs for unichar\nambigs, etc). 
See k*FileSuffix variable in ccutil/tessdatamanager.h.\n\nSpecify option -u to unpack all the components to the specified path:\n\n    combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.\n\nThis will create  /home/$USER/temp/eng.* files with individual tessdata\ncomponents from tessdata/eng.traineddata.\n\nOPTIONS\n-------\n\n*-c* '.traineddata' 'FILE'...:\n    Compacts the LSTM component in the .traineddata file to int.\n\n*-d* '.traineddata' 'FILE'...:\n    Lists directory of components from the .traineddata file.\n\n*-e* '.traineddata' 'FILE'...:\n    Extracts the specified components from the .traineddata file\n\n*-l* '.traineddata' 'FILE'...:\n   List the network information.\n\n*-o* '.traineddata' 'FILE'...:\n    Overwrites the specified components of the .traineddata file\n    with those provided on the command line.\n\n*-u* '.traineddata' 'PATHPREFIX'\n    Unpacks the .traineddata using the provided prefix.\n\nCAVEATS\n-------\n'Prefix' refers to the full file prefix, including period (.)\n\n\nCOMPONENTS\n----------\nThe components in a Tesseract lang.traineddata file as of\nTesseract 4.0 are briefly described below; For more information on\nmany of these files, see\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\nand\n<https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html>\n\nlang.config::\n  (Optional) Language-specific overrides to default config variables.\n  For 4.0 traineddata files, lang.config provides control parameters which\n  can affect layout analysis, and sub-languages.\n\nlang.unicharset::\n  (Required - 3.0x  legacy tesseract) The list of symbols that Tesseract recognizes, with properties.\n  See unicharset(5).\n\nlang.unicharambigs::\n  (Optional - 3.0x  legacy tesseract) This file contains information on pairs of recognized symbols\n  which are often confused.  For example, 'rn' and 'm'.\n\nlang.inttemp::\n  (Required - 3.0x  legacy tesseract) Character shape templates for each unichar.  
Produced by\n  mftraining(1).\n\nlang.pffmtable::\n  (Required - 3.0x  legacy tesseract) The number of features expected for each unichar.\n  Produced by mftraining(1) from *.tr* files.\n\nlang.normproto::\n  (Required - 3.0x  legacy tesseract) Character normalization prototypes generated by cntraining(1)\n  from *.tr* files.\n\nlang.punc-dawg::\n  (Optional - 3.0x  legacy tesseract) A dawg made from punctuation patterns found around words.\n  The \"word\" part is replaced by a single space.\n\nlang.word-dawg::\n  (Optional - 3.0x  legacy tesseract) A dawg made from dictionary words from the language.\n\nlang.number-dawg::\n  (Optional - 3.0x  legacy tesseract) A dawg made from tokens which originally contained digits.\n  Each digit is replaced by a space character.\n\nlang.freq-dawg::\n  (Optional - 3.0x  legacy tesseract) A dawg made from the most frequent words which would have\n  gone into word-dawg.\n\nlang.fixed-length-dawgs::\n  (Optional - 3.0x  legacy tesseract) Several dawgs of different fixed lengths -- useful for\n  languages like Chinese.\n\nlang.shapetable::\n  (Optional - 3.0x  legacy tesseract) When present, a shapetable is an extra layer between the character\n  classifier and the word recognizer that allows the character classifier to\n  return a collection of unichar ids and fonts instead of a single unichar-id\n  and font.\n\nlang.bigram-dawg::\n  (Optional - 3.0x  legacy tesseract) A dawg of word bigrams where the words are separated by a space\n  and each digit is replaced by a '?'.\n\nlang.unambig-dawg::\n  (Optional - 3.0x  legacy tesseract) .\n\nlang.params-model::\n  (Optional - 3.0x  legacy tesseract) .\n\nlang.lstm::\n  (Required - 4.0 LSTM) Neural net trained recognition model generated by lstmtraining.\n\nlang.lstm-punc-dawg::\n  (Optional - 4.0 LSTM) A dawg made from punctuation patterns found around words.\n  The \"word\" part is replaced by a single space. 
Uses lang.lstm-unicharset.\n\nlang.lstm-word-dawg::\n  (Optional - 4.0 LSTM) A dawg made from dictionary words from the language.\n  Uses lang.lstm-unicharset.\n\nlang.lstm-number-dawg::\n  (Optional - 4.0 LSTM) A dawg made from tokens which originally contained digits.\n  Each digit is replaced by a space character. Uses lang.lstm-unicharset.\n\nlang.lstm-unicharset::\n  (Required - 4.0 LSTM) The unicode character set that Tesseract recognizes, with properties.\n  Same unicharset must be used to train the LSTM and build the lstm-*-dawgs files.\n\nlang.lstm-recoder::\n  (Required - 4.0 LSTM) Unicharcompress, aka the recoder, which maps the unicharset\n  further to the codes actually used by the neural network recognizer. This is created as\n  part of the starter traineddata by combine_lang_model.\n\nlang.version::\n  (Optional) Version string for the traineddata file.\n  First appeared in version 4.0 of Tesseract.\n  Old version of traineddata files will report Version:Pre-4.0.0.\n  4.0 version of traineddata files may include the network spec\n  used for LSTM training as part of version string.\n\nHISTORY\n-------\ncombine_tessdata(1) first appeared in version 3.00 of Tesseract\n\nSEE ALSO\n--------\ntesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5),\nunicharambigs(5)\n\nCOPYING\n-------\nCopyright \\(C) 2009, Google Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/dawg2wordlist.1.asc",
    "content": "DAWG2WORDLIST(1)\n================\n:doctype: manpage\n\nNAME\n----\ndawg2wordlist - convert a Tesseract DAWG to a wordlist\n\nSYNOPSIS\n--------\n*dawg2wordlist* 'UNICHARSET' 'DAWG' 'WORDLIST'\n\nDESCRIPTION\n-----------\ndawg2wordlist(1) converts a Tesseract Directed Acyclic Word\nGraph (DAWG) to a list of words using a unicharset as key.\n\nOPTIONS\n-------\n'UNICHARSET'\n\tThe unicharset of the language. This is the unicharset\n\tgenerated by mftraining(1).\n\n'DAWG'\n\tThe input DAWG, created by wordlist2dawg(1)\n\n'WORDLIST'\n\tPlain text (output) file in UTF-8, one word per line\n\nSEE ALSO\n--------\ntesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5),\ncombine_tessdata(1)\n\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/generate_manpages.sh",
    "content": "#!/bin/bash\n#\n# File:         generate_manpages.sh\n# Description:  Converts .asc files into man pages, etc. for Tesseract.\n# Author:       eger@google.com (David Eger)\n# Created:      9 Feb 2012\n#\n# (C) Copyright 2012 Google Inc.\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# http://www.apache.org/licenses/LICENSE-2.0\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nman_xslt=http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl\nasciidoc=$(which asciidoc)\nxsltproc=$(which xsltproc)\nif [[ -z \"${asciidoc}\" ]] || [[ -z \"${xsltproc}\" ]]; then\n  echo \"Please make sure asciidoc and xsltproc are installed.\"\n  exit 1\nelse\n  for src in *.asc; do\n    pagename=${src/.asc/}\n    (${asciidoc} -d manpage \"${src}\" &&\n     ${asciidoc} -d manpage -b docbook \"${src}\" &&\n       ${xsltproc} --nonet ${man_xslt} \"${pagename}\".xml) ||\n       echo \"Error generating ${pagename}\"\n  done\nfi\nexit 0\n"
  },
  {
    "path": "doc/lstmeval.1.asc",
    "content": "LSTMEVAL(1)\n===========\n:doctype: manpage\n\nNAME\n----\nlstmeval - Evaluation program for LSTM-based networks.\n\nSYNOPSIS\n--------\n*lstmeval* --model 'lang.lstm|modelname_checkpoint|modelname_N.NN_NN_NN.checkpoint' [--traineddata lang/lang.traineddata] --eval_listfile 'lang.eval_files.txt' [--verbosity N] [--max_image_MB NNNN]\n\nDESCRIPTION\n-----------\nlstmeval(1) evaluates LSTM-based networks. Either a recognition model or a training checkpoint can be given as input for evaluation along with a list of lstmf files. If evaluating a training checkpoint, '--traineddata' should also be specified. Intermediate training checkpoints can also be used.\n\nOPTIONS\n-------\n'--model  FILE'::\n  Name of model file (training or recognition)  (type:string default:)\n\n'--traineddata  FILE'::\n  If model is a training checkpoint, then traineddata must be the traineddata file that was given to the trainer  (type:string default:)\n\n'--eval_listfile  FILE'::\n  File listing sample files in lstmf training format.  (type:string default:)\n\n'--max_image_MB  INT'::\n  Max memory to use for images.  (type:int default:2000)\n\n'--verbosity  INT'::\n  Amount of diagnostic information to output (0-2).  (type:int default:1)\n\nHISTORY\n-------\nlstmeval(1) was first made available for tesseract4.00.00alpha.\n\nRESOURCES\n---------\nMain web site: <https://github.com/tesseract-ocr> +\nInformation on training tesseract LSTM: <https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html>\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/lstmtraining.1.asc",
    "content": "LSTMTRAINING(1)\n===============\n:doctype: manpage\n\nNAME\n----\nlstmtraining - Training program for LSTM-based networks.\n\nSYNOPSIS\n--------\n*lstmtraining*\n  --continue_from  'train_output_dir/continue_from_lang.lstm'\n  --old_traineddata 'bestdata_dir/continue_from_lang.traineddata'\n  --traineddata   'train_output_dir/lang/lang.traineddata'\n  --max_iterations 'NNN'\n  --debug_interval '0|-1'\n  --train_listfile 'train_output_dir/lang.training_files.txt'\n  --model_output  'train_output_dir/newlstmmodel'\n\nDESCRIPTION\n-----------\nlstmtraining(1)  trains LSTM-based networks using a list of lstmf files and starter traineddata file as the main input. Training from scratch is not recommended to be done by users. Finetuning (example command shown in synopsis above) or replacing a layer options can be used instead. Different options apply to different types of training.\nRead the [training documentation](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html) for details.\n\nOPTIONS\n-------\n\n'--debug_interval  '::\n  How often to display the alignment.  (type:int default:0)\n\n'--net_mode  '::\n  Controls network behavior.  (type:int default:192)\n\n'--perfect_sample_delay  '::\n  How many imperfect samples between perfect ones.  (type:int default:0)\n\n'--max_image_MB  '::\n  Max memory to use for images.  (type:int default:6000)\n\n'--append_index  '::\n  Index in continue_from Network at which to attach the new network defined by net_spec  (type:int default:-1)\n\n'--max_iterations  '::\n  If set, exit after this many iterations. A negative value is interpreted as epochs, 0 means infinite iterations.  (type:int default:0)\n\n'--target_error_rate  '::\n  Final error rate in percent.  (type:double default:0.01)\n\n'--weight_range  '::\n  Range of initial random weights.  (type:double default:0.1)\n\n'--learning_rate  '::\n  Weight factor for new deltas.  
(type:double default:0.001)\n\n'--momentum  '::\n  Decay factor for repeating deltas.  (type:double default:0.5)\n\n'--adam_beta  '::\n  Decay factor for repeating deltas.  (type:double default:0.999)\n\n'--stop_training  '::\n  Just convert the training model to a runtime model.  (type:bool default:false)\n\n'--convert_to_int  '::\n  Convert the recognition model to an integer model.  (type:bool default:false)\n\n'--sequential_training  '::\n  Use the training files sequentially instead of round-robin.  (type:bool default:false)\n\n'--debug_network  '::\n  Get info on distribution of weight values  (type:bool default:false)\n\n'--randomly_rotate  '::\n  Train OSD and randomly turn training samples upside-down  (type:bool default:false)\n\n'--net_spec  '::\n  Network specification  (type:string default:)\n\n'--continue_from  '::\n  Existing model to extend  (type:string default:)\n\n'--model_output  '::\n  Basename for output models  (type:string default:lstmtrain)\n\n'--train_listfile  '::\n  File listing training files in lstmf training format.  (type:string default:)\n\n'--eval_listfile  '::\n  File listing eval files in lstmf training format.  
(type:string default:)\n\n'--traineddata  '::\n  Starter traineddata with combined Dawgs/Unicharset/Recoder for language model  (type:string default:)\n\n'--old_traineddata  '::\n  When changing the character set, this specifies the traineddata with the old character set that is to be replaced  (type:string default:)\n\nHISTORY\n-------\nlstmtraining(1) was first made available for tesseract4.00.00alpha.\n\nRESOURCES\n---------\nMain web site: <https://github.com/tesseract-ocr> +\nInformation on training tesseract LSTM: <https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html>\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/merge_unicharsets.1.asc",
    "content": "MERGE_UNICHARSETS(1)\n====================\n:doctype: manpage\n\nNAME\n----\nmerge_unicharsets - Simple tool to merge two or more unicharsets.\n\nSYNOPSIS\n--------\n*merge_unicharsets* 'unicharset-in-1' ... 'unicharset-in-n' 'unicharset-out'\n\nDESCRIPTION\n-----------\nmerge_unicharsets(1) is a simple tool to merge two or more unicharsets.\nIt could be used to create a combined unicharset for a script-level engine,\nlike the new Latin or Devanagari.\n\nIN/OUT ARGUMENTS\n----------------\n'unicharset-in-1'::\n\t(Input) The name of the first unicharset file to be merged.\n\n'unicharset-in-n'::\n\t(Input) The name of the nth unicharset file to be merged.\n\n'unicharset-out'::\n\t(Output) The name of the merged unicharset file.\n\nHISTORY\n-------\nmerge_unicharsets(1) was first made available for tesseract4.00.00alpha.\n\nRESOURCES\n---------\nMain web site: <https://github.com/tesseract-ocr> +\nInformation on training tesseract LSTM: <https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html>\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/mftraining.1.asc",
    "content": "MFTRAINING(1)\n=============\n:doctype: manpage\n\nNAME\n----\nmftraining - feature training for Tesseract\n\nSYNOPSIS\n--------\nmftraining -U 'unicharset' -O 'lang.unicharset' 'FILE'...\n\nDESCRIPTION\n-----------\nmftraining takes a list of .tr files, from which it generates the\nfiles *inttemp* (the shape prototypes), *shapetable*, and *pffmtable*\n(the number of expected features for each character).  (A fourth file\ncalled Microfeat is also written by this program, but it is not used.)\n\nOPTIONS\n-------\n-U 'FILE'::\n\t(Input) The unicharset generated by unicharset_extractor(1)\n\n-F 'font_properties_file'::\n\t(Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1:\n\n\t*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*\n\n-X 'xheights_file'::\n\t(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]\n\n\t*font_name* *xheight*\n\n-D 'dir'::\n\tDirectory to write output files to.\n\n-O 'FILE'::\n\t(Output) The output unicharset that will be given to combine_tessdata(1)\n\nSEE ALSO\n--------\ntesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1),\nshapeclustering(1), unicharset(5)\n\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nCOPYING\n-------\nCopyright \\(C) Hewlett-Packard Company, 1988\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/set_unicharset_properties.1.asc",
    "content": "SET_UNICHARSET_PROPERTIES(1)\n============================\n:doctype: manpage\n\nNAME\n----\nset_unicharset_properties - set properties about the unichars\n\nSYNOPSIS\n--------\n*set_unicharset_properties*  --U 'input_unicharsetfile'  --script_dir '/path/to/langdata'   --O 'output_unicharsetfile'\n\nDESCRIPTION\n-----------\nset_unicharset_properties(1) reads a unicharset file, puts the result in a UNICHARSET object, fills it with properties about the unichars it contains and writes the result back to another unicharset file.\n\nOPTIONS\n-------\n\n'--script_dir /path/to/langdata'::\n\t(Input) Specify the location of directory for universal script unicharsets and font xheights  (type:string default:)\n\n'--U unicharsetfile'::\n\t(Input) Specify the location of the unicharset to load as input.\n\n'--O unicharsetfile'::\n\t(Output) Specify the location of the unicharset to be written with updated properties.\n\nHISTORY\n-------\nset_unicharset_properties(1) was first made available for tesseract version 3.03.\n\nRESOURCES\n---------\nMain web site: <https://github.com/tesseract-ocr> +\nInformation on training: <https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/shapeclustering.1.asc",
    "content": "SHAPECLUSTERING(1)\n==================\n:doctype: manpage\n\nNAME\n----\nshapeclustering - shape clustering training for Tesseract\n\nSYNOPSIS\n--------\nshapeclustering -D 'output_dir'\n    -U 'unicharset' -O 'mfunicharset'\n    -F 'font_props' -X 'xheights'\n    'FILE'...\n\nDESCRIPTION\n-----------\nshapeclustering(1) takes extracted feature .tr files (generated by\ntesseract(1) run in a special mode from box files) and produces a\nfile *shapetable* and an enhanced unicharset.  This program is still\nexperimental, and is not required (yet) for training Tesseract.\n\nOPTIONS\n-------\n-U 'FILE'::\n\tThe unicharset generated by unicharset_extractor(1).\n\n-D 'dir'::\n\tDirectory to write output files to.\n\n-F 'font_properties_file'::\n\t(Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1:\n\n\t'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur'\n\n-X 'xheights_file'::\n\t(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]\n\n\t'font_name' 'xheight'\n\n-O 'FILE'::\n\tThe output unicharset that will be given to combine_tessdata(1).\n\nSEE ALSO\n--------\ntesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1),\nunicharset(5)\n\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nCOPYING\n-------\nCopyright \\(C) Google, 2011\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/tesseract.1.asc",
    "content": "TESSERACT(1)\n============\n:doctype: manpage\n\nNAME\n----\ntesseract - command-line OCR engine\n\nSYNOPSIS\n--------\n*tesseract* 'FILE' 'OUTPUTBASE' ['OPTIONS']... ['CONFIGFILE']...\n\nDESCRIPTION\n-----------\ntesseract(1) is a commercial quality OCR engine originally developed at HP\nbetween 1985 and 1995. In 1995, this engine was among the top 3 evaluated by\nUNLV. It was open-sourced by HP and UNLV in 2005, and has been developed\nat Google until 2018.\n\n\nIN/OUT ARGUMENTS\n----------------\n'FILE'::\n  The name of the input file.\n  This can either be an image file or a text file. +\n  Most image file formats (anything readable by Leptonica) are supported. +\n  A text file lists the names of all input images (one image name per line).\n  The results will be combined in a single file for each output file format\n  (txt, pdf, hocr, xml). +\n  If 'FILE' is `stdin` or `-` then the standard input is used.\n\n'OUTPUTBASE'::\n  The basename of the output file (to which the appropriate extension\n  will be appended).  By default the output will be a text file\n  with `.txt` added to the basename unless there are one or more\n  parameters set which explicitly specify the desired output. +\n  If 'OUTPUTBASE' is `stdout` or `-` then the standard output is used.\n\n\n[[TESSDATADIR]]\nOPTIONS\n-------\n*-c* 'CONFIGVAR=VALUE'::\n  Set value for parameter 'CONFIGVAR' to VALUE. Multiple *-c* arguments are allowed.\n\n*--dpi* 'N'::\n  Specify the resolution 'N' in DPI for the input image(s).\n  A typical value for 'N' is `300`. 
Without this option,\n  the resolution is read from the metadata included in the image.\n  If an image does not include that information, Tesseract tries to guess it.\n\n*-l* 'LANG'::\n*-l* 'SCRIPT'::\n  The language or script to use.\n  If none is specified, `eng` (English) is assumed.\n  Multiple languages may be specified, separated by plus characters.\n  Tesseract uses 3-character ISO 639-2 language codes\n  (see <<LANGUAGES,*LANGUAGES AND SCRIPTS*>>).\n\n*--psm* 'N'::\n  Set Tesseract to only run a subset of layout analysis and assume\n  a certain form of image. The options for 'N' are:\n\n  0 = Orientation and script detection (OSD) only.\n  1 = Automatic page segmentation with OSD.\n  2 = Automatic page segmentation, but no OSD, or OCR. (not implemented)\n  3 = Fully automatic page segmentation, but no OSD. (Default)\n  4 = Assume a single column of text of variable sizes.\n  5 = Assume a single uniform block of vertically aligned text.\n  6 = Assume a single uniform block of text.\n  7 = Treat the image as a single text line.\n  8 = Treat the image as a single word.\n  9 = Treat the image as a single word in a circle.\n  10 = Treat the image as a single character.\n  11 = Sparse text. Find as much text as possible in no particular order.\n  12 = Sparse text with OSD.\n  13 = Raw line. Treat the image as a single text line,\n       bypassing hacks that are Tesseract-specific.\n\n*--oem* 'N'::\n  Specify OCR Engine mode. The options for 'N' are:\n\n  0 = Original Tesseract only.\n  1 = Neural nets LSTM only.\n  2 = Tesseract + LSTM.\n  3 = Default, based on what is available.\n\n*--tessdata-dir* 'PATH'::\n  Specify the location of tessdata path.\n\n*--user-patterns* 'FILE'::\n  Specify the location of user patterns file.\n\n*--user-words* 'FILE'::\n  Specify the location of user words file.\n\n[[CONFIGFILE]]\n'CONFIGFILE'::\n  The name of a config to use. 
The name can be a file in `tessdata/configs`\n  or `tessdata/tessconfigs`, or an absolute or relative file path.\n  A config is a plain text file which contains a list of parameters and\n  their values, one per line, with a space separating parameter from value. +\n  Interesting config files include:\n\n  * *alto* -- Output in ALTO format ('OUTPUTBASE'`.xml`).\n  * *hocr* -- Output in hOCR format ('OUTPUTBASE'`.hocr`).\n  * *page* -- Output in PAGE format ('OUTPUTBASE'`.page.xml`).\n              The output can be customized with the flags:\n              page_xml_polygon -- Create polygons instead of bounding boxes (default: true)\n              page_xml_level -- Create the PAGE file on  0=linelevel or 1=wordlevel (default: 0)\n  * *pdf* -- Output PDF ('OUTPUTBASE'`.pdf`).\n  * *tsv* -- Output TSV ('OUTPUTBASE'`.tsv`).\n  * *txt* -- Output plain text ('OUTPUTBASE'`.txt`).\n  * *get.images* -- Write processed input images to file ('OUTPUTBASE'`.processedPAGENUMBER.tif`).\n  * *logfile* -- Redirect debug messages to file (`tesseract.log`).\n  * *lstm.train* -- Output files used by LSTM training ('OUTPUTBASE'`.lstmf`).\n  * *makebox* -- Write box file ('OUTPUTBASE'`.box`).\n  * *quiet* -- Redirect debug messages to '/dev/null'.\n\nIt is possible to select several config files, for example\n`tesseract image.png demo alto hocr pdf txt` will create four output files\n`demo.alto`, `demo.hocr`, `demo.pdf` and `demo.txt` with the OCR results.\n\n*Nota bene:*   The options *-l* 'LANG', *-l* 'SCRIPT' and *--psm* 'N'\nmust occur before any 'CONFIGFILE'.\n\n\nSINGLE OPTIONS\n--------------\n*-h, --help*::\n  Show help message.\n\n*--help-extra*::\n  Show extra help for advanced users.\n\n*--help-psm*::\n  Show page segmentation modes.\n\n*--help-oem*::\n  Show OCR Engine modes.\n\n*-v, --version*::\n  Returns the current version of the tesseract(1) executable.\n\n*--list-langs*::\n  List available languages for tesseract engine.\n  Can be used with *--tessdata-dir* 
'PATH'.\n\n*--print-parameters*::\n  Print tesseract parameters.\n\n\n[[LANGUAGES]]\nLANGUAGES AND SCRIPTS\n---------------------\n\nTo recognize some text with Tesseract, it is normally necessary to specify\nthe language(s) or script(s) of the text (unless it is English text which is\nsupported by default) using *-l* 'LANG' or *-l* 'SCRIPT'.\n\nSelecting a language automatically also selects the language specific\ncharacter set and dictionary (word list).\n\nSelecting a script typically selects all characters of that script\nwhich can be from different languages. The dictionary which is included\nalso contains a mix from different languages.\nIn most cases, a script also supports English.\nSo it is possible to recognize a language that has not been specifically\ntrained for by using traineddata for the script it is written in.\n\nMore than one language or script may be specified by using `+`.\nExample: `tesseract myimage.png myimage -l eng+deu+fra`.\n\nhttps://github.com/tesseract-ocr/tessdata_fast provides fast language and\nscript models which are also part of Linux distributions.\n\nFor Tesseract 4, `tessdata_fast` includes traineddata files for the\nfollowing languages:\n\n*afr* (Afrikaans),\n*amh* (Amharic),\n*ara* (Arabic),\n*asm* (Assamese),\n*aze* (Azerbaijani),\n*aze_cyrl* (Azerbaijani - Cyrillic),\n*bel* (Belarusian),\n*ben* (Bengali),\n*bod* (Tibetan),\n*bos* (Bosnian),\n*bre* (Breton),\n*bul* (Bulgarian),\n*cat* (Catalan; Valencian),\n*ceb* (Cebuano),\n*ces* (Czech),\n*chi_sim* (Chinese simplified),\n*chi_tra* (Chinese traditional),\n*chr* (Cherokee),\n*cos* (Corsican),\n*cym* (Welsh),\n*dan* (Danish),\n*deu* (German),\n*deu_latf* (German Fraktur Latin),\n*div* (Dhivehi),\n*dzo* (Dzongkha),\n*ell* (Greek, Modern, 1453-),\n*eng* (English),\n*enm* (English, Middle, 1100-1500),\n*epo* (Esperanto),\n*equ* (Math / equation detection module),\n*est* (Estonian),\n*eus* (Basque),\n*fas* (Persian),\n*fao* (Faroese),\n*fil* (Filipino),\n*fin* (Finnish),\n*fra* 
(French),\n*frm* (French, Middle, ca.1400-1600),\n*fry* (West Frisian),\n*gla* (Scottish Gaelic),\n*gle* (Irish),\n*glg* (Galician),\n*grc* (Greek, Ancient, to 1453),\n*guj* (Gujarati),\n*hat* (Haitian; Haitian Creole),\n*heb* (Hebrew),\n*hin* (Hindi),\n*hrv* (Croatian),\n*hun* (Hungarian),\n*hye* (Armenian),\n*iku* (Inuktitut),\n*ind* (Indonesian),\n*isl* (Icelandic),\n*ita* (Italian),\n*ita_old* (Italian - Old),\n*jav* (Javanese),\n*jpn* (Japanese),\n*kan* (Kannada),\n*kat* (Georgian),\n*kat_old* (Georgian - Old),\n*kaz* (Kazakh),\n*khm* (Central Khmer),\n*kir* (Kirghiz; Kyrgyz),\n*kmr* (Kurdish Kurmanji),\n*kor* (Korean),\n*kor_vert* (Korean vertical),\n*lao* (Lao),\n*lat* (Latin),\n*lav* (Latvian),\n*lit* (Lithuanian),\n*ltz* (Luxembourgish),\n*mal* (Malayalam),\n*mar* (Marathi),\n*mkd* (Macedonian),\n*mlt* (Maltese),\n*mon* (Mongolian),\n*mri* (Maori),\n*msa* (Malay),\n*mya* (Burmese),\n*nep* (Nepali),\n*nld* (Dutch; Flemish),\n*nor* (Norwegian),\n*oci* (Occitan post 1500),\n*ori* (Oriya),\n*osd* (Orientation and script detection module),\n*pan* (Panjabi; Punjabi),\n*pol* (Polish),\n*por* (Portuguese),\n*pus* (Pushto; Pashto),\n*que* (Quechua),\n*ron* (Romanian; Moldavian; Moldovan),\n*rus* (Russian),\n*san* (Sanskrit),\n*sin* (Sinhala; Sinhalese),\n*slk* (Slovak),\n*slv* (Slovenian),\n*snd* (Sindhi),\n*spa* (Spanish; Castilian),\n*spa_old* (Spanish; Castilian - Old),\n*sqi* (Albanian),\n*srp* (Serbian),\n*srp_latn* (Serbian - Latin),\n*sun* (Sundanese),\n*swa* (Swahili),\n*swe* (Swedish),\n*syr* (Syriac),\n*tam* (Tamil),\n*tat* (Tatar),\n*tel* (Telugu),\n*tgk* (Tajik),\n*tha* (Thai),\n*tir* (Tigrinya),\n*ton* (Tonga),\n*tur* (Turkish),\n*uig* (Uighur; Uyghur),\n*ukr* (Ukrainian),\n*urd* (Urdu),\n*uzb* (Uzbek),\n*uzb_cyrl* (Uzbek - Cyrillic),\n*vie* (Vietnamese),\n*yid* (Yiddish),\n*yor* (Yoruba)\n\nTo use a non-standard language pack named `foo.traineddata`, set the\n`TESSDATA_PREFIX` environment variable so the file can be found 
at\n`TESSDATA_PREFIX/tessdata/foo.traineddata` and give Tesseract the\nargument *-l* `foo`.\n\nFor Tesseract 4, `tessdata_fast` includes traineddata files for the\nfollowing scripts:\n\n*Arabic*,\n*Armenian*,\n*Bengali*,\n*Canadian_Aboriginal*,\n*Cherokee*,\n*Cyrillic*,\n*Devanagari*,\n*Ethiopic*,\n*Fraktur*,\n*Georgian*,\n*Greek*,\n*Gujarati*,\n*Gurmukhi*,\n*HanS* (Han simplified),\n*HanS_vert* (Han simplified, vertical),\n*HanT* (Han traditional),\n*HanT_vert* (Han traditional, vertical),\n*Hangul*,\n*Hangul_vert* (Hangul vertical),\n*Hebrew*,\n*Japanese*,\n*Japanese_vert* (Japanese vertical),\n*Kannada*,\n*Khmer*,\n*Lao*,\n*Latin*,\n*Malayalam*,\n*Myanmar*,\n*Oriya* (Odia),\n*Sinhala*,\n*Syriac*,\n*Tamil*,\n*Telugu*,\n*Thaana*,\n*Thai*,\n*Tibetan*,\n*Vietnamese*.\n\nThe same languages and scripts are available from\nhttps://github.com/tesseract-ocr/tessdata_best.\n`tessdata_best` provides slow language and script models.\nThese models are needed for training. They also can give better OCR results,\nbut the recognition takes much more time.\n\nBoth `tessdata_fast` and `tessdata_best` only support the LSTM OCR engine.\n\nThere is a third repository, https://github.com/tesseract-ocr/tessdata,\nwith models which support both the Tesseract 3 legacy OCR engine and the\nTesseract 4 LSTM OCR engine.\n\n\nCONFIG FILES AND AUGMENTING WITH USER DATA\n------------------------------------------\n\nTesseract config files consist of lines with parameter-value pairs (space\nseparated).  The parameters are documented as flags in the source code like\nthe following one in tesseractclass.h:\n\n`STRING_VAR_H(tessedit_char_blacklist, \"\",\n             \"Blacklist of chars not to recognize\");`\n\nThese parameters may enable or disable various features of the engine, and\nmay cause it to load (or not load) various data.  
For instance, let's suppose\nyou want to OCR in English, but suppress the normal dictionary and load an\nalternative word list and an alternative list of patterns -- these two files\nare the most commonly used extra data files.\n\nIf your language pack is in '/path/to/eng.traineddata' and the hocr config\nis in '/path/to/configs/hocr' then create three new files:\n\n'/path/to/eng.user-words':\n[verse]\nthe\nquick\nbrown\nfox\njumped\n\n'/path/to/eng.user-patterns':\n[verse]\n1-\\d\\d\\d-GOOG-411\nwww.\\n\\\\\\*.com\n\n'/path/to/configs/bazaar':\n[verse]\nload_system_dawg     F\nload_freq_dawg       F\nuser_words_suffix    user-words\nuser_patterns_suffix user-patterns\n\nNow, if you pass the word 'bazaar' as a <<CONFIGFILE,'CONFIGFILE'>> to\nTesseract, Tesseract will not bother loading the system dictionary nor\nthe dictionary of frequent words and will load and use the 'eng.user-words'\nand 'eng.user-patterns' files you provided.  The former is a simple word list,\none per line.  The format of the latter is documented in 'dict/trie.h'\non 'read_pattern_list()'.\n\n\nENVIRONMENT VARIABLES\n---------------------\n*`TESSDATA_PREFIX`*::\n  If the `TESSDATA_PREFIX` is set to a path, then that path is used to\n  find the `tessdata` directory with language and script recognition\n  models and config files.\n  Using <<TESSDATADIR,*--tessdata-dir* 'PATH'>> is the recommended alternative.\n*`OMP_THREAD_LIMIT`*::\n  If the `tesseract` executable was built with multithreading support,\n  it will normally use four CPU cores for the OCR process. 
While this\n  can be faster for a single image, it gives bad performance if the host\n  computer provides less than four CPU cores or if OCR is made for many images.\n  Only a single CPU core is used with `OMP_THREAD_LIMIT=1`.\n\n\nHISTORY\n-------\nThe engine was developed at Hewlett Packard Laboratories Bristol and at\nHewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more\nchanges made in 1996 to port to Windows, and some $$C++$$izing in 1998. A\nlot of the code was written in C, and then some more was written in $$C++$$.\nThe $$C++$$ code makes heavy use of a list system using macros. This predates\nSTL, was portable before STL, and is more efficient than STL lists, but has\nthe big negative that if you do get a segmentation violation, it is hard to\ndebug.\n\nVersion 2.00 brought Unicode (UTF-8) support, six languages, and the ability\nto train Tesseract.\n\nTesseract was included in UNLV's Fourth Annual Test of OCR Accuracy.\nSee <https://github.com/tesseract-ocr/docs/blob/main/AT-1995.pdf>.\nSince Tesseract 2.00,\nscripts are now included to allow anyone to reproduce some of these tests.\nSee <https://tesseract-ocr.github.io/tessdoc/TestingTesseract.html> for more\ndetails.\n\nTesseract 3.00 added a number of new languages, including Chinese, Japanese,\nand Korean. It also introduced a new, single-file based system of managing\nlanguage data.\n\nTesseract 3.02 added BiDirectional text support, the ability to recognize\nmultiple languages in a single image, and improved layout analysis.\n\nTesseract 4 adds a new neural net (LSTM) based OCR engine which is focused\non line recognition, but also still supports the legacy Tesseract OCR engine of\nTesseract 3 which works by recognizing character patterns. Compatibility with\nTesseract 3 is enabled by `--oem 0`. 
This also needs traineddata files which\nsupport the legacy engine, for example those from the tessdata repository\n(https://github.com/tesseract-ocr/tessdata).\n\nFor further details, see the release notes in the Tesseract documentation\n(<https://tesseract-ocr.github.io/tessdoc/ReleaseNotes.html>).\n\n\nRESOURCES\n---------\nMain web site: <https://github.com/tesseract-ocr> +\nUser forum: <https://groups.google.com/g/tesseract-ocr> +\nDocumentation: <https://tesseract-ocr.github.io/> +\nInformation on training: <https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nSEE ALSO\n--------\nambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1),\nshape_training(1), mftraining(1), unicharambigs(5), unicharset(5),\nunicharset_extractor(1), wordlist2dawg(1)\n\nAUTHOR\n------\nTesseract development was led at Hewlett-Packard and Google by Ray Smith.\nThe development team has included:\n\nAhmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger,\nEric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke,\nMark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle,\nPingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel\nRomano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh\nLloyd, Shobhit Saxena, and Thomas Kielbus.\n\nFor a list of contributors see\n<https://github.com/tesseract-ocr/tesseract/blob/main/AUTHORS>.\n\nCOPYING\n-------\nLicensed under the Apache License, Version 2.0\n"
  },
  {
    "path": "doc/tesseract.natvis",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<AutoVisualizer xmlns=\"http://schemas.microsoft.com/vstudio/debugger/natvis/2010\">\n  <Type Name=\"GenericVector&lt;*&gt;\">\n    <DisplayString>{{size={size_used_}}}</DisplayString>\n    <Expand>\n      <Item Name=\"[size]\" ExcludeView=\"simple\">size_used_</Item>\n      <Item Name=\"[capacity]\" ExcludeView=\"simple\">size_reserved_</Item>\n      <ArrayItems>\n        <Size>size_used_</Size>\n        <ValuePointer>data_</ValuePointer>\n      </ArrayItems>\n    </Expand>\n  </Type>\n\n  <Type Name=\"tesseract::IntParam\">\n    <DisplayString>{value_}</DisplayString>\n  </Type>\n  <Type Name=\"tesseract::BoolParam\">\n    <DisplayString>{value_}</DisplayString>\n  </Type>\n\n  <Type Name=\"tesseract::StringParam\">\n    <DisplayString>{value_}</DisplayString>\n  </Type>\n\n  <Type Name=\"tesseract::DoubleParam\">\n    <DisplayString>{value_}</DisplayString>\n  </Type>\n\n</AutoVisualizer>\n"
  },
  {
    "path": "doc/text2image.1.asc",
    "content": "TEXT2IMAGE(1)\n=============\n:doctype: manpage\n\nNAME\n----\ntext2image - generate OCR training pages.\n\nSYNOPSIS\n--------\n*text2image* --text 'FILE' --outputbase 'PATH' --fonts_dir 'PATH' [OPTION]\n\nDESCRIPTION\n-----------\ntext2image(1)  generates OCR training pages. Given a text file it outputs an image with a given font and degradation.\n\nOPTIONS\n-------\n'--text  FILE'::\n File name of text input to use for creating synthetic training data.  (type:string default:)\n\n'--outputbase  FILE'::\n Basename for output image/box file  (type:string default:)\n\n'--fontconfig_tmpdir  PATH'::\n Overrides fontconfig default temporary dir  (type:string default:/tmp)\n\n'--fonts_dir  PATH'::\n If empty it uses system default. Otherwise it overrides system default font location  (type:string default:)\n\n'--font  FONTNAME'::\n Font description name to use  (type:string default:Arial)\n\n'--writing_mode  MODE'::\n Specify one of the following writing modes.\n  'horizontal' : Render regular horizontal text. (default)\n  'vertical' : Render vertical text. Glyph orientation is selected by Pango.\n  'vertical-upright' : Render vertical text. Glyph orientation is set to be upright.  (type:string default:horizontal)\n\n'--tlog_level  INT'::\n Minimum logging level for tlog() output  (type:int default:0)\n\n'--max_pages  INT'::\n Maximum number of pages to output (0=unlimited)  (type:int default:0)\n\n'--degrade_image  BOOL'::\n Degrade rendered image with speckle noise, dilation/erosion and rotation  (type:bool default:true)\n\n'--rotate_image  BOOL'::\n Rotate the image in a random way.  
(type:bool default:true)\n\n'--strip_unrenderable_words  BOOL'::\n Remove unrenderable words from source text  (type:bool default:true)\n\n'--ligatures  BOOL'::\n Rebuild and render ligatures  (type:bool default:false)\n\n'--exposure  INT'::\n Exposure level in photocopier  (type:int default:0)\n\n'--resolution  INT'::\n Pixels per inch  (type:int default:300)\n\n'--xsize  INT'::\n Width of output image  (type:int default:3600)\n\n'--ysize  INT'::\n Height of output image  (type:int default:4800)\n\n'--margin  INT'::\n Margin round edges of image  (type:int default:100)\n\n'--ptsize  INT'::\n Size of printed text  (type:int default:12)\n\n'--leading  INT'::\n Inter-line space (in pixels)  (type:int default:12)\n\n'--box_padding  INT'::\n Padding around produced bounding boxes  (type:int default:0)\n\n'--char_spacing  DOUBLE'::\n Inter-character space in ems  (type:double default:0)\n\n'--underline_start_prob  DOUBLE'::\n Fraction of words to underline (value in [0,1])  (type:double default:0)\n\n'--underline_continuation_prob  DOUBLE'::\n Fraction of words to underline (value in [0,1])  (type:double default:0)\n\n'--render_ngrams  BOOL'::\n Put each space-separated entity from the input file into one bounding box. The ngrams in the input file  will be randomly permuted before rendering (so that there is sufficient variety of characters on each line).  (type:bool default:false)\n\n'--output_word_boxes  BOOL'::\n Output word bounding boxes instead of character boxes. This is used for Cube training, and implied by --render_ngrams.  (type:bool default:false)\n\n'--unicharset_file  FILE'::\n File with characters in the unicharset. If --render_ngrams is true and --unicharset_file is specified, ngrams with characters that are not in unicharset will be omitted  (type:string default:)\n\n'--bidirectional_rotation  BOOL'::\n Rotate the generated characters both ways.  
(type:bool default:false)\n\n'--only_extract_font_properties  BOOL'::\n Assumes that the input file contains a list of ngrams. Renders each ngram, extracts spacing properties and records them in output_base/[font_name].fontinfo file.  (type:bool default:false)\n\nUse these flags to output zero-padded, square individual character images\n-------------------------------------------------------------------------\n\n'--output_individual_glyph_images  BOOL'::\n If true also outputs individual character images  (type:bool default:false)\n\n'--glyph_resized_size  INT'::\n Each glyph is square with this side length in pixels  (type:int default:0)\n\n'--glyph_num_border_pixels_to_pad  INT'::\n Final_size=glyph_resized_size+2*glyph_num_border_pixels_to_pad  (type:int default:0)\n\nUse these flags to find fonts that can render a given text\n----------------------------------------------------------\n\n'--find_fonts  BOOL'::\n Search for all fonts that can render the text  (type:bool default:false)\n\n'--render_per_font  BOOL'::\n If find_fonts==true, render each font to its own image. Image filenames are of the form output_name.font_name.tif  (type:bool default:true)\n\n'--min_coverage  DOUBLE'::\n If find_fonts==true, the minimum coverage the font has of the characters in the text file to include it, between 0 and 1.  (type:double default:1)\n\nExample Usage:\n```\ntext2image --find_fonts \\\n--fonts_dir /usr/share/fonts \\\n--text ../langdata/hin/hin.training_text \\\n--min_coverage .9  \\\n--render_per_font \\\n--outputbase ../langdata/hin/hin \\\n|& grep raw | sed -e 's/ :.*/\" \\\\/g'  | sed -e 's/^/  \"/' >../langdata/hin/fontslist.txt\n```\n\nSINGLE OPTIONS\n--------------\n\n'--list_available_fonts  BOOL'::\n List available fonts and quit.  
(type:bool default:false)\n\nHISTORY\n-------\ntext2image(1) was first made available for tesseract 3.03.\n\nRESOURCES\n---------\nMain web site: <https://github.com/tesseract-ocr> +\nInformation on training tesseract LSTM: <https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html>\n\nSEE ALSO\n--------\ntesseract(1)\n\nCOPYING\n-------\nCopyright \\(C) 2012 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/unicharambigs.5.asc",
    "content": "UNICHARAMBIGS(5)\n================\n\nNAME\n----\nunicharambigs - Tesseract unicharset ambiguities\n\nDESCRIPTION\n-----------\nThe unicharambigs file (a component of traineddata, see combine_tessdata(1))\nis used by Tesseract to represent possible ambiguities between characters,\nor groups of characters.\n\nThe file contains a number of lines, laid out as follows:\n\n...........................\n[num] <TAB> [char(s)] <TAB> [num] <TAB> [char(s)] <TAB> [num]\n...........................\n\n[horizontal]\nField one:: the number of characters contained in field two\nField two:: the character sequence to be replaced\nField three:: the number of characters contained in field four\nField four:: the character sequence used to replace field two\nField five:: contains either 1 or 0. 1 denotes a mandatory\nreplacement, 0 denotes an optional replacement.\n\nCharacters appearing in fields two and four should appear in\nunicharset. The numbers in fields one and three refer to the\nnumber of unichars (not bytes).\n\nEXAMPLE\n-------\n\n...............................\nv1\n2       ' '     1       \"     1\n1       m       2       r n   0\n3       i i i   1       m     0\n...............................\n\nThe first line is a version identifier.\nIn this example, all instances of the '2' character sequence '''' will\n*always* be replaced by the '1' character sequence '\"'; a '1' character\nsequence 'm' *may* be replaced by the '2' character sequence 'rn', and\nthe '3' character sequence *may* be replaced by the '1' character\nsequence 'm'.\n\nVersion 3.03 and on supports a new, simpler format for the unicharambigs\nfile:\n\n...............................\nv2\n'' \" 1\nm rn 0\niii m 0\n...............................\n\nIn this format, the \"error\" and \"correction\" are simple UTF-8 strings\nseparated by a space, and, after another space, the same type specifier\nas v1 (0 for optional and 1 for mandatory substitution). 
Note the downside\nof this simpler format is that Tesseract has to encode the UTF-8 strings\ninto the components of the unicharset. In complex scripts, this encoding\nmay be ambiguous. In this case, the encoding is chosen such as to use the\nleast UTF-8 characters for each component, ie the shortest unicharset\ncomponents will make up the encoding.\n\nHISTORY\n-------\nThe unicharambigs file first appeared in Tesseract 3.00; prior to that, a\nsimilar format, called DangAmbigs ('dangerous ambiguities') was used: the\nformat was almost identical, except only mandatory replacements could be\nspecified, and field 5 was absent.\n\nBUGS\n----\nThis is a documentation \"bug\": it's not currently clear what should be done\nin the case of ligatures (such as 'fi') which may also appear as regular\nletters in the unicharset.\n\nSEE ALSO\n--------\ntesseract(1), unicharset(5)\nhttps://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05.html#the-unicharambigs-file\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/unicharset.5.asc",
    "content": "UNICHARSET(5)\n=============\n:doctype: manpage\n\nNAME\n----\nunicharset - character properties file used by tesseract(1)\n\nDESCRIPTION\n-----------\nTesseract's unicharset file contains information on each symbol\n(unichar) the Tesseract OCR engine is trained to recognize.\n\nA unicharset file (i.e. 'eng.unicharset') is distributed as part of a\nTesseract language pack (i.e. 'eng.traineddata').  For information on\nextracting the unicharset file, see combine_tessdata(1).\n\nThe first line of a unicharset file contains the number of unichars in\nthe file.  After this line, each subsequent line provides information for\na single unichar.  The first such line contains a placeholder reserved for\nthe space character.  Each unichar is referred to within Tesseract by its\nUnichar ID, which is the line number (minus 1) within the unicharset file.\nTherefore, space gets unichar 0.\n\nEach unichar line in the unicharset file (v2+) may have four space-separated fields:\n\n  'character' 'properties' 'script' 'id'\n\nStarting with Tesseract v3.02, more information may be given for each unichar:\n\n  'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form'\n\nEntries:\n\n'character':: The UTF-8 encoded string to be produced for this unichar.\n'properties':: An integer mask of character properties, one per bit.\n    From least to most significant bit, these are: isalpha, islower, isupper,\n    isdigit, ispunctuation.\n'glyph_metrics':: Ten comma-separated integers representing various standards\n    for where this glyph is to be found within a baseline-normalized coordinate\n    system where 128 is normalized to x-height.\n  * min_bottom, max_bottom: the ranges where the bottom of the character can\n    be found.\n  * min_top, max_top: the ranges where the top of the character may be found.\n  * min_width, max_width: horizontal width of the character.\n  * min_bearing, max_bearing: how far from the usual start position 
does the\n    leftmost part of the character begin.\n  * min_advance, max_advance: how far from the printer's cell left do we\n    advance to begin the next character.\n'script':: Name of the script (Latin, Common, Greek, Cyrillic, Han, null).\n'other_case':: The Unichar ID of the other case version of this character\n    (upper or lower).\n'direction':: The Unicode BiDi direction of this character, as defined by\n    ICU's enum UCharDirection. (0 = Left to Right, 1 = Right to Left,\n    2 = European Number...)\n'mirror':: The Unichar ID of the BiDirectional mirror of this character.\n    For example the mirror of open paren is close paren, but Latin Capital C\n    has no mirror, so it remains a Latin Capital C.\n'normed_form':: The UTF-8 representation of a \"normalized form\" of this unichar\n    for the purpose of blaming a module for errors given ground truth text.\n    For instance, a left or right single quote may normalize to an ASCII quote.\n\n\nEXAMPLE (v2)\n------------\n..............\n; 10 Common 46\nb 3 Latin 59\nW 5 Latin 40\n7 8 Common 66\n= 0 Common 93\n..............\n\n\";\" is a punctuation character. Its properties are thus represented by the\nbinary number 10000 (10 in hexadecimal).\n\n\"b\" is an alphabetic character and a lower case character. Its properties are\nthus represented by the binary number 00011 (3 in hexadecimal).\n\n\"W\" is an alphabetic character and an upper case character. Its properties are\nthus represented by the binary number 00101 (5 in hexadecimal).\n\n\"7\" is just a digit. Its properties are thus represented by the binary number\n01000 (8 in hexadecimal).\n\n\"=\" is not punctuation nor a digit nor an alphabetic character. 
Its properties\nare thus represented by the binary number 00000 (0 in hexadecimal).\n\nJapanese or Chinese alphabetic character properties are represented by the\nbinary number 00001 (1 in hexadecimal): they are alphabetic, but neither\nupper nor lower case.\n\nEXAMPLE (v3.02)\n---------------\n..................................................................\n110\nNULL 0 NULL 0\nN 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N\nY 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y\n1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1\n9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9\na 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a\n. . .\n..................................................................\n\nCAVEATS\n-------\nAlthough the unicharset reader maintains the ability to read unicharsets\nof older formats and will assign default values to missing fields,\nthe accuracy will be degraded.\n\nFurther, most other data files are indexed by the unicharset file,\nso changing it without re-generating the others is likely to have dire\nconsequences.\n\nHISTORY\n-------\nThe unicharset format first appeared with Tesseract 2.00, which was the\nfirst version to support languages other than English. The unicharset file\ncontained only the first two fields, and the \"ispunctuation\" property was\nabsent (punctuation was regarded as \"0\", as \"=\" is in the above example).\n\nSEE ALSO\n--------\ntesseract(1), combine_tessdata(1), unicharset_extractor(1)\n\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/unicharset_extractor.1.asc",
    "content": "UNICHARSET_EXTRACTOR(1)\n=======================\n\nNAME\n----\nunicharset_extractor - Reads box or plain text files to extract the unicharset.\n\nSYNOPSIS\n--------\n*unicharset_extractor*  [--output_unicharset filename] [--norm_mode mode] box_or_text_file [...]\n\nWhere mode means:\n 1=combine graphemes (use for Latin and other simple scripts)\n 2=split graphemes (use for Indic/Khmer/Myanmar)\n 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n\nDESCRIPTION\n-----------\nTesseract needs to know the set of possible characters it can output.\nTo generate the unicharset data file, use the unicharset_extractor\nprogram on training pages bounding box files or a plain text file:\n\n    unicharset_extractor fontfile_1.box fontfile_2.box ...\n\nThe unicharset will be put into the file './unicharset' if no output filename is provided.\n\n*NOTE* Use the appropriate norm_mode based on the language.\n\nSEE ALSO\n--------\ntesseract(1), unicharset(5)\n\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nHISTORY\n-------\nunicharset_extractor first appeared in Tesseract 2.00.\n\nCOPYING\n-------\nCopyright \\(C) 2006, Google Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "doc/wordlist2dawg.1.asc",
    "content": "WORDLIST2DAWG(1)\n================\n:doctype: manpage\n\nNAME\n----\nwordlist2dawg - convert a wordlist to a DAWG for Tesseract\n\nSYNOPSIS\n--------\n*wordlist2dawg* 'WORDLIST' 'DAWG' 'lang.unicharset'\n\n*wordlist2dawg* -t 'WORDLIST' 'DAWG' 'lang.unicharset'\n\n*wordlist2dawg* -r 1 'WORDLIST' 'DAWG' 'lang.unicharset'\n\n*wordlist2dawg* -r 2 'WORDLIST' 'DAWG' 'lang.unicharset'\n\n*wordlist2dawg* -l <short> <long> 'WORDLIST' 'DAWG' 'lang.unicharset'\n\nDESCRIPTION\n-----------\nwordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph\n(DAWG) for use with Tesseract.  A DAWG is a compressed, space and time\nefficient representation of a word list.\n\nOPTIONS\n-------\n-t\n\tVerify that a given dawg file is equivalent to a given wordlist.\n\n-r 1\n\tReverse a word if it contains an RTL character.\n\n-r 2\n\tReverse all words.\n\n-l <short> <long>\n\tProduce a file with several dawgs in it, one each for words\n\tof length <short>, <short+1>,... <long>\n\nARGUMENTS\n---------\n\n'WORDLIST'\n\tA plain text file in UTF-8, one word per line.\n\n'DAWG'\n\tThe output DAWG to write.\n\n'lang.unicharset'\n\tThe unicharset of the language. This is the unicharset\n\tgenerated by mftraining(1).\n\nSEE ALSO\n--------\ntesseract(1), combine_tessdata(1), dawg2wordlist(1)\n\n<https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html>\n\nCOPYING\n-------\nCopyright \\(C) 2006 Google, Inc.\nLicensed under the Apache License, Version 2.0\n\nAUTHOR\n------\nThe Tesseract OCR engine was written by Ray Smith and his research groups\nat Hewlett Packard (1985-1995) and Google (2006-2018).\n"
  },
  {
    "path": "include/tesseract/baseapi.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        baseapi.h\n// Description: Simple API for calling tesseract.\n// Author:      Ray Smith\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_API_BASEAPI_H_\n#define TESSERACT_API_BASEAPI_H_\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n\n#include \"export.h\"\n#include \"pageiterator.h\"\n#include \"publictypes.h\"\n#include \"resultiterator.h\"\n#include \"unichar.h\"\n\n#include <tesseract/version.h>\n\n#include <cstdio>\n#include <vector> // for std::vector\n\nstruct Pix;\nstruct Pixa;\nstruct Boxa;\n\nnamespace tesseract {\n\nclass PAGE_RES;\nclass ParagraphModel;\nclass BLOCK_LIST;\nclass ETEXT_DESC;\nstruct OSResults;\nclass UNICHARSET;\n\nclass Dawg;\nclass Dict;\nclass EquationDetect;\nclass PageIterator;\nclass ImageThresholder;\nclass LTRResultIterator;\nclass ResultIterator;\nclass MutableIterator;\nclass TessResultRenderer;\nclass Tesseract;\n\n// Function to read a std::vector<char> from a whole file.\n// Returns false on failure.\nusing FileReader = bool (*)(const char *filename, std::vector<char> *data);\n\nusing DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,\n                               bool) const;\nusing ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,\n                                                  int, const char *, int);\n\n/**\n 
* Base class for all tesseract APIs.\n * Specific classes can add ability to work on different inputs or produce\n * different outputs.\n * This class is mostly an interface layer on top of the Tesseract instance\n * class to hide the data types so that users of this class don't have to\n * include any other Tesseract headers.\n */\nclass TESS_API TessBaseAPI {\npublic:\n  TessBaseAPI();\n  virtual ~TessBaseAPI();\n  // Copy constructor and assignment operator are currently unsupported.\n  TessBaseAPI(TessBaseAPI const &) = delete;\n  TessBaseAPI &operator=(TessBaseAPI const &) = delete;\n\n  /**\n   * Returns the version identifier as a static string. Do not delete.\n   */\n  static const char *Version();\n\n  /**\n   * Set the name of the input file. Needed for training and\n   * reading a UNLV zone file, and for searchable PDF output.\n   */\n  void SetInputName(const char *name);\n  /**\n   * These functions are required for searchable PDF output.\n   * We need our hands on the input file so that we can include\n   * it in the PDF without transcoding. If that is not possible,\n   * we need the original image. Finally, resolution metadata\n   * is stored in the PDF so we need that as well.\n   */\n  const char *GetInputName();\n  // Takes ownership of the input pix.\n  void SetInputImage(Pix *pix);\n  Pix *GetInputImage();\n  int GetSourceYResolution();\n  const char *GetDatapath();\n\n  /** Set the name of the bonus output files. Needed only for debugging. 
*/\n  void SetOutputName(const char *name);\n\n  /**\n   * Set the value of an internal \"parameter.\"\n   * Supply the name of the parameter and the value as a string, just as\n   * you would in a config file.\n   * Returns false if the name lookup failed.\n   * Eg SetVariable(\"tessedit_char_blacklist\", \"xyz\"); to ignore x, y and z.\n   * Or SetVariable(\"classify_bln_numeric_mode\", \"1\"); to set numeric-only mode.\n   * SetVariable may be used before Init, but settings will revert to\n   * defaults on End().\n   *\n   * Note: Must be called after Init(). Only works for non-init variables\n   * (init variables should be passed to Init()).\n   */\n  bool SetVariable(const char *name, const char *value);\n  bool SetDebugVariable(const char *name, const char *value);\n\n  /**\n   * Returns true if the parameter was found among Tesseract parameters.\n   * Fills in value with the value of the parameter.\n   */\n  bool GetIntVariable(const char *name, int *value) const;\n  bool GetBoolVariable(const char *name, bool *value) const;\n  bool GetDoubleVariable(const char *name, double *value) const;\n\n  /**\n   * Returns the pointer to the string that represents the value of the\n   * parameter if it was found among Tesseract parameters.\n   */\n  const char *GetStringVariable(const char *name) const;\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n  /**\n   * Print Tesseract fonts table to the given file.\n   */\n  void PrintFontsTable(FILE *fp) const;\n\n#endif\n\n  /**\n   * Print Tesseract parameters to the given file.\n   */\n  void PrintVariables(FILE *fp) const;\n\n  /**\n   * Get value of named variable as a string, if it exists.\n   */\n  bool GetVariableAsString(const char *name, std::string *val) const;\n\n  /**\n   * Instances are now mostly thread-safe and totally independent,\n   * but some global parameters remain. 
Basically it is safe to use multiple\n   * TessBaseAPIs in different threads in parallel, UNLESS:\n   * you use SetVariable on some of the Params in classify and textord.\n   * If you do, then the effect will be to change it for all your instances.\n   *\n   * Start tesseract. Returns zero on success and -1 on failure.\n   * NOTE that the only members that may be called before Init are those\n   * listed above here in the class definition.\n   *\n   * The datapath must be the name of the tessdata directory.\n   * The language is (usually) an ISO 639-3 string or nullptr will default to\n   * eng. It is entirely safe (and eventually will be efficient too) to call\n   * Init multiple times on the same instance to change language, or just\n   * to reset the classifier.\n   * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating\n   * that multiple languages are to be loaded. Eg hin+eng will load Hindi and\n   * English. Languages may specify internally that they want to be loaded\n   * with one or more other languages, so the ~ sign is available to override\n   * that. Eg if hin were set to load eng by default, then hin+~eng would force\n   * loading only hin. The number of loaded languages is limited only by\n   * memory, with the caveat that loading additional languages will impact\n   * both speed and accuracy, as there is more work to do to decide on the\n   * applicable language, and there is more chance of hallucinating incorrect\n   * words.\n   * WARNING: On changing languages, all Tesseract parameters are reset\n   * back to their default values. (Which may vary between languages.)\n   * If you have a rare need to set a Variable that controls\n   * initialization for a second call to Init you should explicitly\n   * call End() and then use SetVariable before Init. 
This is only a very\n   * rare use case, since there are very few uses that require any parameters\n   * to be set before Init.\n   *\n   * If set_only_non_debug_params is true, only params that do not contain\n   * \"debug\" in the name will be set.\n   */\n  int Init(const char *datapath, const char *language, OcrEngineMode mode,\n           char **configs, int configs_size,\n           const std::vector<std::string> *vars_vec,\n           const std::vector<std::string> *vars_values,\n           bool set_only_non_debug_params);\n  int Init(const char *datapath, const char *language, OcrEngineMode oem) {\n    return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);\n  }\n  int Init(const char *datapath, const char *language) {\n    return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,\n                false);\n  }\n  // In-memory version reads the traineddata file directly from the given\n  // data[data_size] array, and/or reads data via a FileReader.\n  int Init(const char *data, int data_size, const char *language,\n           OcrEngineMode mode, char **configs, int configs_size,\n           const std::vector<std::string> *vars_vec,\n           const std::vector<std::string> *vars_values,\n           bool set_only_non_debug_params, FileReader reader);\n\n  /**\n   * Returns the languages string used in the last valid initialization.\n   * If the last initialization specified \"deu+hin\" then that will be\n   * returned. If hin loaded eng automatically as well, then that will\n   * not be included in this list. 
To find the languages actually\n   * loaded use GetLoadedLanguagesAsVector.\n   * The returned string should NOT be deleted.\n   */\n  const char *GetInitLanguagesAsString() const;\n\n  /**\n   * Returns the loaded languages in the vector of std::string.\n   * Includes all languages loaded by the last Init, including those loaded\n   * as dependencies of other loaded languages.\n   */\n  void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const;\n\n  /**\n   * Returns the available languages in the sorted vector of std::string.\n   */\n  void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const;\n\n  /**\n   * Init only for page layout analysis. Use only for calls to SetImage and\n   * AnalysePage. Calls that attempt recognition will generate an error.\n   */\n  void InitForAnalysePage();\n\n  /**\n   * Read a \"config\" file containing a set of param, value pairs.\n   * Searches the standard places: tessdata/configs, tessdata/tessconfigs\n   * and also accepts a relative or absolute path name.\n   * Note: only non-init params will be set (init params are set by Init()).\n   */\n  void ReadConfigFile(const char *filename);\n  /** Same as above, but only set debug params from the given config file. */\n  void ReadDebugConfigFile(const char *filename);\n\n  /**\n   * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.\n   * The mode is stored as an IntParam so it can also be modified by\n   * ReadConfigFile or SetVariable(\"tessedit_pageseg_mode\", mode as string).\n   */\n  void SetPageSegMode(PageSegMode mode);\n\n  /** Return the current page segmentation mode. 
*/\n  PageSegMode GetPageSegMode() const;\n\n  /**\n   * Recognize a rectangle from an image and return the result as a string.\n   * May be called many times for a single Init.\n   * Currently has no error checking.\n   * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.\n   * Palette color images will not work properly and must be converted to\n   * 24 bit.\n   * Binary images of 1 bit per pixel may also be given but they must be\n   * byte packed with the MSB of the first byte being the first pixel, and a\n   * 1 represents WHITE. For binary images set bytes_per_pixel=0.\n   * The recognized text is returned as a char* which is coded\n   * as UTF8 and must be freed with the delete [] operator.\n   *\n   * Note that TesseractRect is the simplified convenience interface.\n   * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,\n   * and one or more of the Get*Text functions below.\n   */\n  char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,\n                      int bytes_per_line, int left, int top, int width,\n                      int height);\n\n  /**\n   * Call between pages or documents etc to free up memory and forget\n   * adaptive data.\n   */\n  void ClearAdaptiveClassifier();\n\n  /**\n   * @defgroup AdvancedAPI Advanced API\n   * The following methods break TesseractRect into pieces, so you can\n   * get hold of the thresholded image, get the text in different formats,\n   * get bounding boxes, confidences etc.\n   */\n  /* @{ */\n\n  /**\n   * Provide an image for Tesseract to recognize. Format is as\n   * TesseractRect above. 
Copies the image buffer and converts to Pix.\n   * SetImage clears all recognition results, and sets the rectangle to the\n   * full image, so it may be followed immediately by a GetUTF8Text, and it\n   * will automatically perform recognition.\n   */\n  void SetImage(const unsigned char *imagedata, int width, int height,\n                int bytes_per_pixel, int bytes_per_line);\n\n  /**\n   * Provide an image for Tesseract to recognize. As with SetImage above,\n   * Tesseract takes its own copy of the image, so it need not persist until\n   * after Recognize.\n   * Pix vs raw, which to use?\n   * Use Pix where possible. Tesseract uses Pix as its internal representation\n   * and it is therefore more efficient to provide a Pix directly.\n   */\n  void SetImage(Pix *pix);\n\n  /**\n   * Set the resolution of the source image in pixels per inch so font size\n   * information can be calculated in results.  Call this after SetImage().\n   */\n  void SetSourceResolution(int ppi);\n\n  /**\n   * Restrict recognition to a sub-rectangle of the image. 
Call after SetImage.\n   * Each SetRectangle clears the recognition results so multiple rectangles\n   * can be recognized with the same image.\n   */\n  void SetRectangle(int left, int top, int width, int height);\n\n  /**\n   * Get a copy of the internal thresholded image from Tesseract.\n   * Caller takes ownership of the Pix and must pixDestroy it.\n   * May be called any time after SetImage, or after TesseractRect.\n   */\n  Pix *GetThresholdedImage();\n\n  /**\n   * Return average gradient of lines on page.\n   */\n  float GetGradient();\n\n  /**\n   * Get the result of page layout analysis as a leptonica-style\n   * Boxa, Pixa pair, in reading order.\n   * Can be called before or after Recognize.\n   */\n  Boxa *GetRegions(Pixa **pixa);\n\n  /**\n   * Get the textlines as a leptonica-style\n   * Boxa, Pixa pair, in reading order.\n   * Can be called before or after Recognize.\n   * If raw_image is true, then extract from the original image instead of the\n   * thresholded image and pad by raw_padding pixels.\n   * If blockids is not nullptr, the block-id of each line is also returned as\n   * an array of one element per line. delete [] after use. If paraids is not\n   * nullptr, the paragraph-id of each line within its block is also returned as\n   * an array of one element per line. delete [] after use.\n   */\n  Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,\n                     int **blockids, int **paraids);\n  /*\n   Helper method to extract from the thresholded image. (most common usage)\n*/\n  Boxa *GetTextlines(Pixa **pixa, int **blockids) {\n    return GetTextlines(false, 0, pixa, blockids, nullptr);\n  }\n\n  /**\n   * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa\n   * pair, in reading order. 
Enables downstream handling of non-rectangular\n   * regions.\n   * Can be called before or after Recognize.\n   * If blockids is not nullptr, the block-id of each line is also returned as\n   * an array of one element per line. delete [] after use.\n   */\n  Boxa *GetStrips(Pixa **pixa, int **blockids);\n\n  /**\n   * Get the words as a leptonica-style\n   * Boxa, Pixa pair, in reading order.\n   * Can be called before or after Recognize.\n   */\n  Boxa *GetWords(Pixa **pixa);\n\n  /**\n   * Gets the individual connected (text) components (created\n   * after pages segmentation step, but before recognition)\n   * as a leptonica-style Boxa, Pixa pair, in reading order.\n   * Can be called before or after Recognize.\n   * Note: the caller is responsible for calling boxaDestroy()\n   * on the returned Boxa array and pixaDestroy() on cc array.\n   */\n  Boxa *GetConnectedComponents(Pixa **cc);\n\n  /**\n   * Get the given level kind of components (block, textline, word etc.) as a\n   * leptonica-style Boxa, Pixa pair, in reading order.\n   * Can be called before or after Recognize.\n   * If blockids is not nullptr, the block-id of each component is also returned\n   * as an array of one element per component. delete [] after use.\n   * If blockids is not nullptr, the paragraph-id of each component with its\n   * block is also returned as an array of one element per component. delete []\n   * after use. If raw_image is true, then portions of the original image are\n   * extracted instead of the thresholded image and padded with raw_padding. 
If\n   * text_only is true, then only text components are returned.\n   */\n  Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,\n                           bool raw_image, int raw_padding, Pixa **pixa,\n                           int **blockids, int **paraids);\n  // Helper function to get binary images with no padding (most common usage).\n  Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,\n                           Pixa **pixa, int **blockids) {\n    return GetComponentImages(level, text_only, false, 0, pixa, blockids,\n                              nullptr);\n  }\n\n  /**\n   * Returns the scale factor of the thresholded image that would be returned by\n   * GetThresholdedImage() and the various GetX() methods that call\n   * GetComponentImages().\n   * Returns 0 if no thresholder has been set.\n   */\n  int GetThresholdedImageScaleFactor() const;\n\n  /**\n   * Runs page layout analysis in the mode set by SetPageSegMode.\n   * May optionally be called prior to Recognize to get access to just\n   * the page layout results. Returns an iterator to the results.\n   * If merge_similar_words is true, words are combined where suitable for use\n   * with a line recognizer. Use if you want to use AnalyseLayout to find the\n   * textlines, and then want to process textline fragments with an external\n   * line recognizer.\n   * Returns nullptr on error or an empty page.\n   * The returned iterator must be deleted after use.\n   * WARNING! 
This class points to data held within the TessBaseAPI class, and\n   * therefore can only be used while the TessBaseAPI class still exists and\n   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n   * DetectOS, or anything else that changes the internal PAGE_RES.\n   */\n  PageIterator *AnalyseLayout();\n  PageIterator *AnalyseLayout(bool merge_similar_words);\n\n  /**\n   * Recognize the image from SetAndThresholdImage, generating Tesseract\n   * internal structures. Returns 0 on success.\n   * Optional. The Get*Text functions below will call Recognize if needed.\n   * After Recognize, the output is kept internally until the next SetImage.\n   */\n  int Recognize(ETEXT_DESC *monitor);\n\n  /**\n   * Methods to retrieve information after SetAndThresholdImage(),\n   * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)\n   */\n\n  /**\n   * Turns images into symbolic text.\n   *\n   * filename can point to a single image, a multi-page TIFF,\n   * or a plain text list of image filenames.\n   *\n   * retry_config is useful for debugging. If not nullptr, you can fall\n   * back to an alternate configuration if a page fails for some\n   * reason.\n   *\n   * timeout_millisec terminates processing if any single page\n   * takes too long. Set to 0 for unlimited time.\n   *\n   * renderer is responsible for creating the output. For example,\n   * use the TessTextRenderer if you want plaintext output, or\n   * the TessPDFRender to produce searchable PDF.\n   *\n   * If tessedit_page_number is non-negative, will only process that\n   * single page. 
Works for multi-page tiff file, or filelist.\n   *\n   * Returns true if successful, false on error.\n   */\n  bool ProcessPages(const char *filename, const char *retry_config,\n                    int timeout_millisec, TessResultRenderer *renderer);\n  // Does the real work of ProcessPages.\n  bool ProcessPagesInternal(const char *filename, const char *retry_config,\n                            int timeout_millisec, TessResultRenderer *renderer);\n\n  /**\n   * Turn a single image into symbolic text.\n   *\n   * The pix is the image processed. filename and page_index are\n   * metadata used by side-effect processes, such as reading a box\n   * file or formatting as hOCR.\n   *\n   * See ProcessPages for descriptions of other parameters.\n   */\n  bool ProcessPage(Pix *pix, int page_index, const char *filename,\n                   const char *retry_config, int timeout_millisec,\n                   TessResultRenderer *renderer);\n\n  /**\n   * Get a reading-order iterator to the results of LayoutAnalysis and/or\n   * Recognize. The returned iterator must be deleted after use.\n   * WARNING! This class points to data held within the TessBaseAPI class, and\n   * therefore can only be used while the TessBaseAPI class still exists and\n   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n   * DetectOS, or anything else that changes the internal PAGE_RES.\n   */\n  ResultIterator *GetIterator();\n\n  /**\n   * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.\n   * The returned iterator must be deleted after use.\n   * WARNING! 
This class points to data held within the TessBaseAPI class, and\n   * therefore can only be used while the TessBaseAPI class still exists and\n   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n   * DetectOS, or anything else that changes the internal PAGE_RES.\n   */\n  MutableIterator *GetMutableIterator();\n\n  /**\n   * The recognized text is returned as a char* which is coded\n   * as UTF8 and must be freed with the delete [] operator.\n   */\n  char *GetUTF8Text();\n\n  /**\n   * Make a HTML-formatted string with hOCR markup from the internal\n   * data structures.\n   * page_number is 0-based but will appear in the output as 1-based.\n   * monitor can be used to\n   *  cancel the recognition\n   *  receive progress callbacks\n   * Returned string must be freed with the delete [] operator.\n   */\n  char *GetHOCRText(ETEXT_DESC *monitor, int page_number);\n\n  /**\n   * Make a HTML-formatted string with hOCR markup from the internal\n   * data structures.\n   * page_number is 0-based but will appear in the output as 1-based.\n   * Returned string must be freed with the delete [] operator.\n   */\n  char *GetHOCRText(int page_number);\n\n  /**\n   * Make an XML-formatted string with Alto markup from the internal\n   * data structures.\n   */\n  char *GetAltoText(ETEXT_DESC *monitor, int page_number);\n\n  /**\n   * Make an XML-formatted string with Alto markup from the internal\n   * data structures.\n   */\n  char *GetAltoText(int page_number);\n\n   /**\n   * Make an XML-formatted string with PAGE markup from the internal\n   * data structures.\n   */\n  char *GetPAGEText(ETEXT_DESC *monitor, int page_number);\n\n  /**\n   * Make an XML-formatted string with PAGE markup from the internal\n   * data structures.\n   */\n  char *GetPAGEText(int page_number);\n\n  /**\n   * Make a TSV-formatted string from the internal data structures.\n   * page_number is 0-based but will appear in the output as 1-based.\n   * Returned string must 
be freed with the delete [] operator.\n   */\n  char *GetTSVText(int page_number);\n\n  /**\n   * Make a box file for LSTM training from the internal data structures.\n   * Constructs coordinates in the original image - not just the rectangle.\n   * page_number is a 0-based page index that will appear in the box file.\n   * Returned string must be freed with the delete [] operator.\n   */\n  char *GetLSTMBoxText(int page_number);\n\n  /**\n   * The recognized text is returned as a char* which is coded in the same\n   * format as a box file used in training.\n   * Constructs coordinates in the original image - not just the rectangle.\n   * page_number is a 0-based page index that will appear in the box file.\n   * Returned string must be freed with the delete [] operator.\n   */\n  char *GetBoxText(int page_number);\n\n  /**\n   * The recognized text is returned as a char* which is coded in the same\n   * format as a WordStr box file used in training.\n   * page_number is a 0-based page index that will appear in the box file.\n   * Returned string must be freed with the delete [] operator.\n   */\n  char *GetWordStrBoxText(int page_number);\n\n  /**\n   * The recognized text is returned as a char* which is coded\n   * as UNLV format Latin-1 with specific reject and suspect codes.\n   * Returned string must be freed with the delete [] operator.\n   */\n  char *GetUNLVText();\n\n  /**\n   * Detect the orientation of the input image and apparent script (alphabet).\n   * orient_deg is the detected clockwise rotation of the input image in degrees\n   * (0, 90, 180, 270)\n   * orient_conf is the confidence (15.0 is reasonably confident)\n   * script_name is an ASCII string, the name of the script, e.g. 
\"Latin\"\n   * script_conf is confidence level in the script\n   * Returns true on success and writes values to each parameter as an output\n   */\n  bool DetectOrientationScript(int *orient_deg, float *orient_conf,\n                               const char **script_name, float *script_conf);\n\n  /**\n   * The recognized text is returned as a char* which is coded\n   * as UTF8 and must be freed with the delete [] operator.\n   * page_number is a 0-based page index that will appear in the osd file.\n   */\n  char *GetOsdText(int page_number);\n\n  /** Returns the (average) confidence value between 0 and 100. */\n  int MeanTextConf();\n  /**\n   * Returns all word confidences (between 0 and 100) in an array, terminated\n   * by -1.  The calling function must delete [] after use.\n   * The number of confidences should correspond to the number of space-\n   * delimited words in GetUTF8Text.\n   */\n  int *AllWordConfidences();\n\n#ifndef DISABLED_LEGACY_ENGINE\n  /**\n   * Applies the given word to the adaptive classifier if possible.\n   * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can\n   * tell the boundaries of the graphemes.\n   * Assumes that SetImage/SetRectangle have been used to set the image\n   * to the given word. The mode arg should be PSM_SINGLE_WORD or\n   * PSM_CIRCLE_WORD, as that will be used to control layout analysis.\n   * The currently set PageSegMode is preserved.\n   * Returns false if adaption was not possible for some reason.\n   */\n  bool AdaptToWordStr(PageSegMode mode, const char *wordstr);\n#endif //  ndef DISABLED_LEGACY_ENGINE\n\n  /**\n   * Free up recognition results and any stored image data, without actually\n   * freeing any recognition data that would be time-consuming to reload.\n   * Afterwards, you must call SetImage or TesseractRect before doing\n   * any Recognize or Get* operation.\n   */\n  void Clear();\n\n  /**\n   * Close down tesseract and free up all memory. 
End() is equivalent to\n   * destructing and reconstructing your TessBaseAPI.\n   * Once End() has been used, none of the other API functions may be used\n   * other than Init and anything declared above it in the class definition.\n   */\n  void End();\n\n  /**\n   * Clear any library-level memory caches.\n   * There are a variety of expensive-to-load constant data structures (mostly\n   * language dictionaries) that are cached globally -- surviving the Init()\n   * and End() of individual TessBaseAPI's.  This function allows the clearing\n   * of these caches.\n   **/\n  static void ClearPersistentCache();\n\n  /**\n   * Check whether a word is valid according to Tesseract's language model\n   * @return 0 if the word is invalid, non-zero if valid.\n   * @warning temporary! This function will be removed from here and placed\n   * in a separate API at some future time.\n   */\n  int IsValidWord(const char *word) const;\n  // Returns true if utf8_character is defined in the UniCharset.\n  bool IsValidCharacter(const char *utf8_character) const;\n\n  bool GetTextDirection(int *out_offset, float *out_slope);\n\n  /** Sets Dict::letter_is_okay_ function to point to the given function. */\n  void SetDictFunc(DictFunc f);\n\n  /** Sets Dict::probability_in_context_ function to point to the given\n   * function.\n   */\n  void SetProbabilityInContextFunc(ProbabilityInContextFunc f);\n\n  /**\n   * Estimates the Orientation And Script of the image.\n   * @return true if the image was processed successfully.\n   */\n  bool DetectOS(OSResults *);\n\n  /**\n   * Return text orientation of each block as determined by an earlier run\n   * of layout analysis.\n   */\n  void GetBlockTextOrientations(int **block_orientation,\n                                bool **vertical_writing);\n\n  /** This method returns the string form of the specified unichar. */\n  const char *GetUnichar(int unichar_id) const;\n\n  /** Return the pointer to the i-th dawg loaded into tesseract_ object. 
*/\n  const Dawg *GetDawg(int i) const;\n\n  /** Return the number of dawgs loaded into tesseract_ object. */\n  int NumDawgs() const;\n\n  Tesseract *tesseract() const {\n    return tesseract_;\n  }\n\n  OcrEngineMode oem() const {\n    return last_oem_requested_;\n  }\n\n  void set_min_orientation_margin(double margin);\n  /* @} */\n\nprotected:\n  /** Common code for setting the image. Returns true if Init has been called.\n   */\n  bool InternalSetImage();\n\n  /**\n   * Run the thresholder to make the thresholded image. If pix is not nullptr,\n   * the source is thresholded to pix instead of the internal IMAGE.\n   */\n  virtual bool Threshold(Pix **pix);\n\n  /**\n   * Find lines from the image making the BLOCK_LIST.\n   * @return 0 on success.\n   */\n  int FindLines();\n\n  /** Delete the pageres and block list ready for a new page. */\n  void ClearResults();\n\n  /**\n   * Return an LTR Result Iterator -- used only for training, as we really want\n   * to ignore all BiDi smarts at that point.\n   * delete once you're done with it.\n   */\n  LTRResultIterator *GetLTRIterator();\n\n  /**\n   * Return the length of the output text string, as UTF8, assuming\n   * one newline per line and one per block, with a terminator,\n   * and assuming a single character reject marker for each rejected character.\n   * Also return the number of recognized blobs in blob_count.\n   */\n  int TextLength(int *blob_count) const;\n\n  //// paragraphs.cpp ////////////////////////////////////////////////////\n  void DetectParagraphs(bool after_text_recognition);\n\n  const PAGE_RES *GetPageRes() const {\n    return page_res_;\n  }\n\nprotected:\n  Tesseract *tesseract_;          ///< The underlying data object.\n  Tesseract *osd_tesseract_;      ///< For orientation & script detection.\n  EquationDetect *equ_detect_;    ///< The equation detector.\n  FileReader reader_;             ///< Reads files from any filesystem.\n  ImageThresholder *thresholder_; ///< Image thresholding 
module.\n  std::vector<ParagraphModel *> *paragraph_models_;\n  BLOCK_LIST *block_list_;           ///< The page layout.\n  PAGE_RES *page_res_;               ///< The page-level data.\n  std::string input_file_;           ///< Name used by training code.\n  std::string output_file_;          ///< Name used by debug code.\n  std::string datapath_;             ///< Current location of tessdata.\n  std::string language_;             ///< Last initialized language.\n  OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.\n  bool recognition_done_;            ///< page_res_ contains recognition data.\n\n  /**\n   * @defgroup ThresholderParams Thresholder Parameters\n   * Parameters saved from the Thresholder. Needed to rebuild coordinates.\n   */\n  /* @{ */\n  int rect_left_;\n  int rect_top_;\n  int rect_width_;\n  int rect_height_;\n  int image_width_;\n  int image_height_;\n  /* @} */\n\nprivate:\n  // A list of image filenames gets special consideration\n  bool ProcessPagesFileList(FILE *fp, std::string *buf,\n                            const char *retry_config, int timeout_millisec,\n                            TessResultRenderer *renderer,\n                            int tessedit_page_number);\n  // TIFF supports multipage so gets special consideration.\n  bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,\n                                 const char *filename, const char *retry_config,\n                                 int timeout_millisec,\n                                 TessResultRenderer *renderer,\n                                 int tessedit_page_number);\n}; // class TessBaseAPI.\n\n/** Escape a char string - replace &<>\"' with HTML codes. */\nstd::string HOcrEscape(const char *text);\n\n} // namespace tesseract\n\n#endif // TESSERACT_API_BASEAPI_H_\n"
  },
  {
    "path": "include/tesseract/capi.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        capi.h\n// Description: C-API TessBaseAPI\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef API_CAPI_H_\n#define API_CAPI_H_\n\n#include \"export.h\"\n\n#ifdef __cplusplus\n#  include <tesseract/baseapi.h>\n#  include <tesseract/ocrclass.h>\n#  include <tesseract/pageiterator.h>\n#  include <tesseract/renderer.h>\n#  include <tesseract/resultiterator.h>\n#endif\n\n#include <stdbool.h>\n#include <stdio.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#ifndef BOOL\n#  define BOOL int\n#  define TRUE 1\n#  define FALSE 0\n#endif\n\n#ifdef __cplusplus\ntypedef tesseract::TessResultRenderer TessResultRenderer;\ntypedef tesseract::TessBaseAPI TessBaseAPI;\ntypedef tesseract::PageIterator TessPageIterator;\ntypedef tesseract::ResultIterator TessResultIterator;\ntypedef tesseract::MutableIterator TessMutableIterator;\ntypedef tesseract::ChoiceIterator TessChoiceIterator;\ntypedef tesseract::OcrEngineMode TessOcrEngineMode;\ntypedef tesseract::PageSegMode TessPageSegMode;\ntypedef tesseract::PageIteratorLevel TessPageIteratorLevel;\ntypedef tesseract::Orientation TessOrientation;\ntypedef tesseract::ParagraphJustification TessParagraphJustification;\ntypedef tesseract::WritingDirection TessWritingDirection;\ntypedef tesseract::TextlineOrder TessTextlineOrder;\ntypedef tesseract::PolyBlockType TessPolyBlockType;\ntypedef tesseract::ETEXT_DESC 
ETEXT_DESC;\n#else\ntypedef struct TessResultRenderer TessResultRenderer;\ntypedef struct TessBaseAPI TessBaseAPI;\ntypedef struct TessPageIterator TessPageIterator;\ntypedef struct TessResultIterator TessResultIterator;\ntypedef struct TessMutableIterator TessMutableIterator;\ntypedef struct TessChoiceIterator TessChoiceIterator;\ntypedef enum TessOcrEngineMode {\n  OEM_TESSERACT_ONLY,\n  OEM_LSTM_ONLY,\n  OEM_TESSERACT_LSTM_COMBINED,\n  OEM_DEFAULT\n} TessOcrEngineMode;\ntypedef enum TessPageSegMode {\n  PSM_OSD_ONLY,\n  PSM_AUTO_OSD,\n  PSM_AUTO_ONLY,\n  PSM_AUTO,\n  PSM_SINGLE_COLUMN,\n  PSM_SINGLE_BLOCK_VERT_TEXT,\n  PSM_SINGLE_BLOCK,\n  PSM_SINGLE_LINE,\n  PSM_SINGLE_WORD,\n  PSM_CIRCLE_WORD,\n  PSM_SINGLE_CHAR,\n  PSM_SPARSE_TEXT,\n  PSM_SPARSE_TEXT_OSD,\n  PSM_RAW_LINE,\n  PSM_COUNT\n} TessPageSegMode;\ntypedef enum TessPageIteratorLevel {\n  RIL_BLOCK,\n  RIL_PARA,\n  RIL_TEXTLINE,\n  RIL_WORD,\n  RIL_SYMBOL\n} TessPageIteratorLevel;\ntypedef enum TessPolyBlockType {\n  PT_UNKNOWN,\n  PT_FLOWING_TEXT,\n  PT_HEADING_TEXT,\n  PT_PULLOUT_TEXT,\n  PT_EQUATION,\n  PT_INLINE_EQUATION,\n  PT_TABLE,\n  PT_VERTICAL_TEXT,\n  PT_CAPTION_TEXT,\n  PT_FLOWING_IMAGE,\n  PT_HEADING_IMAGE,\n  PT_PULLOUT_IMAGE,\n  PT_HORZ_LINE,\n  PT_VERT_LINE,\n  PT_NOISE,\n  PT_COUNT\n} TessPolyBlockType;\ntypedef enum TessOrientation {\n  ORIENTATION_PAGE_UP,\n  ORIENTATION_PAGE_RIGHT,\n  ORIENTATION_PAGE_DOWN,\n  ORIENTATION_PAGE_LEFT\n} TessOrientation;\ntypedef enum TessParagraphJustification {\n  JUSTIFICATION_UNKNOWN,\n  JUSTIFICATION_LEFT,\n  JUSTIFICATION_CENTER,\n  JUSTIFICATION_RIGHT\n} TessParagraphJustification;\ntypedef enum TessWritingDirection {\n  WRITING_DIRECTION_LEFT_TO_RIGHT,\n  WRITING_DIRECTION_RIGHT_TO_LEFT,\n  WRITING_DIRECTION_TOP_TO_BOTTOM\n} TessWritingDirection;\ntypedef enum TessTextlineOrder {\n  TEXTLINE_ORDER_LEFT_TO_RIGHT,\n  TEXTLINE_ORDER_RIGHT_TO_LEFT,\n  TEXTLINE_ORDER_TOP_TO_BOTTOM\n} TessTextlineOrder;\ntypedef struct ETEXT_DESC 
ETEXT_DESC;\n#endif\n\ntypedef bool (*TessCancelFunc)(void *cancel_this, int words);\ntypedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,\n                                 int bottom);\n\nstruct Pix;\nstruct Boxa;\nstruct Pixa;\n\n/* General free functions */\n\nTESS_API const char *TessVersion();\n\n/**\n * Frees the memory allocated for the text string returned by\n * TessBaseAPIGetUTF8Text, TessBaseAPIGetHOCRText, etc.\n *\n * @param text The pointer to the string to be freed.\n */\nTESS_API void TessDeleteText(const char *text);\nTESS_API void TessDeleteTextArray(char **arr);\nTESS_API void TessDeleteIntArray(const int *arr);\n\n/* Renderer API */\nTESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,\n                                                     BOOL font_info);\nTESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessPAGERendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,\n                                                   const char *datadir,\n                                                   BOOL textonly);\nTESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);\nTESS_API TessResultRenderer *TessWordStrBoxRendererCreate(\n    const char *outputbase);\n\nTESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);\nTESS_API void TessResultRendererInsert(TessResultRenderer *renderer,\n                                       TessResultRenderer *next);\nTESS_API 
TessResultRenderer *TessResultRendererNext(\n    TessResultRenderer *renderer);\nTESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,\n                                              const char *title);\nTESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,\n                                         TessBaseAPI *api);\nTESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);\n\nTESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);\nTESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);\nTESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);\n\n/* Base API */\n\n/**\n * Creates a new instance of the Tesseract API.\n *\n * The lifecycle of the instance is:\n * 1. TessBaseAPICreate()\n * 2. TessBaseAPIInit3() (or similar)\n * 3. TessBaseAPISetImage2() (or similar)\n * 4. TessBaseAPIGetUTF8Text() (or similar)\n * 5. TessDeleteText()\n * 6. TessBaseAPIEnd() (optional, clears internal structures)\n * 7. 
TessBaseAPIDelete()\n *\n * The returned handle must be freed using TessBaseAPIDelete.\n *\n * @return A pointer to the new TessBaseAPI instance, or NULL on failure.\n */\nTESS_API TessBaseAPI *TessBaseAPICreate();\n\n/**\n * Frees the memory associated with a TessBaseAPI instance.\n *\n * @param handle The TessBaseAPI instance to be freed.\n */\nTESS_API void TessBaseAPIDelete(TessBaseAPI *handle);\n\nTESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);\nTESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);\n\nTESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);\nTESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);\n\nTESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);\nTESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);\n\nTESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);\n\nTESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,\n                                     const char *value);\nTESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,\n                                          const char *value);\n\nTESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,\n                                        const char *name, int *value);\nTESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,\n                                         const char *name, BOOL *value);\nTESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,\n                                           const char *name, double *value);\nTESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,\n                                                  const char *name);\n\nTESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);\nTESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,\n                                              const char 
*filename);\n\nTESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,\n                              const char *language, TessOcrEngineMode oem,\n                              char **configs, int configs_size);\nTESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,\n                              const char *language, TessOcrEngineMode oem);\n\n/**\n * Initializes the Tesseract engine.\n *\n * This function (or one of the other Init functions) must be called\n * before processing any images.\n *\n * @param handle The TessBaseAPI instance.\n * @param datapath The path to the tessdata directory. If NULL, the function\n *                 attempts to use the TESSDATA_PREFIX environment variable\n *                 or a compile-time default.\n * @param language The language code(s) (e.g., \"eng\", \"eng+deu\").\n * @return 0 on success, -1 on failure.\n */\nTESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,\n                              const char *language);\n\nTESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,\n                              const char *language, TessOcrEngineMode mode,\n                              char **configs, int configs_size, char **vars_vec,\n                              char **vars_values, size_t vars_vec_size,\n                              BOOL set_only_non_debug_params);\n\nTESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size,\n                              const char *language, TessOcrEngineMode mode,\n                              char **configs, int configs_size, char **vars_vec,\n                              char **vars_values, size_t vars_vec_size,\n                              BOOL set_only_non_debug_params);\n\nTESS_API const char *TessBaseAPIGetInitLanguagesAsString(\n    const TessBaseAPI *handle);\nTESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(\n    const TessBaseAPI *handle);\nTESS_API char 
**TessBaseAPIGetAvailableLanguagesAsVector(\n    const TessBaseAPI *handle);\n\nTESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);\n\nTESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,\n                                        const char *filename);\nTESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,\n                                             const char *filename);\n\nTESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,\n                                        TessPageSegMode mode);\nTESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);\n\nTESS_API char *TessBaseAPIRect(TessBaseAPI *handle,\n                               const unsigned char *imagedata,\n                               int bytes_per_pixel, int bytes_per_line,\n                               int left, int top, int width, int height);\n\nTESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);\n\nTESS_API void TessBaseAPISetImage(TessBaseAPI *handle,\n                                  const unsigned char *imagedata, int width,\n                                  int height, int bytes_per_pixel,\n                                  int bytes_per_line);\n\n/**\n * Sets the input image for recognition using a Leptonica Pix structure.\n *\n * @note Tesseract does NOT take ownership of the Pix structure. The caller\n * remains responsible for the memory and must call pixDestroy() on the\n * Pix pointer after it is no longer needed by the API (e.g. 
after recognition\n * or after clearing/ending the API).\n *\n * @param handle The TessBaseAPI instance.\n * @param pix A pointer to the Leptonica Pix structure.\n */\nTESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);\n\nTESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);\n\nTESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,\n                                      int width, int height);\n\nTESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);\nTESS_API float TessBaseAPIGetGradient(TessBaseAPI *handle);\nTESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,\n                                            struct Pixa **pixa);\nTESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,\n                                              struct Pixa **pixa,\n                                              int **blockids);\nTESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,\n                                               BOOL raw_image, int raw_padding,\n                                               struct Pixa **pixa,\n                                               int **blockids, int **paraids);\nTESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,\n                                           struct Pixa **pixa, int **blockids);\nTESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,\n                                          struct Pixa **pixa);\nTESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,\n                                                        struct Pixa **cc);\nTESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,\n                                                    TessPageIteratorLevel level,\n                                                    BOOL text_only,\n                                                    struct Pixa **pixa,\n                                         
           int **blockids);\nTESS_API struct Boxa *TessBaseAPIGetComponentImages1(\n    TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,\n    BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,\n    int **paraids);\n\nTESS_API int TessBaseAPIGetThresholdedImageScaleFactor(\n    const TessBaseAPI *handle);\n\nTESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);\n\nTESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);\n\nTESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,\n                                      const char *retry_config,\n                                      int timeout_millisec,\n                                      TessResultRenderer *renderer);\nTESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,\n                                     int page_index, const char *filename,\n                                     const char *retry_config,\n                                     int timeout_millisec,\n                                     TessResultRenderer *renderer);\n\nTESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);\nTESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(\n    TessBaseAPI *handle);\n\n/**\n * Recognizes the image and returns the result as a UTF-8 encoded string.\n *\n * The caller is responsible for freeing the returned string using\n * TessDeleteText.\n *\n * @param handle The TessBaseAPI instance.\n * @return A newly allocated string containing the recognized text, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);\n\n/**\n * Returns the HOCR text for the page.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @param page_number The page number (0-based).\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char 
*TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);\n\n/**\n * Returns the ALTO XML text for the page.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @param page_number The page number (0-based).\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);\n\n/**\n * Returns the PAGE XML text for the page.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @param page_number The page number (0-based).\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetPAGEText(TessBaseAPI *handle, int page_number);\n\n/**\n * Returns the TSV text for the page.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @param page_number The page number (0-based).\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);\n\n/**\n * Returns the box file text for the page.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @param page_number The page number (0-based).\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);\n\n/**\n * Returns the LSTM box file text for the page.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @param page_number The page number (0-based).\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);\n\n/**\n * Returns the WordStr box file text 
for the page.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @param page_number The page number (0-based).\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,\n                                            int page_number);\n\n/**\n * Returns the UNLV format text.\n *\n * The caller is responsible for freeing the returned string using TessDeleteText().\n *\n * @param handle The TessBaseAPI instance.\n * @return A newly allocated string, or NULL on error.\n */\nTESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);\nTESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);\n\nTESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);\n\n#ifndef DISABLED_LEGACY_ENGINE\nTESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,\n                                        TessPageSegMode mode,\n                                        const char *wordstr);\n#endif // #ifndef DISABLED_LEGACY_ENGINE\n\nTESS_API void TessBaseAPIClear(TessBaseAPI *handle);\nTESS_API void TessBaseAPIEnd(TessBaseAPI *handle);\n\nTESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);\nTESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,\n                                          float *out_slope);\n\nTESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);\n\nTESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n// Call TessDeleteText(*best_script_name) to free memory allocated by this\n// function\nTESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,\n                                                 int *orient_deg,\n                                                 float *orient_conf,\n                                                 const char **script_name,\n                   
                              float *script_conf);\n#endif // #ifndef DISABLED_LEGACY_ENGINE\n\nTESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,\n                                                 double margin);\n\nTESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);\n\nTESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);\n\nTESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,\n                                               int **block_orientation,\n                                               bool **vertical_writing);\n\n/* Page iterator */\n\nTESS_API void TessPageIteratorDelete(TessPageIterator *handle);\n\nTESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);\n\nTESS_API void TessPageIteratorBegin(TessPageIterator *handle);\n\nTESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,\n                                   TessPageIteratorLevel level);\n\nTESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,\n                                              TessPageIteratorLevel level);\n\nTESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,\n                                               TessPageIteratorLevel level,\n                                               TessPageIteratorLevel element);\n\nTESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,\n                                          TessPageIteratorLevel level,\n                                          int *left, int *top, int *right,\n                                          int *bottom);\n\nTESS_API TessPolyBlockType\nTessPageIteratorBlockType(const TessPageIterator *handle);\n\nTESS_API struct Pix *TessPageIteratorGetBinaryImage(\n    const TessPageIterator *handle, TessPageIteratorLevel level);\n\nTESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,\n                                              TessPageIteratorLevel 
level,\n                                              int padding,\n                                              struct Pix *original_image,\n                                              int *left, int *top);\n\nTESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,\n                                       TessPageIteratorLevel level, int *x1,\n                                       int *y1, int *x2, int *y2);\n\nTESS_API void TessPageIteratorOrientation(\n    TessPageIterator *handle, TessOrientation *orientation,\n    TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,\n    float *deskew_angle);\n\nTESS_API void TessPageIteratorParagraphInfo(\n    TessPageIterator *handle, TessParagraphJustification *justification,\n    BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);\n\n/* Result iterator */\n\nTESS_API void TessResultIteratorDelete(TessResultIterator *handle);\nTESS_API TessResultIterator *TessResultIteratorCopy(\n    const TessResultIterator *handle);\nTESS_API TessPageIterator *TessResultIteratorGetPageIterator(\n    TessResultIterator *handle);\nTESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(\n    const TessResultIterator *handle);\nTESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(\n    const TessResultIterator *handle);\n\nTESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,\n                                     TessPageIteratorLevel level);\nTESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,\n                                             TessPageIteratorLevel level);\nTESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,\n                                            TessPageIteratorLevel level);\nTESS_API const char *TessResultIteratorWordRecognitionLanguage(\n    const TessResultIterator *handle);\nTESS_API const char *TessResultIteratorWordFontAttributes(\n    const TessResultIterator *handle, BOOL 
*is_bold, BOOL *is_italic,\n    BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,\n    int *pointsize, int *font_id);\n\nTESS_API BOOL\nTessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);\nTESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);\nTESS_API BOOL\nTessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);\nTESS_API BOOL\nTessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);\nTESS_API BOOL\nTessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);\n\nTESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);\nTESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);\nTESS_API const char *TessChoiceIteratorGetUTF8Text(\n    const TessChoiceIterator *handle);\nTESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);\n\n/* Progress monitor */\n\nTESS_API ETEXT_DESC *TessMonitorCreate();\nTESS_API void TessMonitorDelete(ETEXT_DESC *monitor);\nTESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,\n                                       TessCancelFunc cancelFunc);\nTESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);\nTESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);\nTESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,\n                                         TessProgressFunc progressFunc);\nTESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);\nTESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif // API_CAPI_H_\n"
  },
  {
    "path": "include/tesseract/export.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        export.h\n// Description: Place holder\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_PLATFORM_H_\n#define TESSERACT_PLATFORM_H_\n\n#ifndef TESS_API\n#  if defined(_WIN32) || defined(__CYGWIN__)\n#    if defined(TESS_EXPORTS)\n#      define TESS_API __declspec(dllexport)\n#    elif defined(TESS_IMPORTS)\n#      define TESS_API __declspec(dllimport)\n#    else\n#      define TESS_API\n#    endif\n#  else\n#    if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)\n#      define TESS_API __attribute__((visibility(\"default\")))\n#    else\n#      define TESS_API\n#    endif\n#  endif\n#endif\n\n#endif // TESSERACT_PLATFORM_H_\n"
  },
  {
    "path": "include/tesseract/ltrresultiterator.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        ltrresultiterator.h\n// Description: Iterator for tesseract results in strict left-to-right\n//              order that avoids using tesseract internal data structures.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_\n#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_\n\n#include \"export.h\"       // for TESS_API\n#include \"pageiterator.h\" // for PageIterator\n#include \"publictypes.h\"  // for PageIteratorLevel\n#include \"unichar.h\"      // for StrongScriptDirection\n\nnamespace tesseract {\n\nclass BLOB_CHOICE_IT;\nclass PAGE_RES;\nclass WERD_RES;\n\nclass Tesseract;\n\n// Class to iterate over tesseract results, providing access to all levels\n// of the page hierarchy, without including any tesseract headers or having\n// to handle any tesseract structures.\n// WARNING! 
This class points to data held within the TessBaseAPI class, and\n// therefore can only be used while the TessBaseAPI class still exists and\n// has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n// DetectOS, or anything else that changes the internal PAGE_RES.\n// See tesseract/publictypes.h for the definition of PageIteratorLevel.\n// See also base class PageIterator, which contains the bulk of the interface.\n// LTRResultIterator adds text-specific methods for access to OCR output.\n\nclass TESS_API LTRResultIterator : public PageIterator {\n  friend class ChoiceIterator;\n\npublic:\n  // page_res and tesseract come directly from the BaseAPI.\n  // The rectangle parameters are copied indirectly from the Thresholder,\n  // via the BaseAPI. They represent the coordinates of some rectangle in an\n  // original image (in top-left-origin coordinates) and therefore the top-left\n  // needs to be added to any output boxes in order to specify coordinates\n  // in the original image. See TessBaseAPI::SetRectangle.\n  // The scale and scaled_yres are in case the Thresholder scaled the image\n  // rectangle prior to thresholding. Any coordinates in tesseract's image\n  // must be divided by scale before adding (rect_left, rect_top).\n  // The scaled_yres indicates the effective resolution of the binary image\n  // that tesseract has been given by the Thresholder.\n  // After the constructor, Begin has already been called.\n  LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,\n                    int scaled_yres, int rect_left, int rect_top,\n                    int rect_width, int rect_height);\n\n  ~LTRResultIterator() override;\n\n  // LTRResultIterators may be copied! This makes it possible to iterate over\n  // all the objects at a lower level, while maintaining an iterator to\n  // objects at a higher level. 
These constructors DO NOT CALL Begin, so\n  // iterations will continue from the location of src.\n  // TODO: For now the copy constructor and operator= only need the base class\n  // versions, but if new data members are added, don't forget to add them!\n\n  // ============= Moving around within the page ============.\n\n  // See PageIterator.\n\n  // ============= Accessing data ==============.\n\n  // Returns the null terminated UTF-8 encoded text string for the current\n  // object at the given level. Use delete [] to free after use.\n  char *GetUTF8Text(PageIteratorLevel level) const;\n\n  // Set the string inserted at the end of each text line. \"\\n\" by default.\n  void SetLineSeparator(const char *new_line);\n\n  // Set the string inserted at the end of each paragraph. \"\\n\" by default.\n  void SetParagraphSeparator(const char *new_para);\n\n  // Returns the mean confidence of the current object at the given level.\n  // The number should be interpreted as a percent probability. (0.0f-100.0f)\n  float Confidence(PageIteratorLevel level) const;\n\n  // ============= Functions that refer to words only ============.\n\n  // Returns the font attributes of the current word. If iterating at a higher\n  // level object than words, eg textlines, then this will return the\n  // attributes of the first word in that textline.\n  // The actual return value is a string representing a font name. It points\n  // to an internal table and SHOULD NOT BE DELETED. 
Lifespan is the same as\n  // the iterator itself, ie rendered invalid by various members of\n  // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.\n  // Pointsize is returned in printers points (1/72 inch.)\n  const char *WordFontAttributes(bool *is_bold, bool *is_italic,\n                                 bool *is_underlined, bool *is_monospace,\n                                 bool *is_serif, bool *is_smallcaps,\n                                 int *pointsize, int *font_id) const;\n\n  // Return the name of the language used to recognize this word.\n  // On error, nullptr.  Do not delete this pointer.\n  const char *WordRecognitionLanguage() const;\n\n  // Return the overall directionality of this word.\n  StrongScriptDirection WordDirection() const;\n\n  // Returns true if the current word was found in a dictionary.\n  bool WordIsFromDictionary() const;\n\n  // Returns the number of blanks before the current word.\n  int BlanksBeforeWord() const;\n\n  // Returns true if the current word is numeric.\n  bool WordIsNumeric() const;\n\n  // Returns true if the word contains blamer information.\n  bool HasBlamerInfo() const;\n\n  // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle\n  // of the current word.\n  const void *GetParamsTrainingBundle() const;\n\n  // Returns a pointer to the string with blamer information for this word.\n  // Assumes that the word's blamer_bundle is not nullptr.\n  const char *GetBlamerDebug() const;\n\n  // Returns a pointer to the string with misadaption information for this word.\n  // Assumes that the word's blamer_bundle is not nullptr.\n  const char *GetBlamerMisadaptionDebug() const;\n\n  // Returns true if a truth string was recorded for the current word.\n  bool HasTruthString() const;\n\n  // Returns true if the given string is equivalent to the truth string for\n  // the current word.\n  bool EquivalentToTruth(const char *str) const;\n\n  // Returns a null terminated UTF-8 encoded 
truth string for the current word.\n  // Use delete [] to free after use.\n  char *WordTruthUTF8Text() const;\n\n  // Returns a null terminated UTF-8 encoded normalized OCR string for the\n  // current word. Use delete [] to free after use.\n  char *WordNormedUTF8Text() const;\n\n  // Returns a pointer to serialized choice lattice.\n  // Fills lattice_size with the number of bytes in lattice data.\n  const char *WordLattice(int *lattice_size) const;\n\n  // ============= Functions that refer to symbols only ============.\n\n  // Returns true if the current symbol is a superscript.\n  // If iterating at a higher level object than symbols, eg words, then\n  // this will return the attributes of the first symbol in that word.\n  bool SymbolIsSuperscript() const;\n  // Returns true if the current symbol is a subscript.\n  // If iterating at a higher level object than symbols, eg words, then\n  // this will return the attributes of the first symbol in that word.\n  bool SymbolIsSubscript() const;\n  // Returns true if the current symbol is a dropcap.\n  // If iterating at a higher level object than symbols, eg words, then\n  // this will return the attributes of the first symbol in that word.\n  bool SymbolIsDropcap() const;\n\nprotected:\n  const char *line_separator_;\n  const char *paragraph_separator_;\n};\n\n// Class to iterate over the classifier choices for a single RIL_SYMBOL.\nclass TESS_API ChoiceIterator {\npublic:\n  // Construction is from a LTRResultIterator that points to the symbol of\n  // interest. 
The ChoiceIterator allows a one-shot iteration over the\n  // choices for this symbol and after that it is useless.\n  explicit ChoiceIterator(const LTRResultIterator &result_it);\n  ~ChoiceIterator();\n\n  // Moves to the next choice for the symbol and returns false if there\n  // are none left.\n  bool Next();\n\n  // ============= Accessing data ==============.\n\n  // Returns the null terminated UTF-8 encoded text string for the current\n  // choice.\n  // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an\n  // internal structure and should NOT be delete[]ed to free after use.\n  const char *GetUTF8Text() const;\n\n  // Returns the confidence of the current choice depending on the used language\n  // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All\n  // choices for one symbol should roughly add up to 1.0f.\n  // If only traineddata of the legacy engine is used, the number should be\n  // interpreted as a percent probability. (0.0f-100.0f) In this case\n  // probabilities won't add up to 100. Each one stands on its own.\n  float Confidence() const;\n\n  // Returns a vector containing all timesteps, which belong to the currently\n  // selected symbol. A timestep is a vector containing pairs of symbols and\n  // floating point numbers. 
The number states the probability for the\n  // corresponding symbol.\n  std::vector<std::vector<std::pair<const char *, float>>> *Timesteps() const;\n\nprivate:\n  // clears the remaining spaces out of the results and adapt the probabilities\n  void filterSpaces();\n  // Pointer to the WERD_RES object owned by the API.\n  WERD_RES *word_res_;\n  // Iterator over the blob choices.\n  BLOB_CHOICE_IT *choice_it_;\n  std::vector<std::pair<const char *, float>> *LSTM_choices_ = nullptr;\n  std::vector<std::pair<const char *, float>>::iterator LSTM_choice_it_;\n\n  const int *tstep_index_;\n  // regulates the rating granularity\n  double rating_coefficient_;\n  // leading blanks\n  int blanks_before_word_;\n  // true when there is lstm engine related trained data\n  bool oemLSTM_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_\n"
  },
  {
    "path": "include/tesseract/ocrclass.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n/**********************************************************************\n * File:        ocrclass.h\n * Description: Class definitions and constants for the OCR API.\n * Author:      Hewlett-Packard Co\n *\n * (C) Copyright 1996, Hewlett-Packard Co.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n/**********************************************************************\n * This file contains typedefs for all the structures used by\n * the HP OCR interface.\n * The structures are designed to allow them to be used with any\n * structure alignment up to 8.\n **********************************************************************/\n\n#ifndef CCUTIL_OCRCLASS_H_\n#define CCUTIL_OCRCLASS_H_\n\n#include <chrono>\n#include <ctime>\n\nnamespace tesseract {\n\n/**********************************************************************\n * EANYCODE_CHAR\n * Description of a single character. The character code is defined by\n * the character set of the current font.\n * Output text is sent as an array of these structures.\n * Spaces and line endings in the output are represented in the\n * structures of the surrounding characters. 
They are not directly\n * represented as characters.\n * The first character in a word has a positive value of blanks.\n * Missing information should be set to the defaults in the comments.\n * If word bounds are known, but not character bounds, then the top and\n * bottom of each character should be those of the word. The left of the\n * first and right of the last char in each word should be set. All other\n * lefts and rights should be set to -1.\n * If set, the values of right and bottom are left+width and top+height.\n * Most of the members come directly from the parameters to ocr_append_char.\n * The formatting member uses the enhancement parameter and combines the\n * line direction stuff into the top 3 bits.\n * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,\n * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what\n * the coding is, only that it is backwards compatible with the previous\n * version.\n **********************************************************************/\n\nstruct EANYCODE_CHAR { /*single character */\n  // It should be noted that the format for char_code for version 2.0 and beyond\n  // is UTF8 which means that ASCII characters will come out as one structure\n  // but other characters will be returned in two or more instances of this\n  // structure with a single byte of the  UTF8 code in each, but each will have\n  // the same bounding box. 
Programs which want to handle languages with\n  // different characters sets will need to handle extended characters\n  // appropriately, but *all* code needs to be prepared to receive UTF8 coded\n  // characters for characters such as bullet and fancy quotes.\n  uint16_t char_code; /*character itself */\n  int16_t left;       /*of char (-1) */\n  int16_t right;      /*of char (-1) */\n  int16_t top;        /*of char (-1) */\n  int16_t bottom;     /*of char (-1) */\n  int16_t font_index; /*what font (0) */\n  uint8_t confidence; /*0=perfect, 100=reject (0/100) */\n  uint8_t point_size; /*of char, 72=i inch, (10) */\n  int8_t blanks;      /*no of spaces before this char (1) */\n  uint8_t formatting; /*char formatting (0) */\n};\n\n/**********************************************************************\n * ETEXT_DESC\n * Description of the output of the OCR engine.\n * This structure is used as both a progress monitor and the final\n * output header, since it needs to be a valid progress monitor while\n * the OCR engine is storing its output to shared memory.\n * During progress, all the buffer info is -1.\n * Progress starts at 0 and increases to 100 during OCR. No other constraint.\n * Additionally the progress callback contains the bounding box of the word that\n * is currently being processed.\n * Every progress callback, the OCR engine must set ocr_alive to 1.\n * The HP side will set ocr_alive to 0. Repeated failure to reset\n * to 1 indicates that the OCR engine is dead.\n * If the cancel function is not null then it is called with the number of\n * user words found. 
If it returns true then operation is cancelled.\n **********************************************************************/\nclass ETEXT_DESC;\n\nusing CANCEL_FUNC = bool (*)(void *, int);\nusing PROGRESS_FUNC = bool (*)(int, int, int, int, int);\nusing PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);\n\nclass ETEXT_DESC { // output header\npublic:\n  int16_t count{0};    /// chars in this buffer(0)\n  int16_t progress{0}; /// percent complete increasing (0-100)\n  /** Progress monitor covers word recognition and it does not cover layout\n   * analysis.\n   * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */\n  int8_t more_to_come{0};       /// true if not last\n  volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0\n  int8_t err_code{0};           /// for errcode use\n  CANCEL_FUNC cancel{nullptr};  /// returns true to cancel\n  PROGRESS_FUNC progress_callback{\n      nullptr};                      /// called whenever progress increases\n  PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback\n  void *cancel_this{nullptr};        /// this or other data for cancel\n  std::chrono::steady_clock::time_point end_time;\n  /// Time to stop. 
Expected to be set only\n  /// by call to set_deadline_msecs().\n  EANYCODE_CHAR text[1]{}; /// character data\n\n  ETEXT_DESC() : progress_callback2(&default_progress_func) {\n    end_time = std::chrono::time_point<std::chrono::steady_clock,\n                                       std::chrono::milliseconds>();\n  }\n\n  // Sets the end time to be deadline_msecs milliseconds from now.\n  void set_deadline_msecs(int32_t deadline_msecs) {\n    if (deadline_msecs > 0) {\n      end_time = std::chrono::steady_clock::now() +\n                 std::chrono::milliseconds(deadline_msecs);\n    }\n  }\n\n  // Returns false if we've not passed the end_time, or have not set a deadline.\n  bool deadline_exceeded() const {\n    if (end_time.time_since_epoch() ==\n        std::chrono::steady_clock::duration::zero()) {\n      return false;\n    }\n    auto now = std::chrono::steady_clock::now();\n    return (now > end_time);\n  }\n\nprivate:\n  static bool default_progress_func(ETEXT_DESC *ths, int left, int right,\n                                    int top, int bottom) {\n    if (ths->progress_callback != nullptr) {\n      return (*(ths->progress_callback))(ths->progress, left, right, top,\n                                         bottom);\n    }\n    return true;\n  }\n};\n\n} // namespace tesseract\n\n#endif // CCUTIL_OCRCLASS_H_\n"
  },
  {
    "path": "include/tesseract/osdetect.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        osdetect.h\n// Description: Orientation and script detection.\n// Author:      Samuel Charron\n//              Ranjith Unnikrishnan\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCMAIN_OSDETECT_H_\n#define TESSERACT_CCMAIN_OSDETECT_H_\n\n#include \"export.h\" // for TESS_API\n\n#include <vector> // for std::vector\n\nnamespace tesseract {\n\nclass BLOBNBOX;\nclass BLOBNBOX_CLIST;\nclass BLOB_CHOICE_LIST;\nclass TO_BLOCK_LIST;\nclass UNICHARSET;\n\nclass Tesseract;\n\n// Max number of scripts in ICU + \"NULL\" + Japanese and Korean + Fraktur\nconst int kMaxNumberOfScripts = 116 + 1 + 2 + 1;\n\nstruct OSBestResult {\n  OSBestResult()\n      : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}\n  int orientation_id;\n  int script_id;\n  float sconfidence;\n  float oconfidence;\n};\n\nstruct OSResults {\n  OSResults() : unicharset(nullptr) {\n    for (int i = 0; i < 4; ++i) {\n      for (int j = 0; j < kMaxNumberOfScripts; ++j) {\n        scripts_na[i][j] = 0;\n      }\n      orientations[i] = 0;\n    }\n  }\n  void update_best_orientation();\n  // Set the estimate of the orientation to the given id.\n  void set_best_orientation(int orientation_id);\n  // Update/Compute the best estimate of the script assuming the given\n  // orientation id.\n  void update_best_script(int orientation_id);\n  // Return the index of 
the script with the highest score for this orientation.\n  TESS_API int get_best_script(int orientation_id) const;\n  // Accumulate scores with given OSResults instance and update the best script.\n  void accumulate(const OSResults &osr);\n\n  // Print statistics.\n  void print_scores(void) const;\n  void print_scores(int orientation_id) const;\n\n  // Array holding scores for each orientation id [0,3].\n  // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the\n  // page respectively, where the values refer to the amount of clockwise\n  // rotation to be applied to the page for the text to be upright and readable.\n  float orientations[4];\n  // Script confidence scores for each of 4 possible orientations.\n  float scripts_na[4][kMaxNumberOfScripts];\n\n  UNICHARSET *unicharset;\n  OSBestResult best_result;\n};\n\nclass OrientationDetector {\npublic:\n  OrientationDetector(const std::vector<int> *allowed_scripts,\n                      OSResults *results);\n  bool detect_blob(BLOB_CHOICE_LIST *scores);\n  int get_orientation();\n\nprivate:\n  OSResults *osr_;\n  const std::vector<int> *allowed_scripts_;\n};\n\nclass ScriptDetector {\npublic:\n  ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,\n                 tesseract::Tesseract *tess);\n  void detect_blob(BLOB_CHOICE_LIST *scores);\n  bool must_stop(int orientation) const;\n\nprivate:\n  OSResults *osr_;\n  int korean_id_;\n  int japanese_id_;\n  int katakana_id_;\n  int hiragana_id_;\n  int han_id_;\n  int hangul_id_;\n  int latin_id_;\n  int fraktur_id_;\n  tesseract::Tesseract *tess_;\n  const std::vector<int> *allowed_scripts_;\n};\n\nint orientation_and_script_detection(const char *filename, OSResults *,\n                                     tesseract::Tesseract *);\n\nint os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,\n              tesseract::Tesseract *tess);\n\nint os_detect_blobs(const std::vector<int> *allowed_scripts,\n                    
BLOBNBOX_CLIST *blob_list, OSResults *osr,\n                    tesseract::Tesseract *tess);\n\nbool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,\n                    OSResults *, tesseract::Tesseract *tess);\n\n// Helper method to convert an orientation index to its value in degrees.\n// The value represents the amount of clockwise rotation in degrees that must be\n// applied for the text to be upright (readable).\nTESS_API int OrientationIdToValue(const int &id);\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCMAIN_OSDETECT_H_\n"
  },
  {
    "path": "include/tesseract/pageiterator.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        pageiterator.h\n// Description: Iterator for tesseract page structure that avoids using\n//              tesseract internal data structures.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_\n#define TESSERACT_CCMAIN_PAGEITERATOR_H_\n\n#include \"export.h\"\n#include \"publictypes.h\"\n\nstruct Pix;\nstruct Pta;\n\nnamespace tesseract {\n\nstruct BlamerBundle;\nclass C_BLOB_IT;\nclass PAGE_RES;\nclass PAGE_RES_IT;\nclass WERD;\n\nclass Tesseract;\n\n/**\n * Class to iterate over tesseract page structure, providing access to all\n * levels of the page hierarchy, without including any tesseract headers or\n * having to handle any tesseract structures.\n * WARNING! 
This class points to data held within the TessBaseAPI class, and\n * therefore can only be used while the TessBaseAPI class still exists and\n * has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n * DetectOS, or anything else that changes the internal PAGE_RES.\n * See tesseract/publictypes.h for the definition of PageIteratorLevel.\n * See also ResultIterator, derived from PageIterator, which adds in the\n * ability to access OCR output with text-specific methods.\n */\n\nclass TESS_API PageIterator {\npublic:\n  /**\n   * page_res and tesseract come directly from the BaseAPI.\n   * The rectangle parameters are copied indirectly from the Thresholder,\n   * via the BaseAPI. They represent the coordinates of some rectangle in an\n   * original image (in top-left-origin coordinates) and therefore the top-left\n   * needs to be added to any output boxes in order to specify coordinates\n   * in the original image. See TessBaseAPI::SetRectangle.\n   * The scale and scaled_yres are in case the Thresholder scaled the image\n   * rectangle prior to thresholding. Any coordinates in tesseract's image\n   * must be divided by scale before adding (rect_left, rect_top).\n   * The scaled_yres indicates the effective resolution of the binary image\n   * that tesseract has been given by the Thresholder.\n   * After the constructor, Begin has already been called.\n   */\n  PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,\n               int scaled_yres, int rect_left, int rect_top, int rect_width,\n               int rect_height);\n  virtual ~PageIterator();\n\n  /**\n   * Page/ResultIterators may be copied! This makes it possible to iterate over\n   * all the objects at a lower level, while maintaining an iterator to\n   * objects at a higher level. 
These constructors DO NOT CALL Begin, so\n   * iterations will continue from the location of src.\n   */\n  PageIterator(const PageIterator &src);\n  const PageIterator &operator=(const PageIterator &src);\n\n  /** Are we positioned at the same location as other? */\n  bool PositionedAtSameWord(const PAGE_RES_IT *other) const;\n\n  // ============= Moving around within the page ============.\n\n  /**\n   * Moves the iterator to point to the start of the page to begin an\n   * iteration.\n   */\n  virtual void Begin();\n\n  /**\n   * Moves the iterator to the beginning of the paragraph.\n   * This class implements this functionality by moving it to the zero indexed\n   * blob of the first (leftmost) word on the first row of the paragraph.\n   */\n  virtual void RestartParagraph();\n\n  /**\n   * Return whether this iterator points anywhere in the first textline of a\n   * paragraph.\n   */\n  bool IsWithinFirstTextlineOfParagraph() const;\n\n  /**\n   * Moves the iterator to the beginning of the text line.\n   * This class implements this functionality by moving it to the zero indexed\n   * blob of the first (leftmost) word of the row.\n   */\n  virtual void RestartRow();\n\n  /**\n   * Moves to the start of the next object at the given level in the\n   * page hierarchy, and returns false if the end of the page was reached.\n   * NOTE that RIL_SYMBOL will skip non-text blocks, but all other\n   * PageIteratorLevel level values will visit each non-text block once.\n   * Think of non text blocks as containing a single para, with a single line,\n   * with a single imaginary word.\n   * Calls to Next with different levels may be freely intermixed.\n   * This function iterates words in right-to-left scripts correctly, if\n   * the appropriate language has been loaded into Tesseract.\n   */\n  virtual bool Next(PageIteratorLevel level);\n\n  /**\n   * Returns true if the iterator is at the start of an object at the given\n   * level.\n   *\n   * For instance, suppose an 
iterator it is pointed to the first symbol of the\n   * first word of the third line of the second paragraph of the first block in\n   * a page, then:\n   *   it.IsAtBeginningOf(RIL_BLOCK) = false\n   *   it.IsAtBeginningOf(RIL_PARA) = false\n   *   it.IsAtBeginningOf(RIL_TEXTLINE) = true\n   *   it.IsAtBeginningOf(RIL_WORD) = true\n   *   it.IsAtBeginningOf(RIL_SYMBOL) = true\n   */\n  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;\n\n  /**\n   * Returns whether the iterator is positioned at the last element in a\n   * given level. (e.g. the last word in a line, the last line in a block)\n   *\n   *     Here's some two-paragraph example\n   *   text.  It starts off innocuously\n   *   enough but quickly turns bizarre.\n   *     The author inserts a cornucopia\n   *   of words to guard against confused\n   *   references.\n   *\n   * Now take an iterator it pointed to the start of \"bizarre.\"\n   *  it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false\n   *  it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true\n   *  it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false\n   */\n  virtual bool IsAtFinalElement(PageIteratorLevel level,\n                                PageIteratorLevel element) const;\n\n  /**\n   * Returns whether this iterator is positioned\n   *   before other:   -1\n   *   equal to other:  0\n   *   after other:     1\n   */\n  int Cmp(const PageIterator &other) const;\n\n  // ============= Accessing data ==============.\n  // Coordinate system:\n  // Integer coordinates are at the cracks between the pixels.\n  // The top-left corner of the top-left pixel in the image is at (0,0).\n  // The bottom-right corner of the bottom-right pixel in the image is at\n  // (width, height).\n  // Every bounding box goes from the top-left of the top-left contained\n  // pixel to the bottom-right of the bottom-right contained pixel, so\n  // the bounding box of the single top-left pixel in the image is:\n  // (0,0)->(1,1).\n  // If an image rectangle has been 
set in the API, then returned coordinates\n  // relate to the original (full) image, rather than the rectangle.\n\n  /**\n   * Controls what to include in a bounding box. Bounding boxes of all levels\n   * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.\n   * Between layout analysis and recognition, it isn't known where all\n   * diacritics belong, so this control is used to include or exclude some\n   * diacritics that are above or below the main body of the word. In most cases\n   * where the placement is obvious, and after recognition, it doesn't make as\n   * much difference, as the diacritics will already be included in the word.\n   */\n  void SetBoundingBoxComponents(bool include_upper_dots,\n                                bool include_lower_dots) {\n    include_upper_dots_ = include_upper_dots;\n    include_lower_dots_ = include_lower_dots;\n  }\n\n  /**\n   * Returns the bounding rectangle of the current object at the given level.\n   * See comment on coordinate system above.\n   * Returns false if there is no such object at the current position.\n   * The returned bounding box is guaranteed to match the size and position\n   * of the image returned by GetBinaryImage, but may clip foreground pixels\n   * from a grey image. The padding argument to GetImage can be used to expand\n   * the image to include more foreground pixels. 
See GetImage below.\n   */\n  bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,\n                   int *bottom) const;\n  bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,\n                   int *right, int *bottom) const;\n  /**\n   * Returns the bounding rectangle of the object in a coordinate system of the\n   * working image rectangle having its origin at (rect_left_, rect_top_) with\n   * respect to the original image and is scaled by a factor scale_.\n   */\n  bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,\n                           int *right, int *bottom) const;\n\n  /** Returns whether there is no object of a given level. */\n  bool Empty(PageIteratorLevel level) const;\n\n  /**\n   * Returns the type of the current block.\n   * See tesseract/publictypes.h for PolyBlockType.\n   */\n  PolyBlockType BlockType() const;\n\n  /**\n   * Returns the polygon outline of the current block. The returned Pta must\n   * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices\n   * of the polygon, and the last edge is the line segment between the last\n   * point and the first point. nullptr will be returned if the iterator is\n   * at the end of the document or layout analysis was not used.\n   */\n  Pta *BlockPolygon() const;\n\n  /**\n   * Returns a binary image of the current object at the given level.\n   * The position and size match the return from BoundingBoxInternal, and so\n   * this could be upscaled with respect to the original input image.\n   * Use pixDestroy to delete the image after use.\n   */\n  Pix *GetBinaryImage(PageIteratorLevel level) const;\n\n  /**\n   * Returns an image of the current object at the given level in greyscale\n   * if available in the input. 
To guarantee a binary image use BinaryImage.\n   * NOTE that in order to give the best possible image, the bounds are\n   * expanded slightly over the binary connected component, by the supplied\n   * padding, so the top-left position of the returned image is returned\n   * in (left,top). These will most likely not match the coordinates\n   * returned by BoundingBox.\n   * If you do not supply an original image, you will get a binary one.\n   * Use pixDestroy to delete the image after use.\n   */\n  Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,\n                int *left, int *top) const;\n\n  /**\n   * Returns the baseline of the current object at the given level.\n   * The baseline is the line that passes through (x1, y1) and (x2, y2).\n   * WARNING: with vertical text, baselines may be vertical!\n   * Returns false if there is no baseline at the current position.\n   */\n  bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,\n                int *y2) const;\n\n  // Returns the attributes of the current row.\n  void RowAttributes(float *row_height, float *descenders,\n                     float *ascenders) const;\n\n  /**\n   * Returns orientation for the block the iterator points to.\n   *   orientation, writing_direction, textline_order: see publictypes.h\n   *   deskew_angle: after rotating the block so the text orientation is\n   *                 upright, how many radians does one have to rotate the\n   *                 block anti-clockwise for it to be level?\n   *                   -Pi/4 <= deskew_angle <= Pi/4\n   */\n  void Orientation(tesseract::Orientation *orientation,\n                   tesseract::WritingDirection *writing_direction,\n                   tesseract::TextlineOrder *textline_order,\n                   float *deskew_angle) const;\n\n  /**\n   * Returns information about the current paragraph, if available.\n   *\n   *   justification -\n   *     LEFT if ragged right, or fully justified and script is 
left-to-right.\n   *     RIGHT if ragged left, or fully justified and script is right-to-left.\n   *     unknown if it looks like source code or we have very few lines.\n   *   is_list_item -\n   *     true if we believe this is a member of an ordered or unordered list.\n   *   is_crown -\n   *     true if the first line of the paragraph is aligned with the other\n   *     lines of the paragraph even though subsequent paragraphs have first\n   *     line indents.  This typically indicates that this is the continuation\n   *     of a previous paragraph or that it is the very first paragraph in\n   *     the chapter.\n   *   first_line_indent -\n   *     For LEFT aligned paragraphs, the first text line of paragraphs of\n   *     this kind are indented this many pixels from the left edge of the\n   *     rest of the paragraph.\n   *     for RIGHT aligned paragraphs, the first text line of paragraphs of\n   *     this kind are indented this many pixels from the right edge of the\n   *     rest of the paragraph.\n   *     NOTE 1: This value may be negative.\n   *     NOTE 2: if *is_crown == true, the first line of this paragraph is\n   *             actually flush, and first_line_indent is set to the \"common\"\n   *             first_line_indent for subsequent paragraphs in this block\n   *             of text.\n   */\n  void ParagraphInfo(tesseract::ParagraphJustification *justification,\n                     bool *is_list_item, bool *is_crown,\n                     int *first_line_indent) const;\n\n  // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle\n  // of the current word to the given pointer (takes ownership of the pointer)\n  // and returns true.\n  // Can only be used when iterating on the word level.\n  bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);\n\nprotected:\n  /**\n   * Sets up the internal data for iterating the blobs of a new word, then\n   * moves the iterator to the given offset.\n   */\n  void BeginWord(int 
offset);\n\n  /** Pointer to the page_res owned by the API. */\n  PAGE_RES *page_res_;\n  /** Pointer to the Tesseract object owned by the API. */\n  Tesseract *tesseract_;\n  /**\n   * The iterator to the page_res_. Owned by this ResultIterator.\n   * A pointer just to avoid dragging in Tesseract includes.\n   */\n  PAGE_RES_IT *it_;\n  /**\n   * The current input WERD being iterated. If there is an output from OCR,\n   * then word_ is nullptr. Owned by the API\n   */\n  WERD *word_;\n  /** The length of the current word_. */\n  int word_length_;\n  /** The current blob index within the word. */\n  int blob_index_;\n  /**\n   * Iterator to the blobs within the word. If nullptr, then we are iterating\n   * OCR results in the box_word.\n   * Owned by this ResultIterator.\n   */\n  C_BLOB_IT *cblob_it_;\n  /** Control over what to include in bounding boxes. */\n  bool include_upper_dots_;\n  bool include_lower_dots_;\n  /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/\n  int scale_;\n  int scaled_yres_;\n  int rect_left_;\n  int rect_top_;\n  int rect_width_;\n  int rect_height_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_\n"
  },
  {
    "path": "include/tesseract/publictypes.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        publictypes.h\n// Description: Types used in both the API and internally\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_\n#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_\n\nnamespace tesseract {\n\n// This file contains types that are used both by the API and internally\n// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic\n// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.\n// Restated: It is OK for low-level Tesseract files to include publictypes.h,\n// but not for the low-level tesseract code to include top-level API code.\n// This file should not use other Tesseract types, as that would drag\n// their includes into the API-level.\n\n/** Number of printers' points in an inch. The unit of the pointsize return. */\nconstexpr int kPointsPerInch = 72;\n/**\n * Minimum believable resolution. Used as a default if there is no other\n * information, as it is safer to under-estimate than over-estimate.\n */\nconstexpr int kMinCredibleResolution = 70;\n/** Maximum believable resolution.  */\nconstexpr int kMaxCredibleResolution = 2400;\n/**\n * Ratio between median blob size and likely resolution. Used to estimate\n * resolution when none is provided. This is basically 1/usual text size in\n * inches.  
*/\nconstexpr int kResolutionEstimationFactor = 10;\n\n/**\n * Possible types for a POLY_BLOCK or ColPartition.\n * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions\n * below, as well as kPolyBlockNames in layout_test.cc.\n * Used extensively by ColPartition, and POLY_BLOCK.\n */\nenum PolyBlockType {\n  PT_UNKNOWN,         // Type is not yet known. Keep as the first element.\n  PT_FLOWING_TEXT,    // Text that lives inside a column.\n  PT_HEADING_TEXT,    // Text that spans more than one column.\n  PT_PULLOUT_TEXT,    // Text that is in a cross-column pull-out region.\n  PT_EQUATION,        // Partition belonging to an equation region.\n  PT_INLINE_EQUATION, // Partition has inline equation.\n  PT_TABLE,           // Partition belonging to a table region.\n  PT_VERTICAL_TEXT,   // Text-line runs vertically.\n  PT_CAPTION_TEXT,    // Text that belongs to an image.\n  PT_FLOWING_IMAGE,   // Image that lives inside a column.\n  PT_HEADING_IMAGE,   // Image that spans more than one column.\n  PT_PULLOUT_IMAGE,   // Image that is in a cross-column pull-out region.\n  PT_HORZ_LINE,       // Horizontal Line.\n  PT_VERT_LINE,       // Vertical Line.\n  PT_NOISE,           // Lies outside of any column.\n  PT_COUNT\n};\n\n/** Returns true if PolyBlockType is of line type */\ninline bool PTIsLineType(PolyBlockType type) {\n  return type == PT_HORZ_LINE || type == PT_VERT_LINE;\n}\n/** Returns true if PolyBlockType is of image type */\ninline bool PTIsImageType(PolyBlockType type) {\n  return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||\n         type == PT_PULLOUT_IMAGE;\n}\n/** Returns true if PolyBlockType is of text type */\ninline bool PTIsTextType(PolyBlockType type) {\n  return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||\n         type == PT_PULLOUT_TEXT || type == PT_TABLE ||\n         type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||\n         type == PT_INLINE_EQUATION;\n}\n// Returns true if PolyBlockType is of 
pullout(inter-column) type\ninline bool PTIsPulloutType(PolyBlockType type) {\n  return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;\n}\n\n/**\n *  +------------------+  Orientation Example:\n *  | 1 Aaaa Aaaa Aaaa |  ====================\n *  | Aaa aa aaa aa    |  To left is a diagram of some (1) English and\n *  | aaaaaa A aa aaa. |  (2) Chinese text and a (3) photo credit.\n *  |                2 |\n *  |   #######  c c C |  Upright Latin characters are represented as A and a.\n *  |   #######  c c c |  '<' represents a latin character rotated\n *  | < #######  c c c |      anti-clockwise 90 degrees.\n *  | < #######  c   c |\n *  | < #######  .   c |  Upright Chinese characters are represented C and c.\n *  | 3 #######      c |\n *  +------------------+  NOTA BENE: enum values here should match goodoc.proto\n\n * If you orient your head so that \"up\" aligns with Orientation,\n * then the characters will appear \"right side up\" and readable.\n *\n * In the example above, both the English and Chinese paragraphs are oriented\n * so their \"up\" is the top of the page (page up).  The photo credit is read\n * with one's head turned leftward (\"up\" is to page left).\n *\n * The values of this enum match the convention of Tesseract's osdetect.h\n*/\nenum Orientation {\n  ORIENTATION_PAGE_UP = 0,\n  ORIENTATION_PAGE_RIGHT = 1,\n  ORIENTATION_PAGE_DOWN = 2,\n  ORIENTATION_PAGE_LEFT = 3,\n};\n\n/**\n * The grapheme clusters within a line of text are laid out logically\n * in this direction, judged when looking at the text line rotated so that\n * its Orientation is \"page up\".\n *\n * For English text, the writing direction is left-to-right.  
For the\n * Chinese text in the above example, the writing direction is top-to-bottom.\n */\nenum WritingDirection {\n  WRITING_DIRECTION_LEFT_TO_RIGHT = 0,\n  WRITING_DIRECTION_RIGHT_TO_LEFT = 1,\n  WRITING_DIRECTION_TOP_TO_BOTTOM = 2,\n};\n\n/**\n * The text lines are read in the given sequence.\n *\n * In English, the order is top-to-bottom.\n * In Chinese, vertical text lines are read right-to-left.  Mongolian is\n * written in vertical columns top to bottom like Chinese, but the lines\n * order left-to right.\n *\n * Note that only some combinations make sense.  For example,\n * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM\n */\nenum TextlineOrder {\n  TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,\n  TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,\n  TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,\n};\n\n/**\n * Possible modes for page layout analysis. These *must* be kept in order\n * of decreasing amount of layout analysis to be done, except for OSD_ONLY,\n * so that the inequality test macros below work.\n */\nenum PageSegMode {\n  PSM_OSD_ONLY = 0,      ///< Orientation and script detection only.\n  PSM_AUTO_OSD = 1,      ///< Automatic page segmentation with orientation and\n                         ///< script detection. (OSD)\n  PSM_AUTO_ONLY = 2,     ///< Automatic page segmentation, but no OSD, or OCR.\n  PSM_AUTO = 3,          ///< Fully automatic page segmentation, but no OSD.\n  PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.\n  PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of\n                                  ///< vertically aligned text.\n  PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. 
(Default.)\n  PSM_SINGLE_LINE = 7,  ///< Treat the image as a single text line.\n  PSM_SINGLE_WORD = 8,  ///< Treat the image as a single word.\n  PSM_CIRCLE_WORD = 9,  ///< Treat the image as a single word in a circle.\n  PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.\n  PSM_SPARSE_TEXT =\n      11, ///< Find as much text as possible in no particular order.\n  PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.\n  PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing\n                     ///< hacks that are Tesseract-specific.\n\n  PSM_COUNT ///< Number of enum entries.\n};\n\n/**\n * Inline functions that act on a PageSegMode to determine whether components of\n * layout analysis are enabled.\n * *Depend critically on the order of elements of PageSegMode.*\n * NOTE that arg is an int for compatibility with INT_PARAM.\n */\ninline bool PSM_OSD_ENABLED(int pageseg_mode) {\n  return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;\n}\ninline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {\n  return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;\n}\ninline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {\n  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;\n}\ninline bool PSM_SPARSE(int pageseg_mode) {\n  return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;\n}\ninline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {\n  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;\n}\ninline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {\n  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;\n}\ninline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {\n  return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||\n         pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;\n}\n\n/**\n * enum of the elements of the page hierarchy, used in ResultIterator\n * to 
provide functions that operate on each level without having to\n * have 5x as many functions.\n */\nenum PageIteratorLevel {\n  RIL_BLOCK,    // Block of text/image/separator line.\n  RIL_PARA,     // Paragraph within a block.\n  RIL_TEXTLINE, // Line within a paragraph.\n  RIL_WORD,     // Word within a textline.\n  RIL_SYMBOL    // Symbol/character within a word.\n};\n\n/**\n * JUSTIFICATION_UNKNOWN\n *   The alignment is not clearly one of the other options.  This could happen\n *   for example if there are only one or two lines of text or the text looks\n *   like source code or poetry.\n *\n * NOTA BENE: Fully justified paragraphs (text aligned to both left and right\n *    margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text\n *    is written with a left-to-right script and with JUSTIFICATION_RIGHT if\n *    their text is written in a right-to-left script.\n *\n * Interpretation for text read in vertical lines:\n *   \"Left\" is wherever the starting reading position is.\n *\n * JUSTIFICATION_LEFT\n *   Each line, except possibly the first, is flush to the same left tab stop.\n *\n * JUSTIFICATION_CENTER\n *   The text lines of the paragraph are centered about a line going\n *   down through their middle of the text lines.\n *\n * JUSTIFICATION_RIGHT\n *   Each line, except possibly the first, is flush to the same right tab stop.\n */\nenum ParagraphJustification {\n  JUSTIFICATION_UNKNOWN,\n  JUSTIFICATION_LEFT,\n  JUSTIFICATION_CENTER,\n  JUSTIFICATION_RIGHT,\n};\n\n/**\n * When Tesseract/Cube is initialized we can choose to instantiate/load/run\n * only the Tesseract part, only the Cube part or both along with the combiner.\n * The preference of which engine to use is stored in tessedit_ocr_engine_mode.\n *\n * ATTENTION: When modifying this enum, please make sure to make the\n * appropriate changes to all the enums mirroring it (e.g. OCREngine in\n * cityblock/workflow/detection/detection_storage.proto). 
Such enums will\n * mention the connection to OcrEngineMode in the comments.\n */\nenum OcrEngineMode {\n  OEM_TESSERACT_ONLY,          // Run Tesseract only - fastest; deprecated\n  OEM_LSTM_ONLY,               // Run just the LSTM line recognizer.\n  OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback\n                               // to Tesseract when things get difficult.\n                               // deprecated\n  OEM_DEFAULT,                 // Specify this mode when calling init_*(),\n                               // to indicate that any of the above modes\n                               // should be automatically inferred from the\n                               // variables in the language-specific config,\n                               // command-line configs, or if not specified\n                               // in any of the above should be set to the\n                               // default OEM_TESSERACT_ONLY.\n  OEM_COUNT                    // Number of OEMs\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_\n"
  },
  {
    "path": "include/tesseract/renderer.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        renderer.h\n// Description: Rendering interface to inject into TessBaseAPI\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_API_RENDERER_H_\n#define TESSERACT_API_RENDERER_H_\n\n#include \"export.h\"\n\n// To avoid collision with other typenames include the ABSOLUTE MINIMUM\n// complexity of includes here. Use forward declarations wherever possible\n// and hide includes of complex types in baseapi.cpp.\n#include <cstdint>\n#include <string> // for std::string\n#include <vector> // for std::vector\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass TessBaseAPI;\n\n/**\n * Interface for rendering tesseract results into a document, such as text,\n * HOCR or pdf. This class is abstract. Specific classes handle individual\n * formats. This interface is then used to inject the renderer class into\n * tesseract when processing images.\n *\n * For simplicity implementing this with tesseract version 3.01,\n * the renderer contains document state that is cleared from document\n * to document just as the TessBaseAPI is. 
This way the base API can just\n * delegate its rendering functionality to injected renderers, and the\n * renderers can manage the associated state needed for the specific formats\n * in addition to the heuristics for producing it.\n */\nclass TESS_API TessResultRenderer {\npublic:\n  virtual ~TessResultRenderer();\n\n  // Takes ownership of pointer so must be new'd instance.\n  // Renderers aren't ordered, but appends the sequences of next parameter\n  // and existing next(). The renderers should be unique across both lists.\n  void insert(TessResultRenderer *next);\n\n  // Returns the next renderer or nullptr.\n  TessResultRenderer *next() {\n    return next_;\n  }\n\n  /**\n   * Starts a new document with the given title.\n   * This clears the contents of the output data.\n   * Title should use UTF-8 encoding.\n   */\n  bool BeginDocument(const char *title);\n\n  /**\n   * Adds the recognized text from the source image to the current document.\n   * Invalid if BeginDocument not yet called.\n   *\n   * Note that this API is a bit weird but is designed to fit into the\n   * current TessBaseAPI implementation where the api has lots of state\n   * information that we might want to add in.\n   */\n  bool AddImage(TessBaseAPI *api);\n\n  /**\n   * Finishes the document and finalizes the output data\n   * Invalid if BeginDocument not yet called.\n   */\n  bool EndDocument();\n\n  const char *file_extension() const {\n    return file_extension_;\n  }\n  const char *title() const {\n    return title_.c_str();\n  }\n\n  // Is everything fine? Otherwise something went wrong.\n  bool happy() const {\n    return happy_;\n  }\n\n  /**\n   * Returns the index of the last image given to AddImage\n   * (i.e. images are incremented whether the image succeeded or not)\n   *\n   * This is always defined. 
It means either the number of the\n   * current image, the last image ended, or in the completed document\n   * depending on when in the document lifecycle you are looking at it.\n   * Will return -1 if a document was never started.\n   */\n  int imagenum() const {\n    return imagenum_;\n  }\n\nprotected:\n  /**\n   * Called by concrete classes.\n   *\n   * outputbase is the name of the output file excluding\n   * extension. For example, \"/path/to/chocolate-chip-cookie-recipe\"\n   *\n   * extension indicates the file extension to be used for output\n   * files. For example \"pdf\" will produce a .pdf file, and \"hocr\"\n   * will produce .hocr files.\n   */\n  TessResultRenderer(const char *outputbase, const char *extension);\n\n  // Hook for specialized handling in BeginDocument()\n  virtual bool BeginDocumentHandler();\n\n  // This must be overridden to render the OCR'd results\n  virtual bool AddImageHandler(TessBaseAPI *api) = 0;\n\n  // Hook for specialized handling in EndDocument()\n  virtual bool EndDocumentHandler();\n\n  // Renderers can call this to append '\\0' terminated strings into\n  // the output string returned by GetOutput.\n  // This method will grow the output buffer if needed.\n  void AppendString(const char *s);\n\n  // Renderers can call this to append binary byte sequences into\n  // the output string returned by GetOutput. 
Note that s is not necessarily\n  // '\\0' terminated (and can contain '\\0' within it).\n  // This method will grow the output buffer if needed.\n  void AppendData(const char *s, int len);\n\n  template <typename T>\n  auto AppendData(T &&d) {\n    AppendData(d.data(), d.size());\n    return d.size();\n  }\n\nprivate:\n  TessResultRenderer *next_;   // Can link multiple renderers together\n  FILE *fout_;                 // output file pointer\n  const char *file_extension_; // standard extension for generated output\n  std::string title_;          // title of document being rendered\n  int imagenum_;               // index of last image added\n  bool happy_;                 // I get grumpy when the disk fills up, etc.\n};\n\n/**\n * Renders tesseract output into a plain UTF-8 text string\n */\nclass TESS_API TessTextRenderer : public TessResultRenderer {\npublic:\n  explicit TessTextRenderer(const char *outputbase);\n\nprotected:\n  bool AddImageHandler(TessBaseAPI *api) override;\n};\n\n/**\n * Renders tesseract output into an hocr text string\n */\nclass TESS_API TessHOcrRenderer : public TessResultRenderer {\npublic:\n  explicit TessHOcrRenderer(const char *outputbase, bool font_info);\n  explicit TessHOcrRenderer(const char *outputbase);\n\nprotected:\n  bool BeginDocumentHandler() override;\n  bool AddImageHandler(TessBaseAPI *api) override;\n  bool EndDocumentHandler() override;\n\nprivate:\n  bool font_info_; // whether to print font information\n};\n\n/**\n * Renders tesseract output into an alto text string\n */\nclass TESS_API TessAltoRenderer : public TessResultRenderer {\npublic:\n  explicit TessAltoRenderer(const char *outputbase);\n\nprotected:\n  bool BeginDocumentHandler() override;\n  bool AddImageHandler(TessBaseAPI *api) override;\n  bool EndDocumentHandler() override;\n\nprivate:\n  bool begin_document;\n};\n\n/**\n * Renders Tesseract output into a PAGE XML text string\n */\nclass TESS_API TessPAGERenderer : public TessResultRenderer 
{\npublic:\n  explicit TessPAGERenderer(const char *outputbase);\n\nprotected:\n  bool BeginDocumentHandler() override;\n  bool AddImageHandler(TessBaseAPI *api) override;\n  bool EndDocumentHandler() override;\n\nprivate:\n  bool begin_document;\n};\n\n\n/**\n * Renders Tesseract output into a TSV string\n */\nclass TESS_API TessTsvRenderer : public TessResultRenderer {\npublic:\n  explicit TessTsvRenderer(const char *outputbase, bool font_info);\n  explicit TessTsvRenderer(const char *outputbase);\n\nprotected:\n  bool BeginDocumentHandler() override;\n  bool AddImageHandler(TessBaseAPI *api) override;\n  bool EndDocumentHandler() override;\n\nprivate:\n  bool font_info_; // whether to print font information\n};\n\n/**\n * Renders tesseract output into searchable PDF\n */\nclass TESS_API TessPDFRenderer : public TessResultRenderer {\npublic:\n  // datadir is the location of the TESSDATA. We need it because\n  // we load a custom PDF font from this location.\n  TessPDFRenderer(const char *outputbase, const char *datadir,\n                  bool textonly = false);\n\nprotected:\n  bool BeginDocumentHandler() override;\n  bool AddImageHandler(TessBaseAPI *api) override;\n  bool EndDocumentHandler() override;\n\nprivate:\n  // We don't want to have every image in memory at once,\n  // so we store some metadata as we go along producing\n  // PDFs one page at a time. At the end, that metadata is\n  // used to make everything that isn't easily handled in a\n  // streaming fashion.\n  long int obj_;                  // counter for PDF objects\n  std::vector<uint64_t> offsets_; // offset of every PDF object in bytes\n  std::vector<long int> pages_;   // object number for every /Page object\n  std::string datadir_;           // where to find the custom font\n  bool textonly_;                 // skip images if set\n  // Bookkeeping only. 
DIY = Do It Yourself.\n  void AppendPDFObjectDIY(size_t objectsize);\n  // Bookkeeping + emit data.\n  void AppendPDFObject(const char *data);\n  // Create the /Contents object for an entire page.\n  char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);\n  // Turn an image into a PDF object. Only transcode if we have to.\n  static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,\n                            char **pdf_object, long int *pdf_object_size,\n                            int jpg_quality);\n};\n\n/**\n * Renders tesseract output into a plain UTF-8 text string\n */\nclass TESS_API TessUnlvRenderer : public TessResultRenderer {\npublic:\n  explicit TessUnlvRenderer(const char *outputbase);\n\nprotected:\n  bool AddImageHandler(TessBaseAPI *api) override;\n};\n\n/**\n * Renders tesseract output into a plain UTF-8 text string for LSTMBox\n */\nclass TESS_API TessLSTMBoxRenderer : public TessResultRenderer {\npublic:\n  explicit TessLSTMBoxRenderer(const char *outputbase);\n\nprotected:\n  bool AddImageHandler(TessBaseAPI *api) override;\n};\n\n/**\n * Renders tesseract output into a plain UTF-8 text string\n */\nclass TESS_API TessBoxTextRenderer : public TessResultRenderer {\npublic:\n  explicit TessBoxTextRenderer(const char *outputbase);\n\nprotected:\n  bool AddImageHandler(TessBaseAPI *api) override;\n};\n\n/**\n * Renders tesseract output into a plain UTF-8 text string in WordStr format\n */\nclass TESS_API TessWordStrBoxRenderer : public TessResultRenderer {\npublic:\n  explicit TessWordStrBoxRenderer(const char *outputbase);\n\nprotected:\n  bool AddImageHandler(TessBaseAPI *api) override;\n};\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n/**\n * Renders tesseract output into an osd text string\n */\nclass TESS_API TessOsdRenderer : public TessResultRenderer {\npublic:\n  explicit TessOsdRenderer(const char *outputbase);\n\nprotected:\n  bool AddImageHandler(TessBaseAPI *api) override;\n};\n\n#endif // ndef 
DISABLED_LEGACY_ENGINE\n\n} // namespace tesseract.\n\n#endif // TESSERACT_API_RENDERER_H_\n"
  },
  {
    "path": "include/tesseract/resultiterator.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        resultiterator.h\n// Description: Iterator for tesseract results that is capable of\n//              iterating in proper reading order over Bi Directional\n//              (e.g. mixed Hebrew and English) text.\n// Author:      David Eger\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_\n#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_\n\n#include \"export.h\"            // for TESS_API, TESS_LOCAL\n#include \"ltrresultiterator.h\" // for LTRResultIterator\n#include \"publictypes.h\"       // for PageIteratorLevel\n#include \"unichar.h\"           // for StrongScriptDirection\n\n#include <set>    // for std::pair\n#include <vector> // for std::vector\n\nnamespace tesseract {\n\nclass TESS_API ResultIterator : public LTRResultIterator {\npublic:\n  static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);\n\n  /**\n   * ResultIterator is copy constructible!\n   * The default copy constructor works just fine for us.\n   */\n  ~ResultIterator() override = default;\n\n  // ============= Moving around within the page ============.\n  /**\n   * Moves the iterator to point to the start of the page to begin\n   * an iteration.\n   */\n  void Begin() override;\n\n  /**\n   * Moves to the start of the next object at the given level in the\n   * page hierarchy in the appropriate reading order and returns false 
if\n   * the end of the page was reached.\n   * NOTE that RIL_SYMBOL will skip non-text blocks, but all other\n   * PageIteratorLevel level values will visit each non-text block once.\n   * Think of non text blocks as containing a single para, with a single line,\n   * with a single imaginary word.\n   * Calls to Next with different levels may be freely intermixed.\n   * This function iterates words in right-to-left scripts correctly, if\n   * the appropriate language has been loaded into Tesseract.\n   */\n  bool Next(PageIteratorLevel level) override;\n\n  /**\n   * IsAtBeginningOf() returns whether we're at the logical beginning of the\n   * given level.  (as opposed to ResultIterator's left-to-right top-to-bottom\n   * order).  Otherwise, this acts the same as PageIterator::IsAtBeginningOf().\n   * For a full description, see pageiterator.h\n   */\n  bool IsAtBeginningOf(PageIteratorLevel level) const override;\n\n  /**\n   * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.\n   * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we\n   * point at the last word in a paragraph.  See PageIterator for full comment.\n   */\n  bool IsAtFinalElement(PageIteratorLevel level,\n                        PageIteratorLevel element) const override;\n\n  // ============= Functions that refer to words only ============.\n  // Returns the number of blanks before the current word.\n  int BlanksBeforeWord() const;\n\n  // ============= Accessing data ==============.\n\n  /**\n   * Returns the null terminated UTF-8 encoded text string for the current\n   * object at the given level. 
Use delete [] to free after use.\n   */\n  virtual char *GetUTF8Text(PageIteratorLevel level) const;\n\n  /**\n   * Returns the LSTM choices for every LSTM timestep for the current word.\n   */\n  virtual std::vector<std::vector<std::vector<std::pair<const char *, float>>>>\n      *GetRawLSTMTimesteps() const;\n  virtual std::vector<std::vector<std::pair<const char *, float>>>\n      *GetBestLSTMSymbolChoices() const;\n\n  /**\n   * Return whether the current paragraph's dominant reading direction\n   * is left-to-right (as opposed to right-to-left).\n   */\n  bool ParagraphIsLtr() const;\n\n  // ============= Exposed only for testing =============.\n\n  /**\n   * Yields the reading order as a sequence of indices and (optional)\n   * meta-marks for a set of words (given left-to-right).\n   * The meta marks are passed as negative values:\n   *   kMinorRunStart  Start of minor direction text.\n   *   kMinorRunEnd    End of minor direction text.\n   *   kComplexWord    The next indexed word contains both left-to-right and\n   *                    right-to-left characters and was treated as neutral.\n   *\n   * For example, suppose we have five words in a text line,\n   * indexed [0,1,2,3,4] from the leftmost side of the text line.\n   * The following are all believable reading_orders:\n   *\n   * Left-to-Right (in ltr paragraph):\n   *     { 0, 1, 2, 3, 4 }\n   * Left-to-Right (in rtl paragraph):\n   *     { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }\n   * Right-to-Left (in rtl paragraph):\n   *     { 4, 3, 2, 1, 0 }\n   * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:\n   *     { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }\n   */\n  static void CalculateTextlineOrder(\n      bool paragraph_is_ltr,\n      const std::vector<StrongScriptDirection> &word_dirs,\n      std::vector<int> *reading_order);\n\n  static const int kMinorRunStart;\n  static const int kMinorRunEnd;\n  static const int kComplexWord;\n\nprotected:\n  /**\n   * We 
presume the data associated with the given iterator will outlive us.\n   * NB: This is private because it does something that is non-obvious:\n   *   it resets to the beginning of the paragraph instead of staying wherever\n   *   resit might have pointed.\n   */\n  explicit ResultIterator(const LTRResultIterator &resit);\n\nprivate:\n  /**\n   * Calculates the current paragraph's dominant writing direction.\n   * Typically, members should use current_paragraph_ltr_ instead.\n   */\n  bool CurrentParagraphIsLtr() const;\n\n  /**\n   * Returns word indices as measured from resit->RestartRow() = index 0\n   * for the reading order of words within a textline given an iterator\n   * into the middle of the text line.\n   * In addition to non-negative word indices, the following negative values\n   * may be inserted:\n   *   kMinorRunStart  Start of minor direction text.\n   *   kMinorRunEnd    End of minor direction text.\n   *   kComplexWord    The previous word contains both left-to-right and\n   *                   right-to-left characters and was treated as neutral.\n   */\n  void CalculateTextlineOrder(bool paragraph_is_ltr,\n                              const LTRResultIterator &resit,\n                              std::vector<int> *indices) const;\n  /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */\n  void CalculateTextlineOrder(bool paragraph_is_ltr,\n                              const LTRResultIterator &resit,\n                              std::vector<StrongScriptDirection> *ssd,\n                              std::vector<int> *indices) const;\n\n  /**\n   * What is the index of the current word in a strict left-to-right reading\n   * of the row?\n   */\n  int LTRWordIndex() const;\n\n  /**\n   * Given an iterator pointing at a word, returns the logical reading order\n   * of blob indices for the word.\n   */\n  void CalculateBlobOrder(std::vector<int> *blob_indices) const;\n\n  /** Precondition: current_paragraph_is_ltr_ is set. 
*/\n  void MoveToLogicalStartOfTextline();\n\n  /**\n   * Precondition: current_paragraph_is_ltr_ and in_minor_direction_\n   * are set.\n   */\n  void MoveToLogicalStartOfWord();\n\n  /** Are we pointing at the final (reading order) symbol of the word? */\n  bool IsAtFinalSymbolOfWord() const;\n\n  /** Are we pointing at the first (reading order) symbol of the word? */\n  bool IsAtFirstSymbolOfWord() const;\n\n  /**\n   * Append any extra marks that should be appended to this word when printed.\n   * Mostly, these are Unicode BiDi control characters.\n   */\n  void AppendSuffixMarks(std::string *text) const;\n\n  /** Appends the current word in reading order to the given buffer.*/\n  void AppendUTF8WordText(std::string *text) const;\n\n  /**\n   * Appends the text of the current text line, *assuming this iterator is\n   * positioned at the beginning of the text line*  This function\n   * updates the iterator to point to the first position past the text line.\n   * Each textline is terminated in a single newline character.\n   * If the textline ends a paragraph, it gets a second terminal newline.\n   */\n  void IterateAndAppendUTF8TextlineText(std::string *text);\n\n  /**\n   * Appends the text of the current paragraph in reading order\n   * to the given buffer.\n   * Each textline is terminated in a single newline character, and the\n   * paragraph gets an extra newline at the end.\n   */\n  void AppendUTF8ParagraphText(std::string *text) const;\n\n  /** Returns whether the bidi_debug flag is set to at least min_level. */\n  bool BidiDebug(int min_level) const;\n\n  bool current_paragraph_is_ltr_;\n\n  /**\n   * Is the currently pointed-at character at the beginning of\n   * a minor-direction run?\n   */\n  bool at_beginning_of_minor_run_;\n\n  /** Is the currently pointed-at character in a minor-direction sequence? 
*/\n  bool in_minor_direction_;\n\n  /**\n   * Should detected inter-word spaces be preserved, or \"compressed\" to a single\n   * space character (default behavior).\n   */\n  bool preserve_interword_spaces_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_\n"
  },
  {
    "path": "include/tesseract/unichar.h",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        unichar.h\n// Description: Unicode character/ligature class.\n// Author:      Ray Smith\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCUTIL_UNICHAR_H_\n#define TESSERACT_CCUTIL_UNICHAR_H_\n\n#include \"export.h\"\n\n#include <memory.h>\n#include <cstring>\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\n// Maximum number of characters that can be stored in a UNICHAR. Must be\n// at least 4. Must not exceed 31 without changing the coding of length.\n#define UNICHAR_LEN 30\n\n// A UNICHAR_ID is the unique id of a unichar.\nusing UNICHAR_ID = int;\n\n// A variable to indicate an invalid or uninitialized unichar id.\nstatic const int INVALID_UNICHAR_ID = -1;\n// A special unichar that corresponds to INVALID_UNICHAR_ID.\nstatic const char INVALID_UNICHAR[] = \"__INVALID_UNICHAR__\";\n\nenum StrongScriptDirection {\n  DIR_NEUTRAL = 0,       // Text contains only neutral characters.\n  DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.\n  DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.\n  DIR_MIX = 3,           // Text contains a mixture of left-to-right\n                         // and right-to-left characters.\n};\n\nusing char32 = signed int;\n\n// The UNICHAR class holds a single classification result. 
This may be\n// a single Unicode character (stored as between 1 and 4 utf8 bytes) or\n// multiple Unicode characters representing the NFKC expansion of a ligature\n// such as fi, ffl etc. These are also stored as utf8.\nclass TESS_API UNICHAR {\npublic:\n  UNICHAR() {\n    memset(chars, 0, UNICHAR_LEN);\n  }\n\n  // Construct from a utf8 string. If len<0 then the string is null terminated.\n  // If the string is too long to fit in the UNICHAR then it takes only what\n  // will fit.\n  UNICHAR(const char *utf8_str, int len);\n\n  // Construct from a single UCS4 character.\n  explicit UNICHAR(int unicode);\n\n  // Default copy constructor and operator= are OK.\n\n  // Get the first character as UCS-4.\n  int first_uni() const;\n\n  // Get the length of the UTF8 string.\n  int utf8_len() const {\n    int len = chars[UNICHAR_LEN - 1];\n    return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;\n  }\n\n  // Get a UTF8 string, but NOT nullptr terminated.\n  const char *utf8() const {\n    return chars;\n  }\n\n  // Get a terminated UTF8 string: Must delete[] it after use.\n  char *utf8_str() const;\n\n  // Get the number of bytes in the first character of the given utf8 string.\n  static int utf8_step(const char *utf8_str);\n\n  // A class to simplify iterating over and accessing elements of a UTF8\n  // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or\n  // take ownership of the underlying byte array. 
It also does not permit\n  // modification of the array (as the name suggests).\n  //\n  // Example:\n  //   for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);\n  //        it != UNICHAR::end(str, len);\n  //        ++it) {\n  //     printf(\"UCS-4 symbol code = %d\\n\", *it);\n  //     char buf[5];\n  //     int char_len = it.get_utf8(buf); buf[char_len] = '\\0';\n  //     printf(\"Char = %s\\n\", buf);\n  //   }\n  class TESS_API const_iterator {\n    using CI = const_iterator;\n\n  public:\n    // Step to the next UTF8 character.\n    // If the current position is at an illegal UTF8 character, then print an\n    // error message and step by one byte. If the current position is at a\n    // nullptr value, don't step past it.\n    const_iterator &operator++();\n\n    // Return the UCS-4 value at the current position.\n    // If the current position is at an illegal UTF8 value, return a single\n    // space character.\n    int operator*() const;\n\n    // Store the UTF-8 encoding of the current codepoint into buf, which must be\n    // at least 4 bytes long. Return the number of bytes written.\n    // If the current position is at an illegal UTF8 value, writes a single\n    // space character and returns 1.\n    // Note that this method does not null-terminate the buffer.\n    int get_utf8(char *buf) const;\n    // Returns the number of bytes of the current codepoint. 
Returns 1 if the\n    // current position is at an illegal UTF8 value.\n    int utf8_len() const;\n    // Returns true if the UTF-8 encoding at the current position is legal.\n    bool is_legal() const;\n\n    // Return the pointer into the string at the current position.\n    const char *utf8_data() const {\n      return it_;\n    }\n\n    // Iterator equality operators.\n    friend bool operator==(const CI &lhs, const CI &rhs) {\n      return lhs.it_ == rhs.it_;\n    }\n    friend bool operator!=(const CI &lhs, const CI &rhs) {\n      return !(lhs == rhs);\n    }\n\n  private:\n    friend class UNICHAR;\n    explicit const_iterator(const char *it) : it_(it) {}\n\n    const char *it_; // Pointer into the string.\n  };\n\n  // Create a start/end iterator pointing to a string. Note that these methods\n  // are static and do NOT create a copy or take ownership of the underlying\n  // array.\n  static const_iterator begin(const char *utf8_str, int byte_length);\n  static const_iterator end(const char *utf8_str, int byte_length);\n\n  // Converts a utf-8 string to a vector of unicodes.\n  // Returns an empty vector if the input contains invalid UTF-8.\n  static std::vector<char32> UTF8ToUTF32(const char *utf8_str);\n  // Converts a vector of unicodes to a utf8 string.\n  // Returns an empty string if the input contains an invalid unicode.\n  static std::string UTF32ToUTF8(const std::vector<char32> &str32);\n\nprivate:\n  // A UTF-8 representation of 1 or more Unicode characters.\n  // The last element (chars[UNICHAR_LEN - 1]) is a length if\n  // its value < UNICHAR_LEN, otherwise it is a genuine character.\n  char chars[UNICHAR_LEN]{};\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_UNICHAR_H_\n"
  },
  {
    "path": "include/tesseract/version.h.in",
    "content": "// SPDX-License-Identifier: Apache-2.0\n// File:        version.h\n// Description: Version information\n//\n// (C) Copyright 2018, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_API_VERSION_H_\n#define TESSERACT_API_VERSION_H_\n\n// clang-format off\n\n#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@\n#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@\n#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@\n\n#define TESSERACT_VERSION          \\\n  (TESSERACT_MAJOR_VERSION << 16 | \\\n   TESSERACT_MINOR_VERSION <<  8 | \\\n   TESSERACT_MICRO_VERSION)\n\n#define TESSERACT_VERSION_STR \"@PACKAGE_VERSION@\"\n\n// clang-format on\n\n#endif // TESSERACT_API_VERSION_H_\n"
  },
  {
    "path": "java/Makefile.am",
    "content": "SUBDIRS = com\nscrollview_path = @datadir@/tessdata\n\nJAVAC = javac\nJAR = jar\n\nif !GRAPHICS_DISABLED\nSCROLLVIEW_FILES = \\\n\t$(srcdir)/com/google/scrollview/ui/SVAbstractMenuItem.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVCheckboxMenuItem.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVEmptyMenuItem.java \\\n\t$(srcdir)/com/google/scrollview/events/SVEvent.java \\\n\t$(srcdir)/com/google/scrollview/events/SVEventHandler.java \\\n\t$(srcdir)/com/google/scrollview/events/SVEventType.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVImageHandler.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVMenuBar.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVMenuItem.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVPopupMenu.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVSubMenuItem.java \\\n\t$(srcdir)/com/google/scrollview/ui/SVWindow.java \\\n\t$(srcdir)/com/google/scrollview/ScrollView.java\n\nSCROLLVIEW_CLASSES = \\\n\tcom/google/scrollview/ui/SVAbstractMenuItem.class \\\n\tcom/google/scrollview/ui/SVCheckboxMenuItem.class \\\n\tcom/google/scrollview/ui/SVEmptyMenuItem.class \\\n\tcom/google/scrollview/events/SVEvent.class \\\n\tcom/google/scrollview/events/SVEventHandler.class \\\n\tcom/google/scrollview/events/SVEventType.class \\\n\tcom/google/scrollview/ui/SVImageHandler.class \\\n\tcom/google/scrollview/ui/SVMenuBar.class \\\n\tcom/google/scrollview/ui/SVMenuItem.class \\\n\tcom/google/scrollview/ui/SVPopupMenu.class \\\n\tcom/google/scrollview/ui/SVSubMenuItem.class \\\n\tcom/google/scrollview/ui/SVWindow.class \\\n\tcom/google/scrollview/ScrollView.class\n\nSCROLLVIEW_LIBS = \\\n\tpiccolo2d-core-3.0.1.jar \\\n\tpiccolo2d-extras-3.0.1.jar \\\n\tjaxb-api-2.3.1.jar\n\nCLASSPATH = piccolo2d-core-3.0.1.jar:piccolo2d-extras-3.0.1.jar:jaxb-api-2.3.1.jar\n\nScrollView.jar : $(SCROLLVIEW_CLASSES)\n\t$(JAR) cfm $@ $(srcdir)/Manifest.txt com/google/scrollview/*.class \\\n           com/google/scrollview/events/*.class 
com/google/scrollview/ui/*.class\n\n$(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) $(SCROLLVIEW_LIBS)\n\t$(JAVAC) -encoding UTF8 -sourcepath $(srcdir) -classpath $(CLASSPATH) $(SCROLLVIEW_FILES) -d $(builddir)\n\n.PHONY: fetch-jars\nfetch-jars $(SCROLLVIEW_LIBS):\n\tcurl -sSLO https://repo1.maven.org/maven2/org/piccolo2d/piccolo2d-core/3.0.1/piccolo2d-core-3.0.1.jar\n\tcurl -sSLO https://repo1.maven.org/maven2/org/piccolo2d/piccolo2d-extras/3.0.1/piccolo2d-extras-3.0.1.jar\n\tcurl -sSLO https://repo1.maven.org/maven2/javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar\n\n.PHONY: install-jars\ninstall-jars : ScrollView.jar\n\t@if [ ! -d  $(scrollview_path) ]; then mkdir -p $(scrollview_path); fi;\n\t$(INSTALL) -m 644 $(SCROLLVIEW_LIBS) $(scrollview_path);\n\t$(INSTALL) -m 644 ScrollView.jar $(scrollview_path);\n\t@echo \"Don't forget to set environment variable SCROLLVIEW_PATH to $(scrollview_path)\";\n\nuninstall:\n\trm -f $(scrollview_path)/*.jar\nendif\n\nclean-local:\n\trm -f ScrollView.jar $(SCROLLVIEW_CLASSES)\n\n# all-am does nothing, to make the java part optional.\nall all-am install :\n"
  },
  {
    "path": "java/Manifest.txt",
    "content": "Main-Class: com/google/scrollview/ScrollView\nClass-Path: ScrollView.jar piccolo2d-core-3.0.1.jar piccolo2d-extras-3.0.1.jar jaxb-api-2.3.1.jar\n"
  },
  {
    "path": "java/com/Makefile.am",
    "content": "SUBDIRS = google\n"
  },
  {
    "path": "java/com/google/Makefile.am",
    "content": "SUBDIRS = scrollview\n"
  },
  {
    "path": "java/com/google/scrollview/Makefile.am",
    "content": "SUBDIRS = events ui\n\nEXTRA_DIST = \\\n    ScrollView.java\n"
  },
  {
    "path": "java/com/google/scrollview/ScrollView.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview;\n\nimport com.google.scrollview.events.SVEvent;\nimport com.google.scrollview.ui.SVImageHandler;\nimport com.google.scrollview.ui.SVWindow;\nimport org.piccolo2d.nodes.PImage;\n\nimport java.io.BufferedReader;\nimport java.io.IOException;\nimport java.io.InputStreamReader;\nimport java.io.PrintStream;\nimport java.net.ServerSocket;\nimport java.net.Socket;\nimport java.util.ArrayList;\nimport java.util.regex.Pattern;\n\n/**\n * The ScrollView class is the main class which gets started from the command\n * line. It sets up LUA and handles the network processing.\n * @author wanke@google.com\n */\npublic class ScrollView {\n\n  /** The port our server listens at. */\n  public static int SERVER_PORT = 8461;\n\n  /**\n   * All SVWindow objects share the same connection stream. 
The socket is needed\n   * to detect when the connection got closed, in/out are used to send and\n   * receive messages.\n   */\n  private static Socket socket;\n  private static PrintStream out;\n  public static BufferedReader in;\n  public static float polylineXCoords[];  // The coords being received.\n  public static float polylineYCoords[];  // The coords being received.\n  public static int polylineSize;       // The size of the coords arrays.\n  public static int polylineScanned;    // The size read so far.\n  private static ArrayList<SVWindow> windows;  // The id to SVWindow map.\n  private static Pattern intPattern;        // For checking integer arguments.\n  private static Pattern floatPattern;     // For checking float arguments.\n\n  /** Keeps track of the number of messages received. */\n  static int nrInputLines = 0;\n\n  /** Prints all received messages to the console if true. */\n  static boolean debugViewNetworkTraffic = false;\n\n  /** Add a new message to the outgoing queue. */\n  public static void addMessage(SVEvent e) {\n    if (debugViewNetworkTraffic) {\n      System.out.println(\"(S->c) \" + e.toString());\n    }\n    String str = e.toString();\n    // Send the whole thing as UTF8.\n    try {\n      byte [] utf8 = str.getBytes(\"UTF8\");\n      out.write(utf8, 0, utf8.length);\n    } catch (java.io.UnsupportedEncodingException ex) {\n      System.out.println(\"Oops... can't encode to UTF8... Exiting\");\n      System.exit(0);\n    }\n    out.println();\n    // Flush the output and check for errors.\n    boolean error = out.checkError();\n    if (error) {\n      System.out.println(\"Connection error. Quitting ScrollView Server...\");\n      System.exit(0);\n    }\n  }\n\n  /** Read one message from client (assuming there are any). */\n  public static String receiveMessage() throws IOException {\n    return in.readLine();\n  }\n\n  /**\n   * The main program loop. 
Basically loops through receiving messages and\n   * processing them and then sending messages (if there are any).\n   */\n  private static void IOLoop() {\n    String inputLine;\n\n    try {\n      while (!socket.isClosed() && !socket.isInputShutdown() &&\n             !socket.isOutputShutdown() &&\n             socket.isConnected() && socket.isBound()) {\n        inputLine = receiveMessage();\n        if (inputLine == null) {\n          // End of stream reached.\n          break;\n        }\n        nrInputLines++;\n        if (debugViewNetworkTraffic) {\n          System.out.println(\"(c->S,\" + nrInputLines + \")\" + inputLine);\n        }\n\n        if (polylineSize > polylineScanned) {\n          // We are processing a polyline.\n          // Read pairs of coordinates separated by commas.\n          boolean first = true;\n          for (String coordStr : inputLine.split(\",\")) {\n            int coord = Integer.parseInt(coordStr);\n            if (first) {\n              polylineXCoords[polylineScanned] = coord;\n            } else {\n              polylineYCoords[polylineScanned++] = coord;\n            }\n            first = !first;\n          }\n          assert first;\n        } else {\n          // Process this normally.\n          processInput(inputLine);\n        }\n      }\n    }\n    // Some connection error\n    catch (IOException e) {\n      System.out.println(\"Connection error. Quitting ScrollView Server...\");\n    }\n    System.exit(0);\n  }\n\n  // Parse a comma-separated list of arguments into ArrayLists of the\n  // possible types. 
Each type is stored in order, but the order\n  // distinction between types is lost.\n  // Note that the format is highly constrained to what the client used\n  // to send to LUA:\n  // Quoted string -> String.\n  // true or false -> Boolean.\n  // %f format number -> Float (no %e allowed)\n  // Sequence of digits -> Integer\n  // Nothing else allowed.\n  private static void parseArguments(String argList,\n                                     ArrayList<Integer> intList,\n                                     ArrayList<Float> floatList,\n                                     ArrayList<String> stringList,\n                                     ArrayList<Boolean> boolList) {\n    // str is only non-null if an argument starts with a single or double\n    // quote. str is set back to null on completion of the string with a\n    // matching quote. If the string contains a comma then str will stay\n    // non-null across multiple argStr values until a matching closing quote.\n    // Backslash escaped quotes do not count as terminating the string.\n    String str = null;\n    for (String argStr : argList.split(\",\")) {\n      if (str != null) {\n        // Last string was incomplete. Append argStr to it and restore comma.\n        // Execute str += \",\" + argStr in Java.\n        int length = str.length() + 1 + argStr.length();\n        StringBuilder appended = new StringBuilder(length);\n        appended.append(str);\n        appended.append(\",\");\n        appended.append(argStr);\n        str =  appended.toString();\n      } else if (argStr.length() == 0) {\n        continue;\n      } else {\n        char quote = argStr.charAt(0);\n        // If it begins with a quote then it is a string, but may not\n        // end this time if it contained a comma.\n        if (quote == '\\'' || quote == '\"') {\n          str = argStr;\n        }\n      }\n      if (str != null) {\n        // It began with a quote. 
Check that it still does.\n        assert str.charAt(0) == '\\'' || str.charAt(0) == '\"';\n        int len = str.length();\n        if (len > 1 && str.charAt(len - 1) == str.charAt(0)) {\n          // We have an ending quote of the right type. Now check that\n          // it is not escaped. Must have an even number of slashes before.\n          int slash = len - 1;\n          while (slash > 0 && str.charAt(slash - 1) == '\\\\')\n            --slash;\n          if ((len - 1 - slash) % 2 == 0) {\n            // It is now complete. Chop off the quotes and save.\n            // TODO(rays) remove the first backslash of each pair.\n            stringList.add(str.substring(1, len - 1));\n            str = null;\n          }\n        }\n        // If str is not null here, then we have a string with a comma in it.\n        // Append, and the next argument at the next iteration, but check\n        // that str is null after the loop terminates in case it was an\n        // unterminated string.\n      } else if (floatPattern.matcher(argStr).matches()) {\n        // It is a float.\n        floatList.add(Float.parseFloat(argStr));\n      } else if (argStr.equals(\"true\")) {\n        boolList.add(true);\n      } else if (argStr.equals(\"false\")) {\n        boolList.add(false);\n      } else if (intPattern.matcher(argStr).matches()) {\n        // Only contains digits so must be an int.\n        intList.add(Integer.parseInt(argStr));\n      }\n      // else ignore all incompatible arguments for forward compatibility.\n    }\n    // All strings must have been terminated.\n    assert str == null;\n  }\n\n  /** Executes the LUA command parsed as parameter. */\n  private static void processInput(String inputLine) {\n    if (inputLine == null) {\n      return;\n    }\n    // Execute a function encoded as a LUA statement! Yuk!\n    if (inputLine.charAt(0) == 'w') {\n      // This is a method call on a window. 
Parse it.\n      String noWLine = inputLine.substring(1);\n      String[] idStrs = noWLine.split(\"[ :]\", 2);\n      int windowID = Integer.parseInt(idStrs[0]);\n      // Find the parentheses.\n      int start = inputLine.indexOf('(');\n      int end = inputLine.lastIndexOf(')');\n      // Parse the args.\n      ArrayList<Integer> intList = new ArrayList<Integer>(4);\n      ArrayList<Float> floatList = new ArrayList<Float>(2);\n      ArrayList<String> stringList = new ArrayList<String>(4);\n      ArrayList<Boolean> boolList = new ArrayList<Boolean>(3);\n      parseArguments(inputLine.substring(start + 1, end),\n                     intList, floatList, stringList, boolList);\n      int colon = inputLine.indexOf(':');\n      if (colon > 1 && colon < start) {\n        // This is a regular function call. Look for the name and call it.\n        String func = inputLine.substring(colon + 1, start);\n        if (func.equals(\"drawLine\")) {\n          windows.get(windowID).drawLine(intList.get(0), intList.get(1),\n                                         intList.get(2), intList.get(3));\n        } else if (func.equals(\"createPolyline\")) {\n          windows.get(windowID).createPolyline(intList.get(0));\n        } else if (func.equals(\"drawPolyline\")) {\n          windows.get(windowID).drawPolyline();\n        } else if (func.equals(\"drawRectangle\")) {\n          windows.get(windowID).drawRectangle(intList.get(0), intList.get(1),\n                                              intList.get(2), intList.get(3));\n        } else if (func.equals(\"setVisible\")) {\n          windows.get(windowID).setVisible(boolList.get(0));\n        } else if (func.equals(\"setAlwaysOnTop\")) {\n          windows.get(windowID).setAlwaysOnTop(boolList.get(0));\n        } else if (func.equals(\"addMessage\")) {\n          windows.get(windowID).addMessage(stringList.get(0));\n        } else if (func.equals(\"addMessageBox\")) {\n          windows.get(windowID).addMessageBox();\n        } 
else if (func.equals(\"clear\")) {\n          windows.get(windowID).clear();\n        } else if (func.equals(\"setStrokeWidth\")) {\n          windows.get(windowID).setStrokeWidth(floatList.get(0));\n        } else if (func.equals(\"drawEllipse\")) {\n          windows.get(windowID).drawEllipse(intList.get(0), intList.get(1),\n                                            intList.get(2), intList.get(3));\n        } else if (func.equals(\"pen\")) {\n          if (intList.size() == 4) {\n            windows.get(windowID).pen(intList.get(0), intList.get(1),\n                                      intList.get(2), intList.get(3));\n          } else {\n            windows.get(windowID).pen(intList.get(0), intList.get(1),\n                                      intList.get(2));\n          }\n        } else if (func.equals(\"brush\")) {\n          if (intList.size() == 4) {\n            windows.get(windowID).brush(intList.get(0), intList.get(1),\n                                        intList.get(2), intList.get(3));\n          } else {\n            windows.get(windowID).brush(intList.get(0), intList.get(1),\n                                        intList.get(2));\n          }\n        } else if (func.equals(\"textAttributes\")) {\n          windows.get(windowID).textAttributes(stringList.get(0),\n                                               intList.get(0),\n                                               boolList.get(0),\n                                               boolList.get(1),\n                                               boolList.get(2));\n        } else if (func.equals(\"drawText\")) {\n          windows.get(windowID).drawText(intList.get(0), intList.get(1),\n                                         stringList.get(0));\n        } else if (func.equals(\"addMenuBarItem\")) {\n          if (boolList.size() > 0) {\n            windows.get(windowID).addMenuBarItem(stringList.get(0),\n                                                 stringList.get(1),\n               
                                  intList.get(0),\n                                                 boolList.get(0));\n          } else if (intList.size() > 0) {\n            windows.get(windowID).addMenuBarItem(stringList.get(0),\n                                                 stringList.get(1),\n                                                 intList.get(0));\n          } else {\n            windows.get(windowID).addMenuBarItem(stringList.get(0),\n                                                 stringList.get(1));\n          }\n        } else if (func.equals(\"addPopupMenuItem\")) {\n          if (stringList.size() == 4) {\n            windows.get(windowID).addPopupMenuItem(stringList.get(0),\n                                                   stringList.get(1),\n                                                   intList.get(0),\n                                                   stringList.get(2),\n                                                   stringList.get(3));\n          } else {\n             windows.get(windowID).addPopupMenuItem(stringList.get(0),\n                                                    stringList.get(1));\n          }\n        } else if (func.equals(\"update\")) {\n          windows.get(windowID).update();\n        } else if (func.equals(\"showInputDialog\")) {\n          windows.get(windowID).showInputDialog(stringList.get(0));\n        } else if (func.equals(\"showYesNoDialog\")) {\n          windows.get(windowID).showYesNoDialog(stringList.get(0));\n        } else if (func.equals(\"zoomRectangle\")) {\n          windows.get(windowID).zoomRectangle(intList.get(0), intList.get(1),\n                                              intList.get(2), intList.get(3));\n        } else if (func.equals(\"readImage\")) {\n          PImage image = SVImageHandler.readImage(intList.get(2), in);\n          windows.get(windowID).drawImage(image, intList.get(0), intList.get(1));\n        } else if (func.equals(\"drawImage\")) {\n          PImage image 
= new PImage(stringList.get(0));\n          windows.get(windowID).drawImage(image, intList.get(0), intList.get(1));\n        } else if (func.equals(\"destroy\")) {\n          windows.get(windowID).destroy();\n        }\n        // else for forward compatibility purposes, silently ignore any\n        // unrecognized function call.\n      } else {\n        // No colon. Check for create window.\n        if (idStrs[1].startsWith(\"= luajava.newInstance\")) {\n          while (windows.size() <= windowID) {\n            windows.add(null);\n          }\n          windows.set(windowID, new SVWindow(stringList.get(1),\n                                             intList.get(0), intList.get(1),\n                                             intList.get(2), intList.get(3),\n                                             intList.get(4), intList.get(5),\n                                             intList.get(6)));\n        }\n        // else for forward compatibility purposes, silently ignore any\n        // unrecognized function call.\n      }\n    } else if (inputLine.startsWith(\"svmain\")) {\n        // Startup or end. Startup is a lua bind, which is now a no-op.\n        if (inputLine.startsWith(\"svmain:exit\")) {\n          exit();\n        }\n        // else for forward compatibility purposes, silently ignore any\n        // unrecognized function call.\n    }\n    // else for forward compatibility purposes, silently ignore any\n    // unrecognized function call.\n  }\n\n  /** Called from the client to make the server exit. */\n  public static void exit() {\n    System.exit(0);\n  }\n\n  /**\n   * The main function. 
Sets up LUA and the server connection and then calls the\n   * IOLoop.\n   */\n  public static void main(String[] args) {\n    if (args.length > 0) {\n      SERVER_PORT = Integer.parseInt(args[0]);\n    }\n    windows = new ArrayList<SVWindow>(100);\n    intPattern = Pattern.compile(\"[0-9-][0-9]*\");\n    floatPattern = Pattern.compile(\"[0-9-][0-9]*\\\\.[0-9]*\");\n\n    // Open a socket to listen on.\n    try (ServerSocket serverSocket = new ServerSocket(SERVER_PORT)) {\n      System.out.println(\"Socket started on port \" + SERVER_PORT);\n\n      // Wait (blocking) for an incoming connection\n      socket = serverSocket.accept();\n      System.out.println(\"Client connected\");\n\n      // Setup the streams\n      out = new PrintStream(socket.getOutputStream(), true, \"UTF-8\");\n      in =\n          new BufferedReader(new InputStreamReader(socket.getInputStream(),\n              \"UTF8\"));\n    } catch (IOException e) {\n      // Something went wrong and we were unable to set up a connection. This is\n      // pretty much a fatal error.\n      // Note: The server does not get restarted automatically if this happens.\n      e.printStackTrace();\n      System.exit(1);\n    }\n\n    // Enter the main program loop.\n    IOLoop();\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/events/Makefile.am",
    "content": "SUBDIRS =\n\nEXTRA_DIST = \\\n    SVEvent.java SVEventHandler.java \\\n    SVEventType.java\n"
  },
  {
    "path": "java/com/google/scrollview/events/SVEvent.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.events;\n\nimport com.google.scrollview.ui.SVWindow;\n\n/**\n * The SVEvent is a structure which holds the actual values of a message to be\n * transmitted. It corresponds to the client structure defined in scrollview.h\n *\n * @author wanke@google.com\n */\npublic class SVEvent {\n  SVEventType type; // What kind of event.\n  SVWindow window; // Window event relates to.\n  int x; // Coords of click or selection.\n  int y;\n  int xSize; // Size of selection.\n  int ySize;\n  int commandId;\n  String parameter; // Any string that might have been passed as argument.\n\n  /**\n   * A \"normal\" SVEvent.\n   *\n   * @param t The type of the event as specified in SVEventType (e.g.\n   *        SVET_CLICK)\n   * @param w The window the event corresponds to\n   * @param x1 X position of the mouse at the time of the event\n   * @param y1 Y position of the mouse at the time of the event\n   * @param x2 X selection size at the time of the event\n   * @param y2 Y selection size at the time of the event\n   * @param p A parameter associated with the event (e.g. 
keyboard input)\n   */\n  public SVEvent(SVEventType t, SVWindow w, int x1, int y1, int x2, int y2,\n      String p) {\n    type = t;\n    window = w;\n    x = x1;\n    y = y1;\n    xSize = x2;\n    ySize = y2;\n    commandId = 0;\n    parameter = p;\n  }\n\n  /**\n   * An event which issues a command (like clicking on an item in the menubar).\n   *\n   * @param eventtype The type of the event as specified in SVEventType\n   *        (usually SVET_MENU or SVET_POPUP)\n   * @param svWindow The window the event corresponds to\n   * @param commandid The associated id with the command (given by the client\n   *        on construction of the item)\n   * @param value A parameter associated with the event (e.g. keyboard input)\n   */\n  public SVEvent(SVEventType eventtype, SVWindow svWindow, int commandid,\n      String value) {\n    type = eventtype;\n    window = svWindow;\n\n    parameter = value;\n    x = 0;\n    y = 0;\n    xSize = 0;\n    ySize = 0;\n    commandId = commandid;\n  }\n\n  /**\n   * This is the string representation of the message, which is what will\n   * actually be transferred over the network.\n   */\n  @Override\n  public String toString() {\n    return (window.hash + \",\" + type.ordinal() + \",\" + x + \",\" + y + \",\"\n        + xSize + \",\" + ySize + \",\" + commandId + \",\" + parameter);\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/events/SVEventHandler.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.events;\n\nimport com.google.scrollview.ScrollView;\nimport com.google.scrollview.events.SVEvent;\nimport com.google.scrollview.events.SVEventType;\nimport com.google.scrollview.ui.SVWindow;\n\nimport org.piccolo2d.PCamera;\nimport org.piccolo2d.PNode;\nimport org.piccolo2d.event.PBasicInputEventHandler;\nimport org.piccolo2d.event.PInputEvent;\nimport org.piccolo2d.nodes.PPath;\n\nimport java.awt.Color;\nimport java.awt.event.ActionEvent;\nimport java.awt.event.ActionListener;\nimport java.awt.event.KeyEvent;\nimport java.awt.event.KeyListener;\nimport java.awt.event.WindowEvent;\nimport java.awt.event.WindowListener;\nimport java.awt.Window;\n\nimport javax.swing.Timer;\n\n/**\n * The ScrollViewEventHandler takes care of any events which might happen on the\n * canvas and converts them to an according SVEvent, which is (using the\n * processEvent method) then added to a message queue. All events from the\n * message queue get sent gradually.\n *\n * @author wanke@google.com\n */\npublic class SVEventHandler extends PBasicInputEventHandler implements\n    ActionListener, KeyListener, WindowListener {\n\n  /** Necessary to wait for a defined period of time (for SVET_HOVER). */\n  public Timer timer;\n\n  /** The window which the event corresponds to. */\n  private SVWindow svWindow;\n\n  /** These are used to determine a selection size (for SVET_SELECTION). */\n  private int lastX = 0;\n  private int lastY = 0;\n\n  /**\n   * These are used in case we want to transmit our position, but do not get it\n   * because it was no MouseEvent, in particular SVET_HOVER and SVET_INPUT.\n   */\n  private int lastXMove = 0;\n  private int lastYMove = 0;\n\n  /** For Drawing a rubber-band rectangle for selection. */\n  private int startX = 0;\n  private int startY = 0;\n  private float rubberBandTransparency = 0.5f;\n  private PNode selection = null;\n\n  /** The string entered since the last enter. Since the client\n   *  end eats all newlines, we can't use the newline\n   *  character, so use ! for now, as it cannot be entered\n   *  directly anyway and therefore can never show up for real. */\n  private String keyStr = \"!\";\n\n  /** Setup the timer. */\n  public SVEventHandler(SVWindow wdw) {\n    timer = new Timer(1000, this);\n    svWindow = wdw;\n  }\n\n  /**\n   * Store the newest x,y values, add the message to the queue and restart the\n   * timer.\n   */\n  private void processEvent(SVEvent e) {\n    lastXMove = e.x;\n    lastYMove = e.y;\n    ScrollView.addMessage(e);\n    timer.restart();\n  }\n\n  /** Show the associated popup menu at (x,y) (relative position of the window). */\n  private void showPopup(PInputEvent e) {\n    double x = e.getCanvasPosition().getX();\n    double y = e.getCanvasPosition().getY();\n\n    if (svWindow.svPuMenu != null) {\n      svWindow.svPuMenu.show(svWindow, (int) x, (int) y);\n    }\n  }\n\n\n  /** The mouse is clicked - create an SVET_CLICK event. */\n  @Override\n  public void mouseClicked(PInputEvent e) {\n    if (e.isPopupTrigger()) {\n      showPopup(e);\n    } else {\n      processEvent(new SVEvent(SVEventType.SVET_CLICK, svWindow, (int) e\n          .getPosition().getX(), (int) e.getPosition().getY(), 0, 0, null));\n    }\n  }\n\n  /**\n   * The mouse key is pressed (and keeps getting pressed).\n   * Depending on the OS, show a popup menu (if the button pressed is associated\n   * with popup menus, like the RMB under windows&linux) or otherwise save the\n   * position (in case it is a selection).\n   */\n  @Override\n  public void mousePressed(PInputEvent e) {\n    if (e.isPopupTrigger()) {\n      showPopup(e);\n    } else {\n      lastX = (int) e.getPosition().getX();\n      lastY = (int) e.getPosition().getY();\n      timer.restart();\n    }\n  }\n\n  /** The mouse is getting dragged - create an SVET_MOUSE event. */\n  @Override\n  public void mouseDragged(PInputEvent e) {\n    processEvent(new SVEvent(SVEventType.SVET_MOUSE, svWindow, (int) e\n        .getPosition().getX(), (int) e.getPosition().getY(), (int) e\n        .getPosition().getX()\n        - lastX, (int) e.getPosition().getY() - lastY, null));\n\n    // Paint a selection rectangle.\n    if (selection == null) {\n      startX = (int) e.getPosition().getX();\n      startY = (int) e.getPosition().getY();\n      selection = PPath.createRectangle(startX, startY, 1, 1);\n      selection.setTransparency(rubberBandTransparency);\n      svWindow.canvas.getLayer().addChild(selection);\n    } else {\n      int right = Math.max(startX, (int) e.getPosition().getX());\n      int left = Math.min(startX, (int) e.getPosition().getX());\n      int bottom = Math.max(startY, (int) e.getPosition().getY());\n      int top = Math.min(startY, (int) e.getPosition().getY());\n      svWindow.canvas.getLayer().removeChild(selection);\n      selection = PPath.createRectangle(left, top, right - left, bottom - top);\n      selection.setPaint(Color.YELLOW);\n      selection.setTransparency(rubberBandTransparency);\n      svWindow.canvas.getLayer().addChild(selection);\n    }\n  }\n\n  /**\n   * The mouse was released.\n   * Depending on the OS, show a popup menu (if the button pressed is associated\n   * with popup menus, like the RMB under windows&linux) or otherwise create an\n   * SVET_SELECTION event.\n   */\n  @Override\n  public void mouseReleased(PInputEvent e) {\n    if (e.isPopupTrigger()) {\n      showPopup(e);\n    } else {\n      processEvent(new SVEvent(SVEventType.SVET_SELECTION, svWindow, (int) e\n          .getPosition().getX(), (int) e.getPosition().getY(), (int) e\n          .getPosition().getX()\n          - lastX, (int) e.getPosition().getY() - lastY, null));\n    }\n    if (selection != null) {\n      svWindow.canvas.getLayer().removeChild(selection);\n      selection = null;\n    }\n  }\n\n  /**\n   * The mouse wheel is used to zoom in and out of the viewport and center on\n   * the (x,y) position the mouse is currently on.\n   */\n  @Override\n  public void mouseWheelRotated(PInputEvent e) {\n    PCamera lc = svWindow.canvas.getCamera();\n    double sf = SVWindow.SCALING_FACTOR;\n\n    if (e.getWheelRotation() < 0) {\n      sf = 1 / sf;\n    }\n    lc.scaleViewAboutPoint(lc.getScale() / sf, e.getPosition().getX(), e\n        .getPosition().getY());\n  }\n\n  /**\n   * The mouse was moved - create an SVET_MOTION event. NOTE: This obviously\n   * creates a lot of traffic and, depending on the type of application, could\n   * quite possibly be disabled.\n   */\n  @Override\n  public void mouseMoved(PInputEvent e) {\n    processEvent(new SVEvent(SVEventType.SVET_MOTION, svWindow, (int) e\n        .getPosition().getX(), (int) e.getPosition().getY(), 0, 0, null));\n  }\n\n  /**\n   * The mouse entered the window.\n   * Start the timer, which will then emit SVET_HOVER events every X ms. */\n  @Override\n  public void mouseEntered(PInputEvent e) {\n    timer.restart();\n  }\n\n  /**\n   * The mouse exited the window\n   * Stop the timer, so no more SVET_HOVER events will emit. */\n  @Override\n  public void mouseExited(PInputEvent e) {\n    timer.stop();\n  }\n\n  /**\n   * The only associated object with this is the timer, so we use it to send a\n   * SVET_HOVER event.\n   */\n  public void actionPerformed(ActionEvent e) {\n    processEvent(new SVEvent(SVEventType.SVET_HOVER, svWindow, lastXMove,\n        lastYMove, 0, 0, null));\n  }\n\n  /**\n   * A key was pressed - create an SVET_INPUT event.\n   *\n   * NOTE: Might be useful to specify hotkeys.\n   *\n   * Implementation note: The keyListener provided by Piccolo seems to be\n   * broken, so we use the AWT listener directly.\n   * There are never any keyTyped events received either so we are\n   * stuck with physical keys, which is very ugly.\n   */\n  public void keyPressed(KeyEvent e) {\n    char keyCh = e.getKeyChar();\n    if (keyCh == '\\r' || keyCh == '\\n' || keyCh == '\\0' || keyCh == '?') {\n      processEvent(new SVEvent(SVEventType.SVET_INPUT, svWindow, lastXMove,\n                               lastYMove, 0, 0, keyStr));\n      // Send newline characters as '!' as '!' can never be a keypressed\n      // and the client eats all newline characters.\n      keyStr = \"!\";\n    } else {\n      processEvent(new SVEvent(SVEventType.SVET_INPUT, svWindow, lastXMove,\n                               lastYMove, 0, 0, String.valueOf(keyCh)));\n      keyStr += keyCh;\n    }\n  }\n\n  /**\n   * A window is closed (by the 'x') - create an SVET_DESTROY event. If it was\n   * the last open Window, also send an SVET_EXIT event (but do not exit unless\n   * the client says so).\n   */\n  public void windowClosing(WindowEvent e) {\n    processEvent(new SVEvent(SVEventType.SVET_DESTROY, svWindow, lastXMove,\n        lastYMove, 0, 0, null));\n    Window w = e.getWindow();\n    if (w != null) {\n      w.dispose();\n    }\n    SVWindow.nrWindows--;\n    if (SVWindow.nrWindows == 0) {\n      processEvent(new SVEvent(SVEventType.SVET_EXIT, svWindow, lastXMove,\n          lastYMove, 0, 0, null));\n    }\n  }\n\n  /** These are all events we do not care about and throw away. */\n  public void keyReleased(KeyEvent e) {\n  }\n\n  public void keyTyped(KeyEvent e) {\n  }\n\n  public void windowActivated(WindowEvent e) {\n  }\n\n  public void windowClosed(WindowEvent e) {\n  }\n\n  public void windowDeactivated(WindowEvent e) {\n  }\n\n  public void windowDeiconified(WindowEvent e) {\n  }\n\n  public void windowIconified(WindowEvent e) {\n  }\n\n  public void windowOpened(WindowEvent e) {\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/events/SVEventType.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.events;\n\n/**\n * These are the defined events which can happen in ScrollView and be\n * transferred to the client. They are same events as on the client side part of\n * ScrollView (defined in ScrollView.h).\n *\n * @author wanke@google.com\n */\npublic enum SVEventType {\n  SVET_DESTROY, // Window has been destroyed by user.\n  SVET_EXIT, // User has destroyed the last window by clicking on the 'X'\n  SVET_CLICK, // Any button pressed that is not a popup trigger.\n  SVET_SELECTION, // Left button selection.\n  SVET_INPUT, // Any kind of input\n  SVET_MOUSE, // The mouse has moved with a button pressed.\n  SVET_MOTION, // The mouse has moved with no button pressed.\n  SVET_HOVER, // The mouse has stayed still for a second.\n  SVET_POPUP, // A command selected through a popup menu\n  SVET_MENU; // A command selected through the menubar\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/Makefile.am",
    "content": "SUBDIRS =\n\nEXTRA_DIST = \\\n    SVAbstractMenuItem.java \\\n    SVCheckboxMenuItem.java SVEmptyMenuItem.java \\\n    SVImageHandler.java SVMenuBar.java \\\n    SVMenuItem.java SVPopupMenu.java SVSubMenuItem.java SVWindow.java\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVAbstractMenuItem.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\n/**\n * A MenuListItem is any sort of menu entry. This can either be within a popup\n * menu or within a menubar. It can either be a submenu (only name and\n * command-id) or a name with an associated value and possibly description. They\n * can also have new entries added (if they are submenus).\n *\n * @author wanke@google.com\n */\n\nimport com.google.scrollview.events.SVEventType;\n\nimport javax.swing.JMenu;\nimport javax.swing.JMenuItem;\n\nabstract class SVAbstractMenuItem {\n  JMenuItem mi;\n  public String name;\n  public int id;\n\n  /**\n   * Sets the basic attributes for name, id and the corresponding swing item\n   */\n  SVAbstractMenuItem(int id, String name, JMenuItem jmi) {\n    this.mi = jmi;\n    this.name = name;\n    this.id = id;\n  }\n\n  /** Returns the actual value of the MenuListItem. */\n  public String getValue() { return null; }\n\n  /** Adds a child entry to the submenu. */\n  public void add(SVAbstractMenuItem mli) { }\n\n  /** Adds a child menu to the submenu (or root node). */\n  public void add(JMenu jli) { }\n\n  /**\n   * What to do when user clicks on this item.\n   * @param window The window the event happened.\n   * @param eventType What kind of event will be associated\n   * (usually SVET_POPUP or SVET_MENU).\n   */\n  public void performAction(SVWindow window, SVEventType eventType) {}\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVCheckboxMenuItem.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\n/**\n * A MenuListItem is any sort of menu entry. This can either be within a popup\n * menu or within a menubar. It can either be a submenu (only name and\n * command-id) or a name with an associated value and possibly description. They\n * can also have new entries added (if they are submenus).\n *\n * @author wanke@google.com\n */\n\nimport com.google.scrollview.ScrollView;\nimport com.google.scrollview.events.SVEvent;\nimport com.google.scrollview.events.SVEventType;\n\nimport javax.swing.JCheckBoxMenuItem;\n\n/**\n * Constructs a new menulistitem which possesses a flag that can be toggled.\n */\nclass SVCheckboxMenuItem extends SVAbstractMenuItem {\n  public boolean bvalue;\n\n  SVCheckboxMenuItem(int id, String name, boolean val) {\n    super(id, name, new JCheckBoxMenuItem(name, val));\n    bvalue = val;\n  }\n\n  /** What to do when user clicks on this item. */\n  @Override\n  public void performAction(SVWindow window, SVEventType eventType) {\n    // Checkbox entry - trigger and send event.\n    if (bvalue) {\n      bvalue = false;\n    } else {\n      bvalue = true;\n    }\n    SVEvent svme = new SVEvent(eventType, window, id, getValue());\n    ScrollView.addMessage(svme);\n  }\n\n  /** Returns the actual value of the MenuListItem. */\n  @Override\n  public String getValue() {\n    return Boolean.toString(bvalue);\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVEmptyMenuItem.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\n/**\n * A MenuListItem is any sort of menu entry. This can either be within a popup\n * menu or within a menubar. It can either be a submenu (only name and\n * command-id) or a name with an associated value and possibly description. They\n * can also have new entries added (if they are submenus).\n *\n * @author wanke@google.com\n */\n\nimport com.google.scrollview.ScrollView;\nimport com.google.scrollview.events.SVEvent;\nimport com.google.scrollview.events.SVEventType;\n\nimport javax.swing.JMenuItem;\n\n/**\n * Constructs a new menulistitem which just has an ID and a name attached to\n * it. In this case, we will have to ask for the value of the item and its\n * description if it gets called.\n */\nclass SVEmptyMenuItem extends SVAbstractMenuItem {\n  SVEmptyMenuItem(int id, String name) {\n    super(id, name, new JMenuItem(name));\n  }\n  /** What to do when user clicks on this item. */\n  @Override\n  public void performAction(SVWindow window, SVEventType eventType) {\n  // Send an event indicating that someone clicked on an entry.\n  // Value will be null here.\n    SVEvent svme =\n        new SVEvent(eventType, window, id, getValue());\n    ScrollView.addMessage(svme);\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVImageHandler.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\nimport org.piccolo2d.nodes.PImage;\n\nimport java.io.BufferedReader;\nimport java.io.ByteArrayInputStream;\nimport java.io.IOException;\nimport javax.imageio.ImageIO;\nimport javax.xml.bind.DatatypeConverter;\n\n/**\n * The ScrollViewImageHandler is a helper class which takes care of image\n * processing. It is used to construct an Image from the message-stream and\n * basically consists of a number of utility functions to process the input\n * stream.\n *\n * @author wanke@google.com\n */\npublic class SVImageHandler {\n  /* All methods are static, so we forbid to construct SVImageHandler objects. */\n  private SVImageHandler() {\n  }\n\n  /**\n   * Reads size bytes from the stream in and interprets it as an image file,\n   * encoded as png, and then text-encoded as base 64, returning the decoded\n   * bitmap.\n   *\n   * @param size The size of the image file.\n   * @param in The input stream from which to read the bytes.\n   */\n  public static PImage readImage(int size, BufferedReader in) {\n    char[] charbuffer = new char[size];\n    int numRead = 0;\n    while (numRead < size) {\n      int newRead = -1;\n      try {\n        newRead = in.read(charbuffer, numRead, size - numRead);\n      } catch (IOException e) {\n        System.out.println(\"Failed to read image data from socket:\" + e.getMessage());\n        return null;\n      }\n      if (newRead < 0) {\n        return null;\n      }\n      numRead += newRead;\n    }\n    if (numRead != size) {\n        System.out.println(\"Failed to read image data from socket\");\n      return null;\n    }\n    // Convert the character data to binary.\n    byte[] binarydata = DatatypeConverter.parseBase64Binary(new String(charbuffer));\n    // Convert the binary data to a byte stream and parse to image.\n    ByteArrayInputStream byteStream = new ByteArrayInputStream(binarydata);\n    try {\n      PImage img = new PImage(ImageIO.read(byteStream));\n      return img;\n    } catch (IOException e) {\n      System.out.println(\"Failed to decode image data from socket\" + e.getMessage());\n    }\n    return null;\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVMenuBar.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\nimport com.google.scrollview.events.SVEventType;\nimport com.google.scrollview.ui.SVWindow;\n\nimport java.awt.event.ActionEvent;\nimport java.awt.event.ActionListener;\nimport java.util.HashMap;\n\nimport javax.swing.JMenu;\nimport javax.swing.JMenuBar;\n\n/**\n * The SVMenuBar class provides the functionality to add a menubar to\n * ScrollView. Each menubar item gets associated with a (client-defined)\n * command-id, which SVMenuBar will return upon clicking it.\n *\n * @author wanke@google.com\n *\n */\npublic class SVMenuBar implements ActionListener {\n  /** The root entry to add items to. */\n  private JMenuBar root;\n  /** Contains a map of item name to its actual entry. */\n  private HashMap<String, SVAbstractMenuItem> items;\n  /** The window the menubar belongs to. */\n  private SVWindow svWindow;\n\n  /**\n   * Create a new SVMenuBar and place it at the top of the ScrollView window.\n   *\n   * @param scrollView The window our menubar belongs to.\n   */\n  public SVMenuBar(SVWindow scrollView) {\n    root = new JMenuBar();\n    svWindow = scrollView;\n    items = new HashMap<String, SVAbstractMenuItem>();\n    svWindow.setJMenuBar(root);\n  }\n\n\n  /**\n   * A click on one of the items in our menubar has occurred. Forward it\n   * to the item itself to let it decide what happens.\n   */\n  public void actionPerformed(ActionEvent e) {\n    // Get the corresponding menuitem.\n    SVAbstractMenuItem svm = items.get(e.getActionCommand());\n\n    svm.performAction(svWindow, SVEventType.SVET_MENU);\n  }\n\n  /**\n   * Add a new entry to the menubar.\n   *\n   * @param parent The menu we add our new entry to (should have been defined\n   *        before). If the parent is \"\", we will add the entry to the root\n   *        (top-level)\n   * @param name The caption of the new entry.\n   * @param id The Id of the new entry. If it is -1, the entry will be treated\n   *        as a menu.\n   */\n  public void add(String parent, String name, int id) {\n    // A duplicate entry - we just throw it away, since its already in.\n    if (items.get(name) != null) { return; }\n    // A new submenu at the top-level\n    if (parent.equals(\"\")) {\n      JMenu jli = new JMenu(name);\n      SVAbstractMenuItem mli = new SVSubMenuItem(name, jli);\n      items.put(name, mli);\n      root.add(jli);\n    }\n    // A new sub-submenu\n    else if (id == -1) {\n      SVAbstractMenuItem jmi = items.get(parent);\n      JMenu jli = new JMenu(name);\n      SVAbstractMenuItem mli = new SVSubMenuItem(name, jli);\n      items.put(name, mli);\n      jmi.add(jli);\n    }\n    // A new child entry. Add to appropriate parent.\n    else {\n      SVAbstractMenuItem jmi = items.get(parent);\n      if (jmi == null) {\n        System.out.println(\"ERROR: Unknown parent \" + parent);\n        System.exit(1);\n      }\n      SVAbstractMenuItem mli = new SVEmptyMenuItem(id, name);\n      mli.mi.addActionListener(this);\n      items.put(name, mli);\n      jmi.add(mli);\n    }\n  }\n\n  /**\n   * Add a new checkbox entry to the menubar.\n   *\n   * @param parent The menu we add our new entry to (should have been defined\n   *        before). If the parent is \"\", we will add the entry to the root\n   *        (top-level)\n   * @param name The caption of the new entry.\n   * @param id The Id of the new entry. If it is -1, the entry will be treated\n   *        as a menu.\n   * @param b Whether the entry is initially flagged.\n   *\n   */\n\n  public void add(String parent, String name, int id, boolean b) {\n    SVAbstractMenuItem jmi = items.get(parent);\n    if (jmi == null) {\n      System.out.println(\"ERROR: Unknown parent \" + parent);\n      System.exit(1);\n    }\n    SVAbstractMenuItem mli = new SVCheckboxMenuItem(id, name, b);\n    mli.mi.addActionListener(this);\n    items.put(name, mli);\n    jmi.add(mli);\n  }\n\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVMenuItem.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\n/**\n * A MenuListItem is any sort of menu entry. This can either be within a popup\n * menu or within a menubar. It can either be a submenu (only name and\n * command-id) or a name with an associated value and possibly description. They\n * can also have new entries added (if they are submenus).\n *\n * @author wanke@google.com\n */\n\nimport com.google.scrollview.events.SVEventType;\n\nimport javax.swing.JMenuItem;\n\n/**\n * Constructs a new menulistitem which also has a value and a description. For\n * these, we will not have to ask the server what the value is when the user\n * wants to change it, but can just call the client with the new value.\n */\nclass SVMenuItem extends SVAbstractMenuItem {\n  public String value = null;\n  public String desc = null;\n\n  SVMenuItem(int id, String name, String v, String d) {\n    super(id, name, new JMenuItem(name));\n    value = v;\n    desc = d;\n  }\n\n  /**\n   * Ask the user for new input for a variable and send it.\n   * Depending on whether there is a description given for the entry, show\n   * the description in the dialog or just show the name.\n   */\n  @Override\n  public void performAction(SVWindow window, SVEventType eventType) {\n    if (desc != null) {\n      window.showInputDialog(desc, value, id, eventType);\n    } else {\n      window.showInputDialog(name, value, id, eventType);\n    }\n  }\n\n  /** Returns the actual value of the MenuListItem. */\n  @Override\n  public String getValue() {\n    return value;\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVPopupMenu.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\nimport com.google.scrollview.events.SVEventType;\nimport com.google.scrollview.ui.SVMenuItem;\nimport com.google.scrollview.ui.SVWindow;\n\nimport java.awt.Component;\nimport java.awt.event.ActionEvent;\nimport java.awt.event.ActionListener;\nimport java.util.HashMap;\n\nimport javax.swing.JMenu;\nimport javax.swing.JPopupMenu;\n\n/**\n * The SVPopupMenu class provides the functionality to add a popup menu to\n * ScrollView. Each popup menu item gets associated with a (client-defined)\n * command-id, which SVPopupMenu will return upon clicking it.\n *\n * @author wanke@google.com\n *\n */\n\npublic class SVPopupMenu implements ActionListener {\n  /** The root entry to add items to. */\n  private JPopupMenu root;\n  /** Contains a map of item name to its actual entry. */\n  private HashMap<String, SVAbstractMenuItem> items;\n  /** The window the menubar belongs to. */\n  private SVWindow svWindow;\n\n  /**\n   * Create a new SVPopupMenu and associate it with a ScrollView window.\n   *\n   * @param sv The window our popup menu belongs to.\n   */\n  SVPopupMenu(SVWindow sv) {\n    root = new JPopupMenu();\n    svWindow = sv;\n    items = new HashMap<String, SVAbstractMenuItem>();\n  }\n\n  /**\n   * Add a new entry to the menubar. For these items, the server will poll the\n   * client to ask what to do.\n   *\n   * @param parent The menu we add our new entry to (should have been defined\n   *        before). If the parent is \"\", we will add the entry to the root\n   *        (top-level).\n   * @param name The caption of the new entry.\n   * @param id The Id of the new entry. If it is -1, the entry will be treated\n   *        as a menu.\n   */\n  public void add(String parent, String name, int id) {\n    // A duplicate entry - we just throw it away, since its already in.\n    if (items.get(name) != null) { return; }\n    // A new submenu at the top-level.\n    if (parent.equals(\"\")) {\n      JMenu jli = new JMenu(name);\n      SVAbstractMenuItem mli = new SVSubMenuItem(name, jli);\n      items.put(name, mli);\n      root.add(jli);\n    }\n    // A new sub-submenu.\n    else if (id == -1) {\n      SVAbstractMenuItem jmi = items.get(parent);\n      JMenu jli = new JMenu(name);\n      SVAbstractMenuItem mli = new SVSubMenuItem(name, jli);\n      items.put(name, mli);\n      jmi.add(jli);\n    }\n    // A new child entry. Add to appropriate parent.\n    else {\n      SVAbstractMenuItem jmi = items.get(parent);\n      if (jmi == null) {\n        System.out.println(\"ERROR: Unknown parent \" + parent);\n        System.exit(1);\n      }\n      SVAbstractMenuItem mli = new SVEmptyMenuItem(id, name);\n      mli.mi.addActionListener(this);\n      items.put(name, mli);\n      jmi.add(mli);\n    }\n  }\n\n  /**\n   * Add a new entry to the menubar. In this case, we also know its value and\n   * possibly even have a description. For these items, the server will not poll\n   * the client to ask what to do, but just show an input dialog and send a\n   * message with the new value.\n   *\n   * @param parent The menu we add our new entry to (should have been defined\n   *        before). If the parent is \"\", we will add the entry to the root\n   *        (top-level).\n   * @param name The caption of the new entry.\n   * @param id The Id of the new entry. If it is -1, the entry will be treated\n   *        as a menu.\n   * @param value The value of the new entry.\n   * @param desc The description of the new entry.\n   */\n  public void add(String parent, String name, int id, String value, String desc) {\n    SVAbstractMenuItem jmi = items.get(parent);\n    SVMenuItem mli = new SVMenuItem(id, name, value, desc);\n    mli.mi.addActionListener(this);\n    items.put(name, mli);\n    if (jmi == null) { // add to root\n      root.add(mli.mi);\n    } else { // add to parent\n      jmi.add(mli);\n    }\n  }\n\n\n\n  /**\n   * A click on one of the items in our menubar has occurred. Forward it\n   * to the item itself to let it decide what happens.\n   */\n  public void actionPerformed(ActionEvent e) {\n\n    // Get the corresponding menuitem\n    SVAbstractMenuItem svm = items.get(e.getActionCommand());\n\n   svm.performAction(svWindow, SVEventType.SVET_POPUP);\n  }\n\n  /**\n   * Gets called by the SVEventHandler of the window to actually show the\n   * content of the popup menu.\n   */\n  public void show(Component Invoker, int x, int y) {\n    root.show(Invoker, x, y);\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVSubMenuItem.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\n/**\n * A MenuListItem is any sort of menu entry. This can either be within a popup\n * menu or within a menubar. It can either be a submenu (only name and\n * command-id) or a name with an associated value and possibly description. They\n * can also have new entries added (if they are submenus).\n *\n * @author wanke@google.com\n */\n\nimport javax.swing.JMenu;\n\n/** Constructs a new submenu which can hold other entries. */\nclass SVSubMenuItem extends SVAbstractMenuItem {\n  public SVSubMenuItem(String name, JMenu jli) {\n    super(-1, name, jli);\n  }\n  /** Adds a child entry to the submenu. */\n  @Override\n  public void add(SVAbstractMenuItem mli) {\n    mi.add(mli.mi);\n  }\n  /** Adds a child menu to the submenu (or root node). */\n  @Override\n  public void add(JMenu jli) {\n    mi.add(jli);\n  }\n}\n"
  },
  {
    "path": "java/com/google/scrollview/ui/SVWindow.java",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); You may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by\n// applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied. See the License for the specific\n// language governing permissions and limitations under the License.\n\npackage com.google.scrollview.ui;\n\nimport com.google.scrollview.ScrollView;\nimport com.google.scrollview.events.SVEvent;\nimport com.google.scrollview.events.SVEventHandler;\nimport com.google.scrollview.events.SVEventType;\nimport com.google.scrollview.ui.SVMenuBar;\nimport com.google.scrollview.ui.SVPopupMenu;\n\nimport org.piccolo2d.PCamera;\nimport org.piccolo2d.PCanvas;\nimport org.piccolo2d.PLayer;\nimport org.piccolo2d.extras.swing.PScrollPane;\nimport org.piccolo2d.nodes.PImage;\nimport org.piccolo2d.nodes.PPath;\nimport org.piccolo2d.nodes.PText;\nimport org.piccolo2d.util.PPaintContext;\n\nimport java.awt.BasicStroke;\nimport java.awt.BorderLayout;\nimport java.awt.Color;\nimport java.awt.Font;\nimport java.awt.GraphicsEnvironment;\nimport java.awt.Rectangle;\nimport java.awt.TextArea;\nimport java.awt.geom.IllegalPathStateException;\nimport java.util.regex.Matcher;\nimport java.util.regex.Pattern;\n\nimport javax.swing.JFrame;\nimport javax.swing.JOptionPane;\nimport javax.swing.SwingUtilities;\nimport javax.swing.WindowConstants;\n\n/**\n * The SVWindow is the top-level ui class. It should get instantiated whenever\n * the user intends to create a new window. 
It contains helper functions to draw\n * on the canvas, add new menu items, show modal dialogs etc.\n *\n * @author wanke@google.com\n */\npublic class SVWindow extends JFrame {\n  /**\n   * Constants defining the maximum initial size of the window.\n   */\n  private static final int MAX_WINDOW_X = 1000;\n  private static final int MAX_WINDOW_Y = 800;\n\n  /* Constant defining the (approx) height of the default message box*/\n  private static final int DEF_MESSAGEBOX_HEIGHT = 200;\n\n  /** Constant defining the \"speed\" at which to zoom in and out. */\n  public static final double SCALING_FACTOR = 2;\n\n  /** The top level layer we add our PNodes to (root node). */\n  PLayer layer;\n\n  /** The current color of the pen. It is used to draw edges, text, etc. */\n  Color currentPenColor;\n\n  /**\n   * The current color of the brush. It is used to draw the interior of\n   * primitives.\n   */\n  Color currentBrushColor;\n\n  /** The system name of the current font we are using (e.g.\n   *  \"Times New Roman\"). */\n  Font currentFont;\n\n  /** The stroke width to be used. */\n  // This really needs to be a fixed width stroke as the basic stroke is\n  // anti-aliased and gets too faint, but the piccolo fixed width stroke\n  // is too buggy and generates missing initial moveto in path definition\n  // errors with an IllegalPathStateException that cannot be caught because\n  // it is in the automatic repaint function. If we can fix the exceptions\n  // in piccolo, then we can use the following instead of BasicStroke:\n  //   import edu.umd.cs.piccolox.util.PFixedWidthStroke;\n  //   PFixedWidthStroke stroke = new PFixedWidthStroke(0.5f);\n  // Instead we use the BasicStroke and turn off anti-aliasing.\n  BasicStroke stroke = new BasicStroke(0.5f);\n\n  /**\n   * A unique representation for the window, also known by the client. 
It is\n   * used when sending messages from server to client to identify him.\n   */\n  public int hash;\n\n  /**\n   * The total number of created Windows. If this ever reaches 0 (apart from the\n   * beginning), quit the server.\n   */\n  public static int nrWindows = 0;\n\n  /**\n   * The Canvas, MessageBox, EventHandler, Menubar and Popupmenu associated with\n   * this window.\n   */\n  private SVEventHandler svEventHandler = null;\n  private SVMenuBar svMenuBar = null;\n  private TextArea ta = null;\n  public SVPopupMenu svPuMenu = null;\n  public PCanvas canvas;\n  private int winSizeX;\n  private int winSizeY;\n\n  /** Set the brush to an RGB color */\n  public void brush(int red, int green, int blue) {\n    brush(red, green, blue, 255);\n  }\n\n  /** Set the brush to an RGBA color */\n  public void brush(int red, int green, int blue, int alpha) {\n    // If alpha is zero, use a null brush to save rendering time.\n    if (alpha == 0) {\n      currentBrushColor = null;\n    } else {\n      currentBrushColor = new Color(red, green, blue, alpha);\n    }\n  }\n\n  /** Erase all content from the window, but do not destroy it. */\n  public void clear() {\n    // Manipulation of Piccolo's scene graph should be done from Swings\n    // event dispatch thread since Piccolo is not thread safe. This code calls\n    // removeAllChildren() from that thread and releases the latch.\n    final java.util.concurrent.CountDownLatch latch = new java.util.concurrent.CountDownLatch(1);\n    SwingUtilities.invokeLater(new Runnable() {\n      public void run() {\n        layer.removeAllChildren();\n        repaint();\n        latch.countDown();\n      }\n    });\n    try {\n      latch.await();\n    } catch (InterruptedException e) {\n    }\n  }\n\n  /**\n   * Start setting up a new polyline. 
The server will now expect\n   * polyline data until the polyline is complete.\n   *\n   * @param length number of coordinate pairs\n   */\n  public void createPolyline(int length) {\n    ScrollView.polylineXCoords = new float[length];\n    ScrollView.polylineYCoords = new float[length];\n    ScrollView.polylineSize = length;\n    ScrollView.polylineScanned = 0;\n  }\n\n  /**\n   * Draw the now complete polyline.\n   */\n  public void drawPolyline() {\n    int numCoords = ScrollView.polylineXCoords.length;\n    if (numCoords < 2) {\n      return;\n    }\n    PPath pn = PPath.createLine(ScrollView.polylineXCoords[0],\n                                ScrollView.polylineYCoords[0],\n                                ScrollView.polylineXCoords[1],\n                                ScrollView.polylineYCoords[1]);\n    pn.reset();\n    pn.moveTo(ScrollView.polylineXCoords[0], ScrollView.polylineYCoords[0]);\n    for (int p = 1; p < numCoords; ++p) {\n      pn.lineTo(ScrollView.polylineXCoords[p], ScrollView.polylineYCoords[p]);\n    }\n    pn.closePath();\n    ScrollView.polylineSize = 0;\n    pn.setStrokePaint(currentPenColor);\n    pn.setPaint(null);  // Don't fill the polygon - this is just a polyline.\n    pn.setStroke(stroke);\n    layer.addChild(pn);\n  }\n\n  /**\n   * Construct a new SVWindow and set it visible.\n   *\n   * @param name Title of the window.\n   * @param hash Unique internal representation. 
This has to be the same as\n   *        defined by the client, as they use this to refer to the windows.\n   * @param posX X position of where to draw the window (upper left).\n   * @param posY Y position of where to draw the window (upper left).\n   * @param sizeX The width of the window.\n   * @param sizeY The height of the window.\n   * @param canvasSizeX The canvas width of the window.\n   * @param canvasSizeY The canvas height of the window.\n   */\n  public SVWindow(String name, int hash, int posX, int posY, int sizeX,\n                  int sizeY, int canvasSizeX, int canvasSizeY) {\n    super(name);\n\n    // Provide defaults for sizes.\n    if (sizeX <= 0) sizeX = canvasSizeX;\n    if (sizeY <= 0) sizeY = canvasSizeY;\n    if (canvasSizeX <= 0) canvasSizeX = sizeX;\n    if (canvasSizeY <= 0) canvasSizeY = sizeY;\n\n    // Avoid later division by zero.\n    if (sizeX <= 0) {\n      sizeX = 1;\n      canvasSizeX = sizeX;\n    }\n    if (sizeY <= 0) {\n      sizeY = 1;\n      canvasSizeY = sizeY;\n    }\n\n    // Initialize variables\n    nrWindows++;\n    this.hash = hash;\n    this.svEventHandler = new SVEventHandler(this);\n    this.currentPenColor = Color.BLACK;\n    this.currentBrushColor = Color.BLACK;\n    this.currentFont = new Font(\"Times New Roman\", Font.PLAIN, 12);\n\n    // Determine the initial size and zoom factor of the window.\n    // If the window is too big, rescale it and zoom out.\n    int shrinkfactor = 1;\n\n    if (sizeX > MAX_WINDOW_X) {\n      shrinkfactor = (sizeX + MAX_WINDOW_X - 1) / MAX_WINDOW_X;\n    }\n    if (sizeY / shrinkfactor > MAX_WINDOW_Y) {\n      shrinkfactor = (sizeY + MAX_WINDOW_Y - 1) / MAX_WINDOW_Y;\n    }\n    winSizeX = sizeX / shrinkfactor;\n    winSizeY = sizeY / shrinkfactor;\n    double initialScalingfactor = 1.0 / shrinkfactor;\n    if (winSizeX > canvasSizeX || winSizeY > canvasSizeY) {\n      initialScalingfactor = Math.min(1.0 * winSizeX / canvasSizeX,\n                                      1.0 * 
winSizeY / canvasSizeY);\n    }\n\n    // Setup the actual window (its size, camera, title, etc.)\n    if (canvas == null) {\n      canvas = new PCanvas();\n      getContentPane().add(canvas, BorderLayout.CENTER);\n    }\n\n    layer = canvas.getLayer();\n    canvas.setBackground(Color.BLACK);\n\n    // Disable antialiasing to make the lines more visible.\n    canvas.setDefaultRenderQuality(PPaintContext.LOW_QUALITY_RENDERING);\n\n    setLayout(new BorderLayout());\n\n    setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE);\n\n    validate();\n    canvas.requestFocus();\n\n    // Manipulation of Piccolo's scene graph should be done from Swings\n    // event dispatch thread since Piccolo is not thread safe. This code calls\n    // initialize() from that thread once the PFrame is initialized, so you are\n    // safe to start working with Piccolo in the initialize() method.\n    SwingUtilities.invokeLater(new Runnable() {\n      public void run() {\n        repaint();\n      }\n    });\n\n    setSize(winSizeX, winSizeY);\n    setLocation(posX, posY);\n    setTitle(name);\n\n    // Add a Scrollpane to be able to scroll within the canvas\n    PScrollPane scrollPane = new PScrollPane(canvas);\n    getContentPane().add(scrollPane);\n    scrollPane.setWheelScrollingEnabled(false);\n    PCamera lc = canvas.getCamera();\n    lc.scaleViewAboutPoint(initialScalingfactor, 0, 0);\n\n    // Disable the default event handlers and add our own.\n    addWindowListener(svEventHandler);\n    canvas.removeInputEventListener(canvas.getPanEventHandler());\n    canvas.removeInputEventListener(canvas.getZoomEventHandler());\n    canvas.addInputEventListener(svEventHandler);\n    canvas.addKeyListener(svEventHandler);\n\n    // Make the window visible.\n    validate();\n    setVisible(true);\n\n  }\n\n  /**\n   * Convenience function to add a message box to the window which can be used\n   * to output debug information.\n   */\n  public void addMessageBox() {\n    if (ta == null) {\n  
    ta = new TextArea();\n      ta.setEditable(false);\n      getContentPane().add(ta, BorderLayout.SOUTH);\n    }\n    // We need to make the window bigger to accommodate the message box.\n    winSizeY += DEF_MESSAGEBOX_HEIGHT;\n    setSize(winSizeX, winSizeY);\n  }\n\n  /**\n   * Allows you to specify the thickness with which to draw lines, recantgles\n   * and ellipses.\n   * @param width The new thickness.\n   */\n  public void setStrokeWidth(float width) {\n    // If this worked we wouldn't need the antialiased rendering off.\n    // stroke = new PFixedWidthStroke(width);\n    stroke = new BasicStroke(width);\n  }\n\n  /**\n   * Draw an ellipse at (x,y) with given width and height, using the\n   * current stroke, the current brush color to fill it and the\n   * current pen color for the outline.\n   */\n  public void drawEllipse(int x, int y, int width, int height) {\n    PPath pn = PPath.createEllipse(x, y, width, height);\n    pn.setStrokePaint(currentPenColor);\n    pn.setStroke(stroke);\n    pn.setPaint(currentBrushColor);\n    layer.addChild(pn);\n  }\n\n  /**\n   * Draw the image with the given name at (x,y). 
Any image loaded stays in\n   * memory, so if you intend to redraw an image, you do not have to use\n   * createImage again.\n   */\n  public void drawImage(PImage img, int xPos, int yPos) {\n    img.setX(xPos);\n    img.setY(yPos);\n    layer.addChild(img);\n  }\n\n  /**\n   * Draw a line from (x1,y1) to (x2,y2) using the current pen color and stroke.\n   */\n  public void drawLine(int x1, int y1, int x2, int y2) {\n    PPath pn = PPath.createLine(x1, y1, x2, y2);\n    pn.setStrokePaint(currentPenColor);\n    pn.setPaint(null);  // Null paint may render faster than the default.\n    pn.setStroke(stroke);\n    pn.moveTo(x1, y1);\n    pn.lineTo(x2, y2);\n    layer.addChild(pn);\n  }\n\n  /**\n   * Draw a rectangle given the two points (x1,y1) and (x2,y2) using the current\n   * stroke, pen color for the border and the brush to fill the\n   * interior.\n   */\n  public void drawRectangle(int x1, int y1, int x2, int y2) {\n\n    if (x1 > x2) {\n      int t = x1;\n      x1 = x2;\n      x2 = t;\n    }\n    if (y1 > y2) {\n      int t = y1;\n      y1 = y2;\n      y2 = t;\n    }\n\n    PPath pn = PPath.createRectangle(x1, y1, x2 - x1, y2 - y1);\n    pn.setStrokePaint(currentPenColor);\n    pn.setStroke(stroke);\n    pn.setPaint(currentBrushColor);\n    layer.addChild(pn);\n  }\n\n  /**\n   * Draw some text at (x,y) using the current pen color and text attributes. 
If\n   * the current font does NOT support at least one character, it tries to find\n   * a font which is capable of displaying it and use that to render the text.\n   * Note: If the font says it can render a glyph, but in reality it turns out\n   * to be crap, there is nothing we can do about it.\n   */\n  public void drawText(int x, int y, String text) {\n    int unreadableCharAt = -1;\n    char[] chars = text.toCharArray();\n    PText pt = new PText(text);\n    pt.setTextPaint(currentPenColor);\n    pt.setFont(currentFont);\n\n    // Check to see if every character can be displayed by the current font.\n    for (int i = 0; i < chars.length; i++) {\n      if (!currentFont.canDisplay(chars[i])) {\n        // Set to the first not displayable character.\n        unreadableCharAt = i;\n        break;\n      }\n    }\n\n    // Have to find some working font and use it for this text entry.\n    if (unreadableCharAt != -1) {\n      Font[] allfonts =\n          GraphicsEnvironment.getLocalGraphicsEnvironment().getAllFonts();\n      for (int j = 0; j < allfonts.length; j++) {\n        if (allfonts[j].canDisplay(chars[unreadableCharAt])) {\n          Font tempFont =\n              new Font(allfonts[j].getFontName(), currentFont.getStyle(),\n                  currentFont.getSize());\n          pt.setFont(tempFont);\n          break;\n        }\n      }\n    }\n\n    pt.setX(x);\n    pt.setY(y);\n    layer.addChild(pt);\n  }\n\n  /** Set the pen color to an RGB value */\n  public void pen(int red, int green, int blue) {\n    pen(red, green, blue, 255);\n  }\n\n  /** Set the pen color to an RGBA value */\n  public void pen(int red, int green, int blue, int alpha) {\n    currentPenColor = new Color(red, green, blue, alpha);\n  }\n\n  /**\n   * Define how to display text. 
Note: underlined is not currently not supported\n   */\n  public void textAttributes(String font, int pixelSize, boolean bold,\n      boolean italic, boolean underlined) {\n\n    // For legacy reasons convert \"Times\" to \"Times New Roman\"\n    if (font.equals(\"Times\")) {\n      font = \"Times New Roman\";\n    }\n\n    int style = Font.PLAIN;\n    if (bold) {\n      style += Font.BOLD;\n    }\n    if (italic) {\n      style += Font.ITALIC;\n    }\n    currentFont = new Font(font, style, pixelSize);\n  }\n\n  /**\n   * Zoom the window to the rectangle given the two points (x1,y1)\n   * and (x2,y2), which must be greater than (x1,y1).\n   */\n  public void zoomRectangle(int x1, int y1, int x2, int y2) {\n    if (x2 > x1 && y2 > y1) {\n      winSizeX = getWidth();\n      winSizeY = getHeight();\n      int width = x2 - x1;\n      int height = y2 - y1;\n      // Since piccolo doesn't do this well either, pad with a margin\n      // all the way around.\n      int wmargin = width / 2;\n      int hmargin = height / 2;\n      double scalefactor = Math.min(winSizeX / (2.0 * wmargin + width),\n                                    winSizeY / (2.0 * hmargin + height));\n      PCamera lc = canvas.getCamera();\n      lc.scaleView(scalefactor / lc.getViewScale());\n      lc.animateViewToPanToBounds(new Rectangle(x1 - hmargin, y1 - hmargin,\n                                                2 * wmargin + width,\n                                                2 * hmargin + height), 0);\n    }\n  }\n\n  /**\n   * Flush buffers and update display.\n   *\n   * Only actually reacts if there are no more messages in the stack, to prevent\n   * the canvas from flickering.\n   */\n  public void update() {\n    // TODO(rays) fix bugs in piccolo or use something else.\n    // The repaint function generates many\n    // exceptions for no good reason. 
We catch and ignore as many as we\n    // can here, but most of them are generated by the system repaints\n    // caused by resizing/exposing parts of the window etc, and they\n    // generate unwanted stack traces that have to be piped to /dev/null\n    // (on linux).\n    try {\n      repaint();\n    } catch (NullPointerException e) {\n      // Do nothing so the output isn't full of stack traces.\n    } catch (IllegalPathStateException e) {\n      // Do nothing so the output isn't full of stack traces.\n    }\n  }\n\n  /** Adds a checkbox entry to the menubar, c.f. SVMenubar.add(...) */\n  public void addMenuBarItem(String parent, String name, int id,\n                             boolean checked) {\n    svMenuBar.add(parent, name, id, checked);\n  }\n\n  /** Adds a submenu to the menubar, c.f. SVMenubar.add(...) */\n  public void addMenuBarItem(String parent, String name) {\n    addMenuBarItem(parent, name, -1);\n  }\n\n  /** Adds a new entry to the menubar, c.f. SVMenubar.add(...) */\n  public void addMenuBarItem(String parent, String name, int id) {\n    if (svMenuBar == null) {\n      svMenuBar = new SVMenuBar(this);\n\n    }\n    svMenuBar.add(parent, name, id);\n  }\n\n  /** Add a message to the message box. 
*/\n  public void addMessage(String message) {\n    if (ta != null) {\n      ta.append(message + \"\\n\");\n    } else {\n      System.out.println(message + \"\\n\");\n    }\n  }\n\n  /**\n   * This method converts a string which might contain hexadecimal values to a\n   * string which contains the respective unicode counterparts.\n   *\n   * For example, Hall0x0094chen returns Hall<o umlaut>chen\n   * encoded as utf8.\n   *\n   * @param input The original string, containing 0x values\n   * @return The converted string which has the replaced unicode symbols\n   */\n  private static String convertIntegerStringToUnicodeString(String input) {\n    StringBuffer sb = new StringBuffer(input);\n    Pattern numbers = Pattern.compile(\"0x[0-9a-fA-F]{4}\");\n    Matcher matcher = numbers.matcher(sb);\n\n    while (matcher.find()) {\n      // Find the next match which resembles a hexadecimal value and convert it\n      // to\n      // its char value\n      char a = (char) (Integer.decode(matcher.group()).intValue());\n\n      // Replace the original with the new character\n      sb.replace(matcher.start(), matcher.end(), String.valueOf(a));\n\n      // Start again, since our positions have switched\n      matcher.reset();\n    }\n    return sb.toString();\n  }\n\n  /**\n   * Show a modal input dialog. 
The answer by the dialog is then send to the\n   * client, together with the associated menu id, as SVET_POPUP\n   *\n   * @param msg The text that is displayed in the dialog.\n   * @param def The default value of the dialog.\n   * @param id The associated commandId\n   * @param evtype The event this is associated with (usually SVET_MENU\n   * or SVET_POPUP)\n   */\n  public void showInputDialog(String msg, String def, int id,\n                              SVEventType evtype) {\n    svEventHandler.timer.stop();\n    String tmp =\n        (String) JOptionPane.showInputDialog(this, msg, \"\",\n            JOptionPane.QUESTION_MESSAGE, null, null, def);\n\n    if (tmp != null) {\n      tmp = convertIntegerStringToUnicodeString(tmp);\n      SVEvent res = new SVEvent(evtype, this, id, tmp);\n      ScrollView.addMessage(res);\n    }\n    svEventHandler.timer.restart();\n  }\n\n\n  /**\n   * Shows a modal input dialog to the user. The return value is automatically\n   * sent to the client as SVET_INPUT event (with command id -1).\n   *\n   * @param msg The text of the dialog.\n   */\n  public void showInputDialog(String msg) {\n    showInputDialog(msg, null, -1, SVEventType.SVET_INPUT);\n  }\n\n  /**\n   * Shows a dialog presenting \"Yes\" and \"No\" as answers and returns either a\n   * \"y\" or \"n\" to the client.\n   *\n   * Closing the dialog without answering is handled like \"No\".\n   *\n   * @param msg The text that is displayed in the dialog.\n   */\n  public void showYesNoDialog(String msg) {\n    // res returns 0 on yes, 1 on no. Seems to be a bit counterintuitive\n    int res =\n        JOptionPane.showOptionDialog(this, msg, \"\", JOptionPane.YES_NO_OPTION,\n            JOptionPane.QUESTION_MESSAGE, null, null, null);\n\n    SVEvent e = new SVEvent(SVEventType.SVET_INPUT, this, 0, 0, 0, 0,\n                            res == 0 ? \"y\" : \"n\");\n    ScrollView.addMessage(e);\n  }\n\n  /** Adds a submenu to the popup menu, c.f. SVPopupMenu.add(...) 
*/\n  public void addPopupMenuItem(String parent, String name) {\n    if (svPuMenu == null) {\n      svPuMenu = new SVPopupMenu(this);\n    }\n    svPuMenu.add(parent, name, -1);\n  }\n\n  /** Adds a new menu entry to the popup menu, c.f. SVPopupMenu.add(...) */\n  public void addPopupMenuItem(String parent, String name, int cmdEvent,\n      String value, String desc) {\n    if (svPuMenu == null) {\n      svPuMenu = new SVPopupMenu(this);\n    }\n    svPuMenu.add(parent, name, cmdEvent, value, desc);\n  }\n\n  /** Destroys a window. */\n  public void destroy() {\n    ScrollView.addMessage(new SVEvent(SVEventType.SVET_DESTROY, this, 0,\n        \"SVET_DESTROY\"));\n    setVisible(false);\n    // dispose();\n  }\n}\n"
  },
  {
    "path": "nsis/Makefile.am",
    "content": "AUTOMAKE_OPTIONS = subdir-objects\n\nall:\n\nif MINGW\n\ngitrev=\"$(shell git --git-dir=${abs_top_srcdir}/.git --work-tree=${abs_top_srcdir} describe --always --tags | sed s/^v//)\"\n\n.PHONY: winsetup\n\nPlugins/x86-unicode/INetC.dll:\n\tcurl -OsS https://nsis.sourceforge.io/mediawiki/images/c/c9/Inetc.zip\n\tunzip Inetc.zip $@\n\nwinpath.exe: winpath.cpp\n\tx86_64-w64-mingw32-g++ -Os -o $@ $<\n\tx86_64-w64-mingw32-strip --strip-unneeded $@\n\nwinsetup: Plugins/x86-unicode/INetC.dll winpath.exe\n\tmakensis -DCROSSBUILD -DSHARED -DSIGNCODE=$(SIGNCODE) -DSRCDIR=$(top_srcdir) -DVERSION=${gitrev} $(shell test \"$(host_cpu)\" = x86_64 && echo \"-DW64\") -NOCD $(top_srcdir)/nsis/tesseract.nsi\n\nendif\n"
  },
  {
    "path": "nsis/build.sh",
    "content": "#!/bin/bash\n\n# GitHub actions - Create Tesseract installer for Windows\n\n# Author: Stefan Weil (2010-2024)\n\nset -e\nset -x\n\nLANG=C.UTF-8\n\nARCH=$1\n\nif [ \"$ARCH\" = \"i686\" ]; then\n  MINGW=/mingw32\nelse\n  ARCH=x86_64\n  MINGW=/mingw64\nfi\n\nROOTDIR=$PWD\nDISTDIR=$ROOTDIR/dist\nHOST=$ARCH-w64-mingw32\nTAG=$(cat VERSION).$(date +%Y%m%d)\nBUILDDIR=bin/ndebug/$HOST-$TAG\nPKG_ARCH=mingw-w64-${ARCH/_/-}\n\n# Install packages.\nsudo apt-get update --quiet\nsudo apt-get install --assume-yes --no-install-recommends --quiet \\\n  asciidoc curl xsltproc docbook-xml docbook-xsl \\\n  automake dpkg-dev libtool pkg-config default-jdk-headless \\\n  mingw-w64-tools nsis g++-\"$PKG_ARCH\" \\\n  makepkg pacman-package-manager python3-venv unzip\n\n# Configure pacman.\n\n# Enable mirrorlist.\nsudo sed -Ei 's/^#.*(Include.*mirrorlist)/\\1/' /etc/pacman.conf\n(\n# Add msys key for pacman.\ncd /usr/share/keyrings\nsudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-keyring/master/msys2.gpg\nsudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-keyring/master/msys2-revoked\nsudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-keyring/master/msys2-trusted\n)\n(\n# Add active environments for pacman.\n# See https://www.msys2.org/docs/repos-mirrors/.\nsudo mkdir -p /etc/pacman.d\ncd /etc/pacman.d\ncat <<eod | sudo tee mirrorlist >/dev/null\n[mingw64]\nInclude = /etc/pacman.d/mirrorlist.mingw\neod\nsudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-packages/master/pacman-mirrors/mirrorlist.mingw\n# sudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-packages/master/pacman-mirrors/mirrorlist.msys\n)\n\nsudo pacman-key --init\nsudo pacman-key --populate msys2\nsudo pacman -Syu --noconfirm\n\n# Install required pacman packages.\nsudo pacman -S --noconfirm \\\n mingw-w64-x86_64-curl-winssl \\\n mingw-w64-x86_64-giflib \\\n mingw-w64-x86_64-icu \\\n mingw-w64-x86_64-leptonica \\\n mingw-w64-x86_64-libarchive \\\n 
mingw-w64-x86_64-libidn2 \\\n mingw-w64-x86_64-openjpeg2 \\\n mingw-w64-x86_64-openssl \\\n mingw-w64-x86_64-pango \\\n mingw-w64-x86_64-libpng \\\n mingw-w64-x86_64-libtiff \\\n mingw-w64-x86_64-libwebp\n\ngit config --global user.email \"sw@weilnetz.de\"\ngit config --global user.name \"Stefan Weil\"\ngit tag -a \"v$TAG\" -m \"Tesseract $TAG\"\n\n# Run autogen.\n./autogen.sh\n\n# Build Tesseract installer.\nmkdir -p \"$BUILDDIR\" && cd \"$BUILDDIR\"\n\n# Run configure.\nPKG_CONFIG_PATH=$MINGW/lib/pkgconfig\nexport PKG_CONFIG_PATH\n# Disable OpenMP (see https://github.com/tesseract-ocr/tesseract/issues/1662).\n../../../configure --disable-openmp --host=\"$HOST\" --prefix=\"/usr/$HOST\" \\\n  CXX=\"$HOST-g++-posix\" \\\n  CXXFLAGS=\"-fno-math-errno -Wall -Wextra -Wpedantic -g -O2 -isystem $MINGW/include\" \\\n  LDFLAGS=\"-L$MINGW/lib\"\n\nmake all training\nMINGW_INSTALL=${PWD}${MINGW}\nmake install-jars install training-install html prefix=\"$MINGW_INSTALL\" INSTALL_STRIP_FLAG=-s\ntest -d venv || python3 -m venv venv\nsource venv/bin/activate\npip install pefile\nmkdir -p dll\nln -sv $(\"$ROOTDIR/nsis/find_deps.py\" \"$MINGW_INSTALL\"/bin/*.exe \"$MINGW_INSTALL\"/bin/*.dll) dll/\nln -svf /usr/lib/gcc/x86_64-w64-mingw32/*-win32/libstdc++-6.dll dll/\nln -svf /usr/lib/gcc/x86_64-w64-mingw32/*-win32/libgcc_s_seh-1.dll dll/\nmake winsetup prefix=\"$MINGW_INSTALL\"\n\n# Copy result for upload.\nmkdir -p \"$DISTDIR\" && cp nsis/tesseract-ocr-w*-setup-*.exe \"$DISTDIR\"\n"
  },
  {
    "path": "nsis/find_deps.py",
    "content": "#!/usr/bin/env python3\n#\n# Copyright (C) 2024 Stefan Weil\n#\n# SPDX-License-Identifier: MIT\n#\n# Find the DLL files which are required for a given set of\n# Windows executables and libraries.\n\nimport argparse\nimport os\nimport pefile\n\nVERBOSE = False\n\ndef find_dependencies(binary, search_path, analyzed_deps):\n    pe = pefile.PE(binary)\n    pe.parse_data_directories()\n    if VERBOSE:\n        print(f'{binary}:')\n    # print(pe.dump_info())\n\n    for entry in pe.DIRECTORY_ENTRY_IMPORT:\n        name = entry.dll.decode('utf-8')\n        if name in analyzed_deps:\n            if VERBOSE:\n                print(f'skip {name} (already analyzed)')\n            continue\n        analyzed_deps.add(name)\n        fullpath = os.path.join(search_path, name)\n        if not os.path.exists(fullpath):\n            # Not found, maybe system DLL. Skip it.\n            if VERBOSE:\n                print(f'skip {name} (not found, maybe system DLL)')\n            continue\n        print(fullpath)\n        analyzed_deps = find_dependencies(fullpath, search_path, analyzed_deps)\n\n    return analyzed_deps\n\ndef main():\n    \"\"\"\n    Command-line interface for universal dependency scanner.\n    \"\"\"\n\n    parser = argparse.ArgumentParser(description='Find and copy DLL dependencies')\n    parser.add_argument('files', nargs='+', help='Paths to executable or library files')\n    parser.add_argument('--dlldir', dest='dlldir', default='/mingw64/bin/',\n                        help='path to dll files')\n\n    args = parser.parse_args()\n\n    # try:\n    # Find dependencies\n    analyzed_deps = set()\n    for binary in args.files:\n        if True:\n            analyzed_deps = find_dependencies(binary, args.dlldir, analyzed_deps)\n        # except:\n        #    print(f'error: failed to find dependencies for {binary}')\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "nsis/include/EnvVarUpdate.nsh",
    "content": "/**\n *  EnvVarUpdate.nsh\n *    : Environmental Variables: append, prepend, and remove entries\n *\n *     WARNING: If you use StrFunc.nsh header then include it before this file\n *              with all required definitions. This is to avoid conflicts\n *\n *  Usage:\n *    ${EnvVarUpdate} \"ResultVar\" \"EnvVarName\" \"Action\" \"RegLoc\" \"PathString\"\n *\n *  Credits:\n *  Version 1.0\n *  * Cal Turney (turnec2)\n *  * Amir Szekely (KiCHiK) and e-circ for developing the forerunners of this\n *    function: AddToPath, un.RemoveFromPath, AddToEnvVar, un.RemoveFromEnvVar,\n *    WriteEnvStr, and un.DeleteEnvStr\n *  * Diego Pedroso (deguix) for StrTok\n *  * Kevin English (kenglish_hi) for StrContains\n *  * Hendri Adriaens (Smile2Me), Diego Pedroso (deguix), and Dan Fuhry\n *    (dandaman32) for StrReplace\n *\n *  Version 1.1 (compatibility with StrFunc.nsh)\n *  * techtonik\n *\n *  http://nsis.sourceforge.net/Environmental_Variables:_append%2C_prepend%2C_and_remove_entries\n *\n */\n\n\n!ifndef ENVVARUPDATE_FUNCTION\n!define ENVVARUPDATE_FUNCTION\n!verbose push\n!verbose 3\n!include \"LogicLib.nsh\"\n!include \"WinMessages.NSH\"\n!include \"StrFunc.nsh\"\n\n; ---- Fix for conflict if StrFunc.nsh is already includes in main file -----------------------\n!macro _IncludeStrFunction StrFuncName\n  !ifndef ${StrFuncName}_INCLUDED\n    ${${StrFuncName}}\n  !endif\n  !ifndef Un${StrFuncName}_INCLUDED\n    ${Un${StrFuncName}}\n  !endif\n  !define un.${StrFuncName} \"${Un${StrFuncName}}\"\n!macroend\n\n!insertmacro _IncludeStrFunction StrTok\n!insertmacro _IncludeStrFunction StrStr\n!insertmacro _IncludeStrFunction StrRep\n\n; ---------------------------------- Macro Definitions ----------------------------------------\n!macro _EnvVarUpdateConstructor ResultVar EnvVarName Action Regloc PathString\n  Push \"${EnvVarName}\"\n  Push \"${Action}\"\n  Push \"${RegLoc}\"\n  Push \"${PathString}\"\n    Call EnvVarUpdate\n  Pop 
\"${ResultVar}\"\n!macroend\n!define EnvVarUpdate '!insertmacro \"_EnvVarUpdateConstructor\"'\n\n!macro _unEnvVarUpdateConstructor ResultVar EnvVarName Action Regloc PathString\n  Push \"${EnvVarName}\"\n  Push \"${Action}\"\n  Push \"${RegLoc}\"\n  Push \"${PathString}\"\n    Call un.EnvVarUpdate\n  Pop \"${ResultVar}\"\n!macroend\n!define un.EnvVarUpdate '!insertmacro \"_unEnvVarUpdateConstructor\"'\n; ---------------------------------- Macro Definitions end-------------------------------------\n\n;----------------------------------- EnvVarUpdate start----------------------------------------\n!define hklm_all_users     'HKLM \"SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Environment\"'\n!define hkcu_current_user  'HKCU \"Environment\"'\n\n!macro EnvVarUpdate UN\n\nFunction ${UN}EnvVarUpdate\n\n  Push $0\n  Exch 4\n  Exch $1\n  Exch 3\n  Exch $2\n  Exch 2\n  Exch $3\n  Exch\n  Exch $4\n  Push $5\n  Push $6\n  Push $7\n  Push $8\n  Push $9\n  Push $R0\n\n  /* After this point:\n  -------------------------\n     $0 = ResultVar     (returned)\n     $1 = EnvVarName    (input)\n     $2 = Action        (input)\n     $3 = RegLoc        (input)\n     $4 = PathString    (input)\n     $5 = Orig EnvVar   (read from registry)\n     $6 = Len of $0     (temp)\n     $7 = tempstr1      (temp)\n     $8 = Entry counter (temp)\n     $9 = tempstr2      (temp)\n     $R0 = tempChar     (temp)  */\n\n  ; Step 1:  Read contents of EnvVarName from RegLoc\n  ;\n  ; Check for empty EnvVarName\n  ${If} $1 == \"\"\n    SetErrors\n    DetailPrint \"ERROR: EnvVarName is blank\"\n    Goto EnvVarUpdate_Restore_Vars\n  ${EndIf}\n\n  ; Check for valid Action\n  ${If}    $2 != \"A\"\n  ${AndIf} $2 != \"P\"\n  ${AndIf} $2 != \"R\"\n    SetErrors\n    DetailPrint \"ERROR: Invalid Action - must be A, P, or R\"\n    Goto EnvVarUpdate_Restore_Vars\n  ${EndIf}\n\n  ${If} $3 == HKLM\n    ReadRegStr $5 ${hklm_all_users} $1     ; Get EnvVarName from all users into $5\n  ${ElseIf} $3 == HKCU\n    
ReadRegStr $5 ${hkcu_current_user} $1  ; Read EnvVarName from current user into $5\n  ${Else}\n    SetErrors\n    DetailPrint 'ERROR: Action is [$3] but must be \"HKLM\" or HKCU\"'\n    Goto EnvVarUpdate_Restore_Vars\n  ${EndIf}\n\n  ; Check for empty PathString\n  ${If} $4 == \"\"\n    SetErrors\n    DetailPrint \"ERROR: PathString is blank\"\n    Goto EnvVarUpdate_Restore_Vars\n  ${EndIf}\n\n  ; Make sure we've got some work to do\n  ${If} $5 == \"\"\n  ${AndIf} $2 == \"R\"\n    SetErrors\n    DetailPrint \"$1 is empty - Nothing to remove\"\n    Goto EnvVarUpdate_Restore_Vars\n  ${EndIf}\n\n  ; Step 2: Scrub EnvVar\n  ;\n  StrCpy $0 $5                             ; Copy the contents to $0\n  ; Remove spaces around semicolons (NOTE: spaces before the 1st entry or\n  ; after the last one are not removed here but instead in Step 3)\n  ${If} $0 != \"\"                           ; If EnvVar is not empty ...\n    ${Do}\n      ${${UN}StrStr} $7 $0 \" ;\"\n      ${If} $7 == \"\"\n        ${ExitDo}\n      ${EndIf}\n      ${${UN}StrRep} $0  $0 \" ;\" \";\"         ; Remove '<space>;'\n    ${Loop}\n    ${Do}\n      ${${UN}StrStr} $7 $0 \"; \"\n      ${If} $7 == \"\"\n        ${ExitDo}\n      ${EndIf}\n      ${${UN}StrRep} $0  $0 \"; \" \";\"         ; Remove ';<space>'\n    ${Loop}\n    ${Do}\n      ${${UN}StrStr} $7 $0 \";;\"\n      ${If} $7 == \"\"\n        ${ExitDo}\n      ${EndIf}\n      ${${UN}StrRep} $0  $0 \";;\" \";\"\n    ${Loop}\n\n    ; Remove a leading or trailing semicolon from EnvVar\n    StrCpy  $7  $0 1 0\n    ${If} $7 == \";\"\n      StrCpy $0  $0 \"\" 1                   ; Change ';<EnvVar>' to '<EnvVar>'\n    ${EndIf}\n    StrLen $6 $0\n    IntOp $6 $6 - 1\n    StrCpy $7  $0 1 $6\n    ${If} $7 == \";\"\n     StrCpy $0  $0 $6                      ; Change ';<EnvVar>' to '<EnvVar>'\n    ${EndIf}\n    ; DetailPrint \"Scrubbed $1: [$0]\"      ; Uncomment to debug\n  ${EndIf}\n\n  /* Step 3. 
Remove all instances of the target path/string (even if \"A\" or \"P\")\n     $6 = bool flag (1 = found and removed PathString)\n     $7 = a string (e.g. path) delimited by semicolon(s)\n     $8 = entry counter starting at 0\n     $9 = copy of $0\n     $R0 = tempChar      */\n\n  ${If} $5 != \"\"                           ; If EnvVar is not empty ...\n    StrCpy $9 $0\n    StrCpy $0 \"\"\n    StrCpy $8 0\n    StrCpy $6 0\n\n    ${Do}\n      ${${UN}StrTok} $7 $9 \";\" $8 \"0\"      ; $7 = next entry, $8 = entry counter\n\n      ${If} $7 == \"\"                       ; If we've run out of entries,\n        ${ExitDo}                          ;    were done\n      ${EndIf}                             ;\n\n      ; Remove leading and trailing spaces from this entry (critical step for Action=Remove)\n      ${Do}\n        StrCpy $R0  $7 1\n        ${If} $R0 != \" \"\n          ${ExitDo}\n        ${EndIf}\n        StrCpy $7   $7 \"\" 1                ;  Remove leading space\n      ${Loop}\n      ${Do}\n        StrCpy $R0  $7 1 -1\n        ${If} $R0 != \" \"\n          ${ExitDo}\n        ${EndIf}\n        StrCpy $7   $7 -1                  ;  Remove trailing space\n      ${Loop}\n      ${If} $7 == $4                       ; If string matches, remove it by not appending it\n        StrCpy $6 1                        ; Set 'found' flag\n      ${ElseIf} $7 != $4                   ; If string does NOT match\n      ${AndIf}  $0 == \"\"                   ;    and the 1st string being added to $0,\n        StrCpy $0 $7                       ;    copy it to $0 without a prepended semicolon\n      ${ElseIf} $7 != $4                   ; If string does NOT match\n      ${AndIf}  $0 != \"\"                   ;    and this is NOT the 1st string to be added to $0,\n        StrCpy $0 $0;$7                    ;    append path to $0 with a prepended semicolon\n      ${EndIf}                             ;\n\n      IntOp $8 $8 + 1                      ; Bump counter\n    ${Loop}                
                ; Check for duplicates until we run out of paths\n  ${EndIf}\n\n  ; Step 4:  Perform the requested Action\n  ;\n  ${If} $2 != \"R\"                          ; If Append or Prepend\n    ${If} $6 == 1                          ; And if we found the target\n      DetailPrint \"Target is already present in $1. It will be removed and\"\n    ${EndIf}\n    ${If} $0 == \"\"                         ; If EnvVar is (now) empty\n      StrCpy $0 $4                         ;   just copy PathString to EnvVar\n      ${If} $6 == 0                        ; If found flag is either 0\n      ${OrIf} $6 == \"\"                     ; or blank (if EnvVarName is empty)\n        DetailPrint \"$1 was empty and has been updated with the target\"\n      ${EndIf}\n    ${ElseIf} $2 == \"A\"                    ;  If Append (and EnvVar is not empty),\n      StrCpy $0 $0;$4                      ;     append PathString\n      ${If} $6 == 1\n        DetailPrint \"appended to $1\"\n      ${Else}\n        DetailPrint \"Target was appended to $1\"\n      ${EndIf}\n    ${Else}                                ;  If Prepend (and EnvVar is not empty),\n      StrCpy $0 $4;$0                      ;     prepend PathString\n      ${If} $6 == 1\n        DetailPrint \"prepended to $1\"\n      ${Else}\n        DetailPrint \"Target was prepended to $1\"\n      ${EndIf}\n    ${EndIf}\n  ${Else}                                  ; If Action = Remove\n    ${If} $6 == 1                          ;   and we found the target\n      DetailPrint \"Target was found and removed from $1\"\n    ${Else}\n      DetailPrint \"Target was NOT found in $1 (nothing to remove)\"\n    ${EndIf}\n    ${If} $0 == \"\"\n      DetailPrint \"$1 is now empty\"\n    ${EndIf}\n  ${EndIf}\n\n  ; Step 5:  Update the registry at RegLoc with the updated EnvVar and announce the change\n  ;\n  ClearErrors\n  ${If} $3  == HKLM\n    WriteRegExpandStr ${hklm_all_users} $1 $0     ; Write it in all users section\n  ${ElseIf} $3 == HKCU\n    
WriteRegExpandStr ${hkcu_current_user} $1 $0  ; Write it to current user section\n  ${EndIf}\n\n  IfErrors 0 +4\n    MessageBox MB_OK|MB_ICONEXCLAMATION \"Could not write updated $1 to $3\"\n    DetailPrint \"Could not write updated $1 to $3\"\n    Goto EnvVarUpdate_Restore_Vars\n\n  ; \"Export\" our change\n  SendMessage ${HWND_BROADCAST} ${WM_WININICHANGE} 0 \"STR:Environment\" /TIMEOUT=5000\n\n  EnvVarUpdate_Restore_Vars:\n  ;\n  ; Restore the user's variables and return ResultVar\n  Pop $R0\n  Pop $9\n  Pop $8\n  Pop $7\n  Pop $6\n  Pop $5\n  Pop $4\n  Pop $3\n  Pop $2\n  Pop $1\n  Push $0  ; Push my $0 (ResultVar)\n  Exch\n  Pop $0   ; Restore his $0\n\nFunctionEnd\n\n!macroend   ; EnvVarUpdate UN\n!insertmacro EnvVarUpdate \"\"\n!insertmacro EnvVarUpdate \"un.\"\n;----------------------------------- EnvVarUpdate end----------------------------------------\n\n!verbose pop\n!endif\n"
  },
  {
    "path": "nsis/tesseract.nsi",
    "content": "; (C) Copyright 2010, Sergey Bronnikov\n; (C) Copyright 2010-2012, Zdenko Podobný\n; (C) Copyright 2015-2024 Stefan Weil\n;\n; Licensed under the Apache License, Version 2.0 (the \"License\");\n; you may not use this file except in compliance with the License.\n; You may obtain a copy of the License at\n; http://www.apache.org/licenses/LICENSE-2.0\n; Unless required by applicable law or agreed to in writing, software\n; distributed under the License is distributed on an \"AS IS\" BASIS,\n; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n; See the License for the specific language governing permissions and\n; limitations under the License.\n\n; Links to NSIS documentation:\n; https://nsis.sourceforge.io/Docs/Modern%20UI%202/Readme.html\n\n; TODO:\n; * Add Tesseract icon and images for installer.\n\nSetCompressor /FINAL /SOLID lzma\nSetCompressorDictSize 32\n\nUnicode true\n\n; Settings which normally should be passed as command line arguments.\n;define CROSSBUILD\n;define SHARED\n;define W64\n!ifndef COMMENTS\n!define COMMENTS \"GitHub CI build\"\n!endif\n!ifndef COMPANYNAME\n!define COMPANYNAME \"Open Source Community\"\n!endif\n!ifndef SRCDIR\n!define SRCDIR .\n!endif\n!ifndef VERSION\n!define VERSION undefined\n!endif\n\n!define PRODUCT_NAME \"Tesseract-OCR\"\n!define PRODUCT_VERSION \"${VERSION}\"\n!define PRODUCT_PUBLISHER \"Tesseract-OCR community\"\n!ifndef PRODUCT_WEB_SITE\n!define PRODUCT_WEB_SITE \"https://github.com/tesseract-ocr/tesseract\"\n!endif\n!define GITHUB_RAW_FILE_URL \\\n  \"https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main\"\n\n!ifdef CROSSBUILD\n!addincludedir ${SRCDIR}\\nsis\\include\n!addplugindir Plugins/x86-unicode\n!endif\n\n!ifdef W64\n!define ARCH \"x86_64\"\n!define SETUP \"tesseract-ocr-w64-setup\"\n!else\n!define ARCH \"i686\"\n!define SETUP \"tesseract-ocr-w32-setup\"\n!endif\n\n# Name of program and file\n!define OUTFILE \"${SETUP}-${VERSION}.exe\"\nOutFile 
${OUTFILE}\n\n!ifdef SIGNCODE\n!finalize \"${SIGNCODE} %1\"\n!uninstfinalize \"${SIGNCODE} %1\"\n!endif\n\n!ifndef PREFIX\n!define PREFIX \"../mingw64\"\n!endif\n!define BINDIR \"${PREFIX}/bin\"\n\n# General Definitions\nName \"${PRODUCT_NAME}\"\nCaption \"${PRODUCT_NAME} ${VERSION}\"\n!ifndef CROSSBUILD\nBrandingText /TRIMCENTER \"(c) 2010-2019 ${PRODUCT_NAME}\"\n!endif\n\n; File properties.\n!define /date DATEVERSION \"%Y%m%d%H%M%S\"\nVIProductVersion \"${VERSION}\"\nVIAddVersionKey \"ProductName\" \"${PRODUCT_NAME}\"\nVIAddVersionKey \"Comments\" \"${COMMENTS}\"\nVIAddVersionKey \"CompanyName\" \"${COMPANYNAME}\"\nVIAddVersionKey \"FileDescription\" \"Tesseract OCR\"\n!define /date DATETIME \"%Y-%m-%d-%H-%M-%S\"\nVIAddVersionKey \"FileVersion\" \"${DATETIME}\"\nVIAddVersionKey \"InternalName\" \"Tesseract\"\nVIAddVersionKey \"LegalCopyright\" \"Apache-2.0\"\n#VIAddVersionKey \"LegalTrademarks\" \"\"\nVIAddVersionKey \"OriginalFilename\" \"${OUTFILE}\"\nVIAddVersionKey \"ProductVersion\" \"${VERSION}\"\n\n!define REGKEY \"SOFTWARE\\${PRODUCT_NAME}\"\n; HKLM (all users) vs HKCU (current user) defines\n!define env_hklm 'HKLM \"SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Environment\"'\n!define env_hkcu 'HKCU \"Environment\"'\n\n# MultiUser Symbol Definitions\n# https://nsis.sourceforge.io/Docs/MultiUser/Readme.html\n!define MULTIUSER_EXECUTIONLEVEL Highest\n!define MULTIUSER_MUI\n!define MULTIUSER_INSTALLMODE_DEFAULT_REGISTRY_KEY \"${REGKEY}\"\n!define MULTIUSER_INSTALLMODE_DEFAULT_REGISTRY_VALUENAME MultiUserInstallMode\n!define MULTIUSER_INSTALLMODE_COMMANDLINE\n!define MULTIUSER_INSTALLMODE_INSTDIR ${PRODUCT_NAME}\n!define MULTIUSER_INSTALLMODE_INSTDIR_REGISTRY_KEY \"${REGKEY}\"\n!define MULTIUSER_INSTALLMODE_INSTDIR_REGISTRY_VALUE \"Path\"\n!ifdef W64\n!define MULTIUSER_USE_PROGRAMFILES64\n!endif\n\n# MUI Symbol Definitions\n!define MUI_ABORTWARNING\n!define MUI_COMPONENTSPAGE_SMALLDESC\n!define MUI_HEADERIMAGE\n!define 
MUI_HEADERIMAGE_BITMAP_NOSTRETCH\n!define MUI_ICON \"${NSISDIR}\\Contrib\\Graphics\\Icons\\modern-install-blue-full.ico\"\n!define MUI_FINISHPAGE_LINK \"View Tesseract on GitHub\"\n!define MUI_FINISHPAGE_LINK_LOCATION \"https://github.com/tesseract-ocr/tesseract\"\n!define MUI_FINISHPAGE_NOAUTOCLOSE\n!ifdef SHOW_README\n; Showing the README does not work.\n!define MUI_FINISHPAGE_SHOWREADME \"$INSTDIR\\doc\\README.md\"\n!define MUI_FINISHPAGE_SHOWREADME_FUNCTION ShowReadme\n!define MUI_FINISHPAGE_SHOWREADME_TEXT \"Show README\"\n!endif\n!define MUI_STARTMENUPAGE_REGISTRY_ROOT HKLM\n!define MUI_STARTMENUPAGE_REGISTRY_KEY ${REGKEY}\n!define MUI_STARTMENUPAGE_REGISTRY_VALUENAME StartMenuGroup\n!define MUI_STARTMENUPAGE_DEFAULTFOLDER ${PRODUCT_NAME}\n!define MUI_UNICON \"${NSISDIR}\\Contrib\\Graphics\\Icons\\orange-uninstall.ico\"\n!define MUI_UNFINISHPAGE_NOAUTOCLOSE\n!define MUI_WELCOMEPAGE_TITLE_3LINES\n\n# Included files\n!include MultiUser.nsh\n!include Sections.nsh\n!include MUI2.nsh\n!include LogicLib.nsh\n!include winmessages.nsh # include for some of the windows messages defines\n\n# Variables\nVar StartMenuGroup\n; Define user variables\nVar OLD_KEY\n\n# Installer pages\n!insertmacro MUI_PAGE_WELCOME\n!insertmacro MUI_PAGE_LICENSE \"${SRCDIR}\\LICENSE\"\n!insertmacro MULTIUSER_PAGE_INSTALLMODE\n  Page custom PageReinstall PageLeaveReinstall\n!insertmacro MUI_PAGE_COMPONENTS\n!insertmacro MUI_PAGE_DIRECTORY\n!insertmacro MUI_PAGE_STARTMENU Application $StartMenuGroup\n!insertmacro MUI_PAGE_INSTFILES\n!insertmacro MUI_PAGE_FINISH\n!insertmacro MUI_UNPAGE_CONFIRM\n!insertmacro MUI_UNPAGE_INSTFILES\n\n# Languages\n!insertmacro MUI_LANGUAGE \"English\"\n!insertmacro MUI_LANGUAGE \"French\"\n!insertmacro MUI_LANGUAGE \"German\"\n!insertmacro MUI_LANGUAGE \"Italian\"\n!insertmacro MUI_LANGUAGE \"Portuguese\"\n!insertmacro MUI_LANGUAGE \"Russian\"\n!insertmacro MUI_LANGUAGE \"Slovak\"\n!insertmacro MUI_LANGUAGE \"Spanish\"\n!insertmacro MUI_LANGUAGE 
\"SpanishInternational\"\n\n# Installer attributes\nShowInstDetails hide\nInstProgressFlags smooth colored\nXPStyle on\nSpaceTexts\nCRCCheck on\nInstProgressFlags smooth colored\nCRCCheck On  # Do a CRC check before installing\n\n!macro Download_Lang_Data Lang\n  ; Download traineddata file.\n  DetailPrint \"Download: ${Lang} language file\"\n  inetc::get /caption \"Downloading ${Lang} language file\" \\\n      \"${GITHUB_RAW_FILE_URL}/${Lang}.traineddata\" $INSTDIR/tessdata/${Lang}.traineddata \\\n      /END\n    Pop $0 # return value = exit code, \"OK\" if OK\n    StrCmp $0 \"OK\" +2\n    MessageBox MB_OK|MB_ICONEXCLAMATION \\\n      \"Download error. Status of ${Lang}: $0. Click OK to continue.\" /SD IDOK\n!macroend\n\nSection -Main SEC0000\n  ; mark as read only component\n  SectionIn RO\n  SetOutPath \"$INSTDIR\"\n  # files included in distribution\n  File ${BINDIR}/tesseract.exe\n  File ${BINDIR}/libtesseract-*.dll\n!ifdef CROSSBUILD\n  File ../dll/*.dll\n!endif\n  File winpath.exe\n  File ../doc/*.html\n  CreateDirectory \"$INSTDIR\\tessdata\"\n  SetOutPath \"$INSTDIR\\tessdata\"\n  File ${PREFIX}/share/tessdata/pdf.ttf\n  CreateDirectory \"$INSTDIR\\tessdata\\configs\"\n  SetOutPath \"$INSTDIR\\tessdata\\configs\"\n  File ${PREFIX}/share/tessdata/configs/*\n  CreateDirectory \"$INSTDIR\\tessdata\\script\"\n  CreateDirectory \"$INSTDIR\\tessdata\\tessconfigs\"\n  SetOutPath \"$INSTDIR\\tessdata\\tessconfigs\"\n  File ${PREFIX}/share/tessdata/tessconfigs/*\n  CreateDirectory \"$INSTDIR\\doc\"\n  SetOutPath \"$INSTDIR\\doc\"\n  File ${SRCDIR}\\AUTHORS\n  File ${SRCDIR}\\LICENSE\n  File ${SRCDIR}\\README.md\n##  File ${SRCDIR}\\ReleaseNotes\nSectionEnd\n\nSection \"ScrollView\" SecScrollView\n  SectionIn 1\n  SetOutPath \"$INSTDIR\\tessdata\"\n  File ${PREFIX}/share/tessdata/*.jar\nSectionEnd\n\nSection \"Training Tools\" SecTr\n  SectionIn 1\n  SetOutPath \"$INSTDIR\"\n  File /x tesseract.exe ${BINDIR}/*.exe\nSectionEnd\n\n!define UNINST_EXE 
\"$INSTDIR\\tesseract-uninstall.exe\"\n!define UNINST_KEY \"Software\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\${PRODUCT_NAME}\"\n\nSection -post SEC0001\n!ifdef W64\n  SetRegView 64\n!endif\n  ;Store installation folder - we always use HKLM!\n  WriteRegStr HKLM \"${REGKEY}\" \"Path\" \"$INSTDIR\"\n  WriteRegStr HKLM \"${REGKEY}\" \"Mode\" $MultiUser.InstallMode\n  WriteRegStr HKLM \"${REGKEY}\" \"InstallDir\" \"$INSTDIR\"\n  WriteRegStr HKLM \"${REGKEY}\" \"CurrentVersion\" \"${VERSION}\"\n  WriteRegStr HKLM \"${REGKEY}\" \"Uninstaller\" \"${UNINST_EXE}\"\n  ;WriteRegStr HKLM \"Software\\Microsoft\\Windows\\CurrentVersion\\App Paths\\tesseract.exe\" \"$INSTDIR\\tesseract.exe\"\n  ;WriteRegStr HKLM \"Software\\Microsoft\\Windows\\CurrentVersion\\Run\" \"Tesseract-OCR\" \"$INSTDIR\\tesseract.exe\"\n  ; Register to Add/Remove program in control panel\n  WriteRegStr HKLM \"${UNINST_KEY}\" \"DisplayName\" \"${PRODUCT_NAME} - open source OCR engine\"\n  WriteRegStr HKLM \"${UNINST_KEY}\" \"DisplayVersion\" \"${VERSION}\"\n  WriteRegStr HKLM \"${UNINST_KEY}\" \"Publisher\" \"${PRODUCT_PUBLISHER}\"\n  WriteRegStr HKLM \"${UNINST_KEY}\" \"URLInfoAbout\" \"${PRODUCT_WEB_SITE}\"\n  WriteRegStr HKLM \"${UNINST_KEY}\" \"DisplayIcon\" \"${UNINST_EXE}\"\n  WriteRegStr HKLM \"${UNINST_KEY}\" \"UninstallString\" \"${UNINST_EXE}\"\n  WriteRegStr HKLM \"${UNINST_KEY}\" \"QuietUninstallString\" '\"${UNINST_EXE}\" /S'\n  WriteRegDWORD HKLM \"${UNINST_KEY}\" \"NoModify\" 1\n  WriteRegDWORD HKLM \"${UNINST_KEY}\" \"NoRepair\" 1\n  ;Create uninstaller\n  WriteUninstaller \"${UNINST_EXE}\"\n  ;ExecShell \"open\" \"https://github.com/tesseract-ocr/tesseract\"\n  ;ExecShell \"open\" '\"$INSTDIR\"'\n  ;BringToFront\nSectionEnd\n\nSection \"Shortcuts creation\" SecCS\n  SetOutPath $INSTDIR\n  CreateDirectory \"$SMPROGRAMS\\${PRODUCT_NAME}\"\n  CreateShortCut \"$SMPROGRAMS\\${PRODUCT_NAME}\\Console.lnk\" \"$INSTDIR\\winpath.exe\" \"cmd\"\n  CreateShortCut 
\"$SMPROGRAMS\\${PRODUCT_NAME}\\Dokumentation.lnk\" \"$INSTDIR\\tesseract.1.html\"\n  CreateShortCut \"$SMPROGRAMS\\${PRODUCT_NAME}\\Homepage.lnk\" \"${PRODUCT_WEB_SITE}\"\n  CreateShortCut \"$SMPROGRAMS\\${PRODUCT_NAME}\\ReadMe.lnk\" \"${PRODUCT_WEB_SITE}/wiki/ReadMe\"\n  CreateShortCut \"$SMPROGRAMS\\${PRODUCT_NAME}\\FAQ.lnk\" \"${PRODUCT_WEB_SITE}/wiki/FAQ\"\n  CreateShortCut \"$SMPROGRAMS\\${PRODUCT_NAME}\\Uninstall.lnk\" \"${UNINST_EXE}\" \"\" \"${UNINST_EXE}\" 0\n  ;CreateShortCut \"$DESKTOP\\Tesseract-OCR.lnk\" \"$INSTDIR\\tesseract.exe\" \"\" \"$INSTDIR\\tesseract.exe\" 0\n  ;CreateShortCut \"$QUICKLAUNCH\\.lnk\" \"$INSTDIR\\tesseract.exe\" \"\" \"$INSTDIR\\tesseract.exe\" 0\nSectionEnd\n\n; Language files\nSectionGroup \"Language data\" SecGrp_LD\n    Section \"English\" SecLang_eng\n    SectionIn RO\n      !insertmacro Download_Lang_Data eng\n    SectionEnd\n\n    Section \"Orientation and script detection\" SecLang_osd\n    SectionIn 1\n      !insertmacro Download_Lang_Data osd\n    SectionEnd\nSectionGroupEnd\n\n; Download script files\nSectionGroup \"Additional script data (download)\" SecGrp_ASD\n  Section /o \"Arabic script\" SecLang_Arabic\n    AddSize 8880\n    !insertmacro Download_Lang_Data script/Arabic\n  SectionEnd\n\n  Section /o \"Armenian script\" SecLang_Armenian\n    AddSize 7510\n    !insertmacro Download_Lang_Data script/Armenian\n  SectionEnd\n\n  Section /o \"Bengali script\" SecLang_Bengali\n    AddSize 5450\n    !insertmacro Download_Lang_Data script/Bengali\n  SectionEnd\n\n  Section /o \"Canadian Aboriginal script\" SecLang_Canadian_Aboriginal\n    AddSize 6850\n    !insertmacro Download_Lang_Data script/Canadian_Aboriginal\n  SectionEnd\n\n  Section /o \"Cherokee script\" SecLang_Cherokee\n    AddSize 4040\n    !insertmacro Download_Lang_Data script/Cherokee\n  SectionEnd\n\n  Section /o \"Cyrillic script\" SecLang_Cyrillic\n    AddSize 27900\n    !insertmacro Download_Lang_Data script/Cyrillic\n  SectionEnd\n\n  Section /o 
\"Devanagari script\" SecLang_Devanagari\n    AddSize 17100\n    !insertmacro Download_Lang_Data script/Devanagari\n  SectionEnd\n\n  Section /o \"Ethiopic script\" SecLang_Ethiopic\n    AddSize 8650\n    !insertmacro Download_Lang_Data script/Ethiopic\n  SectionEnd\n\n  Section /o \"Fraktur script\" SecLang_Fraktur\n    AddSize 10400\n    !insertmacro Download_Lang_Data script/Fraktur\n  SectionEnd\n\n  Section /o \"Georgian script\" SecLang_Georgian\n    AddSize 6630\n    !insertmacro Download_Lang_Data script/Georgian\n  SectionEnd\n\n  Section /o \"Greek script\" SecLang_Greek\n    AddSize 2900\n    !insertmacro Download_Lang_Data script/Greek\n  SectionEnd\n\n  Section /o \"Gujarati script\" SecLang_Gujarati\n    AddSize 4780\n    !insertmacro Download_Lang_Data script/Gujarati\n  SectionEnd\n\n  Section /o \"Gurmukhi script\" SecLang_Gurmukhi\n    AddSize 4020\n    !insertmacro Download_Lang_Data script/Gurmukhi\n  SectionEnd\n\n  Section /o \"Han Simplified script\" SecLang_HanS\n    AddSize 5700\n    !insertmacro Download_Lang_Data script/HanS\n  SectionEnd\n\n  Section /o \"Han Simplified vertical script\" SecLang_HanS_vert\n    AddSize 5304\n    !insertmacro Download_Lang_Data script/HanS_vert\n  SectionEnd\n\n  Section /o \"Han Traditional script\" SecLang_HanT\n    AddSize 5200\n    !insertmacro Download_Lang_Data script/HanT\n  SectionEnd\n\n  Section /o \"Han Traditional vertical script\" SecLang_HanT_vert\n    AddSize 5200\n    !insertmacro Download_Lang_Data script/HanT_vert\n  SectionEnd\n\n  Section /o \"Hangul script\" SecLang_Hangul\n    AddSize 4620\n    !insertmacro Download_Lang_Data script/Hangul\n  SectionEnd\n\n  Section /o \"Hangul vertical script\" SecLang_Hangul_vert\n    AddSize 4510\n    !insertmacro Download_Lang_Data script/Hangul_vert\n  SectionEnd\n\n  Section /o \"Hebrew script\" SecLang_Hebrew\n    AddSize 4640\n    !insertmacro Download_Lang_Data script/Hebrew\n  SectionEnd\n\n  Section /o \"Japanese script\" SecLang_Japanese\n 
   AddSize 5610\n    !insertmacro Download_Lang_Data script/Japanese\n  SectionEnd\n\n  Section /o \"Japanese vertical script\" SecLang_Japanese_vert\n    AddSize 6150\n    !insertmacro Download_Lang_Data script/Japanese_vert\n  SectionEnd\n\n  Section /o \"Kannada script\" SecLang_Kannada\n    AddSize 6460\n    !insertmacro Download_Lang_Data script/Kannada\n  SectionEnd\n\n  Section /o \"Khmer script\" SecLang_Khmer\n    AddSize 4270\n    !insertmacro Download_Lang_Data script/Khmer\n  SectionEnd\n\n  Section /o \"Lao script\" SecLang_Script_Lao\n    AddSize 9640\n    !insertmacro Download_Lang_Data script/Lao\n  SectionEnd\n\n  Section /o \"Latin script\" SecLang_Latin\n    AddSize 85200\n    !insertmacro Download_Lang_Data script/Latin\n  SectionEnd\n\n  Section /o \"Malayalam script\" SecLang_Malayalam\n    AddSize 8590\n    !insertmacro Download_Lang_Data script/Malayalam\n  SectionEnd\n\n  Section /o \"Myanmar script\" SecLang_Myanmar\n    AddSize 7480\n    !insertmacro Download_Lang_Data script/Myanmar\n  SectionEnd\n\n  Section /o \"Oriya script\" SecLang_Oriya\n    AddSize 5480\n    !insertmacro Download_Lang_Data script/Oriya\n  SectionEnd\n\n  Section /o \"Sinhala script\" SecLang_Sinhala\n    AddSize 4560\n    !insertmacro Download_Lang_Data script/Sinhala\n  SectionEnd\n\n  Section /o \"Syriac script\" SecLang_Syriac\n    AddSize 5530\n    !insertmacro Download_Lang_Data script/Syriac\n  SectionEnd\n\n  Section /o \"Tamil script\" SecLang_Tamil\n    AddSize 6760\n    !insertmacro Download_Lang_Data script/Tamil\n  SectionEnd\n\n  Section /o \"Telugu script\" SecLang_Telugu\n    AddSize 6180\n    !insertmacro Download_Lang_Data script/Telugu\n  SectionEnd\n\n  Section /o \"Thaana script\" SecLang_Thaana\n    AddSize 5770\n    !insertmacro Download_Lang_Data script/Thaana\n  SectionEnd\n\n  Section /o \"Thai script\" SecLang_Thai\n    AddSize 4050\n    !insertmacro Download_Lang_Data script/Thai\n  SectionEnd\n\n  Section /o \"Tibetan script\" 
SecLang_Tibetan\n    AddSize 5440\n    !insertmacro Download_Lang_Data script/Tibetan\n  SectionEnd\n\n  Section /o \"Vietnamese script\" SecLang_Vietnamese\n    AddSize 1590\n    !insertmacro Download_Lang_Data script/Vietnamese\n  SectionEnd\n\nSectionGroupEnd\n\n; Download language files\nSectionGroup \"Additional language data (download)\" SecGrp_ALD\n  Section /o \"Math / equation detection module\" SecLang_equ\n    AddSize 2200\n    !insertmacro Download_Lang_Data equ\n  SectionEnd\n\n  ; The language names are documented here:\n  ; https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc#languages\n\n  Section /o \"Afrikaans\" SecLang_afr\n    AddSize 2530\n    !insertmacro Download_Lang_Data afr\n  SectionEnd\n\n  Section /o \"Amharic\" SecLang_amh\n    AddSize 5220\n    !insertmacro Download_Lang_Data amh\n  SectionEnd\n\n  Section /o \"Arabic\" SecLang_ara\n    AddSize 1370\n    !insertmacro Download_Lang_Data ara\n  SectionEnd\n\n  Section /o \"Assamese\" SecLang_asm\n    AddSize 1950\n    !insertmacro Download_Lang_Data asm\n  SectionEnd\n\n  Section /o \"Azerbaijani\" SecLang_aze\n    AddSize 3360\n    !insertmacro Download_Lang_Data aze\n  SectionEnd\n\n  Section /o \"Azerbaijani (Cyrillic)\" SecLang_aze_cyrl\n    AddSize 1850\n    !insertmacro Download_Lang_Data aze_cyrl\n  SectionEnd\n\n  Section /o \"Belarusian\" SecLang_bel\n    AddSize 3520\n    !insertmacro Download_Lang_Data bel\n  SectionEnd\n\n  Section /o \"Bengali\" SecLang_ben\n    AddSize 836\n    !insertmacro Download_Lang_Data ben\n  SectionEnd\n\n  Section /o \"Tibetan\" SecLang_bod\n    AddSize 1880\n    !insertmacro Download_Lang_Data bod\n  SectionEnd\n\n  Section /o \"Bosnian\" SecLang_bos\n    AddSize 2380\n    !insertmacro Download_Lang_Data bos\n  SectionEnd\n\n  Section /o \"Breton\" SecLang_bre\n    AddSize 6188\n    !insertmacro Download_Lang_Data bre\n  SectionEnd\n\n  Section /o \"Bulgarian\" SecLang_bul\n    AddSize 1600\n    !insertmacro Download_Lang_Data 
bul\n  SectionEnd\n\n  Section /o \"Catalan\" SecLang_cat\n    AddSize 1090\n    !insertmacro Download_Lang_Data cat\n  SectionEnd\n\n  Section /o \"Cebuano\" SecLang_ceb\n    AddSize 699\n    !insertmacro Download_Lang_Data ceb\n  SectionEnd\n\n  Section /o \"Czech\" SecLang_ces\n    AddSize 3620\n    !insertmacro Download_Lang_Data ces\n  SectionEnd\n\n  Section /o \"Chinese (Simplified)\" SecLang_chi_sim\n    AddSize 2350\n    !insertmacro Download_Lang_Data chi_sim\n  SectionEnd\n\n  Section /o \"Chinese (Simplified vertical)\" SecLang_chi_sim_vert\n    AddSize 1840\n    !insertmacro Download_Lang_Data chi_sim_vert\n  SectionEnd\n\n  Section /o \"Chinese (Traditional)\" SecLang_chi_tra\n    AddSize 2260\n    !insertmacro Download_Lang_Data chi_tra\n  SectionEnd\n\n  Section /o \"Chinese (Traditional vertical)\" SecLang_chi_tra_vert\n    AddSize 1740\n    !insertmacro Download_Lang_Data chi_tra_vert\n  SectionEnd\n\n  Section /o \"Cherokee\" SecLang_chr\n    AddSize 366\n    !insertmacro Download_Lang_Data chr\n  SectionEnd\n\n  Section /o \"Corsican\" SecLang_cos\n    AddSize 2190\n    !insertmacro Download_Lang_Data cos\n  SectionEnd\n\n  Section /o \"Welsh\" SecLang_cym\n    AddSize 2110\n    !insertmacro Download_Lang_Data cym\n  SectionEnd\n\n  Section /o \"Danish\" SecLang_dan\n    AddSize 2460\n    !insertmacro Download_Lang_Data dan\n  SectionEnd\n\n  Section /o \"German\" SecLang_deu\n    AddSize 1450\n    !insertmacro Download_Lang_Data deu\n  SectionEnd\n\n Section /o \"German Fraktur\" SecLang_deu_latf\n    AddSize 6130\n    !insertmacro Download_Lang_Data deu_latf\n  SectionEnd\n\n  Section /o \"Divehi\" SecLang_div\n    AddSize 1690\n    !insertmacro Download_Lang_Data div\n  SectionEnd\n\n  Section /o \"Dzongkha\" SecLang_dzo\n    AddSize 439\n    !insertmacro Download_Lang_Data dzo\n  SectionEnd\n\n  Section /o \"Greek\" SecLang_ell\n    AddSize 1350\n    !insertmacro Download_Lang_Data ell\n  SectionEnd\n\n  Section /o \"English - Middle 
(1100-1500)\" SecLang_enm\n    AddSize 2960\n    !insertmacro Download_Lang_Data enm\n  SectionEnd\n\n  Section /o \"Esperanto\" SecLang_epo\n    AddSize 4510\n    !insertmacro Download_Lang_Data epo\n  SectionEnd\n\n  Section /o \"Estonian\" SecLang_est\n    AddSize 4250\n    !insertmacro Download_Lang_Data est\n  SectionEnd\n\n  Section /o \"Basque\" SecLang_eus\n    AddSize 4940\n    !insertmacro Download_Lang_Data eus\n  SectionEnd\n\n  Section /o \"Faroese\" SecLang_fao\n    AddSize 3280\n    !insertmacro Download_Lang_Data fao\n  SectionEnd\n\n  Section /o \"Persian\" SecLang_fas\n    AddSize 421\n    !insertmacro Download_Lang_Data fas\n  SectionEnd\n\n  Section /o \"Filipino\" SecLang_fil\n    AddSize 1760\n    !insertmacro Download_Lang_Data fil\n  SectionEnd\n\n Section /o \"Finnish\" SecLang_fin\n    AddSize 7500\n    !insertmacro Download_Lang_Data fin\n  SectionEnd\n\n  Section /o \"French\" SecLang_fra\n    AddSize 1080\n    !insertmacro Download_Lang_Data fra\n  SectionEnd\n\n Section /o \"French - Middle (ca. 
1400-1600)\" SecLang_frm\n    AddSize 1930\n    !insertmacro Download_Lang_Data frm\n  SectionEnd\n\n  Section /o \"Frisian (Western)\" SecLang_fry\n    AddSize 1820\n    !insertmacro Download_Lang_Data fry\n  SectionEnd\n\n  Section /o \"Gaelic (Scots)\" SecLang_gla\n    AddSize 2930\n    !insertmacro Download_Lang_Data gla\n  SectionEnd\n\n  Section /o \"Irish\" SecLang_gle\n    AddSize 1130\n    !insertmacro Download_Lang_Data gle\n  SectionEnd\n\n  Section /o \"Galician\" SecLang_glg\n    AddSize 2440\n    !insertmacro Download_Lang_Data glg\n  SectionEnd\n\n  Section /o \"Greek, Ancient (-1453)\" SecLang_grc\n    AddSize 2140\n    !insertmacro Download_Lang_Data grc\n  SectionEnd\n\n  Section /o \"Gujarati\" SecLang_guj\n    AddSize 1350\n    !insertmacro Download_Lang_Data guj\n  SectionEnd\n\n  Section /o \"Haitian\" SecLang_hat\n    AddSize 1890\n    !insertmacro Download_Lang_Data hat\n  SectionEnd\n\n  Section /o \"Hebrew\" SecLang_heb\n    AddSize 939\n    !insertmacro Download_Lang_Data heb\n  SectionEnd\n\n  Section /o \"Hindi\" SecLang_hin\n    AddSize 1070\n    !insertmacro Download_Lang_Data hin\n  SectionEnd\n\n  Section /o \"Croatian\" SecLang_hrv\n    AddSize 3910\n    !insertmacro Download_Lang_Data hrv\n  SectionEnd\n\n  Section /o \"Hungarian\" SecLang_hun\n    AddSize 5050\n    !insertmacro Download_Lang_Data hun\n  SectionEnd\n\n  Section /o \"Armenian\" SecLang_hye\n    AddSize 3300\n    !insertmacro Download_Lang_Data hye\n  SectionEnd\n\n  Section /o \"Inuktitut\" SecLang_iku\n    AddSize 2670\n    !insertmacro Download_Lang_Data iku\n  SectionEnd\n\n  Section /o \"Indonesian\" SecLang_ind\n    AddSize 1070\n    !insertmacro Download_Lang_Data ind\n  SectionEnd\n\n  Section /o \"Icelandic\" SecLang_isl\n    AddSize 2170\n    !insertmacro Download_Lang_Data isl\n  SectionEnd\n\n  Section /o \"Italian\" SecLang_ita\n    AddSize 2580\n    !insertmacro Download_Lang_Data ita\n  SectionEnd\n\n  Section /o \"Italian (Old)\" SecLang_ita_old\n    
AddSize 3130\n    !insertmacro Download_Lang_Data ita_old\n  SectionEnd\n\n  Section /o \"Javanese\" SecLang_jav\n    AddSize 2840\n    !insertmacro Download_Lang_Data jav\n  SectionEnd\n\n  Section /o \"Japanese\" SecLang_jpn\n    AddSize 2360\n    !insertmacro Download_Lang_Data jpn\n  SectionEnd\n\n  Section /o \"Japanese (vertical)\" SecLang_jpn_vert\n    AddSize 2900\n    !insertmacro Download_Lang_Data jpn_vert\n  SectionEnd\n\n  Section /o \"Kannada\" SecLang_kan\n    AddSize 3440\n    !insertmacro Download_Lang_Data kan\n  SectionEnd\n\n  Section /o \"Georgian\" SecLang_kat\n    AddSize 2410\n    !insertmacro Download_Lang_Data kat\n  SectionEnd\n\n  Section /o \"Georgian (Old)\" SecLang_kat_old\n    AddSize 413\n    !insertmacro Download_Lang_Data kat_old\n  SectionEnd\n\n  Section /o \"Kazakh\" SecLang_kaz\n    AddSize 4520\n    !insertmacro Download_Lang_Data kaz\n  SectionEnd\n\n  Section /o \"Central Khmer\" SecLang_khm\n    AddSize 1380\n    !insertmacro Download_Lang_Data khm\n  SectionEnd\n\n  Section /o \"Kirghiz\" SecLang_kir\n    AddSize 9470\n    !insertmacro Download_Lang_Data kir\n  SectionEnd\n\n  Section /o \"Korean\" SecLang_kor\n    AddSize 1600\n    !insertmacro Download_Lang_Data kor\n  SectionEnd\n\n  Section /o \"Kurdish (Kurmanji)\" SecLang_kmr\n    AddSize 3400\n    !insertmacro Download_Lang_Data kmr\n  SectionEnd\n\n  Section /o \"Lao\" SecLang_lao\n    AddSize 6090\n    !insertmacro Download_Lang_Data lao\n  SectionEnd\n\n  Section /o \"Latin\" SecLang_lat\n    AddSize 3040\n    !insertmacro Download_Lang_Data lat\n  SectionEnd\n\n  Section /o \"Latvian\" SecLang_lav\n    AddSize 2590\n    !insertmacro Download_Lang_Data lav\n  SectionEnd\n\n  Section /o \"Lithuanian\" SecLang_lit\n    AddSize 3010\n    !insertmacro Download_Lang_Data lit\n  SectionEnd\n\n  Section /o \"Luxembourgish\" SecLang_ltz\n    AddSize 2490\n    !insertmacro Download_Lang_Data ltz\n  SectionEnd\n\n  Section /o \"Malayalam\" SecLang_mal\n    AddSize 5030\n  
  !insertmacro Download_Lang_Data mal\n  SectionEnd\n\n  Section /o \"Marathi\" SecLang_mar\n    AddSize 2020\n    !insertmacro Download_Lang_Data mar\n  SectionEnd\n\n  Section /o \"Macedonian\" SecLang_mkd\n    AddSize 1530\n    !insertmacro Download_Lang_Data mkd\n  SectionEnd\n\n  Section /o \"Maltese\" SecLang_mlt\n    AddSize 2200\n    !insertmacro Download_Lang_Data mlt\n  SectionEnd\n\n  Section /o \"Mongolian\" SecLang_mon\n    AddSize 2040\n    !insertmacro Download_Lang_Data mon\n  SectionEnd\n\n  Section /o \"Maori\" SecLang_mri\n    AddSize 843\n    !insertmacro Download_Lang_Data mri\n  SectionEnd\n\n  Section /o \"Malay\" SecLang_msa\n    AddSize 1670\n    !insertmacro Download_Lang_Data msa\n  SectionEnd\n\n  Section /o \"Burmese\" SecLang_mya\n    AddSize 4430\n    !insertmacro Download_Lang_Data mya\n  SectionEnd\n\n  Section /o \"Nepali\" SecLang_nep\n    AddSize 979\n    !insertmacro Download_Lang_Data nep\n  SectionEnd\n\n  Section /o \"Dutch; Flemish\" SecLang_nld\n    AddSize 5770\n    !insertmacro Download_Lang_Data nld\n  SectionEnd\n\n  Section /o \"Norwegian\" SecLang_nor\n    AddSize 3440\n    !insertmacro Download_Lang_Data nor\n  SectionEnd\n\n  Section /o \"Occitan (post 1500)\" SecLang_oci\n    AddSize 6030\n    !insertmacro Download_Lang_Data oci\n  SectionEnd\n\n  Section /o \"Oriya\" SecLang_ori\n    AddSize 1410\n    !insertmacro Download_Lang_Data ori\n  SectionEnd\n\n  Section /o \"Panjabi / Punjabi\" SecLang_pan\n    AddSize 4860\n    !insertmacro Download_Lang_Data pan\n  SectionEnd\n\n  Section /o \"Polish\" SecLang_pol\n    AddSize 4540\n    !insertmacro Download_Lang_Data pol\n  SectionEnd\n\n  Section /o \"Portuguese\" SecLang_por\n    AddSize 1890\n    !insertmacro Download_Lang_Data por\n  SectionEnd\n\n  Section /o \"Pushto / Pashto\" SecLang_pus\n    AddSize 1690\n    !insertmacro Download_Lang_Data pus\n  SectionEnd\n\n  Section /o \"Quechua\" SecLang_que\n    AddSize 4790\n    !insertmacro Download_Lang_Data que\n  
SectionEnd\n\n  Section /o \"Romanian\" SecLang_ron\n    AddSize 2270\n    !insertmacro Download_Lang_Data ron\n  SectionEnd\n\n  Section /o \"Russian\" SecLang_rus\n    AddSize 3680\n    !insertmacro Download_Lang_Data rus\n  SectionEnd\n\n  Section /o \"Sanskrit\" SecLang_san\n    AddSize 1180\n    !insertmacro Download_Lang_Data san\n  SectionEnd\n\n  Section /o \"Sinhala / Sinhalese\" SecLang_sin\n    AddSize 1650\n    !insertmacro Download_Lang_Data sin\n  SectionEnd\n\n  Section /o \"Slovak\" SecLang_slk\n    AddSize 4220\n    !insertmacro Download_Lang_Data slk\n  SectionEnd\n\n  Section /o \"Slovenian\" SecLang_slv\n    AddSize 2860\n    !insertmacro Download_Lang_Data slv\n  SectionEnd\n\n  Section /o \"Sindhi\" SecLang_snd\n    AddSize 1620\n    !insertmacro Download_Lang_Data snd\n  SectionEnd\n\n  Section /o \"Spanish\" SecLang_spa\n    AddSize 2190\n    !insertmacro Download_Lang_Data spa\n  SectionEnd\n\n  Section /o \"Spanish (Old)\" SecLang_spa_old\n    AddSize 2760\n    !insertmacro Download_Lang_Data spa_old\n  SectionEnd\n\n  Section /o \"Albanian\" SecLang_sqi\n    AddSize 1790\n    !insertmacro Download_Lang_Data sqi\n  SectionEnd\n\n  Section /o \"Serbian\" SecLang_srp\n    AddSize 2050\n    !insertmacro Download_Lang_Data srp\n  SectionEnd\n\n  Section /o \"Serbian (Latin)\" SecLang_srp_latn\n    AddSize 3130\n    !insertmacro Download_Lang_Data srp_latn\n  SectionEnd\n\n  Section /o \"Sundanese\" SecLang_sun\n    AddSize 1310\n    !insertmacro Download_Lang_Data sun\n  SectionEnd\n\n  Section /o \"Swahili\" SecLang_swa\n    AddSize 2070\n    !insertmacro Download_Lang_Data swa\n  SectionEnd\n\n  Section /o \"Swedish\" SecLang_swe\n    AddSize 3970\n    !insertmacro Download_Lang_Data swe\n  SectionEnd\n\n  Section /o \"Syriac\" SecLang_syr\n    AddSize 2100\n    !insertmacro Download_Lang_Data syr\n  SectionEnd\n\n Section /o \"Tamil\" SecLang_tam\n    AddSize 3090\n    !insertmacro Download_Lang_Data tam\n  SectionEnd\n\n  Section /o 
\"Tatar\" SecLang_tat\n    AddSize 1020\n    !insertmacro Download_Lang_Data tat\n  SectionEnd\n\n  Section /o \"Telugu\" SecLang_tel\n    AddSize 2640\n    !insertmacro Download_Lang_Data tel\n  SectionEnd\n\n  Section /o \"Tajik\" SecLang_tgk\n    AddSize 2480\n    !insertmacro Download_Lang_Data tgk\n  SectionEnd\n\n  Section /o \"Thai\" SecLang_tha\n    AddSize 1020\n    !insertmacro Download_Lang_Data tha\n  SectionEnd\n\n  Section /o \"Tigrinya\" SecLang_tir\n    AddSize 370\n    !insertmacro Download_Lang_Data tir\n  SectionEnd\n\n Section /o \"Tonga\" SecLang_ton\n    AddSize 925\n    !insertmacro Download_Lang_Data ton\n  SectionEnd\n\n  Section /o \"Turkish\" SecLang_tur\n    AddSize 4240\n    !insertmacro Download_Lang_Data tur\n  SectionEnd\n\n  Section /o \"Uighur\" SecLang_uig\n    AddSize 2660\n    !insertmacro Download_Lang_Data uig\n  SectionEnd\n\n  Section /o \"Ukrainian\" SecLang_ukr\n    AddSize 3650\n    !insertmacro Download_Lang_Data ukr\n  SectionEnd\n\n  Section /o \"Urdu\" SecLang_urd\n    AddSize 1330\n    !insertmacro Download_Lang_Data urd\n  SectionEnd\n\n  Section /o \"Uzbek\" SecLang_uzb\n    AddSize 6170\n    !insertmacro Download_Lang_Data uzb\n  SectionEnd\n\n  Section /o \"Uzbek (Cyrillic)\" SecLang_uzb_cyrl\n    AddSize 1490\n    !insertmacro Download_Lang_Data uzb_cyrl\n  SectionEnd\n\n  Section /o \"Vietnamese\" SecLang_vie\n    AddSize 519\n    !insertmacro Download_Lang_Data vie\n  SectionEnd\n\n  Section /o \"Yiddish\" SecLang_yid\n    AddSize 533\n    !insertmacro Download_Lang_Data yid\n  SectionEnd\n\n  Section /o \"Yoruba\" SecLang_yor\n    AddSize 941\n    !insertmacro Download_Lang_Data yor\n  SectionEnd\n\nSectionGroupEnd\n\n;--------------------------------\n;Descriptions\n  ; At first we need to localize installer for languages which supports well in tesseract: Eng, Spa, Ger, Ita, Dutch + Russian (it is authors native language)\n  ;Language strings\n  LangString DESC_SEC0001 ${LANG_RUSSIAN} \"Установочные 
файлы.\"\n  ;LangString DESC_SecHelp ${LANG_RUSSIAN} \"Справочная информация.\"\n  LangString DESC_SecCS    ${LANG_RUSSIAN} \"Добавить ярлыки в меню Пуск\"\n\n  LangString DESC_SEC0001 ${LANG_ENGLISH} \"Installation files.\"\n  ;LangString DESC_SecHelp ${LANG_ENGLISH} \"Help information.\"\n  LangString DESC_SecScrollView ${LANG_ENGLISH} \"Extracts the Java-based ScrollView JAR files, which are used primarily by developers for debugging OCR results.\"\n  LangString DESC_SecTr ${LANG_ENGLISH} \"Deploys the additional executables required for users who require training on custom Tesseract OCR models.\"\n  LangString DESC_SecCS ${LANG_ENGLISH} \"Add shortcuts to Start menu.\"\n  LangString DESC_SecGrp_LD ${LANG_ENGLISH} \"Deploys the foundational English language pack and the Orientation and Script Detection (OSD) module.\"\n  LangString DESC_SecGrp_ASD ${LANG_ENGLISH} \"An optional group of sections that download script-level data files.\"\n  LangString DESC_SecGrp_ALD ${LANG_ENGLISH} \"An optional group containing dozens of specific language packs.\"\n  LangString DESC_SecAddEnvPath ${LANG_ENGLISH} \"Allows running Tesseract from any command prompt.\"\n\n  LangString DESC_SEC0001 ${LANG_FRENCH} \"Fichier d'installation.\"\n  ;LangString DESC_SecHelp ${LANG_FRENCH} \"Aide.\"\n  LangString DESC_SecCS   ${LANG_FRENCH} \"Ajouter des raccourcis vers le menu démarrer.\"\n\n  LangString DESC_SEC0001 ${LANG_GERMAN} \"Dateien für die Installation.\"\n ;LangString DESC_SecHelp ${LANG_GERMAN} \"Hilfe.\"\n  LangString DESC_SecCS   ${LANG_GERMAN} \"Einträge im Startmenü hinzufügen.\"\n\n  LangString DESC_SEC0001 ${LANG_ITALIAN} \"File di installazione.\"\n  ;LangString DESC_SecHelp ${LANG_ITALIAN} \"Guida di informazioni.\"\n  LangString DESC_SecCS    ${LANG_ITALIAN} \"Aggiungere collegamenti al menu Start.\"\n\n  LangString DESC_SEC0001 ${LANG_PORTUGUESE} \"Arquivos de instalação.\"\n  ;LangString DESC_SecHelp ${LANG_PORTUGUESE} \"Informação de ajuda.\"\n  LangString 
DESC_SecScrollView ${LANG_PORTUGUESE} \"Extrai os arquivos JAR do ScrollView baseados em Java, que são usados principalmente por desenvolvedores.\"\n  LangString DESC_SecTr ${LANG_PORTUGUESE} \"Instala os executáveis adicionais necessários para usuários que precisam treinar modelos OCR.\"\n  LangString DESC_SecCS ${LANG_PORTUGUESE} \"Adiciona atalhos ao Menu iniciar.\"\n  LangString DESC_SecGrp_LD ${LANG_PORTUGUESE} \"Instala o pacote básico de idioma inglês e o módulo de Orientação e Detecção de Roteiro (OSD).\"\n  LangString DESC_SecGrp_ASD ${LANG_PORTUGUESE} \"Um grupo opcional de seções que baixam arquivos de dados em nível de script.\"\n  LangString DESC_SecGrp_ALD ${LANG_PORTUGUESE} \"Um grupo opcional contendo dezenas de pacotes de idiomas específicos.\"\n\n  LangString DESC_SEC0001 ${LANG_SLOVAK} \"Súbory inštalácie.\"\n  ;LangString DESC_SecHelp ${LANG_SLOVAK} \"Pomocné informácie.\"\n  LangString DESC_SecCS    ${LANG_SLOVAK} \"Pridať odkaz do Start menu.\"\n\n  LangString DESC_SEC0001 ${LANG_SPANISH} \"Los archivos de instalación.\"\n  ;LangString DESC_SecHelp ${LANG_SPANISH} \"Información de ayuda.\"\n  LangString DESC_SecCS    ${LANG_SPANISH} \"Añadir accesos directos al menú Inicio.\"\n\n  LangString DESC_SEC0001 ${LANG_SPANISHINTERNATIONAL} \"Los archivos de instalación.\"\n  ;LangString DESC_SecHelp ${LANG_SPANISHINTERNATIONAL} \"Información de ayuda.\"\n  LangString DESC_SecCS    ${LANG_SPANISHINTERNATIONAL} \"Añadir accesos directos al menú Inicio.\"\n\n  ;Assign language strings to sections\n  !insertmacro MUI_FUNCTION_DESCRIPTION_BEGIN\n    !insertmacro MUI_DESCRIPTION_TEXT ${SEC0001} $(DESC_SEC0001)\n    !insertmacro MUI_DESCRIPTION_TEXT ${SecScrollView} $(DESC_SecScrollView)\n    !insertmacro MUI_DESCRIPTION_TEXT ${SecTr} $(DESC_SecTr)\n    !insertmacro MUI_DESCRIPTION_TEXT ${SecCS} $(DESC_SecCS)\n    !insertmacro MUI_DESCRIPTION_TEXT ${SecGrp_LD} $(DESC_SecGrp_LD)\n    !insertmacro MUI_DESCRIPTION_TEXT ${SecGrp_ASD} $(DESC_SecGrp_ASD)\n    
!insertmacro MUI_DESCRIPTION_TEXT ${SecGrp_ALD} $(DESC_SecGrp_ALD)\n  !insertmacro MUI_FUNCTION_DESCRIPTION_END\n\n;--------------------------------\n;Uninstaller Section\n\n;Section /o -un.Main UNSEC0000\nSection -un.Main UNSEC0000\n!ifdef W64\n  SetRegView 64\n!endif\n  DetailPrint \"Removing everything\"\n  Delete \"$SMPROGRAMS\\${PRODUCT_NAME}\\*.*\"\n  RMDir  \"$SMPROGRAMS\\${PRODUCT_NAME}\"\n  DetailPrint \"Removing registry info\"\n  DeleteRegKey HKLM \"Software\\Tesseract-OCR\"\n  SendMessage ${HWND_BROADCAST} ${WM_WININICHANGE} 0 \"STR:Environment\" /TIMEOUT=1000\n\n  # remove the Add/Remove information\n  DeleteRegKey HKLM \"${UNINST_KEY}\"\n  Delete \"${UNINST_EXE}\"\n  DeleteRegValue HKLM \"${REGKEY}\" Path\n  DeleteRegKey /IfEmpty HKLM \"${REGKEY}\\Components\"\n  DeleteRegKey /IfEmpty HKLM \"${REGKEY}\"\n  Delete \"$INSTDIR\\*.dll\"\n  Delete \"$INSTDIR\\*.exe\"\n  Delete \"$INSTDIR\\*.html\"\n  Delete \"$INSTDIR\\doc\\AUTHORS\"\n  Delete \"$INSTDIR\\doc\\LICENSE\"\n  Delete \"$INSTDIR\\doc\\README.md\"\n  RMDir \"$INSTDIR\\doc\"\n  RMDir /r \"$INSTDIR\\tessdata\"\n  RMDir \"$INSTDIR\"\nSectionEnd\n\nFunction PageReinstall\n\nFunctionEnd\n\nFunction PageLeaveReinstall\n\nFunctionEnd\n\n!macro REMOVE_REGKEY OLD_KEY\n  StrCmp ${OLD_KEY} HKLM 0 +3\n    DeleteRegKey HKLM \"${REGKEY}\"\n    Goto End\n  DeleteRegKey HKCU \"${REGKEY}\"\n  End:\n!macroend\n\nFunction .onInit\n!ifdef W64\n  SetRegView 64\n!endif\n  Call PreventMultipleInstances\n  !insertmacro MUI_LANGDLL_DISPLAY\n  ;RequestExecutionLevel admin\n  !insertmacro MULTIUSER_INIT\n\n  ; is tesseract already installed?\n  ReadRegStr $R0 HKCU \"${REGKEY}\" \"CurrentVersion\"\n  StrCpy $OLD_KEY HKCU\n  StrCmp $R0 \"\" TestHKLM AskUninstall\n  TestHKLM:\n    ReadRegStr $R0 HKLM \"${REGKEY}\" \"CurrentVersion\"\n    StrCpy $OLD_KEY HKLM\n    StrCmp $R0 \"\" SkipUnInstall\n  AskUninstall:\n    MessageBox MB_YESNO|MB_ICONEXCLAMATION \\\n      \"Tesseract-ocr version $R0 is installed (in $OLD_KEY)! 
Do you want to uninstall it first?$\\nUninstall will delete all files in '$INSTDIR'!\" \\\n       /SD IDYES IDNO SkipUnInstall IDYES UnInstall\n  UnInstall:\n    StrCmp $OLD_KEY \"HKLM\" UnInst_hklm\n       DetailPrint \"Uninstall: current user\"\n       readRegStr $R1 HKCU \"${UNINST_KEY}\" \"UninstallString\"\n       Goto try_uninstall\n    UnInst_hklm:\n       DetailPrint \"UnInstall: all users\"\n       readRegStr $R1 HKLM \"${UNINST_KEY}\" \"UninstallString\"\n    try_uninstall:\n      ClearErrors\n      ExecWait '$R1 _?=$INSTDIR'$0\n      ; Check if unstaller finished ok. If yes, then try to remove it from installer.\n      StrCmp $0 0 0 +3\n        !insertmacro REMOVE_REGKEY ${OLD_KEY}\n        Goto SkipUnInstall\n      messagebox mb_ok \"Uninstaller failed:\\n$0\\n\\nYou need to remove program manually.\"\n  SkipUnInstall:\n    ;InitPluginsDir\n    ;File /oname=$PLUGINSDIR\\splash.bmp \"${NSISDIR}\\Contrib\\Graphics\\Header\\nsis.bmp\"\n    ;File /oname=$PLUGINSDIR\\splash.bmp \"new.bmp\"\n    ;advsplash::show 1000 600 400 -1 $PLUGINSDIR\\splash\n    ;Pop $0          ; $0 has '1' if the user closed the splash screen early,\n                    ; '0' if everything closed normal, and '-1' if some error occurred.\n    ;IfFileExists $INSTDIR\\loadmain.exe PathGood\n  ;done:\n    ; Make selection based on System language ID\n    System::Call 'kernel32::GetSystemDefaultLangID() i .r0'\n    IntOp $0 $0 & 0xFFFF ; Mask the value to 16 bits to ensure only the LANGID is kept\n    ;http://msdn.microsoft.com/en-us/library/dd318693%28v=VS.85%29.aspx\n    StrCmp $0 \"1078\" Afrikaans\n    StrCmp $0 \"1052\" Albanian\n    StrCmp $0 \"5121\" Arabic\n    StrCmp $0 \"1068\" Azerbaijani\n    StrCmp $0 \"1069\" Basque\n    StrCmp $0 \"1059\" Belarusian\n    StrCmp $0 \"1093\" Bengali\n    StrCmp $0 \"1026\" Bulgarian\n    StrCmp $0 \"1027\" Catalan\n    StrCmp $0 \"1116\" Cherokee\n    StrCmp $0 \"31748\" Chinese_tra\n    StrCmp $0 \"4\" Chinese_sim\n    StrCmp $0 \"26\" 
Croatian\n    StrCmp $0 \"1029\" Czech\n    StrCmp $0 \"1030\" Danish\n    StrCmp $0 \"2067\" Dutch\n    StrCmp $0 \"1061\" Estonian\n    StrCmp $0 \"3079\" German\n    StrCmp $0 \"1032\" Greek\n    StrCmp $0 \"1035\" Finnish\n    StrCmp $0 \"2060\" French\n    StrCmp $0 \"1037\" Hebrew\n    StrCmp $0 \"1081\" Hindi\n    StrCmp $0 \"1038\" Hungarian\n    StrCmp $0 \"1039\" Icelandic\n    StrCmp $0 \"1057\" Indonesian\n    StrCmp $0 \"1040\" Italian\n    StrCmp $0 \"1041\" Japanese\n    StrCmp $0 \"1099\" Kannada\n    StrCmp $0 \"1042\" Korean\n    StrCmp $0 \"1062\" Latvian\n    StrCmp $0 \"1063\" Lithuanian\n    StrCmp $0 \"1071\" Macedonian\n    StrCmp $0 \"1100\" Malayalam\n    StrCmp $0 \"2110\" Malay\n    StrCmp $0 \"1082\" Maltese\n    StrCmp $0 \"1044\" Norwegian\n    StrCmp $0 \"1045\" Polish\n    StrCmp $0 \"1046\" Portuguese\n    StrCmp $0 \"1048\" Romanian\n    StrCmp $0 \"1049\" Russian\n    StrCmp $0 \"1051\" Slovak\n    StrCmp $0 \"1060\" Slovenian\n    StrCmp $0 \"11274\" Spanish\n    StrCmp $0 \"2074\" Serbian\n    StrCmp $0 \"1089\" Swahili\n    StrCmp $0 \"2077\" Swedish\n    StrCmp $0 \"1097\" Tamil\n    StrCmp $0 \"1098\" Telugu\n    StrCmp $0 \"1054\" Thai\n    StrCmp $0 \"1055\" Turkish\n    StrCmp $0 \"1058\" Ukrainian\n    StrCmp $0 \"1066\" Vietnamese\n\n    Goto lang_end\n\n    Afrikaans: !insertmacro SelectSection ${SecLang_afr}\n            Goto lang_end\n    Albanian: !insertmacro SelectSection ${SecLang_sqi}\n            Goto lang_end\n    Arabic: !insertmacro SelectSection ${SecLang_ara}\n            Goto lang_end\n    ;Assamese: !insertmacro SelectSection ${SecLang_asm}\n    ;        Goto lang_end\n    Azerbaijani: !insertmacro SelectSection ${SecLang_aze}\n            Goto lang_end\n    Basque: !insertmacro SelectSection ${SecLang_eus}\n            Goto lang_end\n    Belarusian: !insertmacro SelectSection ${SecLang_bel}\n            Goto lang_end\n    Bengali: !insertmacro SelectSection ${SecLang_ben}\n            Goto lang_end\n    
Bulgarian: !insertmacro SelectSection ${SecLang_bul}\n            Goto lang_end\n    Catalan: !insertmacro SelectSection ${SecLang_cat}\n            Goto lang_end\n    Cherokee: !insertmacro SelectSection ${SecLang_chr}\n            Goto lang_end\n    Chinese_tra: !insertmacro SelectSection ${SecLang_chi_tra}\n            Goto lang_end\n    Chinese_sim: !insertmacro SelectSection ${SecLang_chi_sim}\n            Goto lang_end\n    Croatian: !insertmacro SelectSection ${SecLang_hrv}\n            Goto lang_end\n    Czech: !insertmacro SelectSection ${SecLang_ces}\n            Goto lang_end\n    Danish: !insertmacro SelectSection ${SecLang_dan}\n            Goto lang_end\n    Dutch: !insertmacro SelectSection ${SecLang_nld}\n            Goto lang_end\n    Estonian: !insertmacro SelectSection ${SecLang_hrv}\n            Goto lang_end\n    German: !insertmacro SelectSection ${SecLang_deu}\n            Goto lang_end\n    Greek: !insertmacro SelectSection ${SecLang_ell}\n            !insertmacro SelectSection ${SecLang_grc}\n            Goto lang_end\n    Finnish: !insertmacro SelectSection ${SecLang_fin}\n            !insertmacro SelectSection ${SecLang_frm}\n            Goto lang_end\n    French: !insertmacro SelectSection ${SecLang_fra}\n            Goto lang_end\n    Hebrew: !insertmacro SelectSection ${SecLang_heb}\n            ;!insertmacro SelectSection ${SecLang_heb_com}\n            Goto lang_end\n    Hungarian: !insertmacro SelectSection ${SecLang_hun}\n            Goto lang_end\n    Hindi: !insertmacro SelectSection ${SecLang_hin}\n            Goto lang_end\n    Icelandic: !insertmacro SelectSection ${SecLang_isl}\n            Goto lang_end\n    Indonesian: !insertmacro SelectSection ${SecLang_ind}\n            Goto lang_end\n    Italian: !insertmacro SelectSection ${SecLang_ita}\n            !insertmacro SelectSection ${SecLang_ita_old}\n            Goto lang_end\n    Japanese: !insertmacro SelectSection ${SecLang_jpn}\n            Goto lang_end\n    Kannada: 
!insertmacro SelectSection ${SecLang_kan}\n            Goto lang_end\n    Korean: !insertmacro SelectSection ${SecLang_kor}\n            Goto lang_end\n    Latvian: !insertmacro SelectSection ${SecLang_lav}\n            Goto lang_end\n    Lithuanian: !insertmacro SelectSection ${SecLang_lit}\n            Goto lang_end\n    Macedonian: !insertmacro SelectSection ${SecLang_mkd}\n            Goto lang_end\n    Malayalam: !insertmacro SelectSection ${SecLang_mal}\n            Goto lang_end\n    Malay: !insertmacro SelectSection ${SecLang_msa}\n            Goto lang_end\n    Maltese: !insertmacro SelectSection ${SecLang_mlt}\n            Goto lang_end\n    Norwegian: !insertmacro SelectSection ${SecLang_nor}\n            Goto lang_end\n    Polish: !insertmacro SelectSection ${SecLang_pol}\n            Goto lang_end\n    Portuguese: !insertmacro SelectSection ${SecLang_por}\n            Goto lang_end\n    Romanian: !insertmacro SelectSection ${SecLang_ron}\n            Goto lang_end\n    Russian: !insertmacro SelectSection ${SecLang_rus}\n            Goto lang_end\n    Slovak: !insertmacro SelectSection ${SecLang_slk}\n            Goto lang_end\n    Slovenian: !insertmacro SelectSection ${SecLang_slv}\n            Goto lang_end\n    Spanish: !insertmacro SelectSection ${SecLang_spa}\n            !insertmacro SelectSection ${SecLang_spa_old}\n            Goto lang_end\n    Serbian: !insertmacro SelectSection ${SecLang_srp}\n            Goto lang_end\n    Swahili: !insertmacro SelectSection ${SecLang_swa}\n            Goto lang_end\n    Swedish: !insertmacro SelectSection ${SecLang_swe}\n            Goto lang_end\n    Tamil: !insertmacro SelectSection ${SecLang_tam}\n            Goto lang_end\n    Telugu: !insertmacro SelectSection ${SecLang_tel}\n            Goto lang_end\n    Thai: !insertmacro SelectSection ${SecLang_tha}\n            Goto lang_end\n    Turkish: !insertmacro SelectSection ${SecLang_tur}\n            Goto lang_end\n    Ukrainian: !insertmacro 
SelectSection ${SecLang_ukr}\n            Goto lang_end\n    Vietnamese: !insertmacro SelectSection ${SecLang_vie}\n\n    lang_end:\nFunctionEnd\n\nFunction un.onInit\n  !insertmacro MUI_LANGDLL_DISPLAY\n  !insertmacro MULTIUSER_UNINIT\n  ;!insertmacro SELECT_UNSECTION Main ${UNSEC0000}\n  ;!insertmacro MUI_UNGETLANGUAGE\nFunctionEnd\n\nFunction .onInstFailed\n  MessageBox MB_OK \"Installation failed.\"\nFunctionEnd\n\n!ifdef SHOW_README\nFunction ShowReadme\n  Exec '\"wordpad\" \"doc\\README.md\"'\n  ;BringToFront\nFunctionEnd\n!endif\n\n; Prevent running multiple instances of the installer\nFunction PreventMultipleInstances\n  Push $R0\n  System::Call 'kernel32::CreateMutex(p 0, i 0, t \"${PRODUCT_NAME}\") p .r1 ?e'\n  Pop $R0\n  ; 183 is the Windows error code for ERROR_ALREADY_EXISTS\n  StrCmp $R0 183 0 +4\n    MessageBox MB_OK|MB_ICONEXCLAMATION \"The installer is already running.\" /SD IDOK\n    Pop $R0\n    Abort\n  Pop $R0\nFunctionEnd\n"
  },
  {
    "path": "nsis/winpath.cpp",
    "content": "// Copyright (C) 2024 Stefan Weil\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n// winpath - run a Windows program with extended PATH\n//\n// Usage:\n//\n//     winpath [CMD [ARGUMENT ...]]\n//\n// Example:\n//\n//     winpath cmd\n//\n// This will start a Windows command line with PATH extended by\n// the location of the winpath executable.\n\n#include <process.h>    // _spawnvp\n#include <stdlib.h>     // _putenv_s\n#include <string.h>     // strcpy, strcat\n\nstatic char path[4096];\n\nint main(int argc, char *argv[]) {\n  if (argc > 1) {\n    char *dir = argv[0];\n    char *last = strrchr(dir, '\\\\');\n    if (last != nullptr) {\n      *last = '\\0';\n    }\n    strcpy(path, dir);\n    strcat(path, \";\");\n    strcat(path, getenv(\"PATH\"));\n    _putenv_s(\"PATH\", path);\n    _spawnvp(_P_WAIT, argv[1], argv + 1);\n    //~ _spawnvp(_P_OVERLAY, argv[1], argv + 1);\n  }\n  return 0;\n}\n"
  },
  {
    "path": "snap/snapcraft.yaml",
    "content": "name: tesseract\nversion: git\nsummary: open source optical character recognition engine\ndescription: |\n  Tesseract has unicode (UTF-8) support, and can recognize more than 100\n  languages \"out of the box\". It can be trained to recognize other languages.\n  Tesseract supports various output formats: plain-text, hocr(html), pdf.\n\n  If you want to access the files under /media/* or /run/media/* you'll have\n  to connect the snap to the `core` snap's `removable-media` interface:\n\n      $ sudo snap connect tesseract:removable-media\n\ngrade: stable # must be 'stable' to release into candidate/stable channels\nconfinement: strict\nbase: core22\n\napps:\n  tesseract:\n    command: usr/local/bin/tesseract\n    environment:\n      TESSDATA_PREFIX: $SNAP_USER_COMMON\n    plugs:\n      - home\n      - removable-media\n\nparts:\n  tesseract:\n    source: .\n    plugin: autotools\n    build-packages:\n      - pkg-config\n      - libpng-dev\n      - libjpeg-dev\n      - libtiff-dev\n      - zlib1g-dev\n      - libicu-dev\n      - libpango1.0-dev\n      - libcairo2-dev\n    stage-packages:\n      - libgomp1\n    after: [leptonica]\n  leptonica:\n    source: https://github.com/DanBloomberg/leptonica/archive/1.83.1.tar.gz\n    plugin: autotools\n    stage-packages:\n      - libjbig0\n      - libjpeg-turbo8\n      - libopenjp2-7\n      - libtiff5\n"
  },
  {
    "path": "src/api/altorenderer.cpp",
    "content": "// File:        altorenderer.cpp\n// Description: ALTO rendering interface\n// Author:      Jake Sebright\n\n// (C) Copyright 2018\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"errcode.h\" // for ASSERT_HOST\n#include \"helpers.h\" // for copy_string\n#include \"tprintf.h\" // for tprintf\n\n#include <tesseract/baseapi.h>\n#include <tesseract/renderer.h>\n\n#include <memory>\n#include <sstream> // for std::stringstream\n\nnamespace tesseract {\n\n/// Add coordinates to specified TextBlock, TextLine or String bounding box.\n/// Add word confidence if adding to a String bounding box.\n///\nstatic void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,\n                         std::stringstream &alto_str) {\n  int left, top, right, bottom;\n  it->BoundingBox(level, &left, &top, &right, &bottom);\n\n  int hpos = left;\n  int vpos = top;\n  int height = bottom - top;\n  int width = right - left;\n\n  alto_str << \" HPOS=\\\"\" << hpos << \"\\\"\";\n  alto_str << \" VPOS=\\\"\" << vpos << \"\\\"\";\n  alto_str << \" WIDTH=\\\"\" << width << \"\\\"\";\n  alto_str << \" HEIGHT=\\\"\" << height << \"\\\"\";\n\n  if (level == RIL_WORD) {\n    int wc = it->Confidence(RIL_WORD);\n    alto_str << \" WC=\\\"0.\" << wc << \"\\\"\";\n  } else {\n    alto_str << \">\";\n  }\n}\n\nstatic std::string GetID(const char *prefix, int page_number, int counter) {\n  std::stringstream idstr;\n  // IDs will only have the counter for the first 
page to keep them consistent\n  // with the IDs assigned before this change was made.\n  // From the second page on, IDs will also contain the page number to make them unique.\n  if (page_number == 0) {\n    idstr << prefix << \"_\" << counter;\n  } else {\n    idstr << prefix << \"_\" << page_number << \"_\" << counter;\n  }\n\n  return idstr.str();\n}\n\n///\n/// Append the ALTO XML for the beginning of the document\n///\nbool TessAltoRenderer::BeginDocumentHandler() {\n  // Delay the XML output because we need the name of the image file.\n  begin_document = true;\n  return true;\n}\n\n///\n/// Append the ALTO XML for the layout of the image\n///\nbool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {\n  if (begin_document) {\n    AppendString(\n      \"<?xml version=\\\"1.0\\\" encoding=\\\"UTF-8\\\"?>\\n\"\n      \"<alto xmlns=\\\"http://www.loc.gov/standards/alto/ns-v3#\\\" \"\n      \"xmlns:xsi=\\\"http://www.w3.org/2001/XMLSchema-instance\\\" \"\n      \"xsi:schemaLocation=\\\"http://www.loc.gov/standards/alto/ns-v3# \"\n      \"http://www.loc.gov/alto/v3/alto-3-0.xsd\\\">\\n\"\n      \"\\t<Description>\\n\"\n      \"\\t\\t<MeasurementUnit>pixel</MeasurementUnit>\\n\"\n      \"\\t\\t<sourceImageInformation>\\n\"\n      \"\\t\\t\\t<fileName>\");\n    AppendString(api->GetInputName());\n    AppendString(\n      \"</fileName>\\n\"\n      \"\\t\\t</sourceImageInformation>\\n\"\n      \"\\t\\t<OCRProcessing ID=\\\"OCR_0\\\">\\n\"\n      \"\\t\\t\\t<ocrProcessingStep>\\n\"\n      \"\\t\\t\\t\\t<processingSoftware>\\n\"\n      \"\\t\\t\\t\\t\\t<softwareName>Tesseract</softwareName>\\n\"\n      \"\\t\\t\\t\\t\\t<softwareVersion>\");\n    AppendString(TessBaseAPI::Version());\n    AppendString(\n      \"</softwareVersion>\\n\"\n      \"\\t\\t\\t\\t</processingSoftware>\\n\"\n      \"\\t\\t\\t</ocrProcessingStep>\\n\"\n      \"\\t\\t</OCRProcessing>\\n\"\n      \"\\t</Description>\\n\"\n      \"\\t<Layout>\\n\");\n    begin_document = false;\n  }\n\n  const 
std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));\n  if (text == nullptr) {\n    return false;\n  }\n\n  AppendString(text.get());\n\n  return true;\n}\n\n///\n/// Append the ALTO XML for the end of the document\n///\nbool TessAltoRenderer::EndDocumentHandler() {\n  AppendString(\"\\t</Layout>\\n</alto>\\n\");\n\n  return true;\n}\n\nTessAltoRenderer::TessAltoRenderer(const char *outputbase)\n    : TessResultRenderer(outputbase, \"xml\"),\n      begin_document(false) {}\n\n///\n/// Make an XML-formatted string with ALTO markup from the internal\n/// data structures.\n///\nchar *TessBaseAPI::GetAltoText(int page_number) {\n  return GetAltoText(nullptr, page_number);\n}\n\n///\n/// Make an XML-formatted string with ALTO markup from the internal\n/// data structures.\n///\nchar *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {\n  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {\n    return nullptr;\n  }\n\n  int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;\n\n  if (input_file_.empty()) {\n    SetInputName(nullptr);\n  }\n\n  std::stringstream alto_str;\n  // Use \"C\" locale (needed for int values larger than 999).\n  alto_str.imbue(std::locale::classic());\n  alto_str << \"\\t\\t<Page WIDTH=\\\"\" << rect_width_ << \"\\\" HEIGHT=\\\"\" << rect_height_\n           << \"\\\" PHYSICAL_IMG_NR=\\\"\" << page_number << \"\\\"\"\n           << \" ID=\\\"page_\" << page_number << \"\\\">\\n\"\n           << \"\\t\\t\\t<PrintSpace HPOS=\\\"0\\\" VPOS=\\\"0\\\"\"\n           << \" WIDTH=\\\"\" << rect_width_ << \"\\\"\"\n           << \" HEIGHT=\\\"\" << rect_height_ << \"\\\">\\n\";\n\n  std::unique_ptr<ResultIterator> res_it(GetIterator());\n  while (!res_it->Empty(RIL_BLOCK)) {\n    if (res_it->Empty(RIL_WORD)) {\n      res_it->Next(RIL_WORD);\n      continue;\n    }\n\n    int left, top, right, bottom;\n    auto block_type = res_it->BlockType();\n\n    switch (block_type) {\n      case PT_FLOWING_IMAGE:\n      case 
PT_HEADING_IMAGE:\n      case PT_PULLOUT_IMAGE: {\n        // Handle all kinds of images.\n        // TODO: optionally add TYPE, for example TYPE=\"photo\".\n        alto_str << \"\\t\\t\\t\\t<Illustration ID=\\\"\" << GetID(\"cblock\", page_number, bcnt++) << \"\\\"\";\n        AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);\n        alto_str << \"</Illustration>\\n\";\n        res_it->Next(RIL_BLOCK);\n        continue;\n      }\n      case PT_HORZ_LINE:\n      case PT_VERT_LINE:\n        // Handle horizontal and vertical lines.\n        alto_str << \"\\t\\t\\t\\t<GraphicalElement ID=\\\"\" << GetID(\"cblock\", page_number, bcnt++) << \"\\\"\";\n        AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);\n        alto_str << \"</GraphicalElement >\\n\";\n        res_it->Next(RIL_BLOCK);\n        continue;\n      case PT_NOISE:\n        tprintf(\"TODO: Please report image which triggers the noise case.\\n\");\n        ASSERT_HOST(false);\n      default:\n        break;\n    }\n\n    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {\n      alto_str << \"\\t\\t\\t\\t<ComposedBlock ID=\\\"\" << GetID(\"cblock\", page_number, bcnt) << \"\\\"\";\n      AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);\n      alto_str << \"\\n\";\n    }\n\n    if (res_it->IsAtBeginningOf(RIL_PARA)) {\n      alto_str << \"\\t\\t\\t\\t\\t<TextBlock ID=\\\"\" << GetID(\"block\", page_number, tcnt) << \"\\\"\";\n      AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);\n      alto_str << \"\\n\";\n    }\n\n    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {\n      alto_str << \"\\t\\t\\t\\t\\t\\t<TextLine ID=\\\"\" << GetID(\"line\", page_number, lcnt) << \"\\\"\";\n      AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);\n      alto_str << \"\\n\";\n    }\n\n    alto_str << \"\\t\\t\\t\\t\\t\\t\\t<String ID=\\\"\" << GetID(\"string\", page_number, wcnt) << \"\\\"\";\n    AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);\n    alto_str << \" CONTENT=\\\"\";\n\n    bool last_word_in_line = 
res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);\n    bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);\n    bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);\n\n    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);\n\n    do {\n      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));\n      if (grapheme && grapheme[0] != 0) {\n        alto_str << HOcrEscape(grapheme.get()).c_str();\n      }\n      res_it->Next(RIL_SYMBOL);\n    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));\n\n    alto_str << \"\\\"/>\";\n\n    wcnt++;\n\n    if (last_word_in_line) {\n      alto_str << \"\\n\\t\\t\\t\\t\\t\\t</TextLine>\\n\";\n      lcnt++;\n    } else {\n      int hpos = right;\n      int vpos = top;\n      res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);\n      int width = left - hpos;\n      alto_str << \"<SP WIDTH=\\\"\" << width << \"\\\" VPOS=\\\"\" << vpos << \"\\\" HPOS=\\\"\" << hpos\n               << \"\\\"/>\\n\";\n    }\n\n    if (last_word_in_tblock) {\n      alto_str << \"\\t\\t\\t\\t\\t</TextBlock>\\n\";\n      tcnt++;\n    }\n\n    if (last_word_in_cblock) {\n      alto_str << \"\\t\\t\\t\\t</ComposedBlock>\\n\";\n      bcnt++;\n    }\n  }\n\n  alto_str << \"\\t\\t\\t</PrintSpace>\\n\"\n           << \"\\t\\t</Page>\\n\";\n\n  return copy_string(alto_str.str());\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/api/baseapi.cpp",
    "content": "/**********************************************************************\n * File:        baseapi.cpp\n * Description: Simple API for calling tesseract.\n * Author:      Ray Smith\n *\n * (C) Copyright 2006, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#define _USE_MATH_DEFINES // for M_PI\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"boxword.h\"    // for BoxWord\n#include \"coutln.h\"     // for C_OUTLINE_IT, C_OUTLINE_LIST\n#include \"dawg_cache.h\" // for DawgCache\n#include \"dict.h\"       // for Dict\n#include \"elst.h\"       // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH\n#include \"environ.h\"    // for l_uint8\n#ifndef DISABLED_LEGACY_ENGINE\n#include \"equationdetect.h\" // for EquationDetect, destructor of equ_detect_\n#endif // ndef DISABLED_LEGACY_ENGINE\n#include \"errcode.h\" // for ASSERT_HOST\n#include \"helpers.h\" // for IntCastRounded, chomp_string, copy_string\n#include \"host.h\"    // for MAX_PATH\n#include \"imageio.h\" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"intfx.h\" // for INT_FX_RESULT_STRUCT\n#endif\n#include \"mutableiterator.h\" // for MutableIterator\n#include \"normalis.h\"        // for kBlnBaselineOffset, kBlnXHeight\n#include \"pageres.h\"         // for 
PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...\n#include \"paragraphs.h\"      // for DetectParagraphs\n#include \"params.h\"          // for BoolParam, IntParam, DoubleParam, Stri...\n#include \"pdblock.h\"         // for PDBLK\n#include \"points.h\"          // for FCOORD\n#include \"polyblk.h\"         // for POLY_BLOCK\n#include \"rect.h\"            // for TBOX\n#include \"stepblob.h\"        // for C_BLOB_IT, C_BLOB, C_BLOB_LIST\n#include \"tessdatamanager.h\" // for TessdataManager, kTrainedDataSuffix\n#include \"tesseractclass.h\"  // for Tesseract\n#include \"tprintf.h\"         // for tprintf\n#include \"werd.h\"            // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP\n#include \"thresholder.h\"     // for ImageThresholder\n\n#include <tesseract/baseapi.h>\n#include <tesseract/ocrclass.h>       // for ETEXT_DESC\n#include <tesseract/osdetect.h>       // for OSResults, OSBestResult, OrientationId...\n#include <tesseract/renderer.h>       // for TessResultRenderer\n#include <tesseract/resultiterator.h> // for ResultIterator\n\n#include <cmath>    // for round, M_PI\n#include <cstdint>  // for int32_t\n#include <cstring>  // for strcmp, strcpy\n#include <filesystem> // for std::filesystem\n#include <fstream>  // for size_t\n#include <iostream> // for std::cin\n#include <locale>   // for std::locale::classic\n#include <memory>   // for std::unique_ptr\n#include <set>      // for std::pair\n#include <sstream>  // for std::stringstream\n#include <vector>   // for std::vector\n\n#include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...\n#ifdef HAVE_LIBCURL\n#  include <curl/curl.h>\n#endif\n\n#ifdef __linux__\n#  include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE\n#endif\n\n#if defined(_WIN32)\n#  include <fcntl.h> // for _O_BINARY\n#  include <io.h>    // for _setmode\n#endif\n\nnamespace tesseract {\n\nstatic BOOL_VAR(stream_filelist, false, \"Stream a filelist from stdin\");\nstatic STRING_VAR(document_title, \"\", \"Title of output 
document (used for hOCR and PDF output)\");\n#ifdef HAVE_LIBCURL\nstatic INT_VAR(curl_timeout, 0, \"Timeout for curl in seconds\");\nstatic STRING_VAR(curl_cookiefile, \"\", \"File with cookie data for curl\");\n#endif\n\n/** Minimum sensible image size to be worth running Tesseract. */\nconst int kMinRectSize = 10;\n/** Character returned when Tesseract couldn't recognize as anything. */\nconst char kTesseractReject = '~';\n/** Character used by UNLV error counter as a reject. */\nconst char kUNLVReject = '~';\n/** Character used by UNLV as a suspect marker. */\nconst char kUNLVSuspect = '^';\n/**\n * Temp file used for storing current parameters before applying retry values.\n */\nstatic const char *kOldVarsFile = \"failed_vars.txt\";\n\n#ifndef DISABLED_LEGACY_ENGINE\n/**\n * Filename used for input image file, from which to derive a name to search\n * for a possible UNLV zone file, if none is specified by SetInputName.\n */\nstatic const char *kInputFile = \"noname.tif\";\nstatic const char kUnknownFontName[] = \"UnknownFont\";\n\nstatic STRING_VAR(classify_font_name, kUnknownFontName,\n                  \"Default font name to be used in training\");\n\n// Finds the name of the training font and returns it in fontname, by cutting\n// it out based on the expectation that the filename is of the form:\n// /path/to/dir/[lang].[fontname].exp[num]\n// The [lang], [fontname] and [num] fields should not have '.' characters.\n// If the global parameter classify_font_name is set, its value is used instead.\nstatic void ExtractFontName(const char* filename, std::string* fontname) {\n  *fontname = classify_font_name;\n  if (*fontname == kUnknownFontName) {\n    // filename is expected to be of the form [lang].[fontname].exp[num]\n    // The [lang], [fontname] and [num] fields should not have '.' characters.\n    const char *basename = strrchr(filename, '/');\n    const char *firstdot = strchr(basename ? 
basename : filename, '.');\n    const char *lastdot  = strrchr(filename, '.');\n    if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {\n      ++firstdot;\n      *fontname = firstdot;\n      fontname->resize(lastdot - firstdot);\n    }\n  }\n}\n#endif\n\n/* Add all available languages recursively.\n */\nstatic void addAvailableLanguages(const std::string &datadir,\n                                  std::vector<std::string> *langs) {\n  for (const auto& entry :\n       std::filesystem::recursive_directory_iterator(datadir,\n         std::filesystem::directory_options::follow_directory_symlink |\n         std::filesystem::directory_options::skip_permission_denied)) {\n    auto path = entry.path().lexically_relative(datadir);\n    if (path.extension() == \".traineddata\") {\n      langs->push_back(path.replace_extension(\"\").string());\n    }\n  }\n}\n\nTessBaseAPI::TessBaseAPI()\n    : tesseract_(nullptr)\n    , osd_tesseract_(nullptr)\n    , equ_detect_(nullptr)\n    , reader_(nullptr)\n    ,\n    // thresholder_ is initialized to nullptr here, but will be set before use\n    // by: A constructor of a derived API or created\n    // implicitly when used in InternalSetImage.\n    thresholder_(nullptr)\n    , paragraph_models_(nullptr)\n    , block_list_(nullptr)\n    , page_res_(nullptr)\n    , last_oem_requested_(OEM_DEFAULT)\n    , recognition_done_(false)\n    , rect_left_(0)\n    , rect_top_(0)\n    , rect_width_(0)\n    , rect_height_(0)\n    , image_width_(0)\n    , image_height_(0) {\n}\n\nTessBaseAPI::~TessBaseAPI() {\n  End();\n}\n\n/**\n * Returns the version identifier as a static string. Do not delete.\n */\nconst char *TessBaseAPI::Version() {\n  return TESSERACT_VERSION_STR;\n}\n\n/**\n * Set the name of the input file. Needed only for training and\n * loading a UNLV zone file.\n */\nvoid TessBaseAPI::SetInputName(const char *name) {\n  input_file_ = name ? name : \"\";\n}\n\n/** Set the name of the output files. 
Needed only for debugging. */\nvoid TessBaseAPI::SetOutputName(const char *name) {\n  output_file_ = name ? name : \"\";\n}\n\nbool TessBaseAPI::SetVariable(const char *name, const char *value) {\n  if (tesseract_ == nullptr) {\n    tesseract_ = new Tesseract;\n  }\n  return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,\n                              tesseract_->params());\n}\n\nbool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {\n  if (tesseract_ == nullptr) {\n    tesseract_ = new Tesseract;\n  }\n  return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params());\n}\n\nbool TessBaseAPI::GetIntVariable(const char *name, int *value) const {\n  auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,\n                                            tesseract_->params()->int_params);\n  if (p == nullptr) {\n    return false;\n  }\n  *value = (int32_t)(*p);\n  return true;\n}\n\nbool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {\n  auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,\n                                             tesseract_->params()->bool_params);\n  if (p == nullptr) {\n    return false;\n  }\n  *value = bool(*p);\n  return true;\n}\n\nconst char *TessBaseAPI::GetStringVariable(const char *name) const {\n  auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params,\n                                               tesseract_->params()->string_params);\n  return (p != nullptr) ? 
p->c_str() : nullptr;\n}\n\nbool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {\n  auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,\n                                               tesseract_->params()->double_params);\n  if (p == nullptr) {\n    return false;\n  }\n  *value = (double)(*p);\n  return true;\n}\n\n/** Get value of named variable as a string, if it exists. */\nbool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {\n  return ParamUtils::GetParamAsString(name, tesseract_->params(), val);\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n/** Print Tesseract fonts table to the given file. */\nvoid TessBaseAPI::PrintFontsTable(FILE *fp) const {\n  const int fontinfo_size = tesseract_->get_fontinfo_table().size();\n  for (int font_index = 1; font_index < fontinfo_size; ++font_index) {\n    FontInfo font = tesseract_->get_fontinfo_table().at(font_index);\n    fprintf(fp, \"ID=%3d: %s is_italic=%s is_bold=%s\"\n                \" is_fixed_pitch=%s is_serif=%s is_fraktur=%s\\n\",\n                font_index, font.name,\n                font.is_italic() ? \"true\" : \"false\",\n                font.is_bold() ? \"true\" : \"false\",\n                font.is_fixed_pitch() ? \"true\" : \"false\",\n                font.is_serif() ? \"true\" : \"false\",\n                font.is_fraktur() ? \"true\" : \"false\");\n  }\n}\n\n#endif\n\n/** Print Tesseract parameters to the given file. 
*/\nvoid TessBaseAPI::PrintVariables(FILE *fp) const {\n  ParamUtils::PrintParams(fp, tesseract_->params());\n}\n\n/**\n * The datapath must be the name of the data directory or\n * some other file in which the data directory resides (for instance argv[0].)\n * The language is (usually) an ISO 639-3 string or nullptr will default to eng.\n * If numeric_mode is true, then only digits and Roman numerals will\n * be returned.\n * @return: 0 on success and -1 on initialization failure.\n */\nint TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,\n                      int configs_size, const std::vector<std::string> *vars_vec,\n                      const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {\n  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,\n              set_only_non_debug_params, nullptr);\n}\n\n// In-memory version reads the traineddata file directly from the given\n// data[data_size] array. Also implements the version with a datapath in data,\n// flagged by data_size = 0.\nint TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,\n                      char **configs, int configs_size, const std::vector<std::string> *vars_vec,\n                      const std::vector<std::string> *vars_values, bool set_only_non_debug_params,\n                      FileReader reader) {\n  if (language == nullptr) {\n    language = \"\";\n  }\n  if (data == nullptr) {\n    data = \"\";\n  }\n  std::string datapath = data_size == 0 ? data : language;\n  // If the datapath, OcrEngineMode or the language have changed - start again.\n  // Note that the language_ field stores the last requested language that was\n  // initialized successfully, while tesseract_->lang stores the language\n  // actually used. 
They differ only if the requested language was nullptr, in\n  // which case tesseract_->lang is set to the Tesseract default (\"eng\").\n  if (tesseract_ != nullptr &&\n      (datapath_.empty() || language_.empty() || datapath_ != datapath ||\n       last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {\n    delete tesseract_;\n    tesseract_ = nullptr;\n  }\n  bool reset_classifier = true;\n  if (tesseract_ == nullptr) {\n    reset_classifier = false;\n    tesseract_ = new Tesseract;\n    if (reader != nullptr) {\n      reader_ = reader;\n    }\n    TessdataManager mgr(reader_);\n    if (data_size != 0) {\n      mgr.LoadMemBuffer(language, data, data_size);\n    }\n    if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs,\n                                   configs_size, vars_vec, vars_values, set_only_non_debug_params,\n                                   &mgr) != 0) {\n      return -1;\n    }\n  }\n\n  // Update datapath and language requested for the last valid initialization.\n  datapath_ = std::move(datapath);\n  if (datapath_.empty() && !tesseract_->datadir.empty()) {\n    datapath_ = tesseract_->datadir;\n  }\n\n  language_ = language;\n  last_oem_requested_ = oem;\n\n#ifndef DISABLED_LEGACY_ENGINE\n  // For same language and datapath, just reset the adaptive classifier.\n  if (reset_classifier) {\n    tesseract_->ResetAdaptiveClassifier();\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n  return 0;\n}\n\n/**\n * Returns the languages string used in the last valid initialization.\n * If the last initialization specified \"deu+hin\" then that will be\n * returned. If hin loaded eng automatically as well, then that will\n * not be included in this list. 
To find the languages actually\n * loaded use GetLoadedLanguagesAsVector.\n * The returned string should NOT be deleted.\n */\nconst char *TessBaseAPI::GetInitLanguagesAsString() const {\n  return language_.c_str();\n}\n\n/**\n * Returns the loaded languages in the vector of std::string.\n * Includes all languages loaded by the last Init, including those loaded\n * as dependencies of other loaded languages.\n */\nvoid TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {\n  langs->clear();\n  if (tesseract_ != nullptr) {\n    langs->push_back(tesseract_->lang);\n    int num_subs = tesseract_->num_sub_langs();\n    for (int i = 0; i < num_subs; ++i) {\n      langs->push_back(tesseract_->get_sub_lang(i)->lang);\n    }\n  }\n}\n\n/**\n * Returns the available languages in the sorted vector of std::string.\n */\nvoid TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {\n  langs->clear();\n  if (tesseract_ != nullptr) {\n    addAvailableLanguages(tesseract_->datadir, langs);\n    std::sort(langs->begin(), langs->end());\n  }\n}\n\n/**\n * Init only for page layout analysis. Use only for calls to SetImage and\n * AnalysePage. Calls that attempt recognition will generate an error.\n */\nvoid TessBaseAPI::InitForAnalysePage() {\n  if (tesseract_ == nullptr) {\n    tesseract_ = new Tesseract;\n#ifndef DISABLED_LEGACY_ENGINE\n    tesseract_->InitAdaptiveClassifier(nullptr);\n#endif\n  }\n}\n\n/**\n * Read a \"config\" file containing a set of parameter name, value pairs.\n * Searches the standard places: tessdata/configs, tessdata/tessconfigs\n * and also accepts a relative or absolute path name.\n */\nvoid TessBaseAPI::ReadConfigFile(const char *filename) {\n  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);\n}\n\n/** Same as above, but only set debug params from the given config file. 
*/\nvoid TessBaseAPI::ReadDebugConfigFile(const char *filename) {\n  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);\n}\n\n/**\n * Set the current page segmentation mode. Defaults to PSM_AUTO.\n * The mode is stored as an IntParam so it can also be modified by\n * ReadConfigFile or SetVariable(\"tessedit_pageseg_mode\", mode as string).\n */\nvoid TessBaseAPI::SetPageSegMode(PageSegMode mode) {\n  if (tesseract_ == nullptr) {\n    tesseract_ = new Tesseract;\n  }\n  tesseract_->tessedit_pageseg_mode.set_value(mode);\n}\n\n/** Return the current page segmentation mode. */\nPageSegMode TessBaseAPI::GetPageSegMode() const {\n  if (tesseract_ == nullptr) {\n    return PSM_SINGLE_BLOCK;\n  }\n  return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode));\n}\n\n/**\n * Recognize a rectangle from an image and return the result as a string.\n * May be called many times for a single Init.\n * Currently has no error checking.\n * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.\n * Palette color images will not work properly and must be converted to\n * 24 bit.\n * Binary images of 1 bit per pixel may also be given but they must be\n * byte packed with the MSB of the first byte being the first pixel, and a\n * one pixel is WHITE. For binary images set bytes_per_pixel=0.\n * The recognized text is returned as a char* which is coded\n * as UTF8 and must be freed with the delete [] operator.\n */\nchar *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,\n                                 int bytes_per_line, int left, int top, int width, int height) {\n  if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) {\n    return nullptr; // Nothing worth doing.\n  }\n\n  // Since this original api didn't give the exact size of the image,\n  // we have to invent a reasonable value.\n  int bits_per_pixel = bytes_per_pixel == 0 ? 
1 : bytes_per_pixel * 8;\n  SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel,\n           bytes_per_line);\n  SetRectangle(left, top, width, height);\n\n  return GetUTF8Text();\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n/**\n * Call between pages or documents etc to free up memory and forget\n * adaptive data.\n */\nvoid TessBaseAPI::ClearAdaptiveClassifier() {\n  if (tesseract_ == nullptr) {\n    return;\n  }\n  tesseract_->ResetAdaptiveClassifier();\n  tesseract_->ResetDocumentDictionary();\n}\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n/**\n * Provide an image for Tesseract to recognize. Format is as\n * TesseractRect above. Copies the image buffer and converts to Pix.\n * SetImage clears all recognition results, and sets the rectangle to the\n * full image, so it may be followed immediately by a GetUTF8Text, and it\n * will automatically perform recognition.\n */\nvoid TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,\n                           int bytes_per_pixel, int bytes_per_line) {\n  if (InternalSetImage()) {\n    thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);\n    SetInputImage(thresholder_->GetPixRect());\n  }\n}\n\nvoid TessBaseAPI::SetSourceResolution(int ppi) {\n  if (thresholder_) {\n    thresholder_->SetSourceYResolution(ppi);\n  } else {\n    tprintf(\"Please call SetImage before SetSourceResolution.\\n\");\n  }\n}\n\n/**\n * Provide an image for Tesseract to recognize. As with SetImage above,\n * Tesseract takes its own copy of the image, so it need not persist until\n * after Recognize.\n * Pix vs raw, which to use?\n * Use Pix where possible. 
Tesseract uses Pix as its internal representation\n * and it is therefore more efficient to provide a Pix directly.\n */\nvoid TessBaseAPI::SetImage(Pix *pix) {\n  if (InternalSetImage()) {\n    if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {\n      // remove alpha channel from png\n      Pix *p1 = pixRemoveAlpha(pix);\n      pixSetSpp(p1, 3);\n      (void)pixCopy(pix, p1);\n      pixDestroy(&p1);\n    }\n    thresholder_->SetImage(pix);\n    SetInputImage(thresholder_->GetPixRect());\n  }\n}\n\n/**\n * Restrict recognition to a sub-rectangle of the image. Call after SetImage.\n * Each SetRectangle clears the recognition results so multiple rectangles\n * can be recognized with the same image.\n */\nvoid TessBaseAPI::SetRectangle(int left, int top, int width, int height) {\n  if (thresholder_ == nullptr) {\n    return;\n  }\n  thresholder_->SetRectangle(left, top, width, height);\n  ClearResults();\n}\n\n/**\n * ONLY available after SetImage if you have Leptonica installed.\n * Get a copy of the internal thresholded image from Tesseract.\n */\nPix *TessBaseAPI::GetThresholdedImage() {\n  if (tesseract_ == nullptr || thresholder_ == nullptr) {\n    return nullptr;\n  }\n  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {\n    return nullptr;\n  }\n  return tesseract_->pix_binary().clone();\n}\n\n/**\n * Get the result of page layout analysis as a leptonica-style\n * Boxa, Pixa pair, in reading order.\n * Can be called before or after Recognize.\n */\nBoxa *TessBaseAPI::GetRegions(Pixa **pixa) {\n  return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);\n}\n\n/**\n * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.\n * Can be called before or after Recognize.\n * If blockids is not nullptr, the block-id of each line is also returned as an\n * array of one element per line. 
delete [] after use.\n * If paraids is not nullptr, the paragraph-id of each line within its block is\n * also returned as an array of one element per line. delete [] after use.\n */\nBoxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,\n                                int **blockids, int **paraids) {\n  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);\n}\n\n/**\n * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa\n * pair, in reading order. Enables downstream handling of non-rectangular\n * regions.\n * Can be called before or after Recognize.\n * If blockids is not nullptr, the block-id of each line is also returned as an\n * array of one element per line. delete [] after use.\n */\nBoxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {\n  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);\n}\n\n/**\n * Get the words as a leptonica-style\n * Boxa, Pixa pair, in reading order.\n * Can be called before or after Recognize.\n */\nBoxa *TessBaseAPI::GetWords(Pixa **pixa) {\n  return GetComponentImages(RIL_WORD, true, pixa, nullptr);\n}\n\n/**\n * Gets the individual connected (text) components (created\n * after pages segmentation step, but before recognition)\n * as a leptonica-style Boxa, Pixa pair, in reading order.\n * Can be called before or after Recognize.\n */\nBoxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) {\n  return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);\n}\n\n/**\n * Get the given level kind of components (block, textline, word etc.) as a\n * leptonica-style Boxa, Pixa pair, in reading order.\n * Can be called before or after Recognize.\n * If blockids is not nullptr, the block-id of each component is also returned\n * as an array of one element per component. 
delete [] after use.\n * If text_only is true, then only text components are returned.\n */\nBoxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,\n                                      const int raw_padding, Pixa **pixa, int **blockids,\n                                      int **paraids) {\n  /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator());\n  if (page_it == nullptr) {\n    page_it.reset(AnalyseLayout());\n  }\n  if (page_it == nullptr) {\n    return nullptr; // Failed.\n  }\n\n  // Count the components to get a size for the arrays.\n  int component_count = 0;\n  int left, top, right, bottom;\n\n  if (raw_image) {\n    // Get bounding box in original raw image with padding.\n    do {\n      if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) &&\n          (!text_only || PTIsTextType(page_it->BlockType()))) {\n        ++component_count;\n      }\n    } while (page_it->Next(level));\n  } else {\n    // Get bounding box from binarized imaged. 
Note that this could be\n    // differently scaled from the original image.\n    do {\n      if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&\n          (!text_only || PTIsTextType(page_it->BlockType()))) {\n        ++component_count;\n      }\n    } while (page_it->Next(level));\n  }\n\n  Boxa *boxa = boxaCreate(component_count);\n  if (pixa != nullptr) {\n    *pixa = pixaCreate(component_count);\n  }\n  if (blockids != nullptr) {\n    *blockids = new int[component_count];\n  }\n  if (paraids != nullptr) {\n    *paraids = new int[component_count];\n  }\n\n  int blockid = 0;\n  int paraid = 0;\n  int component_index = 0;\n  page_it->Begin();\n  do {\n    bool got_bounding_box;\n    if (raw_image) {\n      got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);\n    } else {\n      got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);\n    }\n    if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) {\n      Box *lbox = boxCreate(left, top, right - left, bottom - top);\n      boxaAddBox(boxa, lbox, L_INSERT);\n      if (pixa != nullptr) {\n        Pix *pix = nullptr;\n        if (raw_image) {\n          pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top);\n        } else {\n          pix = page_it->GetBinaryImage(level);\n        }\n        pixaAddPix(*pixa, pix, L_INSERT);\n        pixaAddBox(*pixa, lbox, L_CLONE);\n      }\n      if (paraids != nullptr) {\n        (*paraids)[component_index] = paraid;\n        if (page_it->IsAtFinalElement(RIL_PARA, level)) {\n          ++paraid;\n        }\n      }\n      if (blockids != nullptr) {\n        (*blockids)[component_index] = blockid;\n        if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {\n          ++blockid;\n          paraid = 0;\n        }\n      }\n      ++component_index;\n    }\n  } while (page_it->Next(level));\n  return boxa;\n}\n\nint 
TessBaseAPI::GetThresholdedImageScaleFactor() const {\n  if (thresholder_ == nullptr) {\n    return 0;\n  }\n  return thresholder_->GetScaleFactor();\n}\n\n/**\n * Runs page layout analysis in the mode set by SetPageSegMode.\n * May optionally be called prior to Recognize to get access to just\n * the page layout results. Returns an iterator to the results.\n * If merge_similar_words is true, words are combined where suitable for use\n * with a line recognizer. Use if you want to use AnalyseLayout to find the\n * textlines, and then want to process textline fragments with an external\n * line recognizer.\n * Returns nullptr on error or an empty page.\n * The returned iterator must be deleted after use.\n * WARNING! This class points to data held within the TessBaseAPI class, and\n * therefore can only be used while the TessBaseAPI class still exists and\n * has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n * DetectOS, or anything else that changes the internal PAGE_RES.\n */\nPageIterator *TessBaseAPI::AnalyseLayout() {\n  return AnalyseLayout(false);\n}\n\nPageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {\n  if (FindLines() == 0) {\n    if (block_list_->empty()) {\n      return nullptr; // The page was empty.\n    }\n    page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);\n    DetectParagraphs(false);\n    return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),\n                            thresholder_->GetScaledYResolution(), rect_left_, rect_top_,\n                            rect_width_, rect_height_);\n  }\n  return nullptr;\n}\n\n/**\n * Recognize the tesseract global image and return the result as Tesseract\n * internal structures.\n */\nint TessBaseAPI::Recognize(ETEXT_DESC *monitor) {\n  if (tesseract_ == nullptr) {\n    return -1;\n  }\n  if (FindLines() != 0) {\n    return -1;\n  }\n  delete page_res_;\n  if (block_list_->empty()) {\n    page_res_ = new PAGE_RES(false, 
block_list_, &tesseract_->prev_word_best_choice_);\n    return 0; // Empty page.\n  }\n\n  tesseract_->SetBlackAndWhitelist();\n  recognition_done_ = true;\n#ifndef DISABLED_LEGACY_ENGINE\n  if (tesseract_->tessedit_resegment_from_line_boxes) {\n    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_);\n  } else if (tesseract_->tessedit_resegment_from_boxes) {\n    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_);\n  } else\n#endif // ndef DISABLED_LEGACY_ENGINE\n  {\n    page_res_ =\n        new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_);\n  }\n\n  if (page_res_ == nullptr) {\n    return -1;\n  }\n\n  if (tesseract_->tessedit_train_line_recognizer) {\n    if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) {\n      return -1;\n    }\n    tesseract_->CorrectClassifyWords(page_res_);\n    return 0;\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  if (tesseract_->tessedit_make_boxes_from_boxes) {\n    tesseract_->CorrectClassifyWords(page_res_);\n    return 0;\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  int result = 0;\n  if (tesseract_->interactive_display_mode) {\n#ifndef GRAPHICS_DISABLED\n    tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);\n#endif // !GRAPHICS_DISABLED\n       // The page_res is invalid after an interactive session, so cleanup\n       // in a way that lets us continue to the next page without crashing.\n    delete page_res_;\n    page_res_ = nullptr;\n    return -1;\n#ifndef DISABLED_LEGACY_ENGINE\n  } else if (tesseract_->tessedit_train_from_boxes) {\n    std::string fontname;\n    ExtractFontName(output_file_.c_str(), &fontname);\n    tesseract_->ApplyBoxTraining(fontname, page_res_);\n  } else if (tesseract_->tessedit_ambigs_training) {\n    FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());\n    // OCR the page segmented into words by tesseract.\n    
tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor,\n                                         training_output_file);\n    fclose(training_output_file);\n#endif // ndef DISABLED_LEGACY_ENGINE\n  } else {\n    // Now run the main recognition.\n    bool wait_for_text = true;\n    GetBoolVariable(\"paragraph_text_based\", &wait_for_text);\n    if (!wait_for_text) {\n      DetectParagraphs(false);\n    }\n    if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {\n      if (wait_for_text) {\n        DetectParagraphs(true);\n      }\n    } else {\n      result = -1;\n    }\n  }\n  return result;\n}\n\n// Takes ownership of the input pix.\nvoid TessBaseAPI::SetInputImage(Pix *pix) {\n  tesseract_->set_pix_original(pix);\n}\n\nPix *TessBaseAPI::GetInputImage() {\n  return tesseract_->pix_original();\n}\n\nconst char *TessBaseAPI::GetInputName() {\n  if (!input_file_.empty()) {\n    return input_file_.c_str();\n  }\n  return nullptr;\n}\n\nconst char *TessBaseAPI::GetDatapath() {\n  return tesseract_->datadir.c_str();\n}\n\nint TessBaseAPI::GetSourceYResolution() {\n  if (thresholder_ == nullptr)\n    return -1;\n  return thresholder_->GetSourceYResolution();\n}\n\n// If flist exists, get data from there. Otherwise get data from buf.\n// Seems convoluted, but is the easiest way I know of to meet multiple\n// goals. Support streaming from stdin, and also work on platforms\n// lacking fmemopen.\n// TODO: check different logic for flist/buf and simplify.\nbool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,\n                                       int timeout_millisec, TessResultRenderer *renderer,\n                                       int tessedit_page_number) {\n  if (!flist && !buf) {\n    return false;\n  }\n  unsigned page = (tessedit_page_number >= 0) ? 
tessedit_page_number : 0;\n  char pagename[MAX_PATH];\n\n  std::vector<std::string> lines;\n  if (!flist) {\n    std::string line;\n    for (const auto ch : *buf) {\n      if (ch == '\\n') {\n        lines.push_back(line);\n        line.clear();\n      } else {\n        line.push_back(ch);\n      }\n    }\n    if (!line.empty()) {\n      // Add last line without terminating LF.\n      lines.push_back(line);\n    }\n    if (lines.empty()) {\n      return false;\n    }\n  }\n\n  // Skip to the requested page number.\n  for (unsigned i = 0; i < page; i++) {\n    if (flist) {\n      if (fgets(pagename, sizeof(pagename), flist) == nullptr) {\n        break;\n      }\n    }\n  }\n\n  // Begin producing output\n  if (renderer && !renderer->BeginDocument(document_title.c_str())) {\n    return false;\n  }\n\n  // Loop over all pages - or just the requested one\n  while (true) {\n    if (flist) {\n      if (fgets(pagename, sizeof(pagename), flist) == nullptr) {\n        break;\n      }\n    } else {\n      if (page >= lines.size()) {\n        break;\n      }\n      snprintf(pagename, sizeof(pagename), \"%s\", lines[page].c_str());\n    }\n    chomp_string(pagename);\n    Pix *pix = pixRead(pagename);\n    if (pix == nullptr) {\n      tprintf(\"Image file %s cannot be read!\\n\", pagename);\n      return false;\n    }\n    tprintf(\"Page %u : %s\\n\", page, pagename);\n    bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);\n    pixDestroy(&pix);\n    if (!r) {\n      return false;\n    }\n    if (tessedit_page_number >= 0) {\n      break;\n    }\n    ++page;\n  }\n\n  // Finish producing output\n  if (renderer && !renderer->EndDocument()) {\n    return false;\n  }\n  return true;\n}\n\nbool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,\n                                            const char *retry_config, int timeout_millisec,\n                                            TessResultRenderer 
*renderer,\n                                            int tessedit_page_number) {\n  Pix *pix = nullptr;\n  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;\n  size_t offset = 0;\n  for (;; ++page) {\n    if (tessedit_page_number >= 0) {\n      page = tessedit_page_number;\n      pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);\n    } else {\n      pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)\n                   : pixReadFromMultipageTiff(filename, &offset);\n    }\n    if (pix == nullptr) {\n      break;\n    }\n    if (offset || page > 0) {\n      // Only print page number for multipage TIFF file.\n      tprintf(\"Page %d\\n\", page + 1);\n    }\n    auto page_string = std::to_string(page);\n    SetVariable(\"applybox_page\", page_string.c_str());\n    bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);\n    pixDestroy(&pix);\n    if (!r) {\n      return false;\n    }\n    if (tessedit_page_number >= 0) {\n      break;\n    }\n    if (!offset) {\n      break;\n    }\n  }\n  return true;\n}\n\n// Master ProcessPages calls ProcessPagesInternal and then does any post-\n// processing required due to being in a training mode.\nbool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,\n                               TessResultRenderer *renderer) {\n  bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);\n#ifndef DISABLED_LEGACY_ENGINE\n  if (result) {\n    if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {\n      tprintf(\"Write of TR file failed: %s\\n\", output_file_.c_str());\n      return false;\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n  return result;\n}\n\n#ifdef HAVE_LIBCURL\nstatic size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {\n  size = size * nmemb;\n  auto *buf = reinterpret_cast<std::string 
*>(userp);\n  buf->append(reinterpret_cast<const char *>(contents), size);\n  return size;\n}\n#endif\n\n// In the ideal scenario, Tesseract will start working on data as soon\n// as it can. For example, if you stream a filelist through stdin, we\n// should start the OCR process as soon as the first filename is\n// available. This is particularly useful when hooking Tesseract up to\n// slow hardware such as a book scanning machine.\n//\n// Unfortunately there are tradeoffs. You can't seek on stdin. That\n// makes automatic detection of datatype (TIFF? filelist? PNG?)\n// impractical.  So we support a command line flag to explicitly\n// identify the scenario that really matters: filelists on\n// stdin. We'll still do our best if the user likes pipes.\nbool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,\n                                       int timeout_millisec, TessResultRenderer *renderer) {\n  bool stdInput = !strcmp(filename, \"stdin\") || !strcmp(filename, \"-\");\n  if (stdInput) {\n#ifdef WIN32\n    if (_setmode(_fileno(stdin), _O_BINARY) == -1)\n      tprintf(\"ERROR: cin to binary: %s\", strerror(errno));\n#endif // WIN32\n  }\n\n  if (stream_filelist) {\n    return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,\n                                tesseract_->tessedit_page_number);\n  }\n\n  // At this point we are officially in autodetection territory.\n  // That means any data in stdin must be buffered, to make it\n  // seekable.\n  std::string buf;\n  const l_uint8 *data = nullptr;\n  if (stdInput) {\n    buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));\n    data = reinterpret_cast<const l_uint8 *>(buf.data());\n  } else if (strstr(filename, \"://\") != nullptr) {\n    // Get image or image list by URL.\n#ifdef HAVE_LIBCURL\n    CURL *curl = curl_easy_init();\n    if (curl == nullptr) {\n      fprintf(stderr, \"Error, curl_easy_init failed\\n\");\n 
     return false;\n    } else {\n      CURLcode curlcode;\n      auto error = [curl, &curlcode](const char *function) {\n        fprintf(stderr, \"Error, %s failed with error %s\\n\", function, curl_easy_strerror(curlcode));\n        curl_easy_cleanup(curl);\n        return false;\n      };\n      curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);\n      if (curlcode != CURLE_OK) {\n        return error(\"curl_easy_setopt\");\n      }\n      curlcode = curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);\n      if (curlcode != CURLE_OK) {\n        return error(\"curl_easy_setopt\");\n      }\n      // Follow HTTP, HTTPS, FTP and FTPS redirects.\n      curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);\n      if (curlcode != CURLE_OK) {\n        return error(\"curl_easy_setopt\");\n      }\n      // Allow no more than 8 redirections to prevent endless loops.\n      curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8);\n      if (curlcode != CURLE_OK) {\n        return error(\"curl_easy_setopt\");\n      }\n      int timeout = curl_timeout;\n      if (timeout > 0) {\n        curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);\n        if (curlcode != CURLE_OK) {\n          return error(\"curl_easy_setopt\");\n        }\n        curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);\n        if (curlcode != CURLE_OK) {\n          return error(\"curl_easy_setopt\");\n        }\n      }\n      std::string cookiefile = curl_cookiefile;\n      if (!cookiefile.empty()) {\n        curlcode = curl_easy_setopt(curl, CURLOPT_COOKIEFILE, cookiefile.c_str());\n        if (curlcode != CURLE_OK) {\n          return error(\"curl_easy_setopt\");\n        }\n      }\n      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);\n      if (curlcode != CURLE_OK) {\n        return error(\"curl_easy_setopt\");\n      }\n      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);\n      if (curlcode != CURLE_OK) {\n        return 
error(\"curl_easy_setopt\");\n      }\n      curlcode = curl_easy_setopt(curl, CURLOPT_USERAGENT, \"Tesseract OCR\");\n      if (curlcode != CURLE_OK) {\n        return error(\"curl_easy_setopt\");\n      }\n      curlcode = curl_easy_perform(curl);\n      if (curlcode != CURLE_OK) {\n        return error(\"curl_easy_perform\");\n      }\n      curl_easy_cleanup(curl);\n      data = reinterpret_cast<const l_uint8 *>(buf.data());\n    }\n#else\n    fprintf(stderr, \"Error, this tesseract has no URL support\\n\");\n    return false;\n#endif\n  } else {\n    // Check whether the input file can be read.\n    if (FILE *file = fopen(filename, \"rb\")) {\n      fclose(file);\n    } else {\n      fprintf(stderr, \"Error, cannot read input file %s: %s\\n\", filename, strerror(errno));\n      return false;\n    }\n  }\n\n  // Here is our autodetection\n  int format;\n  int r =\n      (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);\n\n  // Maybe we have a filelist\n  if (r != 0 || format == IFF_UNKNOWN) {\n    std::string s;\n    if (data != nullptr) {\n      s = buf.c_str();\n    } else {\n      std::ifstream t(filename);\n      std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());\n      s = u.c_str();\n    }\n    return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,\n                                tesseract_->tessedit_page_number);\n  }\n\n  // Maybe we have a TIFF which is potentially multipage\n  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||\n               format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||\n#if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76\n               format == IFF_TIFF_JPEG ||\n#endif\n               format == IFF_TIFF_ZIP);\n\n  // Fail early if we can, before producing any output\n  Pix *pix = nullptr;\n  if (!tiff) {\n    pix = (data != nullptr) ? 
pixReadMem(data, buf.size()) : pixRead(filename);\n    if (pix == nullptr) {\n      return false;\n    }\n  }\n\n  // Begin the output\n  if (renderer && !renderer->BeginDocument(document_title.c_str())) {\n    pixDestroy(&pix);\n    return false;\n  }\n\n  // Produce output\n  r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,\n                                         renderer, tesseract_->tessedit_page_number)\n             : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);\n\n  // Clean up memory as needed\n  pixDestroy(&pix);\n\n  // End the output\n  if (!r || (renderer && !renderer->EndDocument())) {\n    return false;\n  }\n  return true;\n}\n\nbool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,\n                              const char *retry_config, int timeout_millisec,\n                              TessResultRenderer *renderer) {\n  SetInputName(filename);\n  SetImage(pix);\n  bool failed = false;\n\n  if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {\n    // Disabled character recognition\n    if (! 
std::unique_ptr<const PageIterator>(AnalyseLayout())) {\n      failed = true;\n    }\n  } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {\n    failed = FindLines() != 0;\n  } else if (timeout_millisec > 0) {\n    // Running with a timeout.\n    ETEXT_DESC monitor;\n    monitor.cancel = nullptr;\n    monitor.cancel_this = nullptr;\n    monitor.set_deadline_msecs(timeout_millisec);\n\n    // Now run the main recognition.\n    failed = Recognize(&monitor) < 0;\n  } else {\n    // Normal layout and character recognition with no timeout.\n    failed = Recognize(nullptr) < 0;\n  }\n\n  if (tesseract_->tessedit_write_images) {\n    Pix *page_pix = GetThresholdedImage();\n    std::string output_filename = output_file_ + \".processed\";\n    if (page_index > 0) {\n      output_filename += std::to_string(page_index);\n    }\n    output_filename += \".tif\";\n    pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);\n    pixDestroy(&page_pix);\n  }\n\n  if (failed && retry_config != nullptr && retry_config[0] != '\\0') {\n    // Save current config variables before switching modes.\n    FILE *fp = fopen(kOldVarsFile, \"wb\");\n    if (fp == nullptr) {\n      tprintf(\"Error, failed to open file \\\"%s\\\"\\n\", kOldVarsFile);\n    } else {\n      PrintVariables(fp);\n      fclose(fp);\n    }\n    // Switch to alternate mode for retry.\n    ReadConfigFile(retry_config);\n    SetImage(pix);\n    Recognize(nullptr);\n    // Restore saved config variables.\n    ReadConfigFile(kOldVarsFile);\n  }\n\n  if (renderer && !failed) {\n    failed = !renderer->AddImage(this);\n  }\n\n  return !failed;\n}\n\n/**\n * Get a left-to-right iterator to the results of LayoutAnalysis and/or\n * Recognize. 
The returned iterator must be deleted after use.\n */\nLTRResultIterator *TessBaseAPI::GetLTRIterator() {\n  if (tesseract_ == nullptr || page_res_ == nullptr) {\n    return nullptr;\n  }\n  return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),\n                               thresholder_->GetScaledYResolution(), rect_left_, rect_top_,\n                               rect_width_, rect_height_);\n}\n\n/**\n * Get a reading-order iterator to the results of LayoutAnalysis and/or\n * Recognize. The returned iterator must be deleted after use.\n * WARNING! This class points to data held within the TessBaseAPI class, and\n * therefore can only be used while the TessBaseAPI class still exists and\n * has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n * DetectOS, or anything else that changes the internal PAGE_RES.\n */\nResultIterator *TessBaseAPI::GetIterator() {\n  if (tesseract_ == nullptr || page_res_ == nullptr) {\n    return nullptr;\n  }\n  return ResultIterator::StartOfParagraph(LTRResultIterator(\n      page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),\n      rect_left_, rect_top_, rect_width_, rect_height_));\n}\n\n/**\n * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.\n * The returned iterator must be deleted after use.\n * WARNING! 
This class points to data held within the TessBaseAPI class, and\n * therefore can only be used while the TessBaseAPI class still exists and\n * has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n * DetectOS, or anything else that changes the internal PAGE_RES.\n */\nMutableIterator *TessBaseAPI::GetMutableIterator() {\n  if (tesseract_ == nullptr || page_res_ == nullptr) {\n    return nullptr;\n  }\n  return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),\n                             thresholder_->GetScaledYResolution(), rect_left_, rect_top_,\n                             rect_width_, rect_height_);\n}\n\n/** Make a text string from the internal data structures. */\nchar *TessBaseAPI::GetUTF8Text() {\n  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {\n    return nullptr;\n  }\n  std::string text(\"\");\n  const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());\n  do {\n    if (it->Empty(RIL_PARA)) {\n      continue;\n    }\n    auto block_type = it->BlockType();\n    switch (block_type) {\n      case PT_FLOWING_IMAGE:\n      case PT_HEADING_IMAGE:\n      case PT_PULLOUT_IMAGE:\n      case PT_HORZ_LINE:\n      case PT_VERT_LINE:\n        // Ignore images and lines for text output.\n        continue;\n      case PT_NOISE:\n        tprintf(\"TODO: Please report image which triggers the noise case.\\n\");\n        ASSERT_HOST(false);\n      default:\n        break;\n    }\n\n    const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));\n    text += para_text.get();\n  } while (it->Next(RIL_PARA));\n  return copy_string(text);\n}\n\nstatic void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {\n  int left, top, right, bottom;\n  it->BoundingBox(level, &left, &top, &right, &bottom);\n  text += \"\\t\" + std::to_string(left);\n  text += \"\\t\" + std::to_string(top);\n  text += \"\\t\" + std::to_string(right - left);\n  text 
+= \"\\t\" + std::to_string(bottom - top);\n}\n\n/**\n * Make a TSV-formatted string from the internal data structures.\n * page_number is 0-based but will appear in the output as 1-based.\n * Returned string must be freed with the delete [] operator.\n */\nchar *TessBaseAPI::GetTSVText(int page_number) {\n  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {\n    return nullptr;\n  }\n\n#if !defined(NDEBUG)\n  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;\n#endif\n  int page_id = page_number + 1; // we use 1-based page numbers.\n\n  int page_num = page_id;\n  int block_num = 0;\n  int par_num = 0;\n  int line_num = 0;\n  int word_num = 0;\n\n  std::string tsv_str;\n  tsv_str += \"1\\t\" + std::to_string(page_num); // level 1 - page\n  tsv_str += \"\\t\" + std::to_string(block_num);\n  tsv_str += \"\\t\" + std::to_string(par_num);\n  tsv_str += \"\\t\" + std::to_string(line_num);\n  tsv_str += \"\\t\" + std::to_string(word_num);\n  tsv_str += \"\\t\" + std::to_string(rect_left_);\n  tsv_str += \"\\t\" + std::to_string(rect_top_);\n  tsv_str += \"\\t\" + std::to_string(rect_width_);\n  tsv_str += \"\\t\" + std::to_string(rect_height_);\n  tsv_str += \"\\t-1\\t\\n\";\n\n  const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());\n  while (!res_it->Empty(RIL_BLOCK)) {\n    if (res_it->Empty(RIL_WORD)) {\n      res_it->Next(RIL_WORD);\n      continue;\n    }\n\n    // Add rows for any new block/paragraph/textline.\n    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {\n      block_num++;\n      par_num = 0;\n      line_num = 0;\n      word_num = 0;\n      tsv_str += \"2\\t\" + std::to_string(page_num); // level 2 - block\n      tsv_str += \"\\t\" + std::to_string(block_num);\n      tsv_str += \"\\t\" + std::to_string(par_num);\n      tsv_str += \"\\t\" + std::to_string(line_num);\n      tsv_str += \"\\t\" + std::to_string(word_num);\n      AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);\n      tsv_str += \"\\t-1\\t\\n\"; // end of row 
for block\n    }\n    if (res_it->IsAtBeginningOf(RIL_PARA)) {\n      par_num++;\n      line_num = 0;\n      word_num = 0;\n      tsv_str += \"3\\t\" + std::to_string(page_num); // level 3 - paragraph\n      tsv_str += \"\\t\" + std::to_string(block_num);\n      tsv_str += \"\\t\" + std::to_string(par_num);\n      tsv_str += \"\\t\" + std::to_string(line_num);\n      tsv_str += \"\\t\" + std::to_string(word_num);\n      AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);\n      tsv_str += \"\\t-1\\t\\n\"; // end of row for para\n    }\n    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {\n      line_num++;\n      word_num = 0;\n      tsv_str += \"4\\t\" + std::to_string(page_num); // level 4 - line\n      tsv_str += \"\\t\" + std::to_string(block_num);\n      tsv_str += \"\\t\" + std::to_string(par_num);\n      tsv_str += \"\\t\" + std::to_string(line_num);\n      tsv_str += \"\\t\" + std::to_string(word_num);\n      AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);\n      tsv_str += \"\\t-1\\t\\n\"; // end of row for line\n    }\n\n    // Now, process the word...\n    int left, top, right, bottom;\n    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);\n    word_num++;\n    tsv_str += \"5\\t\" + std::to_string(page_num); // level 5 - word\n    tsv_str += \"\\t\" + std::to_string(block_num);\n    tsv_str += \"\\t\" + std::to_string(par_num);\n    tsv_str += \"\\t\" + std::to_string(line_num);\n    tsv_str += \"\\t\" + std::to_string(word_num);\n    tsv_str += \"\\t\" + std::to_string(left);\n    tsv_str += \"\\t\" + std::to_string(top);\n    tsv_str += \"\\t\" + std::to_string(right - left);\n    tsv_str += \"\\t\" + std::to_string(bottom - top);\n    tsv_str += \"\\t\" + std::to_string(res_it->Confidence(RIL_WORD));\n    tsv_str += \"\\t\";\n\n#if !defined(NDEBUG)\n    // Increment counts if at end of block/paragraph/textline.\n    if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {\n      lcnt++;\n    }\n    if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) 
{\n      pcnt++;\n    }\n    if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) {\n      bcnt++;\n    }\n#endif\n\n    do {\n      tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();\n      res_it->Next(RIL_SYMBOL);\n    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));\n    tsv_str += \"\\n\"; // end of row\n#if !defined(NDEBUG)\n    wcnt++;\n#endif\n  }\n\n  return copy_string(tsv_str);\n}\n\n/** The 5 numbers output for each box (the usual 4 and a page number.) */\nconst int kNumbersPerBlob = 5;\n/**\n * The number of bytes taken by each number. Since we use int16_t for ICOORD,\n * assume only 5 digits max.\n */\nconst int kBytesPerNumber = 5;\n/**\n * Multiplier for max expected textlength assumes (kBytesPerNumber + space)\n * * kNumbersPerBlob plus the newline. Add to this the\n * original UTF8 characters, and one kMaxBytesPerLine for safety.\n */\nconst int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;\n/** Max bytes in the decimal representation of int64_t. 
*/\nconst int kBytesPer64BitNumber = 20;\n/**\n * A maximal single box could occupy kNumbersPerBlob numbers at\n * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a\n * space plus the newline and the maximum length of a UNICHAR.\n * Test against this on each iteration for safety.\n */\nconst int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN;\n\n/**\n * The recognized text is returned as a char* which is coded\n * as a UTF8 box file.\n * page_number is a 0-base page index that will appear in the box file.\n * Returned string must be freed with the delete [] operator.\n */\nchar *TessBaseAPI::GetBoxText(int page_number) {\n  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {\n    return nullptr;\n  }\n  int blob_count;\n  int utf8_length = TextLength(&blob_count);\n  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;\n  char *result = new char[total_length];\n  result[0] = '\\0';\n  int output_length = 0;\n  LTRResultIterator *it = GetLTRIterator();\n  do {\n    int left, top, right, bottom;\n    if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {\n      const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));\n      // Tesseract uses space for recognition failure. 
Fix to a reject\n      // character, kTesseractReject so we don't create illegal box files.\n      for (int i = 0; text[i] != '\\0'; ++i) {\n        if (text[i] == ' ') {\n          text[i] = kTesseractReject;\n        }\n      }\n      snprintf(result + output_length, total_length - output_length, \"%s %d %d %d %d %d\\n\",\n               text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);\n      output_length += strlen(result + output_length);\n      // Just in case...\n      if (output_length + kMaxBytesPerLine > total_length) {\n        break;\n      }\n    }\n  } while (it->Next(RIL_SYMBOL));\n  delete it;\n  return result;\n}\n\n/**\n * Conversion table for non-latin characters.\n * Maps characters out of the latin set into the latin set.\n * TODO(rays) incorporate this translation into unicharset.\n */\nconst int kUniChs[] = {0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0};\n/** Latin chars corresponding to the unicode chars above. */\nconst int kLatinChs[] = {0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0};\n\n/**\n * The recognized text is returned as a char* which is coded\n * as UNLV format Latin-1 with specific reject and suspect codes.\n * Returned string must be freed with the delete [] operator.\n */\nchar *TessBaseAPI::GetUNLVText() {\n  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {\n    return nullptr;\n  }\n  bool tilde_crunch_written = false;\n  bool last_char_was_newline = true;\n  bool last_char_was_tilde = false;\n\n  int total_length = TextLength(nullptr);\n  PAGE_RES_IT page_res_it(page_res_);\n  char *result = new char[total_length];\n  char *ptr = result;\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    WERD_RES *word = page_res_it.word();\n    // Process the current word.\n    if (word->unlv_crunch_mode != CR_NONE) {\n      if (word->unlv_crunch_mode != CR_DELETE &&\n          (!tilde_crunch_written ||\n   
        (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&\n            !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {\n        if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&\n            !word->word->flag(W_FUZZY_SP)) {\n          /* Write a space to separate from preceding good text */\n          *ptr++ = ' ';\n          last_char_was_tilde = false;\n        }\n        if (!last_char_was_tilde) {\n          // Write a reject char.\n          last_char_was_tilde = true;\n          *ptr++ = kUNLVReject;\n          tilde_crunch_written = true;\n          last_char_was_newline = false;\n        }\n      }\n    } else {\n      // NORMAL PROCESSING of non tilde crunched words.\n      tilde_crunch_written = false;\n      tesseract_->set_unlv_suspects(word);\n      const char *wordstr = word->best_choice->unichar_string().c_str();\n      const auto &lengths = word->best_choice->unichar_lengths();\n      int length = lengths.length();\n      int i = 0;\n      int offset = 0;\n\n      if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {\n        // Prevent adjacent tilde across words - we know that adjacent tildes\n        // within words have been removed.\n        // Skip the first character.\n        offset = lengths[i++];\n      }\n      if (i < length && wordstr[offset] != 0) {\n        if (!last_char_was_newline) {\n          *ptr++ = ' ';\n        } else {\n          last_char_was_newline = false;\n        }\n        for (; i < length; offset += lengths[i++]) {\n          if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {\n            *ptr++ = kUNLVReject;\n            last_char_was_tilde = true;\n          } else {\n            if (word->reject_map[i].rejected()) {\n              *ptr++ = kUNLVSuspect;\n            }\n            UNICHAR ch(wordstr + offset, lengths[i]);\n            int uni_ch = ch.first_uni();\n            for (int 
j = 0; kUniChs[j] != 0; ++j) {\n              if (kUniChs[j] == uni_ch) {\n                uni_ch = kLatinChs[j];\n                break;\n              }\n            }\n            if (uni_ch <= 0xff) {\n              *ptr++ = static_cast<char>(uni_ch);\n              last_char_was_tilde = false;\n            } else {\n              *ptr++ = kUNLVReject;\n              last_char_was_tilde = true;\n            }\n          }\n        }\n      }\n    }\n    if (word->word->flag(W_EOL) && !last_char_was_newline) {\n      /* Add a new line output */\n      *ptr++ = '\\n';\n      tilde_crunch_written = false;\n      last_char_was_newline = true;\n      last_char_was_tilde = false;\n    }\n  }\n  *ptr++ = '\\n';\n  *ptr = '\\0';\n  return result;\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n/**\n * Detect the orientation of the input image and apparent script (alphabet).\n * orient_deg is the detected clockwise rotation of the input image in degrees\n * (0, 90, 180, 270)\n * orient_conf is the confidence (15.0 is reasonably confident)\n * script_name is an ASCII string, the name of the script, e.g. 
\"Latin\"\n * script_conf is confidence level in the script\n * Returns true on success and writes values to each parameter as an output\n */\nbool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,\n                                          const char **script_name, float *script_conf) {\n  OSResults osr;\n\n  bool osd = DetectOS(&osr);\n  if (!osd) {\n    return false;\n  }\n\n  int orient_id = osr.best_result.orientation_id;\n  int script_id = osr.get_best_script(orient_id);\n  if (orient_conf) {\n    *orient_conf = osr.best_result.oconfidence;\n  }\n  if (orient_deg) {\n    *orient_deg = orient_id * 90; // convert quadrant to degrees\n  }\n\n  if (script_name) {\n    const char *script = osr.unicharset->get_script_from_script_id(script_id);\n\n    *script_name = script;\n  }\n\n  if (script_conf) {\n    *script_conf = osr.best_result.sconfidence;\n  }\n\n  return true;\n}\n\n/**\n * The recognized text is returned as a char* which is coded\n * as UTF8 and must be freed with the delete [] operator.\n * page_number is a 0-based page index that will appear in the osd file.\n */\nchar *TessBaseAPI::GetOsdText(int page_number) {\n  int orient_deg;\n  float orient_conf;\n  const char *script_name;\n  float script_conf;\n\n  if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) {\n    return nullptr;\n  }\n\n  // clockwise rotation needed to make the page upright\n  int rotate = OrientationIdToValue(orient_deg / 90);\n\n  std::stringstream stream;\n  // Use \"C\" locale (needed for float values orient_conf and script_conf).\n  stream.imbue(std::locale::classic());\n  // Use fixed notation with 2 digits after the decimal point for float values.\n  stream.precision(2);\n  stream << std::fixed << \"Page number: \" << page_number << \"\\n\"\n         << \"Orientation in degrees: \" << orient_deg << \"\\n\"\n         << \"Rotate: \" << rotate << \"\\n\"\n         << \"Orientation confidence: \" << orient_conf << 
\"\\n\"\n         << \"Script: \" << script_name << \"\\n\"\n         << \"Script confidence: \" << script_conf << \"\\n\";\n  return copy_string(stream.str());\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n/** Returns the average word confidence for Tesseract page result. */\nint TessBaseAPI::MeanTextConf() {\n  int *conf = AllWordConfidences();\n  if (!conf) {\n    return 0;\n  }\n  int sum = 0;\n  int *pt = conf;\n  while (*pt >= 0) {\n    sum += *pt++;\n  }\n  if (pt != conf) {\n    sum /= pt - conf;\n  }\n  delete[] conf;\n  return sum;\n}\n\n/** Returns an array of all word confidences, terminated by -1. */\nint *TessBaseAPI::AllWordConfidences() {\n  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {\n    return nullptr;\n  }\n  int n_word = 0;\n  PAGE_RES_IT res_it(page_res_);\n  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {\n    n_word++;\n  }\n\n  int *conf = new int[n_word + 1];\n  n_word = 0;\n  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {\n    WERD_RES *word = res_it.word();\n    WERD_CHOICE *choice = word->best_choice;\n    int w_conf = static_cast<int>(100 + 5 * choice->certainty());\n    // This is the eq for converting Tesseract confidence to 1..100\n    if (w_conf < 0) {\n      w_conf = 0;\n    }\n    if (w_conf > 100) {\n      w_conf = 100;\n    }\n    conf[n_word++] = w_conf;\n  }\n  conf[n_word] = -1;\n  return conf;\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n/**\n * Applies the given word to the adaptive classifier if possible.\n * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can\n * tell the boundaries of the graphemes.\n * Assumes that SetImage/SetRectangle have been used to set the image\n * to the given word. 
The mode arg should be PSM_SINGLE_WORD or\n * PSM_CIRCLE_WORD, as that will be used to control layout analysis.\n * The currently set PageSegMode is preserved.\n * Returns false if adaption was not possible for some reason.\n */\nbool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {\n  int debug = 0;\n  GetIntVariable(\"applybox_debug\", &debug);\n  bool success = true;\n  PageSegMode current_psm = GetPageSegMode();\n  SetPageSegMode(mode);\n  SetVariable(\"classify_enable_learning\", \"0\");\n  const std::unique_ptr<const char[]> text(GetUTF8Text());\n  if (debug) {\n    tprintf(\"Trying to adapt \\\"%s\\\" to \\\"%s\\\"\\n\", text.get(), wordstr);\n  }\n  if (text != nullptr) {\n    PAGE_RES_IT it(page_res_);\n    WERD_RES *word_res = it.word();\n    if (word_res != nullptr) {\n      word_res->word->set_text(wordstr);\n      // Check to see if text matches wordstr.\n      int w = 0;\n      int t;\n      for (t = 0; text[t] != '\\0'; ++t) {\n        if (text[t] == '\\n' || text[t] == ' ') {\n          continue;\n        }\n        while (wordstr[w] == ' ') {\n          ++w;\n        }\n        if (text[t] != wordstr[w]) {\n          break;\n        }\n        ++w;\n      }\n      if (text[t] != '\\0' || wordstr[w] != '\\0') {\n        // No match.\n        delete page_res_;\n        std::vector<TBOX> boxes;\n        page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);\n        tesseract_->ReSegmentByClassification(page_res_);\n        tesseract_->TidyUp(page_res_);\n        PAGE_RES_IT pr_it(page_res_);\n        if (pr_it.word() == nullptr) {\n          success = false;\n        } else {\n          word_res = pr_it.word();\n        }\n      } else {\n        word_res->BestChoiceToCorrectText();\n      }\n      if (success) {\n        tesseract_->EnableLearning = true;\n        tesseract_->LearnWord(nullptr, word_res);\n      }\n    } else {\n      success = false;\n    }\n  } else {\n    success = false;\n  }\n  
SetPageSegMode(current_psm);\n  return success;\n}\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n/**\n * Free up recognition results and any stored image data, without actually\n * freeing any recognition data that would be time-consuming to reload.\n * Afterwards, you must call SetImage or TesseractRect before doing\n * any Recognize or Get* operation.\n */\nvoid TessBaseAPI::Clear() {\n  if (thresholder_ != nullptr) {\n    thresholder_->Clear();\n  }\n  ClearResults();\n  if (tesseract_ != nullptr) {\n    SetInputImage(nullptr);\n  }\n}\n\n/**\n * Close down tesseract and free up all memory. End() is equivalent to\n * destructing and reconstructing your TessBaseAPI.\n * Once End() has been used, none of the other API functions may be used\n * other than Init and anything declared above it in the class definition.\n */\nvoid TessBaseAPI::End() {\n  Clear();\n  delete thresholder_;\n  thresholder_ = nullptr;\n  delete page_res_;\n  page_res_ = nullptr;\n  delete block_list_;\n  block_list_ = nullptr;\n  if (paragraph_models_ != nullptr) {\n    for (auto model : *paragraph_models_) {\n      delete model;\n    }\n    delete paragraph_models_;\n    paragraph_models_ = nullptr;\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  if (osd_tesseract_ == tesseract_) {\n    osd_tesseract_ = nullptr;\n  }\n  delete osd_tesseract_;\n  osd_tesseract_ = nullptr;\n  delete equ_detect_;\n  equ_detect_ = nullptr;\n#endif // ndef DISABLED_LEGACY_ENGINE\n  delete tesseract_;\n  tesseract_ = nullptr;\n  input_file_.clear();\n  output_file_.clear();\n  datapath_.clear();\n  language_.clear();\n}\n\n// Clear any library-level memory caches.\n// There are a variety of expensive-to-load constant data structures (mostly\n// language dictionaries) that are cached globally -- surviving the Init()\n// and End() of individual TessBaseAPI's.  
This function allows the clearing\n// of these caches.\nvoid TessBaseAPI::ClearPersistentCache() {\n  Dict::GlobalDawgCache()->DeleteUnusedDawgs();\n}\n\n/**\n * Check whether a word is valid according to Tesseract's language model\n * returns 0 if the word is invalid, non-zero if valid\n */\nint TessBaseAPI::IsValidWord(const char *word) const {\n  return tesseract_->getDict().valid_word(word);\n}\n// Returns true if utf8_character is defined in the UniCharset.\nbool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {\n  return tesseract_->unicharset.contains_unichar(utf8_character);\n}\n\n// TODO(rays) Obsolete this function and replace with a more aptly named\n// function that returns image coordinates rather than tesseract coordinates.\nbool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {\n  const std::unique_ptr<const PageIterator> it(AnalyseLayout());\n  if (it == nullptr) {\n    return false;\n  }\n  int x1, x2, y1, y2;\n  it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);\n  // Calculate offset and slope (NOTE: Kind of ugly)\n  if (x2 <= x1) {\n    x2 = x1 + 1;\n  }\n  // Convert the point pair to slope/offset of the baseline (in image coords.)\n  *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);\n  *out_offset = static_cast<int>(y1 - *out_slope * x1);\n  // Get the y-coord of the baseline at the left and right edges of the\n  // textline's bounding box.\n  int left, top, right, bottom;\n  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {\n    return false;\n  }\n  int left_y = IntCastRounded(*out_slope * left + *out_offset);\n  int right_y = IntCastRounded(*out_slope * right + *out_offset);\n  // Shift the baseline down so it passes through the nearest bottom-corner\n  // of the textline's bounding box. 
This is the difference between the y\n  // at the lowest (max) edge of the box and the actual box bottom.\n  *out_offset += bottom - std::max(left_y, right_y);\n  // Switch back to bottom-up tesseract coordinates. Requires negation of\n  // the slope and height - offset for the offset.\n  *out_slope = -*out_slope;\n  *out_offset = rect_height_ - *out_offset;\n\n  return true;\n}\n\n/** Sets Dict::letter_is_okay_ function to point to the given function. */\nvoid TessBaseAPI::SetDictFunc(DictFunc f) {\n  if (tesseract_ != nullptr) {\n    tesseract_->getDict().letter_is_okay_ = f;\n  }\n}\n\n/**\n * Sets Dict::probability_in_context_ function to point to the given\n * function.\n *\n * @param f A single function that returns the probability of the current\n * \"character\" (in general a utf-8 string), given the context of a previous\n * utf-8 string.\n */\nvoid TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {\n  if (tesseract_ != nullptr) {\n    tesseract_->getDict().probability_in_context_ = f;\n    // Set it for the sublangs too.\n    int num_subs = tesseract_->num_sub_langs();\n    for (int i = 0; i < num_subs; ++i) {\n      tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;\n    }\n  }\n}\n\n/** Common code for setting the image. */\nbool TessBaseAPI::InternalSetImage() {\n  if (tesseract_ == nullptr) {\n    tprintf(\"Please call Init before attempting to set an image.\\n\");\n    return false;\n  }\n  if (thresholder_ == nullptr) {\n    thresholder_ = new ImageThresholder;\n  }\n  ClearResults();\n  return true;\n}\n\n/**\n * Run the thresholder to make the thresholded image, returned in pix,\n * which must not be nullptr. 
*pix must be initialized to nullptr, or point\n * to an existing pixDestroyable Pix.\n * The usual argument to Threshold is Tesseract::mutable_pix_binary().\n */\nbool TessBaseAPI::Threshold(Pix **pix) {\n  ASSERT_HOST(pix != nullptr);\n  if (*pix != nullptr) {\n    pixDestroy(pix);\n  }\n  // Zero resolution messes up the algorithms, so make sure it is credible.\n  int user_dpi = 0;\n  GetIntVariable(\"user_defined_dpi\", &user_dpi);\n  int y_res = thresholder_->GetScaledYResolution();\n  if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {\n    tprintf(\n        \"Warning: User defined image dpi is outside of expected range \"\n        \"(%d - %d)!\\n\",\n        kMinCredibleResolution, kMaxCredibleResolution);\n  }\n  // Always use user defined dpi\n  if (user_dpi) {\n    thresholder_->SetSourceYResolution(user_dpi);\n  } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {\n    if (y_res != 0) {\n      // Show warning only if a resolution was given.\n      tprintf(\"Warning: Invalid resolution %d dpi. 
Using %d instead.\\n\",\n              y_res, kMinCredibleResolution);\n    }\n    thresholder_->SetSourceYResolution(kMinCredibleResolution);\n  }\n\n  auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));\n\n  if (thresholding_method == ThresholdMethod::Otsu) {\n    Image pix_binary(*pix);\n    if (!thresholder_->ThresholdToPix(&pix_binary)) {\n      return false;\n    }\n    *pix = pix_binary;\n\n    if (!thresholder_->IsBinary()) {\n      tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());\n      tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());\n    } else {\n      tesseract_->set_pix_thresholds(nullptr);\n      tesseract_->set_pix_grey(nullptr);\n    }\n  } else {\n    auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);\n\n    if (!ok) {\n      return false;\n    }\n    *pix = pix_binary;\n\n    tesseract_->set_pix_thresholds(pix_thresholds);\n    tesseract_->set_pix_grey(pix_grey);\n  }\n\n  thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_,\n                              &image_height_);\n\n  // Set the internal resolution that is used for layout parameters from the\n  // estimated resolution, rather than the image resolution, which may be\n  // fabricated, but we will use the image resolution, if there is one, to\n  // report output point sizes.\n  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),\n                                  kMinCredibleResolution, kMaxCredibleResolution);\n  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {\n    tprintf(\n        \"Estimated internal resolution %d out of range! 
\"\n        \"Corrected to %d.\\n\",\n        thresholder_->GetScaledEstimatedResolution(), estimated_res);\n  }\n  tesseract_->set_source_resolution(estimated_res);\n  return true;\n}\n\n/** Find lines from the image making the BLOCK_LIST. */\nint TessBaseAPI::FindLines() {\n  if (thresholder_ == nullptr || thresholder_->IsEmpty()) {\n    tprintf(\"Please call SetImage before attempting recognition.\\n\");\n    return -1;\n  }\n  if (recognition_done_) {\n    ClearResults();\n  }\n  if (!block_list_->empty()) {\n    return 0;\n  }\n  if (tesseract_ == nullptr) {\n    tesseract_ = new Tesseract;\n#ifndef DISABLED_LEGACY_ENGINE\n    tesseract_->InitAdaptiveClassifier(nullptr);\n#endif\n  }\n  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {\n    return -1;\n  }\n\n  tesseract_->PrepareForPageseg();\n\n#ifndef DISABLED_LEGACY_ENGINE\n  if (tesseract_->textord_equation_detect) {\n    if (equ_detect_ == nullptr && !datapath_.empty()) {\n      equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);\n    }\n    if (equ_detect_ == nullptr) {\n      tprintf(\"Warning: Could not set equation detector\\n\");\n    } else {\n      tesseract_->SetEquationDetect(equ_detect_);\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  Tesseract *osd_tess = osd_tesseract_;\n  OSResults osr;\n#ifndef DISABLED_LEGACY_ENGINE\n  if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {\n    if (strcmp(language_.c_str(), \"osd\") == 0) {\n      osd_tess = tesseract_;\n    } else {\n      osd_tesseract_ = new Tesseract;\n      TessdataManager mgr(reader_);\n      if (datapath_.empty()) {\n        tprintf(\n            \"Warning: Auto orientation and script detection requested,\"\n            \" but data path is undefined\\n\");\n        delete osd_tesseract_;\n        osd_tesseract_ = nullptr;\n      } else if (osd_tesseract_->init_tesseract(datapath_, \"\", \"osd\", OEM_TESSERACT_ONLY,\n                          
                      nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {\n        osd_tess = osd_tesseract_;\n        osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution());\n      } else {\n        tprintf(\n            \"Warning: Auto orientation and script detection requested,\"\n            \" but osd language failed to load\\n\");\n        delete osd_tesseract_;\n        osd_tesseract_ = nullptr;\n      }\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {\n    return -1;\n  }\n\n  // If Devanagari is being recognized, we use different images for page seg\n  // and for OCR.\n  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);\n  return 0;\n}\n\n/**\n * Return average gradient of lines on page.\n */\nfloat TessBaseAPI::GetGradient() {\n  return tesseract_->gradient();\n}\n\n/** Delete the pageres and clear the block list ready for a new page. */\nvoid TessBaseAPI::ClearResults() {\n  if (tesseract_ != nullptr) {\n    tesseract_->Clear();\n  }\n  delete page_res_;\n  page_res_ = nullptr;\n  recognition_done_ = false;\n  if (block_list_ == nullptr) {\n    block_list_ = new BLOCK_LIST;\n  } else {\n    block_list_->clear();\n  }\n  if (paragraph_models_ != nullptr) {\n    for (auto model : *paragraph_models_) {\n      delete model;\n    }\n    delete paragraph_models_;\n    paragraph_models_ = nullptr;\n  }\n}\n\n/**\n * Return the length of the output text string, as UTF8, assuming\n * liberally two spacing marks after each word (as paragraphs end with two\n * newlines), and assuming a single character reject marker for each rejected\n * character.\n * Also return the number of recognized blobs in blob_count.\n */\nint TessBaseAPI::TextLength(int *blob_count) const {\n  if (tesseract_ == nullptr || page_res_ == nullptr) {\n    return 0;\n  }\n\n  PAGE_RES_IT page_res_it(page_res_);\n  int total_length = 2;\n  int total_blobs = 0;\n  // Iterate 
over the data structures to extract the recognition result.\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    WERD_RES *word = page_res_it.word();\n    WERD_CHOICE *choice = word->best_choice;\n    if (choice != nullptr) {\n      total_blobs += choice->length() + 2;\n      total_length += choice->unichar_string().length() + 2;\n      for (int i = 0; i < word->reject_map.length(); ++i) {\n        if (word->reject_map[i].rejected()) {\n          ++total_length;\n        }\n      }\n    }\n  }\n  if (blob_count != nullptr) {\n    *blob_count = total_blobs;\n  }\n  return total_length;\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n/**\n * Estimates the Orientation And Script of the image.\n * Returns true if the image was processed successfully.\n */\nbool TessBaseAPI::DetectOS(OSResults *osr) {\n  if (tesseract_ == nullptr) {\n    return false;\n  }\n  ClearResults();\n  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {\n    return false;\n  }\n\n  if (input_file_.empty()) {\n    input_file_ = kInputFile;\n  }\n  return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0;\n}\n#endif // #ifndef DISABLED_LEGACY_ENGINE\n\nvoid TessBaseAPI::set_min_orientation_margin(double margin) {\n  tesseract_->min_orientation_margin.set_value(margin);\n}\n\n/**\n * Return text orientation of each block as determined in an earlier page layout\n * analysis operation. Orientation is returned as the number of ccw 90-degree\n * rotations (in [0..3]) required to make the text in the block upright\n * (readable). 
Note that this may not necessary be the block orientation\n * preferred for recognition (such as the case of vertical CJK text).\n *\n * Also returns whether the text in the block is believed to have vertical\n * writing direction (when in an upright page orientation).\n *\n * The returned array is of length equal to the number of text blocks, which may\n * be less than the total number of blocks. The ordering is intended to be\n * consistent with GetTextLines().\n */\nvoid TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {\n  delete[] * block_orientation;\n  *block_orientation = nullptr;\n  delete[] * vertical_writing;\n  *vertical_writing = nullptr;\n  BLOCK_IT block_it(block_list_);\n\n  block_it.move_to_first();\n  int num_blocks = 0;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    if (!block_it.data()->pdblk.poly_block()->IsText()) {\n      continue;\n    }\n    ++num_blocks;\n  }\n  if (!num_blocks) {\n    tprintf(\"WARNING: Found no blocks\\n\");\n    return;\n  }\n  *block_orientation = new int[num_blocks];\n  *vertical_writing = new bool[num_blocks];\n  block_it.move_to_first();\n  int i = 0;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    if (!block_it.data()->pdblk.poly_block()->IsText()) {\n      continue;\n    }\n    FCOORD re_rotation = block_it.data()->re_rotation();\n    float re_theta = re_rotation.angle();\n    FCOORD classify_rotation = block_it.data()->classify_rotation();\n    float classify_theta = classify_rotation.angle();\n    double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;\n    if (rot_theta < 0) {\n      rot_theta += 4;\n    }\n    int num_rotations = static_cast<int>(rot_theta + 0.5);\n    (*block_orientation)[i] = num_rotations;\n    // The classify_rotation is non-zero only if the text has vertical\n    // writing direction.\n    (*vertical_writing)[i] = classify_rotation.y() != 0.0f;\n    ++i;\n  
}\n}\n\nvoid TessBaseAPI::DetectParagraphs(bool after_text_recognition) {\n  int debug_level = 0;\n  GetIntVariable(\"paragraph_debug_level\", &debug_level);\n  if (paragraph_models_ == nullptr) {\n    paragraph_models_ = new std::vector<ParagraphModel *>;\n  }\n  MutableIterator *result_it = GetMutableIterator();\n  do { // Detect paragraphs for this block\n    std::vector<ParagraphModel *> models;\n    ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models);\n    paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());\n  } while (result_it->Next(RIL_BLOCK));\n  delete result_it;\n}\n\n/** This method returns the string form of the specified unichar. */\nconst char *TessBaseAPI::GetUnichar(int unichar_id) const {\n  return tesseract_->unicharset.id_to_unichar(unichar_id);\n}\n\n/** Return the pointer to the i-th dawg loaded into tesseract_ object. */\nconst Dawg *TessBaseAPI::GetDawg(int i) const {\n  if (tesseract_ == nullptr || i >= NumDawgs()) {\n    return nullptr;\n  }\n  return tesseract_->getDict().GetDawg(i);\n}\n\n/** Return the number of dawgs loaded into tesseract_ object. */\nint TessBaseAPI::NumDawgs() const {\n  return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();\n}\n\n/** Escape a char string - replace <>&\"' with HTML codes. */\nstd::string HOcrEscape(const char *text) {\n  std::string ret;\n  const char *ptr;\n  for (ptr = text; *ptr; ptr++) {\n    switch (*ptr) {\n      case '<':\n        ret += \"&lt;\";\n        break;\n      case '>':\n        ret += \"&gt;\";\n        break;\n      case '&':\n        ret += \"&amp;\";\n        break;\n      case '\"':\n        ret += \"&quot;\";\n        break;\n      case '\\'':\n        ret += \"&#39;\";\n        break;\n      default:\n        ret += *ptr;\n    }\n  }\n  return ret;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/api/capi.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        capi.cpp\n// Description: C-API TessBaseAPI\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <tesseract/capi.h>\n\nconst char *TessVersion() {\n  return TessBaseAPI::Version();\n}\n\nstatic char *MakeText(const std::string& srcText) {\n  auto *text = new char[srcText.size() + 1];\n  srcText.copy(text, srcText.size());\n  text[srcText.size()] = 0;\n  return text;\n}\n\nvoid TessDeleteText(const char *text) {\n  delete[] text;\n}\n\nstatic char **MakeTextArray(const std::vector<std::string>& srcArr) {\n  auto **arr = new char *[srcArr.size() + 1];\n  for (size_t i = 0; i < srcArr.size(); ++i) {\n    arr[i] = MakeText(srcArr[i]);\n  }\n  arr[srcArr.size()] = nullptr;\n  return arr;\n}\n\nvoid TessDeleteTextArray(char **arr) {\n  for (char **pos = arr; *pos != nullptr; ++pos) {\n    delete[] * pos;\n  }\n  delete[] arr;\n}\n\nvoid TessDeleteIntArray(const int *arr) {\n  delete[] arr;\n}\n\nTessResultRenderer *TessTextRendererCreate(const char *outputbase) {\n  return new tesseract::TessTextRenderer(outputbase);\n}\n\nTessResultRenderer *TessHOcrRendererCreate(const char *outputbase) {\n  return new tesseract::TessHOcrRenderer(outputbase);\n}\n\nTessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, BOOL font_info) {\n  return 
new tesseract::TessHOcrRenderer(outputbase, font_info != 0);\n}\n\nTessResultRenderer *TessAltoRendererCreate(const char *outputbase) {\n  return new tesseract::TessAltoRenderer(outputbase);\n}\n\nTessResultRenderer *TessPAGERendererCreate(const char *outputbase) {\n  return new tesseract::TessPAGERenderer(outputbase);\n}\n\nTessResultRenderer *TessTsvRendererCreate(const char *outputbase) {\n  return new tesseract::TessTsvRenderer(outputbase);\n}\n\nTessResultRenderer *TessPDFRendererCreate(const char *outputbase, const char *datadir,\n                                          BOOL textonly) {\n  return new tesseract::TessPDFRenderer(outputbase, datadir, textonly != 0);\n}\n\nTessResultRenderer *TessUnlvRendererCreate(const char *outputbase) {\n  return new tesseract::TessUnlvRenderer(outputbase);\n}\n\nTessResultRenderer *TessBoxTextRendererCreate(const char *outputbase) {\n  return new tesseract::TessBoxTextRenderer(outputbase);\n}\n\nTessResultRenderer *TessWordStrBoxRendererCreate(const char *outputbase) {\n  return new tesseract::TessWordStrBoxRenderer(outputbase);\n}\n\nTessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase) {\n  return new tesseract::TessLSTMBoxRenderer(outputbase);\n}\n\nvoid TessDeleteResultRenderer(TessResultRenderer *renderer) {\n  delete renderer;\n}\n\nvoid TessResultRendererInsert(TessResultRenderer *renderer, TessResultRenderer *next) {\n  renderer->insert(next);\n}\n\nTessResultRenderer *TessResultRendererNext(TessResultRenderer *renderer) {\n  return renderer->next();\n}\n\nBOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, const char *title) {\n  return static_cast<int>(renderer->BeginDocument(title));\n}\n\nBOOL TessResultRendererAddImage(TessResultRenderer *renderer, TessBaseAPI *api) {\n  return static_cast<int>(renderer->AddImage(api));\n}\n\nBOOL TessResultRendererEndDocument(TessResultRenderer *renderer) {\n  return static_cast<int>(renderer->EndDocument());\n}\n\nconst char 
*TessResultRendererExtention(TessResultRenderer *renderer) {\n  return renderer->file_extension();\n}\n\nconst char *TessResultRendererTitle(TessResultRenderer *renderer) {\n  return renderer->title();\n}\n\nint TessResultRendererImageNum(TessResultRenderer *renderer) {\n  return renderer->imagenum();\n}\n\nTessBaseAPI *TessBaseAPICreate() {\n  return new TessBaseAPI;\n}\n\nvoid TessBaseAPIDelete(TessBaseAPI *handle) {\n  delete handle;\n}\n\nvoid TessBaseAPISetInputName(TessBaseAPI *handle, const char *name) {\n  handle->SetInputName(name);\n}\n\nconst char *TessBaseAPIGetInputName(TessBaseAPI *handle) {\n  return handle->GetInputName();\n}\n\nvoid TessBaseAPISetInputImage(TessBaseAPI *handle, Pix *pix) {\n  handle->SetInputImage(pix);\n}\n\nPix *TessBaseAPIGetInputImage(TessBaseAPI *handle) {\n  return handle->GetInputImage();\n}\n\nint TessBaseAPIGetSourceYResolution(TessBaseAPI *handle) {\n  return handle->GetSourceYResolution();\n}\n\nconst char *TessBaseAPIGetDatapath(TessBaseAPI *handle) {\n  return handle->GetDatapath();\n}\n\nvoid TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name) {\n  handle->SetOutputName(name);\n}\n\nBOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, const char *value) {\n  return static_cast<int>(handle->SetVariable(name, value));\n}\n\nBOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, const char *value) {\n  return static_cast<int>(handle->SetDebugVariable(name, value));\n}\n\nBOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, const char *name, int *value) {\n  return static_cast<int>(handle->GetIntVariable(name, value));\n}\n\nBOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, const char *name, BOOL *value) {\n  bool boolValue;\n  bool result = handle->GetBoolVariable(name, &boolValue);\n  if (result) {\n    *value = static_cast<int>(boolValue);\n  }\n  return static_cast<int>(result);\n}\n\nBOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, const char *name, 
double *value) {\n  return static_cast<int>(handle->GetDoubleVariable(name, value));\n}\n\nconst char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, const char *name) {\n  return handle->GetStringVariable(name);\n}\n\nvoid TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp) {\n  handle->PrintVariables(fp);\n}\n\nBOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, const char *filename) {\n  FILE *fp = fopen(filename, \"w\");\n  if (fp != nullptr) {\n    handle->PrintVariables(fp);\n    fclose(fp);\n    return TRUE;\n  }\n  return FALSE;\n}\n\nint TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, const char *language,\n                     TessOcrEngineMode mode, char **configs, int configs_size, char **vars_vec,\n                     char **vars_values, size_t vars_vec_size, BOOL set_only_non_debug_params) {\n  std::vector<std::string> varNames;\n  std::vector<std::string> varValues;\n  if (vars_vec != nullptr && vars_values != nullptr) {\n    for (size_t i = 0; i < vars_vec_size; i++) {\n      varNames.emplace_back(vars_vec[i]);\n      varValues.emplace_back(vars_values[i]);\n    }\n  }\n\n  return handle->Init(datapath, language, mode, configs, configs_size, &varNames, &varValues,\n                      set_only_non_debug_params != 0);\n}\n\nint TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, const char *language,\n                     TessOcrEngineMode oem, char **configs, int configs_size) {\n  return handle->Init(datapath, language, oem, configs, configs_size, nullptr, nullptr, false);\n}\n\nint TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, const char *language,\n                     TessOcrEngineMode oem) {\n  return handle->Init(datapath, language, oem);\n}\n\nint TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, const char *language) {\n  return handle->Init(datapath, language);\n}\n\nint TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, const char 
*language,\n                     TessOcrEngineMode mode, char **configs, int configs_size, char **vars_vec,\n                     char **vars_values, size_t vars_vec_size, BOOL set_only_non_debug_params) {\n  std::vector<std::string> varNames;\n  std::vector<std::string> varValues;\n  if (vars_vec != nullptr && vars_values != nullptr) {\n    for (size_t i = 0; i < vars_vec_size; i++) {\n      varNames.emplace_back(vars_vec[i]);\n      varValues.emplace_back(vars_values[i]);\n    }\n  }\n\n  return handle->Init(data, data_size, language, mode, configs, configs_size, &varNames, &varValues,\n                      set_only_non_debug_params != 0, nullptr);\n}\n\nconst char *TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI *handle) {\n  return handle->GetInitLanguagesAsString();\n}\n\nchar **TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI *handle) {\n  std::vector<std::string> languages;\n  handle->GetLoadedLanguagesAsVector(&languages);\n  return MakeTextArray(languages);\n}\n\nchar **TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI *handle) {\n  std::vector<std::string> languages;\n  handle->GetAvailableLanguagesAsVector(&languages);\n  return MakeTextArray(languages);\n}\n\nvoid TessBaseAPIInitForAnalysePage(TessBaseAPI *handle) {\n  handle->InitForAnalysePage();\n}\n\nvoid TessBaseAPIReadConfigFile(TessBaseAPI *handle, const char *filename) {\n  handle->ReadConfigFile(filename);\n}\n\nvoid TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, const char *filename) {\n  handle->ReadDebugConfigFile(filename);\n}\n\nvoid TessBaseAPISetPageSegMode(TessBaseAPI *handle, TessPageSegMode mode) {\n  handle->SetPageSegMode(mode);\n}\n\nTessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle) {\n  return handle->GetPageSegMode();\n}\n\nchar *TessBaseAPIRect(TessBaseAPI *handle, const unsigned char *imagedata, int bytes_per_pixel,\n                      int bytes_per_line, int left, int top, int width, int height) {\n  return 
handle->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width,\n                               height);\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\nvoid TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle) {\n  handle->ClearAdaptiveClassifier();\n}\n#endif\n\nvoid TessBaseAPISetImage(TessBaseAPI *handle, const unsigned char *imagedata, int width, int height,\n                         int bytes_per_pixel, int bytes_per_line) {\n  handle->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);\n}\n\nvoid TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix) {\n  return handle->SetImage(pix);\n}\n\nvoid TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi) {\n  handle->SetSourceResolution(ppi);\n}\n\nvoid TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, int width, int height) {\n  handle->SetRectangle(left, top, width, height);\n}\n\nstruct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle) {\n  return handle->GetThresholdedImage();\n}\n\nfloat TessBaseAPIGetGradient(TessBaseAPI *handle) {\n  return handle->GetGradient();\n}\n\nvoid TessBaseAPIClearPersistentCache(TessBaseAPI * /*handle*/) {\n  TessBaseAPI::ClearPersistentCache();\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\nBOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, int *orient_deg, float *orient_conf,\n                                        const char **script_name, float *script_conf) {\n  auto success = handle->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf);\n  return static_cast<BOOL>(success);\n}\n\n#endif\n\nstruct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, struct Pixa **pixa) {\n  return handle->GetRegions(pixa);\n}\n\nstruct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {\n  return handle->GetTextlines(pixa, blockids);\n}\n\nstruct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, const BOOL raw_image,\n                                      const int 
raw_padding, struct Pixa **pixa, int **blockids,\n                                      int **paraids) {\n  return handle->GetTextlines(raw_image != 0, raw_padding, pixa, blockids, paraids);\n}\n\nstruct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {\n  return handle->GetStrips(pixa, blockids);\n}\n\nstruct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, struct Pixa **pixa) {\n  return handle->GetWords(pixa);\n}\n\nstruct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, struct Pixa **cc) {\n  return handle->GetConnectedComponents(cc);\n}\n\nstruct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, TessPageIteratorLevel level,\n                                           BOOL text_only, struct Pixa **pixa, int **blockids) {\n  return handle->GetComponentImages(level, static_cast<bool>(text_only), pixa, blockids);\n}\n\nstruct Boxa *TessBaseAPIGetComponentImages1(TessBaseAPI *handle, const TessPageIteratorLevel level,\n                                            const BOOL text_only, const BOOL raw_image,\n                                            const int raw_padding, struct Pixa **pixa,\n                                            int **blockids, int **paraids) {\n  return handle->GetComponentImages(level, static_cast<bool>(text_only), raw_image != 0,\n                                    raw_padding, pixa, blockids, paraids);\n}\n\nint TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI *handle) {\n  return handle->GetThresholdedImageScaleFactor();\n}\n\nTessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle) {\n  return handle->AnalyseLayout();\n}\n\nint TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor) {\n  return handle->Recognize(monitor);\n}\n\nBOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, const char *retry_config,\n                             int timeout_millisec, TessResultRenderer *renderer) {\n  return 
static_cast<int>(handle->ProcessPages(filename, retry_config, timeout_millisec, renderer));\n}\n\nBOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, int page_index,\n                            const char *filename, const char *retry_config, int timeout_millisec,\n                            TessResultRenderer *renderer) {\n  return static_cast<int>(\n      handle->ProcessPage(pix, page_index, filename, retry_config, timeout_millisec, renderer));\n}\n\nTessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle) {\n  return handle->GetIterator();\n}\n\nTessMutableIterator *TessBaseAPIGetMutableIterator(TessBaseAPI *handle) {\n  return handle->GetMutableIterator();\n}\n\nchar *TessBaseAPIGetUTF8Text(TessBaseAPI *handle) {\n  return handle->GetUTF8Text();\n}\n\nchar *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number) {\n  return handle->GetHOCRText(nullptr, page_number);\n}\n\nchar *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) {\n  return handle->GetAltoText(page_number);\n}\n\nchar *TessBaseAPIGetPAGEText(TessBaseAPI *handle, int page_number) {\n  return handle->GetPAGEText(page_number);\n}\n\nchar *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {\n  return handle->GetTSVText(page_number);\n}\n\nchar *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number) {\n  return handle->GetBoxText(page_number);\n}\n\nchar *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, int page_number) {\n  return handle->GetWordStrBoxText(page_number);\n}\n\nchar *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number) {\n  return handle->GetLSTMBoxText(page_number);\n}\n\nchar *TessBaseAPIGetUNLVText(TessBaseAPI *handle) {\n  return handle->GetUNLVText();\n}\n\nint TessBaseAPIMeanTextConf(TessBaseAPI *handle) {\n  return handle->MeanTextConf();\n}\n\nint *TessBaseAPIAllWordConfidences(TessBaseAPI *handle) {\n  return handle->AllWordConfidences();\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\nBOOL 
TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, TessPageSegMode mode, const char *wordstr) {\n  return static_cast<int>(handle->AdaptToWordStr(mode, wordstr));\n}\n#endif\n\nvoid TessBaseAPIClear(TessBaseAPI *handle) {\n  handle->Clear();\n}\n\nvoid TessBaseAPIEnd(TessBaseAPI *handle) {\n  handle->End();\n}\n\nint TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word) {\n  return handle->IsValidWord(word);\n}\n\nBOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, float *out_slope) {\n  return static_cast<int>(handle->GetTextDirection(out_offset, out_slope));\n}\n\nconst char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id) {\n  return handle->GetUnichar(unichar_id);\n}\n\nvoid TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, double margin) {\n  handle->set_min_orientation_margin(margin);\n}\n\nint TessBaseAPINumDawgs(const TessBaseAPI *handle) {\n  return handle->NumDawgs();\n}\n\nTessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle) {\n  return handle->oem();\n}\n\nvoid TessBaseGetBlockTextOrientations(TessBaseAPI *handle, int **block_orientation,\n                                      bool **vertical_writing) {\n  handle->GetBlockTextOrientations(block_orientation, vertical_writing);\n}\n\nvoid TessPageIteratorDelete(TessPageIterator *handle) {\n  delete handle;\n}\n\nTessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle) {\n  return new TessPageIterator(*handle);\n}\n\nvoid TessPageIteratorBegin(TessPageIterator *handle) {\n  handle->Begin();\n}\n\nBOOL TessPageIteratorNext(TessPageIterator *handle, TessPageIteratorLevel level) {\n  return static_cast<int>(handle->Next(level));\n}\n\nBOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, TessPageIteratorLevel level) {\n  return static_cast<int>(handle->IsAtBeginningOf(level));\n}\n\nBOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, TessPageIteratorLevel level,\n                                      
TessPageIteratorLevel element) {\n  return static_cast<int>(handle->IsAtFinalElement(level, element));\n}\n\nBOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, TessPageIteratorLevel level,\n                                 int *left, int *top, int *right, int *bottom) {\n  return static_cast<int>(handle->BoundingBox(level, left, top, right, bottom));\n}\n\nTessPolyBlockType TessPageIteratorBlockType(const TessPageIterator *handle) {\n  return handle->BlockType();\n}\n\nstruct Pix *TessPageIteratorGetBinaryImage(const TessPageIterator *handle,\n                                           TessPageIteratorLevel level) {\n  return handle->GetBinaryImage(level);\n}\n\nstruct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, TessPageIteratorLevel level,\n                                     int padding, struct Pix *original_image, int *left, int *top) {\n  return handle->GetImage(level, padding, original_image, left, top);\n}\n\nBOOL TessPageIteratorBaseline(const TessPageIterator *handle, TessPageIteratorLevel level, int *x1,\n                              int *y1, int *x2, int *y2) {\n  return static_cast<int>(handle->Baseline(level, x1, y1, x2, y2));\n}\n\nvoid TessPageIteratorOrientation(TessPageIterator *handle, TessOrientation *orientation,\n                                 TessWritingDirection *writing_direction,\n                                 TessTextlineOrder *textline_order, float *deskew_angle) {\n  handle->Orientation(orientation, writing_direction, textline_order, deskew_angle);\n}\n\nvoid TessPageIteratorParagraphInfo(TessPageIterator *handle,\n                                   TessParagraphJustification *justification, BOOL *is_list_item,\n                                   BOOL *is_crown, int *first_line_indent) {\n  bool bool_is_list_item;\n  bool bool_is_crown;\n  handle->ParagraphInfo(justification, &bool_is_list_item, &bool_is_crown, first_line_indent);\n  if (is_list_item != nullptr) {\n    *is_list_item = 
static_cast<int>(bool_is_list_item);\n  }\n  if (is_crown != nullptr) {\n    *is_crown = static_cast<int>(bool_is_crown);\n  }\n}\n\nvoid TessResultIteratorDelete(TessResultIterator *handle) {\n  delete handle;\n}\n\nTessResultIterator *TessResultIteratorCopy(const TessResultIterator *handle) {\n  return new TessResultIterator(*handle);\n}\n\nTessPageIterator *TessResultIteratorGetPageIterator(TessResultIterator *handle) {\n  return handle;\n}\n\nconst TessPageIterator *TessResultIteratorGetPageIteratorConst(const TessResultIterator *handle) {\n  return handle;\n}\n\nTessChoiceIterator *TessResultIteratorGetChoiceIterator(const TessResultIterator *handle) {\n  return new TessChoiceIterator(*handle);\n}\n\nBOOL TessResultIteratorNext(TessResultIterator *handle, TessPageIteratorLevel level) {\n  return static_cast<int>(handle->Next(level));\n}\n\nchar *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, TessPageIteratorLevel level) {\n  return handle->GetUTF8Text(level);\n}\n\nfloat TessResultIteratorConfidence(const TessResultIterator *handle, TessPageIteratorLevel level) {\n  return handle->Confidence(level);\n}\n\nconst char *TessResultIteratorWordRecognitionLanguage(const TessResultIterator *handle) {\n  return handle->WordRecognitionLanguage();\n}\n\nconst char *TessResultIteratorWordFontAttributes(const TessResultIterator *handle, BOOL *is_bold,\n                                                 BOOL *is_italic, BOOL *is_underlined,\n                                                 BOOL *is_monospace, BOOL *is_serif,\n                                                 BOOL *is_smallcaps, int *pointsize, int *font_id) {\n  bool bool_is_bold;\n  bool bool_is_italic;\n  bool bool_is_underlined;\n  bool bool_is_monospace;\n  bool bool_is_serif;\n  bool bool_is_smallcaps;\n  const char *ret = handle->WordFontAttributes(&bool_is_bold, &bool_is_italic, &bool_is_underlined,\n                                               &bool_is_monospace, &bool_is_serif,\n   
                                            &bool_is_smallcaps, pointsize, font_id);\n  if (is_bold != nullptr) {\n    *is_bold = static_cast<int>(bool_is_bold);\n  }\n  if (is_italic != nullptr) {\n    *is_italic = static_cast<int>(bool_is_italic);\n  }\n  if (is_underlined != nullptr) {\n    *is_underlined = static_cast<int>(bool_is_underlined);\n  }\n  if (is_monospace != nullptr) {\n    *is_monospace = static_cast<int>(bool_is_monospace);\n  }\n  if (is_serif != nullptr) {\n    *is_serif = static_cast<int>(bool_is_serif);\n  }\n  if (is_smallcaps != nullptr) {\n    *is_smallcaps = static_cast<int>(bool_is_smallcaps);\n  }\n  return ret;\n}\n\nBOOL TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle) {\n  return static_cast<int>(handle->WordIsFromDictionary());\n}\n\nBOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle) {\n  return static_cast<int>(handle->WordIsNumeric());\n}\n\nBOOL TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle) {\n  return static_cast<int>(handle->SymbolIsSuperscript());\n}\n\nBOOL TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle) {\n  return static_cast<int>(handle->SymbolIsSubscript());\n}\n\nBOOL TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle) {\n  return static_cast<int>(handle->SymbolIsDropcap());\n}\n\nvoid TessChoiceIteratorDelete(TessChoiceIterator *handle) {\n  delete handle;\n}\n\nBOOL TessChoiceIteratorNext(TessChoiceIterator *handle) {\n  return static_cast<int>(handle->Next());\n}\n\nconst char *TessChoiceIteratorGetUTF8Text(const TessChoiceIterator *handle) {\n  return handle->GetUTF8Text();\n}\n\nfloat TessChoiceIteratorConfidence(const TessChoiceIterator *handle) {\n  return handle->Confidence();\n}\n\nETEXT_DESC *TessMonitorCreate() {\n  return new ETEXT_DESC();\n}\n\nvoid TessMonitorDelete(ETEXT_DESC *monitor) {\n  delete monitor;\n}\n\nvoid TessMonitorSetCancelFunc(ETEXT_DESC *monitor, TessCancelFunc cancelFunc) {\n  
monitor->cancel = cancelFunc;\n}\n\nvoid TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis) {\n  monitor->cancel_this = cancelThis;\n}\n\nvoid *TessMonitorGetCancelThis(ETEXT_DESC *monitor) {\n  return monitor->cancel_this;\n}\n\nvoid TessMonitorSetProgressFunc(ETEXT_DESC *monitor, TessProgressFunc progressFunc) {\n  monitor->progress_callback2 = progressFunc;\n}\n\nint TessMonitorGetProgress(ETEXT_DESC *monitor) {\n  return monitor->progress;\n}\n\nvoid TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline) {\n  monitor->set_deadline_msecs(deadline);\n}\n"
  },
  {
    "path": "src/api/hocrrenderer.cpp",
    "content": "/**********************************************************************\n * File:        hocrrenderer.cpp\n * Description: Simple API for calling tesseract.\n * Author:      Ray Smith (original code from baseapi.cpp)\n * Author:      Stefan Weil (moved to separate file and cleaned code)\n *\n * (C) Copyright 2006, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include <tesseract/baseapi.h> // for TessBaseAPI\n#include <locale>              // for std::locale::classic\n#include <memory>              // for std::unique_ptr\n#include <sstream>             // for std::stringstream\n#include <tesseract/renderer.h>\n#include \"helpers.h\"        // for copy_string\n#include \"tesseractclass.h\" // for Tesseract\n\nnamespace tesseract {\n\n/**\n * Gets the block orientation at the current iterator position.\n */\nstatic tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {\n  tesseract::Orientation orientation;\n  tesseract::WritingDirection writing_direction;\n  tesseract::TextlineOrder textline_order;\n  float deskew_angle;\n  it->Orientation(&orientation, &writing_direction, &textline_order,\n                  &deskew_angle);\n  return orientation;\n}\n\n/**\n * Fits a line to the baseline at the given level, and appends its coefficients\n * to the hOCR string.\n * NOTE: The hOCR spec is unclear on how to specify baseline 
coefficients for\n * rotated textlines. For this reason, on textlines that are not upright, this\n * method currently only inserts a 'textangle' property to indicate the rotation\n * direction and does not add any baseline information to the hocr string.\n */\nstatic void AddBaselineCoordsTohOCR(const PageIterator *it,\n                                    PageIteratorLevel level,\n                                    std::stringstream &hocr_str) {\n  tesseract::Orientation orientation = GetBlockTextOrientation(it);\n  if (orientation != ORIENTATION_PAGE_UP) {\n    hocr_str << \"; textangle \" << 360 - orientation * 90;\n    return;\n  }\n\n  int left, top, right, bottom;\n  it->BoundingBox(level, &left, &top, &right, &bottom);\n\n  // Try to get the baseline coordinates at this level.\n  int x1, y1, x2, y2;\n  if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {\n    return;\n  }\n  // Following the description of this field of the hOCR spec, we convert the\n  // baseline coordinates so that \"the bottom left of the bounding box is the\n  // origin\".\n  x1 -= left;\n  x2 -= left;\n  y1 -= bottom;\n  y2 -= bottom;\n\n  // Now fit a line through the points so we can extract coefficients for the\n  // equation:  y = p1 x + p0\n  if (x1 == x2) {\n    // Problem computing the polynomial coefficients.\n    return;\n  }\n  double p1 = (y2 - y1) / static_cast<double>(x2 - x1);\n  double p0 = y1 - p1 * x1;\n\n  hocr_str << \"; baseline \" << round(p1 * 1000.0) / 1000.0 << \" \"\n           << round(p0 * 1000.0) / 1000.0;\n}\n\nstatic void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,\n                         std::stringstream &hocr_str) {\n  int left, top, right, bottom;\n  it->BoundingBox(level, &left, &top, &right, &bottom);\n  // This is the only place we use double quotes instead of single quotes,\n  // but it may too late to change for consistency\n  hocr_str << \" title=\\\"bbox \" << left << \" \" << top << \" \" << right << \" \"\n           << 
bottom;\n  // Add baseline coordinates & heights for textlines only.\n  if (level == RIL_TEXTLINE) {\n    AddBaselineCoordsTohOCR(it, level, hocr_str);\n    // add custom height measures\n    float row_height, descenders, ascenders; // row attributes\n    it->RowAttributes(&row_height, &descenders, &ascenders);\n    // TODO(rays): Do we want to limit these to a single decimal place?\n    hocr_str << \"; x_size \" << row_height << \"; x_descenders \" << -descenders\n             << \"; x_ascenders \" << ascenders;\n  }\n  hocr_str << \"\\\">\";\n}\n\n/**\n * Make a HTML-formatted string with hOCR markup from the internal\n * data structures.\n * page_number is 0-based but will appear in the output as 1-based.\n * Image name/input_file_ can be set by SetInputName before calling\n * GetHOCRText\n * STL removed from original patch submission and refactored by rays.\n * Returned string must be freed with the delete [] operator.\n */\nchar *TessBaseAPI::GetHOCRText(int page_number) {\n  return GetHOCRText(nullptr, page_number);\n}\n\n/**\n * Make a HTML-formatted string with hOCR markup from the internal\n * data structures.\n * page_number is 0-based but will appear in the output as 1-based.\n * Image name/input_file_ can be set by SetInputName before calling\n * GetHOCRText\n * STL removed from original patch submission and refactored by rays.\n * Returned string must be freed with the delete [] operator.\n */\nchar *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {\n  if (tesseract_ == nullptr ||\n      (page_res_ == nullptr && Recognize(monitor) < 0)) {\n    return nullptr;\n  }\n\n  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;\n  int page_id = page_number + 1; // hOCR uses 1-based page numbers.\n  bool para_is_ltr = true;       // Default direction is LTR\n  const char *paragraph_lang = nullptr;\n  bool font_info = false;\n  bool hocr_boxes = false;\n  GetBoolVariable(\"hocr_font_info\", &font_info);\n  
GetBoolVariable(\"hocr_char_boxes\", &hocr_boxes);\n\n  if (input_file_.empty()) {\n    SetInputName(nullptr);\n  }\n\n  std::stringstream hocr_str;\n  // Use \"C\" locale (needed for double values x_size and x_descenders).\n  hocr_str.imbue(std::locale::classic());\n  // Use 8 digits for double values.\n  hocr_str.precision(8);\n  hocr_str << \"  <div class='ocr_page'\"\n           << \" id='\"\n           << \"page_\" << page_id << \"'\"\n           << \" title='image \\\"\";\n  if (!input_file_.empty()) {\n    hocr_str << HOcrEscape(input_file_.c_str());\n  } else {\n    hocr_str << \"unknown\";\n  }\n\n  hocr_str << \"\\\"; bbox \" << rect_left_ << \" \" << rect_top_ << \" \"\n           << rect_width_ << \" \" << rect_height_ << \"; ppageno \" << page_number\n           << \"; scan_res \" << GetSourceYResolution() << \" \"\n           << GetSourceYResolution() << \"'>\\n\";\n\n  std::unique_ptr<ResultIterator> res_it(GetIterator());\n  while (!res_it->Empty(RIL_BLOCK)) {\n    int left, top, right, bottom;\n    auto block_type = res_it->BlockType();\n    switch (block_type) {\n      case PT_FLOWING_IMAGE:\n      case PT_HEADING_IMAGE:\n      case PT_PULLOUT_IMAGE: {\n        // Handle all kinds of images.\n        res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);\n        hocr_str << \"   <div class='ocr_photo' id='block_\" << page_id << '_'\n                 << bcnt++ << \"' title=\\\"bbox \" << left << \" \" << top << \" \"\n                 << right << \" \" << bottom << \"\\\"></div>\\n\";\n        res_it->Next(RIL_BLOCK);\n        continue;\n      }\n      case PT_HORZ_LINE:\n      case PT_VERT_LINE:\n        // Handle horizontal and vertical lines.\n        res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);\n        hocr_str << \"   <div class='ocr_separator' id='block_\" << page_id << '_'\n                 << bcnt++ << \"' title=\\\"bbox \" << left << \" \" << top << \" \"\n                 << right << \" \" << 
bottom << \"\\\"></div>\\n\";\n        res_it->Next(RIL_BLOCK);\n        continue;\n      case PT_NOISE:\n        tprintf(\"TODO: Please report image which triggers the noise case.\\n\");\n        ASSERT_HOST(false);\n      default:\n        break;\n    }\n\n    if (res_it->Empty(RIL_WORD)) {\n      res_it->Next(RIL_WORD);\n      continue;\n    }\n\n    // Open any new block/paragraph/textline.\n    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {\n      para_is_ltr = true; // reset to default direction\n      hocr_str << \"   <div class='ocr_carea'\"\n               << \" id='\"\n               << \"block_\" << page_id << \"_\" << bcnt << \"'\";\n      AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);\n    }\n    if (res_it->IsAtBeginningOf(RIL_PARA)) {\n      hocr_str << \"\\n    <p class='ocr_par'\";\n      para_is_ltr = res_it->ParagraphIsLtr();\n      if (!para_is_ltr) {\n        hocr_str << \" dir='rtl'\";\n      }\n      hocr_str << \" id='\"\n               << \"par_\" << page_id << \"_\" << pcnt << \"'\";\n      paragraph_lang = res_it->WordRecognitionLanguage();\n      if (paragraph_lang) {\n        hocr_str << \" lang='\" << paragraph_lang << \"'\";\n      }\n      AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);\n    }\n    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {\n      hocr_str << \"\\n     <span class='\";\n      switch (block_type) {\n        case PT_HEADING_TEXT:\n          hocr_str << \"ocr_header\";\n          break;\n        case PT_PULLOUT_TEXT:\n          hocr_str << \"ocr_textfloat\";\n          break;\n        case PT_CAPTION_TEXT:\n          hocr_str << \"ocr_caption\";\n          break;\n        case PT_FLOWING_IMAGE:\n        case PT_HEADING_IMAGE:\n        case PT_PULLOUT_IMAGE:\n          ASSERT_HOST(false);\n          break;\n        default:\n          hocr_str << \"ocr_line\";\n      }\n      hocr_str << \"' id='\"\n               << \"line_\" << page_id << \"_\" << lcnt << \"'\";\n      AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, 
hocr_str);\n    }\n\n    // Now, process the word...\n    int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;\n    std::vector<std::vector<std::vector<std::pair<const char *, float>>>>\n        *rawTimestepMap = nullptr;\n    std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;\n    if (lstm_choice_mode) {\n      CTCMap = res_it->GetBestLSTMSymbolChoices();\n      rawTimestepMap = res_it->GetRawLSTMTimesteps();\n    }\n    hocr_str << \"\\n      <span class='ocrx_word'\"\n             << \" id='\"\n             << \"word_\" << page_id << \"_\" << wcnt << \"'\";\n    bool bold, italic, underlined, monospace, serif, smallcaps;\n    int pointsize, font_id;\n    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);\n    const char *font_name =\n        res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,\n                                   &serif, &smallcaps, &pointsize, &font_id);\n    hocr_str << \" title='bbox \" << left << \" \" << top << \" \" << right << \" \"\n             << bottom << \"; x_wconf \"\n             << static_cast<int>(res_it->Confidence(RIL_WORD));\n    if (font_info) {\n      if (font_name) {\n        hocr_str << \"; x_font \" << HOcrEscape(font_name).c_str();\n      }\n      hocr_str << \"; x_fsize \" << pointsize;\n    }\n    hocr_str << \"'\";\n    const char *lang = res_it->WordRecognitionLanguage();\n    if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {\n      hocr_str << \" lang='\" << lang << \"'\";\n    }\n    switch (res_it->WordDirection()) {\n      // Only emit direction if different from current paragraph direction\n      case DIR_LEFT_TO_RIGHT:\n        if (!para_is_ltr) {\n          hocr_str << \" dir='ltr'\";\n        }\n        break;\n      case DIR_RIGHT_TO_LEFT:\n        if (para_is_ltr) {\n          hocr_str << \" dir='rtl'\";\n        }\n        break;\n      case DIR_MIX:\n      case DIR_NEUTRAL:\n      default: // Do nothing.\n        break;\n    }\n    
hocr_str << \">\";\n    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);\n    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);\n    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);\n    if (bold) {\n      hocr_str << \"<strong>\";\n    }\n    if (italic) {\n      hocr_str << \"<em>\";\n    }\n    do {\n      const std::unique_ptr<const char[]> grapheme(\n          res_it->GetUTF8Text(RIL_SYMBOL));\n      if (grapheme && grapheme[0] != 0) {\n        if (hocr_boxes) {\n          res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);\n          hocr_str << \"\\n       <span class='ocrx_cinfo' title='x_bboxes \"\n                   << left << \" \" << top << \" \" << right << \" \" << bottom\n                   << \"; x_conf \" << res_it->Confidence(RIL_SYMBOL) << \"'>\";\n        }\n        hocr_str << HOcrEscape(grapheme.get()).c_str();\n        if (hocr_boxes) {\n          hocr_str << \"</span>\";\n          tesseract::ChoiceIterator ci(*res_it);\n          if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {\n            std::vector<std::vector<std::pair<const char *, float>>> *symbol =\n                ci.Timesteps();\n            hocr_str << \"\\n        <span class='ocr_symbol'\"\n                     << \" id='\"\n                     << \"symbol_\" << page_id << \"_\" << wcnt << \"_\" << scnt\n                     << \"'>\";\n            for (const auto &timestep : *symbol) {\n              hocr_str << \"\\n         <span class='ocrx_cinfo'\"\n                       << \" id='\"\n                       << \"timestep\" << page_id << \"_\" << wcnt << \"_\" << tcnt\n                       << \"'>\";\n              for (auto conf : timestep) {\n                hocr_str << \"\\n          <span class='ocrx_cinfo'\"\n                         << \" id='\"\n                         << \"choice_\" << page_id << \"_\" << wcnt << \"_\" << ccnt\n                         << \"'\"\n          
               << \" title='x_confs \" << int(conf.second * 100) << \"'>\"\n                         << HOcrEscape(conf.first).c_str() << \"</span>\";\n                ++ccnt;\n              }\n              hocr_str << \"</span>\";\n              ++tcnt;\n            }\n            hocr_str << \"\\n        </span>\";\n            ++scnt;\n          } else if (lstm_choice_mode == 2) {\n            hocr_str << \"\\n        <span class='ocrx_cinfo'\"\n                     << \" id='\"\n                     << \"lstm_choices_\" << page_id << \"_\" << wcnt << \"_\" << tcnt\n                     << \"'>\";\n            do {\n              const char *choice = ci.GetUTF8Text();\n              float choiceconf = ci.Confidence();\n              if (choice != nullptr) {\n                hocr_str << \"\\n         <span class='ocrx_cinfo'\"\n                         << \" id='\"\n                         << \"choice_\" << page_id << \"_\" << wcnt << \"_\" << ccnt\n                         << \"'\"\n                         << \" title='x_confs \" << choiceconf << \"'>\"\n                         << HOcrEscape(choice).c_str() << \"</span>\";\n                ccnt++;\n              }\n            } while (ci.Next());\n            hocr_str << \"\\n        </span>\";\n            tcnt++;\n          }\n        }\n      }\n      res_it->Next(RIL_SYMBOL);\n    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));\n    if (italic) {\n      hocr_str << \"</em>\";\n    }\n    if (bold) {\n      hocr_str << \"</strong>\";\n    }\n    // If the lstm choice mode is required it is added here\n    if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {\n      for (const auto &symbol : *rawTimestepMap) {\n        hocr_str << \"\\n       <span class='ocr_symbol'\"\n                 << \" id='\"\n                 << \"symbol_\" << page_id << \"_\" << wcnt << \"_\" << scnt << \"'>\";\n        for (const auto &timestep : symbol) {\n          hocr_str << 
\"\\n        <span class='ocrx_cinfo'\"\n                   << \" id='\"\n                   << \"timestep\" << page_id << \"_\" << wcnt << \"_\" << tcnt\n                   << \"'>\";\n          for (auto &&conf : timestep) {\n            hocr_str << \"\\n         <span class='ocrx_cinfo'\"\n                     << \" id='\"\n                     << \"choice_\" << page_id << \"_\" << wcnt << \"_\" << ccnt\n                     << \"'\"\n                     << \" title='x_confs \" << int(conf.second * 100) << \"'>\"\n                     << HOcrEscape(conf.first).c_str() << \"</span>\";\n            ++ccnt;\n          }\n          hocr_str << \"</span>\";\n          ++tcnt;\n        }\n        hocr_str << \"</span>\";\n        ++scnt;\n      }\n    } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {\n      for (const auto &timestep : *CTCMap) {\n        if (timestep.size() > 0) {\n          hocr_str << \"\\n       <span class='ocrx_cinfo'\"\n                   << \" id='\"\n                   << \"lstm_choices_\" << page_id << \"_\" << wcnt << \"_\" << tcnt\n                   << \"'>\";\n          for (auto &j : timestep) {\n            float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;\n            if (conf < 0.0f) {\n              conf = 0.0f;\n            }\n            if (conf > 100.0f) {\n              conf = 100.0f;\n            }\n            hocr_str << \"\\n        <span class='ocrx_cinfo'\"\n                     << \" id='\"\n                     << \"choice_\" << page_id << \"_\" << wcnt << \"_\" << ccnt\n                     << \"'\"\n                     << \" title='x_confs \" << conf << \"'>\"\n                     << HOcrEscape(j.first).c_str() << \"</span>\";\n            ccnt++;\n          }\n          hocr_str << \"</span>\";\n          tcnt++;\n        }\n      }\n    }\n    // Close ocrx_word.\n    if (hocr_boxes || lstm_choice_mode > 0) {\n      hocr_str << \"\\n      \";\n    }\n    hocr_str << 
\"</span>\";\n    tcnt = 1;\n    ccnt = 1;\n    wcnt++;\n    // Close any ending block/paragraph/textline.\n    if (last_word_in_line) {\n      hocr_str << \"\\n     </span>\";\n      lcnt++;\n    }\n    if (last_word_in_para) {\n      hocr_str << \"\\n    </p>\\n\";\n      pcnt++;\n      para_is_ltr = true; // back to default direction\n    }\n    if (last_word_in_block) {\n      hocr_str << \"   </div>\\n\";\n      bcnt++;\n    }\n  }\n  hocr_str << \"  </div>\\n\";\n\n  return copy_string(hocr_str.str());\n}\n\n/**********************************************************************\n * HOcr Text Renderer interface implementation\n **********************************************************************/\nTessHOcrRenderer::TessHOcrRenderer(const char *outputbase)\n    : TessResultRenderer(outputbase, \"hocr\") {\n  font_info_ = false;\n}\n\nTessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)\n    : TessResultRenderer(outputbase, \"hocr\") {\n  font_info_ = font_info;\n}\n\nbool TessHOcrRenderer::BeginDocumentHandler() {\n  AppendString(\n      \"<?xml version=\\\"1.0\\\" encoding=\\\"UTF-8\\\"?>\\n\"\n      \"<!DOCTYPE html PUBLIC \\\"-//W3C//DTD XHTML 1.0 Transitional//EN\\\"\\n\"\n      \"    \\\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\\\">\\n\"\n      \"<html xmlns=\\\"http://www.w3.org/1999/xhtml\\\" xml:lang=\\\"en\\\" \"\n      \"lang=\\\"en\\\">\\n <head>\\n  <title>\");\n  AppendString(title());\n  AppendString(\n      \"</title>\\n\"\n      \"  <meta http-equiv=\\\"Content-Type\\\" content=\\\"text/html;\"\n      \"charset=utf-8\\\"/>\\n\"\n      \"  <meta name='ocr-system' content='tesseract \" TESSERACT_VERSION_STR\n      \"' />\\n\"\n      \"  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par\"\n      \" ocr_line ocrx_word ocrp_dir ocrp_lang ocrp_wconf\");\n  if (font_info_) {\n    AppendString(\" ocrp_font ocrp_fsize\");\n  }\n  AppendString(\n      \"'/>\\n\"\n      \" </head>\\n\"\n      \" 
<body>\\n\");\n\n  return true;\n}\n\nbool TessHOcrRenderer::EndDocumentHandler() {\n  AppendString(\" </body>\\n</html>\\n\");\n\n  return true;\n}\n\nbool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));\n  if (hocr == nullptr) {\n    return false;\n  }\n\n  AppendString(hocr.get());\n\n  return true;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/api/lstmboxrenderer.cpp",
    "content": "/**********************************************************************\n * File:        lstmboxrenderer.cpp\n * Description: Renderer for creating box file for LSTM training.\n *              based on the tsv renderer.\n *\n * (C) Copyright 2019, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include <tesseract/baseapi.h> // for TessBaseAPI\n#include <tesseract/renderer.h>\n#include \"helpers.h\"        // for copy_string\n#include \"tesseractclass.h\" // for Tesseract\n\nnamespace tesseract {\n\n/**\n * Create a UTF8 box file for LSTM training from the internal data structures.\n * page_number is a 0-base page index that will appear in the box file.\n * Returned string must be freed with the delete [] operator.\n */\nstatic void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,\n                         std::string &text) {\n  text += \" \" + std::to_string(image_height - bottom);\n  text += \" \" + std::to_string(right + 5);\n  text += \" \" + std::to_string(image_height - top);\n  text += \" \" + std::to_string(page_num);\n}\n\nchar *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {\n  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {\n    return nullptr;\n  }\n\n  std::string lstm_box_str;\n  bool first_word = true;\n  int left = 0, top = 0, right = 0, bottom = 0;\n\n  
LTRResultIterator *res_it = GetLTRIterator();\n  while (!res_it->Empty(RIL_BLOCK)) {\n    if (res_it->Empty(RIL_SYMBOL)) {\n      res_it->Next(RIL_SYMBOL);\n      continue;\n    }\n    if (!first_word) {\n      if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {\n        if (res_it->IsAtBeginningOf(RIL_WORD)) {\n          lstm_box_str += \"  \" + std::to_string(left);\n          AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);\n          lstm_box_str += \"\\n\"; // end of row for word\n        }                       // word\n      } else {\n        if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {\n          lstm_box_str += \"\\t \" + std::to_string(left);\n          AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);\n          lstm_box_str += \"\\n\"; // end of row for line\n        }                       // line\n      }\n    } // not first word\n    first_word = false;\n    // Use bounding box for whole line for everything\n    res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);\n    do {\n      lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();\n      res_it->Next(RIL_SYMBOL);\n    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));\n    lstm_box_str += \" \" + std::to_string(left);\n    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);\n    lstm_box_str += \"\\n\"; // end of row for symbol\n  }\n  if (!first_word) { // if first_word is true  => empty page\n    lstm_box_str += \"\\t \" + std::to_string(left);\n    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);\n    lstm_box_str += \"\\n\"; // end of PAGE\n  }\n  delete res_it;\n  return copy_string(lstm_box_str);\n}\n\n/**********************************************************************\n * LSTMBox Renderer interface implementation\n 
**********************************************************************/\nTessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)\n    : TessResultRenderer(outputbase, \"box\") {}\n\nbool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> lstmbox(api->GetLSTMBoxText(imagenum()));\n  if (lstmbox == nullptr) {\n    return false;\n  }\n\n  AppendString(lstmbox.get());\n\n  return true;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/api/pagerenderer.cpp",
    "content": "// File:        pagerenderer.cpp\n// Description: PAGE XML rendering interface\n// Author:      Jan Kamlah\n\n// (C) Copyright 2024\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"errcode.h\" // for ASSERT_HOST\n#include \"helpers.h\" // for copy_string\n#include \"tprintf.h\" // for tprintf\n\n#include <tesseract/baseapi.h>\n#include <tesseract/renderer.h>\n\n#include <ctime>\n#include <iomanip>\n#include <memory>\n#include <regex>\n#include <sstream> // for std::stringstream\n#include <unordered_set>\n\n#include <allheaders.h>\n#if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || \\\n    LIBLEPT_MAJOR_VERSION > 1\n#  include <array_internal.h>\n#  include <pix_internal.h>\n#endif\n\nnamespace tesseract {\n\n///\n/// Slope and offset between two points\n///\nstatic void GetSlopeAndOffset(float x0, float y0, float x1, float y1, float *m,\n                              float *b) {\n  float slope;\n\n  slope = ((y1 - y0) / (x1 - x0));\n  *m = slope;\n  *b = y0 - slope * x0;\n}\n\n///\n/// Write coordinates in the form of a points to a stream\n///\nstatic void AddPointsToPAGE(Pta *pts, std::stringstream &str) {\n  int num_pts;\n\n  str << \"<Coords points=\\\"\";\n  num_pts = ptaGetCount(pts);\n  for (int p = 0; p < num_pts; ++p) {\n    int x, y;\n    ptaGetIPt(pts, p, &x, &y);\n    if (p != 0) {\n      str << \" \";\n    }\n    str << std::to_string(x) << \",\" << std::to_string(y);\n  }\n  str << 
\"\\\"/>\\n\";\n}\n\n///\n/// Convert bbox information to top and bottom polygon\n///\nstatic void AddPointToWordPolygon(\n    const ResultIterator *res_it, PageIteratorLevel level, Pta *word_top_pts,\n    Pta *word_bottom_pts, tesseract::WritingDirection writing_direction) {\n  int left, top, right, bottom;\n\n  res_it->BoundingBox(level, &left, &top, &right, &bottom);\n\n  if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {\n    ptaAddPt(word_top_pts, left, top);\n    ptaAddPt(word_top_pts, right, top);\n\n    ptaAddPt(word_bottom_pts, left, bottom);\n    ptaAddPt(word_bottom_pts, right, bottom);\n\n  } else {\n    // Transform from ttb to ltr\n    ptaAddPt(word_top_pts, top, right);\n    ptaAddPt(word_top_pts, bottom, right);\n\n    ptaAddPt(word_bottom_pts, top, left);\n    ptaAddPt(word_bottom_pts, bottom, left);\n  }\n}\n\n///\n/// Transpose polygonline, destroy old and return new pts\n///\nstatic Pta *TransposePolygonline(Pta *pts) {\n  Pta *pts_transposed;\n\n  pts_transposed = ptaTranspose(pts);\n  ptaDestroy(&pts);\n  return pts_transposed;\n}\n\n///\n/// Reverse polygonline, destroy old and return new pts\n///\nstatic Pta *ReversePolygonline(Pta *pts, int type) {\n  Pta *pts_reversed;\n\n  pts_reversed = ptaReverse(pts, type);\n  ptaDestroy(&pts);\n  return pts_reversed;\n}\n\n///\n/// Destroy old and create new pts\n///\nstatic Pta *DestroyAndCreatePta(Pta *pts) {\n  ptaDestroy(&pts);\n  return ptaCreate(0);\n}\n\n///\n/// Recalculate linepolygon\n/// Create a hull for overlapping areas\n///\nstatic Pta *RecalcPolygonline(Pta *pts, bool upper) {\n  int num_pts, num_bin, index = 0;\n  int x0, y0, x1, y1;\n  float x_min, y_min, x_max, y_max;\n  NUMA *bin_line;\n  Pta *pts_recalc;\n\n  ptaGetMinMax(pts, &x_min, &y_min, &x_max, &y_max);\n  num_bin = x_max - x_min;\n  bin_line = numaCreate(num_bin + 1);\n\n  for (int p = 0; p <= num_bin; ++p) {\n    bin_line->array[p] = -1.0f;\n  }\n\n  num_pts = ptaGetCount(pts);\n\n  if (num_pts == 2) {\n    
pts_recalc = ptaCopy(pts);\n    ptaDestroy(&pts);\n    return pts_recalc;\n  }\n\n  do {\n    ptaGetIPt(pts, index, &x0, &y0);\n    ptaGetIPt(pts, index + 1, &x1, &y1);\n    for (int p = x0 - x_min; p <= x1 - x_min; ++p) {\n      if (!upper) {\n        if (bin_line->array[p] == -1.0f || y0 > bin_line->array[p]) {\n          bin_line->array[p] = y0;\n        }\n      } else {\n        if (bin_line->array[p] == -1.0f || y0 < bin_line->array[p]) {\n          bin_line->array[p] = y0;\n        }\n      }\n    }\n    index += 2;\n  } while (index < num_pts - 1);\n\n  pts_recalc = ptaCreate(0);\n\n  int y = static_cast<int>(bin_line->array[0]);\n  ptaAddPt(pts_recalc, x_min, y);\n  for (int p = 1; p <= num_bin; ++p) {\n    if (p == num_bin) {\n      ptaAddPt(pts_recalc, x_min + p, y);\n      break;\n    } else if (y != static_cast<int>(bin_line->array[p])) {\n      if (y != -1) {\n        ptaAddPt(pts_recalc, x_min + p, y);\n      }\n      y = static_cast<int>(bin_line->array[p]);\n      if (y != -1) {\n        ptaAddPt(pts_recalc, x_min + p, y);\n      }\n    }\n  }\n\n  numaDestroy(&bin_line);\n  ptaDestroy(&pts);\n  return pts_recalc;\n}\n\n///\n/// Create a rectangle hull around a single line\n///\nstatic Pta *PolygonToBoxCoords(Pta *pts) {\n  Pta *pts_box;\n  float x_min, y_min, x_max, y_max;\n\n  pts_box = ptaCreate(0);\n  ptaGetMinMax(pts, &x_min, &y_min, &x_max, &y_max);\n  ptaAddPt(pts_box, x_min, y_min);\n  ptaAddPt(pts_box, x_max, y_min);\n  ptaAddPt(pts_box, x_max, y_max);\n  ptaAddPt(pts_box, x_min, y_max);\n  ptaDestroy(&pts);\n  return pts_box;\n}\n\n///\n/// Create a rectangle polygon round the existing multiple lines\n///\nstatic void UpdateBlockPoints(Pta *block_top_pts, Pta *block_bottom_pts,\n                              Pta *line_top_pts, Pta *line_bottom_pts, int lcnt,\n                              int last_word_in_cblock) {\n  int num_pts;\n  int x, y;\n\n  // Create a hull around all lines\n  if (lcnt == 0 && last_word_in_cblock) {\n    
ptaJoin(block_top_pts, line_top_pts, 0, -1);\n    ptaJoin(block_bottom_pts, line_bottom_pts, 0, -1);\n  } else if (lcnt == 0) {\n    ptaJoin(block_top_pts, line_top_pts, 0, -1);\n    num_pts = ptaGetCount(line_bottom_pts);\n    ptaGetIPt(line_bottom_pts, num_pts - 1, &x, &y);\n    ptaAddPt(block_top_pts, x, y);\n    ptaGetIPt(line_bottom_pts, 0, &x, &y);\n    ptaAddPt(block_bottom_pts, x, y);\n  } else if (last_word_in_cblock) {\n    ptaGetIPt(line_top_pts, 0, &x, &y);\n    ptaAddPt(block_bottom_pts, x, y);\n    ptaJoin(block_bottom_pts, line_bottom_pts, 0, -1);\n    num_pts = ptaGetCount(line_top_pts);\n    ptaGetIPt(line_top_pts, num_pts - 1, &x, &y);\n    ptaAddPt(block_top_pts, x, y);\n  } else {\n    ptaGetIPt(line_top_pts, 0, &x, &y);\n    ptaAddPt(block_bottom_pts, x, y);\n    ptaGetIPt(line_bottom_pts, 0, &x, &y);\n    ptaAddPt(block_bottom_pts, x, y);\n    num_pts = ptaGetCount(line_top_pts);\n    ptaGetIPt(line_top_pts, num_pts - 1, &x, &y);\n    ptaAddPt(block_top_pts, x, y);\n    num_pts = ptaGetCount(line_bottom_pts);\n    ptaGetIPt(line_bottom_pts, num_pts - 1, &x, &y);\n    ptaAddPt(block_top_pts, x, y);\n  };\n}\n\n///\n/// Simplify polygonlines (only expanding not shrinking) (Due to recalculation\n/// currently not necessary)\n///\nstatic void SimplifyLinePolygon(Pta *polyline, int tolerance, bool upper) {\n  int x0, y0, x1, y1, x2, y2, x3, y3, index = 1;\n  float m, b, y_min, y_max;\n\n  while (index <= polyline->n - 2) {\n    ptaGetIPt(polyline, index - 1, &x0, &y0);\n    ptaGetIPt(polyline, index, &x1, &y1);\n    ptaGetIPt(polyline, index + 1, &x2, &y2);\n    if (index + 2 < polyline->n) {\n      // Delete two point indentations\n      ptaGetIPt(polyline, index + 2, &x3, &y3);\n      if (abs(x3 - x0) <= tolerance * 2) {\n        GetSlopeAndOffset(x0, y0, x3, y3, &m, &b);\n\n        if (upper && (m * x1 + b) < y1 && (m * x2 + b) < y2) {\n          ptaRemovePt(polyline, index + 1);\n          ptaRemovePt(polyline, index);\n          continue;\n    
    } else if (!upper && (m * x1 + b) > y1 && (m * x2 + b) > y2) {\n          ptaRemovePt(polyline, index + 1);\n          ptaRemovePt(polyline, index);\n          continue;\n        }\n      }\n    }\n    // Delete one point indentations\n    if (abs(y0 - y1) <= tolerance && abs(y1 - y2) <= tolerance) {\n      GetSlopeAndOffset(x0, y0, x2, y2, &m, &b);\n      if (upper && (m * x1 + b) <= y1) {\n        ptaRemovePt(polyline, index);\n        continue;\n      } else if (!upper && (m * x1 + b) >= y1) {\n        ptaRemovePt(polyline, index);\n        continue;\n      }\n    }\n    // Delete near by points\n    if (x1 != x0 && abs(y1 - y0) < 4 && abs(x1 - x0) <= tolerance) {\n      if (upper) {\n        y_min = std::min(y0, y1);\n        GetSlopeAndOffset(x0, y_min, x2, y2, &m, &b);\n        if ((m * x1 + b) <= y1) {\n          polyline->y[index - 1] = std::min(y0, y1);\n          ptaRemovePt(polyline, index);\n          continue;\n        }\n      } else {\n        y_max = std::max(y0, y1);\n        GetSlopeAndOffset(x0, y_max, x2, y2, &m, &b);\n        if ((m * x1 + b) >= y1) {\n          polyline->y[index - 1] = y_max;\n          ptaRemovePt(polyline, index);\n          continue;\n        }\n      }\n    }\n    index++;\n  }\n}\n\n///\n/// Directly write bounding box information as coordinates a stream\n///\nstatic void AddBoxToPAGE(const ResultIterator *it, PageIteratorLevel level,\n                         std::stringstream &page_str) {\n  int left, top, right, bottom;\n\n  it->BoundingBox(level, &left, &top, &right, &bottom);\n  page_str << \"<Coords points=\\\"\" << left << \",\" << top << \" \" << right << \",\"\n           << top << \" \" << right << \",\" << bottom << \" \" << left << \",\"\n           << bottom << \"\\\"/>\\n\";\n}\n\n///\n/// Join ltr and rtl polygon information\n///\nstatic void AppendLinePolygon(Pta *pts_ltr, Pta *pts_rtl, Pta *ptss,\n                              tesseract::WritingDirection writing_direction) {\n  // If writing direction 
is NOT right-to-left, handle the left-to-right case.\n  if (writing_direction != WRITING_DIRECTION_RIGHT_TO_LEFT) {\n    if (ptaGetCount(pts_rtl) != 0) {\n      ptaJoin(pts_ltr, pts_rtl, 0, -1);\n      DestroyAndCreatePta(pts_rtl);\n    }\n    ptaJoin(pts_ltr, ptss, 0, -1);\n  } else {\n    // For right-to-left, work with a copy of ptss initially.\n    PTA *ptsd = ptaCopy(ptss);\n    if (ptaGetCount(pts_rtl) != 0) {\n      ptaJoin(ptsd, pts_rtl, 0, -1);\n    }\n    ptaDestroy(&pts_rtl);\n    ptaCopy(ptsd);\n  }\n}\n\n///\n/// Convert baseline to points and add to polygon\n///\nstatic void AddBaselineToPTA(const ResultIterator *it, PageIteratorLevel level,\n                             Pta *baseline_pts) {\n  int x1, y1, x2, y2;\n\n  it->Baseline(level, &x1, &y1, &x2, &y2);\n  ptaAddPt(baseline_pts, x1, y1);\n  ptaAddPt(baseline_pts, x2, y2);\n}\n\n///\n/// Directly write baseline information as baseline points a stream\n///\nstatic void AddBaselinePtsToPAGE(Pta *baseline_pts, std::stringstream &str) {\n  int x, y, num_pts = baseline_pts->n;\n\n  str << \"<Baseline points=\\\"\";\n  for (int p = 0; p < num_pts; ++p) {\n    ptaGetIPt(baseline_pts, p, &x, &y);\n    if (p != 0) {\n      str << \" \";\n    }\n    str << std::to_string(x) << \",\" << std::to_string(y);\n  }\n  str << \"\\\"/>\\n\";\n}\n\n///\n/// Sort baseline points ascending and deleting duplicates\n///\nstatic Pta *SortBaseline(Pta *baseline_pts) {\n  int num_pts, index = 0;\n  float x0, y0, x1, y1;\n  Pta *sorted_baseline_pts;\n\n  sorted_baseline_pts =\n      ptaSort(baseline_pts, L_SORT_BY_X, L_SORT_INCREASING, nullptr);\n\n  do {\n    ptaGetPt(sorted_baseline_pts, index, &x0, &y0);\n    ptaGetPt(sorted_baseline_pts, index + 1, &x1, &y1);\n    if (x0 >= x1) {\n      sorted_baseline_pts->y[index] = std::min(y0, y1);\n      ptaRemovePt(sorted_baseline_pts, index + 1);\n    } else {\n      index++;\n    }\n    num_pts = ptaGetCount(sorted_baseline_pts);\n  } while (index < num_pts - 1);\n\n  
ptaDestroy(&baseline_pts);\n  return sorted_baseline_pts;\n}\n\n///\n/// Clip baseline to range of the exsitings polygon and simplifies the baseline\n/// linepolygon\n///\nstatic Pta *ClipAndSimplifyBaseline(Pta *bottom_pts, Pta *baseline_pts,\n                                    tesseract::WritingDirection writing_direction) {\n  int num_pts;\n  float m, b, x0, y0, x1, y1;\n  float x_min, y_min, x_max, y_max;\n  Pta *baseline_clipped_pts;\n\n  ptaGetMinMax(bottom_pts, &x_min, &y_min, &x_max, &y_max);\n  num_pts = ptaGetCount(baseline_pts);\n  baseline_clipped_pts = ptaCreate(0);\n\n  // Clip Baseline\n  for (int p = 0; p < num_pts; ++p) {\n    ptaGetPt(baseline_pts, p, &x0, &y0);\n    if (x0 < x_min) {\n      if (p + 1 < num_pts) {\n        ptaGetPt(baseline_pts, p + 1, &x1, &y1);\n        if (x1 < x_min) {\n          continue;\n        } else {\n          GetSlopeAndOffset(x0, y0, x1, y1, &m, &b);\n          y0 = int(x_min * m + b);\n          x0 = x_min;\n        }\n      }\n    } else if (x0 > x_max) {\n      if (ptaGetCount(baseline_clipped_pts) > 0 && p > 0) {\n        ptaGetPt(baseline_pts, p - 1, &x1, &y1);\n        // See comment above\n        GetSlopeAndOffset(x1, y1, x0, y0, &m, &b);\n        y0 = int(x_max * m + b);\n        x0 = x_max;\n        ptaAddPt(baseline_clipped_pts, x0, y0);\n        break;\n      }\n    }\n    ptaAddPt(baseline_clipped_pts, x0, y0);\n  }\n  if (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM) {\n    SimplifyLinePolygon(baseline_clipped_pts, 3, 0);\n  } else {\n    SimplifyLinePolygon(baseline_clipped_pts, 3, 1);\n  }\n  SimplifyLinePolygon(\n      baseline_clipped_pts, 3,\n      writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM ? 
0 : 1);\n\n  // Check the number of points in baseline_clipped_pts after processing\n  int clipped_pts_count = ptaGetCount(baseline_clipped_pts);\n\n  if (clipped_pts_count < 2) {\n    // If there's only one point in baseline_clipped_pts, duplicate it\n    ptaDestroy(&baseline_clipped_pts); // Clean up the created but unused Pta\n    baseline_clipped_pts = ptaCreate(0);\n    ptaAddPt(baseline_clipped_pts, x_min, y_min);\n    ptaAddPt(baseline_clipped_pts, x_max, y_min);\n  }\n\n  ptaDestroy(&baseline_pts);\n  return baseline_clipped_pts;\n}\n\n///\n/// Fit the baseline points into the existing polygon\n///\n#if 0 // unused\nstatic Pta *FitBaselineIntoLinePolygon(Pta *bottom_pts, Pta *baseline_pts,\n                                       tesseract::WritingDirection writing_direction) {\n  int num_pts, num_bin, x0, y0, x1, y1;\n  float m, b;\n  float x_min, y_min, x_max, y_max;\n  float delta_median, delta_median_Q1, delta_median_Q3;\n  NUMA *bin_line, *poly_bl_delta;\n  Pta *baseline_recalc_pts, *baseline_clipped_pts;\n\n  ptaGetMinMax(bottom_pts, &x_min, &y_min, &x_max, &y_max);\n  num_bin = x_max - x_min;\n  bin_line = numaCreate(num_bin + 1);\n\n  for (int p = 0; p < num_bin + 1; ++p) {\n    bin_line->array[p] = -1.;\n  }\n\n  num_pts = ptaGetCount(bottom_pts);\n  // Create an interpolated polygon with stepsize 1.\n  for (int index = 0; index < num_pts - 1; ++index) {\n    ptaGetIPt(bottom_pts, index, &x0, &y0);\n    ptaGetIPt(bottom_pts, index + 1, &x1, &y1);\n    if (x0 >= x1) {\n      continue;\n    }\n    if (y0 == y1) {\n      for (int p = x0 - x_min; p < x1 - x_min + 1; ++p) {\n        if (bin_line->array[p] == -1. || y0 > bin_line->array[p]) {\n          bin_line->array[p] = y0;\n        }\n      }\n    } else {\n      GetSlopeAndOffset(x0, y0, x1, y1, &m, &b);\n      for (int p = x0 - x_min; p < x1 - x_min + 1; ++p) {\n        if (bin_line->array[p] == -1. 
||\n            ((p + x_min) * m + b) > bin_line->array[p]) {\n          bin_line->array[p] = ((p + x_min) * m + b);\n        }\n      }\n    }\n  }\n\n  num_pts = ptaGetCount(baseline_pts);\n  baseline_clipped_pts = ptaCreate(0);\n  poly_bl_delta = numaCreate(0);\n\n  // Clip Baseline and create a set of deltas between baseline and polygon\n  for (int p = 0; p < num_pts; ++p) {\n    ptaGetIPt(baseline_pts, p, &x0, &y0);\n\n    if (x0 < x_min) {\n      ptaGetIPt(baseline_pts, p + 1, &x1, &y1);\n      if (x1 < x_min) {\n        continue;\n      } else {\n        GetSlopeAndOffset(x0, y0, x1, y1, &m, &b);\n        y0 = int(x_min * m + b);\n        x0 = x_min;\n      }\n    } else if (x0 > x_max) {\n      if (ptaGetCount(baseline_clipped_pts) > 0) {\n        ptaGetIPt(baseline_pts, p - 1, &x1, &y1);\n        GetSlopeAndOffset(x1, y1, x0, y0, &m, &b);\n        y0 = int(x_max * m + b);\n        x0 = x_max;\n        int x_val = x0 - x_min;\n        numaAddNumber(poly_bl_delta, abs(bin_line->array[x_val] - y0));\n        ptaAddPt(baseline_clipped_pts, x0, y0);\n        break;\n      }\n    }\n    int x_val = x0 - x_min;\n    numaAddNumber(poly_bl_delta, abs(bin_line->array[x_val] - y0));\n    ptaAddPt(baseline_clipped_pts, x0, y0);\n  }\n\n  ptaDestroy(&baseline_pts);\n\n  // Calculate quartiles to find outliers\n  numaGetMedian(poly_bl_delta, &delta_median);\n  numaGetRankValue(poly_bl_delta, 0.25, nullptr, 0, &delta_median_Q1);\n  numaGetRankValue(poly_bl_delta, 0.75, nullptr, 0, &delta_median_Q3);\n\n  // Fit baseline into the polygon\n  // Todo: Needs maybe some adjustments to suppress fitting to superscript\n  // glyphs\n  baseline_recalc_pts = ptaCreate(0);\n  num_pts = ptaGetCount(baseline_clipped_pts);\n  for (int p = 0; p < num_pts; ++p) {\n    ptaGetIPt(baseline_clipped_pts, p, &x0, &y0);\n    int x_val = x0 - x_min;\n    // Delete outliers with IQR\n    if (abs(y0 - bin_line->array[x_val]) >\n            1.5 * delta_median_Q3 + delta_median &&\n        p != 0 
&& p != num_pts - 1) {\n      continue;\n    }\n    if (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM) {\n      if (y0 < bin_line->array[x_val]) {\n        ptaAddPt(baseline_recalc_pts, x0, bin_line->array[x_val]);\n      } else {\n        ptaAddPt(baseline_recalc_pts, x0, y0);\n      }\n    } else {\n      if (y0 > bin_line->array[x_val]) {\n        ptaAddPt(baseline_recalc_pts, x0, bin_line->array[x_val]);\n      } else {\n        ptaAddPt(baseline_recalc_pts, x0, y0);\n      }\n    }\n  }\n  // Return recalculated baseline if this fails return the bottom line as\n  // baseline\n  numaDestroy(&bin_line);\n  ptaDestroy(&baseline_clipped_pts);\n  if (ptaGetCount(baseline_recalc_pts) < 2) {\n    ptaDestroy(&baseline_recalc_pts);\n    return ptaCopy(bottom_pts);\n  } else {\n    return baseline_recalc_pts;\n  }\n}\n#endif\n\n/// Convert writing direction to string representation\nstatic const char *WritingDirectionToStr(int wd) {\n  switch (wd) {\n    case 0:\n      return \"left-to-right\";\n    case 1:\n      return \"right-to-left\";\n    case 2:\n      return \"top-to-bottom\";\n    default:\n      return \"bottom-to-top\";\n  }\n}\n///\n/// Append the PAGE XML for the beginning of the document\n///\nbool TessPAGERenderer::BeginDocumentHandler() {\n  // Delay the XML output because we need the name of the image file.\n  begin_document = true;\n  return true;\n}\n\n///\n/// Append the PAGE XML for the layout of the image\n///\nbool TessPAGERenderer::AddImageHandler(TessBaseAPI *api) {\n  if (begin_document) {\n    AppendString(\n        \"<?xml version=\\\"1.0\\\" encoding=\\\"UTF-8\\\" standalone=\\\"yes\\\"?>\\n\"\n        \"<PcGts \"\n        \"xmlns=\\\"http://schema.primaresearch.org/PAGE/gts/pagecontent/\"\n        \"2019-07-15\\\" \"\n        \"xmlns:xsi=\\\"http://www.w3.org/2001/XMLSchema-instance\\\" \"\n        \"xsi:schemaLocation=\\\"http://schema.primaresearch.org/PAGE/gts/\"\n        \"pagecontent/2019-07-15 \"\n        
\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/\"\n        \"pagecontent.xsd\\\">\\n\"\n        \"\\t<Metadata\");\n\n    // If a URL is used to recognize an image add it as <Metadata\n    // externalRef=\"url\">\n    if (std::regex_search(api->GetInputName(),\n                          std::regex(\"^(https?|ftp|ssh):\"))) {\n      AppendString(\" externalRef=\\\"\");\n      AppendString(api->GetInputName());\n      AppendString(\"\\\" \");\n    }\n\n    AppendString(\n        \">\\n\"\n        \"\\t\\t<Creator>Tesseract - \");\n    AppendString(TESSERACT_VERSION_STR);\n    // If gmtime conversion is problematic maybe l_getFormattedDate can be used\n    // here\n    // char *datestr = l_getFormattedDate();\n    std::time_t now = std::time(nullptr);\n    std::tm *now_tm = std::gmtime(&now);\n    char mbstr[100];\n    std::strftime(mbstr, sizeof(mbstr), \"%Y-%m-%dT%H:%M:%S\", now_tm);\n    AppendString(\n        \"</Creator>\\n\"\n        \"\\t\\t<Created>\");\n    AppendString(mbstr);\n    AppendString(\"</Created>\\n\");\n    AppendString(\"\\t\\t<LastChange>\");\n    AppendString(mbstr);\n    AppendString(\n        \"</LastChange>\\n\"\n        \"\\t</Metadata>\\n\");\n    begin_document = false;\n  }\n\n  const std::unique_ptr<const char[]> text(api->GetPAGEText(imagenum()));\n  if (text == nullptr) {\n    return false;\n  }\n\n  AppendString(text.get());\n\n  return true;\n}\n\n///\n/// Append the PAGE XML for the end of the document\n///\nbool TessPAGERenderer::EndDocumentHandler() {\n  AppendString(\"</PcGts>\\n\");\n  return true;\n}\n\nTessPAGERenderer::TessPAGERenderer(const char *outputbase)\n    : TessResultRenderer(outputbase, \"page.xml\"), begin_document(false) {}\n\n///\n/// Make an XML-formatted string with PAGE markup from the internal\n/// data structures.\n///\nchar *TessBaseAPI::GetPAGEText(int page_number) {\n  return GetPAGEText(nullptr, page_number);\n}\n\n///\n/// Make an XML-formatted string with PAGE markup from the 
internal\n/// data structures.\n///\nchar *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {\n  if (tesseract_ == nullptr ||\n      (page_res_ == nullptr && Recognize(monitor) < 0)) {\n    return nullptr;\n  }\n\n  int rcnt = 0, lcnt = 0, wcnt = 0;\n\n  if (input_file_.empty()) {\n    SetInputName(nullptr);\n  }\n\n  // Used variables\n\n  std::stringstream reading_order_str;\n  std::stringstream region_content;\n  std::stringstream line_content;\n  std::stringstream word_content;\n  std::stringstream line_str;\n  std::stringstream line_inter_str;\n  std::stringstream word_str;\n  std::stringstream page_str;\n\n  float x1, y1, x2, y2;\n\n  tesseract::Orientation orientation_block = ORIENTATION_PAGE_UP;\n  tesseract::WritingDirection writing_direction_block =\n      WRITING_DIRECTION_LEFT_TO_RIGHT;\n  tesseract::TextlineOrder textline_order_block;\n\n  Pta *block_top_pts = ptaCreate(0);\n  Pta *block_bottom_pts = ptaCreate(0);\n  Pta *line_top_ltr_pts = ptaCreate(0);\n  Pta *line_bottom_ltr_pts = ptaCreate(0);\n  Pta *line_top_rtl_pts = ptaCreate(0);\n  Pta *line_bottom_rtl_pts = ptaCreate(0);\n  Pta *word_top_pts = ptaCreate(0);\n  Pta *word_bottom_pts = ptaCreate(0);\n  Pta *word_baseline_pts = ptaCreate(0);\n  Pta *line_baseline_rtl_pts = ptaCreate(0);\n  Pta *line_baseline_ltr_pts = ptaCreate(0);\n  Pta *line_baseline_pts = ptaCreate(0);\n\n  bool POLYGONFLAG;\n  GetBoolVariable(\"page_xml_polygon\", &POLYGONFLAG);\n  int LEVELFLAG;\n  GetIntVariable(\"page_xml_level\", &LEVELFLAG);\n\n  if (LEVELFLAG != 0 && LEVELFLAG != 1) {\n    tprintf(\n        \"For now, only line level and word level are available, and the level \"\n        \"is reset to line level.\\n\");\n    LEVELFLAG = 0;\n  }\n\n  // Use \"C\" locale (needed for int values larger than 999).\n  page_str.imbue(std::locale::classic());\n  reading_order_str << \"\\t<Page \" << \"imageFilename=\\\"\" << GetInputName();\n  // AppendString(api->GetInputName());\n  reading_order_str << \"\\\" 
\" << \"imageWidth=\\\"\" << rect_width_ << \"\\\" \"\n                    << \"imageHeight=\\\"\" << rect_height_ << \"\\\">\\n\";\n  std::size_t ro_id = std::hash<std::string>{}(GetInputName());\n  reading_order_str << \"\\t\\t<ReadingOrder>\\n\"\n                    << \"\\t\\t\\t<OrderedGroup id=\\\"ro\" << ro_id\n                    << \"\\\" caption=\\\"Regions reading order\\\">\\n\";\n\n  std::unique_ptr<ResultIterator> res_it(GetIterator());\n\n  float block_conf = 0;\n  float line_conf = 0;\n\n  while (!res_it->Empty(RIL_BLOCK)) {\n    if (res_it->Empty(RIL_WORD)) {\n      res_it->Next(RIL_WORD);\n      continue;\n    }\n\n    auto block_type = res_it->BlockType();\n\n    switch (block_type) {\n      case PT_FLOWING_IMAGE:\n      case PT_HEADING_IMAGE:\n      case PT_PULLOUT_IMAGE: {\n        // Handle all kinds of images.\n        page_str << \"\\t\\t<GraphicRegion id=\\\"r\" << rcnt++ << \"\\\">\\n\";\n        page_str << \"\\t\\t\\t\";\n        AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);\n        page_str << \"\\t\\t</GraphicRegion>\\n\";\n        res_it->Next(RIL_BLOCK);\n        continue;\n      }\n      case PT_HORZ_LINE:\n      case PT_VERT_LINE:\n        // Handle horizontal and vertical lines.\n        page_str << \"\\t\\t<SeparatorRegion id=\\\"r\" << rcnt++ << \"\\\">\\n\";\n        page_str << \"\\t\\t\\t\";\n        AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);\n        page_str << \"\\t\\t</SeparatorRegion>\\n\";\n        res_it->Next(RIL_BLOCK);\n        continue;\n      case PT_NOISE:\n        tprintf(\"TODO: Please report image which triggers the noise case.\\n\");\n        ASSERT_HOST(false);\n      default:\n        break;\n    }\n\n    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {\n      // Add Block to reading order\n      reading_order_str << \"\\t\\t\\t\\t<RegionRefIndexed \" << \"index=\\\"\" << rcnt\n                        << \"\\\" \" << \"regionRef=\\\"r\" << rcnt << \"\\\"/>\\n\";\n\n      float deskew_angle;\n      
res_it->Orientation(&orientation_block, &writing_direction_block,\n                          &textline_order_block, &deskew_angle);\n      block_conf = res_it->Confidence(RIL_BLOCK) / 100;\n      page_str << \"\\t\\t<TextRegion id=\\\"r\" << rcnt << \"\\\" \" << \"custom=\\\"\"\n               << \"readingOrder {index:\" << rcnt << \";} \";\n      if (writing_direction_block != WRITING_DIRECTION_LEFT_TO_RIGHT) {\n        page_str << \"readingDirection {\"\n                 << WritingDirectionToStr(writing_direction_block) << \";} \";\n      }\n      page_str << \"orientation {\" << orientation_block << \";}\\\">\\n\";\n      page_str << \"\\t\\t\\t\";\n      if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP &&\n                            orientation_block != ORIENTATION_PAGE_DOWN)) &&\n          LEVELFLAG == 0) {\n        AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);\n      }\n    }\n\n    // Writing direction changes at a per-word granularity\n    // tesseract::WritingDirection writing_direction_before;\n    auto writing_direction = writing_direction_block;\n    if (writing_direction_block != WRITING_DIRECTION_TOP_TO_BOTTOM) {\n      switch (res_it->WordDirection()) {\n        case DIR_LEFT_TO_RIGHT:\n          writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;\n          break;\n        case DIR_RIGHT_TO_LEFT:\n          writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;\n          break;\n        default:\n          break;\n      }\n    }\n\n    bool ttb_flag = (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM);\n    // TODO: Rework polygon handling if line is skewed (90 or 180 degress),\n    // for now using LinePts\n    bool skewed_flag = (orientation_block != ORIENTATION_PAGE_UP &&\n                        orientation_block != ORIENTATION_PAGE_DOWN);\n\n    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {\n      // writing_direction_before = writing_direction;\n      line_conf = res_it->Confidence(RIL_TEXTLINE) / 100;\n      char 
*utf8text = res_it->GetUTF8Text(RIL_TEXTLINE);\n      std::string textline = utf8text;\n      delete[] utf8text;\n      if (textline.back() == '\\n') {\n        textline.erase(textline.length() - 1);\n      }\n      line_content << HOcrEscape(textline.c_str());\n      line_str << \"\\t\\t\\t<TextLine id=\\\"r\" << rcnt << \"l\" << lcnt << \"\\\" \";\n      if (writing_direction != WRITING_DIRECTION_LEFT_TO_RIGHT &&\n          writing_direction != writing_direction_block) {\n        line_str << \"readingDirection=\\\"\"\n                 << WritingDirectionToStr(writing_direction) << \"\\\" \";\n      }\n      line_str << \"custom=\\\"\" << \"readingOrder {index:\" << lcnt << \";}\\\">\\n\";\n      // If level is linebased, get the line polygon and baseline\n      if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) {\n        AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts,\n                              line_bottom_ltr_pts, writing_direction);\n        AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts);\n        if (ttb_flag) {\n          line_baseline_pts = TransposePolygonline(line_baseline_pts);\n        }\n      }\n    }\n\n    // Get information if word is last in line and if its last in the region\n    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);\n    bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);\n\n    float word_conf = res_it->Confidence(RIL_WORD) / 100;\n\n    // Create word stream if word level output is active\n    if (LEVELFLAG > 0) {\n      word_str << \"\\t\\t\\t\\t<Word id=\\\"r\" << rcnt << \"l\" << lcnt << \"w\" << wcnt\n               << \"\\\" readingDirection=\\\"\"\n               << WritingDirectionToStr(writing_direction) << \"\\\" \"\n               << \"custom=\\\"\" << \"readingOrder {index:\" << wcnt << \";}\\\">\\n\";\n      if ((!POLYGONFLAG || skewed_flag) || ttb_flag) {\n        AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, 
word_bottom_pts,\n                              writing_direction);\n      }\n    }\n\n    if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) {\n      AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,\n                            writing_direction);\n    }\n\n    // Get the word baseline information\n    AddBaselineToPTA(res_it.get(), RIL_WORD, word_baseline_pts);\n\n    // Get the word text content and polygon\n    do {\n      const std::unique_ptr<const char[]> grapheme(\n          res_it->GetUTF8Text(RIL_SYMBOL));\n      if (grapheme && grapheme[0] != 0) {\n        word_content << HOcrEscape(grapheme.get()).c_str();\n        if (POLYGONFLAG && !skewed_flag && !ttb_flag) {\n          AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts,\n                                word_bottom_pts, writing_direction);\n        }\n      }\n      res_it->Next(RIL_SYMBOL);\n    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));\n\n    if (LEVELFLAG > 0 || (POLYGONFLAG && !skewed_flag)) {\n      // Sort wordpolygons\n      word_top_pts = RecalcPolygonline(word_top_pts, 1 - ttb_flag);\n      word_bottom_pts = RecalcPolygonline(word_bottom_pts, 0 + ttb_flag);\n\n      // AppendLinePolygon\n      AppendLinePolygon(line_top_ltr_pts, line_top_rtl_pts, word_top_pts,\n                        writing_direction);\n      AppendLinePolygon(line_bottom_ltr_pts, line_bottom_rtl_pts,\n                        word_bottom_pts, writing_direction);\n\n      // Word level polygon\n      word_bottom_pts = ReversePolygonline(word_bottom_pts, 1);\n      ptaJoin(word_top_pts, word_bottom_pts, 0, -1);\n    }\n\n    // Reverse the word baseline direction for rtl\n    if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {\n      word_baseline_pts = ReversePolygonline(word_baseline_pts, 1);\n    }\n\n    // Write word information to the output\n    if (LEVELFLAG > 0) {\n      word_str << \"\\t\\t\\t\\t\\t\";\n      if (ttb_flag) 
{\n        word_top_pts = TransposePolygonline(word_top_pts);\n      }\n      AddPointsToPAGE(word_top_pts, word_str);\n      word_str << \"\\t\\t\\t\\t\\t\";\n      AddBaselinePtsToPAGE(word_baseline_pts, word_str);\n      word_str << \"\\t\\t\\t\\t\\t<TextEquiv index=\\\"1\\\" conf=\\\"\"\n               << std::setprecision(4) << word_conf << \"\\\">\\n\"\n               << \"\\t\\t\\t\\t\\t\\t<Unicode>\" << word_content.str()\n               << \"</Unicode>\\n\"\n               << \"\\t\\t\\t\\t\\t</TextEquiv>\\n\"\n               << \"\\t\\t\\t\\t</Word>\\n\";\n    }\n    if (LEVELFLAG > 0 || (POLYGONFLAG && !skewed_flag)) {\n      // Add wordbaseline to linebaseline\n      if (ttb_flag) {\n        word_baseline_pts = TransposePolygonline(word_baseline_pts);\n      }\n      ptaJoin(line_baseline_pts, word_baseline_pts, 0, -1);\n    }\n    word_baseline_pts = DestroyAndCreatePta(word_baseline_pts);\n\n    // Reset word pts arrays\n    word_top_pts = DestroyAndCreatePta(word_top_pts);\n    word_bottom_pts = DestroyAndCreatePta(word_bottom_pts);\n\n    // Check why this combination of words is not working as expected!\n    // Write the word contents to the line\n#if 0\n    if (!last_word_in_line && writing_direction_before != writing_direction &&\n        writing_direction < 2 && writing_direction_before < 2 &&\n        res_it->WordDirection()) {\n      if (writing_direction_before == WRITING_DIRECTION_LEFT_TO_RIGHT) {\n        // line_content << \"‏\" << word_content.str();\n      } else {\n        // line_content << \"‎\" << word_content.str();\n      }\n    } else {\n      // line_content << word_content.str();\n    }\n    // Check if WordIsNeutral\n    if (res_it->WordDirection()) {\n      writing_direction_before = writing_direction;\n    }\n#endif\n    word_content.str(\"\");\n    wcnt++;\n\n    // Write line information to the output\n    if (last_word_in_line) {\n      // Combine ltr and rtl lines\n      if (ptaGetCount(line_top_rtl_pts) != 0) {\n        
ptaJoin(line_top_ltr_pts, line_top_rtl_pts, 0, -1);\n        line_top_rtl_pts = DestroyAndCreatePta(line_top_rtl_pts);\n      }\n      if (ptaGetCount(line_bottom_rtl_pts) != 0) {\n        ptaJoin(line_bottom_ltr_pts, line_bottom_rtl_pts, 0, -1);\n        line_bottom_rtl_pts = DestroyAndCreatePta(line_bottom_rtl_pts);\n      }\n      if ((POLYGONFLAG && !skewed_flag) || LEVELFLAG > 0) {\n        // Recalc Polygonlines\n        line_top_ltr_pts = RecalcPolygonline(line_top_ltr_pts, 1 - ttb_flag);\n        line_bottom_ltr_pts =\n            RecalcPolygonline(line_bottom_ltr_pts, 0 + ttb_flag);\n\n        // Smooth the polygonline\n        SimplifyLinePolygon(line_top_ltr_pts, 5, 1 - ttb_flag);\n        SimplifyLinePolygon(line_bottom_ltr_pts, 5, 0 + ttb_flag);\n\n        // Fit linepolygon matching the baselinepoints\n        line_baseline_pts = SortBaseline(line_baseline_pts);\n\n        // Fitting baseline into polygon is currently deactivated because\n        // it tends to push the baseline directly under superscripts,\n        // but the baseline is always inside the polygon maybe it will be useful\n        // for something line_baseline_pts =\n        // FitBaselineIntoLinePolygon(line_bottom_ltr_pts, line_baseline_pts,\n        // writing_direction); and it only cut it to the length and simplifies\n        // the linepolyon\n        line_baseline_pts = ClipAndSimplifyBaseline(\n            line_bottom_ltr_pts, line_baseline_pts, writing_direction);\n\n        // Update polygon of the block\n        UpdateBlockPoints(block_top_pts, block_bottom_pts, line_top_ltr_pts,\n                          line_bottom_ltr_pts, lcnt, last_word_in_cblock);\n      }\n      // Line level polygon\n      line_bottom_ltr_pts = ReversePolygonline(line_bottom_ltr_pts, 1);\n      ptaJoin(line_top_ltr_pts, line_bottom_ltr_pts, 0, -1);\n      line_bottom_ltr_pts = DestroyAndCreatePta(line_bottom_ltr_pts);\n\n      if (LEVELFLAG > 0 && !(POLYGONFLAG && !skewed_flag)) {\n        
line_top_ltr_pts = PolygonToBoxCoords(line_top_ltr_pts);\n      }\n\n      // Write level points\n      line_str << \"\\t\\t\\t\\t\";\n      if (ttb_flag) {\n        line_top_ltr_pts = TransposePolygonline(line_top_ltr_pts);\n      }\n      AddPointsToPAGE(line_top_ltr_pts, line_str);\n      line_top_ltr_pts = DestroyAndCreatePta(line_top_ltr_pts);\n\n      // Write Baseline\n      line_str << \"\\t\\t\\t\\t\";\n      if (ttb_flag) {\n        line_baseline_pts = TransposePolygonline(line_baseline_pts);\n      }\n      AddBaselinePtsToPAGE(line_baseline_pts, line_str);\n      line_baseline_pts = DestroyAndCreatePta(line_baseline_pts);\n\n      // Add word information if word level output is active\n      line_str << word_str.str();\n      word_str.str(\"\");\n      // Write Line TextEquiv\n      line_str << \"\\t\\t\\t\\t<TextEquiv index=\\\"1\\\" conf=\\\"\"\n               << std::setprecision(4) << line_conf << \"\\\">\\n\"\n               << \"\\t\\t\\t\\t\\t<Unicode>\" << line_content.str() << \"</Unicode>\\n\"\n               << \"\\t\\t\\t\\t</TextEquiv>\\n\";\n      line_str << \"\\t\\t\\t</TextLine>\\n\";\n      region_content << line_content.str();\n      line_content.str(\"\");\n      if (!last_word_in_cblock) {\n        region_content << '\\n';\n      }\n      lcnt++;\n      wcnt = 0;\n    }\n\n    // Write region information to the output\n    if (last_word_in_cblock) {\n      if ((POLYGONFLAG && !skewed_flag) || LEVELFLAG > 0) {\n        page_str << \"<Coords points=\\\"\";\n        block_bottom_pts = ReversePolygonline(block_bottom_pts, 1);\n        ptaJoin(block_top_pts, block_bottom_pts, 0, -1);\n        if (ttb_flag) {\n          block_top_pts = TransposePolygonline(block_top_pts);\n        }\n        ptaGetMinMax(block_top_pts, &x1, &y1, &x2, &y2);\n        page_str << static_cast<uint32_t>(x1) << \",\" << static_cast<uint32_t>(y1) << ' '\n                 << static_cast<uint32_t>(x2) << \",\" << static_cast<uint32_t>(y1) << ' '\n                 
<< static_cast<uint32_t>(x2) << \",\" << static_cast<uint32_t>(y2) << ' '\n                 << static_cast<uint32_t>(x1) << \",\" << static_cast<uint32_t>(y2)\n                 << \"\\\"/>\\n\";\n        block_top_pts = DestroyAndCreatePta(block_top_pts);\n        block_bottom_pts = DestroyAndCreatePta(block_bottom_pts);\n      }\n      page_str << line_str.str();\n      line_str.str(\"\");\n      page_str << \"\\t\\t\\t<TextEquiv index=\\\"1\\\" conf=\\\"\" << std::setprecision(4)\n               << block_conf << \"\\\">\\n\"\n               << \"\\t\\t\\t\\t<Unicode>\" << region_content.str() << \"</Unicode>\\n\"\n               << \"\\t\\t\\t</TextEquiv>\\n\";\n      page_str << \"\\t\\t</TextRegion>\\n\";\n      region_content.str(\"\");\n      rcnt++;\n      lcnt = 0;\n    }\n  }\n\n  // Destroy all point information\n  ptaDestroy(&block_top_pts);\n  ptaDestroy(&block_bottom_pts);\n  ptaDestroy(&line_top_ltr_pts);\n  ptaDestroy(&line_bottom_ltr_pts);\n  ptaDestroy(&line_top_rtl_pts);\n  ptaDestroy(&line_bottom_rtl_pts);\n  ptaDestroy(&word_top_pts);\n  ptaDestroy(&word_bottom_pts);\n  ptaDestroy(&word_baseline_pts);\n  ptaDestroy(&line_baseline_rtl_pts);\n  ptaDestroy(&line_baseline_ltr_pts);\n  ptaDestroy(&line_baseline_pts);\n\n  reading_order_str << \"\\t\\t\\t</OrderedGroup>\\n\"\n                    << \"\\t\\t</ReadingOrder>\\n\";\n\n  reading_order_str << page_str.str();\n  page_str.str(\"\");\n  reading_order_str << \"\\t</Page>\\n\";\n  const std::string &text = reading_order_str.str();\n  reading_order_str.str(\"\");\n\n  return copy_string(text);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/api/pdf_ttf.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        pdf_ttf.h\n// Description: pdf.ttf (GlyphLessFont) replacement.\n//              Generated with: \"bin2cpp pdf.ttf pdf_ttf cpp17\"\n// Author:      Zdenko Podobny\n//\n// (C) Copyright 2020, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef pdf_ttf__H\n#define pdf_ttf__H\n\n#include <cstdint> // uint8_t\n\nstatic const uint8_t pdf_ttf[] = {\n    0x0,  0x1,  0x0,  0x0,  0x0,  0xa,  0x0,  0x80, 0x0,  0x3,  0x0,  0x20, 0x4f, 0x53, 0x2f, 0x32,\n    0x56, 0xde, 0xc8, 0x94, 0x0,  0x0,  0x1,  0x28, 0x0,  0x0,  0x0,  0x60, 0x63, 0x6d, 0x61, 0x70,\n    0x0,  0xa,  0x0,  0x34, 0x0,  0x0,  0x1,  0x90, 0x0,  0x0,  0x0,  0x1e, 0x67, 0x6c, 0x79, 0x66,\n    0x15, 0x22, 0x41, 0x24, 0x0,  0x0,  0x1,  0xb8, 0x0,  0x0,  0x0,  0x18, 0x68, 0x65, 0x61, 0x64,\n    0xb,  0x78, 0xf1, 0x65, 0x0,  0x0,  0x0,  0xac, 0x0,  0x0,  0x0,  0x36, 0x68, 0x68, 0x65, 0x61,\n    0xc,  0x2,  0x4,  0x2,  0x0,  0x0,  0x0,  0xe4, 0x0,  0x0,  0x0,  0x24, 0x68, 0x6d, 0x74, 0x78,\n    0x4,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x88, 0x0,  0x0,  0x0,  0x8,  0x6c, 0x6f, 0x63, 0x61,\n    0x0,  0xc,  0x0,  0x0,  0x0,  0x0,  0x1,  0xb0, 0x0,  0x0,  0x0,  0x6,  0x6d, 0x61, 0x78, 0x70,\n    0x0,  0x4,  0x0,  0x5,  0x0,  0x0,  0x1,  0x8,  0x0,  0x0,  0x0,  0x20, 0x6e, 0x61, 0x6d, 0x65,\n    0xf2, 0xeb, 0x16, 0xda, 0x0,  
0x0,  0x1,  0xd0, 0x0,  0x0,  0x0,  0x4b, 0x70, 0x6f, 0x73, 0x74,\n    0x0,  0x1,  0x0,  0x1,  0x0,  0x0,  0x2,  0x1c, 0x0,  0x0,  0x0,  0x20, 0x0,  0x1,  0x0,  0x0,\n    0x0,  0x1,  0x0,  0x0,  0xb0, 0x94, 0x71, 0x10, 0x5f, 0xf,  0x3c, 0xf5, 0x4,  0x7,  0x8,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0xcf, 0x9a, 0xfc, 0x6e, 0x0,  0x0,  0x0,  0x0,  0xd4, 0xc3, 0xa7, 0xf2,\n    0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x8,  0x0,  0x0,  0x0,  0x0,  0x10, 0x0,  0x2,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x8,  0x0,  0xff, 0xff, 0x0,  0x0,  0x4,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x2,  0x0,  0x1,  0x0,  0x0,  0x0,  0x2,  0x0,  0x4,\n    0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x3,  0x0,  0x0,  0x1,  0x90, 0x0,  0x5,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x5,  0x0,  0x1,  0x0,  0x1,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x47, 0x4f, 0x4f, 0x47, 0x0,  0x40, 0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0xff, 0xff,\n    0x0,  0x0,  0x0,  0x1,  0x0,  0x1,  0x80, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x2,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x14, 0x0,  0x3,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x14, 0x0,  0x6,  0x0,  0xa,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0xc,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x4,  0x0,\n    0x8,  0x0,  0x0,  0x3,  0x0,  0x0,  0x31, 0x21, 0x11, 0x21, 0x4,  0x0,  0xfc, 0x0,  0x8,  0x0,\n    0x0,  0x0,  0x0,  0x3,  
0x0,  0x2a, 0x0,  0x0,  0x0,  0x3,  0x0,  0x0,  0x0,  0x5,  0x0,  0x16,\n    0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x5,  0x0,  0xb,  0x0,  0x16, 0x0,  0x3,\n    0x0,  0x1,  0x4,  0x9,  0x0,  0x5,  0x0,  0x16, 0x0,  0x0,  0x0,  0x56, 0x0,  0x65, 0x0,  0x72,\n    0x0,  0x73, 0x0,  0x69, 0x0,  0x6f, 0x0,  0x6e, 0x0,  0x20, 0x0,  0x31, 0x0,  0x2e, 0x0,  0x30,\n    0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x30, 0x0,  0x0,  0x1,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,\n    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0};\n\n#endif\n"
  },
  {
    "path": "src/api/pdfrenderer.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        pdfrenderer.cpp\n// Description: PDF rendering interface to inject into TessBaseAPI\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"pdf_ttf.h\"\n#include \"tprintf.h\"\n#include \"helpers.h\" // for Swap, copy_string\n\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include <tesseract/publictypes.h> // for PTIsTextType()\n#include <tesseract/renderer.h>\n#include <cmath>\n#include <cstring>\n#include <fstream>   // for std::ifstream\n#include <locale>    // for std::locale::classic\n#include <memory>    // std::unique_ptr\n#include <sstream>   // for std::stringstream\n#include <string_view>\n\nusing namespace std::literals;\n\n#ifndef NDEBUG\n#define DEBUG_PDF\n#endif\n#ifdef DEBUG_PDF\n#define NO_PDF_COMPRESSION\n#endif\n\n/*\n\nDesign notes from Ken Sharp, with light editing.\n\nWe think one solution is a font with a single glyph (.notdef) and a\nCIDToGIDMap which maps all the CIDs to 0. That map would then be\nstored as a stream in the PDF file, and when flat compressed should\nbe pretty small. 
The font, of course, will be approximately the same\nsize as the one you currently use.\n\nI'm working on such a font now, the CIDToGIDMap is trivial, you just\ncreate a stream object which contains 128k bytes (2 bytes per possible\nCID and your CIDs range from 0 to 65535) and where you currently have\n\"/CIDToGIDMap /Identity\" you would have \"/CIDToGIDMap <object> 0 R\".\n\nNote that if, in future, you were to use a different (ie not 2 byte)\nCMap for character codes you could trivially extend the CIDToGIDMap.\n\nThe following is an explanation of how some of the font stuff works,\nthis may be too simple for you in which case please accept my\napologies, its hard to know how much knowledge someone has. You can\nskip all this anyway, its just for information.\n\nThe font embedded in a PDF file is usually intended just to be\nrendered, but extensions allow for at least some ability to locate (or\ncopy) text from a document. This isn't something which was an original\ngoal of the PDF format, but its been retro-fitted, presumably due to\npopular demand.\n\nTo do this reliably the PDF file must contain a ToUnicode CMap, a\ndevice for mapping character codes to Unicode code points. If one of\nthese is present, then this will be used to convert the character\ncodes into Unicode values. If its not present then the reader will\nfall back through a series of heuristics to try and guess the\nresult. 
This is, as you would expect, prone to failure.\n\nThis doesn't concern you of course, since you always write a ToUnicode\nCMap, so because you are writing the text in text rendering mode 3 it\nwould seem that you don't really need to worry about this, but in the\nPDF spec you cannot have an isolated ToUnicode CMap, it has to be\nattached to a font, so in order to get even copy/paste to work you\nneed to define a font.\n\nThis is what leads to problems, tools like pdfwrite assume that they\nare going to be able to (or even have to) modify the font entries, so\nthey require that the font being embedded be valid, and to be honest\nthe font Tesseract embeds isn't valid (for this purpose).\n\n\nTo see why lets look at how text is specified in a PDF file:\n\n(Test) Tj\n\nNow that looks like text but actually it isn't. Each of those bytes is\na 'character code'. When it comes to rendering the text a complex\nsequence of events takes place, which converts the character code into\n'something' which the font understands. Its entirely possible via\ncharacter mappings to have that text render as 'Sftu'\n\nFor simple fonts (PostScript type 1), we use the character code as the\nindex into an Encoding array (256 elements), each element of which is\na glyph name, so this gives us a glyph name. We then consult the\nCharStrings dictionary in the font, that's a complex object which\ncontains pairs of keys and values, you can use the key to retrieve a\ngiven value. So we have a glyph name, we then use that as the key to\nthe dictionary and retrieve the associated value. For a type 1 font,\nthe value is a glyph program that describes how to draw the glyph.\n\nFor CIDFonts, its a little more complicated. Because CIDFonts can be\nlarge, using a glyph name as the key is unreasonable (it would also\nlead to unfeasibly large Encoding arrays), so instead we use a 'CID'\nas the key. CIDs are just numbers.\n\nBut.... We don't use the character code as the CID. 
What we do is use\na CMap to convert the character code into a CID. We then use the CID\nto key the CharStrings dictionary and proceed as before. So the 'CMap'\nis the equivalent of the Encoding array, but its a more compact and\nflexible representation.\n\nNote that you have to use the CMap just to find out how many bytes\nconstitute a character code, and it can be variable. For example you\ncan say if the first byte is 0x00->0x7f then its just one byte, if its\n0x80->0xf0 then its 2 bytes and if its 0xf0->0xff then its 3 bytes. I\nhave seen CMaps defining character codes up to 5 bytes wide.\n\nNow that's fine for 'PostScript' CIDFonts, but its not sufficient for\nTrueType CIDFonts. The thing is that TrueType fonts are accessed using\na Glyph ID (GID) (and the LOCA table) which may well not be anything\nlike the CID. So for this case PDF includes a CIDToGIDMap. That maps\nthe CIDs to GIDs, and we can then use the GID to get the glyph\ndescription from the GLYF table of the font.\n\nSo for a TrueType CIDFont, character-code->CID->GID->glyf-program.\n\nLooking at the PDF file I was supplied with we see that it contains\ntext like :\n\n<0x0075> Tj\n\nSo we start by taking the character code (117) and look it up in the\nCMap. Well you don't supply a CMap, you just use the Identity-H one\nwhich is predefined. So character code 117 maps to CID 117. Then we\nuse the CIDToGIDMap, again you don't supply one, you just use the\npredefined 'Identity' map. So CID 117 maps to GID 117. But the font we\nwere supplied with only contains 116 glyphs.\n\nNow for Latin that's not a huge problem, you can just supply a bigger\nfont. But for more complex languages that *is* going to be more of a\nproblem. Either you need to supply a font which contains glyphs for\nall the possible CID->GID mappings, or we need to think laterally.\n\nOur solution using a TrueType CIDFont is to intervene at the\nCIDToGIDMap stage and convert all the CIDs to GID 0. 
Then we have a\nfont with just one glyph, the .notdef glyph at GID 0. This is what I'm\nlooking into now.\n\nIt would also be possible to have a 'PostScript' (ie type 1 outlines)\nCIDFont which contained 1 glyph, and a CMap which mapped all character\ncodes to CID 0. The effect would be the same.\n\nIts possible (I haven't checked) that the PostScript CIDFont and\nassociated CMap would be smaller than the TrueType font and associated\nCIDToGIDMap.\n\n--- in a followup ---\n\nOK there is a small problem there, if I use GID 0 then Acrobat gets\nupset about it and complains it cannot extract the font. If I set the\nCIDToGIDMap so that all the entries are 1 instead, it's happy. Totally\nmad......\n\n*/\n\nnamespace tesseract {\n\n// If the font is 10 pts, nominal character width is 5 pts\nstatic const int kCharWidth = 2;\n\n// Used for memory allocation. A codepoint must take no more than this\n// many bytes, when written in the PDF way. e.g. \"<0063>\" for the\n// letter 'c'\nstatic const int kMaxBytesPerCodepoint = 20;\n\n/**********************************************************************\n * PDF Renderer interface implementation\n **********************************************************************/\nTessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)\n    : TessResultRenderer(outputbase, \"pdf\"), datadir_(datadir) {\n  obj_ = 0;\n  textonly_ = textonly;\n  offsets_.push_back(0);\n}\n\nvoid TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) {\n  offsets_.push_back(objectsize + offsets_.back());\n  obj_++;\n}\n\nvoid TessPDFRenderer::AppendPDFObject(const char *data) {\n  AppendPDFObjectDIY(strlen(data));\n  AppendString(data);\n}\n\n// Helper function to prevent us from accidentally writing\n// scientific notation to an HOCR or PDF file. 
Besides, three\n// decimal points are all you really need.\nstatic double prec(double x) {\n  double kPrecision = 1000.0;\n  double a = round(x * kPrecision) / kPrecision;\n  if (a == -0) {\n    return 0;\n  }\n  return a;\n}\n\nstatic long dist2(int x1, int y1, int x2, int y2) {\n  return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);\n}\n\n// Viewers like evince can get really confused during copy-paste when\n// the baseline wanders around. So I've decided to project every word\n// onto the (straight) line baseline. All numbers are in the native\n// PDF coordinate system, which has the origin in the bottom left and\n// the unit is points, which is 1/72 inch. Tesseract reports baselines\n// left-to-right no matter what the reading order is. We need the\n// word baseline in reading order, so we do that conversion here. Returns\n// the word's baseline origin and length.\nstatic void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1,\n                            int word_x2, int word_y2, int line_x1, int line_y1, int line_x2,\n                            int line_y2, double *x0, double *y0, double *length) {\n  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {\n    std::swap(word_x1, word_x2);\n    std::swap(word_y1, word_y2);\n  }\n  double word_length;\n  double x, y;\n  {\n    double l2 = dist2(line_x1, line_y1, line_x2, line_y2);\n    if (l2 == 0) {\n      x = line_x1;\n      y = line_y1;\n    } else {\n      int px = word_x1;\n      int py = word_y1;\n      double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2;\n      x = line_x2 + t * (line_x2 - line_x1);\n      y = line_y2 + t * (line_y2 - line_y1);\n    }\n    word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2)));\n    word_length = word_length * 72.0 / ppi;\n    x = x * 72 / ppi;\n    y = height - (y * 72.0 / ppi);\n  }\n  *x0 = x;\n  *y0 = y;\n  *length = word_length;\n}\n\n// Compute 
coefficients for an affine matrix describing the rotation\n// of the text. If the text is right-to-left such as Arabic or Hebrew,\n// we reflect over the Y-axis. This matrix will set the coordinate\n// system for placing text in the PDF file.\n//\n//                           RTL\n// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]\n// [ y' ]   [ c d ][ y ]   [ 0 1 ] [-sin cos ][ y ]\nstatic void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2,\n                         double *a, double *b, double *c, double *d) {\n  double theta =\n      atan2(static_cast<double>(line_y1 - line_y2), static_cast<double>(line_x2 - line_x1));\n  *a = cos(theta);\n  *b = sin(theta);\n  *c = -sin(theta);\n  *d = cos(theta);\n  switch (writing_direction) {\n    case WRITING_DIRECTION_RIGHT_TO_LEFT:\n      *a = -*a;\n      *b = -*b;\n      break;\n    case WRITING_DIRECTION_TOP_TO_BOTTOM:\n      // TODO(jbreiden) Consider using the vertical PDF writing mode.\n      break;\n    default:\n      break;\n  }\n}\n\n// There are some really awkward PDF viewers in the wild, such as\n// 'Preview' which ships with the Mac. They do a better job with text\n// selection and highlighting when given perfectly flat baseline\n// instead of very slightly tilted. We clip small tilts to appease\n// these viewers. 
I chose this threshold large enough to absorb noise,\n// but small enough that lines probably won't cross each other if the\n// whole page is tilted at almost exactly the clipping threshold.\nstatic void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1,\n                         int *line_x2, int *line_y2) {\n  *line_x1 = x1;\n  *line_y1 = y1;\n  *line_x2 = x2;\n  *line_y2 = y2;\n  int rise = abs(y2 - y1) * 72;\n  int run = abs(x2 - x1) * 72;\n  if (rise < 2 * ppi && 2 * ppi < run) {\n    *line_y1 = *line_y2 = (y1 + y2) / 2;\n  }\n}\n\nstatic bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {\n  if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {\n    tprintf(\"Dropping invalid codepoint %d\\n\", code);\n    return false;\n  }\n  if (code < 0x10000) {\n    snprintf(utf16, kMaxBytesPerCodepoint, \"%04X\", code);\n  } else {\n    int a = code - 0x010000;\n    int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;\n    int low_surrogate = (0x03FF & a) + 0xDC00;\n    snprintf(utf16, kMaxBytesPerCodepoint, \"%04X%04X\", high_surrogate, low_surrogate);\n  }\n  return true;\n}\n\nchar *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {\n  double ppi = api->GetSourceYResolution();\n\n  // These initial conditions are all arbitrary and will be overwritten\n  double old_x = 0.0, old_y = 0.0;\n  int old_fontsize = 0;\n  tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;\n  bool new_block = true;\n  int fontsize = 0;\n  double a = 1;\n  double b = 0;\n  double c = 0;\n  double d = 1;\n\n  std::stringstream pdf_str;\n  // Use \"C\" locale (needed for double values prec()).\n  pdf_str.imbue(std::locale::classic());\n  // Use 8 digits for double values.\n  pdf_str.precision(8);\n\n  // TODO(jbreiden) This marries the text and image together.\n  // Slightly cleaner from an abstraction standpoint if this were to\n  // live inside a separate text object.\n  
pdf_str << \"q \" << prec(width) << \" 0 0 \" << prec(height) << \" 0 0 cm\";\n  if (!textonly_) {\n    pdf_str << \" /Im1 Do\";\n  }\n  pdf_str << \" Q\\n\";\n\n  int line_x1 = 0;\n  int line_y1 = 0;\n  int line_x2 = 0;\n  int line_y2 = 0;\n\n  const std::unique_ptr</*non-const*/ ResultIterator> res_it(api->GetIterator());\n  while (!res_it->Empty(RIL_BLOCK)) {\n    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {\n      auto block_type = res_it->BlockType();\n      if (!PTIsTextType(block_type)) {\n        // ignore non-text blocks\n        res_it->Next(RIL_BLOCK);\n        continue;\n      }\n      pdf_str << \"BT\\n3 Tr\"; // Begin text object, use invisible ink\n      old_fontsize = 0;      // Every block will declare its fontsize\n      new_block = true;      // Every block will declare its affine matrix\n    }\n\n    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {\n      int x1, y1, x2, y2;\n      res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);\n      ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);\n    }\n\n    if (res_it->Empty(RIL_WORD)) {\n      res_it->Next(RIL_WORD);\n      continue;\n    }\n\n    // Writing direction changes at a per-word granularity\n    tesseract::WritingDirection writing_direction;\n    {\n      tesseract::Orientation orientation;\n      tesseract::TextlineOrder textline_order;\n      float deskew_angle;\n      res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);\n      if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {\n        switch (res_it->WordDirection()) {\n          case DIR_LEFT_TO_RIGHT:\n            writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;\n            break;\n          case DIR_RIGHT_TO_LEFT:\n            writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;\n            break;\n          default:\n            writing_direction = old_writing_direction;\n        }\n      }\n    }\n\n    // Where is word origin and how long is it?\n    double x, y, 
word_length;\n    {\n      int word_x1, word_y1, word_x2, word_y2;\n      res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);\n      GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,\n                      line_y1, line_x2, line_y2, &x, &y, &word_length);\n    }\n\n    if (writing_direction != old_writing_direction || new_block) {\n      AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);\n      pdf_str << \" \" << prec(a) // . This affine matrix\n              << \" \" << prec(b) // . sets the coordinate\n              << \" \" << prec(c) // . system for all\n              << \" \" << prec(d) // . text that follows.\n              << \" \" << prec(x) // .\n              << \" \" << prec(y) // .\n              << (\" Tm \");      // Place cursor absolutely\n      new_block = false;\n    } else {\n      double dx = x - old_x;\n      double dy = y - old_y;\n      pdf_str << \" \" << prec(dx * a + dy * b) << \" \" << prec(dx * c + dy * d)\n              << (\" Td \"); // Relative moveto\n    }\n    old_x = x;\n    old_y = y;\n    old_writing_direction = writing_direction;\n\n    // Adjust font size on a per word granularity. Pay attention to\n    // fontsize, old_fontsize, and pdf_str. 
We've found that for\n    // in Arabic, Tesseract will happily return a fontsize of zero,\n    // so we make up a default number to protect ourselves.\n    {\n      bool bold, italic, underlined, monospace, serif, smallcaps;\n      int font_id;\n      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,\n                                 &fontsize, &font_id);\n      const int kDefaultFontsize = 8;\n      if (fontsize <= 0) {\n        fontsize = kDefaultFontsize;\n      }\n      if (fontsize != old_fontsize) {\n        pdf_str << \"/f-0-0 \" << fontsize << \" Tf \";\n        old_fontsize = fontsize;\n#ifdef DEBUG_PDF\n        pdf_str << \"\\n\";\n#endif\n      }\n    }\n\n    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);\n    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);\n    std::string pdf_word;\n    int pdf_word_len = 0;\n    do {\n      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));\n      if (grapheme && grapheme[0] != '\\0') {\n        std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());\n        char utf16[kMaxBytesPerCodepoint];\n        for (char32 code : unicodes) {\n          if (CodepointToUtf16be(code, utf16)) {\n            pdf_word += utf16;\n            pdf_word_len++;\n          }\n        }\n      }\n      res_it->Next(RIL_SYMBOL);\n    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));\n    if (res_it->IsAtBeginningOf(RIL_WORD)) {\n      pdf_word += \"0020\";\n    }\n    if (word_length > 0 && pdf_word_len > 0) {\n      double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));\n      pdf_str << h_stretch << \" Tz\"; // horizontal stretch\n      pdf_str\n          << \" [ <\" << pdf_word // UTF-16BE representation\n          << \"> ] TJ\";          // show the text\n#ifdef DEBUG_PDF\n      pdf_str << \"\\n\";\n#endif\n    }\n    if (last_word_in_line) {\n      
pdf_str << \" \\n\";\n    }\n    if (last_word_in_block) {\n      pdf_str << \"ET\\n\"; // end the text object\n    }\n  }\n  return copy_string(pdf_str.str());\n}\n\nbool TessPDFRenderer::BeginDocumentHandler() {\n  AppendPDFObject(\"%PDF-1.5\\n%\\xDE\\xAD\\xBE\\xEB\\n\");\n\n  // CATALOG\n  AppendPDFObject(\n      \"1 0 obj\\n\"\n      \"<<\\n\"\n      \"  /Type /Catalog\\n\"\n      \"  /Pages 2 0 R\\n\"\n      \">>\\nendobj\\n\");\n\n  // We are reserving object #2 for the /Pages\n  // object, which I am going to create and write\n  // at the end of the PDF file.\n  AppendPDFObject(\"\");\n\n  // TYPE0 FONT\n  AppendPDFObject(\n      \"3 0 obj\\n\"\n      \"<<\\n\"\n      \"  /BaseFont /GlyphLessFont\\n\"\n      \"  /DescendantFonts [ 4 0 R ]\\n\" // CIDFontType2 font\n      \"  /Encoding /Identity-H\\n\"\n      \"  /Subtype /Type0\\n\"\n      \"  /ToUnicode 6 0 R\\n\" // ToUnicode\n      \"  /Type /Font\\n\"\n      \">>\\n\"\n      \"endobj\\n\");\n\n  // CIDFONTTYPE2\n  std::stringstream stream;\n  // Use \"C\" locale (needed for int values larger than 999).\n  stream.imbue(std::locale::classic());\n  stream << \"4 0 obj\\n\"\n            \"<<\\n\"\n            \"  /BaseFont /GlyphLessFont\\n\"\n            \"  /CIDToGIDMap 5 0 R\\n\" // CIDToGIDMap\n            \"  /CIDSystemInfo\\n\"\n            \"  <<\\n\"\n            \"     /Ordering (Identity)\\n\"\n            \"     /Registry (Adobe)\\n\"\n            \"     /Supplement 0\\n\"\n            \"  >>\\n\"\n            \"  /FontDescriptor 7 0 R\\n\" // Font descriptor\n            \"  /Subtype /CIDFontType2\\n\"\n            \"  /Type /Font\\n\"\n            \"  /DW \"\n         << (1000 / kCharWidth)\n         << \"\\n\"\n            \">>\\n\"\n            \"endobj\\n\";\n  AppendPDFObject(stream.str().c_str());\n\n  // CIDTOGIDMAP\n  const int kCIDToGIDMapSize = 2 * (1 << 16);\n  const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);\n  for (int i = 0; i < 
kCIDToGIDMapSize; i++) {\n    cidtogidmap[i] = (i % 2) ? 1 : 0;\n  }\n  size_t len = kCIDToGIDMapSize;\n#ifndef NO_PDF_COMPRESSION\n  auto comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);\n#endif\n  stream.str(\"\");\n  stream << \"5 0 obj\\n\"\n            \"<<\\n\"\n            \"  /Length \"\n         << len\n         << \"\"\n#ifndef NO_PDF_COMPRESSION\n            \" /Filter /FlateDecode\"\n#endif\n            \"\\n\"\n            \">>\\n\"\n            \"stream\\n\"\n            ;\n  AppendString(stream.str().c_str());\n  long objsize = stream.str().size();\n#ifndef NO_PDF_COMPRESSION\n  AppendData(reinterpret_cast<char *>(comp), len);\n#else\n  AppendData(reinterpret_cast<char *>(cidtogidmap.get()), len);\n#endif\n  objsize += len;\n#ifndef NO_PDF_COMPRESSION\n  lept_free(comp);\n#endif\n  objsize += AppendData(\"endstream\\n\"sv);\n  objsize += AppendData(\"endobj\\n\"sv);\n  AppendPDFObjectDIY(objsize);\n\n  const char stream2[] =\n      \"/CIDInit /ProcSet findresource begin\\n\"\n      \"12 dict begin\\n\"\n      \"begincmap\\n\"\n      \"/CIDSystemInfo\\n\"\n      \"<<\\n\"\n      \"  /Registry (Adobe)\\n\"\n      \"  /Ordering (UCS)\\n\"\n      \"  /Supplement 0\\n\"\n      \">> def\\n\"\n      \"/CMapName /Adobe-Identify-UCS def\\n\"\n      \"/CMapType 2 def\\n\"\n      \"1 begincodespacerange\\n\"\n      \"<0000> <FFFF>\\n\"\n      \"endcodespacerange\\n\"\n      \"1 beginbfrange\\n\"\n      \"<0000> <FFFF> <0000>\\n\"\n      \"endbfrange\\n\"\n      \"endcmap\\n\"\n      \"CMapName currentdict /CMap defineresource pop\\n\"\n      \"end\\n\"\n      \"end\\n\";\n\n  // TOUNICODE\n  stream.str(\"\");\n  stream << \"6 0 obj\\n\"\n            \"<< /Length \"\n         << (sizeof(stream2) - 1)\n         << \" >>\\n\"\n            \"stream\\n\"\n         << stream2\n         << \"endstream\\n\"\n            \"endobj\\n\";\n  AppendPDFObject(stream.str().c_str());\n\n  // FONT DESCRIPTOR\n  stream.str(\"\");\n  stream << \"7 0 obj\\n\"\n      
      \"<<\\n\"\n            \"  /Ascent 1000\\n\"\n            \"  /CapHeight 1000\\n\"\n            \"  /Descent -1\\n\" // Spec says must be negative\n            \"  /Flags 5\\n\"    // FixedPitch + Symbolic\n            \"  /FontBBox  [ 0 0 \"\n         << (1000 / kCharWidth)\n         << \" 1000 ]\\n\"\n            \"  /FontFile2 8 0 R\\n\"\n            \"  /FontName /GlyphLessFont\\n\"\n            \"  /ItalicAngle 0\\n\"\n            \"  /StemV 80\\n\"\n            \"  /Type /FontDescriptor\\n\"\n            \">>\\n\"\n            \"endobj\\n\";\n  AppendPDFObject(stream.str().c_str());\n\n  stream.str(\"\");\n  stream << datadir_.c_str() << \"/pdf.ttf\";\n  const uint8_t *font;\n  std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);\n  std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});\n  auto size = buffer.size();\n  if (size) {\n    font = buffer.data();\n  } else {\n#if !defined(NDEBUG)\n    tprintf(\"Cannot open file \\\"%s\\\"!\\nUsing internal glyphless font.\\n\", stream.str().c_str());\n#endif\n    font = pdf_ttf;\n    size = sizeof(pdf_ttf);\n  }\n\n  // FONTFILE2\n  stream.str(\"\");\n  stream << \"8 0 obj\\n\"\n            \"<<\\n\"\n            \"  /Length \"\n         << size\n         << \"\\n\"\n            \"  /Length1 \"\n         << size\n         << \"\\n\"\n            \">>\\n\"\n            \"stream\\n\";\n  AppendString(stream.str().c_str());\n  objsize = stream.str().size();\n  AppendData(reinterpret_cast<const char *>(font), size);\n  objsize += size;\n  objsize += AppendData(\"endstream\\n\"sv);\n  objsize += AppendData(\"endobj\\n\"sv);\n  AppendPDFObjectDIY(objsize);\n  return true;\n}\n\nbool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int objnum,\n                                    char **pdf_object, long int *pdf_object_size,\n                                    const int jpg_quality) {\n  if (!pdf_object_size || !pdf_object) {\n    return false;\n  }\n 
 *pdf_object = nullptr;\n  *pdf_object_size = 0;\n  if (!filename && !pix) {\n    return false;\n  }\n\n  L_Compressed_Data *cid = nullptr;\n  auto sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);\n\n  if (sad || !cid) {\n    l_CIDataDestroy(&cid);\n    return false;\n  }\n\n  const char *group4 = \"\";\n  const char *filter;\n  switch (cid->type) {\n    case L_FLATE_ENCODE:\n      filter = \"/FlateDecode\";\n      break;\n    case L_JPEG_ENCODE:\n      filter = \"/DCTDecode\";\n      break;\n    case L_G4_ENCODE:\n      filter = \"/CCITTFaxDecode\";\n      group4 = \"    /K -1\\n\";\n      break;\n    case L_JP2K_ENCODE:\n      filter = \"/JPXDecode\";\n      break;\n    default:\n      l_CIDataDestroy(&cid);\n      return false;\n  }\n\n  // Maybe someday we will accept RGBA but today is not that day.\n  // It requires creating an /SMask for the alpha channel.\n  // http://stackoverflow.com/questions/14220221\n  std::stringstream colorspace;\n  // Use \"C\" locale (needed for int values larger than 999).\n  colorspace.imbue(std::locale::classic());\n  if (cid->ncolors > 0) {\n    colorspace << \"  /ColorSpace [ /Indexed /DeviceRGB \" << (cid->ncolors - 1) << \" \"\n               << cid->cmapdatahex << \" ]\\n\";\n  } else {\n    switch (cid->spp) {\n      case 1:\n        if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {\n          colorspace.str(\n              \"  /ColorSpace /DeviceGray\\n\"\n              \"  /Decode [1 0]\\n\");\n        } else {\n          colorspace.str(\"  /ColorSpace /DeviceGray\\n\");\n        }\n        break;\n      case 3:\n        colorspace.str(\"  /ColorSpace /DeviceRGB\\n\");\n        break;\n      default:\n        l_CIDataDestroy(&cid);\n        return false;\n    }\n  }\n\n  int predictor = (cid->predictor) ? 
14 : 1;\n\n  // IMAGE\n  std::stringstream b1;\n  // Use \"C\" locale (needed for int values larger than 999).\n  b1.imbue(std::locale::classic());\n  b1 << objnum\n     << \" 0 obj\\n\"\n        \"<<\\n\"\n        \"  /Length \"\n     << cid->nbytescomp\n     << \"\\n\"\n        \"  /Subtype /Image\\n\";\n\n  std::stringstream b2;\n  // Use \"C\" locale (needed for int values larger than 999).\n  b2.imbue(std::locale::classic());\n  b2 << \"  /Width \" << cid->w\n     << \"\\n\"\n        \"  /Height \"\n     << cid->h\n     << \"\\n\"\n        \"  /BitsPerComponent \"\n     << cid->bps\n     << \"\\n\"\n        \"  /Filter \"\n     << filter\n     << \"\\n\"\n        \"  /DecodeParms\\n\"\n        \"  <<\\n\"\n        \"    /Predictor \"\n     << predictor\n     << \"\\n\"\n        \"    /Colors \"\n     << cid->spp << \"\\n\"\n     << group4 << \"    /Columns \" << cid->w\n     << \"\\n\"\n        \"    /BitsPerComponent \"\n     << cid->bps\n     << \"\\n\"\n        \"  >>\\n\"\n        \">>\\n\"\n        \"stream\\n\";\n\n  const char *b3 =\n      \"endstream\\n\"\n      \"endobj\\n\";\n\n  size_t b1_len = b1.str().size();\n  size_t b2_len = b2.str().size();\n  size_t b3_len = strlen(b3);\n  size_t colorspace_len = colorspace.str().size();\n\n  *pdf_object_size = b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;\n  *pdf_object = new char[*pdf_object_size];\n\n  char *p = *pdf_object;\n  memcpy(p, b1.str().c_str(), b1_len);\n  p += b1_len;\n  memcpy(p, colorspace.str().c_str(), colorspace_len);\n  p += colorspace_len;\n  memcpy(p, b2.str().c_str(), b2_len);\n  p += b2_len;\n  memcpy(p, cid->datacomp, cid->nbytescomp);\n  p += cid->nbytescomp;\n  memcpy(p, b3, b3_len);\n  l_CIDataDestroy(&cid);\n  return true;\n}\n\nbool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {\n  Pix *pix = api->GetInputImage();\n  const char *filename = api->GetInputName();\n  int ppi = api->GetSourceYResolution();\n  if (!pix || ppi <= 0) {\n    return false;\n  }\n  
double width = pixGetWidth(pix) * 72.0 / ppi;\n  double height = pixGetHeight(pix) * 72.0 / ppi;\n\n  std::stringstream xobject;\n  // Use \"C\" locale (needed for int values larger than 999).\n  xobject.imbue(std::locale::classic());\n  if (!textonly_) {\n    xobject << \"/XObject << /Im1 \" << (obj_ + 2) << \" 0 R >>\\n\";\n  }\n\n  // PAGE\n  std::stringstream stream;\n  // Use \"C\" locale (needed for double values width and height).\n  stream.imbue(std::locale::classic());\n  stream.precision(2);\n  stream << std::fixed << obj_\n         << \" 0 obj\\n\"\n            \"<<\\n\"\n            \"  /Type /Page\\n\"\n            \"  /Parent 2 0 R\\n\" // Pages object\n            \"  /MediaBox [0 0 \"\n         << width << \" \" << height\n         << \"]\\n\"\n            \"  /Contents \"\n         << (obj_ + 1)\n         << \" 0 R\\n\" // Contents object\n            \"  /Resources\\n\"\n            \"  <<\\n\"\n            \"    \"\n         << xobject.str() << // Image object\n      \"    /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\\n\"\n      \"    /Font << /f-0-0 3 0 R >>\\n\" // Type0 Font\n      \"  >>\\n\"\n      \">>\\n\"\n      \"endobj\\n\";\n  pages_.push_back(obj_);\n  AppendPDFObject(stream.str().c_str());\n\n  // CONTENTS\n  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));\n  const size_t pdftext_len = strlen(pdftext.get());\n  size_t len = pdftext_len;\n#ifndef NO_PDF_COMPRESSION\n  auto comp_pdftext = zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);\n#endif\n  stream.str(\"\");\n  stream << obj_\n         << \" 0 obj\\n\"\n            \"<<\\n\"\n            \"  /Length \"\n         << len\n         << \"\"\n#ifndef NO_PDF_COMPRESSION\n            \" /Filter /FlateDecode\"\n#endif\n            \"\\n\"\n            \">>\\n\"\n            \"stream\\n\"\n            ;\n  AppendString(stream.str().c_str());\n  long objsize = stream.str().size();\n#ifndef NO_PDF_COMPRESSION\n  
AppendData(reinterpret_cast<char *>(comp_pdftext), len);\n#else\n  AppendData(reinterpret_cast<char *>(pdftext.get()), len);\n#endif\n  objsize += len;\n#ifndef NO_PDF_COMPRESSION\n  lept_free(comp_pdftext);\n#endif\n  objsize += AppendData(\"endstream\\n\"sv);\n  objsize += AppendData(\"endobj\\n\"sv);\n  AppendPDFObjectDIY(objsize);\n\n  if (!textonly_) {\n    char *pdf_object = nullptr;\n    int jpg_quality;\n    api->GetIntVariable(\"jpg_quality\", &jpg_quality);\n    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {\n      return false;\n    }\n    AppendData(pdf_object, objsize);\n    AppendPDFObjectDIY(objsize);\n    delete[] pdf_object;\n  }\n  return true;\n}\n\nbool TessPDFRenderer::EndDocumentHandler() {\n  // We reserved the /Pages object number early, so that the /Page\n  // objects could refer to their parent. We finally have enough\n  // information to go fill it in. Using lower level calls to manipulate\n  // the offset record in two spots, because we are placing objects\n  // out of order in the file.\n\n  // PAGES\n  const long int kPagesObjectNumber = 2;\n  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1\n  std::stringstream stream;\n  // Use \"C\" locale (needed for int values larger than 999).\n  stream.imbue(std::locale::classic());\n  stream << kPagesObjectNumber << \" 0 obj\\n<<\\n  /Type /Pages\\n  /Kids [ \";\n  AppendString(stream.str().c_str());\n  size_t pages_objsize = stream.str().size();\n  for (const auto &page : pages_) {\n    stream.str(\"\");\n    stream << page << \" 0 R \";\n    AppendString(stream.str().c_str());\n    pages_objsize += stream.str().size();\n  }\n  stream.str(\"\");\n  stream << \"]\\n  /Count \" << pages_.size() << \"\\n>>\\nendobj\\n\";\n  AppendString(stream.str().c_str());\n  pages_objsize += stream.str().size();\n  offsets_.back() += pages_objsize; // manipulation #2\n\n  // INFO\n  std::string utf16_title = \"FEFF\"; // byte_order_marker\n  
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());\n  char utf16[kMaxBytesPerCodepoint];\n  for (char32 code : unicodes) {\n    if (CodepointToUtf16be(code, utf16)) {\n      utf16_title += utf16;\n    }\n  }\n\n  char *datestr = l_getFormattedDate();\n  stream.str(\"\");\n  stream << obj_\n         << \" 0 obj\\n\"\n            \"<<\\n\"\n            \"  /Producer (Tesseract \"\n         << tesseract::TessBaseAPI::Version()\n         << \")\\n\"\n            \"  /CreationDate (D:\"\n         << datestr\n         << \")\\n\"\n            \"  /Title <\"\n         << utf16_title.c_str()\n         << \">\\n\"\n            \">>\\n\"\n            \"endobj\\n\";\n  lept_free(datestr);\n  AppendPDFObject(stream.str().c_str());\n  stream.str(\"\");\n  stream << \"xref\\n0 \" << obj_ << \"\\n0000000000 65535 f \\n\";\n  AppendString(stream.str().c_str());\n  for (int i = 1; i < obj_; i++) {\n    stream.str(\"\");\n    stream.width(10);\n    stream.fill('0');\n    stream << offsets_[i] << \" 00000 n \\n\";\n    AppendString(stream.str().c_str());\n  }\n  stream.str(\"\");\n  stream << \"trailer\\n<<\\n  /Size \" << obj_\n         << \"\\n\"\n            \"  /Root 1 0 R\\n\" // catalog\n            \"  /Info \"\n         << (obj_ - 1)\n         << \" 0 R\\n\" // info\n            \">>\\nstartxref\\n\"\n         << offsets_.back() << \"\\n%%EOF\\n\";\n  AppendString(stream.str().c_str());\n  return true;\n}\n} // namespace tesseract\n"
  },
  {
    "path": "src/api/renderer.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        renderer.cpp\n// Description: Rendering interface to inject into TessBaseAPI\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n#include <tesseract/baseapi.h>\n#include <tesseract/renderer.h>\n#include <cstring>\n#include <memory>     // std::unique_ptr\n#include <string>     // std::string\n#include \"serialis.h\" // Serialize\n\nnamespace tesseract {\n\n/**********************************************************************\n * Base Renderer interface implementation\n **********************************************************************/\nTessResultRenderer::TessResultRenderer(const char *outputbase, const char *extension)\n    : next_(nullptr)\n    , fout_(stdout)\n    , file_extension_(extension)\n    , title_(\"\")\n    , imagenum_(-1)\n    , happy_(true) {\n  if (strcmp(outputbase, \"-\") && strcmp(outputbase, \"stdout\")) {\n    std::string outfile = std::string(outputbase) + \".\" + extension;\n    fout_ = fopen(outfile.c_str(), \"wb\");\n    if (fout_ == nullptr) {\n      happy_ = false;\n    }\n  }\n}\n\nTessResultRenderer::~TessResultRenderer() {\n  if (fout_ != nullptr) {\n    if (fout_ != stdout) {\n      fclose(fout_);\n    } else {\n      clearerr(fout_);\n    }\n  }\n  
delete next_;\n}\n\nvoid TessResultRenderer::insert(TessResultRenderer *next) {\n  if (next == nullptr) {\n    return;\n  }\n\n  TessResultRenderer *remainder = next_;\n  next_ = next;\n  if (remainder) {\n    while (next->next_ != nullptr) {\n      next = next->next_;\n    }\n    next->next_ = remainder;\n  }\n}\n\nbool TessResultRenderer::BeginDocument(const char *title) {\n  if (!happy_) {\n    return false;\n  }\n  title_ = title;\n  imagenum_ = -1;\n  bool ok = BeginDocumentHandler();\n  if (next_) {\n    ok = next_->BeginDocument(title) && ok;\n  }\n  return ok;\n}\n\nbool TessResultRenderer::AddImage(TessBaseAPI *api) {\n  if (!happy_) {\n    return false;\n  }\n  ++imagenum_;\n  bool ok = AddImageHandler(api);\n  if (next_) {\n    ok = next_->AddImage(api) && ok;\n  }\n  return ok;\n}\n\nbool TessResultRenderer::EndDocument() {\n  if (!happy_) {\n    return false;\n  }\n  bool ok = EndDocumentHandler();\n  if (next_) {\n    ok = next_->EndDocument() && ok;\n  }\n  return ok;\n}\n\nvoid TessResultRenderer::AppendString(const char *s) {\n  if (s == nullptr) {\n    return;\n  }\n  AppendData(s, strlen(s));\n}\n\nvoid TessResultRenderer::AppendData(const char *s, int len) {\n  if (!tesseract::Serialize(fout_, s, len)) {\n    happy_ = false;\n  }\n  fflush(fout_);\n}\n\nbool TessResultRenderer::BeginDocumentHandler() {\n  return happy_;\n}\n\nbool TessResultRenderer::EndDocumentHandler() {\n  return happy_;\n}\n\n/**********************************************************************\n * UTF8 Text Renderer interface implementation\n **********************************************************************/\nTessTextRenderer::TessTextRenderer(const char *outputbase)\n    : TessResultRenderer(outputbase, \"txt\") {}\n\nbool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> utf8(api->GetUTF8Text());\n  if (utf8 == nullptr) {\n    return false;\n  }\n\n  const char *pageSeparator = api->GetStringVariable(\"page_separator\");\n 
 if (pageSeparator != nullptr && *pageSeparator != '\\0' && imagenum() > 0) {\n    AppendString(pageSeparator);\n  }\n\n  AppendString(utf8.get());\n\n  return true;\n}\n\n/**********************************************************************\n * TSV Text Renderer interface implementation\n **********************************************************************/\nTessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, \"tsv\") {\n  font_info_ = false;\n}\n\nTessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)\n    : TessResultRenderer(outputbase, \"tsv\") {\n  font_info_ = font_info;\n}\n\nbool TessTsvRenderer::BeginDocumentHandler() {\n  // Output TSV column headings\n  AppendString(\n      \"level\\tpage_num\\tblock_num\\tpar_num\\tline_num\\tword_\"\n      \"num\\tleft\\ttop\\twidth\\theight\\tconf\\ttext\\n\");\n  return true;\n}\n\nbool TessTsvRenderer::EndDocumentHandler() {\n  return true;\n}\n\nbool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum()));\n  if (tsv == nullptr) {\n    return false;\n  }\n\n  AppendString(tsv.get());\n\n  return true;\n}\n\n/**********************************************************************\n * UNLV Text Renderer interface implementation\n **********************************************************************/\nTessUnlvRenderer::TessUnlvRenderer(const char *outputbase)\n    : TessResultRenderer(outputbase, \"unlv\") {}\n\nbool TessUnlvRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> unlv(api->GetUNLVText());\n  if (unlv == nullptr) {\n    return false;\n  }\n\n  AppendString(unlv.get());\n\n  return true;\n}\n\n/**********************************************************************\n * BoxText Renderer interface implementation\n **********************************************************************/\nTessBoxTextRenderer::TessBoxTextRenderer(const char 
*outputbase)\n    : TessResultRenderer(outputbase, \"box\") {}\n\nbool TessBoxTextRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> text(api->GetBoxText(imagenum()));\n  if (text == nullptr) {\n    return false;\n  }\n\n  AppendString(text.get());\n\n  return true;\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n/**********************************************************************\n * Osd Text Renderer interface implementation\n **********************************************************************/\nTessOsdRenderer::TessOsdRenderer(const char *outputbase) : TessResultRenderer(outputbase, \"osd\") {}\n\nbool TessOsdRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> osd(api->GetOsdText(imagenum()));\n  if (osd == nullptr) {\n    return false;\n  }\n\n  AppendString(osd.get());\n\n  return true;\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/api/wordstrboxrenderer.cpp",
    "content": "/**********************************************************************\n * File:        wordstrboxrenderer.cpp\n * Description: Renderer for creating box file with WordStr strings.\n *              based on the tsv renderer.\n *\n * (C) Copyright 2019, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include <tesseract/baseapi.h> // for TessBaseAPI\n#include <tesseract/renderer.h>\n#include \"helpers.h\"        // for copy_string\n#include \"tesseractclass.h\" // for Tesseract\n\nnamespace tesseract {\n\n/**\n * Create a UTF8 box file with WordStr strings from the internal data\n * structures. page_number is a 0-base page index that will appear in the box\n * file. 
Returned string must be freed with the delete [] operator.\n */\n\nchar *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {\n  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {\n    return nullptr;\n  }\n\n  std::string wordstr_box_str;\n  int left = 0, top = 0, right = 0, bottom = 0;\n\n  bool first_line = true;\n\n  LTRResultIterator *res_it = GetLTRIterator();\n  while (!res_it->Empty(RIL_BLOCK)) {\n    if (res_it->Empty(RIL_WORD)) {\n      res_it->Next(RIL_WORD);\n      continue;\n    }\n\n    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {\n      if (!first_line) {\n        wordstr_box_str += \"\\n\\t \" + std::to_string(right + 1);\n        wordstr_box_str += \" \" + std::to_string(image_height_ - bottom);\n        wordstr_box_str += \" \" + std::to_string(right + 5);\n        wordstr_box_str += \" \" + std::to_string(image_height_ - top);\n        wordstr_box_str += \" \" + std::to_string(page_number); // row for tab for EOL\n        wordstr_box_str += \"\\n\";\n      } else {\n        first_line = false;\n      }\n      // Use bounding box for whole line for WordStr\n      res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);\n      wordstr_box_str += \"WordStr \" + std::to_string(left);\n      wordstr_box_str += \" \" + std::to_string(image_height_ - bottom);\n      wordstr_box_str += \" \" + std::to_string(right);\n      wordstr_box_str += \" \" + std::to_string(image_height_ - top);\n      wordstr_box_str += \" \" + std::to_string(page_number); // word\n      wordstr_box_str += \" #\";\n    }\n    do {\n      wordstr_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();\n      wordstr_box_str += \" \";\n      res_it->Next(RIL_WORD);\n    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));\n  }\n\n  if (left != 0 && top != 0 && right != 0 && bottom != 0) {\n    wordstr_box_str += \"\\n\\t \" + std::to_string(right + 1);\n    wordstr_box_str += \" \" + 
std::to_string(image_height_ - bottom);\n    wordstr_box_str += \" \" + std::to_string(right + 5);\n    wordstr_box_str += \" \" + std::to_string(image_height_ - top);\n    wordstr_box_str += \" \" + std::to_string(page_number); // row for tab for EOL\n    wordstr_box_str += \"\\n\";\n  }\n  delete res_it;\n  return copy_string(wordstr_box_str);\n}\n\n/**********************************************************************\n * WordStrBox Renderer interface implementation\n **********************************************************************/\nTessWordStrBoxRenderer::TessWordStrBoxRenderer(const char *outputbase)\n    : TessResultRenderer(outputbase, \"box\") {}\n\nbool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI *api) {\n  const std::unique_ptr<const char[]> wordstrbox(api->GetWordStrBoxText(imagenum()));\n  if (wordstrbox == nullptr) {\n    return false;\n  }\n\n  AppendString(wordstrbox.get());\n\n  return true;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/arch/dotproduct.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dotproduct.cpp\n// Description: Native dot product function.\n//\n// (C) Copyright 2018, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"dotproduct.h\"\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the two n-vectors u and v.\nTFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {\n  TFloat total = 0;\n#if defined(OPENMP_SIMD) || defined(_OPENMP)\n#pragma omp simd reduction(+:total)\n#endif\n  for (int k = 0; k < n; k++) {\n    total += u[k] * v[k];\n  }\n  return total;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/arch/dotproduct.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dotproduct.h\n// Description: Native dot product function.\n//\n// (C) Copyright 2018, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_ARCH_DOTPRODUCT_H_\n#define TESSERACT_ARCH_DOTPRODUCT_H_\n\n#include \"tesstypes.h\"\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the n-vectors u and v.\nTFloat DotProductNative(const TFloat *u, const TFloat *v, int n);\n\n// Uses Intel AVX intrinsics to access the SIMD instruction set.\nTFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);\n\n// Uses Intel AVX512F intrinsics to access the SIMD instruction set.\nTFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n);\n\n// Use Intel FMA.\nTFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);\n\n// Uses Intel SSE intrinsics to access the SIMD instruction set.\nTFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);\n\n// Use NEON intrinsics.\nTFloat DotProductNEON(const TFloat *u, const TFloat *v, int n);\n\n} // namespace tesseract.\n\n#endif // TESSERACT_ARCH_DOTPRODUCT_H_\n"
  },
  {
    "path": "src/arch/dotproductavx.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dotproductavx.cpp\n// Description: Architecture-specific dot-product function.\n// Author:      Ray Smith\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#if !defined(__AVX__)\n#  if defined(__i686__) || defined(__x86_64__)\n#    error Implementation only for AVX capable architectures\n#  endif\n#else\n\n#  include <immintrin.h>\n#  include <cstdint>\n#  include \"dotproduct.h\"\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the n-vectors u and v.\n// Uses Intel AVX intrinsics to access the SIMD instruction set.\n#if defined(FAST_FLOAT)\nfloat DotProductAVX(const float *u, const float *v, int n) {\n  const unsigned quot = n / 8;\n  const unsigned rem = n % 8;\n  __m256 t0 = _mm256_setzero_ps();\n  for (unsigned k = 0; k < quot; k++) {\n    __m256 f0 = _mm256_loadu_ps(u);\n    __m256 f1 = _mm256_loadu_ps(v);\n    f0 = _mm256_mul_ps(f0, f1);\n    t0 = _mm256_add_ps(t0, f0);\n    u += 8;\n    v += 8;\n  }\n  alignas(32) float tmp[8];\n  _mm256_store_ps(tmp, t0);\n  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];\n  for (unsigned k = 0; k < rem; k++) {\n    result += *u++ * *v++;\n  }\n  return result;\n}\n#else\ndouble DotProductAVX(const double *u, const double *v, int n) {\n  const 
unsigned quot = n / 8;\n  const unsigned rem = n % 8;\n  __m256d t0 = _mm256_setzero_pd();\n  __m256d t1 = _mm256_setzero_pd();\n  for (unsigned k = 0; k < quot; k++) {\n    __m256d f0 = _mm256_loadu_pd(u);\n    __m256d f1 = _mm256_loadu_pd(v);\n    f0 = _mm256_mul_pd(f0, f1);\n    t0 = _mm256_add_pd(t0, f0);\n    u += 4;\n    v += 4;\n    __m256d f2 = _mm256_loadu_pd(u);\n    __m256d f3 = _mm256_loadu_pd(v);\n    f2 = _mm256_mul_pd(f2, f3);\n    t1 = _mm256_add_pd(t1, f2);\n    u += 4;\n    v += 4;\n  }\n  t0 = _mm256_hadd_pd(t0, t1);\n  alignas(32) double tmp[4];\n  _mm256_store_pd(tmp, t0);\n  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];\n  for (unsigned k = 0; k < rem; k++) {\n    result += *u++ * *v++;\n  }\n  return result;\n}\n#endif\n\n} // namespace tesseract.\n\n#endif\n"
  },
  {
    "path": "src/arch/dotproductavx512.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dotproductavx512.cpp\n// Description: Architecture-specific dot-product function.\n// Author:      Stefan Weil\n//\n// (C) Copyright 2022\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#if !defined(__AVX__)\n#  if defined(__i686__) || defined(__x86_64__)\n#    error Implementation only for AVX capable architectures\n#  endif\n#else\n\n#  include <immintrin.h>\n#  include <cstdint>\n#  include \"dotproduct.h\"\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the n-vectors u and v.\n// Uses Intel AVX intrinsics to access the SIMD instruction set.\n#  if defined(FAST_FLOAT)\nfloat DotProductAVX512F(const float *u, const float *v, int n) {\n  const unsigned quot = n / 16;\n  const unsigned rem = n % 16;\n  __m512 t0 = _mm512_setzero_ps();\n  for (unsigned k = 0; k < quot; k++) {\n    __m512 f0 = _mm512_loadu_ps(u);\n    __m512 f1 = _mm512_loadu_ps(v);\n    t0 = _mm512_fmadd_ps(f0, f1, t0);\n    u += 16;\n    v += 16;\n  }\n  float result = _mm512_reduce_add_ps(t0);\n  for (unsigned k = 0; k < rem; k++) {\n    result += *u++ * *v++;\n  }\n  return result;\n}\n#  else\ndouble DotProductAVX512F(const double *u, const double *v, int n) {\n  const unsigned quot = n / 8;\n  const unsigned rem = n % 8;\n  __m512d t0 = _mm512_setzero_pd();\n  for (unsigned k = 0; k < quot; 
k++) {\n    t0 = _mm512_fmadd_pd(_mm512_loadu_pd(u), _mm512_loadu_pd(v), t0);\n    u += 8;\n    v += 8;\n  }\n  double result = _mm512_reduce_add_pd(t0);\n  for (unsigned k = 0; k < rem; k++) {\n    result += *u++ * *v++;\n  }\n  return result;\n}\n#  endif\n\n} // namespace tesseract.\n\n#endif\n"
  },
  {
    "path": "src/arch/dotproductfma.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dotproductfma.cpp\n// Description: Architecture-specific dot-product function.\n// Author:      Stefan Weil\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#if !defined(__FMA__)\n#  if defined(__i686__) || defined(__x86_64__)\n#    error Implementation only for FMA capable architectures\n#  endif\n#else\n\n#  include <immintrin.h>\n#  include <cstdint>\n#  include \"dotproduct.h\"\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the n-vectors u and v.\n// Uses Intel FMA intrinsics to access the SIMD instruction set.\n#if defined(FAST_FLOAT)\nfloat DotProductFMA(const float *u, const float *v, int n) {\n  const unsigned quot = n / 16;\n  const unsigned rem = n % 16;\n  __m256 t0 = _mm256_setzero_ps();\n  __m256 t1 = _mm256_setzero_ps();\n  for (unsigned k = 0; k < quot; k++) {\n    __m256 f0 = _mm256_loadu_ps(u);\n    __m256 f1 = _mm256_loadu_ps(v);\n    t0 = _mm256_fmadd_ps(f0, f1, t0);\n    u += 8;\n    v += 8;\n    __m256 f2 = _mm256_loadu_ps(u);\n    __m256 f3 = _mm256_loadu_ps(v);\n    t1 = _mm256_fmadd_ps(f2, f3, t1);\n    u += 8;\n    v += 8;\n  }\n  t0 = _mm256_hadd_ps(t0, t1);\n  alignas(32) float tmp[8];\n  _mm256_store_ps(tmp, t0);\n  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + 
tmp[7];\n  for (unsigned k = 0; k < rem; k++) {\n    result += *u++ * *v++;\n  }\n  return result;\n}\n#else\ndouble DotProductFMA(const double *u, const double *v, int n) {\n  const unsigned quot = n / 8;\n  const unsigned rem = n % 8;\n  __m256d t0 = _mm256_setzero_pd();\n  __m256d t1 = _mm256_setzero_pd();\n  for (unsigned k = 0; k < quot; k++) {\n    __m256d f0 = _mm256_loadu_pd(u);\n    __m256d f1 = _mm256_loadu_pd(v);\n    t0 = _mm256_fmadd_pd(f0, f1, t0);\n    u += 4;\n    v += 4;\n    __m256d f2 = _mm256_loadu_pd(u);\n    __m256d f3 = _mm256_loadu_pd(v);\n    t1 = _mm256_fmadd_pd(f2, f3, t1);\n    u += 4;\n    v += 4;\n  }\n  t0 = _mm256_hadd_pd(t0, t1);\n  alignas(32) double tmp[4];\n  _mm256_store_pd(tmp, t0);\n  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];\n  for (unsigned k = 0; k < rem; k++) {\n    result += *u++ * *v++;\n  }\n  return result;\n}\n#endif\n\n} // namespace tesseract.\n\n#endif\n"
  },
  {
    "path": "src/arch/dotproductneon.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dotproductneon.cpp\n// Description: Dot product function for ARM NEON.\n// Author:      Stefan Weil\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#if defined(__ARM_NEON)\n\n#include <arm_neon.h>\n#include \"dotproduct.h\"\n\nnamespace tesseract {\n\n// Documentation:\n// https://developer.arm.com/architectures/instruction-sets/intrinsics/\n\n#if defined(FAST_FLOAT) && defined(__ARM_ARCH_ISA_A64)\n\nfloat DotProductNEON(const float *u, const float *v, int n) {\n  float32x4_t result0123 = vdupq_n_f32(0.0f);\n  float32x4_t result4567 = vdupq_n_f32(0.0f);\n  while (n > 7) {\n    // Calculate 8 dot products per iteration.\n    float32x4_t u0 = vld1q_f32(u);\n    float32x4_t v0 = vld1q_f32(v);\n    float32x4_t u4 = vld1q_f32(u + 4);\n    float32x4_t v4 = vld1q_f32(v + 4);\n    result0123 = vfmaq_f32(result0123, u0, v0);\n    result4567 = vfmaq_f32(result4567, u4, v4);\n    u += 8;\n    v += 8;\n    n -= 8;\n  }\n  float total = vaddvq_f32(result0123);\n  total += vaddvq_f32(result4567);\n  while (n > 0) {\n    total += *u++ * *v++;\n    n--;\n  }\n  return total;\n}\n\n#else\n\n// Computes and returns the dot product of the two n-vectors u and v.\nTFloat DotProductNEON(const TFloat *u, const TFloat *v, int n) {\n  TFloat total = 0;\n#if defined(OPENMP_SIMD) || defined(_OPENMP)\n#pragma 
omp simd reduction(+:total)\n#endif\n  for (int k = 0; k < n; k++) {\n    total += u[k] * v[k];\n  }\n  return total;\n}\n\n#endif\n\n} // namespace tesseract\n\n#endif /* __ARM_NEON */\n"
  },
  {
    "path": "src/arch/dotproductsse.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dotproductsse.cpp\n// Description: Architecture-specific dot-product function.\n// Author:      Ray Smith\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#if !defined(__SSE4_1__)\n#  if defined(__i686__) || defined(__x86_64__)\n#    error Implementation only for SSE 4.1 capable architectures\n#  endif\n#else\n\n#  include <emmintrin.h>\n#  include <smmintrin.h>\n#  include <cstdint>\n#  include \"dotproduct.h\"\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the n-vectors u and v.\n// Uses Intel SSE intrinsics to access the SIMD instruction set.\n#if defined(FAST_FLOAT)\nfloat DotProductSSE(const float *u, const float *v, int n) {\n  int max_offset = n - 4;\n  int offset = 0;\n  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and\n  // v, and multiplying them together in parallel.\n  __m128 sum = _mm_setzero_ps();\n  if (offset <= max_offset) {\n    offset = 4;\n    // Aligned load is reputedly faster but requires 16 byte aligned input.\n    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&\n        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {\n      // Use aligned load.\n      sum = _mm_load_ps(u);\n      __m128 floats2 = _mm_load_ps(v);\n      // Multiply.\n      sum = _mm_mul_ps(sum, floats2);\n  
    while (offset <= max_offset) {\n        __m128 floats1 = _mm_load_ps(u + offset);\n        floats2 = _mm_load_ps(v + offset);\n        floats1 = _mm_mul_ps(floats1, floats2);\n        sum = _mm_add_ps(sum, floats1);\n        offset += 4;\n      }\n    } else {\n      // Use unaligned load.\n      sum = _mm_loadu_ps(u);\n      __m128 floats2 = _mm_loadu_ps(v);\n      // Multiply.\n      sum = _mm_mul_ps(sum, floats2);\n      while (offset <= max_offset) {\n        __m128 floats1 = _mm_loadu_ps(u + offset);\n        floats2 = _mm_loadu_ps(v + offset);\n        floats1 = _mm_mul_ps(floats1, floats2);\n        sum = _mm_add_ps(sum, floats1);\n        offset += 4;\n      }\n    }\n  }\n  // Add the 4 sums in sum horizontally.\n#if 0\n  alignas(32) float tmp[4];\n  _mm_store_ps(tmp, sum);\n  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];\n#else\n  __m128 zero = _mm_setzero_ps();\n  // https://www.felixcloutier.com/x86/haddps\n  sum = _mm_hadd_ps(sum, zero);\n  sum = _mm_hadd_ps(sum, zero);\n  // Extract the low result.\n  float result = _mm_cvtss_f32(sum);\n#endif\n  // Add on any left-over products.\n  while (offset < n) {\n    result += u[offset] * v[offset];\n    ++offset;\n  }\n  return result;\n}\n#else\ndouble DotProductSSE(const double *u, const double *v, int n) {\n  int max_offset = n - 2;\n  int offset = 0;\n  // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and\n  // v, and multiplying them together in parallel.\n  __m128d sum = _mm_setzero_pd();\n  if (offset <= max_offset) {\n    offset = 2;\n    // Aligned load is reputedly faster but requires 16 byte aligned input.\n    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&\n        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {\n      // Use aligned load.\n      sum = _mm_load_pd(u);\n      __m128d floats2 = _mm_load_pd(v);\n      // Multiply.\n      sum = _mm_mul_pd(sum, floats2);\n      while (offset <= max_offset) {\n        __m128d floats1 = _mm_load_pd(u + offset);\n        
floats2 = _mm_load_pd(v + offset);\n        offset += 2;\n        floats1 = _mm_mul_pd(floats1, floats2);\n        sum = _mm_add_pd(sum, floats1);\n      }\n    } else {\n      // Use unaligned load.\n      sum = _mm_loadu_pd(u);\n      __m128d floats2 = _mm_loadu_pd(v);\n      // Multiply.\n      sum = _mm_mul_pd(sum, floats2);\n      while (offset <= max_offset) {\n        __m128d floats1 = _mm_loadu_pd(u + offset);\n        floats2 = _mm_loadu_pd(v + offset);\n        offset += 2;\n        floats1 = _mm_mul_pd(floats1, floats2);\n        sum = _mm_add_pd(sum, floats1);\n      }\n    }\n  }\n  // Add the 2 sums in sum horizontally.\n  sum = _mm_hadd_pd(sum, sum);\n  // Extract the low result.\n  double result = _mm_cvtsd_f64(sum);\n  // Add on any left-over products.\n  while (offset < n) {\n    result += u[offset] * v[offset];\n    ++offset;\n  }\n  return result;\n}\n#endif\n\n} // namespace tesseract.\n\n#endif\n"
  },
  {
    "path": "src/arch/intsimdmatrix.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        intsimdmatrix.cpp\n// Description: Base class for 8-bit int SIMD matrix multipliers.\n// Author:      Ray Smith\n//\n// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"intsimdmatrix.h\"\n#include \"matrix.h\"     // for GENERIC_2D_ARRAY\n#include \"simddetect.h\" // for SIMDDetect\n\nnamespace tesseract {\n\nconst IntSimdMatrix *IntSimdMatrix::intSimdMatrix = nullptr;\n\n// Computes a reshaped copy of the weight matrix w.\nvoid IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,\n                         int32_t &rounded_num_out) const {\n  const int num_out = w.dim1();\n  const int num_in = w.dim2() - 1;\n  // The rounded-up sizes of the reshaped weight matrix, excluding biases.\n  int rounded_num_in = Roundup(num_in, num_inputs_per_group_);\n  rounded_num_out = RoundOutputs(num_out);\n  // Add the bias and compute the required size.\n  shaped_w.resize((rounded_num_in + 1) * rounded_num_out, 0);\n  int shaped_index = 0;\n  int output = 0;\n  // Each number of registers needs a different format! 
Iterates over the\n  // different numbers of registers (each a power of 2).\n  for (int num_registers = max_output_registers_; num_registers >= 1; num_registers /= 2) {\n    // The number of outputs that we will generate with this many registers.\n    int num_outputs_per_register_set = num_registers * num_outputs_per_register_;\n    // Use the max number of registers until we have to go fewer.\n    while (output + num_outputs_per_register_set <= rounded_num_out) {\n      // Accumulating outputs in registers saves iterating over the inputs, so\n      // we only have to do it once per output register set.\n      for (int input = 0; input < num_in; input += num_inputs_per_group_) {\n        // Iterate over the number of outputs in a register set.\n        for (int j = 0; j < num_outputs_per_register_set; ++j) {\n          // Inner-most loop corresponds to the number of inputs in an input\n          // group.\n          for (int i = 0; i < num_inputs_per_group_; ++i) {\n            int8_t weight = 0;\n            if (output + j < num_out && input + i < num_in) {\n              weight = w(output + j, input + i);\n            }\n            shaped_w[shaped_index++] = weight;\n          }\n        }\n      }\n      // Append the bias weights for the register set.\n      for (int j = 0; j < num_outputs_per_register_set; ++j) {\n        int8_t weight = 0;\n        if (output + j < num_out) {\n          weight = w(output + j, num_in);\n        }\n        shaped_w[shaped_index++] = weight;\n      }\n      output += num_outputs_per_register_set;\n    }\n  }\n}\n\n// Computes matrix.vector v = Wu.\n// u is of size W.dim2() - 1 and the output v is of size W.dim1().\n// u is imagined to have an extra element at the end with value 1, to\n// implement the bias, but it doesn't actually have it.\nvoid IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,\n                                    const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) {\n  int num_out = 
w.dim1();\n  int num_in = w.dim2() - 1;\n  // Base implementation.\n  int i;\n  // Break up into chunks of four to facilitate vectorization\n  for (i = 0; i < (num_out / 4) * 4; i += 4) {\n    const int8_t *wi0 = w[i + 0];\n    const int8_t *wi1 = w[i + 1];\n    const int8_t *wi2 = w[i + 2];\n    const int8_t *wi3 = w[i + 3];\n    int total0 = 0;\n    int total1 = 0;\n    int total2 = 0;\n    int total3 = 0;\n    for (int j = 0; j < num_in; ++j) {\n      total0 += wi0[j] * u[j];\n      total1 += wi1[j] * u[j];\n      total2 += wi2[j] * u[j];\n      total3 += wi3[j] * u[j];\n    }\n    // Add in the bias and correct for integer values.\n    v[i + 0] = (total0 + wi0[num_in] * INT8_MAX) * scales[i + 0];\n    v[i + 1] = (total1 + wi1[num_in] * INT8_MAX) * scales[i + 1];\n    v[i + 2] = (total2 + wi2[num_in] * INT8_MAX) * scales[i + 2];\n    v[i + 3] = (total3 + wi3[num_in] * INT8_MAX) * scales[i + 3];\n  }\n\n  // Capture the remainder mod four\n  for (; i < num_out; ++i) {\n    const int8_t *wi = w[i];\n    int total = 0;\n    for (int j = 0; j < num_in; ++j) {\n      total += wi[j] * u[j];\n    }\n    // Add in the bias and correct for integer values.\n    v[i] = (total + wi[num_in] * INT8_MAX) * scales[i];\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/arch/intsimdmatrix.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        intsimdmatrix.h\n// Description: Base class for 8-bit int SIMD matrix multipliers.\n// Author:      Ray Smith\n//\n// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_\n#define TESSERACT_ARCH_INTSIMDMATRIX_H_\n\n#include <tesseract/export.h>\n\n#include <cstdint>\n#include <vector>\n\n#include \"tesstypes.h\"\n\nnamespace tesseract {\n\ntemplate <class T>\nclass GENERIC_2D_ARRAY;\n\n// Base class for a SIMD function to multiply a matrix by a vector, with sources\n// of 8-bit signed integer, and result in a double, after appropriate scaling.\n// Assumes a specific method of multiplication that can be applied to any size\n// and number of SIMD registers as follows:\n// int32_t results are computed with num_outputs_per_register_ in each of\n// max_output_registers_ result registers, repeatedly until it would make too\n// many results, then the number of registers is halved, and so-on down to a\n// single result register. The last calculation only outputs the required number\n// of results instead of writing beyond the bounds. 
Eg: matrix has 75 outputs,\n//  num_outputs_per_register_ = 4, and max_output_registers_ = 8,\n// Step 1: 8x4=32 results are computed,\n// Step 2: 8x4=32 again, total 64,\n// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,\n// Step 4: 1x3, total 75.\n// Each step above is computed using a PartialFunc, which runs over the input\n// vector once. The input is read one registerful of num_inputs_per_register_\n// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)\n// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.\n// Since it is slow (on Intel at least) to horizontally add in a register,\n// provision is made to process num_inputs_per_group_ inputs at a time, with\n// the group being replicated num_input_groups_ times and multiplied by a\n// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.\n// This is most convenient if num_inputs_per_group_ is 4, and the product\n// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent\n// results in the process, but it doesn't have to be implemented that way.\n// The weights are re-ordered by Init() to be used sequentially by the above\n// algorithm, followed by the biases, so they can be added at the end.\n// The base class computes the base C++ implementation.\n// NOTE that, although the subclasses execute on different SIMD hardware, no\n// virtual methods are needed, as the constructor sets up everything that\n// is required to allow the base class implementation to do all the work.\nstruct TESS_API IntSimdMatrix {\n  // Computes a reshaped copy of the weight matrix w.\n  void Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,\n            int32_t &rounded_num_out) const;\n\n  // Rounds the size up to a multiple of the input register size (in int8_t).\n  int RoundInputs(int size) const {\n    return Roundup(size, num_inputs_per_register_);\n  }\n  // Rounds the size up to a multiple of the output register size 
(in int32_t).\n  int RoundOutputs(int size) const {\n    return Roundup(size, num_outputs_per_register_);\n  }\n\n  // Computes matrix.vector v = Wu.\n  // u is of size W.dim2() - 1 and the output v is of size W.dim1().\n  // u is imagined to have an extra element at the end with value 1, to\n  // implement the bias, but it doesn't actually have it.\n  // Computes the base C++ implementation.\n  static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<TFloat> &scales,\n                              const int8_t *u, TFloat *v);\n\n  // Rounds the input up to a multiple of the given factor.\n  static int Roundup(int input, int factor) {\n    return (input + factor - 1) / factor * factor;\n  }\n\n  // Computes matrix.vector v = Wu.\n  // u is of size W.dim2() - 1 and the output v is of size W.dim1().\n  // u is imagined to have an extra element at the end with value 1, to\n  // implement the bias, but it doesn't actually have it.\n  // Uses an optimized implementation with partial funcs.\n  // NOTE: The size of the input vector (u) must be padded using\n  // RoundInputs above.\n  // The input will be over-read to the extent of the padding. 
There are no\n  // alignment requirements.\n  using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const TFloat *, const int8_t *,\n                                           TFloat *);\n  MatrixDotVectorFunction matrixDotVectorFunction;\n\n  // Number of 32 bit outputs held in each register.\n  int num_outputs_per_register_;\n  // Maximum number of registers that we will use to hold outputs.\n  int max_output_registers_;\n  // Number of 8 bit inputs in the inputs register.\n  int num_inputs_per_register_;\n  // Number of inputs in each weight group.\n  int num_inputs_per_group_;\n  // Number of groups of inputs to be broadcast.\n  // num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_\n\n  static const IntSimdMatrix *intSimdMatrix;\n  // Only available with NEON.\n  static const IntSimdMatrix intSimdMatrixNEON;\n  // Only available with RVV.\n  static const IntSimdMatrix intSimdMatrixRVV;\n  // Only available with AVX2 / AVX / FMA / SSE.\n  static const IntSimdMatrix intSimdMatrixAVX2;\n  static const IntSimdMatrix intSimdMatrixSSE;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_\n"
  },
  {
    "path": "src/arch/intsimdmatrixavx2.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        intsimdmatrixavx2.cpp\n// Description: matrix-vector product for 8-bit data on avx2.\n// Author:      Ray Smith\n//\n// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"intsimdmatrix.h\"\n\n#if !defined(__AVX2__)\n#  if defined(__i686__) || defined(__x86_64__)\n#    error Implementation only for AVX2 capable architectures\n#  endif\n#else\n#  include <immintrin.h>\n#  include <algorithm>\n#  include <cstdint>\n#  include <vector>\n\n#  if defined(_MSC_VER) && _MSC_VER >= 1925 && _MSC_VER <= 1929 && \\\n      defined(_WIN32) && !defined(_WIN64)\n// Optimize for size (/Os) instead of using the default optimization for some\n// versions of the 32 bit Visual Studio compiler which generate buggy code.\n#    pragma optimize(\"\", off)\n#    pragma optimize(\"s\", on)\n#  endif\n\nnamespace tesseract {\n\n// Number of outputs held in each register. 
8 x 32 bit ints.\nconstexpr int kNumOutputsPerRegister = 8;\n// Maximum number of registers that we will use.\nconstexpr int kMaxOutputRegisters = 8;\n// Number of inputs in the inputs register.\nconstexpr int kNumInputsPerRegister = 32;\n// Number of inputs in each weight group.\nconstexpr int kNumInputsPerGroup = 4;\n// Number of groups of inputs to be broadcast.\nconstexpr int kNumInputGroups = kNumInputsPerRegister / kNumInputsPerGroup;\n\n// Functions to compute part of a matrix.vector multiplication. The weights\n// are in a very specific order (see above) in w, which is multiplied by\n// u of length num_in, to produce output v after scaling the integer results\n// by the corresponding member of scales.\n// The amount of w and scales consumed is fixed and not available to the\n// caller. The number of outputs written to v will be at most num_out.\n\n// Computes one set of 4x8 products of inputs and weights, adding to result.\n// Horizontally adds 4 adjacent results, making 8x32-bit results.\n// rep_input is assumed to be an 8x replicated set of 4x8-bit signed integers.\n// Note that wi must previously have been re-organized with blocks of 4x8\n// weights in contiguous memory.\n// ones is a register of 16x16-bit values all equal to 1.\n// Note: wi is incremented by the amount of data read.\n// weights and reps are scratch registers.\n// This function must be inlined with references in order for the compiler to\n// correctly use the registers declared in the caller.\nstatic inline void MultiplyGroup(const __m256i &rep_input, const __m256i &ones, const int8_t *&wi,\n                                 __m256i &weights, __m256i &reps, __m256i &result) {\n  // Load a 4x8 block of weights.\n  weights = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(wi));\n  wi += kNumInputsPerRegister;\n  // Normalize the signs on rep_input, weights, so weights is always +ve.\n  reps = _mm256_sign_epi8(rep_input, weights);\n  weights = _mm256_sign_epi8(weights, weights);\n  // 
Multiply 32x8-bit reps by 32x8-bit weights to make 16x16-bit results,\n  // with adjacent pairs added.\n  weights = _mm256_maddubs_epi16(weights, reps);\n  // Multiply 16x16-bit result by 16x16-bit ones to make 8x32-bit results,\n  // with  adjacent pairs added. What we really want is a horizontal add of\n  // 16+16=32 bit result, but there is no such instruction, so multiply by\n  // 16-bit ones instead. It is probably faster than all the sign-extending,\n  // permuting and adding that would otherwise be required.\n  weights = _mm256_madd_epi16(weights, ones);\n  result = _mm256_add_epi32(result, weights);\n}\n\n// Load 64 bits into the bottom of a 128bit register.\n// We don't actually care what the top 64bits are, but this ends\n// up with them being zero.\nstatic inline __m128i load64_to_128(const int8_t *wi_) {\n  const auto *wi = reinterpret_cast<const int64_t *>(wi_);\n  return _mm_set_epi64x(0, wi[0]);\n}\n\n#if defined(FAST_FLOAT)\n\nstatic inline void ExtractResults8(__m256i result, const int8_t *wi,\n                                   const float *scales, float *v) {\n  __m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg\n  __m256i w256 = _mm256_cvtepi8_epi32(w128); // 8x32bit vals in 256bit reg\n  __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);\n  __m256 scale01234567 = _mm256_loadu_ps(scales);\n  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>\n  result = _mm256_add_epi32(result, w256);     // result += bias * 127\n  __m256 res01234567 = _mm256_cvtepi32_ps(result);\n  result = _mm256_permute4x64_epi64(result, 2 + (3 << 2));\n  res01234567 = _mm256_mul_ps(res01234567, scale01234567);\n  _mm256_storeu_ps(v, res01234567);\n}\n\nstatic inline void ExtractResults16(__m256i result0, __m256i result1,\n                                    const int8_t *&wi, const float *&scales,\n                                    float *&v) {\n  __m128i w8 = _mm_loadu_si128(reinterpret_cast<const __m128i 
*>(wi));\n  // 8x8bit vals in bottom of 128bit reg\n  const __m256i bias_scale =\n      _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);\n  __m256i w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg\n  __m256 scale01234567 = _mm256_loadu_ps(scales);\n  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>\n  result0 = _mm256_add_epi32(result0, w256);   // result += bias * 127\n  __m256 res01234567 = _mm256_cvtepi32_ps(result0);\n  result0 = _mm256_permute4x64_epi64(result0, 2 + (3 << 2));\n  res01234567 = _mm256_mul_ps(res01234567, scale01234567);\n  _mm256_storeu_ps(v, res01234567);\n  w8 = _mm_shuffle_epi32(w8, 2 + (3 << 2));\n  w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg\n  scale01234567 = _mm256_loadu_ps(scales + 8);\n  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>\n  result1 = _mm256_add_epi32(result1, w256);   // result += bias * 127\n  res01234567 = _mm256_cvtepi32_ps(result1);\n  result1 = _mm256_permute4x64_epi64(result1, 2 + (3 << 2));\n  res01234567 = _mm256_mul_ps(res01234567, scale01234567);\n  _mm256_storeu_ps(v + 8, res01234567);\n  wi += 16;\n  scales += 16;\n  v += 16;\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=64 results.\n// The weights *must* be arranged so that consecutive reads from wi\n// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of\n// (kNumInputsPerGroup inputs))). 
After that there must be N consecutive\n// bias weights, before continuing with any more weights.\n// u must be padded out with zeros to\n// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.\nstatic void PartialMatrixDotVector64(const int8_t *wi, const float *scales, const int8_t *u,\n                                     int num_in, float *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  __m256i result1 = _mm256_setzero_si256();\n  __m256i result2 = _mm256_setzero_si256();\n  __m256i result3 = _mm256_setzero_si256();\n  __m256i result4 = _mm256_setzero_si256();\n  __m256i result5 = _mm256_setzero_si256();\n  __m256i result6 = _mm256_setzero_si256();\n  __m256i result7 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal add of the 4 inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);\n      MultiplyGroup(rep_input, ones, 
wi, weights, reps, result3);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result4);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result5);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result6);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);\n    }\n  }\n  ExtractResults16(result0, result1, wi, scales, v);\n  ExtractResults16(result2, result3, wi, scales, v);\n  ExtractResults16(result4, result5, wi, scales, v);\n  ExtractResults16(result6, result7, wi, scales, v);\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=32 results.\n// For details see PartialMatrixDotVector64 with N=32.\nstatic void PartialMatrixDotVector32(const int8_t *wi, const float *scales, const int8_t *u,\n                                     int num_in, float *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  __m256i result1 = _mm256_setzero_si256();\n  __m256i result2 = _mm256_setzero_si256();\n  __m256i result3 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal add of the 4 
inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);\n    }\n  }\n  ExtractResults16(result0, result1, wi, scales, v);\n  ExtractResults16(result2, result3, wi, scales, v);\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=16 results.\n// For details see PartialMatrixDotVector64 with N=16.\nstatic void PartialMatrixDotVector16(const int8_t *wi, const float *scales, const int8_t *u,\n                                     int num_in, float *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  __m256i result1 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal add of the 4 inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);\n    }\n  }\n  
ExtractResults16(result0, result1, wi, scales, v);\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=8 results.\n// For details see PartialMatrixDotVector64 with N=8.\nstatic inline void PartialMatrixDotVector8(const int8_t *wi, const float *scales, const int8_t *u,\n                                           int num_in, float *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal add of the 4 inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n    }\n  }\n  ExtractResults8(result0, wi, scales, v);\n}\n\nstatic void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,\n                            const int8_t *u, float *v) {\n  const int num_out = dim1;\n  const int num_in = dim2 - 1;\n  // Each call to a partial_func_ produces group_size outputs, except the\n  // last one, which can produce less.\n  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);\n  const int rounded_num_out = 
IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);\n  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;\n  int output = 0;\n\n  int w_step = (rounded_num_in + 1) * group_size;\n\n  // Run with this group size, until it would produce too much output, then\n  // switch to a smaller size.\n  for (; output + group_size <= rounded_num_out; output += group_size) {\n    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);\n    wi += w_step;\n    scales += group_size;\n    v += group_size;\n  }\n  group_size /= 2;\n  w_step /= 2;\n\n  if (output + group_size <= rounded_num_out) {\n    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);\n    wi += w_step;\n    scales += group_size;\n    v += group_size;\n    output += group_size;\n  }\n  group_size /= 2;\n  w_step /= 2;\n\n  if (output + group_size <= rounded_num_out) {\n    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);\n    wi += w_step;\n    scales += group_size;\n    v += group_size;\n    output += group_size;\n  }\n  group_size /= 2;\n  w_step /= 2;\n\n  if (output + group_size <= rounded_num_out) {\n    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);\n  }\n}\n#else\nstatic inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,\n                                   double *v) {\n  __m128i w128 = load64_to_128(wi);          // 8x8bit vals in bottom of 128bit reg\n  __m256i w256 = _mm256_cvtepi8_epi32(w128); // 8x32bit vals in 256bit reg\n  __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);\n  __m256d scale0123 = _mm256_loadu_pd(scales);\n  __m256d scale4567 = _mm256_loadu_pd(scales + 4);\n  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>\n  result = _mm256_add_epi32(result, w256);     // result += bias * 127\n  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));\n  result = _mm256_permute4x64_epi64(result, 2 + (3 << 2));\n  __m256d res4567 = 
_mm256_cvtepi32_pd(_mm256_castsi256_si128(result));\n  res0123 = _mm256_mul_pd(res0123, scale0123);\n  res4567 = _mm256_mul_pd(res4567, scale4567);\n  _mm256_storeu_pd(v, res0123);\n  _mm256_storeu_pd(v + 4, res4567);\n}\n\nstatic inline void ExtractResults16(__m256i result0, __m256i result1, const int8_t *&wi,\n                                    const double *&scales, double *&v) {\n  __m128i w8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(wi));\n  // 8x8bit vals in bottom of 128bit reg\n  const __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);\n  __m256i w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg\n  __m256d scale0123 = _mm256_loadu_pd(scales);\n  __m256d scale4567 = _mm256_loadu_pd(scales + 4);\n  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>\n  result0 = _mm256_add_epi32(result0, w256);   // result += bias * 127\n  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));\n  result0 = _mm256_permute4x64_epi64(result0, 2 + (3 << 2));\n  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));\n  res0123 = _mm256_mul_pd(res0123, scale0123);\n  res4567 = _mm256_mul_pd(res4567, scale4567);\n  _mm256_storeu_pd(v, res0123);\n  _mm256_storeu_pd(v + 4, res4567);\n  w8 = _mm_shuffle_epi32(w8, 2 + (3 << 2));\n  w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg\n  scale0123 = _mm256_loadu_pd(scales + 8);\n  scale4567 = _mm256_loadu_pd(scales + 12);\n  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>\n  result1 = _mm256_add_epi32(result1, w256);   // result += bias * 127\n  res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));\n  result1 = _mm256_permute4x64_epi64(result1, 2 + (3 << 2));\n  res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));\n  res0123 = _mm256_mul_pd(res0123, scale0123);\n  res4567 = _mm256_mul_pd(res4567, scale4567);\n  _mm256_storeu_pd(v + 8, res0123);\n  _mm256_storeu_pd(v + 12, res4567);\n  wi += 
16;\n  scales += 16;\n  v += 16;\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=64 results.\n// The weights *must* be arranged so that consecutive reads from wi\n// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of\n// (kNumInputsPerGroup inputs))). After that there must be N consecutive\n// bias weights, before continuing with any more weights.\n// u must be padded out with zeros to\n// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.\nstatic void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,\n                                     int num_in, double *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  __m256i result1 = _mm256_setzero_si256();\n  __m256i result2 = _mm256_setzero_si256();\n  __m256i result3 = _mm256_setzero_si256();\n  __m256i result4 = _mm256_setzero_si256();\n  __m256i result5 = _mm256_setzero_si256();\n  __m256i result6 = _mm256_setzero_si256();\n  __m256i result7 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal 
add of the 4 inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result4);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result5);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result6);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);\n    }\n  }\n  ExtractResults16(result0, result1, wi, scales, v);\n  ExtractResults16(result2, result3, wi, scales, v);\n  ExtractResults16(result4, result5, wi, scales, v);\n  ExtractResults16(result6, result7, wi, scales, v);\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=32 results.\n// For details see PartialMatrixDotVector64 with N=32.\nstatic void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,\n                                     int num_in, double *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  __m256i result1 = _mm256_setzero_si256();\n  __m256i result2 = _mm256_setzero_si256();\n  __m256i result3 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i 
rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal add of the 4 inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);\n    }\n  }\n  ExtractResults16(result0, result1, wi, scales, v);\n  ExtractResults16(result2, result3, wi, scales, v);\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=16 results.\n// For details see PartialMatrixDotVector64 with N=16.\nstatic void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,\n                                     int num_in, double *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  __m256i result1 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, 
shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal add of the 4 inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);\n    }\n  }\n  ExtractResults16(result0, result1, wi, scales, v);\n}\n\n// Computes part of matrix.vector v = Wu. Computes N=8 results.\n// For details see PartialMatrixDotVector64 with N=8.\nstatic inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,\n                                           int num_in, double *v) {\n  // Register containing 16-bit ones for horizontal add with 16->32 bit\n  // conversion.\n  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);\n  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);\n  // Initialize all the results to 0.\n  __m256i result0 = _mm256_setzero_si256();\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in;) {\n    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));\n    // Inputs are processed in groups of kNumInputsPerGroup, replicated\n    // kNumInputGroups times.\n    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {\n      // Replicate the low 32 bits (4 inputs) 8 times.\n      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));\n      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.\n      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);\n      __m256i weights, reps;\n      // Mul-add, with horizontal add of the 4 inputs to each of the results.\n      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);\n    }\n  }\n  ExtractResults8(result0, wi, scales, v);\n}\n\nstatic void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,\n                            const int8_t *u, double *v) {\n  const int num_out = 
dim1;\n  const int num_in = dim2 - 1;\n  // Each call to a partial_func_ produces group_size outputs, except the\n  // last one, which can produce less.\n  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);\n  const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);\n  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;\n  int output = 0;\n\n  int w_step = (rounded_num_in + 1) * group_size;\n\n  // Run with this group size, until it would produce too much output, then\n  // switch to a smaller size.\n  for (; output + group_size <= rounded_num_out; output += group_size) {\n    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);\n    wi += w_step;\n    scales += group_size;\n    v += group_size;\n  }\n  group_size /= 2;\n  w_step /= 2;\n\n  if (output + group_size <= rounded_num_out) {\n    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);\n    wi += w_step;\n    scales += group_size;\n    v += group_size;\n    output += group_size;\n  }\n  group_size /= 2;\n  w_step /= 2;\n\n  if (output + group_size <= rounded_num_out) {\n    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);\n    wi += w_step;\n    scales += group_size;\n    v += group_size;\n    output += group_size;\n  }\n  group_size /= 2;\n\n  if (output + group_size <= rounded_num_out) {\n    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);\n  }\n}\n#endif\n\nconst IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {\n    // Function.\n    matrixDotVector,\n    // Number of 32 bit outputs held in each register.\n    kNumOutputsPerRegister,\n    // Maximum number of registers that we will use to hold outputs.\n    kMaxOutputRegisters,\n    // Number of 8 bit inputs in the inputs register.\n    kNumInputsPerRegister,\n    // Number of inputs in each weight group.\n    kNumInputsPerGroup\n};\n\n} // namespace tesseract.\n\n#endif\n"
  },
  {
    "path": "src/arch/intsimdmatrixneon.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        intsimdmatrixneon.cpp\n// Description: matrix-vector product for 8-bit data on neon.\n// Author:      Robin Watts (from the AVX2 original by Ray Smith)\n//\n// (C) Copyright 2017, Google Inc.\n// (C) Copyright 2020, Artifex Software Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#if defined(__ARM_NEON)\n\n#  include \"intsimdmatrix.h\"\n#  include \"tesstypes.h\"\n\n#  include <algorithm>\n#  include <cstdint>\n#  include <vector>\n#  include \"arm_neon.h\"\n\nnamespace tesseract {\n\n// Number of outputs held in each register. (Actually, we use a\n// pair of 4x32 registers, so 8 x 32 bit ints).\nconstexpr int kNumOutputsPerRegister = 8;\n// Maximum number of registers that we will use.\nconstexpr int kMaxOutputRegisters = 1;\n// Number of inputs in the inputs register.\nconstexpr int kNumInputsPerRegister = 8;\n// Number of inputs in each weight group.\nconstexpr int kNumInputsPerGroup = 8;\n\n// Function to compute part of a matrix.vector multiplication. 
The weights\n// are in a very specific order (see above) in w, which is multiplied by\n// u of length num_in, to produce output v after scaling the integer results\n// by the corresponding member of scales.\n// The amount of w and scales consumed is fixed and not available to the\n// caller.\n\n// Computes part of matrix.vector v = Wu. Computes N=8 results.\n// The weights *must* be arranged so that consecutive reads from wi\n// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of\n// (kNumInputsPerGroup inputs))). After that there must be N consecutive\n// bias weights, before continuing with any more weights.\n// u must be padded out with zeros to\n// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.\nstatic inline void PartialMatrixDotVector8(const int8_t *__restrict wi,\n                                           const TFloat *__restrict scales,\n                                           const int8_t *__restrict u, int num_in,\n                                           TFloat *__restrict v, int num_out) {\n  // Initialize all the results to 0.\n  int32x4_t result0123 = {0, 0, 0, 0};\n  int32x4_t result4567 = {0, 0, 0, 0};\n  int8x8_t bias_scale = {127, 127, 127, 127, 127, 127, 127, 127};\n  // Iterate over the input (u), one registerful at a time.\n  for (int j = 0; j < num_in; j += 8) {\n    int8x8_t vu = vld1_s8(u);              // vu     = u0  u1  u2  u3  u4  u5  u6  u7\n    int8x16_t vw01 = vld1q_s8(wi);         // vw0    = w00 w01 w02 w03 w04 w05 w06 w07\n                                           // w10 w11 w12 w13 w14 w15 w16 w17\n    int8x16_t vw23 = vld1q_s8(wi + 8 * 2); // vw2    = w20 w21 w22 w23 w24 w25 w26 w27 w30\n                                           // w31 w32 w33 w34 w35 w36 w37\n    int8x16_t vw45 = vld1q_s8(wi + 8 * 4); // vw4    = w40 w41 w42 w43 w44 w45 w46 w47 w50\n                                           // w51 w52 w53 w54 w55 w56 w57\n    int8x16_t vw67 = vld1q_s8(wi + 8 * 6); // vw6    = w60 w61 w62 
w63 w64 w65 w66 w67 w70\n                                           // w71 w72 w73 w74 w75 w76 w77\n\n    int16x8_t vrow0q = vmull_s8(vget_low_s8(vw01), vu); // vrow0q = vw00.u0 w01.u1 w02.u2\n                                                        // w03.u3 vw04.u4 w05.u5 w06.u6 w07.u7\n    int16x8_t vrow1q = vmull_s8(vget_high_s8(vw01),\n                                vu);                    // vrow1q = vw10.u0 w11.u1 w12.u2 w13.u3\n                                                        // vw14.u4 w15.u5 w16.u6 w17.u7\n    int16x8_t vrow2q = vmull_s8(vget_low_s8(vw23), vu); // vrow2q = vw20.u0 w21.u1 w22.u2\n                                                        // w23.u3 vw24.u4 w25.u5 w26.u6 w27.u7\n    int16x8_t vrow3q = vmull_s8(vget_high_s8(vw23),\n                                vu);                    // vrow3q = vw30.u0 w31.u1 w32.u2 w33.u3\n                                                        // vw34.u4 w35.u5 w36.u6 w37.u7\n    int16x8_t vrow4q = vmull_s8(vget_low_s8(vw45), vu); // vrow4q = vw40.u0 w41.u1 w42.u2\n                                                        // w43.u3 vw44.u4 w45.u5 w46.u6 w47.u7\n    int16x8_t vrow5q = vmull_s8(vget_high_s8(vw45),\n                                vu);                    // vrow5q = vw50.u0 w51.u1 w52.u2 w53.u3\n                                                        // vw54.u4 w55.u5 w56.u6 w57.u7\n    int16x8_t vrow6q = vmull_s8(vget_low_s8(vw67), vu); // vrow6q = vw60.u0 w61.u1 w62.u2\n                                                        // w63.u3 vw64.u4 w65.u5 w66.u6 w67.u7\n    int16x8_t vrow7q = vmull_s8(vget_high_s8(vw67),\n                                vu); // vrow7q = vw70.u0 w71.u1 w72.u2 w73.u3\n                                     // vw74.u4 w75.u5 w76.u6 w77.u7\n\n    int32x4_t vrow0q2 = vpaddlq_s16(vrow0q); // vrow0q2 = vw00.u0+w01.u1 w02.u2+w03.u3\n                                             // vw04.u4+w05.u5 w06.u6+w07.u7\n    int32x4_t vrow1q2 = vpaddlq_s16(vrow1q); // vrow1q2 = 
vw10.u0+w11.u1 w12.u2+w13.u3\n                                             // vw14.u4+w15.u5 w16.u6+w17.u7\n    int32x4_t vrow2q2 = vpaddlq_s16(vrow2q); // vrow2q2 = vw20.u0+w21.u1 w22.u2+w23.u3\n                                             // vw24.u4+w25.u5 w26.u6+w27.u7\n    int32x4_t vrow3q2 = vpaddlq_s16(vrow3q); // vrow3q2 = vw30.u0+w31.u1 w32.u2+w33.u3\n                                             // vw34.u4+w35.u5 w36.u6+w37.u7\n    int32x4_t vrow4q2 = vpaddlq_s16(vrow4q); // vrow4q2 = vw40.u0+w41.u1 w42.u2+w43.u3\n                                             // vw44.u4+w45.u5 w46.u6+w47.u7\n    int32x4_t vrow5q2 = vpaddlq_s16(vrow5q); // vrow5q2 = vw50.u0+w51.u1 w52.u2+w53.u3\n                                             // vw54.u4+w55.u5 w56.u6+w57.u7\n    int32x4_t vrow6q2 = vpaddlq_s16(vrow6q); // vrow6q2 = vw60.u0+w61.u1 w62.u2+w63.u3\n                                             // vw64.u4+w65.u5 w66.u6+w67.u7\n    int32x4_t vrow7q2 = vpaddlq_s16(vrow7q); // vrow7q2 = vw70.u0+w71.u1 w72.u2+w73.u3\n                                             // vw74.u4+w75.u5 w76.u6+w77.u7\n\n    vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),\n                           vpadd_s32(vget_low_s32(vrow1q2), vget_high_s32(vrow1q2)));\n    // vrow0q2 = vw00.u0+...+w03.u3 vw04.u4+...+w07.u7 vw10.u0+...+w13.u3\n    // vw14.u4+...+w17.u7\n    vrow2q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)),\n                           vpadd_s32(vget_low_s32(vrow3q2), vget_high_s32(vrow3q2)));\n    // vrow0q2 = vw20.u0+...+w23.u3 vw24.u4+...+w27.u7 vw30.u0+...+w33.u3\n    // vw34.u4+...+w37.u7\n    vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),\n                           vpadd_s32(vget_low_s32(vrow5q2), vget_high_s32(vrow5q2)));\n    // vrow0q2 = vw40.u0+...+w43.u3 vw44.u4+...+w47.u7 vw50.u0+...+w53.u3\n    // vw54.u4+...+w57.u7\n    vrow6q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow6q2), 
vget_high_s32(vrow6q2)),\n                           vpadd_s32(vget_low_s32(vrow7q2), vget_high_s32(vrow7q2)));\n    // vrow0q2 = vw60.u0+...+w63.u3 vw64.u4+...+w67.u7 vw70.u0+...+w73.u3\n    // vw74.u4+...+w77.u7\n\n    vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),\n                           vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)));\n    // vrow0q2 = vw00.u0+...+w07.u7 vw10.u0+...+w17.u7 vw20.u0+...+w27.u7\n    // vw30.u0+...+w37.u7\n    vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),\n                           vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)));\n    // vrow0q2 = vw40.u0+...+w47.u7 vw50.u0+...+w57.u7 vw60.u0+...+w67.u7\n    // vw70.u0+...+w77.u7\n\n    result0123 = vaddq_s32(result0123, vrow0q2);\n    result4567 = vaddq_s32(result4567, vrow4q2);\n    u += 8;\n    wi += 64;\n  }\n  {\n    int8x8_t bias = vld1_s8(wi); // vw0    = b0  b1  b2  b3  b4  b5  b6  b7\n    int16x8_t scaled_bias = vmull_s8(bias, bias_scale);\n    result0123 = vaddw_s16(result0123, vget_low_s16(scaled_bias));\n    result4567 = vaddw_s16(result4567, vget_high_s16(scaled_bias));\n    *v++ = vget_lane_s32(vget_low_s32(result0123), 0) * *scales++;\n    if (num_out > 1)\n      *v++ = vget_lane_s32(vget_low_s32(result0123), 1) * *scales++;\n    if (num_out > 2)\n      *v++ = vget_lane_s32(vget_high_s32(result0123), 0) * *scales++;\n    if (num_out > 3)\n      *v++ = vget_lane_s32(vget_high_s32(result0123), 1) * *scales++;\n    if (num_out > 4)\n      *v++ = vget_lane_s32(vget_low_s32(result4567), 0) * *scales++;\n    if (num_out > 5)\n      *v++ = vget_lane_s32(vget_low_s32(result4567), 1) * *scales++;\n    if (num_out > 6)\n      *v++ = vget_lane_s32(vget_high_s32(result4567), 0) * *scales++;\n    if (num_out > 7)\n      *v = vget_lane_s32(vget_high_s32(result4567), 1) * *scales;\n  }\n}\n\nstatic void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,\n           
                 const int8_t *u, TFloat *v) {\n  const int num_out = dim1;\n  const int num_in = dim2 - 1;\n  // Each call to a partial_func_ produces group_size outputs, except the\n  // last one, which can produce less.\n  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);\n  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;\n  int output = 0;\n\n  int w_step = (rounded_num_in + 1) * group_size;\n\n  for (; output + group_size <= num_out; output += group_size) {\n    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v, kNumOutputsPerRegister);\n    wi += w_step;\n    scales += group_size;\n    v += group_size;\n  }\n  if (output < num_out)\n    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v,\n                            num_out & (kNumOutputsPerRegister - 1));\n}\n\nconst IntSimdMatrix IntSimdMatrix::intSimdMatrixNEON = {\n    // Function.\n    matrixDotVector,\n    // Number of 32 bit outputs held in each register.\n    kNumOutputsPerRegister,\n    // Maximum number of registers that we will use to hold outputs.\n    kMaxOutputRegisters,\n    // Number of 8 bit inputs in the inputs register.\n    kNumInputsPerRegister,\n    // Number of inputs in each weight group.\n    kNumInputsPerGroup\n};\n\n} // namespace tesseract.\n\n#endif /* __ARM_NEON */\n"
  },
  {
    "path": "src/arch/intsimdmatrixrvv.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        intsimdmatrixrvv.cpp\n// Description: matrix-vector product for 8-bit data on rvv.\n// Author:      sunyuechi\n//\n// Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // for HAVE_RVV, ...\n#endif\n\n#if HAVE_RVV\n#  include \"intsimdmatrix.h\"\n#  include \"tesstypes.h\"\n\nnamespace tesseract {\n\nstatic int DotProduct(const int8_t *u, const int8_t *v, int num) {\n  int total = 0;\n\n  asm __volatile__ (\n    \"  .option       arch, +v                   \\n\\t\"\n    \"  vsetvli t0,zero,e32,m8,ta,ma             \\n\\t\"\n    \"  vmv.v.i v0,0                             \\n\\t\"\n    \"1:                                         \\n\\t\"\n    \"  vsetvli t0,%[num],e8,m2,ta,ma            \\n\\t\"\n    \"  vle8.v v16,0(%[u])                       \\n\\t\"\n    \"  vle8.v v24,0(%[v])                       \\n\\t\"\n    \"  sub %[num],%[num],t0                     \\n\\t\"\n    \"  vwmul.vv v8,v24,v16                      \\n\\t\"\n    \"  add %[u],%[u],t0                         \\n\\t\"\n    \"  add %[v],%[v],t0                         \\n\\t\"\n    \"  vsetvli zero,zero,e16,m4,tu,ma           \\n\\t\"\n    \"  vwadd.wv v0,v0,v8                        \\n\\t\"\n    
\"  bnez %[num],1b                           \\n\\t\"\n    \"  vsetvli t0,zero,e32,m8,ta,ma             \\n\\t\"\n    \"  vmv.s.x v8,zero                          \\n\\t\"\n    \"  vredsum.vs v0,v0,v8                      \\n\\t\"\n    \"  vmv.x.s %[total],v0                      \\n\\t\"\n    :  [u] \"+r\" (u),\n       [v] \"+r\" (v),\n       [num] \"+r\" (num),\n       [total] \"+r\" (total)\n    :\n    :  \"cc\", \"memory\"\n  );\n\n  return total;\n}\n\nstatic void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,\n                            const int8_t *u, TFloat *v) {\n  int num_out = dim1;\n  int num_in = dim2 - 1;\n  for (int i = 0; i < num_out; ++i) {\n    const int8_t *wi_start = wi + i * dim2;\n    int total = DotProduct(wi_start, u, num_in);\n    // Add in the bias and apply scaling.\n    v[i] = (total + wi_start[num_in] * INT8_MAX) * scales[i];\n  }\n}\n\nconst IntSimdMatrix IntSimdMatrix::intSimdMatrixRVV = {\n    // Function.\n    matrixDotVector,\n    // Number of 32 bit outputs held in each register.\n    1,\n    // Maximum number of registers that we will use to hold outputs.\n    1,\n    // Number of 8 bit inputs in the inputs register.\n    1,\n    // Number of inputs in each weight group.\n    1\n};\n\n} // namespace tesseract.\n\n#endif /* HAVE_RVV */\n"
  },
  {
    "path": "src/arch/intsimdmatrixsse.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        intsindmatrixsse.cpp\n// Description: SSE implementation of 8-bit int SIMD matrix multiply.\n// Author:      Ray Smith\n//\n// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#if !defined(__SSE4_1__)\n#  if defined(__i686__) || defined(__x86_64__)\n#    error Implementation only for SSE 4.1 capable architectures\n#  endif\n#else\n\n#  include \"intsimdmatrix.h\"\n\n#  include <emmintrin.h>\n#  include <smmintrin.h>\n#  include <cstdint>\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the n-vectors u and v.\n// Uses Intel SSE intrinsics to access the SIMD instruction set.\nstatic int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {\n  int max_offset = n - 8;\n  int offset = 0;\n  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit\n  // values, extending to 16 bit, multiplying to make 32 bit results.\n  int32_t result = 0;\n  if (offset <= max_offset) {\n    offset = 8;\n    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u));\n    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v));\n    __m128i sum = _mm_cvtepi8_epi16(packed1);\n    packed2 = _mm_cvtepi8_epi16(packed2);\n    // The magic _mm_add_epi16 is perfect here. 
It multiplies 8 pairs of 16 bit\n    // ints to make 32 bit results, which are then horizontally added in pairs\n    // to make 4 32 bit results that still fit in a 128 bit register.\n    sum = _mm_madd_epi16(sum, packed2);\n    while (offset <= max_offset) {\n      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u + offset));\n      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v + offset));\n      offset += 8;\n      packed1 = _mm_cvtepi8_epi16(packed1);\n      packed2 = _mm_cvtepi8_epi16(packed2);\n      packed1 = _mm_madd_epi16(packed1, packed2);\n      sum = _mm_add_epi32(sum, packed1);\n    }\n    // Sum the 4 packed 32 bit sums and extract the low result.\n    sum = _mm_hadd_epi32(sum, sum);\n    sum = _mm_hadd_epi32(sum, sum);\n    result = _mm_cvtsi128_si32(sum);\n  }\n  while (offset < n) {\n    result += u[offset] * v[offset];\n    ++offset;\n  }\n  return result;\n}\n\n// Computes part of matrix.vector v = Wu. Computes 1 result.\nstatic void PartialMatrixDotVector1(const int8_t *wi, const TFloat *scales, const int8_t *u,\n                                    int num_in, TFloat *v) {\n  TFloat total = IntDotProductSSE(u, wi, num_in);\n  // Add in the bias and correct for integer values.\n  *v = (total + wi[num_in] * INT8_MAX) * *scales;\n}\n\nstatic void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,\n                            const int8_t *u, TFloat *v) {\n  const int num_out = dim1;\n  const int num_in = dim2 - 1;\n  int output = 0;\n\n  for (; output < num_out; output++) {\n    PartialMatrixDotVector1(wi, scales, u, num_in, v);\n    wi += dim2;\n    scales++;\n    v++;\n  }\n}\n\nconst IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {\n    matrixDotVector,\n    // Number of 32 bit outputs held in each register.\n    1,\n    // Maximum number of registers that we will use to hold outputs.\n    1,\n    // Number of 8 bit inputs in the inputs register.\n    1,\n    // Number of inputs in each weight 
group.\n    1\n};\n\n} // namespace tesseract.\n\n#endif\n"
  },
  {
    "path": "src/arch/simddetect.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        simddetect.cpp\n// Description: Architecture detector.\n// Author:      Stefan Weil (based on code from Ray Smith)\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // for HAVE_AVX, ...\n#endif\n#include <numeric> // for std::inner_product\n#include \"dotproduct.h\"\n#include \"intsimdmatrix.h\" // for IntSimdMatrix\n#include \"params.h\"        // for STRING_VAR\n#include \"simddetect.h\"\n#include \"tprintf.h\" // for tprintf\n\n#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)\n// The GNU compiler g++ fails to compile with the Accelerate framework\n// (tested with versions 10 and 11), so unconditionally disable it.\n#undef HAVE_FRAMEWORK_ACCELERATE\n#endif\n\n#if defined(HAVE_FRAMEWORK_ACCELERATE)\n\n// Use Apple Accelerate framework.\n// https://developer.apple.com/documentation/accelerate/simd\n\n#include <Accelerate/Accelerate.h>\n\n#endif\n\n#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)\n// See https://en.wikipedia.org/wiki/CPUID.\n#  define HAS_CPUID\n#endif\n\n#if defined(HAS_CPUID)\n#  if defined(__GNUC__)\n#    include <cpuid.h>\n#  elif defined(_WIN32)\n#    include <intrin.h>\n#  endif\n#endif\n\n#if defined(HAVE_NEON) && 
!defined(__aarch64__)\n#  if defined(HAVE_ANDROID_GETCPUFAMILY)\n#    include <cpu-features.h>\n#  elif defined(HAVE_GETAUXVAL)\n#    include <asm/hwcap.h>\n#    include <sys/auxv.h>\n#  elif defined(HAVE_ELF_AUX_INFO)\n#    include <sys/auxv.h>\n#  endif\n#endif\n\n#if defined(HAVE_RVV)\n#  if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)\n#    include <sys/auxv.h>\n#    define HWCAP_RV(letter) (1ul << ((letter) - 'A'))\n#  endif\n#endif\n\nnamespace tesseract {\n\n// Computes and returns the dot product of the two n-vectors u and v.\n// Note: because the order of addition is different among the different dot\n// product functions, the results can (and do) vary slightly (although they\n// agree to within about 4e-15). This produces different results when running\n// training, despite all random inputs being precisely equal.\n// To get consistent results, use just one of these dot product functions.\n// On a test multi-layer network, serial is 57% slower than SSE, and AVX\n// is about 8% faster than SSE. 
This suggests that the time is memory\n// bandwidth constrained and could benefit from holding the reused vector\n// in AVX registers.\nDotProductFunction DotProduct;\n\nstatic STRING_VAR(dotproduct, \"auto\", \"Function used for calculation of dot product\");\n\nSIMDDetect SIMDDetect::detector;\n\n#if defined(__aarch64__)\n// ARMv8 always has NEON.\nbool SIMDDetect::neon_available_ = true;\n#elif defined(HAVE_NEON)\n// If true, then Neon has been detected.\nbool SIMDDetect::neon_available_;\n#elif defined(HAVE_RVV)\nbool SIMDDetect::rvv_available_;\n#else\n// If true, then AVX has been detected.\nbool SIMDDetect::avx_available_;\nbool SIMDDetect::avx2_available_;\nbool SIMDDetect::avx512F_available_;\nbool SIMDDetect::avx512BW_available_;\nbool SIMDDetect::avx512VNNI_available_;\n// If true, then FMA has been detected.\nbool SIMDDetect::fma_available_;\n// If true, then SSe4.1 has been detected.\nbool SIMDDetect::sse_available_;\n#endif\n\n#if defined(HAVE_FRAMEWORK_ACCELERATE)\nstatic TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {\n  TFloat total = 0;\n  const int stride = 1;\n#if defined(FAST_FLOAT)\n  vDSP_dotpr(u, stride, v, stride, &total, n);\n#else\n  vDSP_dotprD(u, stride, v, stride, &total, n);\n#endif\n  return total;\n}\n#endif\n\n// Computes and returns the dot product of the two n-vectors u and v.\nstatic TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {\n  TFloat total = 0;\n  for (int k = 0; k < n; ++k) {\n    total += u[k] * v[k];\n  }\n  return total;\n}\n\n// Compute dot product using std::inner_product.\nstatic TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {\n  return std::inner_product(u, u + n, v, static_cast<TFloat>(0));\n}\n\nstatic void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {\n  DotProduct = f;\n  IntSimdMatrix::intSimdMatrix = m;\n}\n\n// Constructor.\n// Tests the architecture in a system-dependent way to detect AVX, SSE and\n// any other 
available SIMD equipment.\n// __GNUC__ is also defined by compilers that include GNU extensions such as\n// clang.\nSIMDDetect::SIMDDetect() {\n  // The fallback is a generic dot product calculation.\n  SetDotProduct(DotProductGeneric);\n\n#if defined(HAS_CPUID)\n#  if defined(__GNUC__)\n  unsigned int eax, ebx, ecx, edx;\n  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {\n    // Note that these tests all use hex because the older compilers don't have\n    // the newer flags.\n#    if defined(HAVE_SSE4_1)\n    sse_available_ = (ecx & 0x00080000) != 0;\n#    endif\n#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)\n    auto xgetbv = []() {\n      uint32_t xcr0;\n      __asm__(\"xgetbv\" : \"=a\"(xcr0) : \"c\"(0) : \"%edx\");\n      return xcr0;\n    };\n    if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {\n      // OSXSAVE bit is set, XMM state and YMM state are fine.\n#      if defined(HAVE_FMA)\n      fma_available_ = (ecx & 0x00001000) != 0;\n#      endif\n#      if defined(HAVE_AVX)\n      avx_available_ = (ecx & 0x10000000) != 0;\n      if (avx_available_) {\n        // There is supposed to be a __get_cpuid_count function, but this is all\n        // there is in my cpuid.h. 
It is a macro for an asm statement and cannot\n        // be used inside an if.\n        __cpuid_count(7, 0, eax, ebx, ecx, edx);\n        avx2_available_ = (ebx & 0x00000020) != 0;\n        avx512F_available_ = (ebx & 0x00010000) != 0;\n        avx512BW_available_ = (ebx & 0x40000000) != 0;\n        avx512VNNI_available_ = (ecx & 0x00000800) != 0;\n      }\n#      endif\n    }\n#    endif\n  }\n#  elif defined(_WIN32)\n  int cpuInfo[4];\n  int max_function_id;\n  __cpuid(cpuInfo, 0);\n  max_function_id = cpuInfo[0];\n  if (max_function_id >= 1) {\n    __cpuid(cpuInfo, 1);\n#    if defined(HAVE_SSE4_1)\n    sse_available_ = (cpuInfo[2] & 0x00080000) != 0;\n#    endif\n#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)\n    if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {\n      // OSXSAVE bit is set, XMM state and YMM state are fine.\n#      if defined(HAVE_FMA)\n      fma_available_ = (cpuInfo[2] & 0x00001000) != 0;\n#      endif\n#      if defined(HAVE_AVX)\n      avx_available_ = (cpuInfo[2] & 0x10000000) != 0;\n#      endif\n#      if defined(HAVE_AVX2)\n      if (max_function_id >= 7) {\n        __cpuid(cpuInfo, 7);\n        avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;\n        avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;\n        avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;\n        avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0;\n      }\n#      endif\n    }\n#    endif\n  }\n#  else\n#    error \"I don't know how to test for SIMD with this compiler\"\n#  endif\n#endif\n\n#if defined(HAVE_NEON) && !defined(__aarch64__)\n#  if defined(HAVE_ANDROID_GETCPUFAMILY)\n  {\n    AndroidCpuFamily family = android_getCpuFamily();\n    if (family == ANDROID_CPU_FAMILY_ARM)\n      neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);\n  }\n#  elif defined(HAVE_GETAUXVAL)\n  neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;\n#  elif defined(HAVE_ELF_AUX_INFO)\n  unsigned long hwcap = 0;\n  
elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);\n  neon_available_ = hwcap & HWCAP_NEON;\n#  endif\n#endif\n\n#if defined(HAVE_RVV)\n#  if defined(HAVE_GETAUXVAL)\n  const unsigned long hwcap = getauxval(AT_HWCAP);\n  rvv_available_ = hwcap & HWCAP_RV('V');\n#  elif defined(HAVE_ELF_AUX_INFO)\n  unsigned long hwcap = 0;\n  elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);\n  rvv_available_ = hwcap & HWCAP_RV('V');\n#  endif\n#endif\n\n  // Select code for calculation of dot product based on autodetection.\n  if (false) {\n    // This is a dummy to support conditional compilation.\n#if defined(HAVE_AVX512F)\n  } else if (avx512F_available_) {\n    // AVX512F detected.\n    SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2);\n#endif\n#if defined(HAVE_AVX2)\n  } else if (avx2_available_) {\n    // AVX2 detected.\n    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);\n#endif\n#if defined(HAVE_AVX)\n  } else if (avx_available_) {\n    // AVX detected.\n    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);\n#endif\n#if defined(HAVE_SSE4_1)\n  } else if (sse_available_) {\n    // SSE detected.\n    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);\n#endif\n#if defined(HAVE_NEON) || defined(__aarch64__)\n  } else if (neon_available_) {\n    // NEON detected.\n    SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);\n#endif\n#if defined(HAVE_RVV)\n  } else if (rvv_available_) {\n    SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV);\n#endif\n  }\n\n  const char *dotproduct_env = getenv(\"DOTPRODUCT\");\n  if (dotproduct_env != nullptr) {\n    // Override automatic settings by value from environment variable.\n    dotproduct = dotproduct_env;\n    Update();\n  }\n}\n\nvoid SIMDDetect::Update() {\n  // Select code for calculation of dot product based on the\n  // value of the config variable if that value is not empty.\n  const char *dotproduct_method = \"generic\";\n  if (dotproduct == 
\"auto\") {\n    // Automatic detection. Nothing to be done.\n  } else if (dotproduct == \"generic\") {\n    // Generic code selected by config variable.\n    SetDotProduct(DotProductGeneric);\n    dotproduct_method = \"generic\";\n  } else if (dotproduct == \"native\") {\n    // Native optimized code selected by config variable.\n    SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix);\n    dotproduct_method = \"native\";\n#if defined(HAVE_AVX2)\n  } else if (dotproduct == \"avx2\") {\n    // AVX2 selected by config variable.\n    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);\n    dotproduct_method = \"avx2\";\n#endif\n#if defined(HAVE_AVX)\n  } else if (dotproduct == \"avx\") {\n    // AVX selected by config variable.\n    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);\n    dotproduct_method = \"avx\";\n#endif\n#if defined(HAVE_FMA)\n  } else if (dotproduct == \"fma\") {\n    // FMA selected by config variable.\n    SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);\n    dotproduct_method = \"fma\";\n#endif\n#if defined(HAVE_SSE4_1)\n  } else if (dotproduct == \"sse\") {\n    // SSE selected by config variable.\n    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);\n    dotproduct_method = \"sse\";\n#endif\n#if defined(HAVE_FRAMEWORK_ACCELERATE)\n  } else if (dotproduct == \"accelerate\") {\n    SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);\n#endif\n#if defined(HAVE_NEON) || defined(__aarch64__)\n  } else if (dotproduct == \"neon\" && neon_available_) {\n    // NEON selected by config variable.\n    SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);\n    dotproduct_method = \"neon\";\n#endif\n  } else if (dotproduct == \"std::inner_product\") {\n    // std::inner_product selected by config variable.\n    SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);\n    dotproduct_method = \"std::inner_product\";\n  } else {\n    // Unsupported value 
of config variable.\n    tprintf(\"Warning, ignoring unsupported config variable value: dotproduct=%s\\n\",\n            dotproduct.c_str());\n    tprintf(\n        \"Supported values for dotproduct: auto generic native\"\n#if defined(HAVE_AVX2)\n        \" avx2\"\n#endif\n#if defined(HAVE_AVX)\n        \" avx\"\n#endif\n#if defined(HAVE_FMA)\n        \" fma\"\n#endif\n#if defined(HAVE_SSE4_1)\n        \" sse\"\n#endif\n#if defined(HAVE_FRAMEWORK_ACCELERATE)\n        \" accelerate\"\n#endif\n        \" std::inner_product.\\n\");\n  }\n\n  dotproduct.set_value(dotproduct_method);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/arch/simddetect.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        simddetect.h\n// Description: Architecture detector.\n// Author:      Stefan Weil (based on code from Ray Smith)\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n#ifndef TESSERACT_ARCH_SIMDDETECT_H_\n#define TESSERACT_ARCH_SIMDDETECT_H_\n\n#include <tesseract/export.h>\n#include \"tesstypes.h\"\n\nnamespace tesseract {\n\n// Function pointer for best calculation of dot product.\nusing DotProductFunction = TFloat (*)(const TFloat *, const TFloat *, int);\nextern DotProductFunction DotProduct;\n\n// Architecture detector. Add code here to detect any other architectures for\n// SIMD-based faster dot product functions. 
Intended to be a single static\n// object, but it does no real harm to have more than one.\nclass SIMDDetect {\npublic:\n  // Returns true if AVX is available on this system.\n  static inline bool IsAVXAvailable() {\n    return detector.avx_available_;\n  }\n  // Returns true if AVX2 (integer support) is available on this system.\n  static inline bool IsAVX2Available() {\n    return detector.avx2_available_;\n  }\n  // Returns true if AVX512 Foundation (float) is available on this system.\n  static inline bool IsAVX512FAvailable() {\n    return detector.avx512F_available_;\n  }\n  // Returns true if AVX512 integer is available on this system.\n  static inline bool IsAVX512BWAvailable() {\n    return detector.avx512BW_available_;\n  }\n  // Returns true if AVX512 Vector Neural Network Instructions are available.\n  static inline bool IsAVX512VNNIAvailable() {\n    return detector.avx512VNNI_available_;\n  }\n  // Returns true if FMA is available on this system.\n  static inline bool IsFMAAvailable() {\n    return detector.fma_available_;\n  }\n  // Returns true if SSE4.1 is available on this system.\n  static inline bool IsSSEAvailable() {\n    return detector.sse_available_;\n  }\n  // Returns true if NEON is available on this system.\n  static inline bool IsNEONAvailable() {\n    return detector.neon_available_;\n  }\n  // Returns true if RVV is available on this system.\n  static inline bool IsRVVAvailable() {\n    return detector.rvv_available_;\n  }\n\n  // Update settings after config variable was set.\n  static TESS_API void Update();\n\nprivate:\n  // Constructor, must set all static member variables.\n  SIMDDetect();\n\nprivate:\n  // Singleton.\n  static SIMDDetect detector;\n  // If true, then AVX has been detected.\n  static TESS_API bool avx_available_;\n  static TESS_API bool avx2_available_;\n  static TESS_API bool avx512F_available_;\n  static TESS_API bool avx512BW_available_;\n  static TESS_API bool avx512VNNI_available_;\n  // If true, then FMA 
has been detected.\n  static TESS_API bool fma_available_;\n  // If true, then SSE4.1 has been detected.\n  static TESS_API bool sse_available_;\n  // If true, then NEON has been detected.\n  static TESS_API bool neon_available_;\n  // If true, then RVV has been detected.\n  static TESS_API bool rvv_available_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_ARCH_SIMDDETECT_H_\n"
  },
  {
    "path": "src/ccmain/adaptions.cpp",
    "content": "/**********************************************************************\n * File:        adaptions.cpp  (Formerly adaptions.c)\n * Description: Functions used to adapt to blobs already confidently\n *              identified\n * Author:      Chris Newton\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include <cctype>\n#include <cstring>\n#include \"control.h\"\n#include \"reject.h\"\n#include \"stopper.h\"\n#include \"tesseractclass.h\"\n#include \"tessvars.h\"\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\nnamespace tesseract {\nbool Tesseract::word_adaptable( // should we adapt?\n    WERD_RES *word, uint16_t mode) {\n  if (tessedit_adaption_debug) {\n    tprintf(\"Running word_adaptable() for %s rating %.4f certainty %.4f\\n\",\n            word->best_choice->unichar_string().c_str(), word->best_choice->rating(),\n            word->best_choice->certainty());\n  }\n\n  bool status = false;\n  std::bitset<16> flags(mode);\n\n  enum MODES {\n    ADAPTABLE_WERD,\n    ACCEPTABLE_WERD,\n    CHECK_DAWGS,\n    CHECK_SPACES,\n    CHECK_ONE_ELL_CONFLICT,\n    CHECK_AMBIG_WERD\n  };\n\n  /*\n0: NO adaption\n*/\n  if (mode == 0) {\n    if (tessedit_adaption_debug) {\n      tprintf(\"adaption disabled\\n\");\n    }\n    return 
false;\n  }\n\n  if (flags[ADAPTABLE_WERD]) {\n    status |= word->tess_would_adapt; // result of Classify::AdaptableWord()\n    if (tessedit_adaption_debug && !status) {\n      tprintf(\"tess_would_adapt bit is false\\n\");\n    }\n  }\n\n  if (flags[ACCEPTABLE_WERD]) {\n    status |= word->tess_accepted;\n    if (tessedit_adaption_debug && !status) {\n      tprintf(\"tess_accepted bit is false\\n\");\n    }\n  }\n\n  if (!status) {  // If not set then\n    return false; // ignore other checks\n  }\n\n  if (flags[CHECK_DAWGS] && (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&\n      (word->best_choice->permuter() != FREQ_DAWG_PERM) &&\n      (word->best_choice->permuter() != USER_DAWG_PERM) &&\n      (word->best_choice->permuter() != NUMBER_PERM)) {\n    if (tessedit_adaption_debug) {\n      tprintf(\"word not in dawgs\\n\");\n    }\n    return false;\n  }\n\n  if (flags[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict(word, false)) {\n    if (tessedit_adaption_debug) {\n      tprintf(\"word has ell conflict\\n\");\n    }\n    return false;\n  }\n\n  if (flags[CHECK_SPACES] &&\n      (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {\n    if (tessedit_adaption_debug) {\n      tprintf(\"word contains spaces\\n\");\n    }\n    return false;\n  }\n\n  if (flags[CHECK_AMBIG_WERD] && word->best_choice->dangerous_ambig_found()) {\n    if (tessedit_adaption_debug) {\n      tprintf(\"word is ambiguous\\n\");\n    }\n    return false;\n  }\n\n  if (tessedit_adaption_debug) {\n    tprintf(\"returning status %d\\n\", status);\n  }\n  return status;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/applybox.cpp",
    "content": "/**********************************************************************\n * File:        applybox.cpp  (Formerly applybox.c)\n * Description: Re segment rows according to box file data\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef DISABLED_LEGACY_ENGINE\n#  include <allheaders.h>\n#  include <cctype>\n#  include <cerrno>\n#  include <cstring>\n#  include \"boxread.h\"\n#endif // ndef DISABLED_LEGACY_ENGINE\n#include <tesseract/unichar.h>\n#include \"pageres.h\"\n#include \"tesseractclass.h\"\n#include \"tesserrstream.h\"  // for tesserr\n#include \"unicharset.h\"\n\n#ifndef DISABLED_LEGACY_ENGINE\n/** Max number of blobs to classify together in FindSegmentation. 
*/\nconst int kMaxGroupSize = 4;\n/// Max fraction of median allowed as deviation in xheight before switching\n/// to median.\nconst double kMaxXHeightDeviationFraction = 0.125;\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n/**\n * The box file is assumed to contain box definitions, one per line, of the\n * following format for blob-level boxes:\n * @verbatim\n *   <UTF8 str> <left> <bottom> <right> <top> <page id>\n * @endverbatim\n * and for word/line-level boxes:\n * @verbatim\n *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>\n * @endverbatim\n * NOTES:\n * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.\n *\n * <page id> is 0-based, and the page number is used for multipage input (tiff).\n *\n * In the blob-level form, each line represents a recognizable unit, which may\n * be several UTF-8 bytes, but there is a bounding box around each recognizable\n * unit, and no classifier is needed to train in this mode (bootstrapping.)\n *\n * In the word/line-level form, the line begins with the literal \"WordStr\", and\n * the bounding box bounds either a whole line or a whole word. 
The recognizable\n * units in the word/line are listed after the # at the end of the line and\n * are space delimited, ignoring any original spaces on the line.\n * Eg.\n * @verbatim\n * word -> #w o r d\n * multi word line -> #m u l t i w o r d l i n e\n * @endverbatim\n * The recognizable units must be space-delimited in order to allow multiple\n * unicodes to be used for a single recognizable unit, eg Hindi.\n *\n * In this mode, the classifier must have been pre-trained with the desired\n * character set, or it will not be able to find the character segmentations.\n */\n\nnamespace tesseract {\n\n#ifndef DISABLED_LEGACY_ENGINE\nstatic void clear_any_old_text(BLOCK_LIST *block_list) {\n  BLOCK_IT block_it(block_list);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    ROW_IT row_it(block_it.data()->row_list());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      WERD_IT word_it(row_it.data()->word_list());\n      for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n        word_it.data()->set_text(\"\");\n      }\n    }\n  }\n}\n\n// Applies the box file based on the image name filename, and resegments\n// the words in the block_list (page), with:\n// blob-mode: one blob per line in the box file, words as input.\n// word/line-mode: one blob per space-delimited unit after the #, and one word\n// per line in the box file. (See comment above for box file format.)\n// If find_segmentation is true, (word/line mode) then the classifier is used\n// to re-segment words/lines to match the space-delimited truth string for\n// each box. 
In this case, the input box may be for a word or even a whole\n// text line, and the output words will contain multiple blobs corresponding\n// to the space-delimited input string.\n// With find_segmentation false, no classifier is needed, but the chopper\n// can still be used to correctly segment touching characters with the help\n// of the input boxes.\n// In the returned PAGE_RES, the WERD_RES are setup as they would be returned\n// from normal classification, ie. with a word, chopped_word, rebuild_word,\n// seam_array, denorm, box_word, and best_state, but NO best_choice or\n// raw_choice, as they would require a UNICHARSET, which we aim to avoid.\n// Instead, the correct_text member of WERD_RES is set, and this may be later\n// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords\n// is not required before calling ApplyBoxTraining.\nPAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,\n                                BLOCK_LIST *block_list) {\n  std::vector<TBOX> boxes;\n  std::vector<std::string> texts, full_texts;\n  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {\n    return nullptr; // Can't do it.\n  }\n\n  const int box_count = boxes.size();\n  int box_failures = 0;\n\n  // In word mode, we use the boxes to make a word for each box, but\n  // in blob mode we use the existing words and maximally chop them first.\n  PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);\n  clear_any_old_text(block_list);\n\n  for (int i = 0; i < box_count; i++) {\n    bool foundit = false;\n    if (page_res != nullptr) {\n      foundit =\n          ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],\n                           (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());\n    } else {\n      foundit = ResegmentWordBox(block_list, boxes[i],\n                                 (i == box_count - 1) ? 
nullptr : &boxes[i + 1], texts[i].c_str());\n    }\n    if (!foundit) {\n      box_failures++;\n      ReportFailedBox(i, boxes[i], texts[i].c_str(), \"FAILURE! Couldn't find a matching blob\");\n    }\n  }\n\n  if (page_res == nullptr) {\n    // In word/line mode, we now maximally chop all the words and resegment\n    // them with the classifier.\n    page_res = SetupApplyBoxes(boxes, block_list);\n    ReSegmentByClassification(page_res);\n  }\n  if (applybox_debug > 0) {\n    tprintf(\"APPLY_BOXES:\\n\");\n    tprintf(\"   Boxes read from boxfile:  %6d\\n\", box_count);\n    if (box_failures > 0) {\n      tprintf(\"   Boxes failed resegmentation:  %6d\\n\", box_failures);\n    }\n  }\n  TidyUp(page_res);\n  return page_res;\n}\n\n// Helper computes median xheight in the image.\nstatic double MedianXHeight(BLOCK_LIST *block_list) {\n  BLOCK_IT block_it(block_list);\n  STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    ROW_IT row_it(block_it.data()->row_list());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);\n    }\n  }\n  return xheights.median();\n}\n\n/// Any row xheight that is significantly different from the median is set\n/// to the median.\nvoid Tesseract::PreenXHeights(BLOCK_LIST *block_list) {\n  const double median_xheight = MedianXHeight(block_list);\n  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;\n  // Strip all fuzzy space markers to simplify the PAGE_RES.\n  BLOCK_IT b_it(block_list);\n  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n    BLOCK *block = b_it.data();\n    ROW_IT r_it(block->row_list());\n    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {\n      ROW *row = r_it.data();\n      const double diff = fabs(row->x_height() - median_xheight);\n      if (diff > max_deviation) 
{\n        if (applybox_debug) {\n          tprintf(\"row xheight=%g, but median xheight = %g\\n\", row->x_height(), median_xheight);\n        }\n        row->set_x_height(static_cast<float>(median_xheight));\n      }\n    }\n  }\n}\n\n/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:\n/// All fuzzy spaces are removed, and all the words are maximally chopped.\nPAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {\n  PreenXHeights(block_list);\n  // Strip all fuzzy space markers to simplify the PAGE_RES.\n  BLOCK_IT b_it(block_list);\n  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n    BLOCK *block = b_it.data();\n    ROW_IT r_it(block->row_list());\n    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {\n      ROW *row = r_it.data();\n      WERD_IT w_it(row->word_list());\n      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {\n        WERD *word = w_it.data();\n        if (word->cblob_list()->empty()) {\n          delete w_it.extract();\n        } else {\n          word->set_flag(W_FUZZY_SP, false);\n          word->set_flag(W_FUZZY_NON, false);\n        }\n      }\n    }\n  }\n  auto *page_res = new PAGE_RES(false, block_list, nullptr);\n  PAGE_RES_IT pr_it(page_res);\n  WERD_RES *word_res;\n  while ((word_res = pr_it.word()) != nullptr) {\n    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);\n    pr_it.forward();\n  }\n  return page_res;\n}\n\n/// Tests the chopper by exhaustively running chop_one_blob.\n/// The word_res will contain filled chopped_word, seam_array, denorm,\n/// box_word and best_state for the maximally chopped word.\nvoid Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,\n                                  WERD_RES *word_res) {\n  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,\n                                     
classify_bln_numeric_mode, textord_use_cjk_fp_model,\n                                     poly_allow_detailed_fx, row, block)) {\n    word_res->CloneChoppedToRebuild();\n    return;\n  }\n  if (chop_debug) {\n    tprintf(\"Maximally chopping word at:\");\n    word_res->word->bounding_box().print();\n  }\n  std::vector<BLOB_CHOICE *> blob_choices;\n  ASSERT_HOST(!word_res->chopped_word->blobs.empty());\n  auto rating = static_cast<float>(INT8_MAX);\n  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {\n    // The rating and certainty are not quite arbitrary. Since\n    // select_blob_to_chop uses the worst certainty to choose, they all have\n    // to be different, so starting with INT8_MAX, subtract 1/8 for each blob\n    // in here, and then divide by e each time they are chopped, which\n    // should guarantee a set of unequal values for the whole tree of blobs\n    // produced, however much chopping is required. The chops are thus only\n    // limited by the ability of the chopper to find suitable chop points,\n    // and not by the value of the certainties.\n    auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);\n    blob_choices.push_back(choice);\n    rating -= 0.125f;\n  }\n  const double e = exp(1.0); // The base of natural logs.\n  unsigned blob_number;\n  if (!assume_fixed_pitch_char_segment) {\n    // We only chop if the language is not fixed pitch like CJK.\n    SEAM *seam = nullptr;\n    int right_chop_index = 0;\n    while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {\n      word_res->InsertSeam(blob_number, seam);\n      BLOB_CHOICE *left_choice = blob_choices[blob_number];\n      rating = left_choice->rating() / e;\n      left_choice->set_rating(rating);\n      left_choice->set_certainty(-rating);\n      // combine confidence w/ serial #\n      auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,\n                           
                0.0f, 0.0f, BCC_FAKE);\n      blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);\n    }\n  }\n  word_res->CloneChoppedToRebuild();\n  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);\n}\n\n/// Helper to compute the dispute resolution metric.\n/// Disputed blob resolution. The aim is to give the blob to the most\n/// appropriate boxfile box. Most of the time it is obvious, but if\n/// two boxfile boxes overlap significantly it is not. If a small boxfile\n/// box takes most of the blob, and a large boxfile box does too, then\n/// we want the small boxfile box to get it, but if the small box\n/// is much smaller than the blob, we don't want it to get it.\n/// Details of the disputed blob resolution:\n/// Given a box with area A, and a blob with area B, with overlap area C,\n/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum\n/// miss metric gets the blob.\nstatic double BoxMissMetric(const TBOX &box1, const TBOX &box2) {\n  const int overlap_area = box1.intersection(box2).area();\n  const int a = box1.area();\n  const int b = box2.area();\n  ASSERT_HOST(a != 0 && b != 0);\n  return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;\n}\n\n/// Gather consecutive blobs that match the given box into the best_state\n/// and corresponding correct_text.\n///\n/// Fights over which box owns which blobs are settled by pre-chopping and\n/// applying the blobs to box or next_box with the least non-overlap.\n/// @return false if the box was in error, which can only be caused by\n/// failing to find an appropriate blob for a box.\n///\n/// This means that occasionally, blobs may be incorrectly segmented if the\n/// chopper fails to find a suitable chop point.\nbool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,\n                                 const TBOX *next_box, const char *correct_text) {\n  if (applybox_debug > 1) {\n    tprintf(\"\\nAPPLY_BOX: in 
ResegmentCharBox() for %s\\n\", correct_text);\n  }\n  PAGE_RES_IT page_res_it(page_res);\n  WERD_RES *word_res;\n  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {\n    if (!word_res->box_word->bounding_box().major_overlap(box)) {\n      continue;\n    }\n    if (applybox_debug > 1) {\n      tprintf(\"Checking word box:\");\n      word_res->box_word->bounding_box().print();\n    }\n    int word_len = word_res->box_word->length();\n    for (int i = 0; i < word_len; ++i) {\n      TBOX char_box = TBOX();\n      int blob_count = 0;\n      for (blob_count = 0; i + blob_count < word_len; ++blob_count) {\n        TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);\n        if (!blob_box.major_overlap(box)) {\n          break;\n        }\n        if (word_res->correct_text[i + blob_count].length() > 0) {\n          break; // Blob is claimed already.\n        }\n        if (next_box != nullptr) {\n          const double current_box_miss_metric = BoxMissMetric(blob_box, box);\n          const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);\n          if (applybox_debug > 2) {\n            tprintf(\"Checking blob:\");\n            blob_box.print();\n            tprintf(\"Current miss metric = %g, next = %g\\n\", current_box_miss_metric,\n                    next_box_miss_metric);\n          }\n          if (current_box_miss_metric > next_box_miss_metric) {\n            break; // Blob is a better match for next box.\n          }\n        }\n        char_box += blob_box;\n      }\n      if (blob_count > 0) {\n        if (applybox_debug > 1) {\n          tprintf(\"Index [%d, %d) seem good.\\n\", i, i + blob_count);\n        }\n        if (!char_box.almost_equal(box, 3) &&\n            ((next_box != nullptr && box.x_gap(*next_box) < -3) ||\n             (prev_box != nullptr && prev_box->x_gap(box) < -3))) {\n          return false;\n        }\n        // We refine just the box_word, best_state and 
correct_text here.\n        // The rebuild_word is made in TidyUp.\n        // blob_count blobs are put together to match the box. Merge the\n        // box_word boxes, save the blob_count in the state and the text.\n        word_res->box_word->MergeBoxes(i, i + blob_count);\n        word_res->best_state[i] = blob_count;\n        word_res->correct_text[i] = correct_text;\n        if (applybox_debug > 2) {\n          tprintf(\"%d Blobs match: blob box:\", blob_count);\n          word_res->box_word->BlobBox(i).print();\n          tprintf(\"Matches box:\");\n          box.print();\n          if (next_box != nullptr) {\n            tprintf(\"With next box:\");\n            next_box->print();\n          }\n        }\n        // Eliminated best_state and correct_text entries for the consumed\n        // blobs.\n        for (int j = 1; j < blob_count; ++j) {\n          word_res->best_state.erase(word_res->best_state.begin() + i + 1);\n          word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);\n        }\n        // Assume that no box spans multiple source words, so we are done with\n        // this box.\n        if (applybox_debug > 1) {\n          tprintf(\"Best state = \");\n          for (auto best_state : word_res->best_state) {\n            tprintf(\"%d \", best_state);\n          }\n          tprintf(\"\\n\");\n          tprintf(\"Correct text = [[ \");\n          for (auto &it : word_res->correct_text) {\n            tprintf(\"%s \", it.c_str());\n          }\n          tprintf(\"]]\\n\");\n        }\n        return true;\n      }\n    }\n  }\n  if (applybox_debug > 0) {\n    tprintf(\"FAIL!\\n\");\n  }\n  return false; // Failure.\n}\n\n/// Consume all source blobs that strongly overlap the given box,\n/// putting them into a new word, with the correct_text label.\n/// Fights over which box owns which blobs are settled by\n/// applying the blobs to box or next_box with the least non-overlap.\n/// @return false if the box was in error, which 
can only be caused by\n/// failing to find an overlapping blob for a box.\nbool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,\n                                 const char *correct_text) {\n  if (applybox_debug > 1) {\n    tprintf(\"\\nAPPLY_BOX: in ResegmentWordBox() for %s\\n\", correct_text);\n  }\n  WERD *new_word = nullptr;\n  BLOCK_IT b_it(block_list);\n  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n    BLOCK *block = b_it.data();\n    if (!box.major_overlap(block->pdblk.bounding_box())) {\n      continue;\n    }\n    ROW_IT r_it(block->row_list());\n    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {\n      ROW *row = r_it.data();\n      if (!box.major_overlap(row->bounding_box())) {\n        continue;\n      }\n      WERD_IT w_it(row->word_list());\n      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {\n        WERD *word = w_it.data();\n        if (applybox_debug > 2) {\n          tprintf(\"Checking word:\");\n          word->bounding_box().print();\n        }\n        if (word->text() != nullptr && word->text()[0] != '\\0') {\n          continue; // Ignore words that are already done.\n        }\n        if (!box.major_overlap(word->bounding_box())) {\n          continue;\n        }\n        C_BLOB_IT blob_it(word->cblob_list());\n        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n          C_BLOB *blob = blob_it.data();\n          TBOX blob_box = blob->bounding_box();\n          if (!blob_box.major_overlap(box)) {\n            continue;\n          }\n          if (next_box != nullptr) {\n            const double current_box_miss_metric = BoxMissMetric(blob_box, box);\n            const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);\n            if (applybox_debug > 2) {\n              tprintf(\"Checking blob:\");\n              blob_box.print();\n              tprintf(\"Current miss metric = %g, next = 
%g\\n\", current_box_miss_metric,\n                      next_box_miss_metric);\n            }\n            if (current_box_miss_metric > next_box_miss_metric) {\n              continue; // Blob is a better match for next box.\n            }\n          }\n          if (applybox_debug > 2) {\n            tprintf(\"Blob match: blob:\");\n            blob_box.print();\n            tprintf(\"Matches box:\");\n            box.print();\n            if (next_box != nullptr) {\n              tprintf(\"With next box:\");\n              next_box->print();\n            }\n          }\n          if (new_word == nullptr) {\n            // Make a new word with a single blob.\n            new_word = word->shallow_copy();\n            new_word->set_text(correct_text);\n            w_it.add_to_end(new_word);\n          }\n          C_BLOB_IT new_blob_it(new_word->cblob_list());\n          new_blob_it.add_to_end(blob_it.extract());\n        }\n      }\n    }\n  }\n  if (new_word == nullptr && applybox_debug > 0) {\n    tprintf(\"FAIL!\\n\");\n  }\n  return new_word != nullptr;\n}\n\n/// Resegments the words by running the classifier in an attempt to find the\n/// correct segmentation that produces the required string.\nvoid Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {\n  PAGE_RES_IT pr_it(page_res);\n  WERD_RES *word_res;\n  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {\n    const WERD *word = word_res->word;\n    if (word->text() == nullptr || word->text()[0] == '\\0') {\n      continue; // Ignore words that have no text.\n    }\n    // Convert the correct text to a vector of UNICHAR_ID\n    std::vector<UNICHAR_ID> target_text;\n    if (!ConvertStringToUnichars(word->text(), &target_text)) {\n      tprintf(\"APPLY_BOX: FAILURE: can't find class_id for '%s'\\n\", word->text());\n      pr_it.DeleteCurrentWord();\n      continue;\n    }\n    if (!FindSegmentation(target_text, word_res)) {\n      tprintf(\"APPLY_BOX: FAILURE: can't find segmentation for 
'%s'\\n\", word->text());\n      pr_it.DeleteCurrentWord();\n      continue;\n    }\n  }\n}\n\n/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.\n/// @return false if an invalid UNICHAR_ID is encountered.\nbool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {\n  for (int step = 0; *utf8 != '\\0'; utf8 += step) {\n    const char *next_space = strchr(utf8, ' ');\n    if (next_space == nullptr) {\n      next_space = utf8 + strlen(utf8);\n    }\n    step = next_space - utf8;\n    UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);\n    if (class_id == INVALID_UNICHAR_ID) {\n      return false;\n    }\n    while (utf8[step] == ' ') {\n      ++step;\n    }\n    class_ids->push_back(class_id);\n  }\n  return true;\n}\n\n/// Resegments the word to achieve the target_text from the classifier.\n/// Returns false if the re-segmentation fails.\n/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and\n/// applies a full search on the classifier results to find the best classified\n/// segmentation. 
As a compromise to obtain better recall, 1-1 ambiguity\n/// substitutions ARE used.\nbool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {\n  // Classify all required combinations of blobs and save results in choices.\n  const int word_length = word_res->box_word->length();\n  auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];\n  for (int i = 0; i < word_length; ++i) {\n    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {\n      BLOB_CHOICE_LIST *match_result =\n          classify_piece(word_res->seam_array, i, i + j - 1, \"Applybox\", word_res->chopped_word,\n                         word_res->blamer_bundle);\n      if (applybox_debug > 2) {\n        tprintf(\"%d+%d:\", i, j);\n        print_ratings_list(\"Segment:\", match_result, unicharset);\n      }\n      choices[i].push_back(match_result);\n    }\n  }\n  // Search the segmentation graph for the target text. Must be an exact\n  // match. Using wildcards makes it difficult to find the correct\n  // segmentation even when it is there.\n  word_res->best_state.clear();\n  std::vector<int> search_segmentation;\n  float best_rating = 0.0f;\n  SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,\n                &word_res->best_state);\n  for (int i = 0; i < word_length; ++i) {\n    for (auto choice : choices[i]) {\n      delete choice;\n    }\n  }\n  delete[] choices;\n  if (word_res->best_state.empty()) {\n    // Build the original segmentation and if it is the same length as the\n    // truth, assume it will do.\n    int blob_count = 1;\n    for (auto s : word_res->seam_array) {\n      SEAM *seam = s;\n      if (!seam->HasAnySplits()) {\n        word_res->best_state.push_back(blob_count);\n        blob_count = 1;\n      } else {\n        ++blob_count;\n      }\n    }\n    word_res->best_state.push_back(blob_count);\n    if (word_res->best_state.size() != target_text.size()) {\n      
word_res->best_state.clear(); // No good. Original segmentation bad size.\n      return false;\n    }\n  }\n  word_res->correct_text.clear();\n  for (auto &text : target_text) {\n    word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));\n  }\n  return true;\n}\n\n/// Recursive helper to find a match to the target_text (from text_index\n/// position) in the choices (from choices_pos position).\n/// @param choices is an array of vectors of length choices_length,\n/// with each element representing a starting position in the word, and the\n/// #vector holding classification results for a sequence of consecutive\n/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.\n/// @param choices_pos\n/// @param choices_length\n/// @param target_text\n/// @param text_index\n/// @param rating\n/// @param segmentation\n/// @param best_rating\n/// @param best_segmentation\nvoid Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,\n                              unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,\n                              unsigned text_index, float rating, std::vector<int> *segmentation,\n                              float *best_rating, std::vector<int> *best_segmentation) {\n  const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();\n  for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {\n    // Rating of matching choice or worst choice if no match.\n    float choice_rating = 0.0f;\n    // Find the corresponding best BLOB_CHOICE.\n    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);\n    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {\n      const BLOB_CHOICE *choice = choice_it.data();\n      choice_rating = choice->rating();\n      auto class_id = choice->unichar_id();\n      if (class_id == target_text[text_index]) {\n        break;\n      }\n      // Search ambigs table.\n      if 
(static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {\n        AmbigSpec_IT spec_it(table[class_id]);\n        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {\n          const AmbigSpec *ambig_spec = spec_it.data();\n          // We'll only do 1-1.\n          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&\n              ambig_spec->correct_ngram_id == target_text[text_index]) {\n            break;\n          }\n        }\n        if (!spec_it.cycled_list()) {\n          break; // Found an ambig.\n        }\n      }\n    }\n    if (choice_it.cycled_list()) {\n      continue; // No match.\n    }\n    segmentation->push_back(length);\n    if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {\n      // This is a complete match. If the rating is good record a new best.\n      if (applybox_debug > 2) {\n        tesserr << \"Complete match, rating = \" << rating + choice_rating\n                << \", best=\" << *best_rating\n                << \", seglength=\" << segmentation->size()\n                << \", best=\" << best_segmentation->size() << '\\n';\n      }\n      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {\n        *best_segmentation = *segmentation;\n        *best_rating = rating + choice_rating;\n      }\n    } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {\n      if (applybox_debug > 3) {\n        tprintf(\"Match found for %d=%s:%s, at %d+%d, recursing...\\n\", target_text[text_index],\n                unicharset.id_to_unichar(target_text[text_index]),\n                choice_it.data()->unichar_id() == target_text[text_index] ? 
\"Match\" : \"Ambig\",\n                choices_pos, length);\n      }\n      SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,\n                    rating + choice_rating, segmentation, best_rating, best_segmentation);\n      if (applybox_debug > 3) {\n        tprintf(\"End recursion for %d=%s\\n\", target_text[text_index],\n                unicharset.id_to_unichar(target_text[text_index]));\n      }\n    }\n    segmentation->resize(segmentation->size() - 1);\n  }\n}\n\n/// - Counts up the labelled words and the blobs within.\n/// - Deletes all unused or emptied words, counting the unused ones.\n/// - Resets W_BOL and W_EOL flags correctly.\n/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.\nvoid Tesseract::TidyUp(PAGE_RES *page_res) {\n  int ok_blob_count = 0;\n  int bad_blob_count = 0;\n  // TODO: check usage of ok_word_count.\n  int ok_word_count = 0;\n  int unlabelled_words = 0;\n  PAGE_RES_IT pr_it(page_res);\n  WERD_RES *word_res;\n  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {\n    int ok_in_word = 0;\n    int blob_count = word_res->correct_text.size();\n    auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);\n    word_choice->set_permuter(TOP_CHOICE_PERM);\n    for (int c = 0; c < blob_count; ++c) {\n      if (word_res->correct_text[c].length() > 0) {\n        ++ok_in_word;\n      }\n      // Since we only need a fake word_res->best_choice, the actual\n      // unichar_ids do not matter. 
Which is fortunate, since TidyUp()\n      // can be called while training Tesseract, at the stage where\n      // unicharset is not meaningful yet.\n      word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],\n                                                     1.0f, -1.0f);\n    }\n    if (ok_in_word > 0) {\n      ok_blob_count += ok_in_word;\n      bad_blob_count += word_res->correct_text.size() - ok_in_word;\n      word_res->LogNewRawChoice(word_choice);\n      word_res->LogNewCookedChoice(1, false, word_choice);\n    } else {\n      ++unlabelled_words;\n      if (applybox_debug > 0) {\n        tprintf(\"APPLY_BOXES: Unlabelled word at :\");\n        word_res->word->bounding_box().print();\n      }\n      pr_it.DeleteCurrentWord();\n      delete word_choice;\n    }\n  }\n  pr_it.restart_page();\n  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {\n    // Denormalize back to a BoxWord.\n    word_res->RebuildBestState();\n    word_res->SetupBoxWord();\n    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());\n    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());\n  }\n  if (applybox_debug > 0) {\n    tprintf(\"   Found %d good blobs.\\n\", ok_blob_count);\n    if (bad_blob_count > 0) {\n      tprintf(\"   Leaving %d unlabelled blobs in %d words.\\n\", bad_blob_count, ok_word_count);\n    }\n    if (unlabelled_words > 0) {\n      tprintf(\"   %d remaining unlabelled words deleted.\\n\", unlabelled_words);\n    }\n  }\n}\n\n/** Logs a bad box by line in the box file and box coords.*/\nvoid Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,\n                                const char *err_msg) {\n  tprintf(\"APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\\n\", boxfile_lineno + 1, box_ch,\n          box.left(), box.bottom(), box.right(), box.top(), err_msg);\n}\n\n/// Calls #LearnWord to extract features for labelled blobs within each word.\n/// Features 
are stored in an internal buffer.\nvoid Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {\n  PAGE_RES_IT pr_it(page_res);\n  int word_count = 0;\n  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {\n    LearnWord(fontname.c_str(), word_res);\n    ++word_count;\n  }\n  tprintf(\"Generated training data for %d words\\n\", word_count);\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/\nvoid Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {\n  PAGE_RES_IT pr_it(page_res);\n  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {\n    auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());\n    for (auto &correct_text : word_res->correct_text) {\n      // The part before the first space is the real ground truth, and the\n      // rest is the bounding box location and page number.\n      std::vector<std::string> tokens = split(correct_text, ' ');\n      UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());\n      choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);\n    }\n    word_res->ClearWordChoices();\n    word_res->LogNewRawChoice(choice);\n    word_res->LogNewCookedChoice(1, false, choice);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/control.cpp",
    "content": "/******************************************************************\n * File:        control.cpp  (Formerly control.c)\n * Description: Module-independent matcher controller.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <cctype>\n#include <cmath>\n#include <cstdint> // for int16_t, int32_t\n#include <cstdio>  // for fclose, fopen, FILE\n#include <ctime>   // for clock\n#include \"control.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"docqual.h\"\n#  include \"drawfx.h\"\n#  include \"fixspace.h\"\n#endif\n#include <tesseract/ocrclass.h>\n#include \"lstmrecognizer.h\"\n#include \"output.h\"\n#include \"pageres.h\" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"reject.h\"\n#endif\n#include \"sorthelper.h\"\n#include \"tesseractclass.h\"\n#include \"tesserrstream.h\"  // for tesserr\n#include \"tessvars.h\"\n#include \"werdit.h\"\n\nconst char *const kBackUpConfigFile = \"tempconfigdata.config\";\n#ifndef DISABLED_LEGACY_ENGINE\n// Min believable x-height for any text when refitting as a fraction of\n// original x-height\nconst double kMinRefitXHeightFraction = 0.5;\n#endif // ! 
DISABLED_LEGACY_ENGINE\n\nnamespace tesseract {\n\n/**\n * Make a word from the selected blobs and run Tess on them.\n *\n * @param page_res recognise blobs\n * @param selection_box within this box\n */\n\nvoid Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) {\n  PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);\n  if (it != nullptr) {\n    recog_interactive(it);\n    it->DeleteCurrentWord();\n    delete it;\n  }\n}\n\n/**\n * Recognize a single word in interactive mode.\n *\n * @param pr_it the page results iterator\n */\nbool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) {\n  WordData word_data(*pr_it);\n  SetupWordPassN(2, &word_data);\n  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.\n  if (lstm_recognizer_ == nullptr) {\n#ifndef DISABLED_LEGACY_ENGINE\n    classify_word_and_language(2, pr_it, &word_data);\n#endif // ndef DISABLED_LEGACY_ENGINE\n  } else {\n    classify_word_and_language(1, pr_it, &word_data);\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  if (tessedit_debug_quality_metrics) {\n    int16_t char_qual;\n    int16_t good_char_qual;\n    WERD_RES *word_res = pr_it->word();\n    word_char_quality(word_res, &char_qual, &good_char_qual);\n    tprintf(\n        \"\\n%d chars;  word_blob_quality: %d;  outline_errs: %d; \"\n        \"char_quality: %d; good_char_quality: %d\\n\",\n        word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),\n        char_qual, good_char_qual);\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n  return true;\n}\n\n// Helper function to check for a target word and handle it appropriately.\n// Inspired by Jetsoft's requirement to process only single words on pass2\n// and beyond.\n// If word_config is not null:\n//   If the word_box and target_word_box overlap, read the word_config file\n//   else reset to previous config data.\n//   return true.\n// else\n//   If the word_box and target_word_box overlap or pass <= 1, return true.\n// Note 
that this function uses a fixed temporary file for storing the previous\n// configs, so it is neither thread-safe, nor process-safe, but the assumption\n// is that it will only be used for one debug window at a time.\n//\n// Since this function is used for debugging (and not to change OCR results)\n// set only debug params from the word config file.\nbool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box,\n                                  const char *word_config, int pass) {\n  if (word_config != nullptr) {\n    if (word_box.major_overlap(target_word_box)) {\n      if (backup_config_file_ == nullptr) {\n        backup_config_file_ = kBackUpConfigFile;\n        FILE *config_fp = fopen(backup_config_file_, \"wb\");\n        if (config_fp == nullptr) {\n          tprintf(\"Error, failed to open file \\\"%s\\\"\\n\", backup_config_file_);\n        } else {\n          ParamUtils::PrintParams(config_fp, params());\n          fclose(config_fp);\n        }\n        ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());\n      }\n    } else {\n      if (backup_config_file_ != nullptr) {\n        ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());\n        backup_config_file_ = nullptr;\n      }\n    }\n  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {\n    return false;\n  }\n  return true;\n}\n\n/** If tesseract is to be run, sets the words up ready for it. 
*/\nvoid Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,\n                                   PAGE_RES *page_res, std::vector<WordData> *words) {\n  // Prepare all the words.\n  PAGE_RES_IT page_res_it(page_res);\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),\n                                                        *target_word_box, word_config, 1)) {\n      words->push_back(WordData(page_res_it));\n    }\n  }\n  // Setup all the words for recognition with polygonal approximation.\n  for (unsigned w = 0; w < words->size(); ++w) {\n    SetupWordPassN(pass_n, &(*words)[w]);\n    if (w > 0) {\n      (*words)[w].prev_word = &(*words)[w - 1];\n    }\n  }\n}\n\n// Sets up the single word ready for whichever engine is to be run.\nvoid Tesseract::SetupWordPassN(int pass_n, WordData *word) {\n  if (pass_n == 1 || !word->word->done) {\n    if (pass_n == 1) {\n      word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,\n                                      nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,\n                                      poly_allow_detailed_fx, word->row, word->block);\n    } else if (pass_n == 2) {\n      // TODO(rays) Should we do this on pass1 too?\n      word->word->caps_height = 0.0;\n      if (word->word->x_height == 0.0f) {\n        word->word->x_height = word->row->x_height();\n      }\n    }\n    word->lang_words.truncate(0);\n    for (unsigned s = 0; s <= sub_langs_.size(); ++s) {\n      // The sub_langs_.size() entry is for the master language.\n      Tesseract *lang_t = s < sub_langs_.size() ? 
sub_langs_[s] : this;\n      auto *word_res = new WERD_RES;\n      word_res->InitForRetryRecognition(*word->word);\n      word->lang_words.push_back(word_res);\n      // LSTM doesn't get setup for pass2.\n      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {\n        word_res->SetupForRecognition(\n            lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,\n            lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,\n            lang_t->poly_allow_detailed_fx, word->row, word->block);\n      }\n    }\n  }\n}\n\n// Runs word recognition on all the words.\nbool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,\n                                   std::vector<WordData> *words) {\n  // TODO(rays) Before this loop can be parallelized (it would yield a massive\n  // speed-up) all remaining member globals need to be converted to local/heap\n  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be\n  // added. The results will be significantly different with adaption on, and\n  // deterioration will need investigation.\n  pr_it->restart_page();\n  for (unsigned w = 0; w < words->size(); ++w) {\n    WordData *word = &(*words)[w];\n    if (w > 0) {\n      word->prev_word = &(*words)[w - 1];\n    }\n    if (monitor != nullptr) {\n      monitor->ocr_alive = true;\n      if (pass_n == 1) {\n        monitor->progress = 70 * w / words->size();\n      } else {\n        monitor->progress = 70 + 30 * w / words->size();\n      }\n      if (monitor->progress_callback2 != nullptr) {\n        TBOX box = pr_it->word()->word->bounding_box();\n        (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());\n      }\n      if (monitor->deadline_exceeded() ||\n          (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {\n        // Timeout. 
Fake out the rest of the words.\n        for (; w < words->size(); ++w) {\n          (*words)[w].word->SetupFake(unicharset);\n        }\n        return false;\n      }\n    }\n    if (word->word->tess_failed) {\n      unsigned s;\n      for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {\n      }\n      // If all are failed, skip it. Image words are skipped by this test.\n      if (s > word->lang_words.size()) {\n        continue;\n      }\n    }\n    // Sync pr_it with the WordData.\n    while (pr_it->word() != nullptr && pr_it->word() != word->word) {\n      pr_it->forward();\n    }\n    ASSERT_HOST(pr_it->word() != nullptr);\n    bool make_next_word_fuzzy = false;\n#ifndef DISABLED_LEGACY_ENGINE\n    if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {\n      // Needs to be setup again to see the new outlines in the chopped_word.\n      SetupWordPassN(pass_n, word);\n    }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n    classify_word_and_language(pass_n, pr_it, word);\n    if (tessedit_dump_choices || debug_noise_removal) {\n      tprintf(\"Pass%d: %s [%s]\\n\", pass_n, word->word->best_choice->unichar_string().c_str(),\n              word->word->best_choice->debug_string().c_str());\n    }\n    pr_it->forward();\n    if (make_next_word_fuzzy && pr_it->word() != nullptr) {\n      pr_it->MakeCurrentWordFuzzy();\n    }\n  }\n  return true;\n}\n\n/**\n * recog_all_words()\n *\n * Walk the page_res, recognizing all the words.\n * If monitor is not null, it is used as a progress monitor/timeout/cancel.\n * If dopasses is 0, all recognition passes are run,\n * 1 just pass 1, 2 passes2 and higher.\n * If target_word_box is not null, special things are done to words that\n * overlap the target_word_box:\n * if word_config is not null, the word config file is read for just the\n * target word(s), otherwise, on pass 2 and beyond ONLY the target words\n * are processed (Jetsoft modification.)\n * Returns false if 
we cancelled prematurely.\n *\n * @param page_res page structure\n * @param monitor progress monitor\n * @param word_config word_config file\n * @param target_word_box specifies just to extract a rectangle\n * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher\n */\n\nbool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor,\n                                const TBOX *target_word_box, const char *word_config,\n                                int dopasses) {\n  PAGE_RES_IT page_res_it(page_res);\n\n  if (tessedit_minimal_rej_pass1) {\n    tessedit_test_adaption.set_value(true);\n    tessedit_minimal_rejection.set_value(true);\n  }\n\n  if (dopasses == 0 || dopasses == 1) {\n    page_res_it.restart_page();\n    // ****************** Pass 1 *******************\n\n#ifndef DISABLED_LEGACY_ENGINE\n    // If the adaptive classifier is full switch to one we prepared earlier,\n    // ie on the previous page. If the current adaptive classifier is non-empty,\n    // prepare a backup starting at this page, in case it fills up. 
Do all this\n    // independently for each language.\n    if (AdaptiveClassifierIsFull()) {\n      SwitchAdaptiveClassifier();\n    } else if (!AdaptiveClassifierIsEmpty()) {\n      StartBackupAdaptiveClassifier();\n    }\n    // Now check the sub-langs as well.\n    for (auto &lang : sub_langs_) {\n      if (lang->AdaptiveClassifierIsFull()) {\n        lang->SwitchAdaptiveClassifier();\n      } else if (!lang->AdaptiveClassifierIsEmpty()) {\n        lang->StartBackupAdaptiveClassifier();\n      }\n    }\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n    // Set up all words ready for recognition, so that if parallelism is on\n    // all the input and output classes are ready to run the classifier.\n    std::vector<WordData> words;\n    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);\n#ifndef DISABLED_LEGACY_ENGINE\n    if (tessedit_parallelize) {\n      PrerecAllWordsPar(words);\n    }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n    stats_.word_count = words.size();\n\n    stats_.dict_words = 0;\n    stats_.doc_blob_quality = 0;\n    stats_.doc_outline_errs = 0;\n    stats_.doc_char_quality = 0;\n    stats_.good_char_count = 0;\n    stats_.doc_good_char_quality = 0;\n\n    most_recently_used_ = this;\n    // Run pass 1 word recognition.\n    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {\n      return false;\n    }\n    // Pass 1 post-processing.\n    for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n      if (page_res_it.word()->word->flag(W_REP_CHAR)) {\n        fix_rep_char(&page_res_it);\n        continue;\n      }\n\n      // Count dict words.\n      if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {\n        ++(stats_.dict_words);\n      }\n\n      // Update misadaption log (we only need to do it on pass 1, since\n      // adaption only happens on this pass).\n      if (page_res_it.word()->blamer_bundle != nullptr &&\n          
page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {\n        page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());\n      }\n    }\n  }\n\n  if (dopasses == 1) {\n    return true;\n  }\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n  // ****************** Pass 2 *******************\n  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {\n    page_res_it.restart_page();\n    std::vector<WordData> words;\n    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);\n    if (tessedit_parallelize) {\n      PrerecAllWordsPar(words);\n    }\n    most_recently_used_ = this;\n    // Run pass 2 word recognition.\n    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {\n      return false;\n    }\n  }\n\n  // The next passes are only required for Tess-only.\n  if (AnyTessLang() && !AnyLSTMLang()) {\n    // ****************** Pass 3 *******************\n    // Fix fuzzy spaces.\n\n    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&\n        !right_to_left()) {\n      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);\n    }\n\n    // ****************** Pass 4 *******************\n    if (tessedit_enable_dict_correction) {\n      dictionary_correction_pass(page_res);\n    }\n    if (tessedit_enable_bigram_correction) {\n      bigram_correction_pass(page_res);\n    }\n\n    // ****************** Pass 5,6 *******************\n    rejection_passes(page_res, monitor, target_word_box, word_config);\n\n    // ****************** Pass 8 *******************\n    font_recognition_pass(page_res);\n\n    // ****************** Pass 9 *******************\n    // Check the correctness of the final results.\n    blamer_pass(page_res);\n    script_pos_pass(page_res);\n  }\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  // Write results pass.\n  // This is now redundant, but retained commented so show how to obtain\n  // bounding boxes and style 
information.\n\n#ifndef DISABLED_LEGACY_ENGINE\n  // changed by jetsoft\n  // needed for dll to output memory structure\n  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {\n    output_pass(page_res_it, target_word_box);\n  }\n// end jetsoft\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));\n  textord_.CleanupSingleRowResult(pageseg_mode, page_res);\n\n  // Remove empty words, as these mess up the result iterators.\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    const WERD_RES *word = page_res_it.word();\n    const POLY_BLOCK *pb = page_res_it.block()->block != nullptr\n                               ? page_res_it.block()->block->pdblk.poly_block()\n                               : nullptr;\n    if (word->best_choice == nullptr || word->best_choice->empty() ||\n        (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {\n      page_res_it.DeleteCurrentWord();\n    }\n  }\n\n  if (monitor != nullptr) {\n    monitor->progress = 100;\n  }\n  return true;\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\nvoid Tesseract::bigram_correction_pass(PAGE_RES *page_res) {\n  PAGE_RES_IT word_it(page_res);\n\n  WERD_RES *w_prev = nullptr;\n  WERD_RES *w = word_it.word();\n  while (true) {\n    w_prev = w;\n    while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {\n      // advance word_it, skipping over parts of combos\n    }\n    if (!word_it.word()) {\n      break;\n    }\n    w = word_it.word();\n    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {\n      continue;\n    }\n    if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {\n      if (tessedit_bigram_debug) {\n        tprintf(\"Skipping because one of the words is W_REP_CHAR\\n\");\n      }\n      continue;\n    }\n    // Two words sharing the same language model, excellent!\n    
std::vector<WERD_CHOICE *> overrides_word1;\n    std::vector<WERD_CHOICE *> overrides_word2;\n\n    const auto &orig_w1_str = w_prev->best_choice->unichar_string();\n    const auto &orig_w2_str = w->best_choice->unichar_string();\n    WERD_CHOICE prev_best(w->uch_set);\n    {\n      int w1start, w1end;\n      w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);\n      prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);\n    }\n    WERD_CHOICE this_best(w->uch_set);\n    {\n      int w2start, w2end;\n      w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);\n      this_best = w->best_choice->shallow_copy(w2start, w2end);\n    }\n\n    if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {\n      if (tessedit_bigram_debug) {\n        tprintf(\"Top choice \\\"%s %s\\\" verified by bigram model.\\n\", orig_w1_str.c_str(),\n                orig_w2_str.c_str());\n      }\n      continue;\n    }\n    if (tessedit_bigram_debug > 2) {\n      tprintf(\"Examining alt choices for \\\"%s %s\\\".\\n\", orig_w1_str.c_str(), orig_w2_str.c_str());\n    }\n    if (tessedit_bigram_debug > 1) {\n      if (!w_prev->best_choices.singleton()) {\n        w_prev->PrintBestChoices();\n      }\n      if (!w->best_choices.singleton()) {\n        w->PrintBestChoices();\n      }\n    }\n    float best_rating = 0.0;\n    int best_idx = 0;\n    WERD_CHOICE_IT prev_it(&w_prev->best_choices);\n    for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {\n      WERD_CHOICE *p1 = prev_it.data();\n      WERD_CHOICE strip1(w->uch_set);\n      {\n        int p1start, p1end;\n        p1->GetNonSuperscriptSpan(&p1start, &p1end);\n        strip1 = p1->shallow_copy(p1start, p1end);\n      }\n      WERD_CHOICE_IT w_it(&w->best_choices);\n      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {\n        WERD_CHOICE *p2 = w_it.data();\n        WERD_CHOICE strip2(w->uch_set);\n        {\n          int p2start, p2end;\n          
p2->GetNonSuperscriptSpan(&p2start, &p2end);\n          strip2 = p2->shallow_copy(p2start, p2end);\n        }\n        if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {\n          overrides_word1.push_back(p1);\n          overrides_word2.push_back(p2);\n          if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {\n            best_rating = p1->rating() + p2->rating();\n            best_idx = overrides_word1.size() - 1;\n          }\n        }\n      }\n    }\n    if (!overrides_word1.empty()) {\n      // Excellent, we have some bigram matches.\n      if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&\n          EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {\n        if (tessedit_bigram_debug > 1) {\n          tprintf(\n              \"Top choice \\\"%s %s\\\" verified (sans case) by bigram \"\n              \"model.\\n\",\n              orig_w1_str.c_str(), orig_w2_str.c_str());\n        }\n        continue;\n      }\n      const auto &new_w1_str = overrides_word1[best_idx]->unichar_string();\n      const auto &new_w2_str = overrides_word2[best_idx]->unichar_string();\n      if (new_w1_str != orig_w1_str) {\n        w_prev->ReplaceBestChoice(overrides_word1[best_idx]);\n      }\n      if (new_w2_str != orig_w2_str) {\n        w->ReplaceBestChoice(overrides_word2[best_idx]);\n      }\n      if (tessedit_bigram_debug > 0) {\n        std::string choices_description;\n        int num_bigram_choices = overrides_word1.size() * overrides_word2.size();\n        if (num_bigram_choices == 1) {\n          choices_description = \"This was the unique bigram choice.\";\n        } else {\n          if (tessedit_bigram_debug > 1) {\n            std::string bigrams_list;\n            const int kMaxChoicesToPrint = 20;\n            for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {\n              if (i > 0) {\n                bigrams_list 
+= \", \";\n              }\n              WERD_CHOICE *p1 = overrides_word1[i];\n              WERD_CHOICE *p2 = overrides_word2[i];\n              bigrams_list += p1->unichar_string() + \" \" + p2->unichar_string();\n            }\n            choices_description = \"There were many choices: {\";\n            choices_description += bigrams_list;\n            choices_description += \"}\";\n          } else {\n            choices_description += \"There were \" + std::to_string(num_bigram_choices);\n            choices_description += \" compatible bigrams.\";\n          }\n        }\n        tprintf(\"Replaced \\\"%s %s\\\" with \\\"%s %s\\\" with bigram model. %s\\n\", orig_w1_str.c_str(),\n                orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),\n                choices_description.c_str());\n      }\n    }\n  }\n}\n\nvoid Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor,\n                                 const TBOX *target_word_box, const char *word_config) {\n  PAGE_RES_IT page_res_it(page_res);\n  // ****************** Pass 5 *******************\n  // Gather statistics on rejects.\n  int word_index = 0;\n  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {\n    WERD_RES *word = page_res_it.word();\n    word_index++;\n    if (monitor != nullptr) {\n      monitor->ocr_alive = true;\n      monitor->progress = 95 + 5 * word_index / stats_.word_count;\n    }\n    if (word->rebuild_word == nullptr) {\n      // Word was not processed by tesseract.\n      page_res_it.forward();\n      continue;\n    }\n    check_debug_pt(word, 70);\n\n    // changed by jetsoft\n    // specific to its needs to extract one word when need\n    if (target_word_box &&\n        !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {\n      page_res_it.forward();\n      continue;\n    }\n    // end jetsoft\n\n    page_res_it.rej_stat_word();\n    const int chars_in_word = word->reject_map.length();\n    const 
int rejects_in_word = word->reject_map.reject_count();\n\n    const int blob_quality = word_blob_quality(word);\n    stats_.doc_blob_quality += blob_quality;\n    const int outline_errs = word_outline_errs(word);\n    stats_.doc_outline_errs += outline_errs;\n    int16_t all_char_quality;\n    int16_t accepted_all_char_quality;\n    word_char_quality(word, &all_char_quality, &accepted_all_char_quality);\n    stats_.doc_char_quality += all_char_quality;\n    const uint8_t permuter_type = word->best_choice->permuter();\n    if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||\n        (permuter_type == USER_DAWG_PERM)) {\n      stats_.good_char_count += chars_in_word - rejects_in_word;\n      stats_.doc_good_char_quality += accepted_all_char_quality;\n    }\n    check_debug_pt(word, 80);\n    if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {\n      word->reject_map.rej_word_bad_quality();\n    }\n    check_debug_pt(word, 90);\n    page_res_it.forward();\n  }\n\n  if (tessedit_debug_quality_metrics) {\n    tprintf(\n        \"QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f\"\n        \" outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\\n\",\n        page_res->char_count, page_res->rej_count,\n        page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,\n        stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,\n        stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,\n        stats_.doc_char_quality / static_cast<float>(page_res->char_count),\n        stats_.doc_good_char_quality,\n        (stats_.good_char_count > 0)\n            ? 
(stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))\n            : 0.0);\n  }\n  bool good_quality_doc =\n      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&\n      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&\n      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&\n      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);\n\n  // ****************** Pass 6 *******************\n  // Do whole document or whole block rejection pass\n  if (!tessedit_test_adaption) {\n    quality_based_rejection(page_res_it, good_quality_doc);\n  }\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\nvoid Tesseract::blamer_pass(PAGE_RES *page_res) {\n  if (!wordrec_run_blamer) {\n    return;\n  }\n  PAGE_RES_IT page_res_it(page_res);\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    WERD_RES *word = page_res_it.word();\n    BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);\n    page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;\n  }\n  tprintf(\"Blame reasons:\\n\");\n  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {\n    tprintf(\"%s %d\\n\", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)),\n            page_res->blame_reasons[bl]);\n  }\n  if (page_res->misadaption_log.size() > 0) {\n    tprintf(\"Misadaption log:\\n\");\n    for (auto &log : page_res->misadaption_log) {\n      tprintf(\"%s\\n\", log.c_str());\n    }\n  }\n}\n\n// Sets script positions and detects smallcaps on all output words.\nvoid Tesseract::script_pos_pass(PAGE_RES *page_res) {\n  PAGE_RES_IT page_res_it(page_res);\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    WERD_RES *word = page_res_it.word();\n    if (word->word->flag(W_REP_CHAR)) {\n      page_res_it.forward();\n      
continue;\n    }\n    const float x_height = page_res_it.block()->block->x_height();\n    float word_x_height = word->x_height;\n    if (word_x_height < word->best_choice->min_x_height() ||\n        word_x_height > word->best_choice->max_x_height()) {\n      word_x_height =\n          (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;\n    }\n    // Test for small caps. Word capheight must be close to block xheight,\n    // and word must contain no lower case letters, and at least one upper case.\n    const double small_cap_xheight = x_height * kXHeightCapRatio;\n    const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;\n    if (word->uch_set->script_has_xheight() &&\n        small_cap_xheight - small_cap_delta <= word_x_height &&\n        word_x_height <= small_cap_xheight + small_cap_delta) {\n      // Scan for upper/lower.\n      int num_upper = 0;\n      int num_lower = 0;\n      for (unsigned i = 0; i < word->best_choice->length(); ++i) {\n        if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {\n          ++num_upper;\n        } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {\n          ++num_lower;\n        }\n      }\n      if (num_upper > 0 && num_lower == 0) {\n        word->small_caps = true;\n      }\n    }\n    word->SetScriptPositions();\n  }\n}\n\n// Helper finds the gap between the index word and the next.\nstatic void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {\n  *right = -INT32_MAX;\n  *next_left = INT32_MAX;\n  if (index < words.size()) {\n    *right = words[index]->word->bounding_box().right();\n    if (index + 1 < words.size()) {\n      *next_left = words[index + 1]->word->bounding_box().left();\n    }\n  }\n}\n\n// Factored helper computes the rating, certainty, badness and validity of\n// the permuter of the words in [first_index, end_index).\nstatic void EvaluateWordSpan(const PointerVector<WERD_RES> 
&words, unsigned first_index, unsigned end_index,\n                             float *rating, float *certainty, bool *bad, bool *valid_permuter) {\n  if (end_index <= first_index) {\n    *bad = true;\n    *valid_permuter = false;\n  }\n  for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {\n    WERD_CHOICE *choice = words[index]->best_choice;\n    if (choice == nullptr) {\n      *bad = true;\n    } else {\n      *rating += choice->rating();\n      *certainty = std::min(*certainty, choice->certainty());\n      if (!Dict::valid_word_permuter(choice->permuter(), false)) {\n        *valid_permuter = false;\n      }\n    }\n  }\n}\n\n// Helper chooses the best combination of words, transferring good ones from\n// new_words to best_words. To win, a new word must have (better rating and\n// certainty) or (better permuter status and rating within rating ratio and\n// certainty within certainty margin) than current best.\n// All the new_words are consumed (moved to best_words or deleted.)\n// The return value is the number of new_words used minus the number of\n// best_words that remain in the output.\nstatic int SelectBestWords(double rating_ratio, double certainty_margin, bool debug,\n                           PointerVector<WERD_RES> *new_words,\n                           PointerVector<WERD_RES> *best_words) {\n  // Process the smallest groups of words that have an overlapping word\n  // boundary at the end.\n  std::vector<WERD_RES *> out_words;\n  // Index into each word vector (best, new).\n  unsigned b = 0, n = 0;\n  int num_best = 0, num_new = 0;\n  while (b < best_words->size() || n < new_words->size()) {\n    // Start of the current run in each.\n    auto start_b = b, start_n = n;\n    while (b < best_words->size() || n < new_words->size()) {\n      int b_right = -INT32_MAX;\n      int next_b_left = INT32_MAX;\n      WordGap(*best_words, b, &b_right, &next_b_left);\n      int n_right = -INT32_MAX;\n      int next_n_left = 
INT32_MAX;\n      WordGap(*new_words, n, &n_right, &next_n_left);\n      if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {\n        // The word breaks overlap. [start_b,b] and [start_n, n] match.\n        break;\n      }\n      // Keep searching for the matching word break.\n      if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {\n        ++b;\n      } else {\n        ++n;\n      }\n    }\n    // Rating of the current run in each.\n    float b_rating = 0.0f, n_rating = 0.0f;\n    // Certainty of the current run in each.\n    float b_certainty = 0.0f, n_certainty = 0.0f;\n    // True if any word is missing its best choice.\n    bool b_bad = false, n_bad = false;\n    // True if all words have a valid permuter.\n    bool b_valid_permuter = true, n_valid_permuter = true;\n    const int end_b = b < best_words->size() ? b + 1 : b;\n    const int end_n = n < new_words->size() ? n + 1 : n;\n    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,\n                     &b_valid_permuter);\n    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,\n                     &n_valid_permuter);\n    bool new_better = false;\n    if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||\n                   (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&\n                    n_certainty > b_certainty - certainty_margin))) {\n      // New is better.\n      for (int i = start_n; i < end_n; ++i) {\n        out_words.push_back((*new_words)[i]);\n        (*new_words)[i] = nullptr;\n        ++num_new;\n      }\n      new_better = true;\n    } else if (!b_bad) {\n      // Current best is better.\n      for (int i = start_b; i < end_b; ++i) {\n        out_words.push_back((*best_words)[i]);\n        (*best_words)[i] = nullptr;\n        ++num_best;\n      }\n    }\n    if (debug) {\n      tprintf(\n          \"%d new words %s than %d old 
words: r: %g v %g c: %g v %g\"\n          \" valid dict: %d v %d\\n\",\n          end_n - start_n, new_better ? \"better\" : \"worse\", end_b - start_b, n_rating, b_rating,\n          n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);\n    }\n    // Move on to the next group.\n    b = end_b;\n    n = end_n;\n  }\n  // Transfer from out_words to best_words.\n  best_words->clear();\n  for (auto &out_word : out_words) {\n    best_words->push_back(out_word);\n  }\n  return num_new - num_best;\n}\n\n// Helper to recognize the word using the given (language-specific) tesseract.\n// Returns positive if this recognizer found more new best words than the\n// number kept from best_words.\nint Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,\n                                 WERD_RES **in_word, PointerVector<WERD_RES> *best_words) {\n  if (debug) {\n    tprintf(\"Trying word using lang %s, oem %d\\n\", lang.c_str(),\n            static_cast<int>(tessedit_ocr_engine_mode));\n  }\n  // Run the recognizer on the word.\n  PointerVector<WERD_RES> new_words;\n  (this->*recognizer)(word_data, in_word, &new_words);\n  if (new_words.empty()) {\n    // Transfer input word to new_words, as the classifier must have put\n    // the result back in the input.\n    new_words.push_back(*in_word);\n    *in_word = nullptr;\n  }\n  if (debug) {\n    for (unsigned i = 0; i < new_words.size(); ++i) {\n      new_words[i]->DebugTopChoice(\"Lang result\");\n    }\n  }\n  // Initial version is a bit of a hack based on better certainty and rating\n  // or a dictionary vs non-dictionary word.\n  return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,\n                         &new_words, best_words);\n}\n\n// Helper returns true if all the words are acceptable.\nstatic bool WordsAcceptable(const PointerVector<WERD_RES> &words) {\n  for (unsigned w = 0; w < words.size(); ++w) {\n    if (words[w]->tess_failed || 
!words[w]->tess_accepted) {\n      return false;\n    }\n  }\n  return true;\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n// Moves good-looking \"noise\"/diacritics from the reject list to the main\n// blob list on the current word. Returns true if anything was done, and\n// sets make_next_word_fuzzy if blob(s) were added to the end of the word.\nbool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) {\n  *make_next_word_fuzzy = false;\n  WERD *real_word = pr_it->word()->word;\n  if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||\n      real_word->rej_cblob_list()->length() > noise_maxperword) {\n    return false;\n  }\n  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);\n  // Get the noise outlines into a vector with matching bool map.\n  std::vector<C_OUTLINE *> outlines;\n  real_word->GetNoiseOutlines(&outlines);\n  std::vector<bool> word_wanted;\n  std::vector<bool> overlapped_any_blob;\n  std::vector<C_BLOB *> target_blobs;\n  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,\n                                     &overlapped_any_blob, &target_blobs);\n  // Filter the outlines that overlapped any blob and put them into the word\n  // now. 
This simplifies the remaining task and also makes it more accurate\n  // as it has more completed blobs to work on.\n  std::vector<bool> wanted;\n  std::vector<C_BLOB *> wanted_blobs;\n  std::vector<C_OUTLINE *> wanted_outlines;\n  int num_overlapped = 0;\n  int num_overlapped_used = 0;\n  for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {\n    if (overlapped_any_blob[i]) {\n      ++num_overlapped;\n      if (word_wanted[i]) {\n        ++num_overlapped_used;\n      }\n      wanted.push_back(word_wanted[i]);\n      wanted_blobs.push_back(target_blobs[i]);\n      wanted_outlines.push_back(outlines[i]);\n      outlines[i] = nullptr;\n    }\n  }\n  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);\n  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);\n  // TODO: check code.\n  int non_overlapped = 0;\n  int non_overlapped_used = 0;\n  for (unsigned i = 0; i < word_wanted.size(); ++i) {\n    if (word_wanted[i]) {\n      ++non_overlapped_used;\n    }\n    if (outlines[i] != nullptr) {\n      ++non_overlapped_used;\n    }\n  }\n  if (debug_noise_removal) {\n    tprintf(\"Used %d/%d overlapped %d/%d non-overlapped diacritics on word:\", num_overlapped_used,\n            num_overlapped, non_overlapped_used, non_overlapped);\n    real_word->bounding_box().print();\n  }\n  // Now we have decided which outlines we want, put them into the real_word.\n  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {\n    pr_it->MakeCurrentWordFuzzy();\n  }\n  // TODO(rays) Parts of combos have a deep copy of the real word, and need\n  // to have their noise outlines moved/assigned in the same way!!\n  return num_overlapped_used != 0 || non_overlapped_used != 0;\n}\n\n// Attempts to put noise/diacritic outlines into the blobs that they overlap.\n// Input: a set of noisy outlines that probably belong to the real_word.\n// Output: word_wanted indicates which outlines are 
to be assigned to a blob,\n//   target_blobs indicates which to assign to, and overlapped_any_blob is\n//   true for all outlines that overlapped a blob.\nvoid Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,\n                                                   int pass, WERD *real_word, PAGE_RES_IT *pr_it,\n                                                   std::vector<bool> *word_wanted,\n                                                   std::vector<bool> *overlapped_any_blob,\n                                                   std::vector<C_BLOB *> *target_blobs) {\n  std::vector<bool> blob_wanted;\n  word_wanted->clear();\n  word_wanted->resize(outlines.size());\n  overlapped_any_blob->clear();\n  overlapped_any_blob->resize(outlines.size());\n  target_blobs->clear();\n  target_blobs->resize(outlines.size());\n  // For each real blob, find the outlines that seriously overlap it.\n  // A single blob could be several merged characters, so there can be quite\n  // a few outlines overlapping, and the full engine needs to be used to chop\n  // and join to get a sensible result.\n  C_BLOB_IT blob_it(real_word->cblob_list());\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    C_BLOB *blob = blob_it.data();\n    const TBOX blob_box = blob->bounding_box();\n    blob_wanted.clear();\n    blob_wanted.resize(outlines.size());\n    int num_blob_outlines = 0;\n    for (unsigned i = 0; i < outlines.size(); ++i) {\n      if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {\n        blob_wanted[i] = true;\n        (*overlapped_any_blob)[i] = true;\n        ++num_blob_outlines;\n      }\n    }\n    if (debug_noise_removal) {\n      tprintf(\"%d noise outlines overlap blob at:\", num_blob_outlines);\n      blob_box.print();\n    }\n    // If any outlines overlap the blob, and not too many, classify the blob\n    // (using the full engine, languages and all), and choose the 
maximal\n    // combination of outlines that doesn't hurt the end-result classification\n    // by too much. Mark them as wanted.\n    if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {\n      if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,\n                                      num_blob_outlines, &blob_wanted)) {\n        for (unsigned i = 0; i < blob_wanted.size(); ++i) {\n          if (blob_wanted[i]) {\n            // Claim the outline and record where it is going.\n            (*word_wanted)[i] = true;\n            (*target_blobs)[i] = blob;\n          }\n        }\n      }\n    }\n  }\n}\n\n// Attempts to assign non-overlapping outlines to their nearest blobs or\n// make new blobs out of them.\nvoid Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,\n                                           WERD *real_word, PAGE_RES_IT *pr_it,\n                                           std::vector<bool> *word_wanted,\n                                           std::vector<C_BLOB *> *target_blobs) {\n  std::vector<bool> blob_wanted;\n  word_wanted->clear();\n  word_wanted->resize(outlines.size());\n  target_blobs->clear();\n  target_blobs->resize(outlines.size());\n  // Check for outlines that need to be turned into stand-alone blobs.\n  for (unsigned i = 0; i < outlines.size(); ++i) {\n    if (outlines[i] == nullptr) {\n      continue;\n    }\n    // Get a set of adjacent outlines that don't overlap any existing blob.\n    blob_wanted.clear();\n    blob_wanted.resize(outlines.size());\n    int num_blob_outlines = 0;\n    TBOX total_ol_box(outlines[i]->bounding_box());\n    while (i < outlines.size() && outlines[i] != nullptr) {\n      blob_wanted[i] = true;\n      total_ol_box += outlines[i]->bounding_box();\n      ++i;\n      ++num_blob_outlines;\n    }\n    // Find the insertion point.\n    C_BLOB_IT blob_it(real_word->cblob_list());\n    while (!blob_it.at_last() &&\n           
blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {\n      blob_it.forward();\n    }\n    // Choose which combination of them we actually want and where to put\n    // them.\n    if (debug_noise_removal) {\n      tprintf(\"Num blobless outlines = %d\\n\", num_blob_outlines);\n    }\n    C_BLOB *left_blob = blob_it.data();\n    TBOX left_box = left_blob->bounding_box();\n    C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);\n    if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||\n         !right_blob->bounding_box().x_overlap(total_ol_box)) &&\n        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,\n                                    num_blob_outlines, &blob_wanted)) {\n      if (debug_noise_removal) {\n        tprintf(\"Added to left blob\\n\");\n      }\n      for (unsigned j = 0; j < blob_wanted.size(); ++j) {\n        if (blob_wanted[j]) {\n          (*word_wanted)[j] = true;\n          (*target_blobs)[j] = left_blob;\n        }\n      }\n    } else if (right_blob != nullptr &&\n               (!left_box.x_overlap(total_ol_box) ||\n                right_blob->bounding_box().x_overlap(total_ol_box)) &&\n               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,\n                                           num_blob_outlines, &blob_wanted)) {\n      if (debug_noise_removal) {\n        tprintf(\"Added to right blob\\n\");\n      }\n      for (unsigned j = 0; j < blob_wanted.size(); ++j) {\n        if (blob_wanted[j]) {\n          (*word_wanted)[j] = true;\n          (*target_blobs)[j] = right_blob;\n        }\n      }\n    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,\n                                           num_blob_outlines, &blob_wanted)) {\n      if (debug_noise_removal) {\n        tprintf(\"Fitted between blobs\\n\");\n      }\n      for (unsigned j = 0; j < blob_wanted.size(); 
++j) {\n        if (blob_wanted[j]) {\n          (*word_wanted)[j] = true;\n          (*target_blobs)[j] = nullptr;\n        }\n      }\n    }\n  }\n}\n\n// Starting with ok_outlines set to indicate which outlines overlap the blob,\n// chooses the optimal set (approximately) and returns true if any outlines\n// are desired, in which case ok_outlines indicates which ones.\nbool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,\n                                            C_BLOB *blob,\n                                            const std::vector<C_OUTLINE *> &outlines,\n                                            int num_outlines, std::vector<bool> *ok_outlines) {\n  float target_cert = certainty_threshold;\n  if (blob != nullptr) {\n    std::string best_str;\n    float target_c2;\n    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);\n    if (debug_noise_removal) {\n      tprintf(\"No Noise blob classified as %s=%g(%g) at:\", best_str.c_str(), target_cert,\n              target_c2);\n      blob->bounding_box().print();\n    }\n    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;\n  }\n  std::vector<bool> test_outlines = *ok_outlines;\n  // Start with all the outlines in.\n  std::string all_str;\n  std::vector<bool> best_outlines = *ok_outlines;\n  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);\n  if (debug_noise_removal) {\n    TBOX ol_box;\n    for (unsigned i = 0; i < test_outlines.size(); ++i) {\n      if (test_outlines[i]) {\n        ol_box += outlines[i]->bounding_box();\n      }\n    }\n    tprintf(\"All Noise blob classified as %s=%g, delta=%g at:\", all_str.c_str(), best_cert,\n            best_cert - target_cert);\n    ol_box.print();\n  }\n  // Iteratively zero out the bit that improves the certainty the most, until\n  // we get past the threshold, have zero bits, or fail to improve.\n  int best_index = 0; // To 
zero out.\n  while (num_outlines > 1 && best_index >= 0 &&\n         (blob == nullptr || best_cert < target_cert || blob != nullptr)) {\n    // Find the best bit to zero out.\n    best_index = -1;\n    for (unsigned i = 0; i < outlines.size(); ++i) {\n      if (test_outlines[i]) {\n        test_outlines[i] = false;\n        std::string str;\n        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);\n        if (debug_noise_removal) {\n          TBOX ol_box;\n          for (unsigned j = 0; j < outlines.size(); ++j) {\n            if (test_outlines[j]) {\n              ol_box += outlines[j]->bounding_box();\n            }\n            tprintf(\"%c\", test_outlines[j] ? 'T' : 'F');\n          }\n          tprintf(\" blob classified as %s=%g, delta=%g) at:\", str.c_str(), cert,\n                  cert - target_cert);\n          ol_box.print();\n        }\n        if (cert > best_cert) {\n          best_cert = cert;\n          best_index = i;\n          best_outlines = test_outlines;\n        }\n        test_outlines[i] = true;\n      }\n    }\n    if (best_index >= 0) {\n      test_outlines[best_index] = false;\n      --num_outlines;\n    }\n  }\n  if (best_cert >= target_cert) {\n    // Save the best combination.\n    *ok_outlines = best_outlines;\n    if (debug_noise_removal) {\n      tprintf(\"%s noise combination \", blob ? \"Adding\" : \"New\");\n      for (auto &&best_outline : best_outlines) {\n        tprintf(\"%c\", best_outline ? 
'T' : 'F');\n      }\n      tprintf(\" yields certainty %g, beating target of %g\\n\", best_cert, target_cert);\n    }\n    return true;\n  }\n\n  return false;\n}\n\n// Classifies the given blob plus the outlines flagged by ok_outlines, undoes\n// the inclusion of the outlines, and returns the certainty of the raw choice.\nfloat Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,\n                                          const std::vector<C_OUTLINE *> &outlines, int pass_n,\n                                          PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {\n  C_OUTLINE_IT ol_it;\n  C_OUTLINE *first_to_keep = nullptr;\n  C_BLOB *local_blob = nullptr;\n  if (blob != nullptr) {\n    // Add the required outlines to the blob.\n    ol_it.set_to_list(blob->out_list());\n    first_to_keep = ol_it.data();\n  }\n  for (unsigned i = 0; i < ok_outlines.size(); ++i) {\n    if (ok_outlines[i]) {\n      // This outline is to be added.\n      if (blob == nullptr) {\n        local_blob = new C_BLOB(outlines[i]);\n        blob = local_blob;\n        ol_it.set_to_list(blob->out_list());\n      } else {\n        ol_it.add_before_stay_put(outlines[i]);\n      }\n    }\n  }\n  float c2;\n  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);\n  ol_it.move_to_first();\n  if (first_to_keep == nullptr) {\n    // We created blob. 
Empty its outlines and delete it.\n    for (; !ol_it.empty(); ol_it.forward()) {\n      ol_it.extract();\n    }\n    delete local_blob;\n    cert = -c2;\n  } else {\n    // Remove the outlines that we put in.\n    for (; ol_it.data() != first_to_keep; ol_it.forward()) {\n      ol_it.extract();\n    }\n  }\n  return cert;\n}\n\n// Classifies the given blob (part of word_data->word->word) as an individual\n// word, using languages, chopper etc, returning only the certainty of the\n// best raw choice, and undoing all the work done to fake out the word.\nfloat Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,\n                                    float *c2) {\n  WERD *real_word = pr_it->word()->word;\n  WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),\n                                                  C_BLOB::deep_copy(blob));\n  WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);\n  // Get a new iterator that points to the new word.\n  PAGE_RES_IT it(pr_it->page_res);\n  while (it.word() != word_res && it.word() != nullptr) {\n    it.forward();\n  }\n  ASSERT_HOST(it.word() == word_res);\n  WordData wd(it);\n  // Force full initialization.\n  SetupWordPassN(1, &wd);\n  classify_word_and_language(pass_n, &it, &wd);\n  if (debug_noise_removal) {\n    if (wd.word->raw_choice != nullptr) {\n      tprintf(\"word xheight=%g, row=%g, range=[%g,%g]\\n\", word_res->x_height, wd.row->x_height(),\n              wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());\n    } else {\n      tprintf(\"Got word with null raw choice xheight=%g, row=%g\\n\", word_res->x_height,\n              wd.row->x_height());\n    }\n  }\n  float cert = 0.0f;\n  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...\n    cert = wd.word->raw_choice->certainty();\n    float rat = wd.word->raw_choice->rating();\n    *c2 = rat > 0.0f ? 
cert * cert / rat : 0.0f;\n    best_str = wd.word->raw_choice->unichar_string();\n  } else {\n    *c2 = 0.0f;\n    best_str.clear();\n  }\n  it.DeleteCurrentWord();\n  pr_it->ResetWordIterator();\n  return cert;\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n// Generic function for classifying a word. Can be used either for pass1 or\n// pass2 according to the function passed to recognizer.\n// word_data holds the word to be recognized, and its block and row, and\n// pr_it points to the word as well, in case we are running LSTM and it wants\n// to output multiple words.\n// Recognizes in the current language, and if successful that is all.\n// If recognition was not successful, tries all available languages until\n// it gets a successful result or runs out of languages. Keeps the best result.\nvoid Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) {\n#ifdef DISABLED_LEGACY_ENGINE\n  WordRecognizer recognizer = &Tesseract::classify_word_pass1;\n#else\n  WordRecognizer recognizer =\n      pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;\n#endif // def DISABLED_LEGACY_ENGINE\n\n  // Best result so far.\n  PointerVector<WERD_RES> best_words;\n  // Points to the best result. May be word or in lang_words.\n  const WERD_RES *word = word_data->word;\n  clock_t total_time = 0;\n  const bool timing_debug = tessedit_timing_debug;\n  if (timing_debug) {\n    total_time = clock();\n  }\n  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;\n  if (debug) {\n    tprintf(\"%s word with lang %s at:\", word->done ? 
\"Already done\" : \"Processing\",\n            most_recently_used_->lang.c_str());\n    word->word->bounding_box().print();\n  }\n  if (word->done) {\n    // If done on pass1, leave it as-is.\n    if (!word->tess_failed) {\n      most_recently_used_ = word->tesseract;\n    }\n    return;\n  }\n  auto sub = sub_langs_.size();\n  if (most_recently_used_ != this) {\n    // Get the index of the most_recently_used_.\n    for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {\n    }\n  }\n  most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],\n                                         &best_words);\n  Tesseract *best_lang_tess = most_recently_used_;\n  if (!WordsAcceptable(best_words)) {\n    // Try all the other languages to see if they are any better.\n    if (most_recently_used_ != this &&\n        this->RetryWithLanguage(*word_data, recognizer, debug,\n                                &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {\n      best_lang_tess = this;\n    }\n    for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {\n      if (most_recently_used_ != sub_langs_[i] &&\n          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],\n                                           &best_words) > 0) {\n        best_lang_tess = sub_langs_[i];\n      }\n    }\n  }\n  most_recently_used_ = best_lang_tess;\n  if (!best_words.empty()) {\n    if (best_words.size() == 1 && !best_words[0]->combination) {\n      // Move the best single result to the main word.\n      word_data->word->ConsumeWordResults(best_words[0]);\n    } else {\n      // Words came from LSTM, and must be moved to the PAGE_RES properly.\n      word_data->word = best_words.back();\n      pr_it->ReplaceCurrentWord(&best_words);\n    }\n    ASSERT_HOST(word_data->word->box_word != nullptr);\n  } else {\n    tprintf(\"no best words!!\\n\");\n  }\n  if 
(timing_debug) {\n    total_time = clock() - total_time;\n    tesserr << word_data->word->best_choice->unichar_string()\n            << \" (ocr took \" << 1000 * total_time / CLOCKS_PER_SEC << \" ms)\\n\";\n  }\n}\n\n/**\n * classify_word_pass1\n *\n * Baseline normalize the word and pass it to Tess.\n */\n\nvoid Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word,\n                                    PointerVector<WERD_RES> *out_words) {\n  ROW *row = word_data.row;\n  BLOCK *block = word_data.block;\n  prev_word_best_choice_ =\n      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;\n#ifdef DISABLED_LEGACY_ENGINE\n  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {\n#else\n  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||\n      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {\n#endif // def DISABLED_LEGACY_ENGINE\n    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {\n      LSTMRecognizeWord(*block, row, *in_word, out_words);\n      if (!out_words->empty()) {\n        return; // Successful lstm recognition.\n      }\n    }\n    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {\n      // No fallback allowed, so use a fake.\n      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());\n      return;\n    }\n\n#ifndef DISABLED_LEGACY_ENGINE\n    // Fall back to tesseract for failed words or odd words.\n    (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,\n                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,\n                                    poly_allow_detailed_fx, row, block);\n#endif // ndef DISABLED_LEGACY_ENGINE\n  }\n\n#ifndef DISABLED_LEGACY_ENGINE\n  WERD_RES *word = *in_word;\n  match_word_pass_n(1, word, row, block);\n  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {\n    word->tess_would_adapt = AdaptableWord(word);\n    bool adapt_ok = word_adaptable(word, 
tessedit_tess_adaption_mode);\n\n    if (adapt_ok) {\n      // Send word to adaptive classifier for training.\n      word->BestChoiceToCorrectText();\n      LearnWord(nullptr, word);\n      // Mark misadaptions if running blamer.\n      if (word->blamer_bundle != nullptr) {\n        word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);\n      }\n    }\n\n    if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {\n      tess_add_doc_word(word->best_choice);\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n}\n\n// Helper to report the result of the xheight fix.\nvoid Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word,\n                                   WERD_RES *new_word) {\n  tprintf(\"New XHT Match:%s = %s \", word->best_choice->unichar_string().c_str(),\n          word->best_choice->debug_string().c_str());\n  word->reject_map.print(debug_fp);\n  tprintf(\" -> %s = %s \", new_word->best_choice->unichar_string().c_str(),\n          new_word->best_choice->debug_string().c_str());\n  new_word->reject_map.print(debug_fp);\n  tprintf(\" %s->%s %s %s\\n\", word->guessed_x_ht ? \"GUESS\" : \"CERT\",\n          new_word->guessed_x_ht ? \"GUESS\" : \"CERT\", new_x_ht > 0.1 ? \"STILL DOUBT\" : \"OK\",\n          accept_new_word ? 
\"ACCEPTED\" : \"\");\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n// Run the x-height fix-up, based on min/max top/bottom information in\n// unicharset.\n// Returns true if the word was changed.\n// See the comment in fixxht.cpp for a description of the overall process.\nbool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) {\n  int original_misfits = CountMisfitTops(word);\n  if (original_misfits == 0) {\n    return false;\n  }\n  float baseline_shift = 0.0f;\n  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);\n  if (baseline_shift != 0.0f) {\n    // Try the shift on its own first.\n    if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {\n      return false;\n    }\n    original_misfits = CountMisfitTops(word);\n    if (original_misfits > 0) {\n      float new_baseline_shift;\n      // Now recompute the new x_height.\n      new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);\n      if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {\n        // No test of return value here, as we are definitely making a change\n        // to the word by shifting the baseline.\n        TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);\n      }\n    }\n    return true;\n  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {\n    return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);\n  } else {\n    return false;\n  }\n}\n\n// Runs recognition with the test baseline shift and x-height and returns true\n// if there was an improvement in recognition result.\nbool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,\n                                     WERD_RES *word, BLOCK *block, ROW *row) {\n  bool accept_new_x_ht = false;\n  WERD_RES new_x_ht_word(word->word);\n  if (word->blamer_bundle != nullptr) {\n    new_x_ht_word.blamer_bundle = new BlamerBundle();\n    
new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));\n  }\n  new_x_ht_word.x_height = new_x_ht;\n  new_x_ht_word.baseline_shift = baseline_shift;\n  new_x_ht_word.caps_height = 0.0;\n  new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,\n                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,\n                                    poly_allow_detailed_fx, row, block);\n  match_word_pass_n(2, &new_x_ht_word, row, block);\n  if (!new_x_ht_word.tess_failed) {\n    int new_misfits = CountMisfitTops(&new_x_ht_word);\n    if (debug_x_ht_level >= 1) {\n      tprintf(\"Old misfits=%d with x-height %f, new=%d with x-height %f\\n\", original_misfits,\n              word->x_height, new_misfits, new_x_ht);\n      tprintf(\"Old rating= %f, certainty=%f, new=%f, %f\\n\", word->best_choice->rating(),\n              word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),\n              new_x_ht_word.best_choice->certainty());\n    }\n    // The misfits must improve and either the rating or certainty.\n    accept_new_x_ht = new_misfits < original_misfits &&\n                      (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||\n                       new_x_ht_word.best_choice->rating() < word->best_choice->rating());\n    if (debug_x_ht_level >= 1) {\n      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);\n    }\n  }\n  if (accept_new_x_ht) {\n    word->ConsumeWordResults(&new_x_ht_word);\n    return true;\n  }\n  return false;\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n/**\n * classify_word_pass2\n *\n * Control what to do with the word in pass 2\n */\n\nvoid Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word,\n                                    PointerVector<WERD_RES> *out_words) {\n  // Return if we do not want to run Tesseract.\n  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {\n    return;\n  
}\n#ifndef DISABLED_LEGACY_ENGINE\n  ROW *row = word_data.row;\n  BLOCK *block = word_data.block;\n  WERD_RES *word = *in_word;\n  prev_word_best_choice_ =\n      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;\n\n  check_debug_pt(word, 30);\n  if (!word->done) {\n    word->caps_height = 0.0;\n    if (word->x_height == 0.0f) {\n      word->x_height = row->x_height();\n    }\n    match_word_pass_n(2, word, row, block);\n    check_debug_pt(word, 40);\n  }\n\n  SubAndSuperscriptFix(word);\n\n  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {\n    if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&\n        block->classify_rotation().y() == 0.0f) {\n      // Use the tops and bottoms since they are available.\n      TrainedXheightFix(word, block, row);\n    }\n  }\n#  ifndef GRAPHICS_DISABLED\n  if (tessedit_display_outwords) {\n    if (fx_win == nullptr) {\n      create_fx_win();\n    }\n    clear_fx_win();\n    word->rebuild_word->plot(fx_win);\n    TBOX wbox = word->rebuild_word->bounding_box();\n    fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());\n    ScrollView::Update();\n  }\n#  endif\n  check_debug_pt(word, 50);\n#endif // ndef DISABLED_LEGACY_ENGINE\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n/**\n * match_word_pass2\n *\n * Baseline normalize the word and pass it to Tess.\n */\nvoid Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {\n  if (word->tess_failed) {\n    return;\n  }\n  tess_segment_pass_n(pass_n, word);\n\n  if (!word->tess_failed) {\n    if (!word->word->flag(W_REP_CHAR)) {\n      word->fix_quotes();\n      if (tessedit_fix_hyphens) {\n        word->fix_hyphens();\n      }\n      /* Don't trust fix_quotes! 
- though I think I've fixed the bug */\n      if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {\n        tprintf(\n            \"POST FIX_QUOTES FAIL String:\\\"%s\\\"; Strlen=%d;\"\n            \" #Blobs=%u\\n\",\n            word->best_choice->debug_string().c_str(), word->best_choice->length(),\n            word->box_word->length());\n      }\n      word->tess_accepted = tess_acceptable_word(word);\n\n      // Also sets word->done flag\n      make_reject_map(word, row, pass_n);\n    }\n  }\n  set_word_fonts(word);\n\n  ASSERT_HOST(word->raw_choice != nullptr);\n}\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n// Helper to return the best rated BLOB_CHOICE in the whole word that matches\n// the given char_id, or nullptr if none can be found.\nstatic BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {\n  // Find the corresponding best BLOB_CHOICE from any position in the word_res.\n  BLOB_CHOICE *best_choice = nullptr;\n  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {\n    BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));\n    if (choice != nullptr) {\n      if (best_choice == nullptr || choice->rating() < best_choice->rating()) {\n        best_choice = choice;\n      }\n    }\n  }\n  return best_choice;\n}\n\n// Helper to insert blob_choice in each location in the leader word if there is\n// no matching BLOB_CHOICE there already, and correct any incorrect results\n// in the best_choice.\nstatic void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {\n  WERD_CHOICE *word = word_res->best_choice;\n  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {\n    BLOB_CHOICE *choice =\n        FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));\n    if (choice == nullptr) {\n      BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));\n      choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));\n    }\n  
}\n  // Correct any incorrect results in word.\n  for (unsigned i = 0; i < word->length(); ++i) {\n    if (word->unichar_id(i) != blob_choice->unichar_id()) {\n      word->set_unichar_id(blob_choice->unichar_id(), i);\n    }\n  }\n}\n\n/**\n * fix_rep_char()\n * The word is a repeated char. (Leader.) Find the repeated char character.\n * Create the appropriate single-word or multi-word sequence according to\n * the size of spaces in between blobs, and correct the classifications\n * where some of the characters disagree with the majority.\n */\nvoid Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {\n  WERD_RES *word_res = page_res_it->word();\n  const WERD_CHOICE &word = *(word_res->best_choice);\n\n  // Find the frequency of each unique character in the word.\n  SortHelper<UNICHAR_ID> rep_ch(word.length());\n  for (unsigned i = 0; i < word.length(); ++i) {\n    rep_ch.Add(word.unichar_id(i), 1);\n  }\n\n  // Find the most frequent result.\n  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char\n  int max_count = rep_ch.MaxCount(&maxch_id);\n  // Find the best exemplar of a classifier result for maxch_id.\n  BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);\n  if (best_choice == nullptr) {\n    tprintf(\"Failed to find a choice for %s, occurring %d times\\n\",\n            word_res->uch_set->debug_str(maxch_id).c_str(), max_count);\n    return;\n  }\n  word_res->done = true;\n\n  // Just correct existing classification.\n  CorrectRepcharChoices(best_choice, word_res);\n  word_res->reject_map.initialise(word.length());\n}\n\nACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s,\n                                                       const char *lengths) {\n  int i = 0;\n  int offset = 0;\n  int leading_punct_count;\n  int upper_count = 0;\n  int hyphen_pos = -1;\n  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;\n\n  if (strlen(lengths) > 20) {\n    return word_type;\n  }\n\n  /* Single 
Leading punctuation char*/\n\n  if (s[offset] != '\\0' && chs_leading_punct.contains(s[offset])) {\n    offset += lengths[i++];\n  }\n  leading_punct_count = i;\n\n  /* Initial cap */\n  while (s[offset] != '\\0' && char_set.get_isupper(s + offset, lengths[i])) {\n    offset += lengths[i++];\n    upper_count++;\n  }\n  if (upper_count > 1) {\n    word_type = AC_UPPER_CASE;\n  } else {\n    /* Lower case word, possibly with an initial cap */\n    while (s[offset] != '\\0' && char_set.get_islower(s + offset, lengths[i])) {\n      offset += lengths[i++];\n    }\n    if (i - leading_punct_count < quality_min_initial_alphas_reqd) {\n      goto not_a_word;\n    }\n    /*\nAllow a single hyphen in a lower case word\n- don't trust upper case - I've seen several cases of \"H\" -> \"I-I\"\n*/\n    if (lengths[i] == 1 && s[offset] == '-') {\n      hyphen_pos = i;\n      offset += lengths[i++];\n      if (s[offset] != '\\0') {\n        while ((s[offset] != '\\0') && char_set.get_islower(s + offset, lengths[i])) {\n          offset += lengths[i++];\n        }\n        if (i < hyphen_pos + 3) {\n          goto not_a_word;\n        }\n      }\n    } else {\n      /* Allow \"'s\" in NON hyphenated lower case words */\n      if (lengths[i] == 1 && (s[offset] == '\\'') && lengths[i + 1] == 1 &&\n          (s[offset + lengths[i]] == 's')) {\n        offset += lengths[i++];\n        offset += lengths[i++];\n      }\n    }\n    if (upper_count > 0) {\n      word_type = AC_INITIAL_CAP;\n    } else {\n      word_type = AC_LOWER_CASE;\n    }\n  }\n\n  /* Up to two different, constrained trailing punctuation chars */\n  if (lengths[i] == 1 && s[offset] != '\\0' && chs_trailing_punct1.contains(s[offset])) {\n    offset += lengths[i++];\n  }\n  if (lengths[i] == 1 && s[offset] != '\\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&\n      chs_trailing_punct2.contains(s[offset])) {\n    offset += lengths[i++];\n  }\n\n  if (s[offset] != '\\0') {\n    word_type = AC_UNACCEPTABLE;\n  
}\n\nnot_a_word:\n\n  if (word_type == AC_UNACCEPTABLE) {\n    /* Look for abbreviation string */\n    i = 0;\n    offset = 0;\n    if (s[0] != '\\0' && char_set.get_isupper(s, lengths[0])) {\n      word_type = AC_UC_ABBREV;\n      while (s[offset] != '\\0' && char_set.get_isupper(s + offset, lengths[i]) &&\n             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {\n        offset += lengths[i++];\n        offset += lengths[i++];\n      }\n    } else if (s[0] != '\\0' && char_set.get_islower(s, lengths[0])) {\n      word_type = AC_LC_ABBREV;\n      while (s[offset] != '\\0' && char_set.get_islower(s + offset, lengths[i]) &&\n             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {\n        offset += lengths[i++];\n        offset += lengths[i++];\n      }\n    }\n    if (s[offset] != '\\0') {\n      word_type = AC_UNACCEPTABLE;\n    }\n  }\n\n  return word_type;\n}\n\nbool Tesseract::check_debug_pt(WERD_RES *word, int location) {\n  if (!test_pt) {\n    return false;\n  }\n\n  tessedit_rejection_debug.set_value(false);\n  debug_x_ht_level.set_value(0);\n\n  if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {\n    if (location < 0) {\n      return true; // For breakpoint use\n    }\n    bool show_map_detail = false;\n    tessedit_rejection_debug.set_value(true);\n    debug_x_ht_level.set_value(2);\n    tprintf(\"\\n\\nTESTWD::\");\n    switch (location) {\n      case 0:\n        tprintf(\"classify_word_pass1 start\\n\");\n        word->word->print();\n        break;\n      case 10:\n        tprintf(\"make_reject_map: initial map\");\n        break;\n      case 20:\n        tprintf(\"make_reject_map: after NN\");\n        break;\n      case 30:\n        tprintf(\"classify_word_pass2 - START\");\n        break;\n      case 40:\n        tprintf(\"classify_word_pass2 - Pre Xht\");\n        break;\n      case 50:\n        tprintf(\"classify_word_pass2 - END\");\n        show_map_detail = true;\n        break;\n      case 60:\n   
     tprintf(\"fixspace\");\n        break;\n      case 70:\n        tprintf(\"MM pass START\");\n        break;\n      case 80:\n        tprintf(\"MM pass END\");\n        break;\n      case 90:\n        tprintf(\"After Poor quality rejection\");\n        break;\n      case 100:\n        tprintf(\"unrej_good_quality_words - START\");\n        break;\n      case 110:\n        tprintf(\"unrej_good_quality_words - END\");\n        break;\n      case 120:\n        tprintf(\"Write results pass\");\n        show_map_detail = true;\n        break;\n    }\n    if (word->best_choice != nullptr) {\n      tprintf(\" \\\"%s\\\" \", word->best_choice->unichar_string().c_str());\n      word->reject_map.print(debug_fp);\n      tprintf(\"\\n\");\n      if (show_map_detail) {\n        tprintf(\"\\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n        for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\\0'; i++) {\n          tprintf(\"**** \\\"%c\\\" ****\\n\", word->best_choice->unichar_string()[i]);\n          word->reject_map[i].full_print(debug_fp);\n        }\n      }\n    } else {\n      tprintf(\"null best choice\\n\");\n    }\n    tprintf(\"Tess Accepted: %s\\n\", word->tess_accepted ? \"TRUE\" : \"FALSE\");\n    tprintf(\"Done flag: %s\\n\\n\", word->done ? \"TRUE\" : \"FALSE\");\n    return true;\n  } else {\n    return false;\n  }\n}\n\n/**\n * find_modal_font\n *\n * Find the modal font and remove from the stats.\n */\n#ifndef DISABLED_LEGACY_ENGINE\nstatic void find_modal_font( // good chars in word\n    STATS *fonts,            // font stats\n    int16_t *font_out,       // output font\n    int8_t *font_count       // output count\n) {\n  if (fonts->get_total() > 0) {\n    // font index\n    int16_t font = static_cast<int16_t>(fonts->mode());\n    *font_out = font;\n    // pile count\n    int32_t count = fonts->pile_count(font);\n    *font_count = count < INT8_MAX ? 
count : INT8_MAX;\n    fonts->add(font, -*font_count);\n  } else {\n    *font_out = -1;\n    *font_count = 0;\n  }\n}\n#endif // ! DISABLED_LEGACY_ENGINE\n\n/**\n * set_word_fonts\n *\n * Get the fonts for the word.\n */\nvoid Tesseract::set_word_fonts(WERD_RES *word) {\n  // Don't try to set the word fonts for an lstm word, as the configs\n  // will be meaningless.\n  if (word->chopped_word == nullptr) {\n    return;\n  }\n  ASSERT_HOST(word->best_choice != nullptr);\n\n#ifndef DISABLED_LEGACY_ENGINE\n  const int fontinfo_size = fontinfo_table_.size();\n  if (fontinfo_size == 0) {\n    return;\n  }\n  if (tessedit_font_id > 0) {\n    if (tessedit_font_id >= fontinfo_size) {\n      tprintf(\"Error, invalid font ID provided: must be below %d.\\n\"\n              \"Falling back to font auto-detection.\\n\", fontinfo_size);\n    } else {\n      word->fontinfo = &fontinfo_table_.at(tessedit_font_id);\n      word->fontinfo2 = nullptr;\n      word->fontinfo_id_count = INT8_MAX;\n      word->fontinfo_id2_count = 0;\n      return;\n    }\n  }\n  std::vector<int> font_total_score(fontinfo_size);\n\n  // Compute the font scores for the word\n  if (tessedit_debug_fonts) {\n    tprintf(\"Examining fonts in %s\\n\", word->best_choice->debug_string().c_str());\n  }\n  for (unsigned b = 0; b < word->best_choice->length(); ++b) {\n    const BLOB_CHOICE *choice = word->GetBlobChoice(b);\n    if (choice == nullptr) {\n      continue;\n    }\n    auto &fonts = choice->fonts();\n    for (auto &f : fonts) {\n      const int fontinfo_id = f.fontinfo_id;\n      if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {\n        font_total_score[fontinfo_id] += f.score;\n      }\n    }\n  }\n  // Find the top and 2nd choice for the word.\n  int score1 = 0, score2 = 0;\n  int16_t font_id1 = -1, font_id2 = -1;\n  for (int f = 0; f < fontinfo_size; ++f) {\n    if (tessedit_debug_fonts && font_total_score[f] > 0) {\n      tprintf(\"Font %s, total score = %d\\n\", fontinfo_table_.at(f).name, 
font_total_score[f]);\n    }\n    if (font_total_score[f] > score1) {\n      score2 = score1;\n      font_id2 = font_id1;\n      score1 = font_total_score[f];\n      font_id1 = f;\n    } else if (font_total_score[f] > score2) {\n      score2 = font_total_score[f];\n      font_id2 = f;\n    }\n  }\n  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;\n  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;\n  // Each score has a limit of UINT16_MAX, so divide by that to get the number\n  // of \"votes\" for that font, ie number of perfect scores.\n  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);\n  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);\n  if (score1 > 0) {\n    const FontInfo fi = fontinfo_table_.at(font_id1);\n    if (tessedit_debug_fonts) {\n      if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {\n        tprintf(\"Word modal font=%s, score=%d, 2nd choice %s/%d\\n\", fi.name,\n                word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,\n                word->fontinfo_id2_count);\n      } else {\n        tprintf(\"Word modal font=%s, score=%d. 
No 2nd choice\\n\", fi.name, word->fontinfo_id_count);\n      }\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n/**\n * font_recognition_pass\n *\n * Smooth the fonts for the document.\n */\nvoid Tesseract::font_recognition_pass(PAGE_RES *page_res) {\n  PAGE_RES_IT page_res_it(page_res);\n  WERD_RES *word;                       // current word\n  STATS doc_fonts(0, font_table_size_ - 1); // font counters\n\n  // Gather font id statistics.\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    word = page_res_it.word();\n    if (word->fontinfo != nullptr) {\n      doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);\n    }\n    if (word->fontinfo2 != nullptr) {\n      doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);\n    }\n  }\n  int16_t doc_font;      // modal font\n  int8_t doc_font_count; // modal font\n  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);\n  if (doc_font_count == 0) {\n    return;\n  }\n  // Get the modal font pointer.\n  const FontInfo *modal_font = nullptr;\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    word = page_res_it.word();\n    if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {\n      modal_font = word->fontinfo;\n      break;\n    }\n    if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {\n      modal_font = word->fontinfo2;\n      break;\n    }\n  }\n  ASSERT_HOST(modal_font != nullptr);\n\n  // Assign modal font to weak words.\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    word = page_res_it.word();\n    const int length = word->best_choice->length();\n\n    const int count = word->fontinfo_id_count;\n    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {\n      word->fontinfo = modal_font;\n      // Counts only get 1 as it came from the 
doc.\n      word->fontinfo_id_count = 1;\n    }\n  }\n}\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n// If a word has multiple alternates check if the best choice is in the\n// dictionary. If not, replace it with an alternate that exists in the\n// dictionary.\nvoid Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {\n  PAGE_RES_IT word_it(page_res);\n  for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {\n    if (word->best_choices.singleton()) {\n      continue; // There are no alternates.\n    }\n\n    const WERD_CHOICE *best = word->best_choice;\n    if (word->tesseract->getDict().valid_word(*best) != 0) {\n      continue; // The best choice is in the dictionary.\n    }\n\n    WERD_CHOICE_IT choice_it(&word->best_choices);\n    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {\n      WERD_CHOICE *alternate = choice_it.data();\n      if (word->tesseract->getDict().valid_word(*alternate)) {\n        // The alternate choice is in the dictionary.\n        if (tessedit_bigram_debug) {\n          tprintf(\"Dictionary correction replaces best choice '%s' with '%s'\\n\",\n                  best->unichar_string().c_str(), alternate->unichar_string().c_str());\n        }\n        // Replace the 'best' choice with a better choice.\n        word->ReplaceBestChoice(alternate);\n        break;\n      }\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/control.h",
    "content": "/**********************************************************************\n * File:        control.h  (Formerly control.h)\n * Description: Module-independent matcher controller.\n * Author:      Ray Smith\n * Created:     Thu Apr 23 11:09:58 BST 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n/**\n * @file control.h\n * Module-independent matcher controller.\n */\n\n#ifndef CONTROL_H\n#define CONTROL_H\n\nenum ACCEPTABLE_WERD_TYPE {\n  AC_UNACCEPTABLE, ///< Unacceptable word\n  AC_LOWER_CASE,   ///< ALL lower case\n  AC_UPPER_CASE,   ///< ALL upper case\n  AC_INITIAL_CAP,  ///< ALL but initial lc\n  AC_LC_ABBREV,    ///< a.b.c.\n  AC_UC_ABBREV     ///< A.B.C.\n};\n\n#endif\n"
  },
  {
    "path": "src/ccmain/docqual.cpp",
    "content": "/******************************************************************\n * File:        docqual.cpp  (Formerly docqual.c)\n * Description: Document Quality Metrics\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"docqual.h\"\n#include <cctype>\n#include \"reject.h\"\n#include \"tesseractclass.h\"\n#include \"tessvars.h\"\n\nnamespace tesseract {\n\nstatic void countMatchingBlobs(int16_t &match_count, int /*index*/) {\n  ++match_count;\n}\n\nstatic void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,\n                               int index) {\n  if (word->reject_map[index].accepted()) {\n    ++accepted_match_count;\n  }\n  ++match_count;\n}\n\nstatic void acceptIfGoodQuality(WERD_RES *word, int index) {\n  if (word->reject_map[index].accept_if_good_quality()) {\n    word->reject_map[index].setrej_quality_accept();\n  }\n}\n\n/*************************************************************************\n * word_blob_quality()\n * How many blobs in the box_word are identical to those of the inword?\n * ASSUME blobs in both initial word and box_word are in ascending order of\n * left hand blob edge.\n *************************************************************************/\nint16_t Tesseract::word_blob_quality(WERD_RES *word) {\n  int16_t 
match_count = 0;\n  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&\n      !word->rebuild_word->blobs.empty()) {\n    using namespace std::placeholders; // for _1\n    word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,\n                                         std::bind(countMatchingBlobs, std::ref(match_count), _1));\n  }\n  return match_count;\n}\n\nint16_t Tesseract::word_outline_errs(WERD_RES *word) {\n  int16_t err_count = 0;\n\n  if (word->rebuild_word != nullptr) {\n    int16_t i = 0;\n    for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {\n      TBLOB *blob = word->rebuild_word->blobs[b];\n      err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());\n      i++;\n    }\n  }\n  return err_count;\n}\n\n/*************************************************************************\n * word_char_quality()\n * Combination of blob quality and outline quality - how many good chars are\n * there? - I.e chars which pass the blob AND outline tests.\n *************************************************************************/\nvoid Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,\n                                  int16_t *accepted_match_count) {\n  *match_count = 0;\n  *accepted_match_count = 0;\n  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&\n      !word->rebuild_word->blobs.empty()) {\n    using namespace std::placeholders; // for _1\n    word->bln_boxes->ProcessMatchedBlobs(\n        *word->rebuild_word,\n        std::bind(countAcceptedBlobs, word, std::ref(*match_count), std::ref(*accepted_match_count), _1));\n  }\n}\n\n/*************************************************************************\n * unrej_good_chs()\n * Unreject POTENTIAL rejects if the blob passes the blob and outline checks\n *************************************************************************/\nvoid Tesseract::unrej_good_chs(WERD_RES *word) {\n  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr 
&&\n      !word->rebuild_word->blobs.empty()) {\n    using namespace std::placeholders; // for _1\n    word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,\n                                         std::bind(acceptIfGoodQuality, word, _1));\n  }\n}\n\nint16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {\n  int expected_outline_count;\n\n  if (outlines_odd.contains(c)) {\n    return 0; // Don't use this char\n  } else if (outlines_2.contains(c)) {\n    expected_outline_count = 2;\n  } else {\n    expected_outline_count = 1;\n  }\n  return abs(outline_count - expected_outline_count);\n}\n\nvoid Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {\n  if ((tessedit_good_quality_unrej && good_quality_doc)) {\n    unrej_good_quality_words(page_res_it);\n  }\n  doc_and_block_rejection(page_res_it, good_quality_doc);\n  if (unlv_tilde_crunching) {\n    tilde_crunch(page_res_it);\n    tilde_delete(page_res_it);\n  }\n}\n\n/*************************************************************************\n * unrej_good_quality_words()\n * Accept potential rejects in words which pass the following checks:\n *    - Contains a potential reject\n *    - Word looks like a sensible alpha word.\n *    - Word segmentation is the same as the original image\n *    - All characters have the expected number of outlines\n * NOTE - the rejection counts are recalculated after unrejection\n *      - CAN'T do it in a single pass without a bit of fiddling\n *    - keep it simple but inefficient\n *************************************************************************/\nvoid Tesseract::unrej_good_quality_words( // unreject potential\n    PAGE_RES_IT &page_res_it) {\n  WERD_RES *word;\n  ROW_RES *current_row;\n  BLOCK_RES *current_block;\n  int i;\n\n  page_res_it.restart_page();\n  while (page_res_it.word() != nullptr) {\n    check_debug_pt(page_res_it.word(), 100);\n    if (bland_unrej) {\n      word = page_res_it.word();\n      for (i = 0; i < 
word->reject_map.length(); i++) {\n        if (word->reject_map[i].accept_if_good_quality()) {\n          word->reject_map[i].setrej_quality_accept();\n        }\n      }\n      page_res_it.forward();\n    } else if ((page_res_it.row()->char_count > 0) &&\n               ((page_res_it.row()->rej_count /\n                 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {\n      word = page_res_it.word();\n      if (word->reject_map.quality_recoverable_rejects() &&\n          (tessedit_unrej_any_wd ||\n           acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),\n                                  word->best_choice->unichar_lengths().c_str()) !=\n               AC_UNACCEPTABLE)) {\n        unrej_good_chs(word);\n      }\n      page_res_it.forward();\n    } else {\n      // Skip to end of dodgy row.\n      current_row = page_res_it.row();\n      while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {\n        page_res_it.forward();\n      }\n    }\n    check_debug_pt(page_res_it.word(), 110);\n  }\n  page_res_it.restart_page();\n  page_res_it.page_res->char_count = 0;\n  page_res_it.page_res->rej_count = 0;\n  current_block = nullptr;\n  current_row = nullptr;\n  while (page_res_it.word() != nullptr) {\n    if (current_block != page_res_it.block()) {\n      current_block = page_res_it.block();\n      current_block->char_count = 0;\n      current_block->rej_count = 0;\n    }\n    if (current_row != page_res_it.row()) {\n      current_row = page_res_it.row();\n      current_row->char_count = 0;\n      current_row->rej_count = 0;\n      current_row->whole_word_rej_count = 0;\n    }\n    page_res_it.rej_stat_word();\n    page_res_it.forward();\n  }\n}\n\n/*************************************************************************\n * doc_and_block_rejection()\n *\n * If the page has too many rejects - reject all of it.\n * If any block has too many rejects - reject all words in the block\n 
*************************************************************************/\n\nvoid Tesseract::doc_and_block_rejection( // reject big chunks\n    PAGE_RES_IT &page_res_it, bool good_quality_doc) {\n  BLOCK_RES *current_block;\n\n  int16_t char_quality = 0;\n  int16_t accepted_char_quality;\n\n  if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >\n      tessedit_reject_doc_percent) {\n    reject_whole_page(page_res_it);\n    if (tessedit_debug_doc_rejection) {\n      tprintf(\"REJECT ALL #chars: %d #Rejects: %d; \\n\", page_res_it.page_res->char_count,\n              page_res_it.page_res->rej_count);\n    }\n  } else {\n    if (tessedit_debug_doc_rejection) {\n      tprintf(\"NO PAGE REJECTION #chars: %d  # Rejects: %d; \\n\", page_res_it.page_res->char_count,\n              page_res_it.page_res->rej_count);\n    }\n\n    /* Walk blocks testing for block rejection */\n\n    page_res_it.restart_page();\n    WERD_RES *word;\n    while ((word = page_res_it.word()) != nullptr) {\n      current_block = page_res_it.block();\n      int16_t block_no = current_block->block->pdblk.index();\n      if (current_block->char_count > 0 &&\n          (current_block->rej_count * 100.0 / current_block->char_count) >\n              tessedit_reject_block_percent) {\n        if (tessedit_debug_block_rejection) {\n          tprintf(\"REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\\n\", block_no,\n                  current_block->char_count, current_block->rej_count);\n        }\n        bool prev_word_rejected = false;\n        while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {\n          bool rej_word;\n          if (tessedit_preserve_blk_rej_perfect_wds) {\n            rej_word = word->reject_map.reject_count() > 0 ||\n                       word->reject_map.length() < tessedit_preserve_min_wd_len;\n            if (rej_word && tessedit_dont_blkrej_good_wds &&\n                word->reject_map.length() >= 
tessedit_preserve_min_wd_len &&\n                acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),\n                                       word->best_choice->unichar_lengths().c_str()) !=\n                    AC_UNACCEPTABLE) {\n              word_char_quality(word, &char_quality, &accepted_char_quality);\n              rej_word = char_quality != word->reject_map.length();\n            }\n          } else {\n            rej_word = true;\n          }\n          if (rej_word) {\n            /*\n  Reject spacing if both current and prev words are rejected.\n  NOTE - this is NOT restricted to FUZZY spaces. - When tried this\n  generated more space errors.\n*/\n            if (tessedit_use_reject_spaces && prev_word_rejected &&\n                page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {\n              word->reject_spaces = true;\n            }\n            word->reject_map.rej_word_block_rej();\n          }\n          prev_word_rejected = rej_word;\n          page_res_it.forward();\n        }\n      } else {\n        if (tessedit_debug_block_rejection) {\n          tprintf(\"NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \\n\", block_no,\n                  page_res_it.block()->char_count, page_res_it.block()->rej_count);\n        }\n\n        /* Walk rows in block testing for row rejection */\n        int16_t row_no = 0;\n        while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {\n          ROW_RES *current_row = page_res_it.row();\n          row_no++;\n          /* Reject whole row if:\n  fraction of chars on row which are rejected exceed a limit AND\n  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a\n  limit\n*/\n          if (current_row->char_count > 0 &&\n              (current_row->rej_count * 100.0 / current_row->char_count) >\n                  tessedit_reject_row_percent &&\n              (current_row->whole_word_rej_count * 100.0 / 
current_row->rej_count) <\n                  tessedit_whole_wd_rej_row_percent) {\n            if (tessedit_debug_block_rejection) {\n              tprintf(\"REJECTING ROW %d  #chars: %d;  #Rejects: %d\\n\", row_no,\n                      current_row->char_count, current_row->rej_count);\n            }\n            bool prev_word_rejected = false;\n            while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {\n              /* Preserve words on good docs unless they are mostly rejected*/\n              bool rej_word;\n              if (!tessedit_row_rej_good_docs && good_quality_doc) {\n                rej_word = word->reject_map.reject_count() /\n                               static_cast<float>(word->reject_map.length()) >\n                           tessedit_good_doc_still_rowrej_wd;\n              } else if (tessedit_preserve_row_rej_perfect_wds) {\n                /* Preserve perfect words anyway */\n                rej_word = word->reject_map.reject_count() > 0 ||\n                           word->reject_map.length() < tessedit_preserve_min_wd_len;\n                if (rej_word && tessedit_dont_rowrej_good_wds &&\n                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&\n                    acceptable_word_string(\n                        *word->uch_set, word->best_choice->unichar_string().c_str(),\n                        word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {\n                  word_char_quality(word, &char_quality, &accepted_char_quality);\n                  rej_word = char_quality != word->reject_map.length();\n                }\n              } else {\n                rej_word = true;\n              }\n              if (rej_word) {\n                /*\n  Reject spacing if both current and prev words are rejected.\n  NOTE - this is NOT restricted to FUZZY spaces. 
- When tried\n  this generated more space errors.\n*/\n                if (tessedit_use_reject_spaces && prev_word_rejected &&\n                    page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {\n                  word->reject_spaces = true;\n                }\n                word->reject_map.rej_word_row_rej();\n              }\n              prev_word_rejected = rej_word;\n              page_res_it.forward();\n            }\n          } else {\n            if (tessedit_debug_block_rejection) {\n              tprintf(\"NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \\n\", row_no,\n                      current_row->char_count, current_row->rej_count);\n            }\n            while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {\n              page_res_it.forward();\n            }\n          }\n        }\n      }\n    }\n  }\n}\n\n/*************************************************************************\n * reject_whole_page()\n * Don't believe any of it - set the reject map to 00..00 in all words\n *\n *************************************************************************/\n\nvoid reject_whole_page(PAGE_RES_IT &page_res_it) {\n  page_res_it.restart_page();\n  while (page_res_it.word() != nullptr) {\n    page_res_it.word()->reject_map.rej_word_doc_rej();\n    page_res_it.forward();\n  }\n  // whole page is rejected\n  page_res_it.page_res->rejected = true;\n}\n\nvoid Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {\n  WERD_RES *word;\n  GARBAGE_LEVEL garbage_level;\n  PAGE_RES_IT copy_it;\n  bool prev_potential_marked = false;\n  bool found_terrible_word = false;\n  bool ok_dict_word;\n\n  page_res_it.restart_page();\n  while (page_res_it.word() != nullptr) {\n    POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();\n    if (pb != nullptr && !pb->IsText()) {\n      page_res_it.forward();\n      continue;\n    }\n    word = page_res_it.word();\n\n    if (crunch_early_convert_bad_unlv_chs) {\n 
     convert_bad_unlv_chs(word);\n    }\n\n    if (crunch_early_merge_tess_fails) {\n      word->merge_tess_fails();\n    }\n\n    if (word->reject_map.accept_count() != 0) {\n      found_terrible_word = false;\n      // Forget earlier potential crunches\n      prev_potential_marked = false;\n    } else {\n      ok_dict_word = safe_dict_word(word);\n      garbage_level = garbage_word(word, ok_dict_word);\n\n      if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {\n        if (crunch_debug > 0) {\n          tprintf(\"T CRUNCHING: \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n        }\n        word->unlv_crunch_mode = CR_KEEP_SPACE;\n        if (prev_potential_marked) {\n          while (copy_it.word() != word) {\n            if (crunch_debug > 0) {\n              tprintf(\"P1 CRUNCHING: \\\"%s\\\"\\n\",\n                      copy_it.word()->best_choice->unichar_string().c_str());\n            }\n            copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;\n            copy_it.forward();\n          }\n          prev_potential_marked = false;\n        }\n        found_terrible_word = true;\n      } else if ((garbage_level != G_NEVER_CRUNCH) &&\n                 (potential_word_crunch(word, garbage_level, ok_dict_word))) {\n        if (found_terrible_word) {\n          if (crunch_debug > 0) {\n            tprintf(\"P2 CRUNCHING: \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n          }\n          word->unlv_crunch_mode = CR_KEEP_SPACE;\n        } else if (!prev_potential_marked) {\n          copy_it = page_res_it;\n          prev_potential_marked = true;\n          if (crunch_debug > 1) {\n            tprintf(\"P3 CRUNCHING: \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n          }\n        }\n      } else {\n        found_terrible_word = false;\n        // Forget earlier potential crunches\n        prev_potential_marked = false;\n        if (crunch_debug > 2) {\n          
tprintf(\"NO CRUNCH: \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n        }\n      }\n    }\n    page_res_it.forward();\n  }\n}\n\nbool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {\n  int crunch_mode = 0;\n\n  if (word->best_choice->unichar_string().empty() ||\n      (strspn(word->best_choice->unichar_string().c_str(), \" \") ==\n       word->best_choice->unichar_string().size())) {\n    crunch_mode = 1;\n  } else {\n    int adjusted_len = word->reject_map.length();\n    if (adjusted_len > crunch_rating_max) {\n      adjusted_len = crunch_rating_max;\n    }\n    float rating_per_ch = word->best_choice->rating() / adjusted_len;\n\n    if (rating_per_ch > crunch_terrible_rating) {\n      crunch_mode = 2;\n    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {\n      crunch_mode = 3;\n    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&\n               (garbage_level != G_OK)) {\n      crunch_mode = 4;\n    } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {\n      crunch_mode = 5;\n    }\n  }\n  if (crunch_mode > 0) {\n    if (crunch_debug > 2) {\n      tprintf(\"Terrible_word_crunch (%d) on \\\"%s\\\"\\n\", crunch_mode,\n              word->best_choice->unichar_string().c_str());\n    }\n    return true;\n  } else {\n    return false;\n  }\n}\n\nbool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,\n                                      bool ok_dict_word) {\n  float rating_per_ch;\n  int adjusted_len;\n  const char *str = word->best_choice->unichar_string().c_str();\n  const char *lengths = word->best_choice->unichar_lengths().c_str();\n  bool word_crunchable;\n  int poor_indicator_count = 0;\n\n  word_crunchable =\n      !crunch_leave_accept_strings || word->reject_map.length() < 3 ||\n      (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);\n\n  adjusted_len = 
word->reject_map.length();\n  if (adjusted_len > 10) {\n    adjusted_len = 10;\n  }\n  rating_per_ch = word->best_choice->rating() / adjusted_len;\n\n  if (rating_per_ch > crunch_pot_poor_rate) {\n    if (crunch_debug > 2) {\n      tprintf(\"Potential poor rating on \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n    }\n    poor_indicator_count++;\n  }\n\n  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {\n    if (crunch_debug > 2) {\n      tprintf(\"Potential poor cert on \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n    }\n    poor_indicator_count++;\n  }\n\n  if (garbage_level != G_OK) {\n    if (crunch_debug > 2) {\n      tprintf(\"Potential garbage on \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n    }\n    poor_indicator_count++;\n  }\n  return poor_indicator_count >= crunch_pot_indicators;\n}\n\nvoid Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {\n  PAGE_RES_IT copy_it;\n  bool deleting_from_bol = false;\n  bool marked_delete_point = false;\n  int16_t debug_delete_mode;\n  CRUNCH_MODE delete_mode;\n  int16_t x_debug_delete_mode;\n  CRUNCH_MODE x_delete_mode;\n\n  page_res_it.restart_page();\n  while (page_res_it.word() != nullptr) {\n    WERD_RES *word = page_res_it.word();\n\n    delete_mode = word_deletable(word, debug_delete_mode);\n    if (delete_mode != CR_NONE) {\n      if (word->word->flag(W_BOL) || deleting_from_bol) {\n        if (crunch_debug > 0) {\n          tprintf(\"BOL CRUNCH DELETING(%d): \\\"%s\\\"\\n\", debug_delete_mode,\n                  word->best_choice->unichar_string().c_str());\n        }\n        word->unlv_crunch_mode = delete_mode;\n        deleting_from_bol = true;\n      } else if (word->word->flag(W_EOL)) {\n        if (marked_delete_point) {\n          while (copy_it.word() != word) {\n            x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);\n            if (crunch_debug > 0) {\n              tprintf(\"EOL CRUNCH 
DELETING(%d): \\\"%s\\\"\\n\", x_debug_delete_mode,\n                      copy_it.word()->best_choice->unichar_string().c_str());\n            }\n            copy_it.word()->unlv_crunch_mode = x_delete_mode;\n            copy_it.forward();\n          }\n        }\n        if (crunch_debug > 0) {\n          tprintf(\"EOL CRUNCH DELETING(%d): \\\"%s\\\"\\n\", debug_delete_mode,\n                  word->best_choice->unichar_string().c_str());\n        }\n        word->unlv_crunch_mode = delete_mode;\n        deleting_from_bol = false;\n        marked_delete_point = false;\n      } else {\n        if (!marked_delete_point) {\n          copy_it = page_res_it;\n          marked_delete_point = true;\n        }\n      }\n    } else {\n      deleting_from_bol = false;\n      // Forget earlier potential crunches\n      marked_delete_point = false;\n    }\n    /*\n  The following step has been left till now as the tess fails are used to\n  determine if the word is deletable.\n*/\n    if (!crunch_early_merge_tess_fails) {\n      word->merge_tess_fails();\n    }\n    page_res_it.forward();\n  }\n}\n\nvoid Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {\n  int i;\n  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id(\"-\");\n  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(\" \");\n  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id(\"~\");\n  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id(\"^\");\n  for (i = 0; i < word_res->reject_map.length(); ++i) {\n    if (word_res->best_choice->unichar_id(i) == unichar_tilde) {\n      word_res->best_choice->set_unichar_id(unichar_dash, i);\n      if (word_res->reject_map[i].accepted()) {\n        word_res->reject_map[i].setrej_unlv_rej();\n      }\n    }\n    if (word_res->best_choice->unichar_id(i) == unichar_pow) {\n      word_res->best_choice->set_unichar_id(unichar_space, i);\n      if (word_res->reject_map[i].accepted()) {\n        word_res->reject_map[i].setrej_unlv_rej();\n      }\n  
  }\n  }\n}\n\nGARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {\n  enum STATES {\n    JUNK,\n    FIRST_UPPER,\n    FIRST_LOWER,\n    FIRST_NUM,\n    SUBSEQUENT_UPPER,\n    SUBSEQUENT_LOWER,\n    SUBSEQUENT_NUM\n  };\n  const char *str = word->best_choice->unichar_string().c_str();\n  const char *lengths = word->best_choice->unichar_lengths().c_str();\n  STATES state = JUNK;\n  int len = 0;\n  int isolated_digits = 0;\n  int isolated_alphas = 0;\n  int bad_char_count = 0;\n  int tess_rejs = 0;\n  int dodgy_chars = 0;\n  int ok_chars;\n  UNICHAR_ID last_char = -1;\n  int alpha_repetition_count = 0;\n  int longest_alpha_repetition_count = 0;\n  int longest_lower_run_len = 0;\n  int lower_string_count = 0;\n  int longest_upper_run_len = 0;\n  int upper_string_count = 0;\n  int total_alpha_count = 0;\n  int total_digit_count = 0;\n\n  for (; *str != '\\0'; str += *(lengths++)) {\n    len++;\n    if (word->uch_set->get_isupper(str, *lengths)) {\n      total_alpha_count++;\n      switch (state) {\n        case SUBSEQUENT_UPPER:\n        case FIRST_UPPER:\n          state = SUBSEQUENT_UPPER;\n          upper_string_count++;\n          if (longest_upper_run_len < upper_string_count) {\n            longest_upper_run_len = upper_string_count;\n          }\n          if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {\n            alpha_repetition_count++;\n            if (longest_alpha_repetition_count < alpha_repetition_count) {\n              longest_alpha_repetition_count = alpha_repetition_count;\n            }\n          } else {\n            last_char = word->uch_set->unichar_to_id(str, *lengths);\n            alpha_repetition_count = 1;\n          }\n          break;\n        case FIRST_NUM:\n          isolated_digits++;\n          // Fall through.\n        default:\n          state = FIRST_UPPER;\n          last_char = word->uch_set->unichar_to_id(str, *lengths);\n          alpha_repetition_count = 1;\n          upper_string_count 
= 1;\n          break;\n      }\n    } else if (word->uch_set->get_islower(str, *lengths)) {\n      total_alpha_count++;\n      switch (state) {\n        case SUBSEQUENT_LOWER:\n        case FIRST_LOWER:\n          state = SUBSEQUENT_LOWER;\n          lower_string_count++;\n          if (longest_lower_run_len < lower_string_count) {\n            longest_lower_run_len = lower_string_count;\n          }\n          if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {\n            alpha_repetition_count++;\n            if (longest_alpha_repetition_count < alpha_repetition_count) {\n              longest_alpha_repetition_count = alpha_repetition_count;\n            }\n          } else {\n            last_char = word->uch_set->unichar_to_id(str, *lengths);\n            alpha_repetition_count = 1;\n          }\n          break;\n        case FIRST_NUM:\n          isolated_digits++;\n          // Fall through.\n        default:\n          state = FIRST_LOWER;\n          last_char = word->uch_set->unichar_to_id(str, *lengths);\n          alpha_repetition_count = 1;\n          lower_string_count = 1;\n          break;\n      }\n    } else if (word->uch_set->get_isdigit(str, *lengths)) {\n      total_digit_count++;\n      switch (state) {\n        case FIRST_NUM:\n          state = SUBSEQUENT_NUM;\n        case SUBSEQUENT_NUM:\n          break;\n        case FIRST_UPPER:\n        case FIRST_LOWER:\n          isolated_alphas++;\n          // Fall through.\n        default:\n          state = FIRST_NUM;\n          break;\n      }\n    } else {\n      if (*lengths == 1 && *str == ' ') {\n        tess_rejs++;\n      } else {\n        bad_char_count++;\n      }\n      switch (state) {\n        case FIRST_NUM:\n          isolated_digits++;\n          break;\n        case FIRST_UPPER:\n        case FIRST_LOWER:\n          isolated_alphas++;\n        default:\n          break;\n      }\n      state = JUNK;\n    }\n  }\n\n  switch (state) {\n    case FIRST_NUM:\n      
isolated_digits++;\n      break;\n    case FIRST_UPPER:\n    case FIRST_LOWER:\n      isolated_alphas++;\n    default:\n      break;\n  }\n\n  if (crunch_include_numerals) {\n    total_alpha_count += total_digit_count - isolated_digits;\n  }\n\n  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&\n      longest_alpha_repetition_count < crunch_long_repetitions) {\n    if ((crunch_accept_ok &&\n         acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||\n        longest_lower_run_len > crunch_leave_lc_strings ||\n        longest_upper_run_len > crunch_leave_uc_strings) {\n      return G_NEVER_CRUNCH;\n    }\n  }\n  if (word->reject_map.length() > 1 && strpbrk(str, \" \") == nullptr &&\n      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||\n       word->best_choice->permuter() == FREQ_DAWG_PERM ||\n       word->best_choice->permuter() == USER_DAWG_PERM ||\n       word->best_choice->permuter() == NUMBER_PERM ||\n       acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {\n    return G_OK;\n  }\n\n  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;\n\n  if (crunch_debug > 3) {\n    tprintf(\"garbage_word: \\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n    tprintf(\"LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\\n\", len, bad_char_count,\n            isolated_digits, isolated_alphas, tess_rejs);\n  }\n  if (bad_char_count == 0 && tess_rejs == 0 &&\n      (len > isolated_digits + isolated_alphas || len <= 2)) {\n    return G_OK;\n  }\n\n  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {\n    return G_TERRIBLE;\n  }\n\n  if (len > 4) {\n    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;\n    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {\n      return G_DODGY;\n    } else {\n      return G_OK;\n    }\n  } else {\n  
  dodgy_chars = 2 * tess_rejs + bad_char_count;\n    if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {\n      return G_DODGY;\n    } else {\n      return G_OK;\n    }\n  }\n}\n\n/*************************************************************************\n * word_deletable()\n *     DELETE WERDS AT ENDS OF ROWS IF\n *        Word is crunched &&\n *        ( string length = 0                                          OR\n *          > 50% of chars are \"|\" (before merging)                    OR\n *          certainty < -10                                            OR\n *          rating /char > 60                                          OR\n *          TOP of word is more than 0.5 xht BELOW baseline            OR\n *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR\n *          length of word < 3xht                                      OR\n *          height of word < 0.7 xht                                   OR\n *          height of word > 3.0 xht                                   OR\n *          >75% of the outline BBs have longest dimension < 0.5xht\n *************************************************************************/\n\nCRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {\n  int word_len = word->reject_map.length();\n  float rating_per_ch;\n  TBOX box; // BB of word\n\n  if (word->unlv_crunch_mode == CR_NONE) {\n    delete_mode = 0;\n    return CR_NONE;\n  }\n\n  if (word_len == 0) {\n    delete_mode = 1;\n    return CR_DELETE;\n  }\n\n  if (word->rebuild_word != nullptr) {\n    // Cube leaves rebuild_word nullptr.\n    box = word->rebuild_word->bounding_box();\n    if (box.height() < crunch_del_min_ht * kBlnXHeight) {\n      delete_mode = 4;\n      return CR_DELETE;\n    }\n\n    if (noise_outlines(word->rebuild_word)) {\n      delete_mode = 5;\n      return CR_DELETE;\n    }\n  }\n\n  if ((failure_count(word) * 1.5) > word_len) {\n    delete_mode = 2;\n    return 
CR_LOOSE_SPACE;\n  }\n\n  if (word->best_choice->certainty() < crunch_del_cert) {\n    delete_mode = 7;\n    return CR_LOOSE_SPACE;\n  }\n\n  rating_per_ch = word->best_choice->rating() / word_len;\n\n  if (rating_per_ch > crunch_del_rating) {\n    delete_mode = 8;\n    return CR_LOOSE_SPACE;\n  }\n\n  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {\n    delete_mode = 9;\n    return CR_LOOSE_SPACE;\n  }\n\n  if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {\n    delete_mode = 10;\n    return CR_LOOSE_SPACE;\n  }\n\n  if (box.height() > crunch_del_max_ht * kBlnXHeight) {\n    delete_mode = 11;\n    return CR_LOOSE_SPACE;\n  }\n\n  if (box.width() < crunch_del_min_width * kBlnXHeight) {\n    delete_mode = 3;\n    return CR_LOOSE_SPACE;\n  }\n\n  delete_mode = 0;\n  return CR_NONE;\n}\n\nint16_t Tesseract::failure_count(WERD_RES *word) {\n  const char *str = word->best_choice->unichar_string().c_str();\n  int tess_rejs = 0;\n\n  for (; *str != '\\0'; str++) {\n    if (*str == ' ') {\n      tess_rejs++;\n    }\n  }\n  return tess_rejs;\n}\n\nbool Tesseract::noise_outlines(TWERD *word) {\n  TBOX box; // BB of outline\n  int16_t outline_count = 0;\n  int16_t small_outline_count = 0;\n  int16_t max_dimension;\n  float small_limit = kBlnXHeight * crunch_small_outlines_size;\n\n  for (unsigned b = 0; b < word->NumBlobs(); ++b) {\n    TBLOB *blob = word->blobs[b];\n    for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {\n      outline_count++;\n      box = ol->bounding_box();\n      if (box.height() > box.width()) {\n        max_dimension = box.height();\n      } else {\n        max_dimension = box.width();\n      }\n      if (max_dimension < small_limit) {\n        small_outline_count++;\n      }\n    }\n  }\n  return small_outline_count >= outline_count;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/docqual.h",
    "content": "/******************************************************************\n * File:        docqual.h  (Formerly docqual.h)\n * Description: Document Quality Metrics\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef DOCQUAL_H\n#define DOCQUAL_H\n\n#include <cstdint> // for int16_t\n\nnamespace tesseract {\n\nclass PAGE_RES_IT;\nclass ROW;\nclass WERD_RES;\n\nenum GARBAGE_LEVEL { G_NEVER_CRUNCH, G_OK, G_DODGY, G_TERRIBLE };\n\nint16_t word_blob_quality(WERD_RES *word);\nvoid reject_whole_page(PAGE_RES_IT &page_res_it);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccmain/equationdetect.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        equationdetect.cpp\n// Description: Helper classes to detect equations.\n// Author:      Zongyi (Joe) Liu (joeliu@google.com)\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"equationdetect.h\"\n\n#include \"bbgrid.h\"\n#include \"classify.h\"\n#include \"colpartition.h\"\n#include \"colpartitiongrid.h\"\n#include \"colpartitionset.h\"\n#include \"ratngs.h\"\n#include \"tesseractclass.h\"\n\n#include \"helpers.h\"\n\n#include <algorithm>\n#include <cfloat>\n#include <cmath>\n#include <limits>\n#include <memory>\n\nnamespace tesseract {\n\n// Config variables.\nstatic BOOL_VAR(equationdetect_save_bi_image, false, \"Save input bi image\");\nstatic BOOL_VAR(equationdetect_save_spt_image, false, \"Save special character image\");\nstatic BOOL_VAR(equationdetect_save_seed_image, false, \"Save the seed image\");\nstatic BOOL_VAR(equationdetect_save_merged_image, false, \"Save the merged image\");\n\n///////////////////////////////////////////////////////////////////////////\n// Utility ColPartition sort functions.\n///////////////////////////////////////////////////////////////////////////\nstatic int 
SortCPByTopReverse(const void *p1, const void *p2) {\n  const ColPartition *cp1 = *static_cast<ColPartition *const *>(p1);\n  const ColPartition *cp2 = *static_cast<ColPartition *const *>(p2);\n  ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);\n  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());\n  return box2.top() - box1.top();\n}\n\nstatic int SortCPByBottom(const void *p1, const void *p2) {\n  const ColPartition *cp1 = *static_cast<ColPartition *const *>(p1);\n  const ColPartition *cp2 = *static_cast<ColPartition *const *>(p2);\n  ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);\n  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());\n  return box1.bottom() - box2.bottom();\n}\n\nstatic int SortCPByHeight(const void *p1, const void *p2) {\n  const ColPartition *cp1 = *static_cast<ColPartition *const *>(p1);\n  const ColPartition *cp2 = *static_cast<ColPartition *const *>(p2);\n  ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);\n  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());\n  return box1.height() - box2.height();\n}\n\n// TODO(joeliu): we may want to parameterize these constants.\nconst float kMathDigitDensityTh1 = 0.25;\nconst float kMathDigitDensityTh2 = 0.1;\nconst float kMathItalicDensityTh = 0.5;\nconst float kUnclearDensityTh = 0.25;\nconst int kSeedBlobsCountTh = 10;\nconst int kLeftIndentAlignmentCountTh = 1;\n\n// Returns true if PolyBlockType is of text type or equation type.\ninline bool IsTextOrEquationType(PolyBlockType type) {\n  return PTIsTextType(type) || type == PT_EQUATION;\n}\n\ninline bool IsLeftIndented(const EquationDetect::IndentType type) {\n  return type == EquationDetect::LEFT_INDENT || type == EquationDetect::BOTH_INDENT;\n}\n\ninline bool IsRightIndented(const EquationDetect::IndentType type) {\n  return type == EquationDetect::RIGHT_INDENT || type == EquationDetect::BOTH_INDENT;\n}\n\nEquationDetect::EquationDetect(const char *equ_datapath, const char *equ_name) {\n  const char 
*default_name = \"equ\";\n  if (equ_name == nullptr) {\n    equ_name = default_name;\n  }\n  lang_tesseract_ = nullptr;\n  resolution_ = 0;\n  page_count_ = 0;\n\n  if (equ_tesseract_.init_tesseract(equ_datapath, equ_name, OEM_TESSERACT_ONLY)) {\n    tprintf(\n        \"Warning: equation region detection requested,\"\n        \" but %s failed to load from %s\\n\",\n        equ_name, equ_datapath);\n  }\n\n  cps_super_bbox_ = nullptr;\n}\n\nEquationDetect::~EquationDetect() {\n  delete (cps_super_bbox_);\n}\n\nvoid EquationDetect::SetLangTesseract(Tesseract *lang_tesseract) {\n  lang_tesseract_ = lang_tesseract;\n}\n\nvoid EquationDetect::SetResolution(const int resolution) {\n  resolution_ = resolution;\n}\n\nint EquationDetect::LabelSpecialText(TO_BLOCK *to_block) {\n  if (to_block == nullptr) {\n    tprintf(\"Warning: input to_block is nullptr!\\n\");\n    return -1;\n  }\n\n  std::vector<BLOBNBOX_LIST *> blob_lists;\n  blob_lists.push_back(&(to_block->blobs));\n  blob_lists.push_back(&(to_block->large_blobs));\n  for (auto &blob_list : blob_lists) {\n    BLOBNBOX_IT bbox_it(blob_list);\n    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {\n      bbox_it.data()->set_special_text_type(BSTT_NONE);\n    }\n  }\n\n  return 0;\n}\n\nvoid EquationDetect::IdentifySpecialText(BLOBNBOX *blobnbox, const int height_th) {\n  ASSERT_HOST(blobnbox != nullptr);\n  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {\n    // For small blob, we simply set to BSTT_NONE.\n    blobnbox->set_special_text_type(BSTT_NONE);\n    return;\n  }\n\n  BLOB_CHOICE_LIST ratings_equ, ratings_lang;\n  C_BLOB *blob = blobnbox->cblob();\n  // TODO(joeliu/rays) Fix this. We may have to normalize separately for\n  // each classifier here, as they may require different PolygonalCopy.\n  TBLOB *tblob = TBLOB::PolygonalCopy(false, blob);\n  const TBOX &box = tblob->bounding_box();\n\n  // Normalize the blob. 
Set the origin to the place we want to be the\n  // bottom-middle, and scaling is to make the height the x-height.\n  const float scaling = static_cast<float>(kBlnXHeight) / box.height();\n  const float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();\n  std::unique_ptr<TBLOB> normed_blob(new TBLOB(*tblob));\n  normed_blob->Normalize(nullptr, nullptr, nullptr, x_orig, y_orig, scaling, scaling, 0.0f,\n                         static_cast<float>(kBlnBaselineOffset), false, nullptr);\n  equ_tesseract_.AdaptiveClassifier(normed_blob.get(), &ratings_equ);\n  lang_tesseract_->AdaptiveClassifier(normed_blob.get(), &ratings_lang);\n  delete tblob;\n\n  // Get the best choice from ratings_lang and rating_equ. As the choice in the\n  // list has already been sorted by the certainty, we simply use the first\n  // choice.\n  BLOB_CHOICE *lang_choice = nullptr, *equ_choice = nullptr;\n  if (ratings_lang.length() > 0) {\n    BLOB_CHOICE_IT choice_it(&ratings_lang);\n    lang_choice = choice_it.data();\n  }\n  if (ratings_equ.length() > 0) {\n    BLOB_CHOICE_IT choice_it(&ratings_equ);\n    equ_choice = choice_it.data();\n  }\n\n  const float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;\n  const float equ_score = equ_choice ? 
equ_choice->certainty() : -FLT_MAX;\n\n  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;\n  // The scores here are negative, so the max/min == fabs(min/max).\n  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);\n  const float diff = std::fabs(lang_score - equ_score);\n  BlobSpecialTextType type = BSTT_NONE;\n\n  // Classification.\n  if (std::fmax(lang_score, equ_score) < kConfScoreTh) {\n    // If both score are very small, then mark it as unclear.\n    type = BSTT_UNCLEAR;\n  } else if (diff > kConfDiffTh && equ_score > lang_score) {\n    // If equ_score is significantly higher, then we classify this character as\n    // math symbol.\n    type = BSTT_MATH;\n  } else if (lang_choice) {\n    // For other cases: lang_score is similar or significantly higher.\n    type = EstimateTypeForUnichar(lang_tesseract_->unicharset, lang_choice->unichar_id());\n  }\n\n  if (type == BSTT_NONE &&\n      lang_tesseract_->get_fontinfo_table().at(lang_choice->fontinfo_id()).is_italic()) {\n    // For text symbol, we still check if it is italic.\n    blobnbox->set_special_text_type(BSTT_ITALIC);\n  } else {\n    blobnbox->set_special_text_type(type);\n  }\n}\n\nBlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &unicharset,\n                                                           const UNICHAR_ID id) const {\n  const std::string s = unicharset.id_to_unichar(id);\n  if (unicharset.get_isalpha(id)) {\n    return BSTT_NONE;\n  }\n\n  if (unicharset.get_ispunctuation(id)) {\n    // Exclude some special texts that are likely to be confused as math symbol.\n    static std::vector<UNICHAR_ID> ids_to_exclude;\n    if (ids_to_exclude.empty()) {\n      static const char *kCharsToEx[] = {\"'\",  \"`\",  \"\\\"\", \"\\\\\", \",\",  \".\",\n                                         \"〈\", \"〉\", \"《\", \"》\", \"」\", \"「\"};\n      for (auto &i : kCharsToEx) {\n        ids_to_exclude.push_back(unicharset.unichar_to_id(i));\n      }\n      
std::sort(ids_to_exclude.begin(), ids_to_exclude.end());\n    }\n    auto found = std::binary_search(ids_to_exclude.begin(), ids_to_exclude.end(), id);\n    return found ? BSTT_NONE : BSTT_MATH;\n  }\n\n  // Check if it is digit. In addition to the isdigit attribute, we also check\n  // if this character belongs to those likely to be confused with a digit.\n  static const char kDigitsChars[] = \"|\";\n  if (unicharset.get_isdigit(id) || (s.length() == 1 && strchr(kDigitsChars, s[0]) != nullptr)) {\n    return BSTT_DIGIT;\n  } else {\n    return BSTT_MATH;\n  }\n}\n\nvoid EquationDetect::IdentifySpecialText() {\n  // Set configuration for Tesseract::AdaptiveClassifier.\n  equ_tesseract_.tess_cn_matching.set_value(true); // turn it on\n  equ_tesseract_.tess_bn_matching.set_value(false);\n\n  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.\n  const int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;\n  const int classify_integer_matcher = lang_tesseract_->classify_integer_matcher_multiplier;\n  lang_tesseract_->classify_class_pruner_multiplier.set_value(0);\n  lang_tesseract_->classify_integer_matcher_multiplier.set_value(0);\n\n  ColPartitionGridSearch gsearch(part_grid_);\n  ColPartition *part = nullptr;\n  gsearch.StartFullSearch();\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (!IsTextOrEquationType(part->type())) {\n      continue;\n    }\n    IdentifyBlobsToSkip(part);\n    BLOBNBOX_C_IT bbox_it(part->boxes());\n    // Compute the height threshold.\n    std::vector<int> blob_heights;\n    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {\n      if (bbox_it.data()->special_text_type() != BSTT_SKIP) {\n        blob_heights.push_back(bbox_it.data()->bounding_box().height());\n      }\n    }\n    std::sort(blob_heights.begin(), blob_heights.end());\n    const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;\n    for (bbox_it.mark_cycle_pt(); 
!bbox_it.cycled_list(); bbox_it.forward()) {\n      if (bbox_it.data()->special_text_type() != BSTT_SKIP) {\n        IdentifySpecialText(bbox_it.data(), height_th);\n      }\n    }\n  }\n\n  // Set the multiplier values back.\n  lang_tesseract_->classify_class_pruner_multiplier.set_value(classify_class_pruner);\n  lang_tesseract_->classify_integer_matcher_multiplier.set_value(classify_integer_matcher);\n\n  if (equationdetect_save_spt_image) { // For debug.\n    std::string outfile;\n    GetOutputTiffName(\"_spt\", outfile);\n    PaintSpecialTexts(outfile);\n  }\n}\n\nvoid EquationDetect::IdentifyBlobsToSkip(ColPartition *part) {\n  ASSERT_HOST(part);\n  BLOBNBOX_C_IT blob_it(part->boxes());\n\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    // At this moment, no blob should have been joined.\n    ASSERT_HOST(!blob_it.data()->joined_to_prev());\n  }\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {\n      continue;\n    }\n    TBOX blob_box = blob->bounding_box();\n\n    // Search if any blob can be merged into blob. 
If found, then we mark all\n    // these blobs as BSTT_SKIP.\n    BLOBNBOX_C_IT blob_it2 = blob_it;\n    bool found = false;\n    while (!blob_it2.at_last()) {\n      BLOBNBOX *nextblob = blob_it2.forward();\n      const TBOX &nextblob_box = nextblob->bounding_box();\n      if (nextblob_box.left() >= blob_box.right()) {\n        break;\n      }\n      const float kWidthR = 0.4, kHeightR = 0.3;\n      const bool xoverlap = blob_box.major_x_overlap(nextblob_box),\n                 yoverlap = blob_box.y_overlap(nextblob_box);\n      const float widthR = static_cast<float>(std::min(nextblob_box.width(), blob_box.width())) /\n                           std::max(nextblob_box.width(), blob_box.width());\n      const float heightR = static_cast<float>(std::min(nextblob_box.height(), blob_box.height())) /\n                            std::max(nextblob_box.height(), blob_box.height());\n\n      if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {\n        // Found one, set nextblob type and recompute blob_box.\n        found = true;\n        nextblob->set_special_text_type(BSTT_SKIP);\n        blob_box += nextblob_box;\n      }\n    }\n    if (found) {\n      blob->set_special_text_type(BSTT_SKIP);\n    }\n  }\n}\n\nint EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) {\n  if (!lang_tesseract_) {\n    tprintf(\"Warning: lang_tesseract_ is nullptr!\\n\");\n    return -1;\n  }\n  if (!part_grid || !best_columns) {\n    tprintf(\"part_grid/best_columns is nullptr!!\\n\");\n    return -1;\n  }\n  cp_seeds_.clear();\n  part_grid_ = part_grid;\n  best_columns_ = best_columns;\n  resolution_ = lang_tesseract_->source_resolution();\n  std::string outfile;\n  page_count_++;\n\n  if (equationdetect_save_bi_image) {\n    GetOutputTiffName(\"_bi\", outfile);\n    pixWrite(outfile.c_str(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);\n  }\n\n  // Pass 0: Compute special text type for blobs.\n  IdentifySpecialText();\n\n  // 
Pass 1: Merge parts by overlap.\n  MergePartsByLocation();\n\n  // Pass 2: compute the math blob density and find the seed partition.\n  IdentifySeedParts();\n  // We still need separate seed into block seed and inline seed partition.\n  IdentifyInlineParts();\n\n  if (equationdetect_save_seed_image) {\n    GetOutputTiffName(\"_seed\", outfile);\n    PaintColParts(outfile);\n  }\n\n  // Pass 3: expand block equation seeds.\n  while (!cp_seeds_.empty()) {\n    std::vector<ColPartition *> seeds_expanded;\n    for (auto &cp_seed : cp_seeds_) {\n      if (ExpandSeed(cp_seed)) {\n        // If this seed is expanded, then we add it into seeds_expanded. Note\n        // this seed has been removed from part_grid_ if it is expanded.\n        seeds_expanded.push_back(cp_seed);\n      }\n    }\n    // Add seeds_expanded back into part_grid_ and reset cp_seeds_.\n    for (auto &i : seeds_expanded) {\n      InsertPartAfterAbsorb(i);\n    }\n    cp_seeds_ = std::move(seeds_expanded);\n  }\n\n  // Pass 4: find math block satellite text partitions and merge them.\n  ProcessMathBlockSatelliteParts();\n\n  if (equationdetect_save_merged_image) { // For debug.\n    GetOutputTiffName(\"_merged\", outfile);\n    PaintColParts(outfile);\n  }\n\n  return 0;\n}\n\nvoid EquationDetect::MergePartsByLocation() {\n  while (true) {\n    ColPartition *part = nullptr;\n    // partitions that have been updated.\n    std::vector<ColPartition *> parts_updated;\n    ColPartitionGridSearch gsearch(part_grid_);\n    gsearch.StartFullSearch();\n    while ((part = gsearch.NextFullSearch()) != nullptr) {\n      if (!IsTextOrEquationType(part->type())) {\n        continue;\n      }\n      std::vector<ColPartition *> parts_to_merge;\n      SearchByOverlap(part, &parts_to_merge);\n      if (parts_to_merge.empty()) {\n        continue;\n      }\n\n      // Merge parts_to_merge with part, and remove them from part_grid_.\n      part_grid_->RemoveBBox(part);\n      for (auto &i : parts_to_merge) {\n        
ASSERT_HOST(i != nullptr && i != part);\n        part->Absorb(i, nullptr);\n      }\n      gsearch.RepositionIterator();\n\n      parts_updated.push_back(part);\n    }\n\n    if (parts_updated.empty()) { // Exit the loop\n      break;\n    }\n\n    // Re-insert parts_updated into part_grid_.\n    for (auto &i : parts_updated) {\n      InsertPartAfterAbsorb(i);\n    }\n  }\n}\n\nvoid EquationDetect::SearchByOverlap(ColPartition *seed,\n                                     std::vector<ColPartition *> *parts_overlap) {\n  ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);\n  if (!IsTextOrEquationType(seed->type())) {\n    return;\n  }\n  ColPartitionGridSearch search(part_grid_);\n  const TBOX &seed_box(seed->bounding_box());\n  const int kRadNeighborCells = 30;\n  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,\n                        (seed_box.top() + seed_box.bottom()) / 2, kRadNeighborCells);\n  search.SetUniqueMode(true);\n\n  // Search iteratively.\n  ColPartition *part;\n  std::vector<ColPartition *> parts;\n  const float kLargeOverlapTh = 0.95;\n  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;\n  while ((part = search.NextRadSearch()) != nullptr) {\n    if (part == seed || !IsTextOrEquationType(part->type())) {\n      continue;\n    }\n    const TBOX &part_box(part->bounding_box());\n    bool merge = false;\n\n    const float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),\n                y_overlap_fraction = part_box.y_overlap_fraction(seed_box);\n\n    // If part is large overlapped with seed, then set merge to true.\n    if (x_overlap_fraction >= kLargeOverlapTh && y_overlap_fraction >= kLargeOverlapTh) {\n      merge = true;\n    } else if (seed->type() == PT_EQUATION && IsTextOrEquationType(part->type())) {\n      if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||\n          (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {\n        merge = true;\n      }\n    }\n\n    if 
(merge) { // Remove the part from search and put it into parts.\n      search.RemoveBBox();\n      parts_overlap->push_back(part);\n    }\n  }\n}\n\nvoid EquationDetect::InsertPartAfterAbsorb(ColPartition *part) {\n  ASSERT_HOST(part);\n\n  // Before insert part back into part_grid_, we will need re-compute some\n  // of its attributes such as first_column_, last_column_. However, we still\n  // want to preserve its type.\n  BlobTextFlowType flow_type = part->flow();\n  PolyBlockType part_type = part->type();\n  BlobRegionType blob_type = part->blob_type();\n\n  // Call SetPartitionType to re-compute the attributes of part.\n  const TBOX &part_box(part->bounding_box());\n  int grid_x, grid_y;\n  part_grid_->GridCoords(part_box.left(), part_box.bottom(), &grid_x, &grid_y);\n  part->SetPartitionType(resolution_, best_columns_[grid_y]);\n\n  // Reset the types back.\n  part->set_type(part_type);\n  part->set_blob_type(blob_type);\n  part->set_flow(flow_type);\n  part->SetBlobTypes();\n\n  // Insert into part_grid_.\n  part_grid_->InsertBBox(true, true, part);\n}\n\nvoid EquationDetect::IdentifySeedParts() {\n  ColPartitionGridSearch gsearch(part_grid_);\n  ColPartition *part = nullptr;\n  gsearch.StartFullSearch();\n\n  std::vector<ColPartition *> seeds1, seeds2;\n  // The left coordinates of indented text partitions.\n  std::vector<int> indented_texts_left;\n  // The foreground density of text partitions.\n  std::vector<float> texts_foreground_density;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (!IsTextOrEquationType(part->type())) {\n      continue;\n    }\n    part->ComputeSpecialBlobsDensity();\n    const bool blobs_check = CheckSeedBlobsCount(part);\n    const int kTextBlobsTh = 20;\n\n    if (CheckSeedDensity(kMathDigitDensityTh1, kMathDigitDensityTh2, part) && blobs_check) {\n      // Passed high density threshold test, save into seeds1.\n      seeds1.push_back(part);\n    } else {\n      IndentType indent = IsIndented(part);\n      if 
(IsLeftIndented(indent) && blobs_check &&\n          CheckSeedDensity(kMathDigitDensityTh2, kMathDigitDensityTh2, part)) {\n        // Passed low density threshold test and is indented, save into seeds2.\n        seeds2.push_back(part);\n      } else if (!IsRightIndented(indent) && part->boxes_count() > kTextBlobsTh) {\n        // This is likely to be a text part, save the features.\n        const TBOX &box = part->bounding_box();\n        if (IsLeftIndented(indent)) {\n          indented_texts_left.push_back(box.left());\n        }\n        texts_foreground_density.push_back(ComputeForegroundDensity(box));\n      }\n    }\n  }\n\n  // Sort the features collected from text regions.\n  std::sort(indented_texts_left.begin(), indented_texts_left.end());\n  std::sort(texts_foreground_density.begin(), texts_foreground_density.end());\n  float foreground_density_th = 0.15; // Default value.\n  if (!texts_foreground_density.empty()) {\n    // Use the median of the texts_foreground_density.\n    foreground_density_th = 0.8 * texts_foreground_density[texts_foreground_density.size() / 2];\n  }\n\n  for (auto &i : seeds1) {\n    const TBOX &box = i->bounding_box();\n    if (CheckSeedFgDensity(foreground_density_th, i) &&\n        !(IsLeftIndented(IsIndented(i)) &&\n          CountAlignment(indented_texts_left, box.left()) >= kLeftIndentAlignmentCountTh)) {\n      // Mark as PT_EQUATION type.\n      i->set_type(PT_EQUATION);\n      cp_seeds_.push_back(i);\n    } else { // Mark as PT_INLINE_EQUATION type.\n      i->set_type(PT_INLINE_EQUATION);\n    }\n  }\n\n  for (auto &i : seeds2) {\n    if (CheckForSeed2(indented_texts_left, foreground_density_th, i)) {\n      i->set_type(PT_EQUATION);\n      cp_seeds_.push_back(i);\n    }\n  }\n}\n\nfloat EquationDetect::ComputeForegroundDensity(const TBOX &tbox) {\n  Image pix_bi = lang_tesseract_->pix_binary();\n  const int pix_height = pixGetHeight(pix_bi);\n  Box *box = boxCreate(tbox.left(), pix_height - tbox.top(), tbox.width(), 
tbox.height());\n  Image pix_sub = pixClipRectangle(pix_bi, box, nullptr);\n  l_float32 fract;\n  pixForegroundFraction(pix_sub, &fract);\n  pix_sub.destroy();\n  boxDestroy(&box);\n\n  return fract;\n}\n\nbool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *part) {\n  ASSERT_HOST(part);\n\n  // Split part horizontally, and check for each sub part.\n  std::vector<TBOX> sub_boxes;\n  SplitCPHorLite(part, &sub_boxes);\n  float parts_passed = 0.0;\n  for (auto &sub_boxe : sub_boxes) {\n    const float density = ComputeForegroundDensity(sub_boxe);\n    if (density < density_th) {\n      parts_passed++;\n    }\n  }\n\n  // If most sub parts passed, then we return true.\n  const float kSeedPartRatioTh = 0.3;\n  bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);\n\n  return retval;\n}\n\nvoid EquationDetect::SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {\n  ASSERT_HOST(part && parts_splitted);\n  if (part->median_width() == 0 || part->boxes_count() == 0) {\n    return;\n  }\n\n  // Make a copy of part, and reset parts_splitted.\n  ColPartition *right_part = part->CopyButDontOwnBlobs();\n  for (auto data : *parts_splitted) {\n    delete data;\n  }\n  parts_splitted->clear();\n\n  const double kThreshold = part->median_width() * 3.0;\n  bool found_split = true;\n  while (found_split) {\n    found_split = false;\n    BLOBNBOX_C_IT box_it(right_part->boxes());\n    // Blobs are sorted left side first. If blobs overlap,\n    // the previous blob may have a \"more right\" right side.\n    // Account for this by always keeping the largest \"right\"\n    // so far.\n    int previous_right = INT32_MIN;\n\n    // Look for the next split in the partition.\n    for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {\n      const TBOX &box = box_it.data()->bounding_box();\n      if (previous_right != INT32_MIN && box.left() - previous_right > kThreshold) {\n        // We have a split position. 
Split the partition in two pieces.\n        // Insert the left piece in the grid and keep processing the right.\n        const int mid_x = (box.left() + previous_right) / 2;\n        ColPartition *left_part = right_part;\n        right_part = left_part->SplitAt(mid_x);\n\n        parts_splitted->push_back(left_part);\n        left_part->ComputeSpecialBlobsDensity();\n        found_split = true;\n        break;\n      }\n\n      // The right side of the previous blobs.\n      previous_right = std::max(previous_right, static_cast<int>(box.right()));\n    }\n  }\n\n  // Add the last piece.\n  right_part->ComputeSpecialBlobsDensity();\n  parts_splitted->push_back(right_part);\n}\n\nvoid EquationDetect::SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {\n  ASSERT_HOST(part && splitted_boxes);\n  splitted_boxes->clear();\n  if (part->median_width() == 0) {\n    return;\n  }\n\n  const double kThreshold = part->median_width() * 3.0;\n\n  // Blobs are sorted left side first. 
If blobs overlap,\n  // the previous blob may have a \"more right\" right side.\n  // Account for this by always keeping the largest \"right\"\n  // so far.\n  TBOX union_box;\n  int previous_right = INT32_MIN;\n  BLOBNBOX_C_IT box_it(part->boxes());\n  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {\n    const TBOX &box = box_it.data()->bounding_box();\n    if (previous_right != INT32_MIN && box.left() - previous_right > kThreshold) {\n      // We have a split position.\n      splitted_boxes->push_back(union_box);\n      previous_right = INT32_MIN;\n    }\n    if (previous_right == INT32_MIN) {\n      union_box = box;\n    } else {\n      union_box += box;\n    }\n    // The right side of the previous blobs.\n    previous_right = std::max(previous_right, static_cast<int>(box.right()));\n  }\n\n  // Add the last piece.\n  if (previous_right != INT32_MIN) {\n    splitted_boxes->push_back(union_box);\n  }\n}\n\nbool EquationDetect::CheckForSeed2(const std::vector<int> &indented_texts_left,\n                                   const float foreground_density_th, ColPartition *part) {\n  ASSERT_HOST(part);\n  const TBOX &box = part->bounding_box();\n\n  // Check if it is aligned with any indented_texts_left.\n  if (!indented_texts_left.empty() &&\n      CountAlignment(indented_texts_left, box.left()) >= kLeftIndentAlignmentCountTh) {\n    return false;\n  }\n\n  // Check the foreground density.\n  if (ComputeForegroundDensity(box) > foreground_density_th) {\n    return false;\n  }\n\n  return true;\n}\n\nint EquationDetect::CountAlignment(const std::vector<int> &sorted_vec, const int val) const {\n  if (sorted_vec.empty()) {\n    return 0;\n  }\n  const int kDistTh = static_cast<int>(std::round(0.03f * resolution_));\n  auto pos = std::upper_bound(sorted_vec.begin(), sorted_vec.end(), val);\n  if (pos > sorted_vec.begin()) {\n    --pos;\n  }\n  int count = 0;\n\n  // Search left side.\n  auto index = pos - sorted_vec.begin();\n  while (index >= 0 
&& abs(val - sorted_vec[index--]) < kDistTh) {\n    count++;\n  }\n\n  // Search right side.\n  index = pos + 1 - sorted_vec.begin();\n  while (static_cast<size_t>(index) < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {\n    count++;\n  }\n\n  return count;\n}\n\nvoid EquationDetect::IdentifyInlineParts() {\n  ComputeCPsSuperBBox();\n  IdentifyInlinePartsHorizontal();\n  const int textparts_linespacing = EstimateTextPartLineSpacing();\n  IdentifyInlinePartsVertical(true, textparts_linespacing);\n  IdentifyInlinePartsVertical(false, textparts_linespacing);\n}\n\nvoid EquationDetect::ComputeCPsSuperBBox() {\n  ColPartitionGridSearch gsearch(part_grid_);\n  ColPartition *part = nullptr;\n  gsearch.StartFullSearch();\n  delete cps_super_bbox_;\n  cps_super_bbox_ = new TBOX();\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    (*cps_super_bbox_) += part->bounding_box();\n  }\n}\n\nvoid EquationDetect::IdentifyInlinePartsHorizontal() {\n  ASSERT_HOST(cps_super_bbox_);\n  std::vector<ColPartition *> new_seeds;\n  const int kMarginDiffTh = IntCastRounded(0.5 * lang_tesseract_->source_resolution());\n  const int kGapTh = static_cast<int>(std::round(1.0f * lang_tesseract_->source_resolution()));\n  ColPartitionGridSearch search(part_grid_);\n  search.SetUniqueMode(true);\n  // The center x coordinate of the cp_super_bbox_.\n  const int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;\n  for (auto part : cp_seeds_) {\n    const TBOX &part_box(part->bounding_box());\n    const int left_margin = part_box.left() - cps_super_bbox_->left(),\n              right_margin = cps_super_bbox_->right() - part_box.right();\n    bool right_to_left;\n    if (left_margin + kMarginDiffTh < right_margin && left_margin < kMarginDiffTh) {\n      // part is left aligned, so we search if it has any right neighbor.\n      search.StartSideSearch(part_box.right(), part_box.top(), part_box.bottom());\n      right_to_left = false;\n    } else if (left_margin > 
cps_cx) {\n      // part locates on the right half on image, so search if it has any left\n      // neighbor.\n      search.StartSideSearch(part_box.left(), part_box.top(), part_box.bottom());\n      right_to_left = true;\n    } else { // part is not an inline equation.\n      new_seeds.push_back(part);\n      continue;\n    }\n    ColPartition *neighbor = nullptr;\n    bool side_neighbor_found = false;\n    while ((neighbor = search.NextSideSearch(right_to_left)) != nullptr) {\n      const TBOX &neighbor_box(neighbor->bounding_box());\n      if (!IsTextOrEquationType(neighbor->type()) || part_box.x_gap(neighbor_box) > kGapTh ||\n          !part_box.major_y_overlap(neighbor_box) || part_box.major_x_overlap(neighbor_box)) {\n        continue;\n      }\n      // We have found one. Set the side_neighbor_found flag.\n      side_neighbor_found = true;\n      break;\n    }\n    if (!side_neighbor_found) { // Mark part as PT_INLINE_EQUATION.\n      part->set_type(PT_INLINE_EQUATION);\n    } else {\n      // Check the geometric feature of neighbor.\n      const TBOX &neighbor_box(neighbor->bounding_box());\n      if (neighbor_box.width() > part_box.width() &&\n          neighbor->type() != PT_EQUATION) { // Mark as PT_INLINE_EQUATION.\n        part->set_type(PT_INLINE_EQUATION);\n      } else { // part is not an inline equation type.\n        new_seeds.push_back(part);\n      }\n    }\n  }\n\n  // Reset the cp_seeds_ using the new_seeds.\n  cp_seeds_ = std::move(new_seeds);\n}\n\nint EquationDetect::EstimateTextPartLineSpacing() {\n  ColPartitionGridSearch gsearch(part_grid_);\n\n  // Get the y gap between text partitions;\n  ColPartition *current = nullptr, *prev = nullptr;\n  gsearch.StartFullSearch();\n  std::vector<int> ygaps;\n  while ((current = gsearch.NextFullSearch()) != nullptr) {\n    if (!PTIsTextType(current->type())) {\n      continue;\n    }\n    if (prev != nullptr) {\n      const TBOX &current_box = current->bounding_box();\n      const TBOX &prev_box = 
prev->bounding_box();\n      // prev and current should be x major overlap and non y overlap.\n      if (current_box.major_x_overlap(prev_box) && !current_box.y_overlap(prev_box)) {\n        int gap = current_box.y_gap(prev_box);\n        if (gap < std::min(current_box.height(), prev_box.height())) {\n          // The gap should be smaller than the height of the bounding boxes.\n          ygaps.push_back(gap);\n        }\n      }\n    }\n    prev = current;\n  }\n\n  if (ygaps.size() < 8) { // We do not have enough data.\n    return -1;\n  }\n\n  // Compute the line spacing from ygaps: use the mean of the first half.\n  std::sort(ygaps.begin(), ygaps.end());\n  int spacing = 0;\n  unsigned count;\n  for (count = 0; count < ygaps.size() / 2; count++) {\n    spacing += ygaps[count];\n  }\n  return spacing / count;\n}\n\nvoid EquationDetect::IdentifyInlinePartsVertical(const bool top_to_bottom,\n                                                 const int textparts_linespacing) {\n  if (cp_seeds_.empty()) {\n    return;\n  }\n\n  // Sort cp_seeds_.\n  if (top_to_bottom) { // From top to bottom.\n    std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByTopReverse);\n  } else { // From bottom to top.\n    std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByBottom);\n  }\n\n  std::vector<ColPartition *> new_seeds;\n  for (auto part : cp_seeds_) {\n    // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look\n    // for its top neighbors, so that if two/more inline regions are connected\n    // to each other, then we will identify the top one, and then use it to\n    // identify the bottom one.\n    if (IsInline(!top_to_bottom, textparts_linespacing, part)) {\n      part->set_type(PT_INLINE_EQUATION);\n    } else {\n      new_seeds.push_back(part);\n    }\n  }\n  cp_seeds_ = std::move(new_seeds);\n}\n\nbool EquationDetect::IsInline(const bool search_bottom, const int textparts_linespacing,\n                              ColPartition *part) {\n  
ASSERT_HOST(part != nullptr);\n  // Look for its nearest vertical neighbor that hardly overlaps in y but\n  // largely overlaps in x.\n  ColPartitionGridSearch search(part_grid_);\n  ColPartition *neighbor = nullptr;\n  const TBOX &part_box(part->bounding_box());\n  const float kYGapRatioTh = 1.0;\n\n  if (search_bottom) {\n    search.StartVerticalSearch(part_box.left(), part_box.right(), part_box.bottom());\n  } else {\n    search.StartVerticalSearch(part_box.left(), part_box.right(), part_box.top());\n  }\n  search.SetUniqueMode(true);\n  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {\n    const TBOX &neighbor_box(neighbor->bounding_box());\n    if (part_box.y_gap(neighbor_box) >\n        kYGapRatioTh * std::min(part_box.height(), neighbor_box.height())) {\n      // Finished searching.\n      break;\n    }\n    if (!PTIsTextType(neighbor->type())) {\n      continue;\n    }\n\n    // Check if neighbor and part is inline similar.\n    const float kHeightRatioTh = 0.5;\n    const int kYGapTh = textparts_linespacing > 0\n                            ? 
textparts_linespacing + static_cast<int>(std::round(0.02f * resolution_))\n                            : static_cast<int>(std::round(0.05f * resolution_)); // Default value.\n    if (part_box.x_overlap(neighbor_box) &&                                 // Location feature.\n        part_box.y_gap(neighbor_box) <= kYGapTh &&                          // Line spacing.\n        // Geo feature.\n        static_cast<float>(std::min(part_box.height(), neighbor_box.height())) /\n                std::max(part_box.height(), neighbor_box.height()) >\n            kHeightRatioTh) {\n      return true;\n    }\n  }\n\n  return false;\n}\n\nbool EquationDetect::CheckSeedBlobsCount(ColPartition *part) {\n  if (!part) {\n    return false;\n  }\n  const int kSeedMathBlobsCount = 2;\n  const int kSeedMathDigitBlobsCount = 5;\n\n  const int blobs = part->boxes_count(), math_blobs = part->SpecialBlobsCount(BSTT_MATH),\n            digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);\n  if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||\n      math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {\n    return false;\n  }\n\n  return true;\n}\n\nbool EquationDetect::CheckSeedDensity(const float math_density_high, const float math_density_low,\n                                      const ColPartition *part) const {\n  ASSERT_HOST(part);\n  float math_digit_density =\n      part->SpecialBlobsDensity(BSTT_MATH) + part->SpecialBlobsDensity(BSTT_DIGIT);\n  float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);\n  if (math_digit_density > math_density_high) {\n    return true;\n  }\n  if (math_digit_density + italic_density > kMathItalicDensityTh &&\n      math_digit_density > math_density_low) {\n    return true;\n  }\n\n  return false;\n}\n\nEquationDetect::IndentType EquationDetect::IsIndented(ColPartition *part) {\n  ASSERT_HOST(part);\n\n  ColPartitionGridSearch search(part_grid_);\n  ColPartition *neighbor = nullptr;\n  const TBOX &part_box(part->bounding_box());\n 
 const int kXGapTh = static_cast<int>(std::round(0.5f * resolution_));\n  const int kRadiusTh = static_cast<int>(std::round(3.0f * resolution_));\n  const int kYGapTh = static_cast<int>(std::round(0.5f * resolution_));\n\n  // Here we use a simple approximation algorithm: from the center of part, We\n  // perform the radius search, and check if we can find a neighboring partition\n  // that locates on the top/bottom left of part.\n  search.StartRadSearch((part_box.left() + part_box.right()) / 2,\n                        (part_box.top() + part_box.bottom()) / 2, kRadiusTh);\n  search.SetUniqueMode(true);\n  bool left_indented = false, right_indented = false;\n  while ((neighbor = search.NextRadSearch()) != nullptr && (!left_indented || !right_indented)) {\n    if (neighbor == part) {\n      continue;\n    }\n    const TBOX &neighbor_box(neighbor->bounding_box());\n\n    if (part_box.major_y_overlap(neighbor_box) && part_box.x_gap(neighbor_box) < kXGapTh) {\n      // When this happens, it is likely part is a fragment of an\n      // over-segmented colpartition. 
So we return false.\n      return NO_INDENT;\n    }\n\n    if (!IsTextOrEquationType(neighbor->type())) {\n      continue;\n    }\n\n    // The neighbor should be above/below part, and overlap in x direction.\n    if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {\n      continue;\n    }\n\n    if (part_box.y_gap(neighbor_box) < kYGapTh) {\n      const int left_gap = part_box.left() - neighbor_box.left();\n      const int right_gap = neighbor_box.right() - part_box.right();\n      if (left_gap > kXGapTh) {\n        left_indented = true;\n      }\n      if (right_gap > kXGapTh) {\n        right_indented = true;\n      }\n    }\n  }\n\n  if (left_indented && right_indented) {\n    return BOTH_INDENT;\n  }\n  if (left_indented) {\n    return LEFT_INDENT;\n  }\n  if (right_indented) {\n    return RIGHT_INDENT;\n  }\n  return NO_INDENT;\n}\n\nbool EquationDetect::ExpandSeed(ColPartition *seed) {\n  if (seed == nullptr ||        // This seed has been absorbed by other seeds.\n      seed->IsVerticalType()) { // We skip vertical type right now.\n    return false;\n  }\n\n  // Expand in four directions.\n  std::vector<ColPartition *> parts_to_merge;\n  ExpandSeedHorizontal(true, seed, &parts_to_merge);\n  ExpandSeedHorizontal(false, seed, &parts_to_merge);\n  ExpandSeedVertical(true, seed, &parts_to_merge);\n  ExpandSeedVertical(false, seed, &parts_to_merge);\n  SearchByOverlap(seed, &parts_to_merge);\n\n  if (parts_to_merge.empty()) { // We don't find any partition to merge.\n    return false;\n  }\n\n  // Merge all partitions in parts_to_merge with seed. We first remove seed\n  // from part_grid_ as its bounding box is going to expand. 
Then we add it\n  // back after it absorbs all parts_to_merge partitions.\n  part_grid_->RemoveBBox(seed);\n  for (auto part : parts_to_merge) {\n    if (part->type() == PT_EQUATION) {\n      // If part is in cp_seeds_, then we mark it as nullptr so that we won't\n      // process it again.\n      for (auto &cp_seed : cp_seeds_) {\n        if (part == cp_seed) {\n          cp_seed = nullptr;\n          break;\n        }\n      }\n    }\n\n    // part has already been removed from part_grid_ in function\n    // ExpandSeedHorizontal/ExpandSeedVertical.\n    seed->Absorb(part, nullptr);\n  }\n\n  return true;\n}\n\nvoid EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *seed,\n                                          std::vector<ColPartition *> *parts_to_merge) {\n  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);\n  const float kYOverlapTh = 0.6;\n  const int kXGapTh = static_cast<int>(std::round(0.2f * resolution_));\n\n  ColPartitionGridSearch search(part_grid_);\n  const TBOX &seed_box(seed->bounding_box());\n  const int x = search_left ? 
seed_box.left() : seed_box.right();\n  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());\n  search.SetUniqueMode(true);\n\n  // Search iteratively.\n  ColPartition *part = nullptr;\n  while ((part = search.NextSideSearch(search_left)) != nullptr) {\n    if (part == seed) {\n      continue;\n    }\n    const TBOX &part_box(part->bounding_box());\n    if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope.\n      break;\n    }\n\n    // Check part location.\n    if ((part_box.left() >= seed_box.left() && search_left) ||\n        (part_box.right() <= seed_box.right() && !search_left)) {\n      continue;\n    }\n\n    if (part->type() != PT_EQUATION) { // Non-equation type.\n      // Skip PT_LINLINE_EQUATION and non text type.\n      if (part->type() == PT_INLINE_EQUATION ||\n          (!IsTextOrEquationType(part->type()) && part->blob_type() != BRT_HLINE)) {\n        continue;\n      }\n      // For other types, it should be the near small neighbor of seed.\n      if (!IsNearSmallNeighbor(seed_box, part_box) || !CheckSeedNeighborDensity(part)) {\n        continue;\n      }\n    } else { // Equation type, check the y overlap.\n      if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&\n          seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {\n        continue;\n      }\n    }\n\n    // Passed the check, delete it from search and add into parts_to_merge.\n    search.RemoveBBox();\n    parts_to_merge->push_back(part);\n  }\n}\n\nvoid EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *seed,\n                                        std::vector<ColPartition *> *parts_to_merge) {\n  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr && cps_super_bbox_ != nullptr);\n  const float kXOverlapTh = 0.4;\n  const int kYGapTh = static_cast<int>(std::round(0.2f * resolution_));\n\n  ColPartitionGridSearch search(part_grid_);\n  const TBOX &seed_box(seed->bounding_box());\n  const int y = search_bottom ? 
seed_box.bottom() : seed_box.top();\n  search.StartVerticalSearch(cps_super_bbox_->left(), cps_super_bbox_->right(), y);\n  search.SetUniqueMode(true);\n\n  // Search iteratively.\n  ColPartition *part = nullptr;\n  std::vector<ColPartition *> parts;\n  int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;\n  while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {\n    if (part == seed) {\n      continue;\n    }\n    const TBOX &part_box(part->bounding_box());\n\n    if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope.\n      break;\n    }\n\n    // Check part location.\n    if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||\n        (part_box.top() <= seed_box.top() && !search_bottom)) {\n      continue;\n    }\n\n    bool skip_part = false;\n    if (part->type() != PT_EQUATION) { // Non-equation type.\n      // Skip PT_LINLINE_EQUATION and non text type.\n      if (part->type() == PT_INLINE_EQUATION ||\n          (!IsTextOrEquationType(part->type()) && part->blob_type() != BRT_HLINE)) {\n        skip_part = true;\n      } else if (!IsNearSmallNeighbor(seed_box, part_box) || !CheckSeedNeighborDensity(part)) {\n        // For other types, it should be the near small neighbor of seed.\n        skip_part = true;\n      }\n    } else { // Equation type, check the x overlap.\n      if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&\n          seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {\n        skip_part = true;\n      }\n    }\n    if (skip_part) {\n      if (part->type() != PT_EQUATION) {\n        if (skipped_min_top > part_box.top()) {\n          skipped_min_top = part_box.top();\n        }\n        if (skipped_max_bottom < part_box.bottom()) {\n          skipped_max_bottom = part_box.bottom();\n        }\n      }\n    } else {\n      parts.push_back(part);\n    }\n  }\n\n  // For every part in parts, we need verify it is not above skipped_min_top\n  // when search top, or not below 
skipped_max_bottom when search bottom. I.e.,\n  // we will skip a part if it looks like:\n  //             search bottom      |         search top\n  // seed:     ******************   | part:    **********\n  // skipped: xxx                   | skipped:  xxx\n  // part:       **********         | seed:    ***********\n  for (auto &part : parts) {\n    const TBOX &part_box(part->bounding_box());\n    if ((search_bottom && part_box.top() <= skipped_max_bottom) ||\n        (!search_bottom && part_box.bottom() >= skipped_min_top)) {\n      continue;\n    }\n    // Add parts[i] into parts_to_merge, and delete it from part_grid_.\n    parts_to_merge->push_back(part);\n    part_grid_->RemoveBBox(part);\n  }\n}\n\nbool EquationDetect::IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const {\n  const int kXGapTh = static_cast<int>(std::round(0.25f * resolution_));\n  const int kYGapTh = static_cast<int>(std::round(0.05f * resolution_));\n\n  // Check geometric feature.\n  if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) {\n    return false;\n  }\n\n  // Check overlap and distance.\n  if ((!part_box.major_x_overlap(seed_box) || part_box.y_gap(seed_box) > kYGapTh) &&\n      (!part_box.major_y_overlap(seed_box) || part_box.x_gap(seed_box) > kXGapTh)) {\n    return false;\n  }\n\n  return true;\n}\n\nbool EquationDetect::CheckSeedNeighborDensity(const ColPartition *part) const {\n  ASSERT_HOST(part);\n  if (part->boxes_count() < kSeedBlobsCountTh) {\n    // Too few blobs, skip the check.\n    return true;\n  }\n\n  // We check the math blobs density and the unclear blobs density.\n  if (part->SpecialBlobsDensity(BSTT_MATH) + part->SpecialBlobsDensity(BSTT_DIGIT) >\n          kMathDigitDensityTh1 ||\n      part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) {\n    return true;\n  }\n\n  return false;\n}\n\nvoid EquationDetect::ProcessMathBlockSatelliteParts() {\n  // Iterate over part_grid_, and find all parts that are 
text type but not\n  // equation type.\n  ColPartition *part = nullptr;\n  std::vector<ColPartition *> text_parts;\n  ColPartitionGridSearch gsearch(part_grid_);\n  gsearch.StartFullSearch();\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {\n      text_parts.push_back(part);\n    }\n  }\n  if (text_parts.empty()) {\n    return;\n  }\n\n  // Compute the medium height of the text_parts.\n  std::sort(text_parts.begin(), text_parts.end(), &SortCPByHeight);\n  const TBOX &text_box = text_parts[text_parts.size() / 2]->bounding_box();\n  int med_height = text_box.height();\n  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {\n    const TBOX &text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box();\n    med_height = static_cast<int>(std::round(0.5f * (text_box.height() + med_height)));\n  }\n\n  // Iterate every text_parts and check if it is a math block satellite.\n  for (auto &text_part : text_parts) {\n    const TBOX &text_box(text_part->bounding_box());\n    if (text_box.height() > med_height) {\n      continue;\n    }\n    std::vector<ColPartition *> math_blocks;\n    if (!IsMathBlockSatellite(text_part, &math_blocks)) {\n      continue;\n    }\n\n    // Found. 
merge text_parts[i] with math_blocks.\n    part_grid_->RemoveBBox(text_part);\n    text_part->set_type(PT_EQUATION);\n    for (auto &math_block : math_blocks) {\n      part_grid_->RemoveBBox(math_block);\n      text_part->Absorb(math_block, nullptr);\n    }\n    InsertPartAfterAbsorb(text_part);\n  }\n}\n\nbool EquationDetect::IsMathBlockSatellite(ColPartition *part,\n                                          std::vector<ColPartition *> *math_blocks) {\n  ASSERT_HOST(part != nullptr && math_blocks != nullptr);\n  math_blocks->clear();\n  const TBOX &part_box(part->bounding_box());\n  // Find the top/bottom nearest neighbor of part.\n  ColPartition *neighbors[2];\n  int y_gaps[2] = {std::numeric_limits<int>::max(), std::numeric_limits<int>::max()};\n  // The horizontal boundary of the neighbors.\n  int neighbors_left = std::numeric_limits<int>::max(), neighbors_right = 0;\n  for (int i = 0; i < 2; ++i) {\n    neighbors[i] = SearchNNVertical(i != 0, part);\n    if (neighbors[i]) {\n      const TBOX &neighbor_box = neighbors[i]->bounding_box();\n      y_gaps[i] = neighbor_box.y_gap(part_box);\n      if (neighbor_box.left() < neighbors_left) {\n        neighbors_left = neighbor_box.left();\n      }\n      if (neighbor_box.right() > neighbors_right) {\n        neighbors_right = neighbor_box.right();\n      }\n    }\n  }\n  if (neighbors[0] == neighbors[1]) {\n    // This happens when part is inside neighbor.\n    neighbors[1] = nullptr;\n    y_gaps[1] = std::numeric_limits<int>::max();\n  }\n\n  // Check if part is within [neighbors_left, neighbors_right].\n  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {\n    return false;\n  }\n\n  // Get the index of the near one in neighbors.\n  int index = y_gaps[0] < y_gaps[1] ? 
0 : 1;\n\n  // Check the near one.\n  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {\n    math_blocks->push_back(neighbors[index]);\n  } else {\n    // If the near one failed the check, then we skip checking the far one.\n    return false;\n  }\n\n  // Check the far one.\n  index = 1 - index;\n  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {\n    math_blocks->push_back(neighbors[index]);\n  }\n\n  return true;\n}\n\nColPartition *EquationDetect::SearchNNVertical(const bool search_bottom, const ColPartition *part) {\n  ASSERT_HOST(part);\n  ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;\n  const int kYGapTh = static_cast<int>(std::round(resolution_ * 0.5f));\n\n  ColPartitionGridSearch search(part_grid_);\n  search.SetUniqueMode(true);\n  const TBOX &part_box(part->bounding_box());\n  int y = search_bottom ? part_box.bottom() : part_box.top();\n  search.StartVerticalSearch(part_box.left(), part_box.right(), y);\n  int min_y_gap = std::numeric_limits<int>::max();\n  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {\n    if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {\n      continue;\n    }\n    const TBOX &neighbor_box(neighbor->bounding_box());\n    int y_gap = neighbor_box.y_gap(part_box);\n    if (y_gap > kYGapTh) { // Out of scope.\n      break;\n    }\n    if (!neighbor_box.major_x_overlap(part_box) ||\n        (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||\n        (!search_bottom && neighbor_box.top() < part_box.top())) {\n      continue;\n    }\n    if (y_gap < min_y_gap) {\n      min_y_gap = y_gap;\n      nearest_neighbor = neighbor;\n    }\n  }\n\n  return nearest_neighbor;\n}\n\nbool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const {\n  if (!neighbor) {\n    return false;\n  }\n  const int kYGapTh = static_cast<int>(std::round(resolution_ * 0.1f));\n  return neighbor->type() == PT_EQUATION && y_gap <= 
kYGapTh;\n}\n\nvoid EquationDetect::GetOutputTiffName(const char *name, std::string &image_name) const {\n  ASSERT_HOST(name);\n  char page[50];\n  snprintf(page, sizeof(page), \"%04d\", page_count_);\n  image_name = (lang_tesseract_->imagebasename) + page + name + \".tif\";\n}\n\nvoid EquationDetect::PaintSpecialTexts(const std::string &outfile) const {\n  Image pix = nullptr, pixBi = lang_tesseract_->pix_binary();\n  pix = pixConvertTo32(pixBi);\n  ColPartitionGridSearch gsearch(part_grid_);\n  ColPartition *part = nullptr;\n  gsearch.StartFullSearch();\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    BLOBNBOX_C_IT blob_it(part->boxes());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      RenderSpecialText(pix, blob_it.data());\n    }\n  }\n\n  pixWrite(outfile.c_str(), pix, IFF_TIFF_LZW);\n  pix.destroy();\n}\n\nvoid EquationDetect::PaintColParts(const std::string &outfile) const {\n  Image pix = pixConvertTo32(lang_tesseract_->BestPix());\n  ColPartitionGridSearch gsearch(part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &tbox = part->bounding_box();\n    Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(), tbox.width(), tbox.height());\n    if (part->type() == PT_EQUATION) {\n      pixRenderBoxArb(pix, box, 5, 255, 0, 0);\n    } else if (part->type() == PT_INLINE_EQUATION) {\n      pixRenderBoxArb(pix, box, 5, 0, 255, 0);\n    } else {\n      pixRenderBoxArb(pix, box, 5, 0, 0, 255);\n    }\n    boxDestroy(&box);\n  }\n\n  pixWrite(outfile.c_str(), pix, IFF_TIFF_LZW);\n  pix.destroy();\n}\n\nvoid EquationDetect::PrintSpecialBlobsDensity(const ColPartition *part) const {\n  ASSERT_HOST(part);\n  TBOX box(part->bounding_box());\n  int h = pixGetHeight(lang_tesseract_->BestPix());\n  tprintf(\"Printing special blobs density values for ColPartition (t=%d,b=%d) \", h - box.top(),\n          h - 
box.bottom());\n  box.print();\n  tprintf(\"blobs count = %d, density = \", part->boxes_count());\n  for (int i = 0; i < BSTT_COUNT; ++i) {\n    auto type = static_cast<BlobSpecialTextType>(i);\n    tprintf(\"%d:%f \", i, part->SpecialBlobsDensity(type));\n  }\n  tprintf(\"\\n\");\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/equationdetect.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        equationdetect.h\n// Description: The equation detection class that inherits equationdetectbase.\n// Author:      Zongyi (Joe) Liu (joeliu@google.com)\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_\n#define TESSERACT_CCMAIN_EQUATIONDETECT_H_\n\n#include <tesseract/unichar.h>  // for UNICHAR_ID\n#include \"blobbox.h\"            // for BLOBNBOX (ptr only), BlobSpecialText...\n#include \"equationdetectbase.h\" // for EquationDetectBase\n#include \"tesseractclass.h\"     // for Tesseract\n\nclass TBOX;\nclass UNICHARSET;\n\nnamespace tesseract {\n\nclass Tesseract;\nclass ColPartition;\nclass ColPartitionGrid;\nclass ColPartitionSet;\n\nclass TESS_API EquationDetect : public EquationDetectBase {\npublic:\n  EquationDetect(const char *equ_datapath, const char *equ_language);\n  ~EquationDetect() override;\n\n  enum IndentType { NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT, INDENT_TYPE_COUNT };\n\n  // Reset the lang_tesseract_ pointer. This function should be called before we\n  // do any detector work.\n  void SetLangTesseract(Tesseract *lang_tesseract);\n\n  // Iterate over the blobs inside to_block, and set the blobs that we want to\n  // process to BSTT_NONE. 
(By default, they should be BSTT_SKIP). The function\n  // returns 0 upon success.\n  int LabelSpecialText(TO_BLOCK *to_block) override;\n\n  // Find possible equation partitions from part_grid. Should be called\n  // after the special_text_type of blobs are set.\n  // It returns 0 upon success.\n  int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) override;\n\n  // Reset the resolution of the processing image. TEST only function.\n  void SetResolution(const int resolution);\n\nprotected:\n  // Identify the special text type for one blob, and update its field. When\n  // height_th is set (> 0), we will label the blob as BSTT_NONE if its height\n  // is less than height_th.\n  void IdentifySpecialText(BLOBNBOX *blob, const int height_th);\n\n  // Estimate the type for one unichar.\n  BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset,\n                                             const UNICHAR_ID id) const;\n\n  // Compute special text type for each blobs in part_grid_.\n  void IdentifySpecialText();\n\n  // Identify blobs that we want to skip during special blob type\n  // classification.\n  void IdentifyBlobsToSkip(ColPartition *part);\n\n  // The ColPartitions in part_grid_ maybe over-segmented, particularly in the\n  // block equation regions. So we like to identify these partitions and merge\n  // them before we do the searching.\n  void MergePartsByLocation();\n\n  // Staring from the seed center, we do radius search. And for partitions that\n  // have large overlaps with seed, we remove them from part_grid_ and add into\n  // parts_overlap. 
Note: this function may update the part_grid_, so if the\n  // caller is also running ColPartitionGridSearch, use the RepositionIterator\n  // to continue.\n  void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);\n\n  // Insert part back into part_grid_, after it absorbs some other parts.\n  void InsertPartAfterAbsorb(ColPartition *part);\n\n  // Identify the colparitions in part_grid_, label them as PT_EQUATION, and\n  // save them into cp_seeds_.\n  void IdentifySeedParts();\n\n  // Check the blobs count for a seed region candidate.\n  bool CheckSeedBlobsCount(ColPartition *part);\n\n  // Compute the foreground pixel density for a tbox area.\n  float ComputeForegroundDensity(const TBOX &tbox);\n\n  // Check if part from seed2 label: with low math density and left indented. We\n  // are using two checks:\n  // 1. If its left is aligned with any coordinates in indented_texts_left,\n  // which we assume have been sorted.\n  // 2. If its foreground density is over foreground_density_th.\n  bool CheckForSeed2(const std::vector<int> &indented_texts_left,\n                     const float foreground_density_th, ColPartition *part);\n\n  // Count the number of values in sorted_vec that is close to val, used to\n  // check if a partition is aligned with text partitions.\n  int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;\n\n  // Check for a seed candidate using the foreground pixel density. 
And we\n  // return true if the density is below a certain threshold, because characters\n  // in equation regions usually are apart with more white spaces.\n  bool CheckSeedFgDensity(const float density_th, ColPartition *part);\n\n  // A light version of SplitCPHor: instead of really doing the part split, we\n  // simply compute the union bounding box of each split part.\n  void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);\n\n  // Split the part (horizontally), and save the split result into\n  // parts_splitted. Note that it is caller's responsibility to release the\n  // memory owns by parts_splitted. On the other hand, the part is unchanged\n  // during this process and still owns the blobs, so do NOT call DeleteBoxes\n  // when freeing the colpartitions in parts_splitted.\n  void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);\n\n  // Check the density for a seed candidate (part) using its math density and\n  // italic density, returns true if the check passed.\n  bool CheckSeedDensity(const float math_density_high, const float math_density_low,\n                        const ColPartition *part) const;\n\n  // Check if part is indented.\n  IndentType IsIndented(ColPartition *part);\n\n  // Identify inline partitions from cp_seeds_, and re-label them.\n  void IdentifyInlineParts();\n\n  // Compute the super bounding box for all colpartitions inside part_grid_.\n  void ComputeCPsSuperBBox();\n\n  // Identify inline partitions from cp_seeds_ using the horizontal search.\n  void IdentifyInlinePartsHorizontal();\n\n  // Estimate the line spacing between two text partitions. Returns -1 if not\n  // enough data.\n  int EstimateTextPartLineSpacing();\n\n  // Identify inline partitions from cp_seeds_ using vertical search.\n  void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing);\n\n  // Check if part is an inline equation zone. 
This should be called after we\n  // identified the seed regions.\n  bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part);\n\n  // For a given seed partition, we search the part_grid_ and see if there is\n  // any partition can be merged with it. It returns true if the seed has been\n  // expanded.\n  bool ExpandSeed(ColPartition *seed);\n\n  // Starting from the seed position, we search the part_grid_\n  // horizontally/vertically, find all partitions that can be\n  // merged with seed, remove them from part_grid_, and put them  into\n  // parts_to_merge.\n  void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,\n                            std::vector<ColPartition *> *parts_to_merge);\n  void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,\n                          std::vector<ColPartition *> *parts_to_merge);\n\n  // Check if a part_box is the small neighbor of seed_box.\n  bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;\n\n  // Perform the density check for part, which we assume is nearing a seed\n  // partition. It returns true if the check passed.\n  bool CheckSeedNeighborDensity(const ColPartition *part) const;\n\n  // After identify the math blocks, we do one more scanning on all text\n  // partitions, and check if any of them is the satellite of:\n  // math blocks: here a p is the satellite of q if:\n  // 1. q is the nearest vertical neighbor of p, and\n  // 2. y_gap(p, q) is less than a threshold, and\n  // 3. x_overlap(p, q) is over a threshold.\n  // Note that p can be the satellites of two blocks: its top neighbor and\n  // bottom neighbor.\n  void ProcessMathBlockSatelliteParts();\n\n  // Check if part is the satellite of one/two math blocks. 
If it is, we return\n  // true, and save the blocks into math_blocks.\n  bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);\n\n  // Search the nearest neighbor of part in one vertical direction as defined in\n  // search_bottom. It returns the neighbor found that major x overlap with it,\n  // or nullptr when not found.\n  ColPartition *SearchNNVertical(const bool search_bottom, const ColPartition *part);\n\n  // Check if the neighbor with vertical distance of y_gap is a near and math\n  // block partition.\n  bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;\n\n  // Generate the tiff file name for output/debug file.\n  void GetOutputTiffName(const char *name, std::string &image_name) const;\n\n  // Debugger function that renders ColPartitions on the input image, where:\n  // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION\n  // will be painted in green, and other parts will be painted in blue.\n  void PaintColParts(const std::string &outfile) const;\n\n  // Debugger function that renders the blobs in part_grid_ over the input\n  // image.\n  void PaintSpecialTexts(const std::string &outfile) const;\n\n  // Debugger function that print the math blobs density values for a\n  // ColPartition object.\n  void PrintSpecialBlobsDensity(const ColPartition *part) const;\n\n  // The tesseract engine initialized from equation training data.\n  Tesseract equ_tesseract_;\n\n  // The tesseract engine used for OCR. This pointer is passed in by the caller,\n  // so do NOT destroy it in this class.\n  Tesseract *lang_tesseract_;\n\n  // The ColPartitionGrid that we are processing. This pointer is passed in from\n  // the caller, so do NOT destroy it in the class.\n  ColPartitionGrid *part_grid_ = nullptr;\n\n  // A simple array of pointers to the best assigned column division at\n  // each grid y coordinate. 
This pointer is passed in from the caller, so do\n  // NOT destroy it in the class.\n  ColPartitionSet **best_columns_ = nullptr;\n\n  // The super bounding box of all cps in the part_grid_.\n  TBOX *cps_super_bbox_;\n\n  // The seed ColPartition for equation region.\n  std::vector<ColPartition *> cp_seeds_;\n\n  // The resolution (dpi) of the processing image.\n  int resolution_;\n\n  // The number of pages we have processed.\n  int page_count_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_\n"
  },
  {
    "path": "src/ccmain/fixspace.cpp",
    "content": "/******************************************************************\n * File:        fixspace.cpp  (Formerly fixspace.c)\n * Description: Implements a pass over the page res, exploring the alternative\n *              spacing possibilities, trying to use context to improve the\n *              word spacing\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"fixspace.h\"\n\n#include \"blobs.h\"          // for TWERD, TBLOB, TESSLINE\n#include \"boxword.h\"        // for BoxWord\n#include \"errcode.h\"        // for ASSERT_HOST\n#include \"normalis.h\"       // for kBlnXHeight, kBlnBaselineOffset\n#include \"pageres.h\"        // for WERD_RES_IT, WERD_RES, WERD_RES_LIST\n#include \"params.h\"         // for IntParam, StringParam, BoolParam, DoubleParam, ...\n#include \"ratngs.h\"         // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM\n#include \"rect.h\"           // for TBOX\n#include \"stepblob.h\"       // for C_BLOB_IT, C_BLOB_LIST, C_BLOB\n#include \"tesseractclass.h\" // for Tesseract, TesseractStats, WordData\n#include \"tessvars.h\"       // for debug_fp\n#include \"tprintf.h\"        // for tprintf\n#include \"unicharset.h\"     // for UNICHARSET\n#include \"werd.h\"           // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP\n\n#include <tesseract/ocrclass.h> // for 
ETEXT_DESC\n#include <tesseract/unichar.h>  // for UNICHAR_ID\n\n#include <cstdint> // for INT16_MAX, int16_t, int32_t\n\nnamespace tesseract {\n\nclass BLOCK;\nclass ROW;\n\n#define PERFECT_WERDS 999\n\n/**********************************************************************\n *  c_blob_comparator()\n *\n *  Blob comparator used to sort a blob list so that blobs are in increasing\n *  order of left edge.\n **********************************************************************/\n\nstatic int c_blob_comparator( // sort blobs\n    const C_BLOB *blob1,\n    const C_BLOB *blob2\n) {\n  return blob1->bounding_box().left() - blob2->bounding_box().left();\n}\n\n/**\n * @name fix_fuzzy_spaces()\n * Walk over the page finding sequences of words joined by fuzzy spaces. Extract\n * them as a sublist, process the sublist to find the optimal arrangement of\n * spaces then replace the sublist in the ROW_RES.\n *\n * @param monitor progress monitor\n * @param word_count count of words in doc\n * @param[out] page_res\n */\nvoid Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {\n  BLOCK_RES_IT block_res_it;\n  ROW_RES_IT row_res_it;\n  WERD_RES_IT word_res_it_from;\n  WERD_RES_IT word_res_it_to;\n  WERD_RES *word_res;\n  WERD_RES_LIST fuzzy_space_words;\n  int16_t new_length;\n  bool prevent_null_wd_fixsp; // DON'T process blobless wds\n  int32_t word_index;         // current word\n\n  block_res_it.set_to_list(&page_res->block_res_list);\n  word_index = 0;\n  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {\n    row_res_it.set_to_list(&block_res_it.data()->row_res_list);\n    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {\n      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);\n      while (!word_res_it_from.at_last()) {\n        word_res = word_res_it_from.data();\n        while (!word_res_it_from.at_last() &&\n               
!(word_res->combination ||\n                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||\n                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {\n          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);\n          word_res = word_res_it_from.forward();\n          word_index++;\n          if (monitor != nullptr) {\n            monitor->ocr_alive = true;\n            monitor->progress = 90 + 5 * word_index / word_count;\n            if (monitor->deadline_exceeded() ||\n                (monitor->cancel != nullptr &&\n                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {\n              return;\n            }\n          }\n        }\n\n        if (!word_res_it_from.at_last()) {\n          word_res_it_to = word_res_it_from;\n          prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();\n          if (check_debug_pt(word_res, 60)) {\n            debug_fix_space_level.set_value(10);\n          }\n          word_res_it_to.forward();\n          word_index++;\n          if (monitor != nullptr) {\n            monitor->ocr_alive = true;\n            monitor->progress = 90 + 5 * word_index / word_count;\n            if (monitor->deadline_exceeded() ||\n                (monitor->cancel != nullptr &&\n                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {\n              return;\n            }\n          }\n          while (!word_res_it_to.at_last() &&\n                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||\n                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {\n            if (check_debug_pt(word_res, 60)) {\n              debug_fix_space_level.set_value(10);\n            }\n            if (word_res->word->cblob_list()->empty()) {\n              prevent_null_wd_fixsp = true;\n            }\n            word_res = word_res_it_to.forward();\n          }\n          if (check_debug_pt(word_res, 
60)) {\n            debug_fix_space_level.set_value(10);\n          }\n          if (word_res->word->cblob_list()->empty()) {\n            prevent_null_wd_fixsp = true;\n          }\n          if (prevent_null_wd_fixsp) {\n            word_res_it_from = word_res_it_to;\n          } else {\n            fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);\n            fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,\n                                 block_res_it.data()->block);\n            new_length = fuzzy_space_words.length();\n            word_res_it_from.add_list_before(&fuzzy_space_words);\n            for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {\n              word_res_it_from.forward();\n            }\n          }\n          if (test_pt) {\n            debug_fix_space_level.set_value(0);\n          }\n        }\n        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);\n        // Last word in row\n      }\n    }\n  }\n}\n\nvoid Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {\n  int16_t best_score;\n  WERD_RES_LIST current_perm;\n  bool improved = false;\n\n  best_score = eval_word_spacing(best_perm); // default score\n  dump_words(best_perm, best_score, 1, improved);\n\n  if (best_score != PERFECT_WERDS) {\n    initialise_search(best_perm, current_perm);\n  }\n\n  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {\n    match_current_words(current_perm, row, block);\n    int16_t current_score = eval_word_spacing(current_perm);\n    dump_words(current_perm, current_score, 2, improved);\n    if (current_score > best_score) {\n      best_perm.clear();\n      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);\n      best_score = current_score;\n      improved = true;\n    }\n    if (current_score < PERFECT_WERDS) {\n      transform_to_next_perm(current_perm);\n    }\n  }\n  dump_words(best_perm, best_score, 3, 
improved);\n}\n\nvoid initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {\n  WERD_RES_IT src_it(&src_list);\n  WERD_RES_IT new_it(&new_list);\n  WERD_RES *new_wd;\n\n  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {\n    WERD_RES *src_wd = src_it.data();\n    if (!src_wd->combination) {\n      new_wd = WERD_RES::deep_copy(src_wd);\n      new_wd->combination = false;\n      new_wd->part_of_combo = false;\n      new_it.add_after_then_move(new_wd);\n    }\n  }\n}\n\nvoid Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {\n  WERD_RES_IT word_it(&words);\n  WERD_RES *word;\n  // Since we are not using PAGE_RES to iterate over words, we need to update\n  // prev_word_best_choice_ before calling classify_word_pass2().\n  prev_word_best_choice_ = nullptr;\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word = word_it.data();\n    if ((!word->part_of_combo) && (word->box_word == nullptr)) {\n      WordData word_data(block, row, word);\n      SetupWordPassN(2, &word_data);\n      classify_word_and_language(2, nullptr, &word_data);\n    }\n    prev_word_best_choice_ = word->best_choice;\n  }\n}\n\n/**\n * @name eval_word_spacing()\n * The basic measure is the number of characters in contextually confirmed\n * words. (I.e the word is done)\n * If all words are contextually confirmed the evaluation is deemed perfect.\n *\n * Some fiddles are done to handle \"1\"s as these are VERY frequent causes of\n * fuzzy spaces. 
The problem with the basic measure is that \"561 63\" would score\n * the same as \"56163\", though given our knowledge that the space is fuzzy, and\n * that there is a \"1\" next to the fuzzy space, we need to ensure that \"56163\"\n * is preferred.\n *\n * The solution is to NOT COUNT the score of any word which has a digit at one\n * end and a \"1Il\" as the character the other side of the space.\n *\n * Conversely, any character next to a \"1\" within a word is counted as a\n * positive score. Thus \"561 63\" would score 4 (3 chars in a numeric word plus 1\n * side of the \"1\" joined).  \"56163\" would score 7 - all chars in a numeric word\n * + 2 sides of a \"1\" joined.\n *\n * The joined 1 rule is applied to any word REGARDLESS of contextual\n * confirmation.  Thus \"PS7a71 3/7a\" scores 1 (neither word is contexutally\n * confirmed. The only score is from the joined 1. \"PS7a713/7a\" scores 2.\n *\n */\nint16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {\n  WERD_RES_IT word_res_it(&word_res_list);\n  int16_t total_score = 0;\n  int16_t word_count = 0;\n  int16_t done_word_count = 0;\n  int i;\n  int16_t offset;\n  int16_t prev_word_score = 0;\n  bool prev_word_done = false;\n  bool prev_char_1 = false;     // prev ch a \"1/I/l\"?\n  bool prev_char_digit = false; // prev ch 2..9 or 0\n  const char *punct_chars = \"!\\\"`',.:;\";\n  do {\n    // current word\n    WERD_RES *word = word_res_it.data();\n    bool word_done = fixspace_thinks_word_done(word);\n    word_count++;\n    if (word->tess_failed) {\n      total_score += prev_word_score;\n      if (prev_word_done) {\n        done_word_count++;\n      }\n      prev_word_score = 0;\n      prev_char_1 = false;\n      prev_char_digit = false;\n      prev_word_done = false;\n    } else {\n      /*\n  Can we add the prev word score and potentially count this word?\n  Yes IF it didn't end in a 1 when the first char of this word is a digit\n    AND it didn't end in a digit when the first char of 
this word is a 1\n*/\n      auto word_len = word->reject_map.length();\n      bool current_word_ok_so_far = false;\n      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||\n            (prev_char_digit &&\n             ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&\n               word->best_choice->unichar_string()[0] == '1') ||\n              (!word_done &&\n               conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {\n        total_score += prev_word_score;\n        if (prev_word_done) {\n          done_word_count++;\n        }\n        current_word_ok_so_far = word_done;\n      }\n\n      if (current_word_ok_so_far) {\n        prev_word_done = true;\n        prev_word_score = word_len;\n      } else {\n        prev_word_done = false;\n        prev_word_score = 0;\n      }\n\n      /* Add 1 to total score for every joined 1 regardless of context and\n   rejtn */\n      for (i = 0, prev_char_1 = false; i < word_len; i++) {\n        bool current_char_1 = word->best_choice->unichar_string()[i] == '1';\n        if (prev_char_1 || (current_char_1 && (i > 0))) {\n          total_score++;\n        }\n        prev_char_1 = current_char_1;\n      }\n\n      /* Add 1 to total score for every joined punctuation regardless of context\n  and rejtn */\n      if (tessedit_prefer_joined_punct) {\n        bool prev_char_punct;\n        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;\n             offset += word->best_choice->unichar_lengths()[i++]) {\n          bool current_char_punct =\n              strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;\n          if (prev_char_punct || (current_char_punct && i > 0)) {\n            total_score++;\n          }\n          prev_char_punct = current_char_punct;\n        }\n      }\n      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);\n      for (i = 0, offset = 0; i < word_len - 1;\n           offset += 
word->best_choice->unichar_lengths()[i++]) {\n        ;\n      }\n      prev_char_1 =\n          ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||\n           (!word_done &&\n            conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));\n    }\n    /* Find next word */\n    do {\n      word_res_it.forward();\n    } while (word_res_it.data()->part_of_combo);\n  } while (!word_res_it.at_first());\n  total_score += prev_word_score;\n  if (prev_word_done) {\n    done_word_count++;\n  }\n  if (done_word_count == word_count) {\n    return PERFECT_WERDS;\n  } else {\n    return total_score;\n  }\n}\n\nbool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {\n  int i;\n  int offset;\n\n  for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) {\n    ;\n  }\n  return (\n      word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,\n                                 word->best_choice->unichar_lengths()[i]) ||\n      (word->best_choice->permuter() == NUMBER_PERM &&\n       numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));\n}\n\n/**\n * @name transform_to_next_perm()\n * Examines the current word list to find the smallest word gap size. 
Then walks\n * the word list closing any gaps of this size by either inserted new\n * combination words, or extending existing ones.\n *\n * The routine COULD be limited to stop it building words longer than N blobs.\n *\n * If there are no more gaps then it DELETES the entire list and returns the\n * empty list to cause termination.\n */\nvoid transform_to_next_perm(WERD_RES_LIST &words) {\n  WERD_RES_IT word_it(&words);\n  WERD_RES_IT prev_word_it(&words);\n  WERD_RES *word;\n  WERD_RES *prev_word;\n  int16_t prev_right = -INT16_MAX;\n  TBOX box;\n  int16_t gap;\n  int16_t min_gap = INT16_MAX;\n\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word = word_it.data();\n    if (!word->part_of_combo) {\n      box = word->word->bounding_box();\n      if (prev_right > -INT16_MAX) {\n        gap = box.left() - prev_right;\n        if (gap < min_gap) {\n          min_gap = gap;\n        }\n      }\n      prev_right = box.right();\n    }\n  }\n  if (min_gap < INT16_MAX) {\n    prev_right = -INT16_MAX; // back to start\n    word_it.set_to_list(&words);\n    // Note: we can't use cycle_pt due to inserted combos at start of list.\n    for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {\n      word = word_it.data();\n      if (!word->part_of_combo) {\n        box = word->word->bounding_box();\n        if (prev_right > -INT16_MAX) {\n          gap = box.left() - prev_right;\n          if (gap <= min_gap) {\n            prev_word = prev_word_it.data();\n            WERD_RES *combo;\n            if (prev_word->combination) {\n              combo = prev_word;\n            } else {\n              /* Make a new combination and insert before\n               * the first word being joined. 
*/\n              auto *copy_word = new WERD;\n              *copy_word = *(prev_word->word);\n              // deep copy\n              combo = new WERD_RES(copy_word);\n              combo->combination = true;\n              combo->x_height = prev_word->x_height;\n              prev_word->part_of_combo = true;\n              prev_word_it.add_before_then_move(combo);\n            }\n            combo->word->set_flag(W_EOL, word->word->flag(W_EOL));\n            if (word->combination) {\n              combo->word->join_on(word->word);\n              // Move blobs to combo\n              // old combo no longer needed\n              delete word_it.extract();\n            } else {\n              // Copy current wd to combo\n              combo->copy_on(word);\n              word->part_of_combo = true;\n            }\n            combo->done = false;\n            combo->ClearResults();\n          } else {\n            prev_word_it = word_it; // catch up\n          }\n        }\n        prev_right = box.right();\n      }\n    }\n  } else {\n    words.clear(); // signal termination\n  }\n}\n\nvoid Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {\n  WERD_RES_IT word_res_it(&perm);\n\n  if (debug_fix_space_level > 0) {\n    if (mode == 1) {\n      stats_.dump_words_str = \"\";\n      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {\n        if (!word_res_it.data()->part_of_combo) {\n          stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();\n          stats_.dump_words_str += ' ';\n        }\n      }\n    }\n\n    if (debug_fix_space_level > 1) {\n      switch (mode) {\n        case 1:\n          tprintf(\"EXTRACTED (%d): \\\"\", score);\n          break;\n        case 2:\n          tprintf(\"TESTED (%d): \\\"\", score);\n          break;\n        case 3:\n          tprintf(\"RETURNED (%d): \\\"\", score);\n          break;\n      }\n\n      for 
(word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {\n        if (!word_res_it.data()->part_of_combo) {\n          tprintf(\"%s/%1d \", word_res_it.data()->best_choice->unichar_string().c_str(),\n                  static_cast<int>(word_res_it.data()->best_choice->permuter()));\n        }\n      }\n      tprintf(\"\\\"\\n\");\n    } else if (improved) {\n      tprintf(\"FIX SPACING \\\"%s\\\" => \\\"\", stats_.dump_words_str.c_str());\n      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {\n        if (!word_res_it.data()->part_of_combo) {\n          tprintf(\"%s/%1d \", word_res_it.data()->best_choice->unichar_string().c_str(),\n                  static_cast<int>(word_res_it.data()->best_choice->permuter()));\n        }\n      }\n      tprintf(\"\\\"\\n\");\n    }\n  }\n}\n\nbool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {\n  if (word->done) {\n    return true;\n  }\n\n  /*\n  Use all the standard pass 2 conditions for mode 5 in set_done() in\n  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T\n  CARE WHETHER WE HAVE of/at on/an etc.\n*/\n  if (fixsp_done_mode > 0 &&\n      (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||\n       fixsp_done_mode == 3) &&\n      (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&\n      ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||\n       (word->best_choice->permuter() == FREQ_DAWG_PERM) ||\n       (word->best_choice->permuter() == USER_DAWG_PERM) ||\n       (word->best_choice->permuter() == NUMBER_PERM))) {\n    return true;\n  } else {\n    return false;\n  }\n}\n\n/**\n * @name fix_sp_fp_word()\n * Test the current word to see if it can be split by deleting noise blobs. 
If\n * so, do the business.\n * Return with the iterator pointing to the same place if the word is unchanged,\n * or the last of the replacement words.\n */\nvoid Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {\n  WERD_RES *word_res;\n  WERD_RES_LIST sub_word_list;\n  WERD_RES_IT sub_word_list_it(&sub_word_list);\n  int16_t new_length;\n  float junk;\n\n  word_res = word_res_it.data();\n  if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||\n      !word_res->word->flag(W_DONT_CHOP)) {\n    return;\n  }\n\n  auto blob_index = worst_noise_blob(word_res, &junk);\n  if (blob_index < 0) {\n    return;\n  }\n\n  if (debug_fix_space_level > 1) {\n    tprintf(\"FP fixspace working on \\\"%s\\\"\\n\", word_res->best_choice->unichar_string().c_str());\n  }\n  word_res->word->rej_cblob_list()->sort(c_blob_comparator);\n  sub_word_list_it.add_after_stay_put(word_res_it.extract());\n  fix_noisy_space_list(sub_word_list, row, block);\n  new_length = sub_word_list.length();\n  word_res_it.add_list_before(&sub_word_list);\n  for (; !word_res_it.at_last() && new_length > 1; new_length--) {\n    word_res_it.forward();\n  }\n}\n\nvoid Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {\n  int16_t best_score;\n  WERD_RES_IT best_perm_it(&best_perm);\n  WERD_RES_LIST current_perm;\n  WERD_RES_IT current_perm_it(&current_perm);\n  WERD_RES *old_word_res;\n  int16_t current_score;\n  bool improved = false;\n\n  best_score = fp_eval_word_spacing(best_perm); // default score\n\n  dump_words(best_perm, best_score, 1, improved);\n\n  old_word_res = best_perm_it.data();\n  // Even deep_copy doesn't copy the underlying WERD unless its combination\n  // flag is true!.\n  old_word_res->combination = true; // Kludge to force deep copy\n  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));\n  old_word_res->combination = false; // Undo kludge\n\n  
break_noisiest_blob_word(current_perm);\n\n  while (best_score != PERFECT_WERDS && !current_perm.empty()) {\n    match_current_words(current_perm, row, block);\n    current_score = fp_eval_word_spacing(current_perm);\n    dump_words(current_perm, current_score, 2, improved);\n    if (current_score > best_score) {\n      best_perm.clear();\n      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);\n      best_score = current_score;\n      improved = true;\n    }\n    if (current_score < PERFECT_WERDS) {\n      break_noisiest_blob_word(current_perm);\n    }\n  }\n  dump_words(best_perm, best_score, 3, improved);\n}\n\n/**\n * break_noisiest_blob_word()\n * Find the word with the blob which looks like the worst noise.\n * Break the word into two, deleting the noise blob.\n */\nvoid Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {\n  WERD_RES_IT word_it(&words);\n  WERD_RES_IT worst_word_it;\n  float worst_noise_score = 9999;\n  int worst_blob_index = -1; // Noisiest blob of noisiest wd\n  float noise_score;         // of wds noisiest blob\n  WERD_RES *word_res;\n  C_BLOB_IT blob_it;\n  C_BLOB_IT rej_cblob_it;\n  C_BLOB_LIST new_blob_list;\n  C_BLOB_IT new_blob_it;\n  C_BLOB_IT new_rej_cblob_it;\n  WERD *new_word;\n  int16_t start_of_noise_blob;\n  int16_t i;\n\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    auto blob_index = worst_noise_blob(word_it.data(), &noise_score);\n    if (blob_index > -1 && worst_noise_score > noise_score) {\n      worst_noise_score = noise_score;\n      worst_blob_index = blob_index;\n      worst_word_it = word_it;\n    }\n  }\n  if (worst_blob_index < 0) {\n    words.clear(); // signal termination\n    return;\n  }\n\n  /* Now split the worst_word_it */\n\n  word_res = worst_word_it.data();\n\n  /* Move blobs before noise blob to a new bloblist */\n\n  new_blob_it.set_to_list(&new_blob_list);\n  blob_it.set_to_list(word_res->word->cblob_list());\n  for (i = 0; i < worst_blob_index; i++, 
blob_it.forward()) {\n    new_blob_it.add_after_then_move(blob_it.extract());\n  }\n  start_of_noise_blob = blob_it.data()->bounding_box().left();\n  delete blob_it.extract(); // throw out noise blob\n\n  new_word = new WERD(&new_blob_list, word_res->word);\n  new_word->set_flag(W_EOL, false);\n  word_res->word->set_flag(W_BOL, false);\n  word_res->word->set_blanks(1); // After break\n\n  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());\n  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());\n  for (; (!rej_cblob_it.empty() &&\n          (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));\n       rej_cblob_it.forward()) {\n    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());\n  }\n\n  auto *new_word_res = new WERD_RES(new_word);\n  new_word_res->combination = true;\n  worst_word_it.add_before_then_move(new_word_res);\n\n  word_res->ClearResults();\n}\n\nint16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {\n  float noise_score[512];\n  int min_noise_blob; // 1st contender\n  int max_noise_blob; // last contender\n  int non_noise_count;\n  int worst_noise_blob; // Worst blob\n  float small_limit = kBlnXHeight * fixsp_small_outlines_size;\n  float non_noise_limit = kBlnXHeight * 0.8;\n\n  if (word_res->rebuild_word == nullptr) {\n    return -1; // Can't handle cube words.\n  }\n\n  // Normalised.\n  auto blob_count = word_res->box_word->length();\n  ASSERT_HOST(blob_count <= 512);\n  if (blob_count < 5) {\n    return -1; // too short to split\n  }\n\n    /* Get the noise scores for all blobs */\n\n#ifndef SECURE_NAMES\n  if (debug_fix_space_level > 5) {\n    tprintf(\"FP fixspace Noise metrics for \\\"%s\\\": \",\n            word_res->best_choice->unichar_string().c_str());\n  }\n#endif\n\n  for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {\n    TBLOB *blob = word_res->rebuild_word->blobs[i];\n    if (word_res->reject_map[i].accepted()) {\n      
noise_score[i] = non_noise_limit;\n    } else {\n      noise_score[i] = blob_noise_score(blob);\n    }\n\n    if (debug_fix_space_level > 5) {\n      tprintf(\"%1.1f \", noise_score[i]);\n    }\n  }\n  if (debug_fix_space_level > 5) {\n    tprintf(\"\\n\");\n  }\n\n  /* Now find the worst one which is far enough away from the end of the word */\n\n  non_noise_count = 0;\n  int i;\n  for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {\n    if (noise_score[i] >= non_noise_limit) {\n      non_noise_count++;\n    }\n  }\n  if (non_noise_count < fixsp_non_noise_limit) {\n    return -1;\n  }\n\n  min_noise_blob = i;\n\n  non_noise_count = 0;\n  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {\n    if (noise_score[i] >= non_noise_limit) {\n      non_noise_count++;\n    }\n  }\n  if (non_noise_count < fixsp_non_noise_limit) {\n    return -1;\n  }\n\n  max_noise_blob = i;\n\n  if (min_noise_blob > max_noise_blob) {\n    return -1;\n  }\n\n  *worst_noise_score = small_limit;\n  worst_noise_blob = -1;\n  for (auto i = min_noise_blob; i <= max_noise_blob; i++) {\n    if (noise_score[i] < *worst_noise_score) {\n      worst_noise_blob = i;\n      *worst_noise_score = noise_score[i];\n    }\n  }\n  return worst_noise_blob;\n}\n\nfloat Tesseract::blob_noise_score(TBLOB *blob) {\n  TBOX box; // BB of outline\n  int16_t outline_count = 0;\n  int16_t max_dimension;\n  int16_t largest_outline_dimension = 0;\n\n  for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {\n    outline_count++;\n    box = ol->bounding_box();\n    if (box.height() > box.width()) {\n      max_dimension = box.height();\n    } else {\n      max_dimension = box.width();\n    }\n\n    if (largest_outline_dimension < max_dimension) {\n      largest_outline_dimension = max_dimension;\n    }\n  }\n\n  if (outline_count > 5) {\n    // penalise LOTS of blobs\n    largest_outline_dimension *= 2;\n  }\n\n  box = 
blob->bounding_box();\n  if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) {\n    // Lax blob is if high or low\n    largest_outline_dimension /= 2;\n  }\n\n  return largest_outline_dimension;\n}\n\nvoid fixspace_dbg(WERD_RES *word) {\n  TBOX box = word->word->bounding_box();\n  const bool show_map_detail = false;\n\n  box.print();\n  tprintf(\" \\\"%s\\\" \", word->best_choice->unichar_string().c_str());\n  tprintf(\"Blob count: %d (word); %d/%d (rebuild word)\\n\", word->word->cblob_list()->length(),\n          word->rebuild_word->NumBlobs(), word->box_word->length());\n  word->reject_map.print(debug_fp);\n  tprintf(\"\\n\");\n  if (show_map_detail) {\n    tprintf(\"\\\"%s\\\"\\n\", word->best_choice->unichar_string().c_str());\n    for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\\0'; i++) {\n      tprintf(\"**** \\\"%c\\\" ****\\n\", word->best_choice->unichar_string()[i]);\n      word->reject_map[i].full_print(debug_fp);\n    }\n  }\n\n  tprintf(\"Tess Accepted: %s\\n\", word->tess_accepted ? \"TRUE\" : \"FALSE\");\n  tprintf(\"Done flag: %s\\n\\n\", word->done ? 
\"TRUE\" : \"FALSE\");\n}\n\n/**\n * fp_eval_word_spacing()\n * Evaluation function for fixed pitch word lists.\n *\n * Basically, count the number of \"nice\" characters - those which are in tess\n * acceptable words or in dict words and are not rejected.\n * Penalise any potential noise chars\n */\nint16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {\n  WERD_RES_IT word_it(&word_res_list);\n  WERD_RES *word;\n  int16_t score = 0;\n  float small_limit = kBlnXHeight * fixsp_small_outlines_size;\n\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word = word_it.data();\n    if (word->rebuild_word == nullptr) {\n      continue; // Can't handle cube words.\n    }\n    if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||\n        word->best_choice->permuter() == FREQ_DAWG_PERM ||\n        word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {\n      auto num_blobs = word->rebuild_word->NumBlobs();\n      UNICHAR_ID space = word->uch_set->unichar_to_id(\" \");\n      for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {\n        TBLOB *blob = word->rebuild_word->blobs[i];\n        if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {\n          score -= 1; // penalise possibly erroneous non-space\n        } else if (word->reject_map[i].accepted()) {\n          score++;\n        }\n      }\n    }\n  }\n  if (score < 0) {\n    score = 0;\n  }\n  return score;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/fixspace.h",
    "content": "/******************************************************************\n * File:        fixspace.h  (Formerly fixspace.h)\n * Description: Implements a pass over the page res, exploring the alternative\n *              spacing possibilities, trying to use context to improve the\n *              word spacing\n * Author:      Phil Cheatle\n * Created:     Thu Oct 21 11:38:43 BST 1993\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef FIXSPACE_H\n#define FIXSPACE_H\n\nnamespace tesseract {\n\nclass WERD_RES;\nclass WERD_RES_LIST;\n\nvoid initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);\nvoid transform_to_next_perm(WERD_RES_LIST &words);\nvoid fixspace_dbg(WERD_RES *word);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccmain/fixxht.cpp",
    "content": "/**********************************************************************\n * File:        fixxht.cpp  (Formerly fixxht.c)\n * Description: Improve x_ht and look out for case inconsistencies\n * Author:      Phil Cheatle\n * Created:     Thu Aug  5 14:11:08 BST 1993\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"float2int.h\"\n#include \"params.h\"\n#include \"tesseractclass.h\"\n\n#include <algorithm>\n#include <cctype>\n#include <cmath>\n#include <cstring>\n\nnamespace tesseract {\n\n// Fixxht overview.\n// Premise: Initial estimate of x-height is adequate most of the time, but\n// occasionally it is incorrect. Most notable causes of failure are:\n// 1. Small caps, where the top of the caps is the same as the body text\n// xheight. For small caps words the xheight needs to be reduced to correctly\n// recognize the caps in the small caps word.\n// 2. All xheight lines, such as summer. Here the initial estimate will have\n// guessed that the blob tops are caps and will have placed the xheight too low.\n// 3. Noise/logos beside words, or changes in font size on a line. Such\n// things can blow the statistics and cause an incorrect estimate.\n// 4. Incorrect baseline. 
Can happen when 2 columns are incorrectly merged.\n// In this case the x-height is often still correct.\n//\n// Algorithm.\n// Compare the vertical position (top only) of alphnumerics in a word with\n// the range of positions in training data (in the unicharset).\n// See CountMisfitTops. If any characters disagree sufficiently with the\n// initial xheight estimate, then recalculate the xheight, re-run OCR on\n// the word, and if the number of vertical misfits goes down, along with\n// either the word rating or certainty, then keep the new xheight.\n// The new xheight is calculated as follows:ComputeCompatibleXHeight\n// For each alphanumeric character that has a vertically misplaced top\n// (a misfit), yet its bottom is within the acceptable range (ie it is not\n// likely a sub-or super-script) calculate the range of acceptable xheight\n// positions from its range of tops, and give each value in the range a\n// number of votes equal to the distance of its top from its acceptance range.\n// The x-height position with the median of the votes becomes the new\n// x-height. This assumes that most characters will be correctly recognized\n// even if the x-height is incorrect. This is not a terrible assumption, but\n// it is not great. An improvement would be to use a classifier that does\n// not care about vertical position or scaling at all.\n// Separately collect stats on shifted baselines and apply the same logic to\n// computing a best-fit shift to fix the error. 
If the baseline needs to be\n// shifted, but the x-height is OK, returns the original x-height along with\n// the baseline shift to indicate that recognition needs to re-run.\n\n// If the max-min top of a unicharset char is bigger than kMaxCharTopRange\n// then the char top cannot be used to judge misfits or suggest a new top.\nconst int kMaxCharTopRange = 48;\n\n// Returns the number of misfit blob tops in this word.\nint Tesseract::CountMisfitTops(WERD_RES *word_res) {\n  int bad_blobs = 0;\n  int num_blobs = word_res->rebuild_word->NumBlobs();\n  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {\n    TBLOB *blob = word_res->rebuild_word->blobs[blob_id];\n    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);\n    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {\n      int top = blob->bounding_box().top();\n      if (top >= INT_FEAT_RANGE) {\n        top = INT_FEAT_RANGE - 1;\n      }\n      int min_bottom, max_bottom, min_top, max_top;\n      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);\n      if (max_top - min_top > kMaxCharTopRange) {\n        continue;\n      }\n      bool bad =\n          top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance;\n      if (bad) {\n        ++bad_blobs;\n      }\n      if (debug_x_ht_level >= 1) {\n        tprintf(\"Class %s is %s with top %d vs limits of %d->%d, +/-%d\\n\",\n                unicharset.id_to_unichar(class_id), bad ? 
\"Misfit\" : \"OK\", top, min_top, max_top,\n                static_cast<int>(x_ht_acceptance_tolerance));\n      }\n    }\n  }\n  return bad_blobs;\n}\n\n// Returns a new x-height maximally compatible with the result in word_res.\n// See comment above for overall algorithm.\nfloat Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {\n  STATS top_stats(0, UINT8_MAX - 1);\n  STATS shift_stats(-UINT8_MAX, UINT8_MAX - 1);\n  int bottom_shift = 0;\n  int num_blobs = word_res->rebuild_word->NumBlobs();\n  do {\n    top_stats.clear();\n    shift_stats.clear();\n    for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {\n      TBLOB *blob = word_res->rebuild_word->blobs[blob_id];\n      UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);\n      if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {\n        int top = blob->bounding_box().top() + bottom_shift;\n        // Clip the top to the limit of normalized feature space.\n        if (top >= INT_FEAT_RANGE) {\n          top = INT_FEAT_RANGE - 1;\n        }\n        int bottom = blob->bounding_box().bottom() + bottom_shift;\n        int min_bottom, max_bottom, min_top, max_top;\n        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);\n        // Chars with a wild top range would mess up the result so ignore them.\n        if (max_top - min_top > kMaxCharTopRange) {\n          continue;\n        }\n        int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,\n                                   top - (max_top + x_ht_acceptance_tolerance));\n        int height = top - kBlnBaselineOffset;\n        if (debug_x_ht_level >= 2) {\n          tprintf(\"Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: \",\n                  unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,\n                  max_top, bottom, top);\n        }\n        // Use only chars that fit in the expected 
bottom range, and where\n        // the range of tops is sensibly near the xheight.\n        if (min_bottom <= bottom + x_ht_acceptance_tolerance &&\n            bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&\n            max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {\n          // Compute the x-height position using proportionality between the\n          // actual height and expected height.\n          int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);\n          int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);\n          if (debug_x_ht_level >= 2) {\n            tprintf(\" xht range min=%d, max=%d\\n\", min_xht, max_xht);\n          }\n          // The range of expected heights gets a vote equal to the distance\n          // of the actual top from the expected top.\n          for (int y = min_xht; y <= max_xht; ++y) {\n            top_stats.add(y, misfit_dist);\n          }\n        } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||\n                    bottom - x_ht_acceptance_tolerance > max_bottom) &&\n                   bottom_shift == 0) {\n          // Get the range of required bottom shift.\n          int min_shift = min_bottom - bottom;\n          int max_shift = max_bottom - bottom;\n          if (debug_x_ht_level >= 2) {\n            tprintf(\" bottom shift min=%d, max=%d\\n\", min_shift, max_shift);\n          }\n          // The range of expected shifts gets a vote equal to the min distance\n          // of the actual bottom from the expected bottom, spread over the\n          // range of its acceptance.\n          int misfit_weight = abs(min_shift);\n          if (max_shift > min_shift) {\n            misfit_weight /= max_shift - min_shift;\n          }\n          for (int y = min_shift; y <= max_shift; ++y) {\n            shift_stats.add(y, misfit_weight);\n          }\n        } else {\n          if (bottom_shift == 0) {\n     
       // Things with bottoms that are already ok need to say so, on the\n            // 1st iteration only.\n            shift_stats.add(0, kBlnBaselineOffset);\n          }\n          if (debug_x_ht_level >= 2) {\n            tprintf(\" already OK\\n\");\n          }\n        }\n      }\n    }\n    if (shift_stats.get_total() > top_stats.get_total()) {\n      bottom_shift = IntCastRounded(shift_stats.median());\n      if (debug_x_ht_level >= 2) {\n        tprintf(\"Applying bottom shift=%d\\n\", bottom_shift);\n      }\n    }\n  } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());\n  // Baseline shift is opposite sign to the bottom shift.\n  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();\n  if (debug_x_ht_level >= 2) {\n    tprintf(\"baseline shift=%g\\n\", *baseline_shift);\n  }\n  if (top_stats.get_total() == 0) {\n    return bottom_shift != 0 ? word_res->x_height : 0.0f;\n  }\n  // The new xheight is just the median vote, which is then scaled out\n  // of BLN space back to pixel space to get the x-height in pixel space.\n  float new_xht = top_stats.median();\n  if (debug_x_ht_level >= 2) {\n    tprintf(\"Median xht=%f\\n\", new_xht);\n    tprintf(\"Mode20:A: New x-height = %f (norm), %f (orig)\\n\", new_xht,\n            new_xht / word_res->denorm.y_scale());\n  }\n  // The xheight must change by at least x_ht_min_change to be used.\n  if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {\n    return new_xht / word_res->denorm.y_scale();\n  } else {\n    return bottom_shift != 0 ? word_res->x_height : 0.0f;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/linerec.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        linerec.cpp\n// Description: Top-level line-based recognition module for Tesseract.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"tesseractclass.h\"\n\n#include <allheaders.h>\n#include \"boxread.h\"\n#include \"imagedata.h\" // for ImageData\n#include \"lstmrecognizer.h\"\n#include \"pageres.h\"\n#include \"recodebeam.h\"\n#include \"tprintf.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// Scale factor to make certainty more comparable to Tesseract.\nconst float kCertaintyScale = 7.0f;\n// Worst acceptable certainty for a dictionary word.\nconst float kWorstDictCertainty = -25.0f;\n\n// Generates training data for training a line recognizer, eg LSTM.\n// Breaks the page into lines, according to the boxes, and writes them to a\n// serialized DocumentData based on output_basename.\n// Return true if successful, false if an error occurred.\nbool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,\n                                    BLOCK_LIST *block_list) {\n  std::string lstmf_name = output_basename + \".lstmf\";\n  DocumentData images(lstmf_name);\n  if (applybox_page > 0) {\n    // Load existing document for the previous pages.\n    if 
(!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {\n      tprintf(\"Failed to read training data from %s!\\n\", lstmf_name.c_str());\n      return false;\n    }\n  }\n  std::vector<TBOX> boxes;\n  std::vector<std::string> texts;\n  // Get the boxes for this page, if there are any.\n  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||\n      boxes.empty()) {\n    tprintf(\"Failed to read boxes from %s\\n\", input_imagename);\n    return false;\n  }\n  TrainFromBoxes(boxes, texts, block_list, &images);\n  if (images.PagesSize() == 0) {\n    tprintf(\"Failed to read pages from %s\\n\", input_imagename);\n    return false;\n  }\n  images.Shuffle();\n  if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {\n    tprintf(\"Failed to write training data to %s!\\n\", lstmf_name.c_str());\n    return false;\n  }\n  return true;\n}\n\n// Generates training data for training a line recognizer, eg LSTM.\n// Breaks the boxes into lines, normalizes them, converts to ImageData and\n// appends them to the given training_data.\nvoid Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,\n                               BLOCK_LIST *block_list, DocumentData *training_data) {\n  auto box_count = boxes.size();\n  // Process all the text lines in this page, as defined by the boxes.\n  unsigned end_box = 0;\n  // Don't let \\t, which marks newlines in the box file, get into the line\n  // content, as that makes the line unusable in training.\n  while (end_box < texts.size() && texts[end_box] == \"\\t\") {\n    ++end_box;\n  }\n  for (auto start_box = end_box; start_box < box_count; start_box = end_box) {\n    // Find the textline of boxes starting at start and their bounding box.\n    TBOX line_box = boxes[start_box];\n    std::string line_str = texts[start_box];\n    for (end_box = start_box + 1; end_box < box_count && texts[end_box] != \"\\t\"; ++end_box) {\n      line_box += boxes[end_box];\n 
     line_str += texts[end_box];\n    }\n    // Find the most overlapping block.\n    BLOCK *best_block = nullptr;\n    int best_overlap = 0;\n    BLOCK_IT b_it(block_list);\n    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n      BLOCK *block = b_it.data();\n      if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {\n        continue; // Not a text block.\n      }\n      TBOX block_box = block->pdblk.bounding_box();\n      block_box.rotate(block->re_rotation());\n      if (block_box.major_overlap(line_box)) {\n        TBOX overlap_box = line_box.intersection(block_box);\n        if (overlap_box.area() > best_overlap) {\n          best_overlap = overlap_box.area();\n          best_block = block;\n        }\n      }\n    }\n    ImageData *imagedata = nullptr;\n    if (best_block == nullptr) {\n      tprintf(\"No block overlapping textline: %s\\n\", line_str.c_str());\n    } else {\n      imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);\n    }\n    if (imagedata != nullptr) {\n      training_data->AddPageToDocument(imagedata);\n    }\n    // Don't let \\t, which marks newlines in the box file, get into the line\n    // content, as that makes the line unusable in training.\n    while (end_box < texts.size() && texts[end_box] == \"\\t\") {\n      ++end_box;\n    }\n  }\n}\n\n// Returns an Imagedata containing the image of the given box,\n// and ground truth boxes/truth text if available in the input.\n// The image is not normalized in any way.\nImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,\n                                  const std::vector<std::string> &texts, int start_box, int end_box,\n                                  const BLOCK &block) {\n  TBOX revised_box;\n  ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box);\n  if (image_data == nullptr) {\n    return nullptr;\n  }\n  
image_data->set_page_number(applybox_page);\n  // Copy the boxes and shift them so they are relative to the image.\n  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());\n  ICOORD shift = -revised_box.botleft();\n  std::vector<TBOX> line_boxes;\n  std::vector<std::string> line_texts;\n  for (int b = start_box; b < end_box; ++b) {\n    TBOX box = boxes[b];\n    box.rotate(block_rotation);\n    box.move(shift);\n    line_boxes.push_back(box);\n    line_texts.push_back(texts[b]);\n  }\n  std::vector<int> page_numbers(line_boxes.size(), applybox_page);\n  image_data->AddBoxes(line_boxes, line_texts, page_numbers);\n  return image_data;\n}\n\n// Helper gets the image of a rectangle, using the block.re_rotation() if\n// needed to get to the image, and rotating the result back to horizontal\n// layout. (CJK characters will be on their left sides) The vertical text flag\n// is set in the returned ImageData if the text was originally vertical, which\n// can be used to invoke a different CJK recognition engine. 
The revised_box\n// is also returned to enable calculation of output bounding boxes.\nImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,\n                                   TBOX *revised_box) const {\n  TBOX wbox = box;\n  wbox.pad(padding, padding);\n  *revised_box = wbox;\n  // Number of clockwise 90 degree rotations needed to get back to tesseract\n  // coords from the clipped image.\n  int num_rotations = 0;\n  if (block.re_rotation().y() > 0.0f) {\n    num_rotations = 1;\n  } else if (block.re_rotation().x() < 0.0f) {\n    num_rotations = 2;\n  } else if (block.re_rotation().y() < 0.0f) {\n    num_rotations = 3;\n  }\n  // Handle two cases automatically: 1 the box came from the block, 2 the box\n  // came from a box file, and refers to the image, which the block may not.\n  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {\n    revised_box->rotate(block.re_rotation());\n  }\n  // Now revised_box always refers to the image.\n  // BestPix is never colormapped, but may be of any depth.\n  Image pix = BestPix();\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  TBOX image_box(0, 0, width, height);\n  // Clip to image bounds;\n  *revised_box &= image_box;\n  if (revised_box->null_box()) {\n    return nullptr;\n  }\n  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),\n                            revised_box->height());\n  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);\n  boxDestroy(&clip_box);\n  if (box_pix == nullptr) {\n    return nullptr;\n  }\n  if (num_rotations > 0) {\n    Image rot_pix = pixRotateOrth(box_pix, num_rotations);\n    box_pix.destroy();\n    box_pix = rot_pix;\n  }\n  // Convert sub-8-bit images to 8 bit.\n  int depth = pixGetDepth(box_pix);\n  if (depth < 8) {\n    Image grey;\n    grey = pixConvertTo8(box_pix, false);\n    box_pix.destroy();\n    box_pix = grey;\n  }\n  bool vertical_text = false;\n  if (num_rotations > 
0) {\n    // Rotated the clipped revised box back to internal coordinates.\n    FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());\n    revised_box->rotate(rotation);\n    if (num_rotations != 2) {\n      vertical_text = true;\n    }\n  }\n  return new ImageData(vertical_text, box_pix);\n}\n\n// Recognizes a word or group of words, converting to WERD_RES in *words.\n// Analogous to classify_word_pass1, but can handle a group of words as well.\nvoid Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,\n                                  PointerVector<WERD_RES> *words) {\n  TBOX word_box = word->word->bounding_box();\n  // Get the word image - no frills.\n  if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {\n    // In single word mode, use the whole image without any other row/word\n    // interpretation.\n    word_box = TBOX(0, 0, ImageWidth(), ImageHeight());\n  } else {\n    float baseline = row->base_line((word_box.left() + word_box.right()) / 2);\n    if (baseline + row->descenders() < word_box.bottom()) {\n      word_box.set_bottom(baseline + row->descenders());\n    }\n    if (baseline + row->x_height() + row->ascenders() > word_box.top()) {\n      word_box.set_top(baseline + row->x_height() + row->ascenders());\n    }\n  }\n  ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);\n  if (im_data == nullptr) {\n    return;\n  }\n\n  bool do_invert = tessedit_do_invert;\n  float threshold = do_invert ? double(invert_threshold) : 0.0f;\n  lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0,\n                                  kWorstDictCertainty / kCertaintyScale, word_box, words,\n                                  lstm_choice_mode, lstm_choice_iterations);\n  delete im_data;\n  SearchWords(words);\n}\n\n// Apply segmentation search to the given set of words, within the constraints\n// of the existing ratings matrix. 
If there is already a best_choice on a word\n// leaves it untouched and just sets the done/accepted etc flags.\nvoid Tesseract::SearchWords(PointerVector<WERD_RES> *words) {\n  // Run the segmentation search on the network outputs and make a BoxWord\n  // for each of the output words.\n  // If we drop a word as junk, then there is always a space in front of the\n  // next.\n  const Dict *stopper_dict = lstm_recognizer_->GetDict();\n  if (stopper_dict == nullptr) {\n    stopper_dict = &getDict();\n  }\n  for (unsigned w = 0; w < words->size(); ++w) {\n    WERD_RES *word = (*words)[w];\n    if (word->best_choice == nullptr) {\n      // It is a dud.\n      word->SetupFake(lstm_recognizer_->GetUnicharset());\n    } else {\n      // Set the best state.\n      for (unsigned i = 0; i < word->best_choice->length(); ++i) {\n        int length = word->best_choice->state(i);\n        word->best_state.push_back(length);\n      }\n      word->reject_map.initialise(word->best_choice->length());\n      word->tess_failed = false;\n      word->tess_accepted = true;\n      word->tess_would_adapt = false;\n      word->done = true;\n      word->tesseract = this;\n      float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());\n      word_certainty *= kCertaintyScale;\n      if (getDict().stopper_debug_level >= 1) {\n        tprintf(\"Best choice certainty=%g, space=%g, scaled=%g, final=%g\\n\",\n                word->best_choice->certainty(), word->space_certainty,\n                std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,\n                word_certainty);\n        word->best_choice->print();\n      }\n      word->best_choice->set_certainty(word_certainty);\n\n      word->tess_accepted = stopper_dict->AcceptableResult(word);\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/ltrresultiterator.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ltrresultiterator.cpp\n// Description: Iterator for tesseract results in strict left-to-right\n//              order that avoids using tesseract internal data structures.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <tesseract/ltrresultiterator.h>\n\n#include \"helpers.h\"  // for copy_string\n#include \"pageres.h\"\n#include \"tesseractclass.h\"\n\n#include <allheaders.h>\n\nnamespace tesseract {\n\nLTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,\n                                     int scaled_yres, int rect_left, int rect_top, int rect_width,\n                                     int rect_height)\n    : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top, rect_width,\n                   rect_height)\n    , line_separator_(\"\\n\")\n    , paragraph_separator_(\"\\n\") {}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nLTRResultIterator::~LTRResultIterator() = default;\n\n// Returns the null terminated UTF-8 encoded text string for the current\n// object at the given level. 
Use delete [] to free after use.\nchar *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {\n  if (it_->word() == nullptr) {\n    return nullptr; // Already at the end!\n  }\n  std::string text;\n  PAGE_RES_IT res_it(*it_);\n  WERD_CHOICE *best_choice = res_it.word()->best_choice;\n  ASSERT_HOST(best_choice != nullptr);\n  if (level == RIL_SYMBOL) {\n    text = res_it.word()->BestUTF8(blob_index_, false);\n  } else if (level == RIL_WORD) {\n    text = best_choice->unichar_string();\n  } else {\n    bool eol = false; // end of line?\n    bool eop = false; // end of paragraph?\n    do {              // for each paragraph in a block\n      do {            // for each text line in a paragraph\n        do {          // for each word in a text line\n          best_choice = res_it.word()->best_choice;\n          ASSERT_HOST(best_choice != nullptr);\n          text += best_choice->unichar_string();\n          text += \" \";\n          res_it.forward();\n          eol = res_it.row() != res_it.prev_row();\n        } while (!eol);\n        text.resize(text.length() - 1);\n        text += line_separator_;\n        eop = res_it.block() != res_it.prev_block() ||\n              res_it.row()->row->para() != res_it.prev_row()->row->para();\n      } while (level != RIL_TEXTLINE && !eop);\n      if (eop) {\n        text += paragraph_separator_;\n      }\n    } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());\n  }\n  return copy_string(text);\n}\n\n// Set the string inserted at the end of each text line. \"\\n\" by default.\nvoid LTRResultIterator::SetLineSeparator(const char *new_line) {\n  line_separator_ = new_line;\n}\n\n// Set the string inserted at the end of each paragraph. \"\\n\" by default.\nvoid LTRResultIterator::SetParagraphSeparator(const char *new_para) {\n  paragraph_separator_ = new_para;\n}\n\n// Returns the mean confidence of the current object at the given level.\n// The number should be interpreted as a percent probability. 
(0.0f-100.0f)\nfloat LTRResultIterator::Confidence(PageIteratorLevel level) const {\n  if (it_->word() == nullptr) {\n    return 0.0f; // Already at the end!\n  }\n  float mean_certainty = 0.0f;\n  int certainty_count = 0;\n  PAGE_RES_IT res_it(*it_);\n  WERD_CHOICE *best_choice;\n  switch (level) {\n    case RIL_BLOCK:\n      do {\n        best_choice = res_it.word()->best_choice;\n        mean_certainty += best_choice->certainty();\n        ++certainty_count;\n        res_it.forward();\n      } while (res_it.block() == res_it.prev_block());\n      break;\n    case RIL_PARA:\n      do {\n        best_choice = res_it.word()->best_choice;\n        mean_certainty += best_choice->certainty();\n        ++certainty_count;\n        res_it.forward();\n      } while (res_it.block() == res_it.prev_block() &&\n               res_it.row()->row->para() == res_it.prev_row()->row->para());\n      break;\n    case RIL_TEXTLINE:\n      do {\n        best_choice = res_it.word()->best_choice;\n        mean_certainty += best_choice->certainty();\n        ++certainty_count;\n        res_it.forward();\n      } while (res_it.row() == res_it.prev_row());\n      break;\n    case RIL_WORD:\n      best_choice = res_it.word()->best_choice;\n      mean_certainty = best_choice->certainty();\n      certainty_count = 1;\n      break;\n    case RIL_SYMBOL:\n      best_choice = res_it.word()->best_choice;\n      mean_certainty = best_choice->certainty(blob_index_);\n      certainty_count = 1;\n  }\n  if (certainty_count > 0) {\n    mean_certainty /= certainty_count;\n    return ClipToRange(100 + 5 * mean_certainty, 0.0f, 100.0f);\n  }\n  return 0.0f;\n}\n\n// Returns the font attributes of the current word. If iterating at a higher\n// level object than words, eg textlines, then this will return the\n// attributes of the first word in that textline.\n// The actual return value is a string representing a font name. It points\n// to an internal table and SHOULD NOT BE DELETED. 
Lifespan is the same as\n// the iterator itself, ie rendered invalid by various members of\n// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.\n// Pointsize is returned in printers points (1/72 inch.)\nconst char *LTRResultIterator::WordFontAttributes(bool *is_bold, bool *is_italic,\n                                                  bool *is_underlined, bool *is_monospace,\n                                                  bool *is_serif, bool *is_smallcaps,\n                                                  int *pointsize, int *font_id) const {\n  const char *result = nullptr;\n\n  if (it_->word() == nullptr) {\n    // Already at the end!\n    *pointsize = 0;\n  } else {\n    float row_height =\n        it_->row()->row->x_height() + it_->row()->row->ascenders() - it_->row()->row->descenders();\n    // Convert from pixels to printers points.\n    *pointsize =\n        scaled_yres_ > 0 ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5) : 0;\n\n#ifndef DISABLED_LEGACY_ENGINE\n    const FontInfo *font_info = it_->word()->fontinfo;\n    if (font_info) {\n      // Font information available.\n      *font_id = font_info->universal_id;\n      *is_bold = font_info->is_bold();\n      *is_italic = font_info->is_italic();\n      *is_underlined = false; // TODO(rays) fix this!\n      *is_monospace = font_info->is_fixed_pitch();\n      *is_serif = font_info->is_serif();\n      result = font_info->name;\n    }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n    *is_smallcaps = it_->word()->small_caps;\n  }\n\n  if (!result) {\n    *is_bold = false;\n    *is_italic = false;\n    *is_underlined = false;\n    *is_monospace = false;\n    *is_serif = false;\n    *is_smallcaps = false;\n    *font_id = -1;\n  }\n\n  return result;\n}\n\n// Returns the name of the language used to recognize this word.\nconst char *LTRResultIterator::WordRecognitionLanguage() const {\n  if (it_->word() == nullptr || it_->word()->tesseract == nullptr) {\n    return 
nullptr;\n  }\n  return it_->word()->tesseract->lang.c_str();\n}\n\n// Return the overall directionality of this word.\nStrongScriptDirection LTRResultIterator::WordDirection() const {\n  if (it_->word() == nullptr) {\n    return DIR_NEUTRAL;\n  }\n  bool has_rtl = it_->word()->AnyRtlCharsInWord();\n  bool has_ltr = it_->word()->AnyLtrCharsInWord();\n  if (has_rtl && !has_ltr) {\n    return DIR_RIGHT_TO_LEFT;\n  }\n  if (has_ltr && !has_rtl) {\n    return DIR_LEFT_TO_RIGHT;\n  }\n  if (!has_ltr && !has_rtl) {\n    return DIR_NEUTRAL;\n  }\n  return DIR_MIX;\n}\n\n// Returns true if the current word was found in a dictionary.\nbool LTRResultIterator::WordIsFromDictionary() const {\n  if (it_->word() == nullptr) {\n    return false; // Already at the end!\n  }\n  int permuter = it_->word()->best_choice->permuter();\n  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM || permuter == USER_DAWG_PERM;\n}\n\n// Returns the number of blanks before the current word.\nint LTRResultIterator::BlanksBeforeWord() const {\n  if (it_->word() == nullptr) {\n    return 1;\n  }\n  return it_->word()->word->space();\n}\n\n// Returns true if the current word is numeric.\nbool LTRResultIterator::WordIsNumeric() const {\n  if (it_->word() == nullptr) {\n    return false; // Already at the end!\n  }\n  int permuter = it_->word()->best_choice->permuter();\n  return permuter == NUMBER_PERM;\n}\n\n// Returns true if the word contains blamer information.\nbool LTRResultIterator::HasBlamerInfo() const {\n  return it_->word() != nullptr && it_->word()->blamer_bundle != nullptr &&\n         it_->word()->blamer_bundle->HasDebugInfo();\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle\n// of the current word.\nconst void *LTRResultIterator::GetParamsTrainingBundle() const {\n  return (it_->word() != nullptr && it_->word()->blamer_bundle != nullptr)\n             ? 
&(it_->word()->blamer_bundle->params_training_bundle())\n             : nullptr;\n}\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n// Returns the pointer to the string with blamer information for this word.\n// Assumes that the word's blamer_bundle is not nullptr.\nconst char *LTRResultIterator::GetBlamerDebug() const {\n  return it_->word()->blamer_bundle->debug().c_str();\n}\n\n// Returns the pointer to the string with misadaption information for this word.\n// Assumes that the word's blamer_bundle is not nullptr.\nconst char *LTRResultIterator::GetBlamerMisadaptionDebug() const {\n  return it_->word()->blamer_bundle->misadaption_debug().c_str();\n}\n\n// Returns true if a truth string was recorded for the current word.\nbool LTRResultIterator::HasTruthString() const {\n  if (it_->word() == nullptr) {\n    return false; // Already at the end!\n  }\n  if (it_->word()->blamer_bundle == nullptr || it_->word()->blamer_bundle->NoTruth()) {\n    return false; // no truth information for this word\n  }\n  return true;\n}\n\n// Returns true if the given string is equivalent to the truth string for\n// the current word.\nbool LTRResultIterator::EquivalentToTruth(const char *str) const {\n  if (!HasTruthString()) {\n    return false;\n  }\n  ASSERT_HOST(it_->word()->uch_set != nullptr);\n  WERD_CHOICE str_wd(str, *(it_->word()->uch_set));\n  return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);\n}\n\n// Returns the null terminated UTF-8 encoded truth string for the current word.\n// Use delete [] to free after use.\nchar *LTRResultIterator::WordTruthUTF8Text() const {\n  if (!HasTruthString()) {\n    return nullptr;\n  }\n  return copy_string(it_->word()->blamer_bundle->TruthString());\n}\n\n// Returns the null terminated UTF-8 encoded normalized OCR string for the\n// current word. 
Use delete [] to free after use.\nchar *LTRResultIterator::WordNormedUTF8Text() const {\n  if (it_->word() == nullptr) {\n    return nullptr; // Already at the end!\n  }\n  std::string ocr_text;\n  WERD_CHOICE *best_choice = it_->word()->best_choice;\n  const UNICHARSET *unicharset = it_->word()->uch_set;\n  for (unsigned i = 0; i < best_choice->length(); ++i) {\n    ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));\n  }\n  return copy_string(ocr_text);\n}\n\n// Returns a pointer to serialized choice lattice.\n// Fills lattice_size with the number of bytes in lattice data.\nconst char *LTRResultIterator::WordLattice(int *lattice_size) const {\n  if (it_->word() == nullptr) {\n    return nullptr; // Already at the end!\n  }\n  if (it_->word()->blamer_bundle == nullptr) {\n    return nullptr;\n  }\n  *lattice_size = it_->word()->blamer_bundle->lattice_size();\n  return it_->word()->blamer_bundle->lattice_data();\n}\n\n// Returns true if the current symbol is a superscript.\n// If iterating at a higher level object than symbols, eg words, then\n// this will return the attributes of the first symbol in that word.\nbool LTRResultIterator::SymbolIsSuperscript() const {\n  if (cblob_it_ == nullptr && it_->word() != nullptr) {\n    return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUPERSCRIPT;\n  }\n  return false;\n}\n\n// Returns true if the current symbol is a subscript.\n// If iterating at a higher level object than symbols, eg words, then\n// this will return the attributes of the first symbol in that word.\nbool LTRResultIterator::SymbolIsSubscript() const {\n  if (cblob_it_ == nullptr && it_->word() != nullptr) {\n    return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;\n  }\n  return false;\n}\n\n// Returns true if the current symbol is a dropcap.\n// If iterating at a higher level object than symbols, eg words, then\n// this will return the attributes of the first symbol in that word.\nbool 
LTRResultIterator::SymbolIsDropcap() const {\n  if (cblob_it_ == nullptr && it_->word() != nullptr) {\n    return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;\n  }\n  return false;\n}\n\nChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {\n  ASSERT_HOST(result_it.it_->word() != nullptr);\n  word_res_ = result_it.it_->word();\n  oemLSTM_ = word_res_->tesseract->AnyLSTMLang();\n  // Is there legacy engine related trained data?\n  bool oemLegacy = word_res_->tesseract->AnyTessLang();\n  // Is lstm_choice_mode activated?\n  bool lstm_choice_mode = word_res_->tesseract->lstm_choice_mode;\n  rating_coefficient_ = word_res_->tesseract->lstm_rating_coefficient;\n  blanks_before_word_ = result_it.BlanksBeforeWord();\n  BLOB_CHOICE_LIST *choices = nullptr;\n  tstep_index_ = &result_it.blob_index_;\n  if (oemLSTM_ && !word_res_->CTC_symbol_choices.empty()) {\n    if (!word_res_->CTC_symbol_choices[0].empty() &&\n        strcmp(word_res_->CTC_symbol_choices[0][0].first, \" \")) {\n      blanks_before_word_ = 0;\n    }\n    unsigned index = *tstep_index_;\n    index += blanks_before_word_;\n    if (index < word_res_->CTC_symbol_choices.size()) {\n      LSTM_choices_ = &word_res_->CTC_symbol_choices[index];\n      filterSpaces();\n    }\n  }\n  if ((oemLegacy || !lstm_choice_mode) && word_res_->ratings != nullptr) {\n    choices = word_res_->GetBlobChoices(result_it.blob_index_);\n  }\n  if (choices != nullptr && !choices->empty()) {\n    choice_it_ = new BLOB_CHOICE_IT(choices);\n    choice_it_->mark_cycle_pt();\n  } else {\n    choice_it_ = nullptr;\n  }\n  if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {\n    LSTM_choice_it_ = LSTM_choices_->begin();\n  }\n}\nChoiceIterator::~ChoiceIterator() {\n  delete choice_it_;\n}\n\n// Moves to the next choice for the symbol and returns false if there\n// are none left.\nbool ChoiceIterator::Next() {\n  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {\n    if 
(LSTM_choice_it_ == LSTM_choices_->end() ||\n        next(LSTM_choice_it_) == LSTM_choices_->end()) {\n      return false;\n    } else {\n      ++LSTM_choice_it_;\n      return true;\n    }\n  } else {\n    if (choice_it_ == nullptr) {\n      return false;\n    }\n    choice_it_->forward();\n    return !choice_it_->cycled_list();\n  }\n}\n\n// Returns the null terminated UTF-8 encoded text string for the current\n// choice. Do NOT use delete [] to free after use.\nconst char *ChoiceIterator::GetUTF8Text() const {\n  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {\n    std::pair<const char *, float> choice = *LSTM_choice_it_;\n    return choice.first;\n  } else {\n    if (choice_it_ == nullptr) {\n      return nullptr;\n    }\n    UNICHAR_ID id = choice_it_->data()->unichar_id();\n    return word_res_->uch_set->id_to_unichar_ext(id);\n  }\n}\n\n// Returns the confidence of the current choice depending on the used language\n// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All\n// choices for one symbol should roughly add up to 1.0f.\n// If only traineddata of the legacy engine is used, the number should be\n// interpreted as a percent probability. (0.0f-100.0f) In this case\n// probabilities won't add up to 100. 
Each one stands on its own.\nfloat ChoiceIterator::Confidence() const {\n  float confidence;\n  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {\n    std::pair<const char *, float> choice = *LSTM_choice_it_;\n    confidence = 100 - rating_coefficient_ * choice.second;\n  } else {\n    if (choice_it_ == nullptr) {\n      return 0.0f;\n    }\n    confidence = 100 + 5 * choice_it_->data()->certainty();\n  }\n  return ClipToRange(confidence, 0.0f, 100.0f);\n}\n\n// Returns the set of timesteps which belong to the current symbol\nstd::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {\n  unsigned offset = *tstep_index_ + blanks_before_word_;\n  if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {\n    return nullptr;\n  }\n  return &word_res_->segmented_timesteps[offset];\n}\n\nvoid ChoiceIterator::filterSpaces() {\n  if (LSTM_choices_->empty()) {\n    return;\n  }\n  std::vector<std::pair<const char *, float>>::iterator it;\n  for (it = LSTM_choices_->begin(); it != LSTM_choices_->end();) {\n    if (!strcmp(it->first, \" \")) {\n      it = LSTM_choices_->erase(it);\n    } else {\n      ++it;\n    }\n  }\n}\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/mutableiterator.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"mutableiterator.h\"\n\nnamespace tesseract {\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nMutableIterator::~MutableIterator() = default;\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/mutableiterator.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        mutableiterator.h\n// Description: Iterator for tesseract results providing access to\n//              both high-level API and Tesseract internal data structures.\n// Author:      David Eger\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H_\n#define TESSERACT_CCMAIN_MUTABLEITERATOR_H_\n\n#include <tesseract/resultiterator.h>\n\nclass BLOB_CHOICE_IT;\n\nnamespace tesseract {\n\nclass Tesseract;\n\n// Class to iterate over tesseract results, providing access to all levels\n// of the page hierarchy, without including any tesseract headers or having\n// to handle any tesseract structures.\n// WARNING! 
This class points to data held within the TessBaseAPI class, and\n// therefore can only be used while the TessBaseAPI class still exists and\n// has not been subjected to a call of Init, SetImage, Recognize, Clear, End\n// DetectOS, or anything else that changes the internal PAGE_RES.\n// See tesseract/publictypes.h for the definition of PageIteratorLevel.\n// See also base class PageIterator, which contains the bulk of the interface.\n// ResultIterator adds text-specific methods for access to OCR output.\n// MutableIterator adds access to internal data structures.\n\nclass TESS_API MutableIterator : public ResultIterator {\npublic:\n  // See argument descriptions in ResultIterator()\n  MutableIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,\n                  int rect_left, int rect_top, int rect_width, int rect_height)\n      : ResultIterator(LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,\n                                         rect_top, rect_width, rect_height)) {}\n  ~MutableIterator() override;\n\n  // See PageIterator and ResultIterator for most calls.\n\n  // Return access to Tesseract internals.\n  const PAGE_RES_IT *PageResIt() const {\n    return it_;\n  }\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_\n"
  },
  {
    "path": "src/ccmain/osdetect.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        osdetect.cpp\n// Description: Orientation and script detection.\n// Author:      Samuel Charron\n//              Ranjith Unnikrishnan\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <tesseract/osdetect.h>\n\n#include \"blobbox.h\"\n#include \"blread.h\"\n#include \"colfind.h\"\n#include \"fontinfo.h\"\n#include \"imagefind.h\"\n#include \"linefind.h\"\n#include \"oldlist.h\"\n#include \"qrsequence.h\"\n#include \"ratngs.h\"\n#include \"tabvector.h\"\n#include \"tesseractclass.h\"\n#include \"textord.h\"\n\n#include <algorithm>\n#include <cmath> // for std::fabs\n#include <memory>\n\nnamespace tesseract {\n\nconst float kSizeRatioToReject = 2.0;\nconst int kMinAcceptableBlobHeight = 10;\n\nconst float kScriptAcceptRatio = 1.3;\n\nconst float kHanRatioInKorean = 0.7;\nconst float kHanRatioInJapanese = 0.3;\n\nconst float kNonAmbiguousMargin = 1.0;\n\nvoid OSResults::update_best_orientation() {\n  float first = orientations[0];\n  float second = orientations[1];\n  best_result.orientation_id = 0;\n  if (orientations[0] < orientations[1]) {\n    first = orientations[1];\n    second = orientations[0];\n    best_result.orientation_id = 1;\n  }\n  for (int i = 2; i < 4; ++i) {\n    if (orientations[i] > first) {\n      second = first;\n      
first = orientations[i];\n      best_result.orientation_id = i;\n    } else if (orientations[i] > second) {\n      second = orientations[i];\n    }\n  }\n  // Store difference of top two orientation scores.\n  best_result.oconfidence = first - second;\n}\n\nvoid OSResults::set_best_orientation(int orientation_id) {\n  best_result.orientation_id = orientation_id;\n  best_result.oconfidence = 0;\n}\n\nvoid OSResults::update_best_script(int orientation) {\n  // We skip index 0 to ignore the \"Common\" script.\n  float first = scripts_na[orientation][1];\n  float second = scripts_na[orientation][2];\n  best_result.script_id = 1;\n  if (scripts_na[orientation][1] < scripts_na[orientation][2]) {\n    first = scripts_na[orientation][2];\n    second = scripts_na[orientation][1];\n    best_result.script_id = 2;\n  }\n  for (int i = 3; i < kMaxNumberOfScripts; ++i) {\n    if (scripts_na[orientation][i] > first) {\n      best_result.script_id = i;\n      second = first;\n      first = scripts_na[orientation][i];\n    } else if (scripts_na[orientation][i] > second) {\n      second = scripts_na[orientation][i];\n    }\n  }\n  best_result.sconfidence =\n      (second == 0.0f) ? 
2.0f : (first / second - 1.0) / (kScriptAcceptRatio - 1.0);\n}\n\nint OSResults::get_best_script(int orientation_id) const {\n  int max_id = -1;\n  for (int j = 0; j < kMaxNumberOfScripts; ++j) {\n    const char *script = unicharset->get_script_from_script_id(j);\n    if (strcmp(script, \"Common\") && strcmp(script, \"NULL\")) {\n      if (max_id == -1 || scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id]) {\n        max_id = j;\n      }\n    }\n  }\n  return max_id;\n}\n\n// Print the script scores for all possible orientations.\nvoid OSResults::print_scores(void) const {\n  for (int i = 0; i < 4; ++i) {\n    tprintf(\"Orientation id #%d\", i);\n    print_scores(i);\n  }\n}\n\n// Print the script scores for the given candidate orientation.\nvoid OSResults::print_scores(int orientation_id) const {\n  for (int j = 0; j < kMaxNumberOfScripts; ++j) {\n    if (scripts_na[orientation_id][j]) {\n      tprintf(\"%12s\\t: %f\\n\", unicharset->get_script_from_script_id(j),\n              scripts_na[orientation_id][j]);\n    }\n  }\n}\n\n// Accumulate scores with given OSResults instance and update the best script.\nvoid OSResults::accumulate(const OSResults &osr) {\n  for (int i = 0; i < 4; ++i) {\n    orientations[i] += osr.orientations[i];\n    for (int j = 0; j < kMaxNumberOfScripts; ++j) {\n      scripts_na[i][j] += osr.scripts_na[i][j];\n    }\n  }\n  unicharset = osr.unicharset;\n  update_best_orientation();\n  update_best_script(best_result.orientation_id);\n}\n\n// Detect and erase horizontal/vertical lines and picture regions from the\n// image, so that non-text blobs are removed from consideration.\nstatic void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,\n                                   TO_BLOCK_LIST *to_blocks) {\n  Image pix = tess->pix_binary();\n  ASSERT_HOST(pix != nullptr);\n  int vertical_x = 0;\n  int vertical_y = 1;\n  tesseract::TabVector_LIST v_lines;\n  tesseract::TabVector_LIST h_lines;\n  int 
resolution;\n  if (kMinCredibleResolution > pixGetXRes(pix)) {\n    resolution = kMinCredibleResolution;\n    tprintf(\"Warning. Invalid resolution %d dpi. Using %d instead.\\n\", pixGetXRes(pix), resolution);\n  } else {\n    resolution = pixGetXRes(pix);\n  }\n\n  tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, &vertical_x, &vertical_y,\n                                            nullptr, &v_lines, &h_lines);\n  Image im_pix = tesseract::ImageFind::FindImages(pix, nullptr);\n  if (im_pix != nullptr) {\n    pixSubtract(pix, pix, im_pix);\n    im_pix.destroy();\n  }\n  tess->mutable_textord()->find_components(tess->pix_binary(), blocks, to_blocks);\n}\n\n// Find connected components in the page and process a subset until finished or\n// a stopping criterion is met.\n// Returns the number of blobs used in making the estimate. 0 implies failure.\nint orientation_and_script_detection(const char *filename, OSResults *osr,\n                                     tesseract::Tesseract *tess) {\n  std::string name = filename; // truncated name\n\n  const char *lastdot = strrchr(name.c_str(), '.');\n  if (lastdot != nullptr) {\n    name[lastdot - name.c_str()] = '\\0';\n  }\n\n  ASSERT_HOST(tess->pix_binary() != nullptr);\n  int width = pixGetWidth(tess->pix_binary());\n  int height = pixGetHeight(tess->pix_binary());\n\n  BLOCK_LIST blocks;\n  if (!read_unlv_file(name, width, height, &blocks)) {\n    FullPageBlock(width, height, &blocks);\n  }\n\n  // Try to remove non-text regions from consideration.\n  TO_BLOCK_LIST land_blocks, port_blocks;\n  remove_nontext_regions(tess, &blocks, &port_blocks);\n\n  if (port_blocks.empty()) {\n    // page segmentation did not succeed, so we need to find_components first.\n    tess->mutable_textord()->find_components(tess->pix_binary(), &blocks, &port_blocks);\n  } else {\n    TBOX page_box(0, 0, width, height);\n    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.\n    
tess->mutable_textord()->filter_blobs(page_box.topright(), &port_blocks, true);\n  }\n\n  return os_detect(&port_blocks, osr, tess);\n}\n\n// Filter and sample the blobs.\n// Returns a non-zero number of blobs if the page was successfully processed, or\n// zero if the page had too few characters to be reliable\nint os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, tesseract::Tesseract *tess) {\n#if !defined(NDEBUG)\n  int blobs_total = 0;\n#endif\n  TO_BLOCK_IT block_it;\n  block_it.set_to_list(port_blocks);\n\n  BLOBNBOX_CLIST filtered_list;\n  BLOBNBOX_C_IT filtered_it(&filtered_list);\n\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    TO_BLOCK *to_block = block_it.data();\n    if (to_block->block->pdblk.poly_block() && !to_block->block->pdblk.poly_block()->IsText()) {\n      continue;\n    }\n    BLOBNBOX_IT bbox_it;\n    bbox_it.set_to_list(&to_block->blobs);\n    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {\n      BLOBNBOX *bbox = bbox_it.data();\n      C_BLOB *blob = bbox->cblob();\n      TBOX box = blob->bounding_box();\n#if !defined(NDEBUG)\n      ++blobs_total;\n#endif\n\n      // Catch illegal value of box width and avoid division by zero.\n      if (box.width() == 0) {\n        continue;\n      }\n      // TODO: Can height and width be negative? If not, remove fabs.\n      float y_x = std::fabs((box.height() * 1.0f) / box.width());\n      float x_y = 1.0f / y_x;\n      // Select a >= 1.0 ratio\n      float ratio = x_y > y_x ? 
x_y : y_x;\n      // Blob is ambiguous\n      if (ratio > kSizeRatioToReject) {\n        continue;\n      }\n      if (box.height() < kMinAcceptableBlobHeight) {\n        continue;\n      }\n      filtered_it.add_to_end(bbox);\n    }\n  }\n  return os_detect_blobs(nullptr, &filtered_list, osr, tess);\n}\n\n// Detect orientation and script from a list of blobs.\n// Returns a non-zero number of blobs if the list was successfully processed, or\n// zero if the list had too few characters to be reliable.\n// If allowed_scripts is non-null and non-empty, it is a list of scripts that\n// constrains both orientation and script detection to consider only scripts\n// from the list.\nint os_detect_blobs(const std::vector<int> *allowed_scripts, BLOBNBOX_CLIST *blob_list,\n                    OSResults *osr, tesseract::Tesseract *tess) {\n  OSResults osr_;\n  int minCharactersToTry = tess->min_characters_to_try;\n  int maxCharactersToTry = 5 * minCharactersToTry;\n  if (osr == nullptr) {\n    osr = &osr_;\n  }\n\n  osr->unicharset = &tess->unicharset;\n  OrientationDetector o(allowed_scripts, osr);\n  ScriptDetector s(allowed_scripts, osr, tess);\n\n  BLOBNBOX_C_IT filtered_it(blob_list);\n  int real_max = std::min(filtered_it.length(), maxCharactersToTry);\n  // tprintf(\"Total blobs found = %d\\n\", blobs_total);\n  // tprintf(\"Number of blobs post-filtering = %d\\n\", filtered_it.length());\n  // tprintf(\"Number of blobs to try = %d\\n\", real_max);\n\n  // If there are too few characters, skip this page entirely.\n  if (real_max < minCharactersToTry / 2) {\n    tprintf(\"Too few characters. 
Skipping this page\\n\");\n    return 0;\n  }\n\n  auto **blobs = new BLOBNBOX *[filtered_it.length()];\n  int number_of_blobs = 0;\n  for (filtered_it.mark_cycle_pt(); !filtered_it.cycled_list(); filtered_it.forward()) {\n    blobs[number_of_blobs++] = filtered_it.data();\n  }\n  QRSequenceGenerator sequence(number_of_blobs);\n  int num_blobs_evaluated = 0;\n  for (int i = 0; i < real_max; ++i) {\n    if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess) && i > minCharactersToTry) {\n      break;\n    }\n    ++num_blobs_evaluated;\n  }\n  delete[] blobs;\n\n  // Make sure the best_result is up-to-date\n  int orientation = o.get_orientation();\n  osr->update_best_script(orientation);\n  return num_blobs_evaluated;\n}\n\n// Processes a single blob to estimate script and orientation.\n// Return true if estimate of orientation and script satisfies stopping\n// criteria.\nbool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, OSResults *osr,\n                    tesseract::Tesseract *tess) {\n  tess->tess_cn_matching.set_value(true); // turn it on\n  tess->tess_bn_matching.set_value(false);\n  C_BLOB *blob = bbox->cblob();\n  TBLOB *tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);\n  TBOX box = tblob->bounding_box();\n  FCOORD current_rotation(1.0f, 0.0f);\n  FCOORD rotation90(0.0f, 1.0f);\n  BLOB_CHOICE_LIST ratings[4];\n  // Test the 4 orientations\n  for (int i = 0; i < 4; ++i) {\n    // Normalize the blob. Set the origin to the place we want to be the\n    // bottom-middle after rotation.\n    // Scaling is to make the rotated height the x-height.\n    float scaling = static_cast<float>(kBlnXHeight) / box.height();\n    float x_origin = (box.left() + box.right()) / 2.0f;\n    float y_origin = (box.bottom() + box.top()) / 2.0f;\n    if (i == 0 || i == 2) {\n      // Rotation is 0 or 180.\n      y_origin = i == 0 ? 
box.bottom() : box.top();\n    } else {\n      // Rotation is 90 or 270.\n      scaling = static_cast<float>(kBlnXHeight) / box.width();\n      x_origin = i == 1 ? box.left() : box.right();\n    }\n    std::unique_ptr<TBLOB> rotated_blob(new TBLOB(*tblob));\n    rotated_blob->Normalize(nullptr, &current_rotation, nullptr, x_origin, y_origin, scaling,\n                            scaling, 0.0f, static_cast<float>(kBlnBaselineOffset), false, nullptr);\n    tess->AdaptiveClassifier(rotated_blob.get(), ratings + i);\n    current_rotation.rotate(rotation90);\n  }\n  delete tblob;\n\n  bool stop = o->detect_blob(ratings);\n  s->detect_blob(ratings);\n  int orientation = o->get_orientation();\n  stop = s->must_stop(orientation) && stop;\n  return stop;\n}\n\nOrientationDetector::OrientationDetector(const std::vector<int> *allowed_scripts, OSResults *osr) {\n  osr_ = osr;\n  allowed_scripts_ = allowed_scripts;\n}\n\n// Score the given blob and return true if it is now sure of the orientation\n// after adding this block.\nbool OrientationDetector::detect_blob(BLOB_CHOICE_LIST *scores) {\n  float blob_o_score[4] = {0.0f, 0.0f, 0.0f, 0.0f};\n  float total_blob_o_score = 0.0f;\n\n  for (int i = 0; i < 4; ++i) {\n    BLOB_CHOICE_IT choice_it(scores + i);\n    if (!choice_it.empty()) {\n      BLOB_CHOICE *choice = nullptr;\n      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {\n        // Find the top choice in an allowed script.\n        for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() && choice == nullptr;\n             choice_it.forward()) {\n          int choice_script = choice_it.data()->script_id();\n          for (auto script : *allowed_scripts_) {\n            if (script == choice_script) {\n              choice = choice_it.data();\n              break;\n            }\n          }\n        }\n      } else {\n        choice = choice_it.data();\n      }\n      if (choice != nullptr) {\n        // The certainty score ranges between [-20,0]. 
This is converted here to\n        // [0,1], with 1 indicating best match.\n        blob_o_score[i] = 1 + 0.05 * choice->certainty();\n        total_blob_o_score += blob_o_score[i];\n      }\n    }\n  }\n  if (total_blob_o_score == 0.0) {\n    return false;\n  }\n  // Fill in any blanks with the worst score of the others. This is better than\n  // picking an arbitrary probability for it and way better than -inf.\n  float worst_score = 0.0f;\n  int num_good_scores = 0;\n  for (float f : blob_o_score) {\n    if (f > 0.0f) {\n      ++num_good_scores;\n      if (worst_score == 0.0f || f < worst_score) {\n        worst_score = f;\n      }\n    }\n  }\n  if (num_good_scores == 1) {\n    // Lower worst if there is only one.\n    worst_score /= 2.0f;\n  }\n  for (float &f : blob_o_score) {\n    if (f == 0.0f) {\n      f = worst_score;\n      total_blob_o_score += worst_score;\n    }\n  }\n  // Normalize the orientation scores for the blob and use them to\n  // update the aggregated orientation score.\n  for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {\n    osr_->orientations[i] += std::log(blob_o_score[i] / total_blob_o_score);\n  }\n\n  // TODO(ranjith) Add an early exit test, based on min_orientation_margin,\n  // as used in pagesegmain.cpp.\n  return false;\n}\n\nint OrientationDetector::get_orientation() {\n  osr_->update_best_orientation();\n  return osr_->best_result.orientation_id;\n}\n\nScriptDetector::ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,\n                               tesseract::Tesseract *tess) {\n  osr_ = osr;\n  tess_ = tess;\n  allowed_scripts_ = allowed_scripts;\n  // General scripts\n  katakana_id_ = tess_->unicharset.add_script(\"Katakana\");\n  hiragana_id_ = tess_->unicharset.add_script(\"Hiragana\");\n  han_id_ = tess_->unicharset.add_script(\"Han\");\n  hangul_id_ = tess_->unicharset.add_script(\"Hangul\");\n  latin_id_ = tess_->unicharset.add_script(\"Latin\");\n  // Pseudo-scripts\n  fraktur_id_ = 
tess_->unicharset.add_script(\"Fraktur\");\n  japanese_id_ = tess_->unicharset.add_script(\"Japanese\");\n  korean_id_ = tess_->unicharset.add_script(\"Korean\");\n}\n\n// Score the given blob and return true if it is now sure of the script after\n// adding this blob.\nvoid ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) {\n  for (int i = 0; i < 4; ++i) {\n    std::vector<bool> done(kMaxNumberOfScripts);\n\n    BLOB_CHOICE_IT choice_it;\n    choice_it.set_to_list(scores + i);\n\n    float prev_score = -1;\n    int script_count = 0;\n    int prev_id = -1;\n    int prev_fontinfo_id = -1;\n    const char *prev_unichar = \"\";\n    const char *unichar = \"\";\n\n    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {\n      BLOB_CHOICE *choice = choice_it.data();\n      int id = choice->script_id();\n      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {\n        // Check that the choice is in an allowed script.\n        size_t s = 0;\n        for (s = 0; s < allowed_scripts_->size(); ++s) {\n          if ((*allowed_scripts_)[s] == id) {\n            break;\n          }\n        }\n        if (s == allowed_scripts_->size()) {\n          continue; // Not found in list.\n        }\n      }\n      // Script already processed before.\n      if (done.at(id)) {\n        continue;\n      }\n      done[id] = true;\n\n      unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());\n      // Save data from the first match\n      if (prev_score < 0) {\n        prev_score = -choice->certainty();\n        script_count = 1;\n        prev_id = id;\n        prev_unichar = unichar;\n        prev_fontinfo_id = choice->fontinfo_id();\n      } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {\n        ++script_count;\n      }\n\n      if (strlen(prev_unichar) == 1) {\n        if (unichar[0] >= '0' && unichar[0] <= '9') {\n          break;\n        }\n      }\n\n      // if script_count is >= 2, character is 
ambiguous, skip other matches\n      // since they are useless.\n      if (script_count >= 2) {\n        break;\n      }\n    }\n    // Character is non ambiguous\n    if (script_count == 1) {\n      // Update the score of the winning script\n      osr_->scripts_na[i][prev_id] += 1.0;\n\n      // Workaround for Fraktur\n      if (prev_id == latin_id_) {\n        if (prev_fontinfo_id >= 0) {\n          const tesseract::FontInfo &fi = tess_->get_fontinfo_table().at(prev_fontinfo_id);\n          // printf(\"Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\\n\", fi.name,\n          //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),\n          //       fi.is_serif(), fi.is_fraktur(),\n          //       prev_unichar);\n          if (fi.is_fraktur()) {\n            osr_->scripts_na[i][prev_id] -= 1.0;\n            osr_->scripts_na[i][fraktur_id_] += 1.0;\n          }\n        }\n      }\n\n      // Update Japanese / Korean pseudo-scripts\n      if (prev_id == katakana_id_) {\n        osr_->scripts_na[i][japanese_id_] += 1.0;\n      }\n      if (prev_id == hiragana_id_) {\n        osr_->scripts_na[i][japanese_id_] += 1.0;\n      }\n      if (prev_id == hangul_id_) {\n        osr_->scripts_na[i][korean_id_] += 1.0;\n      }\n      if (prev_id == han_id_) {\n        osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;\n        osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;\n      }\n    }\n  } // iterate over each orientation\n}\n\nbool ScriptDetector::must_stop(int orientation) const {\n  osr_->update_best_script(orientation);\n  return osr_->best_result.sconfidence > 1;\n}\n\n// Helper method to convert an orientation index to its value in degrees.\n// The value represents the amount of clockwise rotation in degrees that must be\n// applied for the text to be upright (readable).\nint OrientationIdToValue(const int &id) {\n  switch (id) {\n    case 0:\n      return 0;\n    case 1:\n      return 270;\n    case 2:\n      return 180;\n    case 3:\n      return 
90;\n    default:\n      return -1;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/output.cpp",
    "content": "/******************************************************************\n * File:        output.cpp  (Formerly output.c)\n * Description: Output pass\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"output.h\"\n\n#include \"control.h\"\n#include \"tesseractclass.h\"\n#include \"tessvars.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"docqual.h\"\n#  include \"reject.h\"\n#endif\n\n#include \"helpers.h\"\n\n#include <cctype>\n#include <cerrno>\n#include <cstring>\n\n#define CTRL_NEWLINE '\\012'  // newline\n#define CTRL_HARDLINE '\\015' // cr\n\nnamespace tesseract {\nvoid Tesseract::output_pass( // Tess output pass //send to api\n    PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {\n  BLOCK_RES *block_of_last_word;\n  bool force_eol;   // During output\n  BLOCK *nextblock; // block of next word\n  WERD *nextword;   // next word\n\n  page_res_it.restart_page();\n  block_of_last_word = nullptr;\n  while (page_res_it.word() != nullptr) {\n    check_debug_pt(page_res_it.word(), 120);\n\n    if (target_word_box) {\n      TBOX current_word_box = page_res_it.word()->word->bounding_box();\n      FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,\n                       (current_word_box.bottom() + current_word_box.top()) / 2);\n      if 
(!target_word_box->contains(center_pt)) {\n        page_res_it.forward();\n        continue;\n      }\n    }\n    if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {\n      block_of_last_word = page_res_it.block();\n    }\n\n    force_eol =\n        (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||\n        (page_res_it.next_word() == nullptr);\n\n    if (page_res_it.next_word() != nullptr) {\n      nextword = page_res_it.next_word()->word;\n    } else {\n      nextword = nullptr;\n    }\n    if (page_res_it.next_block() != nullptr) {\n      nextblock = page_res_it.next_block()->block;\n    } else {\n      nextblock = nullptr;\n    }\n    // regardless of tilde crunching\n    write_results(page_res_it,\n                  determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,\n                                         nextword, nextblock),\n                  force_eol);\n    page_res_it.forward();\n  }\n}\n\n/*************************************************************************\n * write_results()\n *\n * All recognition and rejection has now been done. Generate the following:\n *   .txt file     - giving the final best choices with NO highlighting\n *   .raw file     - giving the tesseract top choice output for each word\n *   .map file     - showing how the .txt file has been rejected in the .ep file\n *   epchoice list - a list of one element per word, containing the text for the\n *                   epaper. 
Reject strings are inserted.\n *   inset list    - a list of bounding boxes of reject insets - indexed by the\n *                   reject strings in the epchoice text.\n *************************************************************************/\nvoid Tesseract::write_results(PAGE_RES_IT &page_res_it,\n                              char newline_type, // type of newline\n                              bool force_eol) {  // override tilde crunch?\n  WERD_RES *word = page_res_it.word();\n  const UNICHARSET &uchset = *word->uch_set;\n  UNICHAR_ID space = uchset.unichar_to_id(\" \");\n\n  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&\n      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {\n    bool need_reject = false;\n    if ((word->unlv_crunch_mode != CR_DELETE) &&\n        (!stats_.tilde_crunch_written ||\n         ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&\n          !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {\n      if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&\n          !word->word->flag(W_FUZZY_SP)) {\n        stats_.last_char_was_tilde = false;\n      }\n      need_reject = true;\n    }\n    if ((need_reject && !stats_.last_char_was_tilde) ||\n        (force_eol && stats_.write_results_empty_block)) {\n      /* Write a reject char - mark as rejected unless zero_rejection mode */\n      stats_.last_char_was_tilde = true;\n      stats_.tilde_crunch_written = true;\n      stats_.last_char_was_newline = false;\n      stats_.write_results_empty_block = false;\n    }\n\n    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {\n      stats_.tilde_crunch_written = false;\n      stats_.last_char_was_newline = true;\n      stats_.last_char_was_tilde = false;\n    }\n\n    if (force_eol) {\n      stats_.write_results_empty_block = true;\n    }\n    return;\n  }\n\n  /* NORMAL PROCESSING of non tilde 
crunched words */\n\n  stats_.tilde_crunch_written = false;\n  if (newline_type) {\n    stats_.last_char_was_newline = true;\n  } else {\n    stats_.last_char_was_newline = false;\n  }\n  stats_.write_results_empty_block = force_eol; // about to write a real word\n\n  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&\n      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&\n      (word->best_choice->unichar_id(0) == space)) {\n    /* Prevent adjacent tilde across words - we know that adjacent tildes within\n   words have been removed */\n    word->MergeAdjacentBlobs(0);\n  }\n  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {\n    stats_.last_char_was_tilde = false;\n  } else {\n    if (word->reject_map.length() > 0) {\n      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {\n        stats_.last_char_was_tilde = true;\n      } else {\n        stats_.last_char_was_tilde = false;\n      }\n    } else if (word->word->space() > 0) {\n      stats_.last_char_was_tilde = false;\n    }\n    /* else it is unchanged as there are no output chars */\n  }\n\n  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());\n\n  set_unlv_suspects(word);\n  check_debug_pt(word, 120);\n  if (tessedit_rejection_debug) {\n    tprintf(\"Dict word: \\\"%s\\\": %d\\n\", word->best_choice->debug_string().c_str(),\n            dict_word(*(word->best_choice)));\n  }\n  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {\n    if (tessedit_zero_rejection) {\n      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */\n      for (unsigned i = 0; i < word->best_choice->length(); ++i) {\n        if (word->reject_map[i].rejected()) {\n          word->reject_map[i].setrej_minimal_rej_accept();\n        }\n      }\n    }\n    if (tessedit_minimal_rejection) {\n      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */\n      for (unsigned 
i = 0; i < word->best_choice->length(); ++i) {\n        if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {\n          word->reject_map[i].setrej_minimal_rej_accept();\n        }\n      }\n    }\n  }\n}\n\n/**********************************************************************\n * determine_newline_type\n *\n * Find whether we have a wrapping or hard newline.\n * Return false if not at end of line.\n **********************************************************************/\n\nchar determine_newline_type( // test line ends\n    WERD *word,              // word to do\n    BLOCK *block,            // current block\n    WERD *next_word,         // next word\n    BLOCK *next_block        // block of next word\n) {\n  int16_t end_gap; // to right edge\n  int16_t width;   // of next word\n  TBOX word_box;   // bounding\n  TBOX next_box;   // next word\n  TBOX block_box;  // block bounding\n\n  if (!word->flag(W_EOL)) {\n    return false; // not end of line\n  }\n  if (next_word == nullptr || next_block == nullptr || block != next_block) {\n    return CTRL_NEWLINE;\n  }\n  if (next_word->space() > 0) {\n    return CTRL_HARDLINE; // it is tabbed\n  }\n  word_box = word->bounding_box();\n  next_box = next_word->bounding_box();\n  block_box = block->pdblk.bounding_box();\n  // gap to eol\n  end_gap = block_box.right() - word_box.right();\n  end_gap -= static_cast<int32_t>(block->space());\n  width = next_box.right() - next_box.left();\n  //      tprintf(\"end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\\n\",\n  //              block_box.right(),word_box.right(),end_gap,\n  //              next_box.right(),next_box.left(),width,\n  //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);\n  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;\n}\n\n/*************************************************************************\n * get_rep_char()\n * Return the first accepted character from the repetition string. 
This is the\n * character which is repeated - as determined earlier by fix_rep_char()\n *************************************************************************/\nUNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?\n  int i;\n  for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {\n    ;\n  }\n\n  if (i < word->reject_map.length()) {\n    return word->best_choice->unichar_id(i);\n  } else {\n    return word->uch_set->unichar_to_id(unrecognised_char.c_str());\n  }\n}\n\n/*************************************************************************\n * SUSPECT LEVELS\n *\n * 0 - don't reject ANYTHING\n * 1,2 - partial rejection\n * 3 - BEST\n *\n * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and\n * tessedit_minimal_rejection.\n *************************************************************************/\nvoid Tesseract::set_unlv_suspects(WERD_RES *word_res) {\n  int len = word_res->reject_map.length();\n  const WERD_CHOICE &word = *(word_res->best_choice);\n  const UNICHARSET &uchset = *word.unicharset();\n  int i;\n  float rating_per_ch;\n\n  if (suspect_level == 0) {\n    for (i = 0; i < len; i++) {\n      if (word_res->reject_map[i].rejected()) {\n        word_res->reject_map[i].setrej_minimal_rej_accept();\n      }\n    }\n    return;\n  }\n\n  if (suspect_level >= 3) {\n    return; // Use defaults\n  }\n\n  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/\n\n  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {\n    /* Unreject alphas in dictionary words */\n    for (i = 0; i < len; ++i) {\n      if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {\n        word_res->reject_map[i].setrej_minimal_rej_accept();\n      }\n    }\n  }\n\n  rating_per_ch = word.rating() / word_res->reject_map.length();\n\n  if (rating_per_ch >= suspect_rating_per_ch) {\n    return; // Don't touch bad ratings\n  }\n\n  if 
((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {\n    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/\n    for (i = 0; i < len; ++i) {\n      if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), \" \"))) {\n        word_res->reject_map[i].setrej_minimal_rej_accept();\n      }\n    }\n  }\n\n  for (i = 0; i < len; i++) {\n    if (word_res->reject_map[i].rejected()) {\n      if (word_res->reject_map[i].flag(R_DOC_REJ)) {\n        word_res->reject_map[i].setrej_minimal_rej_accept();\n      }\n      if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {\n        word_res->reject_map[i].setrej_minimal_rej_accept();\n      }\n      if (word_res->reject_map[i].flag(R_ROW_REJ)) {\n        word_res->reject_map[i].setrej_minimal_rej_accept();\n      }\n    }\n  }\n\n  if (suspect_level == 2) {\n    return;\n  }\n\n  if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {\n    for (i = 0; i < len; i++) {\n      if (word_res->reject_map[i].rejected()) {\n        if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||\n             word_res->reject_map[i].flag(R_POSTNN_1IL))) {\n          word_res->reject_map[i].setrej_minimal_rej_accept();\n        }\n\n        if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {\n          word_res->reject_map[i].setrej_minimal_rej_accept();\n        }\n      }\n    }\n  }\n\n  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),\n                             word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||\n      acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {\n    if (word_res->reject_map.length() > suspect_short_words) {\n      for (i = 0; i < len; i++) {\n        if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||\n                                                   word_res->reject_map[i].flag(R_1IL_CONFLICT) ||\n               
                                    word_res->reject_map[i].flag(R_POSTNN_1IL) ||\n                                                   word_res->reject_map[i].flag(R_MM_REJECT))) {\n          word_res->reject_map[i].setrej_minimal_rej_accept();\n        }\n      }\n    }\n  }\n}\n\nint16_t Tesseract::count_alphas(const WERD_CHOICE &word) {\n  int count = 0;\n  for (unsigned i = 0; i < word.length(); ++i) {\n    if (word.unicharset()->get_isalpha(word.unichar_id(i))) {\n      count++;\n    }\n  }\n  return count;\n}\n\nint16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {\n  int count = 0;\n  for (unsigned i = 0; i < word.length(); ++i) {\n    if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||\n        word.unicharset()->get_isdigit(word.unichar_id(i))) {\n      count++;\n    }\n  }\n  return count;\n}\n\nbool Tesseract::acceptable_number_string(const char *s, const char *lengths) {\n  bool prev_digit = false;\n\n  if (*lengths == 1 && *s == '(') {\n    s++;\n  }\n\n  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {\n    s++;\n  }\n\n  for (; *s != '\\0'; s += *(lengths++)) {\n    if (unicharset.get_isdigit(s, *lengths)) {\n      prev_digit = true;\n    } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {\n      prev_digit = false;\n    } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\\0') &&\n               ((*s == '%') || (*s == ')'))) {\n      return true;\n    } else if (prev_digit && *lengths == 1 && (*s == '%') &&\n               (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&\n               (*(s + *lengths + *(lengths + 1)) == '\\0')) {\n      return true;\n    } else {\n      return false;\n    }\n  }\n  return true;\n}\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/output.h",
    "content": "/******************************************************************\n * File:        output.h  (Formerly output.h)\n * Description: Output pass\n * Author:      Phil Cheatle\n * Created:     Thu Aug  4 10:56:08 BST 1994\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef OUTPUT_H\n#define OUTPUT_H\n\nnamespace tesseract {\n\nclass BLOCK;\nclass WERD;\n\n/** test line ends */\nchar determine_newline_type(WERD *word,       ///< word to do\n                            BLOCK *block,     ///< current block\n                            WERD *next_word,  ///< next word\n                            BLOCK *next_block ///< block of next word\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccmain/pageiterator.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        pageiterator.cpp\n// Description: Iterator for tesseract page structure that avoids using\n//              tesseract internal data structures.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <allheaders.h>\n#include <tesseract/pageiterator.h>\n#include \"helpers.h\"\n#include \"pageres.h\"\n#include \"tesseractclass.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\nPageIterator::PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,\n                           int scaled_yres, int rect_left, int rect_top,\n                           int rect_width, int rect_height)\n    : page_res_(page_res),\n      tesseract_(tesseract),\n      word_(nullptr),\n      word_length_(0),\n      blob_index_(0),\n      cblob_it_(nullptr),\n      include_upper_dots_(false),\n      include_lower_dots_(false),\n      scale_(scale),\n      scaled_yres_(scaled_yres),\n      rect_left_(rect_left),\n      rect_top_(rect_top),\n      rect_width_(rect_width),\n      rect_height_(rect_height) {\n  it_ = new PAGE_RES_IT(page_res);\n  PageIterator::Begin();\n}\n\nPageIterator::~PageIterator() {\n  delete it_;\n  delete cblob_it_;\n}\n\n/**\n * PageIterators may be copied! 
This makes it possible to iterate over\n * all the objects at a lower level, while maintaining an iterator to\n * objects at a higher level.\n */\nPageIterator::PageIterator(const PageIterator &src)\n    : page_res_(src.page_res_),\n      tesseract_(src.tesseract_),\n      word_(nullptr),\n      word_length_(src.word_length_),\n      blob_index_(src.blob_index_),\n      cblob_it_(nullptr),\n      include_upper_dots_(src.include_upper_dots_),\n      include_lower_dots_(src.include_lower_dots_),\n      scale_(src.scale_),\n      scaled_yres_(src.scaled_yres_),\n      rect_left_(src.rect_left_),\n      rect_top_(src.rect_top_),\n      rect_width_(src.rect_width_),\n      rect_height_(src.rect_height_) {\n  it_ = new PAGE_RES_IT(*src.it_);\n  BeginWord(src.blob_index_);\n}\n\nconst PageIterator &PageIterator::operator=(const PageIterator &src) {\n  page_res_ = src.page_res_;\n  tesseract_ = src.tesseract_;\n  include_upper_dots_ = src.include_upper_dots_;\n  include_lower_dots_ = src.include_lower_dots_;\n  scale_ = src.scale_;\n  scaled_yres_ = src.scaled_yres_;\n  rect_left_ = src.rect_left_;\n  rect_top_ = src.rect_top_;\n  rect_width_ = src.rect_width_;\n  rect_height_ = src.rect_height_;\n  delete it_;\n  it_ = new PAGE_RES_IT(*src.it_);\n  BeginWord(src.blob_index_);\n  return *this;\n}\n\nbool PageIterator::PositionedAtSameWord(const PAGE_RES_IT *other) const {\n  return (it_ == nullptr && it_ == other) ||\n         ((other != nullptr) && (it_ != nullptr) && (*it_ == *other));\n}\n\n// ============= Moving around within the page ============.\n\n/** Resets the iterator to point to the start of the page. 
*/\nvoid PageIterator::Begin() {\n  it_->restart_page_with_empties();\n  BeginWord(0);\n}\n\nvoid PageIterator::RestartParagraph() {\n  if (it_->block() == nullptr) {\n    return; // At end of the document.\n  }\n  PAGE_RES_IT para(page_res_);\n  PAGE_RES_IT next_para(para);\n  next_para.forward_paragraph();\n  while (next_para.cmp(*it_) <= 0) {\n    para = next_para;\n    next_para.forward_paragraph();\n  }\n  *it_ = para;\n  BeginWord(0);\n}\n\nbool PageIterator::IsWithinFirstTextlineOfParagraph() const {\n  PageIterator p_start(*this);\n  p_start.RestartParagraph();\n  return p_start.it_->row() == it_->row();\n}\n\nvoid PageIterator::RestartRow() {\n  it_->restart_row();\n  BeginWord(0);\n}\n\n/**\n * Moves to the start of the next object at the given level in the\n * page hierarchy, and returns false if the end of the page was reached.\n * NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each\n * non-text block at least once.\n * Think of non text blocks as containing a single para, with at least one\n * line, with a single imaginary word, containing a single symbol.\n * The bounding boxes mark out any polygonal nature of the block, and\n * PTIsTextType(BLockType()) is false for non-text blocks.\n * Calls to Next with different levels may be freely intermixed.\n * This function iterates words in right-to-left scripts correctly, if\n * the appropriate language has been loaded into Tesseract.\n */\nbool PageIterator::Next(PageIteratorLevel level) {\n  if (it_->block() == nullptr) {\n    return false; // Already at the end!\n  }\n  if (it_->word() == nullptr) {\n    level = RIL_BLOCK;\n  }\n\n  switch (level) {\n    case RIL_BLOCK:\n      it_->forward_block();\n      break;\n    case RIL_PARA:\n      it_->forward_paragraph();\n      break;\n    case RIL_TEXTLINE:\n      for (it_->forward_with_empties(); it_->row() == it_->prev_row();\n           it_->forward_with_empties()) {\n        ;\n      }\n      break;\n    case RIL_WORD:\n      
it_->forward_with_empties();\n      break;\n    case RIL_SYMBOL:\n      if (cblob_it_ != nullptr) {\n        cblob_it_->forward();\n      }\n      ++blob_index_;\n      if (blob_index_ >= word_length_) {\n        it_->forward_with_empties();\n      } else {\n        return true;\n      }\n      break;\n  }\n  BeginWord(0);\n  return it_->block() != nullptr;\n}\n\n/**\n * Returns true if the iterator is at the start of an object at the given\n * level. Possible uses include determining if a call to Next(RIL_WORD)\n * moved to the start of a RIL_PARA.\n */\nbool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {\n  if (it_->block() == nullptr) {\n    return false; // Already at the end!\n  }\n  if (it_->word() == nullptr) {\n    return true; // In an image block.\n  }\n  switch (level) {\n    case RIL_BLOCK:\n      return blob_index_ == 0 && it_->block() != it_->prev_block();\n    case RIL_PARA:\n      return blob_index_ == 0 &&\n             (it_->block() != it_->prev_block() ||\n              it_->row()->row->para() != it_->prev_row()->row->para());\n    case RIL_TEXTLINE:\n      return blob_index_ == 0 && it_->row() != it_->prev_row();\n    case RIL_WORD:\n      return blob_index_ == 0;\n    case RIL_SYMBOL:\n      return true;\n  }\n  return false;\n}\n\n/**\n * Returns whether the iterator is positioned at the last element in a\n * given level. (e.g. 
the last word in a line, the last line in a block)\n */\nbool PageIterator::IsAtFinalElement(PageIteratorLevel level,\n                                    PageIteratorLevel element) const {\n  if (Empty(element)) {\n    return true; // Already at the end!\n  }\n  // The result is true if we step forward by element and find we are\n  // at the end of the page or at beginning of *all* levels in:\n  // [level, element).\n  // When there is more than one level difference between element and level,\n  // we could for instance move forward one symbol and still be at the first\n  // word on a line, so we also have to be at the first symbol in a word.\n  PageIterator next(*this);\n  next.Next(element);\n  if (next.Empty(element)) {\n    return true; // Reached the end of the page.\n  }\n  while (element > level) {\n    element = static_cast<PageIteratorLevel>(element - 1);\n    if (!next.IsAtBeginningOf(element)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n/**\n * Returns whether this iterator is positioned\n *   before other:   -1\n *   equal to other:  0\n *   after other:     1\n */\nint PageIterator::Cmp(const PageIterator &other) const {\n  int word_cmp = it_->cmp(*other.it_);\n  if (word_cmp != 0) {\n    return word_cmp;\n  }\n  if (blob_index_ < other.blob_index_) {\n    return -1;\n  }\n  if (blob_index_ == other.blob_index_) {\n    return 0;\n  }\n  return 1;\n}\n\n// ============= Accessing data ==============.\n// Coordinate system:\n// Integer coordinates are at the cracks between the pixels.\n// The top-left corner of the top-left pixel in the image is at (0,0).\n// The bottom-right corner of the bottom-right pixel in the image is at\n// (width, height).\n// Every bounding box goes from the top-left of the top-left contained\n// pixel to the bottom-right of the bottom-right contained pixel, so\n// the bounding box of the single top-left pixel in the image is:\n// (0,0)->(1,1).\n// If an image rectangle has been set in the API, then returned 
coordinates\n// relate to the original (full) image, rather than the rectangle.\n\n/**\n * Returns the bounding rectangle of the current object at the given level in\n * the coordinates of the working image that is pix_binary().\n * See comment on coordinate system above.\n * Returns false if there is no such object at the current position.\n */\nbool PageIterator::BoundingBoxInternal(PageIteratorLevel level, int *left,\n                                       int *top, int *right,\n                                       int *bottom) const {\n  if (Empty(level)) {\n    return false;\n  }\n  TBOX box;\n  PARA *para = nullptr;\n  switch (level) {\n    case RIL_BLOCK:\n      box = it_->block()->block->restricted_bounding_box(include_upper_dots_,\n                                                         include_lower_dots_);\n      break;\n    case RIL_PARA:\n      para = it_->row()->row->para();\n      // Fall through.\n    case RIL_TEXTLINE:\n      box = it_->row()->row->restricted_bounding_box(include_upper_dots_,\n                                                     include_lower_dots_);\n      break;\n    case RIL_WORD:\n      box = it_->word()->word->restricted_bounding_box(include_upper_dots_,\n                                                       include_lower_dots_);\n      break;\n    case RIL_SYMBOL:\n      if (cblob_it_ == nullptr) {\n        box = it_->word()->box_word->BlobBox(blob_index_);\n      } else {\n        box = cblob_it_->data()->bounding_box();\n      }\n  }\n  if (level == RIL_PARA) {\n    PageIterator other = *this;\n    other.Begin();\n    do {\n      if (other.it_->block() &&\n          other.it_->block()->block == it_->block()->block &&\n          other.it_->row() && other.it_->row()->row &&\n          other.it_->row()->row->para() == para) {\n        box = box.bounding_union(other.it_->row()->row->bounding_box());\n      }\n    } while (other.Next(RIL_TEXTLINE));\n  }\n  if (level != RIL_SYMBOL || cblob_it_ != nullptr) {\n    
box.rotate(it_->block()->block->re_rotation());\n  }\n  // Now we have a box in tesseract coordinates relative to the image rectangle,\n  // we have to convert the coords to a top-down system.\n  const int pix_height = pixGetHeight(tesseract_->pix_binary());\n  const int pix_width = pixGetWidth(tesseract_->pix_binary());\n  *left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);\n  *top = ClipToRange(pix_height - box.top(), 0, pix_height);\n  *right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);\n  *bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);\n  return true;\n}\n\n/**\n * Returns the bounding rectangle of the current object at the given level in\n * coordinates of the original image.\n * See comment on coordinate system above.\n * Returns false if there is no such object at the current position.\n */\nbool PageIterator::BoundingBox(PageIteratorLevel level, int *left, int *top,\n                               int *right, int *bottom) const {\n  return BoundingBox(level, 0, left, top, right, bottom);\n}\n\nbool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,\n                               int *left, int *top, int *right,\n                               int *bottom) const {\n  if (!BoundingBoxInternal(level, left, top, right, bottom)) {\n    return false;\n  }\n  // Convert to the coordinate system of the original image.\n  *left = ClipToRange(*left / scale_ + rect_left_ - padding, rect_left_,\n                      rect_left_ + rect_width_);\n  *top = ClipToRange(*top / scale_ + rect_top_ - padding, rect_top_,\n                     rect_top_ + rect_height_);\n  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,\n                       *left, rect_left_ + rect_width_);\n  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,\n                        *top, rect_top_ + rect_height_);\n  return true;\n}\n\n/** Return that there is no such object at a 
given level. */\nbool PageIterator::Empty(PageIteratorLevel level) const {\n  if (it_->block() == nullptr) {\n    return true; // Already at the end!\n  }\n  if (it_->word() == nullptr && level != RIL_BLOCK) {\n    return true; // image block\n  }\n  if (level == RIL_SYMBOL && blob_index_ >= word_length_) {\n    return true; // Zero length word, or already at the end of it.\n  }\n  return false;\n}\n\n/** Returns the type of the current block.\n *  See tesseract/publictypes.h for PolyBlockType. */\nPolyBlockType PageIterator::BlockType() const {\n  if (it_->block() == nullptr || it_->block()->block == nullptr) {\n    return PT_UNKNOWN; // Already at the end!\n  }\n  if (it_->block()->block->pdblk.poly_block() == nullptr) {\n    return PT_FLOWING_TEXT; // No layout analysis used - assume text.\n  }\n  return it_->block()->block->pdblk.poly_block()->isA();\n}\n\n/** Returns the polygon outline of the current block. The returned Pta must\n *  be ptaDestroy-ed after use. */\nPta *PageIterator::BlockPolygon() const {\n  if (it_->block() == nullptr || it_->block()->block == nullptr) {\n    return nullptr; // Already at the end!\n  }\n  if (it_->block()->block->pdblk.poly_block() == nullptr) {\n    return nullptr; // No layout analysis used - no polygon.\n  }\n  // Copy polygon, so we can unrotate it to image coordinates.\n  POLY_BLOCK *internal_poly = it_->block()->block->pdblk.poly_block();\n  ICOORDELT_LIST vertices;\n  vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);\n  POLY_BLOCK poly(&vertices, internal_poly->isA());\n  poly.rotate(it_->block()->block->re_rotation());\n  ICOORDELT_IT it(poly.points());\n  Pta *pta = ptaCreate(it.length());\n  int num_pts = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++num_pts) {\n    ICOORD *pt = it.data();\n    // Convert to top-down coords within the input image.\n    int x = static_cast<float>(pt->x()) / scale_ + rect_left_;\n    int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) 
/ scale_;\n    x = ClipToRange(x, rect_left_, rect_left_ + rect_width_);\n    y = ClipToRange(y, rect_top_, rect_top_ + rect_height_);\n    ptaAddPt(pta, x, y);\n  }\n  return pta;\n}\n\n/**\n * Returns a binary image of the current object at the given level.\n * The position and size match the return from BoundingBoxInternal, and so this\n * could be upscaled with respect to the original input image.\n * Use pixDestroy to delete the image after use.\n * The following methods are used to generate the images:\n * RIL_BLOCK: mask the page image with the block polygon.\n * RIL_TEXTLINE: Clip the rectangle of the line box from the page image.\n * TODO(rays) fix this to generate and use a line polygon.\n * RIL_WORD: Clip the rectangle of the word box from the page image.\n * RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior\n * to recognition) or the bounding box otherwise.\n * A reconstruction of the original image (using xor to check for double\n * representation) should be reasonably accurate,\n * apart from removed noise, at the block level. Below the block level, the\n * reconstruction will be missing images and line separators.\n * At the symbol level, kerned characters will invade the bounding box\n * if rendered after recognition, making an xor reconstruction inaccurate, but\n * an or construction better. 
Before recognition, symbol-level reconstruction\n * should be good, even with xor, since the images come from the connected\n * components.\n */\nPix *PageIterator::GetBinaryImage(PageIteratorLevel level) const {\n  int left, top, right, bottom;\n  if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) {\n    return nullptr;\n  }\n  if (level == RIL_SYMBOL && cblob_it_ != nullptr &&\n      cblob_it_->data()->area() != 0) {\n    return cblob_it_->data()->render();\n  }\n  Box *box = boxCreate(left, top, right - left, bottom - top);\n  Image pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);\n  boxDestroy(&box);\n  if (level == RIL_BLOCK || level == RIL_PARA) {\n    // Clip to the block polygon as well.\n    TBOX mask_box;\n    Image mask = it_->block()->block->render_mask(&mask_box);\n    int mask_x = left - mask_box.left();\n    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());\n    // AND the mask and pix, putting the result in pix.\n    pixRasterop(pix, std::max(0, -mask_x), std::max(0, -mask_y),\n                pixGetWidth(pix), pixGetHeight(pix), PIX_SRC & PIX_DST, mask,\n                std::max(0, mask_x), std::max(0, mask_y));\n    mask.destroy();\n  }\n  return pix;\n}\n\n/**\n * Returns an image of the current object at the given level in greyscale\n * if available in the input. To guarantee a binary image use BinaryImage.\n * NOTE that in order to give the best possible image, the bounds are\n * expanded slightly over the binary connected component, by the supplied\n * padding, so the top-left position of the returned image is returned\n * in (left,top). 
These will most likely not match the coordinates\n * returned by BoundingBox.\n * If you do not supply an original image, you will get a binary one.\n * Use pixDestroy to delete the image after use.\n */\nPix *PageIterator::GetImage(PageIteratorLevel level, int padding,\n                            Pix *original_img, int *left, int *top) const {\n  int right, bottom;\n  if (!BoundingBox(level, left, top, &right, &bottom)) {\n    return nullptr;\n  }\n  if (original_img == nullptr) {\n    return GetBinaryImage(level);\n  }\n\n  // Expand the box.\n  *left = std::max(*left - padding, 0);\n  *top = std::max(*top - padding, 0);\n  right = std::min(right + padding, rect_width_);\n  bottom = std::min(bottom + padding, rect_height_);\n  Box *box = boxCreate(*left, *top, right - *left, bottom - *top);\n  Image grey_pix = pixClipRectangle(original_img, box, nullptr);\n  boxDestroy(&box);\n  if (level == RIL_BLOCK || level == RIL_PARA) {\n    // Clip to the block polygon as well.\n    TBOX mask_box;\n    Image mask = it_->block()->block->render_mask(&mask_box);\n    // Copy the mask registered correctly into an image the size of grey_pix.\n    int mask_x = *left - mask_box.left();\n    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());\n    int width = pixGetWidth(grey_pix);\n    int height = pixGetHeight(grey_pix);\n    Image resized_mask = pixCreate(width, height, 1);\n    pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width,\n                height, PIX_SRC, mask, std::max(0, mask_x),\n                std::max(0, mask_y));\n    mask.destroy();\n    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,\n                   2 * padding + 1);\n    pixInvert(resized_mask, resized_mask);\n    pixSetMasked(grey_pix, resized_mask, UINT32_MAX);\n    resized_mask.destroy();\n  }\n  return grey_pix;\n}\n\n/**\n * Returns the baseline of the current object at the given level.\n * The baseline is the line that passes through (x1, y1) and 
(x2, y2).\n * WARNING: with vertical text, baselines may be vertical!\n */\nbool PageIterator::Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,\n                            int *y2) const {\n  if (it_->word() == nullptr) {\n    return false; // Already at the end!\n  }\n  ROW *row = it_->row()->row;\n  WERD *word = it_->word()->word;\n  TBOX box = (level == RIL_WORD || level == RIL_SYMBOL) ? word->bounding_box()\n                                                        : row->bounding_box();\n  int left = box.left();\n  ICOORD startpt(left, static_cast<int16_t>(row->base_line(left) + 0.5));\n  int right = box.right();\n  ICOORD endpt(right, static_cast<int16_t>(row->base_line(right) + 0.5));\n  // Rotate to image coordinates and convert to global image coords.\n  startpt.rotate(it_->block()->block->re_rotation());\n  endpt.rotate(it_->block()->block->re_rotation());\n  *x1 = startpt.x() / scale_ + rect_left_;\n  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;\n  *x2 = endpt.x() / scale_ + rect_left_;\n  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;\n  return true;\n}\n\nvoid PageIterator::RowAttributes(float *row_height, float *descenders,\n                                 float *ascenders) const {\n  *row_height = it_->row()->row->x_height() + it_->row()->row->ascenders() -\n                it_->row()->row->descenders();\n  *descenders = it_->row()->row->descenders();\n  *ascenders = it_->row()->row->ascenders();\n}\n\nvoid PageIterator::Orientation(tesseract::Orientation *orientation,\n                               tesseract::WritingDirection *writing_direction,\n                               tesseract::TextlineOrder *textline_order,\n                               float *deskew_angle) const {\n  auto *block_res = it_->block();\n  if (block_res == nullptr) {\n    // Nothing can be done, so return default values.\n    *orientation = ORIENTATION_PAGE_UP;\n    *writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;\n    *textline_order = 
TEXTLINE_ORDER_TOP_TO_BOTTOM;\n    return;\n  }\n  auto *block = block_res->block;\n\n  // Orientation\n  FCOORD up_in_image(0.0, 1.0);\n  up_in_image.unrotate(block->classify_rotation());\n  up_in_image.rotate(block->re_rotation());\n\n  if (up_in_image.x() == 0.0F) {\n    if (up_in_image.y() > 0.0F) {\n      *orientation = ORIENTATION_PAGE_UP;\n    } else {\n      *orientation = ORIENTATION_PAGE_DOWN;\n    }\n  } else if (up_in_image.x() > 0.0F) {\n    *orientation = ORIENTATION_PAGE_RIGHT;\n  } else {\n    *orientation = ORIENTATION_PAGE_LEFT;\n  }\n\n  // Writing direction\n  bool is_vertical_text = (block->classify_rotation().x() == 0.0);\n  bool right_to_left = block->right_to_left();\n  *writing_direction = is_vertical_text\n                           ? WRITING_DIRECTION_TOP_TO_BOTTOM\n                           : (right_to_left ? WRITING_DIRECTION_RIGHT_TO_LEFT\n                                            : WRITING_DIRECTION_LEFT_TO_RIGHT);\n\n  // Textline Order\n  const bool is_mongolian = false; // TODO(eger): fix me\n  *textline_order = is_vertical_text\n                        ? (is_mongolian ? 
TEXTLINE_ORDER_LEFT_TO_RIGHT\n                                        : TEXTLINE_ORDER_RIGHT_TO_LEFT)\n                        : TEXTLINE_ORDER_TOP_TO_BOTTOM;\n\n  // Deskew angle\n  FCOORD skew = block->skew(); // true horizontal for textlines\n  *deskew_angle = -skew.angle();\n}\n\nvoid PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,\n                                 bool *is_list_item, bool *is_crown,\n                                 int *first_line_indent) const {\n  *just = tesseract::JUSTIFICATION_UNKNOWN;\n  if (!it_->row() || !it_->row()->row || !it_->row()->row->para() ||\n      !it_->row()->row->para()->model) {\n    return;\n  }\n\n  PARA *para = it_->row()->row->para();\n  *is_list_item = para->is_list_item;\n  *is_crown = para->is_very_first_or_continuation;\n  *first_line_indent = para->model->first_indent() - para->model->body_indent();\n  *just = para->model->justification();\n}\n\n/**\n * Sets up the internal data for iterating the blobs of a new word, then\n * moves the iterator to the given offset.\n */\nvoid PageIterator::BeginWord(int offset) {\n  WERD_RES *word_res = it_->word();\n  if (word_res == nullptr) {\n    // This is a non-text block, so there is no word.\n    word_length_ = 0;\n    blob_index_ = 0;\n    word_ = nullptr;\n    return;\n  }\n  if (word_res->best_choice != nullptr) {\n    // Recognition has been done, so we are using the box_word, which\n    // is already baseline denormalized.\n    word_length_ = word_res->best_choice->length();\n    if (word_res->box_word != nullptr) {\n      if (word_res->box_word->length() != static_cast<unsigned>(word_length_)) {\n        tprintf(\"Corrupted word! 
best_choice[len=%d] = %s, box_word[len=%d]: \",\n                word_length_, word_res->best_choice->unichar_string().c_str(),\n                word_res->box_word->length());\n        word_res->box_word->bounding_box().print();\n      }\n      ASSERT_HOST(word_res->box_word->length() ==\n                  static_cast<unsigned>(word_length_));\n    }\n    word_ = nullptr;\n    // We will be iterating the box_word.\n    delete cblob_it_;\n    cblob_it_ = nullptr;\n  } else {\n    // No recognition yet, so a \"symbol\" is a cblob.\n    word_ = word_res->word;\n    ASSERT_HOST(word_->cblob_list() != nullptr);\n    word_length_ = word_->cblob_list()->length();\n    if (cblob_it_ == nullptr) {\n      cblob_it_ = new C_BLOB_IT;\n    }\n    cblob_it_->set_to_list(word_->cblob_list());\n  }\n  for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {\n    if (cblob_it_ != nullptr) {\n      cblob_it_->forward();\n    }\n  }\n}\n\nbool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {\n  if (it_->word() != nullptr) {\n    it_->word()->blamer_bundle = blamer_bundle;\n    return true;\n  } else {\n    return false;\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/pagesegmain.cpp",
    "content": "/**********************************************************************\n * File:        pagesegmain.cpp\n * Description: Top-level page segmenter for Tesseract.\n * Author:      Ray Smith\n *\n * (C) Copyright 2008, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifdef _WIN32\n#  ifndef unlink\n#    include <io.h>\n#  endif\n#else\n#  include <unistd.h>\n#endif // _WIN32\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <allheaders.h>\n#include \"blobbox.h\"\n#include \"blread.h\"\n#include \"colfind.h\"\n#include \"debugpixa.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"equationdetect.h\"\n#endif\n#include <tesseract/osdetect.h>\n#include \"imagefind.h\"\n#include \"linefind.h\"\n#include \"makerow.h\"\n#include \"tabvector.h\"\n#include \"tesseractclass.h\"\n#include \"tessvars.h\"\n#include \"textord.h\"\n#include \"tordmain.h\"\n#include \"wordseg.h\"\n\nnamespace tesseract {\n\n// Max erosions to perform in removing an enclosing circle.\nconst int kMaxCircleErosions = 8;\n\n// Helper to remove an enclosing circle from an image.\n// If there isn't one, then the image will most likely get badly mangled.\n// The returned pix must be pixDestroyed after use. 
nullptr may be returned\n// if the image doesn't meet the trivial conditions that it uses to determine\n// success.\nstatic Image RemoveEnclosingCircle(Image pixs) {\n  Image pixsi = pixInvert(nullptr, pixs);\n  Image pixc = pixCreateTemplate(pixs);\n  pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);\n  pixSeedfillBinary(pixc, pixc, pixsi, 4);\n  pixInvert(pixc, pixc);\n  pixsi.destroy();\n  Image pixt = pixs & pixc;\n  l_int32 max_count;\n  pixCountConnComp(pixt, 8, &max_count);\n  // The count has to go up before we start looking for the minimum.\n  l_int32 min_count = INT32_MAX;\n  Image pixout = nullptr;\n  for (int i = 1; i < kMaxCircleErosions; i++) {\n    pixt.destroy();\n    pixErodeBrick(pixc, pixc, 3, 3);\n    pixt = pixs & pixc;\n    l_int32 count;\n    pixCountConnComp(pixt, 8, &count);\n    if (i == 1 || count > max_count) {\n      max_count = count;\n      min_count = count;\n    } else if (count < min_count) {\n      min_count = count;\n      pixout.destroy();\n      pixout = pixt.copy(); // Save the best.\n    } else if (count >= min_count) {\n      break; // We have passed by the best.\n    }\n  }\n  pixt.destroy();\n  pixc.destroy();\n  return pixout;\n}\n\n/**\n * Segment the page according to the current value of tessedit_pageseg_mode.\n * pix_binary_ is used as the source image and should not be nullptr.\n * On return the blocks list owns all the constructed page layout.\n */\nint Tesseract::SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess,\n                           OSResults *osr) {\n  ASSERT_HOST(pix_binary_ != nullptr);\n  int width = pixGetWidth(pix_binary_);\n  int height = pixGetHeight(pix_binary_);\n  // Get page segmentation mode.\n  auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));\n  // If a UNLV zone file can be found, use that instead of segmentation.\n  if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\\0') {\n    std::string name 
= input_file;\n    auto lastdot = name.find_last_of('.');\n    if (lastdot != std::string::npos) {\n      name.resize(lastdot);\n    }\n    read_unlv_file(name, width, height, blocks);\n  }\n  if (blocks->empty()) {\n    // No UNLV file present. Work according to the PageSegMode.\n    // First make a single block covering the whole image.\n    BLOCK_IT block_it(blocks);\n    auto *block = new BLOCK(\"\", true, 0, 0, 0, 0, width, height);\n    block->set_right_to_left(right_to_left());\n    block_it.add_to_end(block);\n  } else {\n    // UNLV file present. Use PSM_SINGLE_BLOCK.\n    pageseg_mode = PSM_SINGLE_BLOCK;\n  }\n  // The diacritic_blobs holds noise blobs that may be diacritics. They\n  // are separated out on areas of the image that seem noisy and short-circuit\n  // the layout process, going straight from the initial partition creation\n  // right through to after word segmentation, where they are added to the\n  // rej_cblobs list of the most appropriate word. From there classification\n  // will determine whether they are used.\n  BLOBNBOX_LIST diacritic_blobs;\n  int auto_page_seg_ret_val = 0;\n  TO_BLOCK_LIST to_blocks;\n  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||\n      PSM_SPARSE(pageseg_mode)) {\n    auto_page_seg_ret_val =\n        AutoPageSeg(pageseg_mode, blocks, &to_blocks,\n                    enable_noise_removal ? 
&diacritic_blobs : nullptr, osd_tess, osr);\n    if (pageseg_mode == PSM_OSD_ONLY) {\n      return auto_page_seg_ret_val;\n    }\n    // To create blobs from the image region bounds uncomment this line:\n    //  to_blocks.clear();  // Uncomment to go back to the old mode.\n  } else {\n    deskew_ = FCOORD(1.0f, 0.0f);\n    reskew_ = FCOORD(1.0f, 0.0f);\n    if (pageseg_mode == PSM_CIRCLE_WORD) {\n      Image pixcleaned = RemoveEnclosingCircle(pix_binary_);\n      if (pixcleaned != nullptr) {\n        pix_binary_.destroy();\n        pix_binary_ = pixcleaned;\n      }\n    }\n  }\n\n  if (auto_page_seg_ret_val < 0) {\n    return -1;\n  }\n\n  if (blocks->empty()) {\n    if (textord_debug_tabfind) {\n      tprintf(\"Empty page\\n\");\n    }\n    return 0; // AutoPageSeg found an empty page.\n  }\n  bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;\n  bool cjk_mode = textord_use_cjk_fp_model;\n\n  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,\n                       pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks, &gradient_);\n  return auto_page_seg_ret_val;\n}\n\n/**\n * Auto page segmentation. 
Divide the page image into blocks of uniform\n * text linespacing and images.\n *\n * Resolution (in ppi) is derived from the input image.\n *\n * The output goes in the blocks list with corresponding TO_BLOCKs in the\n * to_blocks list.\n *\n * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide\n * the image into columns, but multiple blocks are still made if the text is\n * of non-uniform linespacing.\n *\n * If diacritic_blobs is non-null, then diacritics/noise blobs, that would\n * confuse layout analysis by causing textline overlap, are placed there,\n * with the expectation that they will be reassigned to words later and\n * noise/diacriticness determined via classification.\n *\n * If osd (orientation and script detection) is true then that is performed\n * as well. If only_osd is true, then only orientation and script detection is\n * performed. If osd is desired, (osd or only_osd) then osr_tess must be\n * another Tesseract that was initialized especially for osd, and the results\n * will be output into osr (orientation and script result).\n */\nint Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,\n                           BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) {\n  Image photomask_pix = nullptr;\n  Image musicmask_pix = nullptr;\n  // The blocks made by the ColumnFinder. Moved to blocks before return.\n  BLOCK_LIST found_blocks;\n  TO_BLOCK_LIST temp_blocks;\n\n  ColumnFinder *finder = SetupPageSegAndDetectOrientation(\n      pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,\n      pageseg_apply_music_mask ? &musicmask_pix : nullptr);\n  int result = 0;\n  if (finder != nullptr) {\n    TO_BLOCK_IT to_block_it(&temp_blocks);\n    TO_BLOCK *to_block = to_block_it.data();\n    if (musicmask_pix != nullptr) {\n      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music\n      // blocks separately. 
For now combine with photomask_pix.\n      photomask_pix |= musicmask_pix;\n    }\n#ifndef DISABLED_LEGACY_ENGINE\n    if (equ_detect_) {\n      finder->SetEquationDetect(equ_detect_);\n    }\n#endif // ndef DISABLED_LEGACY_ENGINE\n    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,\n                                photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,\n                                &found_blocks, diacritic_blobs, to_blocks);\n    if (result >= 0) {\n      finder->GetDeskewVectors(&deskew_, &reskew_);\n    }\n    delete finder;\n  }\n  photomask_pix.destroy();\n  musicmask_pix.destroy();\n  if (result < 0) {\n    return result;\n  }\n\n  blocks->clear();\n  BLOCK_IT block_it(blocks);\n  // Move the found blocks to the input/output blocks.\n  block_it.add_list_after(&found_blocks);\n  return result;\n}\n\n// Helper adds all the scripts from sid_set converted to ids from osd_set to\n// allowed_ids.\nstatic void AddAllScriptsConverted(const UNICHARSET &sid_set, const UNICHARSET &osd_set,\n                                   std::vector<int> *allowed_ids) {\n  for (int i = 0; i < sid_set.get_script_table_size(); ++i) {\n    if (i != sid_set.null_sid()) {\n      const char *script = sid_set.get_script_from_script_id(i);\n      allowed_ids->push_back(osd_set.get_script_id_from_name(script));\n    }\n  }\n}\n\n/**\n * Sets up auto page segmentation, determines the orientation, and corrects it.\n * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to\n * facilitate testing.\n * photo_mask_pix is a pointer to a nullptr pointer that will be filled on\n * return with the leptonica photo mask, which must be pixDestroyed by the\n * caller. to_blocks is an empty list that will be filled with (usually a\n * single) block that is used during layout analysis. 
This ugly API is required\n * because of the possibility of a unlv zone file.\n * TODO(rays) clean this up.\n * See AutoPageSeg for other arguments.\n * The returned ColumnFinder must be deleted after use.\n */\nColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode,\n                                                          BLOCK_LIST *blocks, Tesseract *osd_tess,\n                                                          OSResults *osr, TO_BLOCK_LIST *to_blocks,\n                                                          Image *photo_mask_pix,\n                                                          Image *music_mask_pix) {\n  int vertical_x = 0;\n  int vertical_y = 1;\n  TabVector_LIST v_lines;\n  TabVector_LIST h_lines;\n  ICOORD bleft(0, 0);\n\n  ASSERT_HOST(pix_binary_ != nullptr);\n  if (tessedit_dump_pageseg_images) {\n    pixa_debug_.AddPix(pix_binary_, \"PageSegInput\");\n  }\n  // Leptonica is used to find the rule/separator lines in the input.\n  LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,\n                                 &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);\n  if (tessedit_dump_pageseg_images) {\n    pixa_debug_.AddPix(pix_binary_, \"NoLines\");\n  }\n  // Leptonica is used to find a mask of the photo regions in the input.\n  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);\n  if (tessedit_dump_pageseg_images) {\n    Image pix_no_image_ = nullptr;\n    if (*photo_mask_pix != nullptr) {\n      pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);\n    } else {\n      pix_no_image_ = pix_binary_.clone();\n    }\n    pixa_debug_.AddPix(pix_no_image_, \"NoImages\");\n    pix_no_image_.destroy();\n  }\n  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {\n    v_lines.clear();\n  }\n\n  // The rest of the algorithm uses the usual connected components.\n  textord_.find_components(pix_binary_, blocks, to_blocks);\n\n  TO_BLOCK_IT 
to_block_it(to_blocks);\n  // There must be exactly one input block.\n  // TODO(rays) handle new textline finding with a UNLV zone file.\n  ASSERT_HOST(to_blocks->singleton());\n  TO_BLOCK *to_block = to_block_it.data();\n  TBOX blkbox = to_block->block->pdblk.bounding_box();\n  ColumnFinder *finder = nullptr;\n  int estimated_resolution = source_resolution_;\n  if (source_resolution_ == kMinCredibleResolution) {\n    // Try to estimate resolution from typical body text size.\n    int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);\n    if (res > estimated_resolution && res < kMaxCredibleResolution) {\n      estimated_resolution = res;\n      tprintf(\"Estimating resolution as %d\\n\", estimated_resolution);\n    }\n  }\n\n  if (to_block->line_size >= 2) {\n    finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),\n                              blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,\n                              textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,\n                              vertical_y);\n\n    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);\n\n  #ifndef DISABLED_LEGACY_ENGINE\n    if (equ_detect_) {\n      equ_detect_->LabelSpecialText(to_block);\n    }\n  #endif\n\n    BLOBNBOX_CLIST osd_blobs;\n    // osd_orientation is the number of 90 degree rotations to make the\n    // characters upright. 
(See tesseract/osdetect.h for precise definition.)\n    // We want the text lines horizontal, (vertical text indicates vertical\n    // textlines) which may conflict (eg vertically written CJK).\n    int osd_orientation = 0;\n    bool vertical_text =\n        textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;\n    if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {\n      vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,\n                                                      &osd_blobs);\n    }\n\n  #ifndef DISABLED_LEGACY_ENGINE\n    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {\n      std::vector<int> osd_scripts;\n      if (osd_tess != this) {\n        // We are running osd as part of layout analysis, so constrain the\n        // scripts to those allowed by *this.\n        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);\n        for (auto &lang : sub_langs_) {\n          AddAllScriptsConverted(lang->unicharset, osd_tess->unicharset, &osd_scripts);\n        }\n      }\n      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);\n      if (pageseg_mode == PSM_OSD_ONLY) {\n        delete finder;\n        return nullptr;\n      }\n      osd_orientation = osr->best_result.orientation_id;\n      double osd_score = osr->orientations[osd_orientation];\n      double osd_margin = min_orientation_margin * 2;\n      for (int i = 0; i < 4; ++i) {\n        if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {\n          osd_margin = osd_score - osr->orientations[i];\n        }\n      }\n      int best_script_id = osr->best_result.script_id;\n      const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);\n      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||\n                 best_script_id == 
osd_tess->unicharset.hiragana_sid() ||\n                 best_script_id == osd_tess->unicharset.katakana_sid() ||\n                 strcmp(\"Japanese\", best_script_str) == 0 ||\n                 strcmp(\"Korean\", best_script_str) == 0 || strcmp(\"Hangul\", best_script_str) == 0;\n      if (cjk) {\n        finder->set_cjk_script(true);\n      }\n      if (osd_margin < min_orientation_margin) {\n        // The margin is weak.\n        if (!cjk && !vertical_text && osd_orientation == 2) {\n          // upside down latin text is improbable with such a weak margin.\n          tprintf(\n              \"OSD: Weak margin (%.2f), horiz textlines, not CJK: \"\n              \"Don't rotate.\\n\",\n              osd_margin);\n          osd_orientation = 0;\n        } else {\n          tprintf(\n              \"OSD: Weak margin (%.2f) for %d blob text block, \"\n              \"but using orientation anyway: %d\\n\",\n              osd_margin, osd_blobs.length(), osd_orientation);\n        }\n      }\n    }\n  #endif // ndef DISABLED_LEGACY_ENGINE\n\n    osd_blobs.shallow_clear();\n    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);\n  }\n\n  return finder;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/pagewalk.cpp",
    "content": "/**********************************************************************\n * File:        pagewalk.cpp  (Formerly walkers.c)\n * Description: Block list processors\n * Author:      Phil Cheatle\n * Created:     Thu Oct 10 16:25:24 BST 1991\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"pageres.h\"\n#include \"tesseractclass.h\"\n\nnamespace tesseract {\n/**\n * @name process_selected_words()\n *\n * Walk the current block list applying the specified word processor function\n * to each word that overlaps the selection_box.\n */\nvoid Tesseract::process_selected_words(\n    PAGE_RES *page_res, // blocks to check\n    TBOX &selection_box, bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) {\n  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr; page_res_it.forward()) {\n    WERD *word = page_res_it.word()->word;\n    if (word->bounding_box().overlap(selection_box)) {\n      if (!(this->*word_processor)(&page_res_it)) {\n        return;\n      }\n    }\n  }\n}\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/par_control.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        par_control.cpp\n// Description: Control code for parallel implementation.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"tesseractclass.h\"\n#ifdef _OPENMP\n#  include <omp.h>\n#endif // _OPENMP\n\nnamespace tesseract {\n\nstruct BlobData {\n  BlobData() = default;\n  BlobData(int index, Tesseract *tess, const WERD_RES &word)\n      : blob(word.chopped_word->blobs[index])\n      , tesseract(tess)\n      , choices(&(*word.ratings)(index, index)) {}\n\n  TBLOB *blob = nullptr;\n  Tesseract *tesseract = nullptr;\n  BLOB_CHOICE_LIST **choices = nullptr;\n};\n\nvoid Tesseract::PrerecAllWordsPar(const std::vector<WordData> &words) {\n  // Prepare all the blobs.\n  std::vector<BlobData> blobs;\n  for (const auto &w : words) {\n    if (w.word->ratings != nullptr && w.word->ratings->get(0, 0) == nullptr) {\n      for (size_t s = 0; s < w.lang_words.size(); ++s) {\n        Tesseract *sub = s < sub_langs_.size() ? 
sub_langs_[s] : this;\n        const WERD_RES &word = *w.lang_words[s];\n        for (unsigned b = 0; b < word.chopped_word->NumBlobs(); ++b) {\n          blobs.emplace_back(b, sub, word);\n        }\n      }\n    }\n  }\n  // Pre-classify all the blobs.\n  if (tessedit_parallelize > 1) {\n#ifdef _OPENMP\n#  pragma omp parallel for num_threads(10)\n#endif // _OPENMP\n    // NOLINTNEXTLINE(modernize-loop-convert)\n    for (size_t b = 0; b < blobs.size(); ++b) {\n      *blobs[b].choices =\n          blobs[b].tesseract->classify_blob(blobs[b].blob, \"par\", ScrollView::WHITE, nullptr);\n    }\n  } else {\n    // TODO(AMD) parallelize this.\n    for (auto &blob : blobs) {\n      *blob.choices = blob.tesseract->classify_blob(blob.blob, \"par\", ScrollView::WHITE, nullptr);\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/paragraphs.cpp",
    "content": "/**********************************************************************\n * File:        paragraphs.cpp\n * Description: Paragraph detection for tesseract.\n * Author:      David Eger\n *\n * (C) Copyright 2011, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"paragraphs.h\"\n\n#include \"helpers.h\"             // for UpdateRange, ClipToRange\n#include \"host.h\"                // for NearlyEqual\n#include \"mutableiterator.h\"     // for MutableIterator\n#include \"ocrblock.h\"            // for BLOCK\n#include \"ocrpara.h\"             // for ParagraphModel, PARA, PARA_IT, PARA...\n#include \"ocrrow.h\"              // for ROW\n#include \"pageres.h\"             // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...\n#include \"paragraphs_internal.h\" // for RowScratchRegisters, SetOfModels\n#include \"pdblock.h\"             // for PDBLK\n#include \"polyblk.h\"             // for POLY_BLOCK\n#include \"ratngs.h\"              // for WERD_CHOICE\n#include \"rect.h\"                // for TBOX\n#include \"statistc.h\"            // for STATS\n#include \"tesserrstream.h\"       // for tesserr\n#include \"tprintf.h\"             // for tprintf\n#include \"unicharset.h\"          // for UNICHARSET\n#include \"werd.h\"                // for WERD, W_REP_CHAR\n\n#include <tesseract/pageiterator.h> // for PageIterator\n#include <tesseract/publictypes.h> 
 // for JUSTIFICATION_LEFT, JUSTIFICATION_R...\n#include <tesseract/unichar.h>      // for UNICHAR, UNICHAR_ID\n\n#include <algorithm> // for max\n#include <cctype>    // for isspace\n#include <cmath>     // for abs\n#include <cstdio>    // for snprintf\n#include <cstdlib>   // for abs\n#include <cstring>   // for strchr, strlen\n#include <memory>    // for unique_ptr\n\nstatic const char *const kRLE = \"\\u202A\"; // Right-to-Left Embedding\nstatic const char *const kPDF = \"\\u202C\"; // Pop Directional Formatting\n\nnamespace tesseract {\n\n// Special \"weak\" ParagraphModels.\nconst ParagraphModel *kCrownLeft =\n    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));\nconst ParagraphModel *kCrownRight =\n    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));\n\n// Do the text and geometry of two rows support a paragraph break between them?\nstatic bool LikelyParagraphStart(const RowScratchRegisters &before,\n                                 const RowScratchRegisters &after,\n                                 tesseract::ParagraphJustification j);\n\n// Given the width of a typical space between words, what is the threshold\n// by which by which we think left and right alignments for paragraphs\n// can vary and still be aligned.\nstatic int Epsilon(int space_pix) {\n  return space_pix * 4 / 5;\n}\n\nstatic bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,\n                              const std::vector<RowScratchRegisters> *rows, int row_start,\n                              int row_end) {\n  if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {\n    tesserr << \"Invalid arguments rows[\" << row_start << \", \" << row_end\n            << \") while rows is of size \" << rows->size() << \".\\n\";\n    return false;\n  }\n  if (row_end - row_start < min_num_rows) {\n    if (debug_level > 1) {\n      tprintf(\"# Too few rows[%d, %d) for %s.\\n\", 
row_start, row_end, function_name);\n    }\n    return false;\n  }\n  return true;\n}\n\n// =============================== Debug Code ================================\n\n// Given a row-major matrix of unicode text and a column separator, print\n// a formatted table.  For ASCII, we get good column alignment.\nstatic void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {\n  std::vector<int> max_col_widths;\n  for (const auto &row : rows) {\n    auto num_columns = row.size();\n    for (size_t c = 0; c < num_columns; c++) {\n      int num_unicodes = 0;\n      for (char i : row[c]) {\n        if ((i & 0xC0) != 0x80) {\n          num_unicodes++;\n        }\n      }\n      if (c >= max_col_widths.size()) {\n        max_col_widths.push_back(num_unicodes);\n      } else {\n        if (num_unicodes > max_col_widths[c]) {\n          max_col_widths[c] = num_unicodes;\n        }\n      }\n    }\n  }\n\n  std::vector<std::string> col_width_patterns;\n  col_width_patterns.reserve(max_col_widths.size());\n  for (int max_col_width : max_col_widths) {\n    col_width_patterns.push_back(std::string(\"%-\") + std::to_string(max_col_width) + \"s\");\n  }\n\n  for (const auto &row : rows) {\n    for (unsigned c = 0; c < row.size(); c++) {\n      if (c > 0) {\n        tprintf(\"%s\", colsep);\n      }\n      tprintf(col_width_patterns[c].c_str(), row[c].c_str());\n    }\n    tprintf(\"\\n\");\n  }\n}\n\nstatic std::string RtlEmbed(const std::string &word, bool rtlify) {\n  if (rtlify) {\n    return std::string(kRLE) + word + std::string(kPDF);\n  }\n  return word;\n}\n\n// Print the current thoughts of the paragraph detector.\nstatic void PrintDetectorState(const ParagraphTheory &theory,\n                               const std::vector<RowScratchRegisters> &rows) {\n  std::vector<std::vector<std::string>> output;\n  output.emplace_back();\n  output.back().push_back(\"#row\");\n  output.back().push_back(\"space\");\n  output.back().push_back(\"..\");\n  
output.back().push_back(\"lword[widthSEL]\");\n  output.back().push_back(\"rword[widthSEL]\");\n  RowScratchRegisters::AppendDebugHeaderFields(output.back());\n  output.back().push_back(\"text\");\n\n  for (unsigned i = 0; i < rows.size(); i++) {\n    output.emplace_back();\n    std::vector<std::string> &row = output.back();\n    const RowInfo &ri = *rows[i].ri_;\n    row.push_back(std::to_string(i));\n    row.push_back(std::to_string(ri.average_interword_space));\n    row.emplace_back(ri.has_leaders ? \"..\" : \" \");\n    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + \"[\" + std::to_string(ri.lword_box.width()) +\n                  (ri.lword_likely_starts_idea ? \"S\" : \"s\") +\n                  (ri.lword_likely_ends_idea ? \"E\" : \"e\") +\n                  (ri.lword_indicates_list_item ? \"L\" : \"l\") + \"]\");\n    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + \"[\" + std::to_string(ri.rword_box.width()) +\n                  (ri.rword_likely_starts_idea ? \"S\" : \"s\") +\n                  (ri.rword_likely_ends_idea ? \"E\" : \"e\") +\n                  (ri.rword_indicates_list_item ? 
\"L\" : \"l\") + \"]\");\n    rows[i].AppendDebugInfo(theory, row);\n    row.push_back(RtlEmbed(ri.text, !ri.ltr));\n  }\n  PrintTable(output, \" \");\n\n  tprintf(\"Active Paragraph Models:\\n\");\n  unsigned m = 0;\n  for (const auto &model : theory.models()) {\n    tprintf(\" %d: %s\\n\", ++m, model->ToString().c_str());\n  }\n}\n\nstatic void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,\n                      const std::vector<RowScratchRegisters> &rows) {\n  if (!should_print) {\n    return;\n  }\n  tprintf(\"# %s\\n\", phase);\n  PrintDetectorState(theory, rows);\n}\n\n// Print out the text for rows[row_start, row_end)\nstatic void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,\n                          int row_end) {\n  tprintf(\"======================================\\n\");\n  for (int row = row_start; row < row_end; row++) {\n    tprintf(\"%s\\n\", rows[row].ri_->text.c_str());\n  }\n  tprintf(\"======================================\\n\");\n}\n\n// ============= Brain Dead Language Model (ASCII Version) ===================\n\nstatic bool IsLatinLetter(int ch) {\n  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');\n}\n\nstatic bool IsDigitLike(int ch) {\n  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';\n}\n\nstatic bool IsOpeningPunct(int ch) {\n  return strchr(\"'\\\"({[\", ch) != nullptr;\n}\n\nstatic bool IsTerminalPunct(int ch) {\n  return strchr(\":'\\\".?!]})\", ch) != nullptr;\n}\n\n// Return a pointer after consuming as much text as qualifies as roman numeral.\nstatic const char *SkipChars(const char *str, const char *toskip) {\n  while (*str != '\\0' && strchr(toskip, *str)) {\n    str++;\n  }\n  return str;\n}\n\nstatic const char *SkipChars(const char *str, bool (*skip)(int)) {\n  while (*str != '\\0' && skip(*str)) {\n    str++;\n  }\n  return str;\n}\n\nstatic const char *SkipOne(const char *str, const char *toskip) {\n  if (*str != '\\0' && strchr(toskip, 
*str)) {\n    return str + 1;\n  }\n  return str;\n}\n\n// Return whether it is very likely that this is a numeral marker that could\n// start a list item.  Some examples include:\n//   A   I   iii.   VI   (2)   3.5.   [C-4]\nstatic bool LikelyListNumeral(const std::string &word) {\n  const char *kRomans = \"ivxlmdIVXLMD\";\n  const char *kDigits = \"012345789\";\n  const char *kOpen = \"[{(\";\n  const char *kSep = \":;-.,\";\n  const char *kClose = \"]})\";\n\n  int num_segments = 0;\n  const char *pos = word.c_str();\n  while (*pos != '\\0' && num_segments < 3) {\n    // skip up to two open parens.\n    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);\n    const char *numeral_end = SkipChars(numeral_start, kRomans);\n    if (numeral_end != numeral_start) {\n      // Got Roman Numeral. Great.\n    } else {\n      numeral_end = SkipChars(numeral_start, kDigits);\n      if (numeral_end == numeral_start) {\n        // If there's a single latin letter, we can use that.\n        numeral_end = SkipChars(numeral_start, IsLatinLetter);\n        if (numeral_end - numeral_start != 1) {\n          break;\n        }\n      }\n    }\n    // We got some sort of numeral.\n    num_segments++;\n    // Skip any trailing parens or punctuation.\n    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);\n    if (pos == numeral_end) {\n      break;\n    }\n  }\n  return *pos == '\\0';\n}\n\nstatic bool LikelyListMark(const std::string &word) {\n  const char *kListMarks = \"0Oo*.,+.\";\n  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;\n}\n\nbool AsciiLikelyListItem(const std::string &word) {\n  return LikelyListMark(word) || LikelyListNumeral(word);\n}\n\n// ========== Brain Dead Language Model (Tesseract Version) ================\n\n// Return the first Unicode Codepoint from werd[pos].\nstatic int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {\n  if (!u || !werd || pos > werd->length()) {\n    return 0;\n  }\n  return 
UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();\n}\n\n// A useful helper class for finding the first j >= i so that word[j]\n// does not have given character type.\nclass UnicodeSpanSkipper {\npublic:\n  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)\n      : u_(unicharset), word_(word), wordlen_(word->length()) {\n  }\n\n  // Given an input position, return the first position >= pos not punc.\n  unsigned SkipPunc(unsigned pos);\n  // Given an input position, return the first position >= pos not digit.\n  unsigned SkipDigits(unsigned pos);\n  // Given an input position, return the first position >= pos not roman.\n  unsigned SkipRomans(unsigned pos);\n  // Given an input position, return the first position >= pos not alpha.\n  unsigned SkipAlpha(unsigned pos);\n\nprivate:\n  const UNICHARSET *u_;\n  const WERD_CHOICE *word_;\n  unsigned wordlen_;\n};\n\nunsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {\n  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {\n    pos++;\n  }\n  return pos;\n}\n\nunsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {\n  while (pos < wordlen_ &&\n         (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {\n    pos++;\n  }\n  return pos;\n}\n\nunsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {\n  const char *kRomans = \"ivxlmdIVXLMD\";\n  while (pos < wordlen_) {\n    int ch = UnicodeFor(u_, word_, pos);\n    if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) {\n      break;\n    }\n    pos++;\n  }\n  return pos;\n}\n\nunsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {\n  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {\n    pos++;\n  }\n  return pos;\n}\n\nstatic bool LikelyListMarkUnicode(int ch) {\n  if (ch < 0x80) {\n    std::string single_ch;\n    single_ch += ch;\n    return LikelyListMark(single_ch);\n  }\n  switch (ch) {\n    // TODO(eger) expand this list of unicodes as needed.\n    case 
0x00B0: // degree sign\n    case 0x2022: // bullet\n    case 0x25E6: // white bullet\n    case 0x00B7: // middle dot\n    case 0x25A1: // white square\n    case 0x25A0: // black square\n    case 0x25AA: // black small square\n    case 0x2B1D: // black very small square\n    case 0x25BA: // black right-pointing pointer\n    case 0x25CF: // black circle\n    case 0x25CB: // white circle\n      return true;\n    default:\n      break; // fall through\n  }\n  return false;\n}\n\n// Return whether it is very likely that this is a numeral marker that could\n// start a list item.  Some examples include:\n//   A   I   iii.   VI   (2)   3.5.   [C-4]\nstatic bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {\n  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {\n    return true;\n  }\n\n  UnicodeSpanSkipper m(u, werd);\n  int num_segments = 0;\n  unsigned pos = 0;\n  while (pos < werd->length() && num_segments < 3) {\n    auto numeral_start = m.SkipPunc(pos);\n    if (numeral_start > pos + 1) {\n      break;\n    }\n    auto numeral_end = m.SkipRomans(numeral_start);\n    if (numeral_end == numeral_start) {\n      numeral_end = m.SkipDigits(numeral_start);\n      if (numeral_end == numeral_start) {\n        // If there's a single latin letter, we can use that.\n        numeral_end = m.SkipAlpha(numeral_start);\n        if (numeral_end - numeral_start != 1) {\n          break;\n        }\n      }\n    }\n    // We got some sort of numeral.\n    num_segments++;\n    // Skip any trailing punctuation.\n    pos = m.SkipPunc(numeral_end);\n    if (pos == numeral_end) {\n      break;\n    }\n  }\n  return pos == werd->length();\n}\n\ntemplate<class T>\nvoid push_back_new(std::vector<T> &vector, const T &data) {\n  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {\n    vector.push_back(data);\n  }\n}\n\n// ========= Brain Dead Language Model (combined entry points) ================\n\n// Given the leftmost word of a 
line either as a Tesseract unicharset + werd\n// or a utf8 string, set the following attributes for it:\n//   is_list -      this word might be a list number or bullet.\n//   starts_idea -  this word is likely to start a sentence.\n//   ends_idea -    this word is likely to end a sentence.\nvoid LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,\n                        bool *is_list, bool *starts_idea, bool *ends_idea) {\n  *is_list = false;\n  *starts_idea = false;\n  *ends_idea = false;\n  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty\n    *ends_idea = true;\n    return;\n  }\n\n  if (unicharset && werd) { // We have a proper werd and unicharset so use it.\n    if (UniLikelyListItem(unicharset, werd)) {\n      *is_list = true;\n      *starts_idea = true;\n      *ends_idea = true;\n    }\n    if (unicharset->get_isupper(werd->unichar_id(0))) {\n      *starts_idea = true;\n    }\n    if (unicharset->get_ispunctuation(werd->unichar_id(0))) {\n      *starts_idea = true;\n      *ends_idea = true;\n    }\n  } else { // Assume utf8 is mostly ASCII\n    if (AsciiLikelyListItem(utf8)) {\n      *is_list = true;\n      *starts_idea = true;\n    }\n    int start_letter = utf8[0];\n    if (IsOpeningPunct(start_letter)) {\n      *starts_idea = true;\n    }\n    if (IsTerminalPunct(start_letter)) {\n      *ends_idea = true;\n    }\n    if (start_letter >= 'A' && start_letter <= 'Z') {\n      *starts_idea = true;\n    }\n  }\n}\n\n// Given the rightmost word of a line either as a Tesseract unicharset + werd\n// or a utf8 string, set the following attributes for it:\n//   is_list -      this word might be a list number or bullet.\n//   starts_idea -  this word is likely to start a sentence.\n//   ends_idea -    this word is likely to end a sentence.\nvoid RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,\n                         bool *is_list, bool 
*starts_idea, bool *ends_idea) {\n  *is_list = false;\n  *starts_idea = false;\n  *ends_idea = false;\n  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty\n    *ends_idea = true;\n    return;\n  }\n\n  if (unicharset && werd) { // We have a proper werd and unicharset so use it.\n    if (UniLikelyListItem(unicharset, werd)) {\n      *is_list = true;\n      *starts_idea = true;\n    }\n    UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);\n    if (unicharset->get_ispunctuation(last_letter)) {\n      *ends_idea = true;\n    }\n  } else { // Assume utf8 is mostly ASCII\n    if (AsciiLikelyListItem(utf8)) {\n      *is_list = true;\n      *starts_idea = true;\n    }\n    int last_letter = utf8[utf8.size() - 1];\n    if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {\n      *ends_idea = true;\n    }\n  }\n}\n\n// =============== Implementation of RowScratchRegisters =====================\n/* static */\nvoid RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {\n  header.emplace_back(\"[lmarg,lind;rind,rmarg]\");\n  header.emplace_back(\"model\");\n}\n\nvoid RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,\n                                          std::vector<std::string> &dbg) const {\n  char s[60];\n  // The largest (positive and negative) numbers are reported for lindent & rindent.\n  // While the column header has widths 5,4,4,5, it is therefore opportune to slightly\n  // offset the widths in the format string here to allow ample space for lindent & rindent\n  // while keeping the final table output nicely readable: 4,5,5,4.\n  snprintf(s, sizeof(s), \"[%4d,%5d;%5d,%4d]\", lmargin_, lindent_, rindent_, rmargin_);\n  dbg.emplace_back(s);\n  std::string model_string;\n  model_string += static_cast<char>(GetLineType());\n  model_string += \":\";\n\n  int model_numbers = 0;\n  for (const auto &hypothese : hypotheses_) {\n    if (hypothese.model == nullptr) {\n      continue;\n    
}\n    if (model_numbers > 0) {\n      model_string += \",\";\n    }\n    if (StrongModel(hypothese.model)) {\n      model_string += std::to_string(1 + theory.IndexOf(hypothese.model));\n    } else if (hypothese.model == kCrownLeft) {\n      model_string += \"CrL\";\n    } else if (hypothese.model == kCrownRight) {\n      model_string += \"CrR\";\n    }\n    model_numbers++;\n  }\n  if (model_numbers == 0) {\n    model_string += \"0\";\n  }\n\n  dbg.push_back(model_string);\n}\n\nvoid RowScratchRegisters::Init(const RowInfo &row) {\n  ri_ = &row;\n  lmargin_ = 0;\n  lindent_ = row.pix_ldistance;\n  rmargin_ = 0;\n  rindent_ = row.pix_rdistance;\n}\n\nLineType RowScratchRegisters::GetLineType() const {\n  if (hypotheses_.empty()) {\n    return LT_UNKNOWN;\n  }\n  bool has_start = false;\n  bool has_body = false;\n  for (const auto &hypothese : hypotheses_) {\n    switch (hypothese.ty) {\n      case LT_START:\n        has_start = true;\n        break;\n      case LT_BODY:\n        has_body = true;\n        break;\n      default:\n        tprintf(\"Encountered bad value in hypothesis list: %c\\n\", hypothese.ty);\n        break;\n    }\n  }\n  if (has_start && has_body) {\n    return LT_MULTIPLE;\n  }\n  return has_start ? LT_START : LT_BODY;\n}\n\nLineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {\n  if (hypotheses_.empty()) {\n    return LT_UNKNOWN;\n  }\n  bool has_start = false;\n  bool has_body = false;\n  for (const auto &hypothese : hypotheses_) {\n    if (hypothese.model != model) {\n      continue;\n    }\n    switch (hypothese.ty) {\n      case LT_START:\n        has_start = true;\n        break;\n      case LT_BODY:\n        has_body = true;\n        break;\n      default:\n        tprintf(\"Encountered bad value in hypothesis list: %c\\n\", hypothese.ty);\n        break;\n    }\n  }\n  if (has_start && has_body) {\n    return LT_MULTIPLE;\n  }\n  return has_start ? 
LT_START : LT_BODY;\n}\n\nvoid RowScratchRegisters::SetStartLine() {\n  LineType current_lt = GetLineType();\n  if (current_lt != LT_UNKNOWN && current_lt != LT_START) {\n    tprintf(\"Trying to set a line to be START when it's already BODY.\\n\");\n  }\n  if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {\n    push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));\n  }\n}\n\nvoid RowScratchRegisters::SetBodyLine() {\n  LineType current_lt = GetLineType();\n  if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {\n    tprintf(\"Trying to set a line to be BODY when it's already START.\\n\");\n  }\n  if (current_lt == LT_UNKNOWN || current_lt == LT_START) {\n    push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));\n  }\n}\n\nvoid RowScratchRegisters::AddStartLine(const ParagraphModel *model) {\n  push_back_new(hypotheses_, LineHypothesis(LT_START, model));\n  auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));\n  if (found != hypotheses_.end()) {\n    hypotheses_.erase(found);\n  }\n}\n\nvoid RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {\n  push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));\n  auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));\n  if (found != hypotheses_.end()) {\n    hypotheses_.erase(found);\n  }\n}\n\nvoid RowScratchRegisters::StartHypotheses(SetOfModels *models) const {\n  for (const auto &hypothese : hypotheses_) {\n    if (hypothese.ty == LT_START && StrongModel(hypothese.model)) {\n      push_back_new(*models, hypothese.model);\n    }\n  }\n}\n\nvoid RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {\n  for (const auto &hypothese : hypotheses_) {\n    if (StrongModel(hypothese.model)) {\n      push_back_new(*models, hypothese.model);\n    }\n  }\n}\n\nvoid RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {\n  for (const auto &hypothese : hypotheses_) {\n    if 
(hypothese.model != nullptr) {\n      push_back_new(*models, hypothese.model);\n    }\n  }\n}\n\nconst ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {\n  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) {\n    return nullptr;\n  }\n  return hypotheses_[0].model;\n}\n\nconst ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {\n  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) {\n    return nullptr;\n  }\n  return hypotheses_[0].model;\n}\n\n// Discard any hypotheses whose model is not in the given list.\nvoid RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models) {\n  if (models.empty()) {\n    return;\n  }\n  for (int h = hypotheses_.size() - 1; h >= 0; h--) {\n    if (!contains(models, hypotheses_[h].model)) {\n      hypotheses_.erase(hypotheses_.begin() + h);\n    }\n  }\n}\n\n// ============ Geometry based Paragraph Detection Algorithm =================\n\nstruct Cluster {\n  Cluster() : center(0), count(0) {}\n  Cluster(int cen, int num) : center(cen), count(num) {}\n\n  int center; // The center of the cluster.\n  int count;  // The number of entries within the cluster.\n};\n\nclass SimpleClusterer {\npublic:\n  explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}\n  void Add(int value) {\n    values_.push_back(value);\n  }\n  size_t size() const {\n    return values_.size();\n  }\n  void GetClusters(std::vector<Cluster> *clusters);\n\nprivate:\n  int max_cluster_width_;\n  std::vector<int> values_;\n};\n\n// Return the index of the cluster closest to value.\nstatic int ClosestCluster(const std::vector<Cluster> &clusters, int value) {\n  unsigned best_index = 0;\n  for (unsigned i = 0; i < clusters.size(); i++) {\n    if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) {\n      best_index = i;\n    }\n  }\n  return best_index;\n}\n\nvoid SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {\n  
clusters->clear();\n  std::sort(values_.begin(), values_.end());\n  for (unsigned i = 0; i < values_.size();) {\n    int orig_i = i;\n    int lo = values_[i];\n    int hi = lo;\n    while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {\n      hi = values_[i];\n    }\n    clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));\n  }\n}\n\n// Calculate left- and right-indent tab stop values seen in\n// rows[row_start, row_end) given a tolerance of tolerance.\nstatic void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,\n                              int tolerance, std::vector<Cluster> *left_tabs,\n                              std::vector<Cluster> *right_tabs) {\n  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {\n    return;\n  }\n  // First pass: toss all left and right indents into clusterers.\n  SimpleClusterer initial_lefts(tolerance);\n  SimpleClusterer initial_rights(tolerance);\n  std::vector<Cluster> initial_left_tabs;\n  std::vector<Cluster> initial_right_tabs;\n  for (int i = row_start; i < row_end; i++) {\n    initial_lefts.Add((*rows)[i].lindent_);\n    initial_rights.Add((*rows)[i].rindent_);\n  }\n  initial_lefts.GetClusters(&initial_left_tabs);\n  initial_rights.GetClusters(&initial_right_tabs);\n\n  // Second pass: cluster only lines that are not \"stray\"\n  //   An example of a stray line is a page number -- a line whose start\n  //   and end tab-stops are far outside the typical start and end tab-stops\n  //   for the block.\n  //   Put another way, we only cluster data from lines whose start or end\n  //   tab stop is frequent.\n  SimpleClusterer lefts(tolerance);\n  SimpleClusterer rights(tolerance);\n\n  // Outlier elimination.  We might want to switch this to test outlier-ness\n  // based on how strange a position an outlier is in instead of or in addition\n  // to how rare it is.  
These outliers get re-added if we end up having too\n  // few tab stops, to work with, however.\n  int infrequent_enough_to_ignore = 0;\n  if (row_end - row_start >= 8) {\n    infrequent_enough_to_ignore = 1;\n  }\n  if (row_end - row_start >= 20) {\n    infrequent_enough_to_ignore = 2;\n  }\n\n  for (int i = row_start; i < row_end; i++) {\n    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);\n    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);\n    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||\n        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {\n      lefts.Add((*rows)[i].lindent_);\n      rights.Add((*rows)[i].rindent_);\n    }\n  }\n  lefts.GetClusters(left_tabs);\n  rights.GetClusters(right_tabs);\n\n  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||\n      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {\n    // One side is really ragged, and the other only has one tab stop,\n    // so those \"insignificant outliers\" are probably important, actually.\n    // This often happens on a page of an index.  
Add back in the ones\n    // we omitted in the first pass.\n    for (int i = row_start; i < row_end; i++) {\n      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);\n      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);\n      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||\n            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {\n        lefts.Add((*rows)[i].lindent_);\n        rights.Add((*rows)[i].rindent_);\n      }\n    }\n  }\n  lefts.GetClusters(left_tabs);\n  rights.GetClusters(right_tabs);\n\n  // If one side is almost a two-indent aligned side, and the other clearly\n  // isn't, try to prune out the least frequent tab stop from that side.\n  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {\n    int to_prune = -1;\n    for (int i = left_tabs->size() - 1; i >= 0; i--) {\n      if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {\n        to_prune = i;\n      }\n    }\n    if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {\n      left_tabs->erase(left_tabs->begin() + to_prune);\n    }\n  }\n  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {\n    int to_prune = -1;\n    for (int i = right_tabs->size() - 1; i >= 0; i--) {\n      if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {\n        to_prune = i;\n      }\n    }\n    if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {\n      right_tabs->erase(right_tabs->begin() + to_prune);\n    }\n  }\n}\n\n// Given a paragraph model mark rows[row_start, row_end) as said model\n// start or body lines.\n//\n// Case 1: model->first_indent_ != model->body_indent_\n//   Differentiating the paragraph start lines from the paragraph body lines in\n//   this case is easy, we just see how far each line is indented.\n//\n// Case 2: model->first_indent_ == model->body_indent_\n//   Here, we find end-of-paragraph lines by 
looking for \"short lines.\"\n//   What constitutes a \"short line\" changes depending on whether the text\n//   ragged-right[left] or fully justified (aligned left and right).\n//\n//   Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)\n//     We have a new paragraph it the first word would have at the end\n//     of the previous line.\n//\n//   Case 2b: Fully Justified.  (eop_threshold > 0)\n//     We mark a line as short (end of paragraph) if the offside indent\n//     is greater than eop_threshold.\nstatic void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,\n                              const ParagraphModel *model, bool ltr, int eop_threshold) {\n  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {\n    return;\n  }\n  for (int row = row_start; row < row_end; row++) {\n    bool valid_first = ValidFirstLine(rows, row, model);\n    bool valid_body = ValidBodyLine(rows, row, model);\n    if (valid_first && !valid_body) {\n      (*rows)[row].AddStartLine(model);\n    } else if (valid_body && !valid_first) {\n      (*rows)[row].AddBodyLine(model);\n    } else if (valid_body && valid_first) {\n      bool after_eop = (row == row_start);\n      if (row > row_start) {\n        if (eop_threshold > 0) {\n          if (model->justification() == JUSTIFICATION_LEFT) {\n            after_eop = (*rows)[row - 1].rindent_ > eop_threshold;\n          } else {\n            after_eop = (*rows)[row - 1].lindent_ > eop_threshold;\n          }\n        } else {\n          after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());\n        }\n      }\n      if (after_eop) {\n        (*rows)[row].AddStartLine(model);\n      } else {\n        (*rows)[row].AddBodyLine(model);\n      }\n    } else {\n      // Do nothing. 
Stray row.\n    }\n  }\n}\n\n// GeometricClassifierState holds all of the information we'll use while\n// trying to determine a paragraph model for the text lines in a block of\n// text:\n//   + the rows under consideration [row_start, row_end)\n//   + the common left- and right-indent tab stops\n//   + does the block start out left-to-right or right-to-left\n// Further, this struct holds the data we amass for the (single) ParagraphModel\n// we'll assign to the text lines (assuming we get that far).\nstruct GeometricClassifierState {\n  GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,\n                           int r_end)\n      : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {\n    tolerance = InterwordSpace(*r, r_start, r_end);\n    CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);\n    if (debug_level >= 3) {\n      tesserr << \"Geometry: TabStop cluster tolerance = \" << tolerance << \"; \"\n              << left_tabs.size() << \" left tabs; \"\n              << right_tabs.size() << \" right tabs\\n\";\n    }\n    ltr = (*r)[r_start].ri_->ltr;\n  }\n\n  void AssumeLeftJustification() {\n    just = tesseract::JUSTIFICATION_LEFT;\n    margin = (*rows)[row_start].lmargin_;\n  }\n\n  void AssumeRightJustification() {\n    just = tesseract::JUSTIFICATION_RIGHT;\n    margin = (*rows)[row_start].rmargin_;\n  }\n\n  // Align tabs are the tab stops the text is aligned to.\n  const std::vector<Cluster> &AlignTabs() const {\n    if (just == tesseract::JUSTIFICATION_RIGHT) {\n      return right_tabs;\n    }\n    return left_tabs;\n  }\n\n  // Offside tabs are the tab stops opposite the tabs used to align the text.\n  //\n  // Note that for a left-to-right text which is aligned to the right such as\n  //     this function comment, the offside tabs are the horizontal tab stops\n  //                 marking the beginning of (\"Note\", \"this\" and \"marking\").\n  const 
std::vector<Cluster> &OffsideTabs() const {\n    if (just == tesseract::JUSTIFICATION_RIGHT) {\n      return left_tabs;\n    }\n    return right_tabs;\n  }\n\n  // Return whether the i'th row extends from the leftmost left tab stop\n  // to the right most right tab stop.\n  bool IsFullRow(int i) const {\n    return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&\n           ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;\n  }\n\n  int AlignsideTabIndex(int row_idx) const {\n    return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));\n  }\n\n  // Given what we know about the paragraph justification (just), would the\n  // first word of row_b have fit at the end of row_a?\n  bool FirstWordWouldHaveFit(int row_a, int row_b) {\n    return ::tesseract::FirstWordWouldHaveFit((*rows)[row_a], (*rows)[row_b], just);\n  }\n\n  void PrintRows() const {\n    PrintRowRange(*rows, row_start, row_end);\n  }\n\n  void Fail(int min_debug_level, const char *why) const {\n    if (debug_level < min_debug_level) {\n      return;\n    }\n    tprintf(\"# %s\\n\", why);\n    PrintRows();\n  }\n\n  ParagraphModel Model() const {\n    return ParagraphModel(just, margin, first_indent, body_indent, tolerance);\n  }\n\n  // We print out messages with a debug level at least as great as debug_level.\n  int debug_level = 0;\n\n  // The Geometric Classifier was asked to find a single paragraph model\n  // to fit the text rows (*rows)[row_start, row_end)\n  std::vector<RowScratchRegisters> *rows;\n  int row_start = 0;\n  int row_end = 0;\n\n  // The amount by which we expect the text edge can vary and still be aligned.\n  int tolerance = 0;\n\n  // Is the script in this text block left-to-right?\n  // HORRIBLE ROUGH APPROXIMATION.  
TODO(eger): Improve\n  bool ltr = false;\n\n  // These left and right tab stops were determined to be the common tab\n  // stops for the given text.\n  std::vector<Cluster> left_tabs;\n  std::vector<Cluster> right_tabs;\n\n  // These are parameters we must determine to create a ParagraphModel.\n  tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;\n  int margin = 0;\n  int first_indent = 0;\n  int body_indent = 0;\n\n  // eop_threshold > 0 if the text is fully justified.  See MarkRowsWithModel()\n  int eop_threshold = 0;\n};\n\n// Given a section of text where strong textual clues did not help identifying\n// paragraph breaks, and for which the left and right indents have exactly\n// three tab stops between them, attempt to find the paragraph breaks based\n// solely on the outline of the text and whether the script is left-to-right.\n//\n// Algorithm Detail:\n//   The selected rows are in the form of a rectangle except\n//   for some number of \"short lines\" of the same length:\n//\n//   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx\n//           xxxxxxxxxxx       xxxxxxxxxx    # A \"short\" line.\n//         xxxxxxxxxxxxx       xxxxxxxxxxxx\n//         xxxxxxxxxxxxx       xxxxxxxxxxxx\n//\n//   We have a slightly different situation if the only short\n//   line is at the end of the excerpt.\n//\n//   (A2) xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx\n//        xxxxxxxxxxxxx       xxxxxxxxxxxx\n//        xxxxxxxxxxxxx       xxxxxxxxxxxx\n//          xxxxxxxxxxx       xxxxxxxxxx     # A \"short\" line.\n//\n//   We'll interpret these as follows based on the reasoning in the comment for\n//   GeometricClassify():\n//       [script direction: first indent, body indent]\n//   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0\n//   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0\nstatic void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,\n                                                   ParagraphTheory *theory) {\n  int num_rows = 
s.row_end - s.row_start;\n  int num_full_rows = 0;\n  int last_row_full = 0;\n  for (int i = s.row_start; i < s.row_end; i++) {\n    if (s.IsFullRow(i)) {\n      num_full_rows++;\n      if (i == s.row_end - 1) {\n        last_row_full++;\n      }\n    }\n  }\n\n  if (num_full_rows < 0.7 * num_rows) {\n    s.Fail(1, \"Not enough full lines to know which lines start paras.\");\n    return;\n  }\n\n  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()\n  s.eop_threshold = 0;\n\n  if (s.ltr) {\n    s.AssumeLeftJustification();\n  } else {\n    s.AssumeRightJustification();\n  }\n\n  if (debug_level > 0) {\n    tprintf(\n        \"# Not enough variety for clear outline classification. \"\n        \"Guessing these are %s aligned based on script.\\n\",\n        s.ltr ? \"left\" : \"right\");\n    s.PrintRows();\n  }\n\n  if (s.AlignTabs().size() == 2) { // case A1 or A2\n    s.first_indent = s.AlignTabs()[1].center;\n    s.body_indent = s.AlignTabs()[0].center;\n  } else { // case B1 or B2\n    if (num_rows - 1 == num_full_rows - last_row_full) {\n      // case B2\n      const ParagraphModel *model = s.ltr ? 
kCrownLeft : kCrownRight;\n      (*s.rows)[s.row_start].AddStartLine(model);\n      for (int i = s.row_start + 1; i < s.row_end; i++) {\n        (*s.rows)[i].AddBodyLine(model);\n      }\n      return;\n    } else {\n      // case B1\n      s.first_indent = s.body_indent = s.AlignTabs()[0].center;\n      s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;\n    }\n  }\n  const ParagraphModel *model = theory->AddModel(s.Model());\n  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);\n  return;\n}\n\n// This function is called if strong textual clues were not available, but\n// the caller hopes that the paragraph breaks will be super obvious just\n// by the outline of the text.\n//\n// The particularly difficult case is figuring out what's going on if you\n// don't have enough short paragraph end lines to tell us what's going on.\n//\n// For instance, let's say you have the following outline:\n//\n//   (A1)  xxxxxxxxxxxxxxxxxxxxxx\n//           xxxxxxxxxxxxxxxxxxxx\n//         xxxxxxxxxxxxxxxxxxxxxx\n//         xxxxxxxxxxxxxxxxxxxxxx\n//\n// Even if we know that the text is left-to-right and so will probably be\n// left-aligned, both of the following are possible texts:\n//\n//  (A1a)  1. Here our list item\n//           with two full lines.\n//         2. Here a second item.\n//         3. Here our third one.\n//\n//  (A1b)  so ends paragraph one.\n//           Here  starts another\n//         paragraph  we want  to\n//         read.  This  continues\n//\n// These examples are obvious from the text and should have been caught\n// by the StrongEvidenceClassify pass.  However, for languages where we don't\n// have capital letters to go on (e.g. 
Hebrew, Arabic, Hindi, Chinese),\n// it's worth guessing that (A1b) is the correct interpretation if there are\n// far more \"full\" lines than \"short\" lines.\nstatic void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,\n                              int row_start, int row_end, ParagraphTheory *theory) {\n  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {\n    return;\n  }\n  if (debug_level > 1) {\n    tprintf(\"###############################################\\n\");\n    tprintf(\"##### GeometricClassify( rows[%d:%d) )   ####\\n\", row_start, row_end);\n    tprintf(\"###############################################\\n\");\n  }\n  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);\n\n  GeometricClassifierState s(debug_level, rows, row_start, row_end);\n  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {\n    s.Fail(2, \"Too much variety for simple outline classification.\");\n    return;\n  }\n  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {\n    s.Fail(1, \"Not enough variety for simple outline classification.\");\n    return;\n  }\n  if (s.left_tabs.size() + s.right_tabs.size() == 3) {\n    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);\n    return;\n  }\n\n  // At this point, we know that one side has at least two tab stops, and the\n  // other side has one or two tab stops.\n  // Left to determine:\n  //   (1) Which is the body indent and which is the first line indent?\n  //   (2) Is the text fully justified?\n\n  // If one side happens to have three or more tab stops, assume that side\n  // is opposite of the aligned side.\n  if (s.right_tabs.size() > 2) {\n    s.AssumeLeftJustification();\n  } else if (s.left_tabs.size() > 2) {\n    s.AssumeRightJustification();\n  } else if (s.ltr) { // guess based on script direction\n    s.AssumeLeftJustification();\n  } else {\n    s.AssumeRightJustification();\n  }\n\n  if (s.AlignTabs().size() == 2) {\n    // For 
each tab stop on the aligned side, how many of them appear\n    // to be paragraph start lines?  [first lines]\n    int firsts[2] = {0, 0};\n    // Count the first line as a likely paragraph start line.\n    firsts[s.AlignsideTabIndex(s.row_start)]++;\n    // For each line, if the first word would have fit on the previous\n    // line count it as a likely paragraph start line.\n    bool jam_packed = true;\n    for (int i = s.row_start + 1; i < s.row_end; i++) {\n      if (s.FirstWordWouldHaveFit(i - 1, i)) {\n        firsts[s.AlignsideTabIndex(i)]++;\n        jam_packed = false;\n      }\n    }\n    // Make an extra accounting for the last line of the paragraph just\n    // in case it's the only short line in the block.  That is, take its\n    // first word as typical and see if this looks like the *last* line\n    // of a paragraph.  If so, mark the *other* indent as probably a first.\n    if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {\n      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;\n    }\n\n    int percent0firsts, percent1firsts;\n    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;\n    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;\n\n    // TODO(eger): Tune these constants if necessary.\n    if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {\n      s.first_indent = s.AlignTabs()[1].center;\n      s.body_indent = s.AlignTabs()[0].center;\n    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||\n               percent1firsts + 30 < percent0firsts) {\n      s.first_indent = s.AlignTabs()[0].center;\n      s.body_indent = s.AlignTabs()[1].center;\n    } else {\n      // Ambiguous! Probably lineated (poetry)\n      if (debug_level > 1) {\n        tprintf(\"# Cannot determine %s indent likely to start paragraphs.\\n\",\n                s.just == tesseract::JUSTIFICATION_LEFT ? 
\"left\" : \"right\");\n        tprintf(\"# Indent of %d looks like a first line %d%% of the time.\\n\",\n                s.AlignTabs()[0].center, percent0firsts);\n        tprintf(\"# Indent of %d looks like a first line %d%% of the time.\\n\",\n                s.AlignTabs()[1].center, percent1firsts);\n        s.PrintRows();\n      }\n      return;\n    }\n  } else {\n    // There's only one tab stop for the \"aligned to\" side.\n    s.first_indent = s.body_indent = s.AlignTabs()[0].center;\n  }\n\n  // At this point, we have our model.\n  const ParagraphModel *model = theory->AddModel(s.Model());\n\n  // Now all we have to do is figure out if the text is fully justified or not.\n  // eop_threshold: default to fully justified unless we see evidence below.\n  //    See description on MarkRowsWithModel()\n  s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;\n  // If the text is not fully justified, re-set the eop_threshold to 0.\n  if (s.AlignTabs().size() == 2) {\n    // Paragraphs with a paragraph-start indent.\n    for (int i = s.row_start; i < s.row_end - 1; i++) {\n      if (ValidFirstLine(s.rows, i + 1, model) &&\n          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),\n                       s.tolerance)) {\n        // We found a non-end-of-paragraph short line: not fully justified.\n        s.eop_threshold = 0;\n        break;\n      }\n    }\n  } else {\n    // Paragraphs with no paragraph-start indent.\n    for (int i = s.row_start; i < s.row_end - 1; i++) {\n      if (!s.FirstWordWouldHaveFit(i, i + 1) &&\n          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),\n                       s.tolerance)) {\n        // We found a non-end-of-paragraph short line: not fully justified.\n        s.eop_threshold = 0;\n        break;\n      }\n    }\n  }\n  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);\n}\n\n// =============== Implementation of 
ParagraphTheory =====================\n\nconst ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {\n  for (const auto &m : *models_) {\n    if (m->Comparable(model)) {\n      return m;\n    }\n  }\n  auto *m = new ParagraphModel(model);\n  models_->push_back(m);\n  push_back_new(models_we_added_, m);\n  return m;\n}\n\nvoid ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {\n  size_t w = 0;\n  for (size_t r = 0; r < models_->size(); r++) {\n    ParagraphModel *m = (*models_)[r];\n    if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {\n      delete m;\n    } else {\n      if (r > w) {\n        (*models_)[w] = m;\n      }\n      w++;\n    }\n  }\n  models_->resize(w);\n}\n\n// Examine rows[start, end) and try to determine if an existing non-centered\n// paragraph model would fit them perfectly.  If so, return a pointer to it.\n// If not, return nullptr.\nconst ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,\n                                            int start, int end) const {\n  for (const auto *model : *models_) {\n    if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) {\n      return model;\n    }\n  }\n  return nullptr;\n}\n\nvoid ParagraphTheory::NonCenteredModels(SetOfModels *models) {\n  for (const auto *model : *models_) {\n    if (model->justification() != JUSTIFICATION_CENTER) {\n      push_back_new(*models, model);\n    }\n  }\n}\n\nint ParagraphTheory::IndexOf(const ParagraphModel *model) const {\n  int i = 0;\n  for (const auto *m : *models_) {\n    if (m == model) {\n      return i;\n    }\n    i++;\n  }\n  return -1;\n}\n\nbool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,\n                    const ParagraphModel *model) {\n  if (!StrongModel(model)) {\n    tprintf(\"ValidFirstLine() should only be called with strong models!\\n\");\n  }\n  return 
StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_,\n                                                     (*rows)[row].rindent_, (*rows)[row].rmargin_);\n}\n\nbool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,\n                   const ParagraphModel *model) {\n  if (!StrongModel(model)) {\n    tprintf(\"ValidBodyLine() should only be called with strong models!\\n\");\n  }\n  return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_,\n                                                    (*rows)[row].rindent_, (*rows)[row].rmargin_);\n}\n\nbool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,\n                     const ParagraphModel *model) {\n  if (model != kCrownRight && model != kCrownLeft) {\n    tprintf(\"CrownCompatible() should only be called with crown models!\\n\");\n    return false;\n  }\n  auto &row_a = (*rows)[a];\n  auto &row_b = (*rows)[b];\n  if (model == kCrownRight) {\n    return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,\n                       Epsilon(row_a.ri_->average_interword_space));\n  }\n  return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,\n                     Epsilon(row_a.ri_->average_interword_space));\n}\n\n// =============== Implementation of ParagraphModelSmearer ====================\n\nParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,\n                                             int row_start, int row_end, ParagraphTheory *theory)\n    : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {\n  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {\n    row_start_ = 0;\n    row_end_ = 0;\n    return;\n  }\n  open_models_.resize(open_models_.size() + row_end - row_start + 2);\n}\n\n// see paragraphs_internal.h\nvoid ParagraphModelSmearer::CalculateOpenModels(int row_start, 
int row_end) {\n  SetOfModels no_models;\n  if (row_start < row_start_) {\n    row_start = row_start_;\n  }\n  if (row_end > row_end_) {\n    row_end = row_end_;\n  }\n\n  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {\n    if ((*rows_)[row].ri_->num_words == 0) {\n      OpenModels(row + 1) = no_models;\n    } else {\n      SetOfModels &opened = OpenModels(row);\n      (*rows_)[row].StartHypotheses(&opened);\n\n      // Which models survive the transition from row to row + 1?\n      SetOfModels still_open;\n      for (auto &m : opened) {\n        if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {\n          // This is basic filtering; we check likely paragraph starty-ness down\n          // below in Smear() -- you know, whether the first word would have fit\n          // and such.\n          push_back_new(still_open, m);\n        }\n      }\n      OpenModels(row + 1) = std::move(still_open);\n    }\n  }\n}\n\n// see paragraphs_internal.h\nvoid ParagraphModelSmearer::Smear() {\n  CalculateOpenModels(row_start_, row_end_);\n\n  // For each row which we're unsure about (that is, it is LT_UNKNOWN or\n  // we have multiple LT_START hypotheses), see if there's a model that\n  // was recently used (an \"open\" model) which might model it well.\n  for (int i = row_start_; i < row_end_; i++) {\n    RowScratchRegisters &row = (*rows_)[i];\n    if (row.ri_->num_words == 0) {\n      continue;\n    }\n\n    // Step One:\n    //   Figure out if there are \"open\" models which are left-alined or\n    //   right-aligned.  
This is important for determining whether the\n    //   \"first\" word in a row would fit at the \"end\" of the previous row.\n    bool left_align_open = false;\n    bool right_align_open = false;\n    for (auto &m : OpenModels(i)) {\n      switch (m->justification()) {\n        case JUSTIFICATION_LEFT:\n          left_align_open = true;\n          break;\n        case JUSTIFICATION_RIGHT:\n          right_align_open = true;\n          break;\n        default:\n          left_align_open = right_align_open = true;\n      }\n    }\n    // Step Two:\n    //   Use that knowledge to figure out if this row is likely to\n    //   start a paragraph.\n    bool likely_start;\n    if (i == 0) {\n      likely_start = true;\n    } else {\n      if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {\n        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||\n                       LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);\n      } else if (left_align_open) {\n        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);\n      } else {\n        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);\n      }\n    }\n\n    // Step Three:\n    //   If this text line seems like an obvious first line of an\n    //   open model, or an obvious continuation of an existing\n    //   modelled paragraph, mark it up.\n    if (likely_start) {\n      // Add Start Hypotheses for all Open models that fit.\n      for (unsigned m = 0; m < OpenModels(i).size(); m++) {\n        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {\n          row.AddStartLine(OpenModels(i)[m]);\n        }\n      }\n    } else {\n      // Add relevant body line hypotheses.\n      SetOfModels last_line_models;\n      if (i > 0) {\n        (*rows_)[i - 1].StrongHypotheses(&last_line_models);\n      } else {\n        theory_->NonCenteredModels(&last_line_models);\n      }\n      for (auto 
model : last_line_models) {\n        if (ValidBodyLine(rows_, i, model)) {\n          row.AddBodyLine(model);\n        }\n      }\n    }\n\n    // Step Four:\n    //   If we're still quite unsure about this line, go through all\n    //   models in our theory and see if this row could be the start\n    //   of any of our  models.\n    if (row.GetLineType() == LT_UNKNOWN ||\n        (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {\n      SetOfModels all_models;\n      theory_->NonCenteredModels(&all_models);\n      for (auto &all_model : all_models) {\n        if (ValidFirstLine(rows_, i, all_model)) {\n          row.AddStartLine(all_model);\n        }\n      }\n    }\n    // Step Five:\n    //   Since we may have updated the hypotheses about this row, we need\n    //   to recalculate the Open models for the rest of rows[i + 1, row_end)\n    if (row.GetLineType() != LT_UNKNOWN) {\n      CalculateOpenModels(i + 1, row_end_);\n    }\n  }\n}\n\n// ================ Main Paragraph Detection Algorithm =======================\n\n// Find out what ParagraphModels are actually used, and discard any\n// that are not.\nstatic void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,\n                                ParagraphTheory *theory) {\n  SetOfModels used_models;\n  for (const auto &row : rows) {\n    row.StrongHypotheses(&used_models);\n  }\n  theory->DiscardUnusedModels(used_models);\n}\n\n// DowngradeWeakestToCrowns:\n//   Forget any flush-{left, right} models unless we see two or more\n//   of them in sequence.\n//\n// In pass 3, we start to classify even flush-left paragraphs (paragraphs\n// where the first line and body indent are the same) as having proper Models.\n// This is generally dangerous, since if you start imagining that flush-left\n// is a typical paragraph model when it is not, it will lead you to chop normal\n// indented paragraphs in the middle whenever a sentence happens to start on a\n// new line (see \"This\" above).  
What to do?\n//   What we do is to take any paragraph which is flush left and is not\n// preceded by another paragraph of the same model and convert it to a \"Crown\"\n// paragraph.  This is a weak pseudo-ParagraphModel which is a placeholder\n// for later.  It means that the paragraph is flush, but it would be desirable\n// to mark it as the same model as following text if it fits.  This downgrade\n// FlushLeft -> CrownLeft -> Model of following paragraph.  Means that we\n// avoid making flush left Paragraph Models whenever we see a top-of-the-page\n// half-of-a-paragraph. and instead we mark it the same as normal body text.\n//\n// Implementation:\n//\n//   Comb backwards through the row scratch registers, and turn any\n//   sequences of body lines of equivalent type abutted against the beginning\n//   or a body or start line of a different type into a crown paragraph.\nstatic void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,\n                                     std::vector<RowScratchRegisters> *rows) {\n  int start;\n  for (int end = rows->size(); end > 0; end = start) {\n    // Search back for a body line of a unique type.\n    const ParagraphModel *model = nullptr;\n    while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {\n      end--;\n    }\n    if (end == 0) {\n      break;\n    }\n    start = end - 1;\n    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {\n      start--; // walk back to the first line that is not the same body type.\n    }\n    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&\n        NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {\n      start--;\n    }\n    start++;\n    // Now rows[start, end) is a sequence of unique body hypotheses of model.\n    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {\n      continue;\n    }\n    if (!StrongModel(model)) {\n      while 
(start > 0 && CrownCompatible(rows, start - 1, start, model)) {\n        start--;\n      }\n    }\n    if (start == 0 || (!StrongModel(model)) ||\n        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {\n      // crownify rows[start, end)\n      const ParagraphModel *crown_model = model;\n      if (StrongModel(model)) {\n        if (model->justification() == JUSTIFICATION_LEFT) {\n          crown_model = kCrownLeft;\n        } else {\n          crown_model = kCrownRight;\n        }\n      }\n      (*rows)[start].SetUnknown();\n      (*rows)[start].AddStartLine(crown_model);\n      for (int row = start + 1; row < end; row++) {\n        (*rows)[row].SetUnknown();\n        (*rows)[row].AddBodyLine(crown_model);\n      }\n    }\n  }\n  DiscardUnusedModels(*rows, theory);\n}\n\n// Clear all hypotheses about lines [start, end) and reset margins.\n//\n// The empty space between the left of a row and the block boundary (and\n// similarly for the right) is split into two pieces: margin and indent.\n// In initial processing, we assume the block is tight and the margin for\n// all lines is set to zero.   However, if our first pass does not yield\n// models for  everything,  it may be  due to an  inset paragraph like a\n// block-quote.   In that case, we make a second pass over that unmarked\n// section of the page and reset the \"margin\" portion of the empty space\n// to the common amount of space at  the ends of the lines under consid-\n// eration.    This would be equivalent to percentile set to 0. However,\n// sometimes we have a single character sticking out in the right margin\n// of a text block  (like the 'r' in 'for' on line 3 above),  and we can\n// really  just ignore it as an outlier.   
To express this, we allow the\n// user to specify  the percentile (0..100)  of indent values  to use as\n// the common margin for each row in the run of rows[start, end).\nvoid RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,\n                                        int end, int percentile) {\n  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {\n    return;\n  }\n\n  int lmin, lmax, rmin, rmax;\n  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;\n  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;\n  for (int i = start; i < end; i++) {\n    RowScratchRegisters &sr = (*rows)[i];\n    sr.SetUnknown();\n    if (sr.ri_->num_words == 0) {\n      continue;\n    }\n    UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);\n    UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);\n  }\n  STATS lefts(lmin, lmax);\n  STATS rights(rmin, rmax);\n  for (int i = start; i < end; i++) {\n    RowScratchRegisters &sr = (*rows)[i];\n    if (sr.ri_->num_words == 0) {\n      continue;\n    }\n    lefts.add(sr.lmargin_ + sr.lindent_, 1);\n    rights.add(sr.rmargin_ + sr.rindent_, 1);\n  }\n  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);\n  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);\n  for (int i = start; i < end; i++) {\n    RowScratchRegisters &sr = (*rows)[i];\n    int ldelta = ignorable_left - sr.lmargin_;\n    sr.lmargin_ += ldelta;\n    sr.lindent_ -= ldelta;\n    int rdelta = ignorable_right - sr.rmargin_;\n    sr.rmargin_ += rdelta;\n    sr.rindent_ -= rdelta;\n  }\n}\n\n// Return the median inter-word space in rows[row_start, row_end).\nint InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {\n  if (row_end < row_start + 1) {\n    return 1;\n  }\n  int word_height =\n      (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;\n  int word_width =\n      
(rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;\n  STATS spacing_widths(0, 4 + word_width);\n  for (int i = row_start; i < row_end; i++) {\n    if (rows[i].ri_->num_words > 1) {\n      spacing_widths.add(rows[i].ri_->average_interword_space, 1);\n    }\n  }\n  int minimum_reasonable_space = word_height / 3;\n  if (minimum_reasonable_space < 2) {\n    minimum_reasonable_space = 2;\n  }\n  int median = spacing_widths.median();\n  return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;\n}\n\n// Return whether the first word on the after line can fit in the space at\n// the end of the before line (knowing which way the text is aligned and read).\nbool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,\n                           tesseract::ParagraphJustification justification) {\n  if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {\n    return true;\n  }\n\n  if (justification == JUSTIFICATION_UNKNOWN) {\n    tprintf(\"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\\n\");\n  }\n  int available_space;\n  if (justification == JUSTIFICATION_CENTER) {\n    available_space = before.lindent_ + before.rindent_;\n  } else {\n    available_space = before.OffsideIndent(justification);\n  }\n  available_space -= before.ri_->average_interword_space;\n\n  if (before.ri_->ltr) {\n    return after.ri_->lword_box.width() < available_space;\n  }\n  return after.ri_->rword_box.width() < available_space;\n}\n\n// Return whether the first word on the after line can fit in the space at\n// the end of the before line (not knowing which way the text goes) in a left\n// or right alignment.\nbool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) {\n  if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {\n    return true;\n  }\n\n  int available_space = before.lindent_;\n  if (before.rindent_ > available_space) {\n 
   available_space = before.rindent_;\n  }\n  available_space -= before.ri_->average_interword_space;\n\n  if (before.ri_->ltr) {\n    return after.ri_->lword_box.width() < available_space;\n  }\n  return after.ri_->rword_box.width() < available_space;\n}\n\nstatic bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) {\n  if (before.ri_->ltr) {\n    return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;\n  } else {\n    return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;\n  }\n}\n\nstatic bool LikelyParagraphStart(const RowScratchRegisters &before,\n                                 const RowScratchRegisters &after,\n                                 tesseract::ParagraphJustification j) {\n  return before.ri_->num_words == 0 ||\n         (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after));\n}\n\n// Examine rows[start, end) and try to determine what sort of ParagraphModel\n// would fit them as a single paragraph.\n// If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.\n// If the rows given could be a consistent start to a paragraph, set *consistent\n// true.\nstatic ParagraphModel InternalParagraphModelByOutline(\n    const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,\n    bool *consistent) {\n  int ltr_line_count = 0;\n  for (int i = start; i < end; i++) {\n    ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);\n  }\n  bool ltr = (ltr_line_count >= (end - start) / 2);\n\n  *consistent = true;\n  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {\n    return ParagraphModel();\n  }\n\n  // Ensure the caller only passed us a region with a common rmargin and\n  // lmargin.\n  int lmargin = (*rows)[start].lmargin_;\n  int rmargin = (*rows)[start].rmargin_;\n  int lmin, lmax, rmin, rmax, cmin, cmax;\n  lmin = lmax = (*rows)[start + 1].lindent_;\n  rmin = rmax = (*rows)[start + 
1].rindent_;\n  cmin = cmax = 0;\n  for (int i = start + 1; i < end; i++) {\n    if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {\n      tprintf(\"Margins don't match! Software error.\\n\");\n      *consistent = false;\n      return ParagraphModel();\n    }\n    UpdateRange((*rows)[i].lindent_, &lmin, &lmax);\n    UpdateRange((*rows)[i].rindent_, &rmin, &rmax);\n    UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);\n  }\n  int ldiff = lmax - lmin;\n  int rdiff = rmax - rmin;\n  int cdiff = cmax - cmin;\n  if (rdiff > tolerance && ldiff > tolerance) {\n    if (cdiff < tolerance * 2) {\n      if (end - start < 3) {\n        return ParagraphModel();\n      }\n      return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);\n    }\n    *consistent = false;\n    return ParagraphModel();\n  }\n  if (end - start < 3) { // Don't return a model for two line paras.\n    return ParagraphModel();\n  }\n\n  // These booleans keep us from saying something is aligned left when the body\n  // left variance is too large.\n  bool body_admits_left_alignment = ldiff < tolerance;\n  bool body_admits_right_alignment = rdiff < tolerance;\n\n  ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,\n                                             (lmin + lmax) / 2, tolerance);\n  ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,\n                                              (rmin + rmax) / 2, tolerance);\n\n  // These booleans keep us from having an indent on the \"wrong side\" for the\n  // first line.\n  bool text_admits_left_alignment = ltr || left_model.is_flush();\n  bool text_admits_right_alignment = !ltr || right_model.is_flush();\n\n  // At least one of the edges is less than tolerance in variance.\n  // If the other is obviously ragged, it can't be the one aligned to.\n  // [Note the last line is included in this raggedness.]\n  if (tolerance < 
rdiff) {\n    if (body_admits_left_alignment && text_admits_left_alignment) {\n      return left_model;\n    }\n    *consistent = false;\n    return ParagraphModel();\n  }\n  if (tolerance < ldiff) {\n    if (body_admits_right_alignment && text_admits_right_alignment) {\n      return right_model;\n    }\n    *consistent = false;\n    return ParagraphModel();\n  }\n\n  // At this point, we know the body text doesn't vary much on either side.\n\n  // If the first line juts out oddly in one direction or the other,\n  // that likely indicates the side aligned to.\n  int first_left = (*rows)[start].lindent_;\n  int first_right = (*rows)[start].rindent_;\n\n  if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {\n    return left_model;\n  }\n  if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {\n    return right_model;\n  }\n\n  *consistent = false;\n  return ParagraphModel();\n}\n\n// Examine rows[start, end) and try to determine what sort of ParagraphModel\n// would fit them as a single paragraph.   
If nothing fits,\n// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug\n// output if we're debugging.\nstatic ParagraphModel ParagraphModelByOutline(int debug_level,\n                                              const std::vector<RowScratchRegisters> *rows,\n                                              int start, int end, int tolerance) {\n  bool unused_consistent;\n  ParagraphModel retval =\n      InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);\n  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {\n    tprintf(\"Could not determine a model for this paragraph:\\n\");\n    PrintRowRange(*rows, start, end);\n  }\n  return retval;\n}\n\n// Do rows[start, end) form a single instance of the given paragraph model?\nbool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,\n                  const ParagraphModel *model) {\n  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {\n    return false;\n  }\n  if (!ValidFirstLine(rows, start, model)) {\n    return false;\n  }\n  for (int i = start + 1; i < end; i++) {\n    if (!ValidBodyLine(rows, i, model)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Examine rows[row_start, row_end) as an independent section of text,\n// and mark rows that are exceptionally clear as start-of-paragraph\n// and paragraph-body lines.\n//\n// We presume that any lines surrounding rows[row_start, row_end) may\n// have wildly different paragraph models, so we don't key any data off\n// of those lines.\n//\n// We only take the very strongest signals, as we don't want to get\n// confused and marking up centered text, poetry, or source code as\n// clearly part of a typical paragraph.\nstatic void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,\n                               int row_end) {\n  // Record patently obvious body text.\n  for (int i = row_start + 1; i < row_end; i++) {\n    const 
RowScratchRegisters &prev = (*rows)[i - 1];\n    RowScratchRegisters &curr = (*rows)[i];\n    tesseract::ParagraphJustification typical_justification =\n        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;\n    if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&\n        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {\n      curr.SetBodyLine();\n    }\n  }\n\n  // Record patently obvious start paragraph lines.\n  //\n  // It's an extremely good signal of the start of a paragraph that\n  // the first word would have fit on the end of the previous line.\n  // However, applying just that signal would have us mark random\n  // start lines of lineated text (poetry and source code) and some\n  // centered headings as paragraph start lines.  Therefore, we use\n  // a second qualification for a paragraph start: Not only should\n  // the first word of this line have fit on the previous line,\n  // but also, this line should go full to the right of the block,\n  // disallowing a subsequent word from having fit on this line.\n\n  // First row:\n  {\n    RowScratchRegisters &curr = (*rows)[row_start];\n    RowScratchRegisters &next = (*rows)[row_start + 1];\n    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;\n    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&\n        (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {\n      curr.SetStartLine();\n    }\n  }\n  // Middle rows\n  for (int i = row_start + 1; i < row_end - 1; i++) {\n    RowScratchRegisters &prev = (*rows)[i - 1];\n    RowScratchRegisters &curr = (*rows)[i];\n    RowScratchRegisters &next = (*rows)[i + 1];\n    tesseract::ParagraphJustification j = curr.ri_->ltr ? 
JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;\n    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&\n        LikelyParagraphStart(prev, curr, j)) {\n      curr.SetStartLine();\n    }\n  }\n  // Last row\n  { // the short circuit at the top means we have at least two lines.\n    RowScratchRegisters &prev = (*rows)[row_end - 2];\n    RowScratchRegisters &curr = (*rows)[row_end - 1];\n    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;\n    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&\n        LikelyParagraphStart(prev, curr, j)) {\n      curr.SetStartLine();\n    }\n  }\n}\n\n// Look for sequences of a start line followed by some body lines in\n// rows[row_start, row_end) and create ParagraphModels for them if\n// they seem coherent.\nstatic void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,\n                                int row_start, int row_end, bool allow_flush_models,\n                                ParagraphTheory *theory) {\n  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {\n    return;\n  }\n\n  int start = row_start;\n  while (start < row_end) {\n    while (start < row_end && (*rows)[start].GetLineType() != LT_START) {\n      start++;\n    }\n    if (start >= row_end - 1) {\n      break;\n    }\n\n    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);\n    int end = start;\n    ParagraphModel last_model;\n    bool next_consistent;\n    do {\n      ++end;\n      // rows[row, end) was consistent.\n      // If rows[row, end + 1) is not consistent,\n      //   just model rows[row, end)\n      if (end < row_end - 1) {\n        RowScratchRegisters &next = (*rows)[end];\n        LineType lt = next.GetLineType();\n        next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&\n                                            !FirstWordWouldHaveFit((*rows)[end - 1], 
(*rows)[end]));\n      } else {\n        next_consistent = false;\n      }\n      if (next_consistent) {\n        ParagraphModel next_model =\n            InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);\n        if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&\n             next_model.justification() != JUSTIFICATION_LEFT) ||\n            (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&\n             next_model.justification() != JUSTIFICATION_RIGHT)) {\n          next_consistent = false;\n        }\n        last_model = next_model;\n      } else {\n        next_consistent = false;\n      }\n    } while (next_consistent && end < row_end);\n    // At this point, rows[start, end) looked like it could have been a\n    // single paragraph.  If we can make a good ParagraphModel for it,\n    // do so and mark this sequence with that model.\n    if (end > start + 1) {\n      // emit a new paragraph if we have more than one line.\n      const ParagraphModel *model = nullptr;\n      ParagraphModel new_model = ParagraphModelByOutline(\n          debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));\n      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {\n        // couldn't create a good model, oh well.\n      } else if (new_model.is_flush()) {\n        if (end == start + 2) {\n          // It's very likely we just got two paragraph starts in a row.\n          end = start + 1;\n        } else if (start == row_start) {\n          // Mark this as a Crown.\n          if (new_model.justification() == JUSTIFICATION_LEFT) {\n            model = kCrownLeft;\n          } else {\n            model = kCrownRight;\n          }\n        } else if (allow_flush_models) {\n          model = theory->AddModel(new_model);\n        }\n      } else {\n        model = theory->AddModel(new_model);\n      }\n      if (model) {\n        
(*rows)[start].AddStartLine(model);\n        for (int i = start + 1; i < end; i++) {\n          (*rows)[i].AddBodyLine(model);\n        }\n      }\n    }\n    start = end;\n  }\n}\n\n// We examine rows[row_start, row_end) and do the following:\n//   (1) Clear all existing hypotheses for the rows being considered.\n//   (2) Mark up any rows as exceptionally likely to be paragraph starts\n//       or paragraph body lines as such using both geometric and textual\n//       clues.\n//   (3) Form models for any sequence of start + continuation lines.\n//   (4) Smear the paragraph models to cover surrounding text.\nstatic void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,\n                                   int row_start, int row_end, ParagraphTheory *theory) {\n  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {\n    return;\n  }\n\n  if (debug_level > 1) {\n    tprintf(\"#############################################\\n\");\n    tprintf(\"# StrongEvidenceClassify( rows[%d:%d) )\\n\", row_start, row_end);\n    tprintf(\"#############################################\\n\");\n  }\n\n  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);\n  MarkStrongEvidence(rows, row_start, row_end);\n\n  DebugDump(debug_level > 2, \"Initial strong signals.\", *theory, *rows);\n\n  // Create paragraph models.\n  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);\n\n  DebugDump(debug_level > 2, \"Unsmeared hypotheses.s.\", *theory, *rows);\n\n  // At this point, some rows are marked up as paragraphs with model numbers,\n  // and some rows are marked up as either LT_START or LT_BODY.  
Now let's\n  // smear any good paragraph hypotheses forward and backward.\n  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);\n  smearer.Smear();\n}\n\nstatic void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,\n                                      int row_end, ParagraphTheory *theory) {\n  for (int i = row_start + 1; i < row_end - 1; i++) {\n    if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&\n        (*rows)[i + 1].ri_->has_leaders) {\n      const ParagraphModel *model =\n          theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));\n      (*rows)[i].AddStartLine(model);\n    }\n  }\n}\n\n// Collect sequences of unique hypotheses in row registers and create proper\n// paragraphs for them, referencing the paragraphs in row_owners.\nstatic void ConvertHypothesizedModelRunsToParagraphs(int debug_level,\n                                                     std::vector<RowScratchRegisters> &rows,\n                                                     std::vector<PARA *> *row_owners,\n                                                     ParagraphTheory *theory) {\n  int end = rows.size();\n  int start;\n  for (; end > 0; end = start) {\n    start = end - 1;\n    const ParagraphModel *model = nullptr;\n    // TODO(eger): Be smarter about dealing with multiple hypotheses.\n    bool single_line_paragraph = false;\n    SetOfModels models;\n    rows[start].NonNullHypotheses(&models);\n    if (!models.empty()) {\n      model = models[0];\n      if (rows[start].GetLineType(model) != LT_BODY) {\n        single_line_paragraph = true;\n      }\n    }\n    if (model && !single_line_paragraph) {\n      // walk back looking for more body lines and then a start line.\n      while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {\n        // do nothing\n      }\n      if (start < 0 || rows[start].GetLineType(model) != LT_START) {\n        model = nullptr;\n      }\n    }\n    if (model 
== nullptr) {\n      continue;\n    }\n    // rows[start, end) should be a paragraph.\n    PARA *p = new PARA();\n    if (model == kCrownLeft || model == kCrownRight) {\n      p->is_very_first_or_continuation = true;\n      // Crown paragraph.\n      //   If we can find an existing ParagraphModel that fits, use it,\n      //   else create a new one.\n      for (unsigned row = end; row < rows.size(); row++) {\n        if ((*row_owners)[row] &&\n            (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&\n             (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {\n          model = (*row_owners)[row]->model;\n          break;\n        }\n      }\n      if (model == kCrownLeft) {\n        // No subsequent model fits, so cons one up.\n        model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,\n                                                rows[start].lmargin_ + rows[start].lindent_, 0, 0,\n                                                Epsilon(rows[start].ri_->average_interword_space)));\n      } else if (model == kCrownRight) {\n        // No subsequent model fits, so cons one up.\n        model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,\n                                                rows[start].rmargin_ + rows[start].rmargin_, 0, 0,\n                                                Epsilon(rows[start].ri_->average_interword_space)));\n      }\n    }\n    rows[start].SetUnknown();\n    rows[start].AddStartLine(model);\n    for (int i = start + 1; i < end; i++) {\n      rows[i].SetUnknown();\n      rows[i].AddBodyLine(model);\n    }\n    p->model = model;\n    p->has_drop_cap = rows[start].ri_->has_drop_cap;\n    p->is_list_item = model->justification() == JUSTIFICATION_RIGHT\n                          ? 
rows[start].ri_->rword_indicates_list_item\n                          : rows[start].ri_->lword_indicates_list_item;\n    for (int row = start; row < end; row++) {\n      if ((*row_owners)[row] != nullptr) {\n        tprintf(\n            \"Memory leak! ConvertHypothesizeModelRunsToParagraphs() called \"\n            \"more than once!\\n\");\n        delete (*row_owners)[row];\n      }\n      (*row_owners)[row] = p;\n    }\n  }\n}\n\nstruct Interval {\n  Interval() : begin(0), end(0) {}\n  Interval(int b, int e) : begin(b), end(e) {}\n\n  int begin;\n  int end;\n};\n\n// Return whether rows[row] appears to be stranded, meaning that the evidence\n// for this row is very weak due to context.  For instance, two lines of source\n// code may happen to be indented at the same tab vector as body text starts,\n// leading us to think they are two start-of-paragraph lines.  This is not\n// optimal.  However, we also don't want to mark a sequence of short dialog\n// as \"weak,\" so our heuristic is:\n//   (1) If a line is surrounded by lines of unknown type, it's weak.\n//   (2) If two lines in a row are start lines for a given paragraph type, but\n//       after that the same paragraph type does not continue, they're weak.\nstatic bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {\n  SetOfModels row_models;\n  rows[row].StrongHypotheses(&row_models);\n\n  for (auto &row_model : row_models) {\n    bool all_starts = rows[row].GetLineType();\n    int run_length = 1;\n    bool continues = true;\n    for (int i = row - 1; i >= 0 && continues; i--) {\n      SetOfModels models;\n      rows[i].NonNullHypotheses(&models);\n      switch (rows[i].GetLineType(row_model)) {\n        case LT_START:\n          run_length++;\n          break;\n        case LT_MULTIPLE: // explicit fall-through\n        case LT_BODY:\n          run_length++;\n          all_starts = false;\n          break;\n        case LT_UNKNOWN: // explicit fall-through\n        default:\n          
continues = false;\n      }\n    }\n    continues = true;\n    for (unsigned i = row + 1; i < rows.size() && continues; i++) {\n      SetOfModels models;\n      rows[i].NonNullHypotheses(&models);\n      switch (rows[i].GetLineType(row_model)) {\n        case LT_START:\n          run_length++;\n          break;\n        case LT_MULTIPLE: // explicit fall-through\n        case LT_BODY:\n          run_length++;\n          all_starts = false;\n          break;\n        case LT_UNKNOWN: // explicit fall-through\n        default:\n          continues = false;\n      }\n    }\n    if (run_length > 2 || (!all_starts && run_length > 1)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Go through rows[row_start, row_end) and gather up sequences that need better\n// classification.\n// + Sequences of non-empty rows without hypotheses.\n// + Crown paragraphs not immediately followed by a strongly modeled line.\n// + Single line paragraphs surrounded by text that doesn't match the\n//   model.\nstatic void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,\n                             std::vector<Interval> *to_fix, int row_start, int row_end) {\n  to_fix->clear();\n  for (int i = row_start; i < row_end; i++) {\n    bool needs_fixing = false;\n\n    SetOfModels models;\n    SetOfModels models_w_crowns;\n    rows[i].StrongHypotheses(&models);\n    rows[i].NonNullHypotheses(&models_w_crowns);\n    if (models.empty() && !models_w_crowns.empty()) {\n      // Crown paragraph.  
Is it followed by a modeled line?\n      for (unsigned end = i + 1; end < rows.size(); end++) {\n        SetOfModels end_models;\n        SetOfModels strong_end_models;\n        rows[end].NonNullHypotheses(&end_models);\n        rows[end].StrongHypotheses(&strong_end_models);\n        if (end_models.empty()) {\n          needs_fixing = true;\n          break;\n        } else if (!strong_end_models.empty()) {\n          needs_fixing = false;\n          break;\n        }\n      }\n    } else if (models.empty() && rows[i].ri_->num_words > 0) {\n      // No models at all.\n      needs_fixing = true;\n    }\n\n    if (!needs_fixing && !models.empty()) {\n      needs_fixing = RowIsStranded(rows, i);\n    }\n\n    if (needs_fixing) {\n      if (!to_fix->empty() && to_fix->back().end == i - 1) {\n        to_fix->back().end = i;\n      } else {\n        to_fix->push_back(Interval(i, i));\n      }\n    }\n  }\n  // Convert inclusive intervals to half-open intervals.\n  for (auto &i : *to_fix) {\n    i.end = i.end + 1;\n  }\n}\n\n// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),\n// normalize each row_owner to point to an actual PARA, and output the\n// paragraphs in order onto paragraphs.\nvoid CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {\n  std::vector<PARA *> &rows = *row_owners;\n  paragraphs->clear();\n  PARA_IT out(paragraphs);\n  PARA *formerly_null = nullptr;\n  for (unsigned i = 0; i < rows.size(); i++) {\n    if (rows[i] == nullptr) {\n      if (i == 0 || rows[i - 1] != formerly_null) {\n        rows[i] = formerly_null = new PARA();\n      } else {\n        rows[i] = formerly_null;\n        continue;\n      }\n    } else if (i > 0 && rows[i - 1] == rows[i]) {\n      continue;\n    }\n    out.add_after_then_move(rows[i]);\n  }\n}\n\n// Main entry point for Paragraph Detection Algorithm.\n//\n// Given a set of equally spaced textlines (described by row_infos),\n// Split them into 
paragraphs.\n//\n// Output:\n//   row_owners - one pointer for each row, to the paragraph it belongs to.\n//   paragraphs - this is the actual list of PARA objects.\n//   models - the list of paragraph models referenced by the PARA objects.\n//            caller is responsible for deleting the models.\nvoid DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,\n                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,\n                      std::vector<ParagraphModel *> *models) {\n  ParagraphTheory theory(models);\n\n  // Initialize row_owners to be a bunch of nullptr pointers.\n  row_owners->clear();\n  row_owners->resize(row_infos->size());\n\n  // Set up row scratch registers for the main algorithm.\n  std::vector<RowScratchRegisters> rows(row_infos->size());\n  for (unsigned i = 0; i < row_infos->size(); i++) {\n    rows[i].Init((*row_infos)[i]);\n  }\n\n  // Pass 1:\n  //   Detect sequences of lines that all contain leader dots (.....)\n  //   These are likely Tables of Contents.  If there are three text lines in\n  //   a row with leader dots, it's pretty safe to say the middle one should\n  //   be a paragraph of its own.\n  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);\n\n  DebugDump(debug_level > 1, \"End of Pass 1\", theory, rows);\n\n  std::vector<Interval> leftovers;\n  LeftoverSegments(rows, &leftovers, 0, rows.size());\n  for (auto &leftover : leftovers) {\n    // Pass 2a:\n    //   Find any strongly evidenced start-of-paragraph lines.  If they're\n    //   followed by two lines that look like body lines, make a paragraph\n    //   model for that and see if that model applies throughout the text\n    //   (that is, \"smear\" it).\n    StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);\n\n    // Pass 2b:\n    //   If we had any luck in pass 2a, we got part of the page and didn't\n    //   know how to classify a few runs of rows. 
Take the segments that\n    //   didn't find a model and reprocess them individually.\n    std::vector<Interval> leftovers2;\n    LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);\n    bool pass2a_was_useful =\n        leftovers2.size() > 1 ||\n        (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));\n    if (pass2a_was_useful) {\n      for (auto &leftover2 : leftovers2) {\n        StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);\n      }\n    }\n  }\n\n  DebugDump(debug_level > 1, \"End of Pass 2\", theory, rows);\n\n  // Pass 3:\n  //   These are the dregs for which we didn't have enough strong textual\n  //   and geometric clues to form matching models for.  Let's see if\n  //   the geometric clues are simple enough that we could just use those.\n  LeftoverSegments(rows, &leftovers, 0, rows.size());\n  for (auto &leftover : leftovers) {\n    GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);\n  }\n\n  // Undo any flush models for which there's little evidence.\n  DowngradeWeakestToCrowns(debug_level, &theory, &rows);\n\n  DebugDump(debug_level > 1, \"End of Pass 3\", theory, rows);\n\n  // Pass 4:\n  //   Take everything that's still not marked up well and clear all markings.\n  LeftoverSegments(rows, &leftovers, 0, rows.size());\n  for (auto &leftover : leftovers) {\n    for (int j = leftover.begin; j < leftover.end; j++) {\n      rows[j].SetUnknown();\n    }\n  }\n\n  DebugDump(debug_level > 1, \"End of Pass 4\", theory, rows);\n\n  // Convert all of the unique hypothesis runs to PARAs.\n  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);\n\n  DebugDump(debug_level > 0, \"Final Paragraph Segmentation\", theory, rows);\n\n  // Finally, clean up any dangling nullptr row paragraph parents.\n  CanonicalizeDetectionResults(row_owners, paragraphs);\n}\n\n// ============ Code interfacing with 
the rest of Tesseract ==================\n\nstatic void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {\n  // Set up text, lword_text, and rword_text (mostly for debug printing).\n  std::string fake_text;\n  PageIterator pit(static_cast<const PageIterator &>(it));\n  if (!pit.Empty(RIL_WORD)) {\n    bool first_word = true;\n    do {\n      fake_text += \"x\";\n      if (first_word) {\n        info->lword_text += \"x\";\n      }\n      info->rword_text += \"x\";\n      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&\n          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {\n        fake_text += \" \";\n        info->rword_text = \"\";\n        first_word = false;\n      }\n    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));\n  }\n  if (fake_text.empty()) {\n    return;\n  }\n\n  int lspaces = info->pix_ldistance / info->average_interword_space;\n  for (int i = 0; i < lspaces; i++) {\n    info->text += ' ';\n  }\n  info->text += fake_text;\n\n  // Set up lword_box, rword_box, and num_words.\n  PAGE_RES_IT page_res_it = *it.PageResIt();\n  WERD_RES *word_res = page_res_it.restart_row();\n  ROW_RES *this_row = page_res_it.row();\n\n  WERD_RES *lword = nullptr;\n  WERD_RES *rword = nullptr;\n  info->num_words = 0;\n  do {\n    if (word_res) {\n      if (!lword) {\n        lword = word_res;\n      }\n      if (rword != word_res) {\n        info->num_words++;\n      }\n      rword = word_res;\n    }\n    word_res = page_res_it.forward();\n  } while (page_res_it.row() == this_row);\n\n  if (lword) {\n    info->lword_box = lword->word->bounding_box();\n  }\n  if (rword) {\n    info->rword_box = rword->word->bounding_box();\n  }\n}\n\n// Given a Tesseract Iterator pointing to a text line, fill in the paragraph\n// detector RowInfo with all relevant information from the row.\nstatic void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {\n  if (it.PageResIt()->row() != 
nullptr) {\n    ROW *row = it.PageResIt()->row()->row;\n    info->pix_ldistance = row->lmargin();\n    info->pix_rdistance = row->rmargin();\n    info->average_interword_space =\n        row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);\n    info->pix_xheight = row->x_height();\n    info->has_leaders = false;\n    info->has_drop_cap = row->has_drop_cap();\n    info->ltr = true; // set below depending on word scripts\n  } else {\n    info->pix_ldistance = info->pix_rdistance = 0;\n    info->average_interword_space = 1;\n    info->pix_xheight = 1.0;\n    info->has_leaders = false;\n    info->has_drop_cap = false;\n    info->ltr = true;\n  }\n\n  info->num_words = 0;\n  info->lword_indicates_list_item = false;\n  info->lword_likely_starts_idea = false;\n  info->lword_likely_ends_idea = false;\n  info->rword_indicates_list_item = false;\n  info->rword_likely_starts_idea = false;\n  info->rword_likely_ends_idea = false;\n  info->has_leaders = false;\n  info->ltr = true;\n\n  if (!after_recognition) {\n    InitializeTextAndBoxesPreRecognition(it, info);\n    return;\n  }\n  info->text = \"\";\n  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));\n  int trailing_ws_idx = strlen(text.get()); // strip trailing space\n  while (trailing_ws_idx > 0 &&\n         // isspace() only takes ASCII\n         isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {\n    trailing_ws_idx--;\n  }\n  if (trailing_ws_idx > 0) {\n    int lspaces = info->pix_ldistance / info->average_interword_space;\n    for (int i = 0; i < lspaces; i++) {\n      info->text += ' ';\n    }\n    for (int i = 0; i < trailing_ws_idx; i++) {\n      info->text += text[i];\n    }\n  }\n\n  if (info->text.empty()) {\n    return;\n  }\n\n  PAGE_RES_IT page_res_it = *it.PageResIt();\n  std::vector<WERD_RES *> werds;\n  WERD_RES *word_res = page_res_it.restart_row();\n  ROW_RES *this_row = page_res_it.row();\n  int num_leaders = 0;\n  int ltr = 
0;\n  int rtl = 0;\n  do {\n    if (word_res && word_res->best_choice->unichar_string().length() > 0) {\n      werds.push_back(word_res);\n      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;\n      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;\n      if (word_res->word->flag(W_REP_CHAR)) {\n        num_leaders++;\n      }\n    }\n    word_res = page_res_it.forward();\n  } while (page_res_it.row() == this_row);\n  info->ltr = ltr >= rtl;\n  info->has_leaders = num_leaders > 3;\n  info->num_words = werds.size();\n  if (!werds.empty()) {\n    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];\n    info->lword_text = lword->best_choice->unichar_string().c_str();\n    info->rword_text = rword->best_choice->unichar_string().c_str();\n    info->lword_box = lword->word->bounding_box();\n    info->rword_box = rword->word->bounding_box();\n    LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,\n                       &info->lword_indicates_list_item, &info->lword_likely_starts_idea,\n                       &info->lword_likely_ends_idea);\n    RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,\n                        &info->rword_indicates_list_item, &info->rword_likely_starts_idea,\n                        &info->rword_likely_ends_idea);\n  }\n}\n\n// This is called after rows have been identified and words are recognized.\n// Much of this could be implemented before word recognition, but text helps\n// to identify bulleted lists and gives good signals for sentence boundaries.\nvoid DetectParagraphs(int debug_level, bool after_text_recognition,\n                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {\n  // Clear out any preconceived notions.\n  if (block_start->Empty(RIL_TEXTLINE)) {\n    return;\n  }\n  BLOCK *block = block_start->PageResIt()->block()->block;\n  block->para_list()->clear();\n  bool is_image_block = block->pdblk.poly_block() && 
!block->pdblk.poly_block()->IsText();\n\n  // Convert the Tesseract structures to RowInfos\n  // for the paragraph detection algorithm.\n  MutableIterator row(*block_start);\n  if (row.Empty(RIL_TEXTLINE)) {\n    return; // end of input already.\n  }\n\n  std::vector<RowInfo> row_infos;\n  do {\n    if (!row.PageResIt()->row()) {\n      continue; // empty row.\n    }\n    row.PageResIt()->row()->row->set_para(nullptr);\n    row_infos.emplace_back();\n    RowInfo &ri = row_infos.back();\n    InitializeRowInfo(after_text_recognition, row, &ri);\n  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));\n\n  // If we're called before text recognition, we might not have\n  // tight block bounding boxes, so trim by the minimum on each side.\n  if (!row_infos.empty()) {\n    int min_lmargin = row_infos[0].pix_ldistance;\n    int min_rmargin = row_infos[0].pix_rdistance;\n    for (unsigned i = 1; i < row_infos.size(); i++) {\n      if (row_infos[i].pix_ldistance < min_lmargin) {\n        min_lmargin = row_infos[i].pix_ldistance;\n      }\n      if (row_infos[i].pix_rdistance < min_rmargin) {\n        min_rmargin = row_infos[i].pix_rdistance;\n      }\n    }\n    if (min_lmargin > 0 || min_rmargin > 0) {\n      for (auto &row_info : row_infos) {\n        row_info.pix_ldistance -= min_lmargin;\n        row_info.pix_rdistance -= min_rmargin;\n      }\n    }\n  }\n\n  // Run the paragraph detection algorithm.\n  std::vector<PARA *> row_owners;\n  if (!is_image_block) {\n    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);\n  } else {\n    row_owners.resize(row_infos.size());\n    CanonicalizeDetectionResults(&row_owners, block->para_list());\n  }\n\n  // Now stitch in the row_owners into the rows.\n  row = *block_start;\n  for (auto &row_owner : row_owners) {\n    while (!row.PageResIt()->row()) {\n      row.Next(RIL_TEXTLINE);\n    }\n    row.PageResIt()->row()->row->set_para(row_owner);\n    
row.Next(RIL_TEXTLINE);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/paragraphs.h",
    "content": "/**********************************************************************\n * File:        paragraphs.h\n * Description: Paragraph Detection data structures.\n * Author:      David Eger\n * Created:     25 February 2011\n *\n * (C) Copyright 2011, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_\n#define TESSERACT_CCMAIN_PARAGRAPHS_H_\n\n#include <list>\n#include <string>\n#include \"rect.h\"   // for TBOX\n\nnamespace tesseract {\n\nclass MutableIterator;\nclass ParagraphModel;\nclass PARA_LIST;\nstruct PARA;\n\n// This structure captures all information needed about a text line for the\n// purposes of paragraph detection.  
It is meant to be exceedingly light-weight\n// so that we can easily test paragraph detection independent of the rest of\n// Tesseract.\nclass RowInfo {\npublic:\n  // Constant data derived from Tesseract output.\n  std::string text; // the full UTF-8 text of the line.\n  bool ltr;    // whether the majority of the text is left-to-right\n               // TODO(eger) make this more fine-grained.\n\n  bool has_leaders;            // does the line contain leader dots (.....)?\n  bool has_drop_cap;           // does the line have a drop cap?\n  int pix_ldistance;           // distance to the left pblock boundary in pixels\n  int pix_rdistance;           // distance to the right pblock boundary in pixels\n  float pix_xheight;           // guessed xheight for the line\n  int average_interword_space; // average space between words in pixels.\n\n  int num_words;\n  TBOX lword_box; // in normalized (horiz text rows) space\n  TBOX rword_box; // in normalized (horiz text rows) space\n\n  std::string lword_text; // the UTF-8 text of the leftmost werd\n  std::string rword_text; // the UTF-8 text of the rightmost werd\n\n  //   The text of a paragraph typically starts with the start of an idea and\n  // ends with the end of an idea.  Here we define paragraph as something that\n  // may have a first line indent and a body indent which may be different.\n  // Typical words that start an idea are:\n  //   1. Words in western scripts that start with\n  //      a capital letter, for example \"The\"\n  //   2. Bulleted or numbered list items, for\n  //      example \"2.\"\n  // Typical words which end an idea are words ending in punctuation marks. 
In\n  // this vocabulary, each list item is represented as a paragraph.\n  bool lword_indicates_list_item;\n  bool lword_likely_starts_idea;\n  bool lword_likely_ends_idea;\n\n  bool rword_indicates_list_item;\n  bool rword_likely_starts_idea;\n  bool rword_likely_ends_idea;\n};\n\n// Main entry point for Paragraph Detection Algorithm.\n//\n// Given a set of equally spaced textlines (described by row_infos),\n// Split them into paragraphs.  See http://goto/paragraphstalk\n//\n// Output:\n//   row_owners - one pointer for each row, to the paragraph it belongs to.\n//   paragraphs - this is the actual list of PARA objects.\n//   models - the list of paragraph models referenced by the PARA objects.\n//            caller is responsible for deleting the models.\nTESS_API\nvoid DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,\n                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,\n                      std::vector<ParagraphModel *> *models);\n\n// Given a MutableIterator to the start of a block, run DetectParagraphs on\n// that block and commit the results to the underlying ROW and BLOCK structs,\n// saving the ParagraphModels in models.  Caller owns the models.\n// We use unicharset during the function to answer questions such as \"is the\n// first letter of this word upper case?\"\nTESS_API\nvoid DetectParagraphs(int debug_level, bool after_text_recognition,\n                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models);\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_\n"
  },
  {
    "path": "src/ccmain/paragraphs_internal.h",
    "content": "/**********************************************************************\n * File:        paragraphs_internal.h\n * Description: Paragraph Detection internal data structures.\n * Author:      David Eger\n *\n * (C) Copyright 2011, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_\n#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_\n\n#include <tesseract/publictypes.h> // for ParagraphJustification\n#include \"paragraphs.h\"\n\n// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS\n// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.\n\nnamespace tesseract {\n\nclass UNICHARSET;\nclass WERD_CHOICE;\n\n// Return whether the given word is likely to be a list item start word.\nTESS_API\nbool AsciiLikelyListItem(const std::string &word);\n\n// Set right word attributes given either a unicharset and werd or a utf8\n// string.\nTESS_API\nvoid RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,\n                         bool *is_list, bool *starts_idea, bool *ends_idea);\n\n// Set left word attributes given either a unicharset and werd or a utf8 string.\nTESS_API\nvoid LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,\n                        bool *is_list, bool *starts_idea, bool *ends_idea);\n\nenum LineType 
{\n  LT_START = 'S',    // First line of a paragraph.\n  LT_BODY = 'C',     // Continuation line of a paragraph.\n  LT_UNKNOWN = 'U',  // No clues.\n  LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.\n};\n\n// The first paragraph in a page of body text is often un-indented.\n// This is a typographic convention which is common to indicate either that:\n// (1) The paragraph is the continuation of a previous paragraph, or\n// (2) The paragraph is the first paragraph in a chapter.\n//\n// I refer to such paragraphs as \"crown\"s, and the output of the paragraph\n// detection algorithm attempts to give them the same paragraph model as\n// the rest of the body text.\n//\n// Nonetheless, while building hypotheses, it is useful to mark the lines\n// of crown paragraphs temporarily as crowns, either aligned left or right.\nextern const ParagraphModel *kCrownLeft;\nextern const ParagraphModel *kCrownRight;\n\ninline bool StrongModel(const ParagraphModel *model) {\n  return model != nullptr && model != kCrownLeft && model != kCrownRight;\n}\n\nstruct LineHypothesis {\n  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}\n  LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {}\n  LineHypothesis(const LineHypothesis &other) = default;\n\n  // Copy assignment operator.\n  LineHypothesis &operator=(const LineHypothesis &other) = default;\n\n  bool operator==(const LineHypothesis &other) const {\n    return ty == other.ty && model == other.model;\n  }\n\n  LineType ty;\n  const ParagraphModel *model;\n};\n\nclass ParagraphTheory; // Forward Declaration\n\nusing SetOfModels = std::vector<const ParagraphModel *>;\n\n// Row Scratch Registers are data generated by the paragraph detection\n// algorithm based on a RowInfo input.\nclass RowScratchRegisters {\npublic:\n  // We presume row will outlive us.\n  void Init(const RowInfo &row);\n\n  LineType GetLineType() const;\n\n  LineType GetLineType(const ParagraphModel *model) const;\n\n  
// Mark this as a start line type, sans model.  This is useful for the\n  // initial marking of probable body lines or paragraph start lines.\n  void SetStartLine();\n\n  // Mark this as a body line type, sans model.  This is useful for the\n  // initial marking of probably body lines or paragraph start lines.\n  void SetBodyLine();\n\n  // Record that this row fits as a paragraph start line in the given model,\n  void AddStartLine(const ParagraphModel *model);\n  // Record that this row fits as a paragraph body line in the given model,\n  void AddBodyLine(const ParagraphModel *model);\n\n  // Clear all hypotheses about this line.\n  void SetUnknown() {\n    hypotheses_.clear();\n  }\n\n  // Append all hypotheses of strong models that match this row as a start.\n  void StartHypotheses(SetOfModels *models) const;\n\n  // Append all hypotheses of strong models matching this row.\n  void StrongHypotheses(SetOfModels *models) const;\n\n  // Append all hypotheses for this row.\n  void NonNullHypotheses(SetOfModels *models) const;\n\n  // Discard any hypotheses whose model is not in the given list.\n  void DiscardNonMatchingHypotheses(const SetOfModels &models);\n\n  // If we have only one hypothesis and that is that this line is a paragraph\n  // start line of a certain model, return that model.  Else return nullptr.\n  const ParagraphModel *UniqueStartHypothesis() const;\n\n  // If we have only one hypothesis and that is that this line is a paragraph\n  // body line of a certain model, return that model.  Else return nullptr.\n  const ParagraphModel *UniqueBodyHypothesis() const;\n\n  // Return the indentation for the side opposite of the aligned side.\n  int OffsideIndent(tesseract::ParagraphJustification just) const {\n    switch (just) {\n      case tesseract::JUSTIFICATION_RIGHT:\n        return lindent_;\n      case tesseract::JUSTIFICATION_LEFT:\n        return rindent_;\n      default:\n        return lindent_ > rindent_ ? 
lindent_ : rindent_;\n    }\n  }\n\n  // Return the indentation for the side the text is aligned to.\n  int AlignsideIndent(tesseract::ParagraphJustification just) const {\n    switch (just) {\n      case tesseract::JUSTIFICATION_RIGHT:\n        return rindent_;\n      case tesseract::JUSTIFICATION_LEFT:\n        return lindent_;\n      default:\n        return lindent_ > rindent_ ? lindent_ : rindent_;\n    }\n  }\n\n  // Append header fields to a vector of row headings.\n  static void AppendDebugHeaderFields(std::vector<std::string> &header);\n\n  // Append data for this row to a vector of debug strings.\n  void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;\n\n  const RowInfo *ri_;\n\n  // These four constants form a horizontal box model for the white space\n  // on the edges of each line.  At each point in the algorithm, the following\n  // shall hold:\n  //   ri_->pix_ldistance = lmargin_ + lindent_\n  //   ri_->pix_rdistance = rindent_ + rmargin_\n  int lmargin_;\n  int lindent_;\n  int rindent_;\n  int rmargin_;\n\nprivate:\n  // Hypotheses of either LT_START or LT_BODY\n  std::vector<LineHypothesis> hypotheses_;\n};\n\n// A collection of convenience functions for wrapping the set of\n// Paragraph Models we believe correctly model the paragraphs in the image.\nclass ParagraphTheory {\npublic:\n  // We presume models will outlive us, and that models will take ownership\n  // of any ParagraphModel *'s we add.\n  explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {}\n  std::vector<ParagraphModel *> &models() {\n    return *models_;\n  }\n  const std::vector<ParagraphModel *> &models() const {\n    return *models_;\n  }\n\n  // Return an existing model if one that is Comparable() can be found.\n  // Else, allocate a new copy of model to save and return a pointer to it.\n  const ParagraphModel *AddModel(const ParagraphModel &model);\n\n  // Discard any models we've made that are not in the list 
of used models.\n  void DiscardUnusedModels(const SetOfModels &used_models);\n\n  // Return the set of all non-centered models.\n  void NonCenteredModels(SetOfModels *models);\n\n  // If any of the non-centered paragraph models we know about fit\n  // rows[start, end), return it.  Else nullptr.\n  const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,\n                             int end) const;\n\n  int IndexOf(const ParagraphModel *model) const;\n\nprivate:\n  std::vector<ParagraphModel *> *models_;\n  std::vector<ParagraphModel *> models_we_added_;\n};\n\nbool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,\n                    const ParagraphModel *model);\nbool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,\n                   const ParagraphModel *model);\nbool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,\n                     const ParagraphModel *model);\n\n// A class for smearing Paragraph Model hypotheses to surrounding rows.\n// The idea here is that StrongEvidenceClassify first marks only exceedingly\n// obvious start and body rows and constructs models of them.  Thereafter,\n// we may have left over unmarked lines (mostly end-of-paragraph lines) which\n// were too short to have much confidence about, but which fit the models we've\n// constructed perfectly and which we ought to mark.  
This class is used to\n// \"smear\" our models over the text.\nclass ParagraphModelSmearer {\npublic:\n  ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,\n                        ParagraphTheory *theory);\n\n  // Smear forward paragraph models from existing row markings to subsequent\n  // text lines if they fit, and mark any thereafter still unmodeled rows\n  // with any model in the theory that fits them.\n  void Smear();\n\nprivate:\n  // Record in open_models_ for rows [start_row, end_row) the list of models\n  // currently open at each row.\n  // A model is still open in a row if some previous row has said model as a\n  // start hypothesis, and all rows since (including this row) would fit as\n  // either a body or start line in that model.\n  void CalculateOpenModels(int row_start, int row_end);\n\n  SetOfModels &OpenModels(int row) {\n    return open_models_[row - row_start_ + 1];\n  }\n\n  ParagraphTheory *theory_;\n  std::vector<RowScratchRegisters> *rows_;\n  int row_start_;\n  int row_end_;\n\n  // open_models_ corresponds to rows[start_row_ - 1, end_row_]\n  //\n  // open_models_:  Contains models which there was an active (open) paragraph\n  //                as of the previous line and for which the left and right\n  //                indents admit the possibility that this text line continues\n  //                to fit the same model.\n  // TODO(eger): Think about whether we can get rid of \"Open\" models and just\n  //   use the current hypotheses on RowScratchRegisters.\n  std::vector<SetOfModels> open_models_;\n};\n\n// Clear all hypotheses about lines [start, end) and reset the margins to the\n// percentile (0..100) value of the left and right row edges for this run of\n// rows.\nvoid RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,\n                                        int end, int percentile);\n\n// Return the median inter-word space in rows[row_start, 
row_end).\nint InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);\n\n// Return whether the first word on the after line can fit in the space at\n// the end of the before line (knowing which way the text is aligned and read).\nbool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,\n                           tesseract::ParagraphJustification justification);\n\n// Return whether the first word on the after line can fit in the space at\n// the end of the before line (not knowing the text alignment).\nbool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);\n\n// Do rows[start, end) form a single instance of the given paragraph model?\nbool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,\n                  const ParagraphModel *model);\n\n// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),\n// normalize each row_owner to point to an actual PARA, and output the\n// paragraphs in order onto paragraphs.\nvoid CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_\n"
  },
  {
    "path": "src/ccmain/paramsd.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        paramsd.cpp\n// Description: Tesseract parameter Editor\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n// The parameters editor is used to edit all the parameters used within\n// tesseract from the ui.\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#ifndef GRAPHICS_DISABLED\n\n#  include \"params.h\" // for ParamsVectors, StringParam, BoolParam\n#  include \"paramsd.h\"\n#  include \"scrollview.h\"     // for SVEvent, ScrollView, SVET_POPUP\n#  include \"svmnode.h\"        // for SVMenuNode\n#  include \"tesseractclass.h\" // for Tesseract\n\n#  include <cstdio>  // for fclose, fopen, fprintf, FILE\n#  include <cstdlib> // for atoi\n#  include <cstring> // for strcmp, strcspn, strlen, strncpy\n#  include <locale>  // for std::locale::classic\n#  include <map>     // for map, _Rb_tree_iterator, map<>::iterator\n#  include <memory>  // for unique_ptr\n#  include <sstream> // for std::stringstream\n#  include <utility> // for pair\n\nnamespace tesseract {\n\n#  define VARDIR \"configs/\" /*parameters files */\n#  define MAX_ITEMS_IN_SUBMENU 30\n\n// The following variables should remain static globals, since they\n// are 
used by debug editor, which uses a single Tesseract instance.\n//\n// Contains the mappings from unique VC ids to their actual pointers.\nstatic std::map<int, ParamContent *> vcMap;\nstatic int nrParams = 0;\nstatic int writeCommands[2];\n\n// Constructors for the various ParamTypes.\nParamContent::ParamContent(tesseract::StringParam *it) {\n  my_id_ = nrParams;\n  nrParams++;\n  param_type_ = VT_STRING;\n  sIt = it;\n  vcMap[my_id_] = this;\n}\n// Constructors for the various ParamTypes.\nParamContent::ParamContent(tesseract::IntParam *it) {\n  my_id_ = nrParams;\n  nrParams++;\n  param_type_ = VT_INTEGER;\n  iIt = it;\n  vcMap[my_id_] = this;\n}\n// Constructors for the various ParamTypes.\nParamContent::ParamContent(tesseract::BoolParam *it) {\n  my_id_ = nrParams;\n  nrParams++;\n  param_type_ = VT_BOOLEAN;\n  bIt = it;\n  vcMap[my_id_] = this;\n}\n// Constructors for the various ParamTypes.\nParamContent::ParamContent(tesseract::DoubleParam *it) {\n  my_id_ = nrParams;\n  nrParams++;\n  param_type_ = VT_DOUBLE;\n  dIt = it;\n  vcMap[my_id_] = this;\n}\n\n// Gets a VC object identified by its ID.\nParamContent *ParamContent::GetParamContentById(int id) {\n  return vcMap[id];\n}\n\n// Copy the first N words from the source string to the target string.\n// Words are delimited by \"_\".\nvoid ParamsEditor::GetFirstWords(const char *s, // source string\n                                 int n,         // number of words\n                                 char *t        // target string\n) {\n  int full_length = strlen(s);\n  int reqd_len = 0; // No. 
of chars required\n  const char *next_word = s;\n\n  while ((n > 0) && reqd_len < full_length) {\n    reqd_len += strcspn(next_word, \"_\") + 1;\n    next_word += reqd_len;\n    n--;\n  }\n  strncpy(t, s, reqd_len);\n  t[reqd_len] = '\\0'; // ensure null terminal\n}\n\n// Getter for the name.\nconst char *ParamContent::GetName() const {\n  if (param_type_ == VT_INTEGER) {\n    return iIt->name_str();\n  } else if (param_type_ == VT_BOOLEAN) {\n    return bIt->name_str();\n  } else if (param_type_ == VT_DOUBLE) {\n    return dIt->name_str();\n  } else if (param_type_ == VT_STRING) {\n    return sIt->name_str();\n  } else {\n    return \"ERROR: ParamContent::GetName()\";\n  }\n}\n\n// Getter for the description.\nconst char *ParamContent::GetDescription() const {\n  if (param_type_ == VT_INTEGER) {\n    return iIt->info_str();\n  } else if (param_type_ == VT_BOOLEAN) {\n    return bIt->info_str();\n  } else if (param_type_ == VT_DOUBLE) {\n    return dIt->info_str();\n  } else if (param_type_ == VT_STRING) {\n    return sIt->info_str();\n  } else {\n    return nullptr;\n  }\n}\n\n// Getter for the value.\nstd::string ParamContent::GetValue() const {\n  std::string result;\n  if (param_type_ == VT_INTEGER) {\n    result += std::to_string(*iIt);\n  } else if (param_type_ == VT_BOOLEAN) {\n    result += std::to_string(*bIt);\n  } else if (param_type_ == VT_DOUBLE) {\n    result += std::to_string(*dIt);\n  } else if (param_type_ == VT_STRING) {\n    result = sIt->c_str();\n  }\n  return result;\n}\n\n// Setter for the value.\nvoid ParamContent::SetValue(const char *val) {\n  // TODO (wanke) Test if the values actually are properly converted.\n  // (Quickly visible impacts?)\n  changed_ = true;\n  if (param_type_ == VT_INTEGER) {\n    iIt->set_value(atoi(val));\n  } else if (param_type_ == VT_BOOLEAN) {\n    bIt->set_value(atoi(val));\n  } else if (param_type_ == VT_DOUBLE) {\n    std::stringstream stream(val);\n    // Use \"C\" locale for reading double value.\n    
stream.imbue(std::locale::classic());\n    double d = 0;\n    stream >> d;\n    dIt->set_value(d);\n  } else if (param_type_ == VT_STRING) {\n    sIt->set_value(val);\n  }\n}\n\n// Gets the up to the first 3 prefixes from s (split by _).\n// For example, tesseract_foo_bar will be split into tesseract,foo and bar.\nvoid ParamsEditor::GetPrefixes(const char *s, std::string *level_one, std::string *level_two,\n                               std::string *level_three) {\n  std::unique_ptr<char[]> p(new char[1024]);\n  GetFirstWords(s, 1, p.get());\n  *level_one = p.get();\n  GetFirstWords(s, 2, p.get());\n  *level_two = p.get();\n  GetFirstWords(s, 3, p.get());\n  *level_three = p.get();\n}\n\n// Compare two VC objects by their name.\nint ParamContent::Compare(const ParamContent *one, const ParamContent *two) {\n  return strcmp(one->GetName(), two->GetName());\n}\n\n// Find all editable parameters used within tesseract and create a\n// SVMenuNode tree from it.\n// TODO (wanke): This is actually sort of hackish.\nSVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {\n  auto *mr = new SVMenuNode();\n  ParamContent_LIST vclist;\n  ParamContent_IT vc_it(&vclist);\n  // Amount counts the number of entries for a specific char*.\n  // TODO(rays) get rid of the use of std::map.\n  std::map<const char *, int> amount;\n\n  // Add all parameters to a list.\n  int num_iterations = (tess->params() == nullptr) ? 1 : 2;\n  for (int v = 0; v < num_iterations; ++v) {\n    tesseract::ParamsVectors *vec = (v == 0) ? 
GlobalParams() : tess->params();\n    for (auto &param : vec->int_params) {\n      vc_it.add_after_then_move(new ParamContent(param));\n    }\n    for (auto &param : vec->bool_params) {\n      vc_it.add_after_then_move(new ParamContent(param));\n    }\n    for (auto &param : vec->string_params) {\n      vc_it.add_after_then_move(new ParamContent(param));\n    }\n    for (auto &param : vec->double_params) {\n      vc_it.add_after_then_move(new ParamContent(param));\n    }\n  }\n\n  // Count the # of entries starting with a specific prefix.\n  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {\n    ParamContent *vc = vc_it.data();\n    std::string tag;\n    std::string tag2;\n    std::string tag3;\n\n    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);\n    amount[tag.c_str()]++;\n    amount[tag2.c_str()]++;\n    amount[tag3.c_str()]++;\n  }\n\n  vclist.sort(ParamContent::Compare); // Sort the list alphabetically.\n\n  SVMenuNode *other = mr->AddChild(\"OTHER\");\n\n  // go through the list again and this time create the menu structure.\n  vc_it.move_to_first();\n  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {\n    ParamContent *vc = vc_it.data();\n    std::string tag;\n    std::string tag2;\n    std::string tag3;\n    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);\n\n    if (amount[tag.c_str()] == 1) {\n      other->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());\n    } else { // More than one would use this submenu -> create submenu.\n      SVMenuNode *sv = mr->AddChild(tag.c_str());\n      if ((amount[tag.c_str()] <= MAX_ITEMS_IN_SUBMENU) || (amount[tag2.c_str()] <= 1)) {\n        sv->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());\n      } else { // Make subsubmenus.\n        SVMenuNode *sv2 = sv->AddChild(tag2.c_str());\n        sv2->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());\n      }\n    }\n  }\n  return 
mr;\n}\n\n// Event listener. Waits for SVET_POPUP events and processes them.\nvoid ParamsEditor::Notify(const SVEvent *sve) {\n  if (sve->type == SVET_POPUP) { // only catch SVET_POPUP!\n    char *param = sve->parameter;\n    if (sve->command_id == writeCommands[0]) {\n      WriteParams(param, false);\n    } else if (sve->command_id == writeCommands[1]) {\n      WriteParams(param, true);\n    } else {\n      ParamContent *vc = ParamContent::GetParamContentById(sve->command_id);\n      vc->SetValue(param);\n      sv_window_->AddMessageF(\"Setting %s to %s\", vc->GetName(), vc->GetValue().c_str());\n    }\n  }\n}\n\n// Integrate the parameters editor as popupmenu into the existing scrollview\n// window (usually the pg editor). If sv == null, create a new empty\n// empty window and attach the parameters editor to that window (ugly).\nParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) {\n  if (sv == nullptr) {\n    const char *name = \"ParamEditorMAIN\";\n    sv = new ScrollView(name, 1, 1, 200, 200, 300, 200);\n  }\n\n  sv_window_ = sv;\n\n  // Only one event handler per window.\n  // sv->AddEventHandler((SVEventHandler*) this);\n\n  SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess);\n\n  std::string paramfile;\n  paramfile = tess->datadir;\n  paramfile += VARDIR;   // parameters dir\n  paramfile += \"edited\"; // actual name\n\n  SVMenuNode *std_menu = svMenuRoot->AddChild(\"Build Config File\");\n\n  writeCommands[0] = nrParams + 1;\n  std_menu->AddChild(\"All Parameters\", writeCommands[0], paramfile.c_str(), \"Config file name?\");\n\n  writeCommands[1] = nrParams + 2;\n  std_menu->AddChild(\"changed_ Parameters Only\", writeCommands[1], paramfile.c_str(),\n                     \"Config file name?\");\n\n  svMenuRoot->BuildMenu(sv, false);\n}\n\n// Write all (changed_) parameters to a config file.\nvoid ParamsEditor::WriteParams(char *filename, bool changes_only) {\n  FILE *fp; // input file\n  // if file exists\n  if ((fp = fopen(filename, 
\"rb\")) != nullptr) {\n    fclose(fp);\n    std::stringstream msg;\n    msg << \"Overwrite file \" << filename << \"? (Y/N)\";\n    int a = sv_window_->ShowYesNoDialog(msg.str().c_str());\n    if (a == 'n') {\n      return;\n    } // don't write\n  }\n\n  fp = fopen(filename, \"wb\"); // can we write to it?\n  if (fp == nullptr) {\n    sv_window_->AddMessageF(\"Can't write to file %s\", filename);\n    return;\n  }\n  for (auto &iter : vcMap) {\n    ParamContent *cur = iter.second;\n    if (!changes_only || cur->HasChanged()) {\n      fprintf(fp, \"%-25s   %-12s   # %s\\n\", cur->GetName(), cur->GetValue().c_str(),\n              cur->GetDescription());\n    }\n  }\n  fclose(fp);\n}\n\n} // namespace tesseract\n\n#endif // !GRAPHICS_DISABLED\n"
  },
  {
    "path": "src/ccmain/paramsd.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        paramsd.h\n// Description: Tesseract parameter editor\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n// Tesseract parameter editor is used to edit all the parameters used\n// within tesseract from the ui.\n#ifndef TESSERACT_CCMAIN_PARAMSD_H_\n#define TESSERACT_CCMAIN_PARAMSD_H_\n\n#ifndef GRAPHICS_DISABLED\n\n#  include \"elst.h\"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK\n#  include \"scrollview.h\" // for ScrollView (ptr only), SVEvent (ptr only)\n\nnamespace tesseract {\n\nclass SVMenuNode;\n\nclass BoolParam;\nclass DoubleParam;\nclass IntParam;\nclass StringParam;\nclass Tesseract;\n\n// A list of all possible parameter types used.\nenum ParamType { VT_INTEGER, VT_BOOLEAN, VT_STRING, VT_DOUBLE };\n\n// A rather hackish helper structure which can take any kind of parameter input\n// (defined by ParamType) and do a couple of common operations on them, like\n// comparisond or getting its value. 
It is used in the context of the\n// ParamsEditor as a bridge from the internal tesseract parameters to the\n// ones displayed by the ScrollView server.\nclass ParamContent : public ELIST<ParamContent>::LINK {\npublic:\n  // Compare two VC objects by their name.\n  static int Compare(const ParamContent *v1, const ParamContent *v2);\n\n  // Gets a VC object identified by its ID.\n  static ParamContent *GetParamContentById(int id);\n\n  // Constructors for the various ParamTypes.\n  ParamContent() = default;\n  explicit ParamContent(tesseract::StringParam *it);\n  explicit ParamContent(tesseract::IntParam *it);\n  explicit ParamContent(tesseract::BoolParam *it);\n  explicit ParamContent(tesseract::DoubleParam *it);\n\n  // Getters and Setters.\n  void SetValue(const char *val);\n  std::string GetValue() const;\n  const char *GetName() const;\n  const char *GetDescription() const;\n\n  int GetId() const {\n    return my_id_;\n  }\n  bool HasChanged() const {\n    return changed_;\n  }\n\nprivate:\n  // The unique ID of this VC object.\n  int my_id_;\n  // Whether the parameter was changed_ and thus needs to be rewritten.\n  bool changed_ = false;\n  // The actual ParamType of this VC object.\n  ParamType param_type_;\n\n  union {\n    tesseract::StringParam *sIt;\n    tesseract::IntParam *iIt;\n    tesseract::BoolParam *bIt;\n    tesseract::DoubleParam *dIt;\n  };\n};\n\nELISTIZEH(ParamContent)\n\n// The parameters editor enables the user to edit all the parameters used within\n// tesseract. It can be invoked on its own, but is supposed to be invoked by\n// the program editor.\nclass ParamsEditor : public SVEventHandler {\npublic:\n  // Integrate the parameters editor as popupmenu into the existing scrollview\n  // window (usually the pg editor). If sv == null, create a new empty\n  // empty window and attach the parameter editor to that window (ugly).\n  explicit ParamsEditor(tesseract::Tesseract *, ScrollView *sv = nullptr);\n\n  // Event listener. 
Waits for SVET_POPUP events and processes them.\n  void Notify(const SVEvent *sve) override;\n\nprivate:\n  // Gets the up to the first 3 prefixes from s (split by _).\n  // For example, tesseract_foo_bar will be split into tesseract,foo and bar.\n  void GetPrefixes(const char *s, std::string *level_one, std::string *level_two, std::string *level_three);\n\n  // Gets the first n words (split by _) and puts them in t.\n  // For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.\n  void GetFirstWords(const char *s, // source string\n                     int n,         // number of words\n                     char *t);      // target string\n\n  // Find all editable parameters used within tesseract and create a\n  // SVMenuNode tree from it.\n  SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);\n\n  // Write all (changed_) parameters to a config file.\n  void WriteParams(char *filename, bool changes_only);\n\n  ScrollView *sv_window_;\n};\n\n} // namespace tesseract\n\n#endif // !GRAPHICS_DISABLED\n#endif // TESSERACT_CCMAIN_PARAMSD_H_\n"
  },
  {
    "path": "src/ccmain/pgedit.cpp",
    "content": "/**********************************************************************\n * File:        pgedit.cpp (Formerly pgeditor.c)\n * Description: Page structure file editor\n * Author:      Phil Cheatle\n *\n *(C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0(the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http:// www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"pgedit.h\"\n\n#include \"blread.h\"\n#include \"control.h\"\n#include \"pageres.h\"\n#include \"paramsd.h\"\n#include \"scrollview.h\"\n#include \"statistc.h\"\n#include \"svmnode.h\"\n#include \"tesseractclass.h\"\n#include \"tordmain.h\"\n#include \"werdit.h\"\n\n#include <cctype>\n#include <cmath>\n#include <iomanip> // for std::setprecision\n#include <locale>  // for std::locale::classic\n#include <sstream> // for std::stringstream\n\n#ifndef GRAPHICS_DISABLED\nnamespace tesseract {\n#  define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)\n#  define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)\n#  define BL_HEIGHT kBlnBaselineOffset\n#  define DESC_HEIGHT 0\n\nenum CMD_EVENTS {\n  NULL_CMD_EVENT,\n  CHANGE_DISP_CMD_EVENT,\n  DUMP_WERD_CMD_EVENT,\n  SHOW_POINT_CMD_EVENT,\n  SHOW_BLN_WERD_CMD_EVENT,\n  DEBUG_WERD_CMD_EVENT,\n  BLAMER_CMD_EVENT,\n  BOUNDING_BOX_CMD_EVENT,\n  CORRECT_TEXT_CMD_EVENT,\n  POLYGONAL_CMD_EVENT,\n  
BL_NORM_CMD_EVENT,\n  BITMAP_CMD_EVENT,\n  IMAGE_CMD_EVENT,\n  BLOCKS_CMD_EVENT,\n  BASELINES_CMD_EVENT,\n  UNIFORM_DISP_CMD_EVENT,\n  REFRESH_CMD_EVENT,\n  QUIT_CMD_EVENT,\n  RECOG_WERDS,\n  RECOG_PSEUDO,\n  SHOW_BLOB_FEATURES,\n  SHOW_SUBSCRIPT_CMD_EVENT,\n  SHOW_SUPERSCRIPT_CMD_EVENT,\n  SHOW_ITALIC_CMD_EVENT,\n  SHOW_BOLD_CMD_EVENT,\n  SHOW_UNDERLINE_CMD_EVENT,\n  SHOW_FIXEDPITCH_CMD_EVENT,\n  SHOW_SERIF_CMD_EVENT,\n  SHOW_SMALLCAPS_CMD_EVENT,\n  SHOW_DROPCAPS_CMD_EVENT,\n};\n\nenum ColorationMode {\n  CM_RAINBOW,\n  CM_SUBSCRIPT,\n  CM_SUPERSCRIPT,\n  CM_ITALIC,\n  CM_BOLD,\n  CM_UNDERLINE,\n  CM_FIXEDPITCH,\n  CM_SERIF,\n  CM_SMALLCAPS,\n  CM_DROPCAPS\n};\n\n/*\n *\n *  Some global data\n *\n */\n\nstatic ScrollView *image_win;\nstatic ParamsEditor *pe;\nstatic bool stillRunning = false;\n\nstatic ScrollView *bln_word_window = nullptr; // baseline norm words\n\nstatic CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT; // selected words op\n\nstatic bool recog_done = false; // recog_all_words was called\n\n// These variables should remain global, since they are only used for the\n// debug mode (in which only a single Tesseract thread/instance will exist).\nstatic std::bitset<16> word_display_mode;\nstatic ColorationMode color_mode = CM_RAINBOW;\nstatic bool display_image = false;\nstatic bool display_blocks = false;\nstatic bool display_baselines = false;\n\nstatic PAGE_RES *current_page_res = nullptr;\n\nSTRING_VAR(editor_image_win_name, \"EditorImage\", \"Editor image window name\");\nINT_VAR(editor_image_xpos, 590, \"Editor image X Pos\");\nINT_VAR(editor_image_ypos, 10, \"Editor image Y Pos\");\nstatic INT_VAR(editor_image_menuheight, 50, \"Add to image height for menu bar\");\nINT_VAR(editor_image_word_bb_color, ScrollView::BLUE, \"Word bounding box colour\");\nINT_VAR(editor_image_blob_bb_color, ScrollView::YELLOW, \"Blob bounding box colour\");\n\nSTRING_VAR(editor_word_name, \"BlnWords\", \"BL normalized word window\");\nINT_VAR(editor_word_xpos, 60, \"Word 
window X Pos\");\nINT_VAR(editor_word_ypos, 510, \"Word window Y Pos\");\nINT_VAR(editor_word_height, 240, \"Word window height\");\nINT_VAR(editor_word_width, 655, \"Word window width\");\n\n/**\n * show_point()\n *\n * Show coords of point, blob bounding box, word bounding box and offset from\n * row baseline\n */\n\nstatic void show_point(PAGE_RES *page_res, float x, float y) {\n  FCOORD pt(x, y);\n  PAGE_RES_IT pr_it(page_res);\n\n  std::stringstream msg;\n  msg.imbue(std::locale::classic());\n  msg << std::fixed << std::setprecision(3) << \"Pt:(\" << x << \", \" << y << \") \";\n\n  for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {\n    if (pr_it.row() != pr_it.prev_row() && pr_it.row()->row->bounding_box().contains(pt)) {\n      msg << \"BL(x)=\" << pr_it.row()->row->base_line(x) << ' ';\n    }\n    if (word->word->bounding_box().contains(pt)) {\n      TBOX box = word->word->bounding_box();\n      msg << \"Wd(\" << box.left() << \", \" << box.bottom() << \")/(\"\n          << box.right() << \", \" << box.top() << \") \";\n      C_BLOB_IT cblob_it(word->word->cblob_list());\n      for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {\n        C_BLOB *cblob = cblob_it.data();\n        box = cblob->bounding_box();\n        if (box.contains(pt)) {\n          msg << \"CBlb(\" << box.left() << \", \" << box.bottom() << \")/(\"\n              << box.right() << \", \" << box.top() << \") \";\n        }\n      }\n    }\n  }\n  image_win->AddMessage(msg.str().c_str());\n}\n\n/**\n * pgeditor_msg()\n *\n * Display a message - in the command window if there is one, or to stdout\n */\n\nstatic void pgeditor_msg( // message display\n    const char *msg) {\n  image_win->AddMessage(msg);\n}\n\nclass BlnEventHandler : public SVEventHandler {\npublic:\n  void Notify(const SVEvent *sv_event) override {\n    if (sv_event->type == SVET_DESTROY) {\n      bln_word_window = nullptr;\n    } else if (sv_event->type == SVET_CLICK) 
{\n      show_point(current_page_res, sv_event->x, sv_event->y);\n    }\n  }\n};\n\n/**\n *  bln_word_window_handle()\n *\n *  @return a WINDOW for the word window, creating it if necessary\n */\nstatic ScrollView *bln_word_window_handle() { // return handle\n                                              // not opened yet\n  if (bln_word_window == nullptr) {\n    pgeditor_msg(\"Creating BLN word window...\");\n    bln_word_window = new ScrollView(editor_word_name.c_str(), editor_word_xpos, editor_word_ypos,\n                                     editor_word_width, editor_word_height, 4000, 4000, true);\n    auto *a = new BlnEventHandler();\n    bln_word_window->AddEventHandler(a);\n    pgeditor_msg(\"Creating BLN word window...Done\");\n  }\n  return bln_word_window;\n}\n\n/**\n *  build_image_window()\n *\n *  Destroy the existing image window if there is one.  Work out how big the\n *  new window needs to be. Create it and re-display.\n */\n\nstatic void build_image_window(int width, int height) {\n  delete image_win;\n  image_win = new ScrollView(editor_image_win_name.c_str(), editor_image_xpos, editor_image_ypos,\n                             width + 1, height + editor_image_menuheight + 1, width, height, true);\n}\n\n/**\n *  display_bln_lines()\n *\n *  Display normalized baseline, x-height, ascender limit and descender limit\n */\n\nstatic void display_bln_lines(ScrollView *window, ScrollView::Color colour, float scale_factor,\n                              float y_offset, float minx, float maxx) {\n  window->Pen(colour);\n  window->Line(minx, y_offset + scale_factor * DESC_HEIGHT, maxx,\n               y_offset + scale_factor * DESC_HEIGHT);\n  window->Line(minx, y_offset + scale_factor * BL_HEIGHT, maxx,\n               y_offset + scale_factor * BL_HEIGHT);\n  window->Line(minx, y_offset + scale_factor * X_HEIGHT, maxx, y_offset + scale_factor * X_HEIGHT);\n  window->Line(minx, y_offset + scale_factor * ASC_HEIGHT, maxx,\n               y_offset + 
scale_factor * ASC_HEIGHT);\n}\n\n/**\n *  notify()\n *\n *  Event handler that processes incoming events, either forwarding\n *  them to process_cmd_win_event or process_image_event.\n *\n */\n\nvoid PGEventHandler::Notify(const SVEvent *event) {\n  char myval = '0';\n  if (event->type == SVET_POPUP) {\n    pe->Notify(event);\n  } // These are handled by ParamsEditor\n  else if (event->type == SVET_EXIT) {\n    stillRunning = false;\n  } else if (event->type == SVET_MENU) {\n    if (strcmp(event->parameter, \"true\") == 0) {\n      myval = 'T';\n    } else if (strcmp(event->parameter, \"false\") == 0) {\n      myval = 'F';\n    }\n    tess_->process_cmd_win_event(event->command_id, &myval);\n  } else {\n    tess_->process_image_event(*event);\n  }\n}\n\n/**\n *  build_menu()\n *\n *  Construct the menu tree used by the command window\n */\nSVMenuNode *Tesseract::build_menu_new() {\n  SVMenuNode *parent_menu;\n  auto *root_menu_item = new SVMenuNode();\n\n  SVMenuNode *modes_menu_item = root_menu_item->AddChild(\"MODES\");\n\n  modes_menu_item->AddChild(\"Change Display\", CHANGE_DISP_CMD_EVENT);\n  modes_menu_item->AddChild(\"Dump Word\", DUMP_WERD_CMD_EVENT);\n  modes_menu_item->AddChild(\"Show Point\", SHOW_POINT_CMD_EVENT);\n  modes_menu_item->AddChild(\"Show BL Norm Word\", SHOW_BLN_WERD_CMD_EVENT);\n  modes_menu_item->AddChild(\"Config Words\", DEBUG_WERD_CMD_EVENT);\n  modes_menu_item->AddChild(\"Recog Words\", RECOG_WERDS);\n  modes_menu_item->AddChild(\"Recog Blobs\", RECOG_PSEUDO);\n  modes_menu_item->AddChild(\"Show Blob Features\", SHOW_BLOB_FEATURES);\n\n  parent_menu = root_menu_item->AddChild(\"DISPLAY\");\n\n  parent_menu->AddChild(\"Blamer\", BLAMER_CMD_EVENT, false);\n  parent_menu->AddChild(\"Bounding Boxes\", BOUNDING_BOX_CMD_EVENT, false);\n  parent_menu->AddChild(\"Correct Text\", CORRECT_TEXT_CMD_EVENT, false);\n  parent_menu->AddChild(\"Polygonal Approx\", POLYGONAL_CMD_EVENT, false);\n  parent_menu->AddChild(\"Baseline Normalized\", 
BL_NORM_CMD_EVENT, false);\n  parent_menu->AddChild(\"Edge Steps\", BITMAP_CMD_EVENT, true);\n  parent_menu->AddChild(\"Subscripts\", SHOW_SUBSCRIPT_CMD_EVENT);\n  parent_menu->AddChild(\"Superscripts\", SHOW_SUPERSCRIPT_CMD_EVENT);\n  parent_menu->AddChild(\"Italics\", SHOW_ITALIC_CMD_EVENT);\n  parent_menu->AddChild(\"Bold\", SHOW_BOLD_CMD_EVENT);\n  parent_menu->AddChild(\"Underline\", SHOW_UNDERLINE_CMD_EVENT);\n  parent_menu->AddChild(\"FixedPitch\", SHOW_FIXEDPITCH_CMD_EVENT);\n  parent_menu->AddChild(\"Serifs\", SHOW_SERIF_CMD_EVENT);\n  parent_menu->AddChild(\"SmallCaps\", SHOW_SMALLCAPS_CMD_EVENT);\n  parent_menu->AddChild(\"DropCaps\", SHOW_DROPCAPS_CMD_EVENT);\n\n  parent_menu = root_menu_item->AddChild(\"OTHER\");\n\n  parent_menu->AddChild(\"Quit\", QUIT_CMD_EVENT);\n  parent_menu->AddChild(\"Show Image\", IMAGE_CMD_EVENT, false);\n  parent_menu->AddChild(\"ShowBlock Outlines\", BLOCKS_CMD_EVENT, false);\n  parent_menu->AddChild(\"Show Baselines\", BASELINES_CMD_EVENT, false);\n  parent_menu->AddChild(\"Uniform Display\", UNIFORM_DISP_CMD_EVENT);\n  parent_menu->AddChild(\"Refresh Display\", REFRESH_CMD_EVENT);\n\n  return root_menu_item;\n}\n\n/**\n *  do_re_display()\n *\n *  Redisplay page\n */\nvoid Tesseract::do_re_display(bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) {\n  int block_count = 1;\n\n  image_win->Clear();\n  if (display_image) {\n    image_win->Draw(pix_binary_, 0, 0);\n  }\n\n  image_win->Brush(ScrollView::NONE);\n  PAGE_RES_IT pr_it(current_page_res);\n  for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {\n    (this->*word_painter)(&pr_it);\n    if (display_baselines && pr_it.row() != pr_it.prev_row()) {\n      pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);\n    }\n    if (display_blocks && pr_it.block() != pr_it.prev_block()) {\n      pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);\n    }\n  }\n  image_win->Update();\n}\n\n/**\n *  
pgeditor_main()\n *\n *  Top level editor operation:\n *  Setup a new window and an according event handler\n *\n */\n\nvoid Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {\n  current_page_res = page_res;\n  if (current_page_res->block_res_list.empty()) {\n    return;\n  }\n\n  recog_done = false;\n  stillRunning = true;\n\n  build_image_window(width, height);\n  word_display_mode.set(DF_EDGE_STEP);\n  do_re_display(&tesseract::Tesseract::word_set_display);\n#  ifndef GRAPHICS_DISABLED\n  pe = new ParamsEditor(this, image_win);\n#  endif\n  PGEventHandler pgEventHandler(this);\n\n  image_win->AddEventHandler(&pgEventHandler);\n  image_win->AddMessageBox();\n\n  SVMenuNode *svMenuRoot = build_menu_new();\n\n  svMenuRoot->BuildMenu(image_win);\n  image_win->SetVisible(true);\n\n  image_win->AwaitEvent(SVET_DESTROY);\n  image_win->AddEventHandler(nullptr);\n}\n\n/**\n *  process_cmd_win_event()\n *\n *  Process a command returned from the command window\n * (Just call the appropriate command handler)\n */\n\nbool Tesseract::process_cmd_win_event( // UI command semantics\n    int32_t cmd_event,                 // which menu item?\n    char *new_value                    // any prompt data\n) {\n  char msg[160];\n  bool exit = false;\n\n  color_mode = CM_RAINBOW;\n\n  // Run recognition on the full page if needed.\n  switch (cmd_event) {\n    case BLAMER_CMD_EVENT:\n    case SHOW_SUBSCRIPT_CMD_EVENT:\n    case SHOW_SUPERSCRIPT_CMD_EVENT:\n    case SHOW_ITALIC_CMD_EVENT:\n    case SHOW_BOLD_CMD_EVENT:\n    case SHOW_UNDERLINE_CMD_EVENT:\n    case SHOW_FIXEDPITCH_CMD_EVENT:\n    case SHOW_SERIF_CMD_EVENT:\n    case SHOW_SMALLCAPS_CMD_EVENT:\n    case SHOW_DROPCAPS_CMD_EVENT:\n      if (!recog_done) {\n        recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);\n        recog_done = true;\n      }\n      break;\n    default:\n      break;\n  }\n\n  char *parameter;\n\n  switch (cmd_event) {\n    case NULL_CMD_EVENT:\n      break;\n\n    
case CHANGE_DISP_CMD_EVENT:\n    case DUMP_WERD_CMD_EVENT:\n    case SHOW_POINT_CMD_EVENT:\n    case SHOW_BLN_WERD_CMD_EVENT:\n    case RECOG_WERDS:\n    case RECOG_PSEUDO:\n    case SHOW_BLOB_FEATURES:\n      mode = static_cast<CMD_EVENTS>(cmd_event);\n      break;\n    case DEBUG_WERD_CMD_EVENT:\n      mode = DEBUG_WERD_CMD_EVENT;\n      parameter = image_win->ShowInputDialog(\"Config File Name\");\n      word_config_ = parameter;\n      delete[] parameter;\n      break;\n    case BOUNDING_BOX_CMD_EVENT:\n      if (new_value[0] == 'T') {\n        word_display_mode.set(DF_BOX);\n      } else {\n        word_display_mode.reset(DF_BOX);\n      }\n      mode = CHANGE_DISP_CMD_EVENT;\n      break;\n    case BLAMER_CMD_EVENT:\n      if (new_value[0] == 'T') {\n        word_display_mode.set(DF_BLAMER);\n      } else {\n        word_display_mode.reset(DF_BLAMER);\n      }\n      do_re_display(&tesseract::Tesseract::word_display);\n      mode = CHANGE_DISP_CMD_EVENT;\n      break;\n    case CORRECT_TEXT_CMD_EVENT:\n      if (new_value[0] == 'T') {\n        word_display_mode.set(DF_TEXT);\n      } else {\n        word_display_mode.reset(DF_TEXT);\n      }\n      mode = CHANGE_DISP_CMD_EVENT;\n      break;\n    case POLYGONAL_CMD_EVENT:\n      if (new_value[0] == 'T') {\n        word_display_mode.set(DF_POLYGONAL);\n      } else {\n        word_display_mode.reset(DF_POLYGONAL);\n      }\n      mode = CHANGE_DISP_CMD_EVENT;\n      break;\n    case BL_NORM_CMD_EVENT:\n      if (new_value[0] == 'T') {\n        word_display_mode.set(DF_BN_POLYGONAL);\n      } else {\n        word_display_mode.reset(DF_BN_POLYGONAL);\n      }\n      mode = CHANGE_DISP_CMD_EVENT;\n      break;\n    case BITMAP_CMD_EVENT:\n      if (new_value[0] == 'T') {\n        word_display_mode.set(DF_EDGE_STEP);\n      } else {\n        word_display_mode.reset(DF_EDGE_STEP);\n      }\n      mode = CHANGE_DISP_CMD_EVENT;\n      break;\n    case UNIFORM_DISP_CMD_EVENT:\n      
do_re_display(&tesseract::Tesseract::word_set_display);\n      break;\n    case IMAGE_CMD_EVENT:\n      display_image = (new_value[0] == 'T');\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case BLOCKS_CMD_EVENT:\n      display_blocks = (new_value[0] == 'T');\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case BASELINES_CMD_EVENT:\n      display_baselines = (new_value[0] == 'T');\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_SUBSCRIPT_CMD_EVENT:\n      color_mode = CM_SUBSCRIPT;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_SUPERSCRIPT_CMD_EVENT:\n      color_mode = CM_SUPERSCRIPT;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_ITALIC_CMD_EVENT:\n      color_mode = CM_ITALIC;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_BOLD_CMD_EVENT:\n      color_mode = CM_BOLD;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_UNDERLINE_CMD_EVENT:\n      color_mode = CM_UNDERLINE;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_FIXEDPITCH_CMD_EVENT:\n      color_mode = CM_FIXEDPITCH;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_SERIF_CMD_EVENT:\n      color_mode = CM_SERIF;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_SMALLCAPS_CMD_EVENT:\n      color_mode = CM_SMALLCAPS;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case SHOW_DROPCAPS_CMD_EVENT:\n      color_mode = CM_DROPCAPS;\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case REFRESH_CMD_EVENT:\n      do_re_display(&tesseract::Tesseract::word_display);\n      break;\n    case QUIT_CMD_EVENT:\n      exit = true;\n      ScrollView::Exit();\n      break;\n\n    default:\n      
snprintf(msg, sizeof(msg), \"Unrecognised event %\" PRId32 \"(%s)\", cmd_event, new_value);\n      image_win->AddMessage(msg);\n      break;\n  }\n  return exit;\n}\n\n/**\n * process_image_event()\n *\n * User has done something in the image window - mouse down or up.  Work out\n * what it is and do something with it.\n * If DOWN - just remember where it was.\n * If UP - for each word in the selected area do the operation defined by\n * the current mode.\n */\nvoid Tesseract::process_image_event( // action in image win\n    const SVEvent &event) {\n  // The following variable should remain static, since it is used by\n  // debug editor, which uses a single Tesseract instance.\n  static ICOORD down;\n  ICOORD up;\n  TBOX selection_box;\n  char msg[80];\n\n  switch (event.type) {\n    case SVET_SELECTION:\n      if (event.type == SVET_SELECTION) {\n        down.set_x(event.x + event.x_size);\n        down.set_y(event.y + event.y_size);\n        if (mode == SHOW_POINT_CMD_EVENT) {\n          show_point(current_page_res, event.x, event.y);\n        }\n      }\n\n      up.set_x(event.x);\n      up.set_y(event.y);\n\n      selection_box = TBOX(down, up);\n\n      switch (mode) {\n        case CHANGE_DISP_CMD_EVENT:\n          process_selected_words(current_page_res, selection_box,\n                                 &tesseract::Tesseract::word_blank_and_set_display);\n          break;\n        case DUMP_WERD_CMD_EVENT:\n          process_selected_words(current_page_res, selection_box,\n                                 &tesseract::Tesseract::word_dumper);\n          break;\n        case SHOW_BLN_WERD_CMD_EVENT:\n          process_selected_words(current_page_res, selection_box,\n                                 &tesseract::Tesseract::word_bln_display);\n          break;\n        case DEBUG_WERD_CMD_EVENT:\n          debug_word(current_page_res, selection_box);\n          break;\n        case SHOW_POINT_CMD_EVENT:\n          break; // ignore up event\n\n        case 
RECOG_WERDS:\n#  ifndef DISABLED_LEGACY_ENGINE\n          image_win->AddMessage(\"Recogging selected words\");\n          this->process_selected_words(current_page_res, selection_box,\n                                       &Tesseract::recog_interactive);\n#  endif // ndef DISABLED_LEGACY_ENGINE\n          break;\n        case RECOG_PSEUDO:\n          image_win->AddMessage(\"Recogging selected blobs\");\n          recog_pseudo_word(current_page_res, selection_box);\n          break;\n        case SHOW_BLOB_FEATURES:\n          blob_feature_display(current_page_res, selection_box);\n          break;\n\n        default:\n          snprintf(msg, sizeof(msg), \"Mode %d not yet implemented\", mode);\n          image_win->AddMessage(msg);\n          break;\n      }\n    default:\n      break;\n  }\n}\n\n/**\n * debug_word\n *\n * Process the whole image, but load word_config_ for the selected word(s).\n */\nvoid Tesseract::debug_word(PAGE_RES *page_res, const TBOX &selection_box) {\n#  ifndef DISABLED_LEGACY_ENGINE\n  ResetAdaptiveClassifier();\n#  endif\n  recog_all_words(page_res, nullptr, &selection_box, word_config_.c_str(), 0);\n}\n\n/**********************************************************************\n * WERD PROCESSOR FUNCTIONS\n * ========================\n *\n * These routines are invoked by one or more of:\n *    process_all_words()\n *    process_selected_words()\n * or\n *    process_all_words_it()\n *    process_selected_words_it()\n * for each word to be processed\n **********************************************************************/\n\n/**\n * word_blank_and_set_display()  Word processor\n *\n * Blank display of word then redisplay word according to current display mode\n * settings\n */\n\nbool Tesseract::word_blank_and_set_display(PAGE_RES_IT *pr_it) {\n  pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK, ScrollView::BLACK);\n  return word_set_display(pr_it);\n}\n\n/**\n * word_bln_display()\n *\n * Normalize word and display in 
word window\n */\nbool Tesseract::word_bln_display(PAGE_RES_IT *pr_it) {\n  WERD_RES *word_res = pr_it->word();\n  if (word_res->chopped_word == nullptr) {\n    // Setup word normalization parameters.\n    word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,\n                                  classify_bln_numeric_mode, textord_use_cjk_fp_model,\n                                  poly_allow_detailed_fx, pr_it->row()->row, pr_it->block()->block);\n  }\n  bln_word_window_handle()->Clear();\n  display_bln_lines(bln_word_window_handle(), ScrollView::CYAN, 1.0, 0.0f, -1000.0f, 1000.0f);\n  C_BLOB_IT it(word_res->word->cblob_list());\n  ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN, bln_word_window_handle());\n    color = WERD::NextColor(color);\n  }\n  bln_word_window_handle()->Update();\n  return true;\n}\n\n/**\n *  word_display()  Word Processor\n *\n *  Display a word according to its display modes\n */\nbool Tesseract::word_display(PAGE_RES_IT *pr_it) {\n  WERD_RES *word_res = pr_it->word();\n  WERD *word = word_res->word;\n  TBOX word_bb;    // word bounding box\n  bool displayed_something = false;\n\n  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {\n#  ifndef DISABLED_LEGACY_ENGINE\n    BoxWord *box_word = word_res->box_word;\n    WERD_CHOICE *best_choice = word_res->best_choice;\n    int length = box_word->length();\n    if (word_res->fontinfo == nullptr) {\n      return false;\n    }\n    const FontInfo &font_info = *word_res->fontinfo;\n    for (int i = 0; i < length; ++i) {\n      ScrollView::Color color = ScrollView::GREEN;\n      switch (color_mode) {\n        case CM_SUBSCRIPT:\n          if (best_choice->BlobPosition(i) == SP_SUBSCRIPT) {\n            color = ScrollView::RED;\n          }\n          break;\n        case CM_SUPERSCRIPT:\n          if 
(best_choice->BlobPosition(i) == SP_SUPERSCRIPT) {\n            color = ScrollView::RED;\n          }\n          break;\n        case CM_ITALIC:\n          if (font_info.is_italic()) {\n            color = ScrollView::RED;\n          }\n          break;\n        case CM_BOLD:\n          if (font_info.is_bold()) {\n            color = ScrollView::RED;\n          }\n          break;\n        case CM_FIXEDPITCH:\n          if (font_info.is_fixed_pitch()) {\n            color = ScrollView::RED;\n          }\n          break;\n        case CM_SERIF:\n          if (font_info.is_serif()) {\n            color = ScrollView::RED;\n          }\n          break;\n        case CM_SMALLCAPS:\n          if (word_res->small_caps) {\n            color = ScrollView::RED;\n          }\n          break;\n        case CM_DROPCAPS:\n          if (best_choice->BlobPosition(i) == SP_DROPCAP) {\n            color = ScrollView::RED;\n          }\n          break;\n          // TODO(rays) underline is currently completely unsupported.\n        case CM_UNDERLINE:\n        default:\n          break;\n      }\n      image_win->Pen(color);\n      TBOX box = box_word->BlobBox(i);\n      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());\n    }\n    return true;\n#  else\n    return false;\n#  endif // ndef DISABLED_LEGACY_ENGINE\n  }\n  /*\n  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)\n  etc. 
are to keep the compiler happy.\n*/\n  // display bounding box\n  if (word->display_flag(DF_BOX)) {\n    word->bounding_box().plot(image_win,\n                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),\n                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));\n\n    auto c = static_cast<ScrollView::Color>((int32_t)editor_image_blob_bb_color);\n    image_win->Pen(c);\n    // cblob iterator\n    C_BLOB_IT c_it(word->cblob_list());\n    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {\n      c_it.data()->bounding_box().plot(image_win);\n    }\n    displayed_something = true;\n  }\n\n  // display edge steps\n  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available\n    word->plot(image_win);                // rainbow colors\n    displayed_something = true;\n  }\n\n  // display poly approx\n  if (word->display_flag(DF_POLYGONAL)) {\n    // need to convert\n    TWERD *tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);\n    tword->plot(image_win);\n    delete tword;\n    displayed_something = true;\n  }\n\n  // Display correct text and blamer information.\n  std::string text;\n  std::string blame;\n  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {\n    text = word->text();\n  }\n  if (word->display_flag(DF_BLAMER) &&\n      !(word_res->blamer_bundle != nullptr &&\n        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {\n    text = \"\";\n    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;\n    if (blamer_bundle == nullptr) {\n      text += \"NULL\";\n    } else {\n      text = blamer_bundle->TruthString();\n    }\n    text += \" -> \";\n    std::string best_choice_str;\n    if (word_res->best_choice == nullptr) {\n      best_choice_str = \"NULL\";\n    } else {\n      word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);\n    }\n    text += best_choice_str;\n    IncorrectResultReason reason 
=\n        (blamer_bundle == nullptr) ? IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();\n    ASSERT_HOST(reason < IRR_NUM_REASONS);\n    blame += \" [\";\n    blame += BlamerBundle::IncorrectReasonName(reason);\n    blame += \"]\";\n  }\n  if (text.length() > 0) {\n    word_bb = word->bounding_box();\n    image_win->Pen(ScrollView::RED);\n    auto word_height = word_bb.height();\n    int text_height = word_height / 2;\n    if (text_height > 20) {\n      text_height = 20;\n    }\n    image_win->TextAttributes(\"Arial\", text_height, false, false, false);\n    // from bot left\n    float shift = (word_height < word_bb.width()) ? 0.25f * word_height : 0.0f;\n    image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.c_str());\n    if (blame.length() > 0) {\n      image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height,\n                      blame.c_str());\n    }\n\n    displayed_something = true;\n  }\n\n  if (!displayed_something) { // display BBox anyway\n    word->bounding_box().plot(image_win,\n                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),\n                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));\n  }\n  return true;\n}\n} // namespace tesseract\n#endif // !GRAPHICS_DISABLED\n\nnamespace tesseract {\n/**\n * word_dumper()\n *\n * Dump members to the debug window\n */\nbool Tesseract::word_dumper(PAGE_RES_IT *pr_it) {\n  if (pr_it->block()->block != nullptr) {\n    tprintf(\"\\nBlock data...\\n\");\n    pr_it->block()->block->print(nullptr, false);\n  }\n  tprintf(\"\\nRow data...\\n\");\n  pr_it->row()->row->print(nullptr);\n  tprintf(\"\\nWord data...\\n\");\n  WERD_RES *word_res = pr_it->word();\n  word_res->word->print();\n  if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&\n      word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {\n    tprintf(\"Current 
blamer debug: %s\\n\", word_res->blamer_bundle->debug().c_str());\n  }\n  return true;\n}\n\n#ifndef GRAPHICS_DISABLED\n/**\n * word_set_display()  Word processor\n *\n * Display word according to current display mode settings\n */\nbool Tesseract::word_set_display(PAGE_RES_IT *pr_it) {\n  WERD *word = pr_it->word()->word;\n  word->set_display_flag(DF_BOX, word_display_mode[DF_BOX]);\n  word->set_display_flag(DF_TEXT, word_display_mode[DF_TEXT]);\n  word->set_display_flag(DF_POLYGONAL, word_display_mode[DF_POLYGONAL]);\n  word->set_display_flag(DF_EDGE_STEP, word_display_mode[DF_EDGE_STEP]);\n  word->set_display_flag(DF_BN_POLYGONAL, word_display_mode[DF_BN_POLYGONAL]);\n  word->set_display_flag(DF_BLAMER, word_display_mode[DF_BLAMER]);\n  return word_display(pr_it);\n}\n\n// page_res is non-const because the iterator doesn't know if you are going\n// to change the items it points to! Really a const here though.\nvoid Tesseract::blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box) {\n#  ifndef DISABLED_LEGACY_ENGINE\n  PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);\n  if (it != nullptr) {\n    WERD_RES *word_res = it->word();\n    word_res->x_height = it->row()->row->x_height();\n    word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,\n                                  classify_bln_numeric_mode, textord_use_cjk_fp_model,\n                                  poly_allow_detailed_fx, it->row()->row, it->block()->block);\n    TWERD *bln_word = word_res->chopped_word;\n    TBLOB *bln_blob = bln_word->blobs[0];\n    INT_FX_RESULT_STRUCT fx_info;\n    std::vector<INT_FEATURE_STRUCT> bl_features;\n    std::vector<INT_FEATURE_STRUCT> cn_features;\n    Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features, &cn_features,\n                              &fx_info, nullptr);\n    // Display baseline features.\n    ScrollView *bl_win = CreateFeatureSpaceWindow(\"BL Features\", 512, 0);\n    
ClearFeatureSpaceWindow(baseline, bl_win);\n    for (auto &bl_feature : bl_features) {\n      RenderIntFeature(bl_win, &bl_feature, ScrollView::GREEN);\n    }\n    bl_win->Update();\n    // Display cn features.\n    ScrollView *cn_win = CreateFeatureSpaceWindow(\"CN Features\", 512, 0);\n    ClearFeatureSpaceWindow(character, cn_win);\n    for (auto &cn_feature : cn_features) {\n      RenderIntFeature(cn_win, &cn_feature, ScrollView::GREEN);\n    }\n    cn_win->Update();\n\n    it->DeleteCurrentWord();\n    delete it;\n  }\n#  endif // ndef DISABLED_LEGACY_ENGINE\n}\n\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/pgedit.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        pgedit.h\n// Description: Page structure file editor\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef PGEDIT_H\n#define PGEDIT_H\n\n#include \"params.h\"     // for INT_VAR_H, IntParam, STRING_VAR_H, StringParam\n#include \"scrollview.h\" // for SVEvent (ptr only), SVEventHandler, ScrollView\n\nnamespace tesseract {\n\nclass BLOCK_LIST;\nclass PAGE_RES;\n\nclass Tesseract;\n\n#ifndef GRAPHICS_DISABLED\n// A small event handler class to process incoming events to\n// this window.\nclass PGEventHandler : public SVEventHandler {\npublic:\n  PGEventHandler(tesseract::Tesseract *tess) : tess_(tess) {}\n  void Notify(const SVEvent *sve) override;\n\nprivate:\n  tesseract::Tesseract *tess_;\n};\n#endif // !GRAPHICS_DISABLED\n\nextern BLOCK_LIST *current_block_list;\nextern STRING_VAR_H(editor_image_win_name);\nextern INT_VAR_H(editor_image_xpos);\nextern INT_VAR_H(editor_image_ypos);\nextern INT_VAR_H(editor_image_word_bb_color);\nextern INT_VAR_H(editor_image_blob_bb_color);\nextern STRING_VAR_H(editor_word_name);\nextern INT_VAR_H(editor_word_xpos);\nextern INT_VAR_H(editor_word_ypos);\nextern INT_VAR_H(editor_word_height);\nextern INT_VAR_H(editor_word_width);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccmain/recogtraining.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        recogtraining.cpp\n// Description: Functions for ambiguity and parameter training.\n// Author:      Daria Antonova\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"tesseractclass.h\"\n\n#include \"boxread.h\"\n#include \"control.h\"\n#include \"host.h\" // for NearlyEqual\n#include \"ratngs.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"reject.h\"\n#endif\n#include \"stopper.h\"\n\nnamespace tesseract {\n\nconst int16_t kMaxBoxEdgeDiff = 2;\n\n// Sets flags necessary for recognition in the training mode.\n// Opens and returns the pointer to the output file.\nFILE *Tesseract::init_recog_training(const char *filename) {\n  if (tessedit_ambigs_training) {\n    tessedit_tess_adaption_mode.set_value(0); // turn off adaption\n    tessedit_enable_doc_dict.set_value(false); // turn off document dictionary\n    // Explore all segmentations.\n    getDict().stopper_no_acceptable_choices.set_value(true);\n  }\n\n  std::string output_fname = filename;\n  const char *lastdot = strrchr(output_fname.c_str(), '.');\n  if (lastdot != nullptr) {\n    output_fname[lastdot - output_fname.c_str()] = '\\0';\n  }\n  output_fname += \".txt\";\n  FILE *output_file = fopen(output_fname.c_str(), \"a+\");\n  if (output_file == nullptr) {\n    tprintf(\"Error: 
Could not open file %s\\n\", output_fname.c_str());\n    ASSERT_HOST(output_file);\n  }\n  return output_file;\n}\n\n// Copies the bounding box from page_res_it->word() to the given TBOX.\nstatic bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {\n  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {\n    page_res_it->forward();\n  }\n\n  if (page_res_it->word() != nullptr) {\n    *tbox = page_res_it->word()->word->bounding_box();\n\n    // If tbox->left() is negative, the training image has vertical text and\n    // all the coordinates of bounding boxes of page_res are rotated by 90\n    // degrees in a counterclockwise direction. We need to rotate the TBOX back\n    // in order to compare with the TBOXes of box files.\n    if (tbox->left() < 0) {\n      tbox->rotate(FCOORD(0.0, -1.0));\n    }\n\n    return true;\n  } else {\n    return false;\n  }\n}\n\n// This function takes tif/box pair of files and runs recognition on the image,\n// while making sure that the word bounds that tesseract identified roughly\n// match to those specified by the input box file. 
For each word (ngram in a\n// single bounding box from the input box file) it outputs the ocred result,\n// the correct label, rating and certainty.\nvoid Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,\n                                         volatile ETEXT_DESC *monitor, FILE *output_file) {\n  std::string box_fname = filename;\n  const char *lastdot = strrchr(box_fname.c_str(), '.');\n  if (lastdot != nullptr) {\n    box_fname[lastdot - box_fname.c_str()] = '\\0';\n  }\n  box_fname += \".box\";\n  // ReadNextBox() will close box_file\n  FILE *box_file = fopen(box_fname.c_str(), \"r\");\n  if (box_file == nullptr) {\n    tprintf(\"Error: Could not open file %s\\n\", box_fname.c_str());\n    ASSERT_HOST(box_file);\n  }\n\n  PAGE_RES_IT page_res_it;\n  page_res_it.page_res = page_res;\n  page_res_it.restart_page();\n  std::string label;\n\n  // Process all the words on this page.\n  TBOX tbox; // tesseract-identified box\n  TBOX bbox; // box from the box file\n  bool keep_going;\n  int line_number = 0;\n  int examined_words = 0;\n  do {\n    keep_going = read_t(&page_res_it, &tbox);\n    keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);\n    // Align bottom left points of the TBOXes.\n    while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {\n      if (bbox.bottom() < tbox.bottom()) {\n        page_res_it.forward();\n        keep_going = read_t(&page_res_it, &tbox);\n      } else {\n        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);\n      }\n    }\n    while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {\n      if (bbox.left() > tbox.left()) {\n        page_res_it.forward();\n        keep_going = read_t(&page_res_it, &tbox);\n      } else {\n        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);\n      }\n    }\n    // OCR the word if top right points of the TBOXes are 
similar.\n    if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&\n        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {\n      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);\n      examined_words++;\n    }\n    page_res_it.forward();\n  } while (keep_going);\n\n  // Set up scripts on all of the words that did not get sent to\n  // ambigs_classify_and_output.  They all should have, but if all the\n  // werd_res's don't get uch_sets, tesseract will crash when you try\n  // to iterate over them. :-(\n  int total_words = 0;\n  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {\n    if (page_res_it.word()) {\n      if (page_res_it.word()->uch_set == nullptr) {\n        page_res_it.word()->SetupFake(unicharset);\n      }\n      total_words++;\n    }\n  }\n  if (examined_words < 0.85 * total_words) {\n    tprintf(\n        \"TODO(antonova): clean up recog_training_segmented; \"\n        \" It examined only a small fraction of the ambigs image.\\n\");\n  }\n  tprintf(\"recog_training_segmented: examined %d / %d words.\\n\", examined_words, total_words);\n}\n\n// Helper prints the given set of blob choices.\nstatic void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,\n                      const char *label, FILE *output_file) {\n  float rating = 0.0f;\n  float certainty = 0.0f;\n  for (int i = 0; i < length; ++i) {\n    const BLOB_CHOICE *blob_choice = blob_choices[i];\n    fprintf(output_file, \"%s\", unicharset.id_to_unichar(blob_choice->unichar_id()));\n    rating += blob_choice->rating();\n    if (certainty > blob_choice->certainty()) {\n      certainty = blob_choice->certainty();\n    }\n  }\n  fprintf(output_file, \"\\t%s\\t%.4f\\t%.4f\\n\", label, rating, certainty);\n}\n\n// Helper recursively prints all paths through the ratings matrix, starting\n// at column col.\nstatic void PrintMatrixPaths(int col, int dim, const 
MATRIX &ratings, int length,\n                             const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,\n                             const char *label, FILE *output_file) {\n  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {\n    if (ratings.get(col, row) != NOT_CLASSIFIED) {\n      BLOB_CHOICE_IT bc_it(ratings.get(col, row));\n      for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {\n        blob_choices[length] = bc_it.data();\n        if (row + 1 < dim) {\n          PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,\n                           output_file);\n        } else {\n          PrintPath(length + 1, blob_choices, unicharset, label, output_file);\n        }\n      }\n    }\n  }\n}\n\n// Runs classify_word_pass1() on the current word. Outputs Tesseract's\n// raw choice as a result of the classification. For words labeled with a\n// single unichar also outputs all alternatives from blob_choices of the\n// best choice.\nvoid Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,\n                                           FILE *output_file) {\n  // Classify word.\n  fflush(stdout);\n  WordData word_data(*pr_it);\n  SetupWordPassN(1, &word_data);\n  classify_word_and_language(1, pr_it, &word_data);\n  WERD_RES *werd_res = word_data.word;\n  WERD_CHOICE *best_choice = werd_res->best_choice;\n  ASSERT_HOST(best_choice != nullptr);\n\n  // Compute the number of unichars in the label.\n  std::vector<UNICHAR_ID> encoding;\n  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {\n    tprintf(\"Not outputting illegal unichar %s\\n\", label);\n    return;\n  }\n\n  // Dump all paths through the ratings matrix (which is normally small).\n  int dim = werd_res->ratings->dimension();\n  const auto **blob_choices = new const BLOB_CHOICE *[dim];\n  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, 
output_file);\n  delete[] blob_choices;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/reject.cpp",
    "content": "/**********************************************************************\n * File:        reject.cpp  (Formerly reject.c)\n * Description: Rejection functions used in tessedit\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#ifdef DISABLED_LEGACY_ENGINE\n\n#  include \"tesseractclass.h\"\n\nnamespace tesseract {\n\nint16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {\n  const WERD_CHOICE &word = *werd_res->best_choice;\n  int dict_word_type = werd_res->tesseract->dict_word(word);\n  return dict_word_type == DOC_DAWG_PERM ? 
0 : dict_word_type;\n}\n} // namespace tesseract\n\n#else\n\n#  include \"reject.h\"\n\n#  include \"control.h\"\n#  include \"docqual.h\"\n#  include \"tesseractclass.h\"\n#  include \"tessvars.h\"\n\n#  include \"helpers.h\"\n\n#  include <algorithm> // for std::sort\n#  include <cctype>\n#  include <cerrno>\n#  include <cstring>\n#  include <vector> // for std::vector\n\nnamespace tesseract {\n\n/*************************************************************************\n * set_done()\n *\n * Set the done flag based on the word acceptability criteria\n *************************************************************************/\n\nvoid Tesseract::set_done(WERD_RES *word, int16_t pass) {\n  word->done =\n      word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);\n  bool word_is_ambig = word->best_choice->dangerous_ambig_found();\n  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||\n                        word->best_choice->permuter() == FREQ_DAWG_PERM ||\n                        word->best_choice->permuter() == USER_DAWG_PERM;\n  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&\n      one_ell_conflict(word, false)) {\n    if (tessedit_rejection_debug) {\n      tprintf(\"one_ell_conflict detected\\n\");\n    }\n    word->done = false;\n  }\n  if (word->done &&\n      ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {\n    if (tessedit_rejection_debug) {\n      tprintf(\"non-dict or ambig word detected\\n\");\n    }\n    word->done = false;\n  }\n  if (tessedit_rejection_debug) {\n    tprintf(\"set_done(): done=%d\\n\", word->done);\n    word->best_choice->print(\"\");\n  }\n}\n\n/*************************************************************************\n * make_reject_map()\n *\n * Sets the done flag to indicate whether the resylt is acceptable.\n *\n * Sets a reject map for the word.\n 
*************************************************************************/\nvoid Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {\n  flip_0O(word);\n  check_debug_pt(word, -1); // For trap only\n  set_done(word, pass);     // Set acceptance\n  word->reject_map.initialise(word->best_choice->unichar_lengths().length());\n  reject_blanks(word);\n  /*\n0: Rays original heuristic - the baseline\n*/\n  if (tessedit_reject_mode == 0) {\n    if (!word->done) {\n      reject_poor_matches(word);\n    }\n  } else if (tessedit_reject_mode == 5) {\n    /*\n5: Reject I/1/l from words where there is no strong contextual confirmation;\n  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);\n  and the whole of any words which are very small\n*/\n    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {\n      word->reject_map.rej_word_small_xht();\n    } else {\n      one_ell_conflict(word, true);\n      /*\n  Originally the code here just used the done flag. Now I have duplicated\n  and unpacked the conditions for setting the done flag so that each\n  mechanism can be turned on or off independently. 
This works WITHOUT\n  affecting the done flag setting.\n*/\n      if (rej_use_tess_accepted && !word->tess_accepted) {\n        word->reject_map.rej_word_not_tess_accepted();\n      }\n\n      if (rej_use_tess_blanks &&\n          (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {\n        word->reject_map.rej_word_contains_blanks();\n      }\n\n      WERD_CHOICE *best_choice = word->best_choice;\n      if (rej_use_good_perm) {\n        if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||\n             best_choice->permuter() == FREQ_DAWG_PERM ||\n             best_choice->permuter() == USER_DAWG_PERM) &&\n            (!rej_use_sensible_wd ||\n             acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),\n                                    best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {\n          // PASSED TEST\n        } else if (best_choice->permuter() == NUMBER_PERM) {\n          if (rej_alphas_in_number_perm) {\n            for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\\0';\n                 offset += best_choice->unichar_lengths()[i++]) {\n              if (word->reject_map[i].accepted() &&\n                  word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,\n                                             best_choice->unichar_lengths()[i])) {\n                word->reject_map[i].setrej_bad_permuter();\n              }\n              // rej alpha\n            }\n          }\n        } else {\n          word->reject_map.rej_word_bad_permuter();\n        }\n      }\n      /* Ambig word rejection was here once !!*/\n    }\n  } else {\n    tprintf(\"BAD tessedit_reject_mode\\n\");\n    ASSERT_HOST(\"Fatal error encountered!\" == nullptr);\n  }\n\n  if (tessedit_image_border > -1) {\n    reject_edge_blobs(word);\n  }\n\n  check_debug_pt(word, 10);\n  if (tessedit_rejection_debug) {\n    tprintf(\"Permuter Type = %d\\n\", 
word->best_choice->permuter());\n    tprintf(\"Certainty: %f     Rating: %f\\n\", word->best_choice->certainty(),\n            word->best_choice->rating());\n    tprintf(\"Dict word: %d\\n\", dict_word(*(word->best_choice)));\n  }\n\n  flip_hyphens(word);\n  check_debug_pt(word, 20);\n}\n\nvoid reject_blanks(WERD_RES *word) {\n  int16_t i;\n  int16_t offset;\n\n  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\\0';\n       offset += word->best_choice->unichar_lengths()[i], i += 1) {\n    if (word->best_choice->unichar_string()[offset] == ' ') {\n      // rej unrecognised blobs\n      word->reject_map[i].setrej_tess_failure();\n    }\n  }\n}\n\nvoid Tesseract::reject_I_1_L(WERD_RES *word) {\n  int16_t i;\n  int16_t offset;\n\n  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\\0';\n       offset += word->best_choice->unichar_lengths()[i], i += 1) {\n    if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {\n      // rej 1Il conflict\n      word->reject_map[i].setrej_1Il_conflict();\n    }\n  }\n}\n\nvoid reject_poor_matches(WERD_RES *word) {\n  float threshold = compute_reject_threshold(word->best_choice);\n  for (unsigned i = 0; i < word->best_choice->length(); ++i) {\n    if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {\n      word->reject_map[i].setrej_tess_failure();\n    } else if (word->best_choice->certainty(i) < threshold) {\n      word->reject_map[i].setrej_poor_match();\n    }\n  }\n}\n\n/**********************************************************************\n * compute_reject_threshold\n *\n * Set a rejection threshold for this word.\n * Initially this is a trivial function which looks for the largest\n * gap in the certainty value.\n **********************************************************************/\n\nfloat compute_reject_threshold(WERD_CHOICE *word) {\n  float threshold;      // rejection threshold\n  float bestgap = 0.0f; // biggest gap\n  float gapstart;       // 
bottom of gap\n\n  auto blob_count = word->length();\n  std::vector<float> ratings;\n  ratings.reserve(blob_count);\n  for (unsigned i = 0; i < blob_count; ++i) {\n    ratings.push_back(word->certainty(i));\n  }\n  std::sort(ratings.begin(), ratings.end());\n  gapstart = ratings[0] - 1; // all reject if none better\n  if (blob_count >= 3) {\n    for (unsigned index = 0; index < blob_count - 1; index++) {\n      if (ratings[index + 1] - ratings[index] > bestgap) {\n        bestgap = ratings[index + 1] - ratings[index];\n        // find biggest\n        gapstart = ratings[index];\n      }\n    }\n  }\n  threshold = gapstart + bestgap / 2;\n\n  return threshold;\n}\n\n/*************************************************************************\n * reject_edge_blobs()\n *\n * If the word is perilously close to the edge of the image, reject those blobs\n * in the word which are too close to the edge as they could be clipped.\n *************************************************************************/\nvoid Tesseract::reject_edge_blobs(WERD_RES *word) {\n  TBOX word_box = word->word->bounding_box();\n  // Use the box_word as it is already denormed back to image coordinates.\n  int blobcount = word->box_word->length();\n\n  if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||\n      word_box.right() + tessedit_image_border > ImageWidth() - 1 ||\n      word_box.top() + tessedit_image_border > ImageHeight() - 1) {\n    ASSERT_HOST(word->reject_map.length() == blobcount);\n    for (int blobindex = 0; blobindex < blobcount; blobindex++) {\n      TBOX blob_box = word->box_word->BlobBox(blobindex);\n      if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||\n          blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||\n          blob_box.top() + tessedit_image_border > ImageHeight() - 1) {\n        word->reject_map[blobindex].setrej_edge_char();\n        // Close to edge\n      }\n    }\n  
}\n}\n\n/**********************************************************************\n * one_ell_conflict()\n *\n * Identify words where there is a potential I/l/1 error.\n * - A bundle of contextual heuristics!\n **********************************************************************/\nbool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {\n  const char *word;\n  const char *lengths;\n  int16_t word_len; // its length\n  int16_t first_alphanum_index_;\n  int16_t first_alphanum_offset_;\n  int16_t i;\n  int16_t offset;\n  bool non_conflict_set_char; // non conf set a/n?\n  ACCEPTABLE_WERD_TYPE word_type;\n  bool dict_perm_type;\n  bool dict_word_ok;\n  int dict_word_type;\n\n  word = word_res->best_choice->unichar_string().c_str();\n  lengths = word_res->best_choice->unichar_lengths().c_str();\n  word_len = strlen(lengths);\n  /*\n  If there are no occurrences of the conflict set characters then the word\n  is OK.\n*/\n  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {\n    return false;\n  }\n\n  /*\n  There is a conflict if there are NO other (confirmed) alphanumerics apart\n  from those in the conflict set.\n*/\n\n  for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;\n       offset += lengths[i++]) {\n    non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||\n                             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&\n                            !conflict_set_I_l_1.contains(word[offset]);\n  }\n  if (!non_conflict_set_char) {\n    if (update_map) {\n      reject_I_1_L(word_res);\n    }\n    return true;\n  }\n\n  /*\n  If the word is accepted by a dawg permuter, and the first alpha character\n  is \"I\" or \"l\", check to see if the alternative is also a dawg word. 
If it\n  is, then there is a potential error otherwise the word is ok.\n*/\n\n  dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||\n                   (word_res->best_choice->permuter() == USER_DAWG_PERM) ||\n                   (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||\n                   (word_res->best_choice->permuter() == FREQ_DAWG_PERM);\n  dict_word_type = dict_word(*(word_res->best_choice));\n  dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));\n\n  if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||\n      (dict_perm_type && dict_word_ok)) {\n    first_alphanum_index_ = first_alphanum_index(word, lengths);\n    first_alphanum_offset_ = first_alphanum_offset(word, lengths);\n    if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {\n      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';\n      if (safe_dict_word(word_res) > 0) {\n        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';\n        if (update_map) {\n          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();\n        }\n        return true;\n      } else {\n        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';\n        return false;\n      }\n    }\n\n    if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {\n      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';\n      if (safe_dict_word(word_res) > 0) {\n        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';\n        if (update_map) {\n          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();\n        }\n        return true;\n      } else {\n        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';\n        return false;\n      }\n    }\n    return false;\n  }\n\n  
/*\n  NEW 1Il code. The old code relied on permuter types too much. In fact,\n  tess will use TOP_CHOICE permute for good things like \"palette\".\n  In this code the string is examined independently to see if it looks like\n  a well formed word.\n*/\n\n  /*\n  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a\n  dictionary word.\n*/\n  first_alphanum_index_ = first_alphanum_index(word, lengths);\n  first_alphanum_offset_ = first_alphanum_offset(word, lengths);\n  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {\n    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';\n    if (safe_dict_word(word_res) > 0) {\n      return false;\n    } else {\n      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';\n    }\n  } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {\n    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';\n    if (safe_dict_word(word_res) > 0) {\n      return false;\n    } else {\n      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';\n    }\n  }\n  /*\n  For strings containing digits:\n    If there are no alphas OR the numeric permuter liked the word,\n      reject any non 1 conflict chs\n    Else reject all conflict chs\n*/\n  if (word_contains_non_1_digit(word, lengths)) {\n    bool allow_1s =\n        (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);\n\n    int16_t offset;\n    bool conflict = false;\n    for (i = 0, offset = 0; word[offset] != '\\0';\n         offset += word_res->best_choice->unichar_lengths()[i++]) {\n      if ((!allow_1s || (word[offset] != '1')) &&\n          conflict_set_I_l_1.contains(word[offset])) {\n        if (update_map) {\n          word_res->reject_map[i].setrej_1Il_conflict();\n        }\n        conflict = true;\n      }\n    }\n    return conflict;\n  }\n  /*\n  For anything else. 
See if it conforms to an acceptable word type. If so,\n  treat accordingly.\n*/\n  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);\n  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {\n    first_alphanum_index_ = first_alphanum_index(word, lengths);\n    first_alphanum_offset_ = first_alphanum_offset(word, lengths);\n    if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {\n      if (update_map) {\n        word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();\n      }\n      return true;\n    } else {\n      return false;\n    }\n  } else if (word_type == AC_UPPER_CASE) {\n    return false;\n  } else {\n    if (update_map) {\n      reject_I_1_L(word_res);\n    }\n    return true;\n  }\n}\n\nint16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {\n  int16_t i;\n  int16_t offset;\n\n  for (i = 0, offset = 0; word[offset] != '\\0'; offset += word_lengths[i++]) {\n    if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||\n        unicharset.get_isdigit(word + offset, word_lengths[i])) {\n      return i;\n    }\n  }\n  return -1;\n}\n\nint16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {\n  int16_t i;\n  int16_t offset;\n\n  for (i = 0, offset = 0; word[offset] != '\\0'; offset += word_lengths[i++]) {\n    if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||\n        unicharset.get_isdigit(word + offset, word_lengths[i])) {\n      return offset;\n    }\n  }\n  return -1;\n}\n\nint16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {\n  int16_t i;\n  int16_t offset;\n  int16_t count = 0;\n\n  for (i = 0, offset = 0; word[offset] != '\\0'; offset += word_lengths[i++]) {\n    if (unicharset.get_isalpha(word + offset, word_lengths[i])) {\n      count++;\n    }\n  }\n  return count;\n}\n\nbool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {\n  int16_t i;\n  int16_t 
offset;\n\n  for (i = 0, offset = 0; word[offset] != '\\0'; offset += word_lengths[i++]) {\n    if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&\n        (word_lengths[i] != 1 || word[offset] != '1')) {\n      return true;\n    }\n  }\n  return false;\n}\n\n/*************************************************************************\n * dont_allow_1Il()\n * Don't unreject LONE accepted 1Il conflict set chars\n *************************************************************************/\nvoid Tesseract::dont_allow_1Il(WERD_RES *word) {\n  int word_len = word->reject_map.length();\n  const char *s = word->best_choice->unichar_string().c_str();\n  const char *lengths = word->best_choice->unichar_lengths().c_str();\n  bool accepted_1Il = false;\n\n  for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {\n    if (word->reject_map[i].accepted()) {\n      if (conflict_set_I_l_1.contains(s[offset])) {\n        accepted_1Il = true;\n      } else {\n        if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||\n            word->uch_set->get_isdigit(s + offset, lengths[i])) {\n          return; // >=1 non 1Il ch accepted\n        }\n      }\n    }\n  }\n  if (!accepted_1Il) {\n    return; // Nothing to worry about\n  }\n\n  for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {\n    if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {\n      word->reject_map[i].setrej_postNN_1Il();\n    }\n  }\n}\n\nint16_t Tesseract::count_alphanums(WERD_RES *word_res) {\n  int count = 0;\n  const WERD_CHOICE *best_choice = word_res->best_choice;\n  for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {\n    if ((word_res->reject_map[i].accepted()) &&\n        (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||\n         word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {\n      count++;\n    }\n  }\n  return count;\n}\n\n// reject all if most rejected.\nvoid Tesseract::reject_mostly_rejects(WERD_RES *word) {\n  /* Reject the whole of the word if the fraction of rejects exceeds a limit */\n\n  if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=\n      rej_whole_of_mostly_reject_word_fract) {\n    word->reject_map.rej_word_mostly_rej();\n  }\n}\n\nbool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {\n  if (word->best_choice->unichar_lengths().length() <= 1) {\n    return false;\n  }\n\n  if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {\n    return false;\n  }\n\n  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);\n  for (unsigned i = 1; i < word->best_choice->length(); ++i) {\n    if (word->best_choice->unichar_id(i) != uch_id) {\n      return false;\n    }\n  }\n\n  int16_t char_quality;\n  int16_t accepted_char_quality;\n  word_char_quality(word, &char_quality, &accepted_char_quality);\n\n  if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&\n      (char_quality == accepted_char_quality)) {\n    return true;\n  } else {\n    return false;\n  }\n}\n\nint16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {\n  const WERD_CHOICE &word = *werd_res->best_choice;\n  int dict_word_type = werd_res->tesseract->dict_word(word);\n  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;\n}\n\n// Note: After running this function word_res->ratings\n// might not contain the right BLOB_CHOICE corresponding to each character\n// in word_res->best_choice.\nvoid Tesseract::flip_hyphens(WERD_RES *word_res) {\n  WERD_CHOICE *best_choice = word_res->best_choice;\n  int prev_right = -9999;\n  int next_left;\n  TBOX out_box;\n  float aspect_ratio;\n\n  if (tessedit_lower_flip_hyphen <= 1) {\n    return;\n  }\n\n  auto num_blobs = word_res->rebuild_word->NumBlobs();\n  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id(\"-\");\n  for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {\n    TBLOB *blob = word_res->rebuild_word->blobs[i];\n    out_box = blob->bounding_box();\n    if (i + 1 == num_blobs) {\n      next_left = 9999;\n    } else {\n      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();\n    }\n    // Don't touch small or touching blobs - it is too dangerous.\n    if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&\n        (out_box.right() < next_left)) {\n      aspect_ratio = out_box.width() / static_cast<float>(out_box.height());\n      if (word_res->uch_set->eq(best_choice->unichar_id(i), \".\")) {\n        if (aspect_ratio >= tessedit_upper_flip_hyphen &&\n            word_res->uch_set->contains_unichar_id(unichar_dash) &&\n            word_res->uch_set->get_enabled(unichar_dash)) {\n          /* Certain HYPHEN */\n          best_choice->set_unichar_id(unichar_dash, i);\n          if (word_res->reject_map[i].rejected()) {\n            word_res->reject_map[i].setrej_hyphen_accept();\n          }\n        }\n        if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {\n          // Suspected HYPHEN\n          word_res->reject_map[i].setrej_hyphen();\n        }\n      } else if (best_choice->unichar_id(i) == unichar_dash) {\n        if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {\n          word_res->reject_map[i].setrej_hyphen_accept();\n        }\n        // Certain HYPHEN\n\n        if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {\n          // Suspected HYPHEN\n          word_res->reject_map[i].setrej_hyphen();\n        }\n      }\n    }\n    prev_right = out_box.right();\n  }\n}\n\n// Note: After running this function word_res->ratings\n// might not contain the right BLOB_CHOICE corresponding to each character\n// in word_res->best_choice.\nvoid Tesseract::flip_0O(WERD_RES *word_res) {\n  WERD_CHOICE *best_choice = word_res->best_choice;\n  TBOX out_box;\n\n  if (!tessedit_flip_0O) {\n    return;\n  }\n\n  auto num_blobs = word_res->rebuild_word->NumBlobs();\n  for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {\n    TBLOB *blob = word_res->rebuild_word->blobs[i];\n    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||\n        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {\n      out_box = blob->bounding_box();\n      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||\n          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {\n        return; // Beware words with sub/superscripts\n      }\n    }\n  }\n  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id(\"0\");\n  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id(\"O\");\n  if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||\n      unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {\n    return; // 0 or O are not present/enabled in unicharset\n  }\n  for (unsigned i = 1; i < best_choice->length(); ++i) {\n    if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {\n      /* A0A */\n      if ((i + 1) < best_choice->length() &&\n          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&\n          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {\n        best_choice->set_unichar_id(unichar_O, i);\n      }\n      /* A00A */\n      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&\n          (i + 1) < best_choice->length() &&\n          (best_choice->unichar_id(i + 1) == unichar_0 ||\n           best_choice->unichar_id(i + 1) == unichar_O) &&\n          (i + 2) < best_choice->length() &&\n          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {\n        best_choice->set_unichar_id(unichar_O, i);\n        i++;\n      }\n      /* AA0<non digit or end of word> */\n      if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&\n          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&\n          (((i + 1) < best_choice->length() &&\n            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&\n            !word_res->uch_set->eq(best_choice->unichar_id(i + 1), \"l\") &&\n            !word_res->uch_set->eq(best_choice->unichar_id(i + 1), \"I\")) ||\n           (i == best_choice->length() - 1))) {\n        best_choice->set_unichar_id(unichar_O, i);\n      }\n      /* 9O9 */\n      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&\n          (i + 1) < best_choice->length() &&\n          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {\n        best_choice->set_unichar_id(unichar_0, i);\n      }\n      /* 9OOO */\n      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&\n          (i + 2) < best_choice->length() &&\n          (best_choice->unichar_id(i + 1) == unichar_0 ||\n           best_choice->unichar_id(i + 1) == unichar_O) &&\n          (best_choice->unichar_id(i + 2) == unichar_0 ||\n           best_choice->unichar_id(i + 2) == unichar_O)) {\n        best_choice->set_unichar_id(unichar_0, i);\n        best_choice->set_unichar_id(unichar_0, i + 1);\n        best_choice->set_unichar_id(unichar_0, i + 2);\n        i += 2;\n      }\n      /* 9OO<non upper> */\n      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&\n          (i + 2) < best_choice->length() &&\n          (best_choice->unichar_id(i + 1) == unichar_0 ||\n           best_choice->unichar_id(i + 1) == unichar_O) &&\n          !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {\n        best_choice->set_unichar_id(unichar_0, i);\n        best_choice->set_unichar_id(unichar_0, i + 1);\n        i++;\n      }\n      /* 9O<non upper> */\n      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&\n          (i + 1) < best_choice->length() &&\n          !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {\n        best_choice->set_unichar_id(unichar_0, i);\n      }\n      /* 9[.,]OOO.. */\n      if ((i > 1) &&\n          (word_res->uch_set->eq(best_choice->unichar_id(i - 1), \".\") ||\n           word_res->uch_set->eq(best_choice->unichar_id(i - 1), \",\")) &&\n          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||\n           best_choice->unichar_id(i - 2) == unichar_O)) {\n        if (best_choice->unichar_id(i - 2) == unichar_O) {\n          best_choice->set_unichar_id(unichar_0, i - 2);\n        }\n        while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||\n                                             best_choice->unichar_id(i) == unichar_0)) {\n          best_choice->set_unichar_id(unichar_0, i);\n          i++;\n        }\n        i--;\n      }\n    }\n  }\n}\n\nbool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {\n  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, \"O\");\n}\n\nbool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {\n  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, \"0\");\n}\n} // namespace tesseract\n\n#endif // def DISABLED_LEGACY_ENGINE\n"
  },
  {
    "path": "src/ccmain/reject.h",
    "content": "/**********************************************************************\n * File:        reject.h\n * Description: Rejection functions used in tessedit\n * Author:      Phil Cheatle\n * Created:     Wed Sep 23 16:50:21 BST 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef REJECT_H\n#define REJECT_H\n\nnamespace tesseract {\n\nclass WERD_CHOICE;\nclass WERD_RES;\n\nvoid reject_blanks(WERD_RES *word);\nvoid reject_poor_matches(WERD_RES *word);\nfloat compute_reject_threshold(WERD_CHOICE *word);\nbool word_contains_non_1_digit(const char *word, const char *word_lengths);\nvoid dont_allow_1Il(WERD_RES *word);\nvoid flip_hyphens(WERD_RES *word);\nvoid flip_0O(WERD_RES *word);\nbool non_0_digit(const char *str, int length);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccmain/resultiterator.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        resultiterator.cpp\n// Description: Iterator for tesseract results that is capable of\n//              iterating in proper reading order over Bi Directional\n//              (e.g. mixed Hebrew and English) text.\n// Author:      David Eger\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <tesseract/resultiterator.h>\n\n#include \"helpers.h\"  // for copy_string\n#include \"pageres.h\"\n#include \"tesseractclass.h\"\n#include \"unicharset.h\"\n\n#include <allheaders.h>\n\n#include <set>\n#include <vector>\n\nstatic const char *const kLRM = \"\\u200E\"; // Left-to-Right Mark\nstatic const char *const kRLM = \"\\u200F\"; // Right-to-Left Mark\n\nnamespace tesseract {\n\nResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) {\n  in_minor_direction_ = false;\n  at_beginning_of_minor_run_ = false;\n  preserve_interword_spaces_ = false;\n\n  auto *p = ParamUtils::FindParam<BoolParam>(\n      \"preserve_interword_spaces\", GlobalParams()->bool_params, tesseract_->params()->bool_params);\n  if (p != nullptr) {\n    preserve_interword_spaces_ = (bool)(*p);\n  }\n\n  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();\n  MoveToLogicalStartOfTextline();\n}\n\nResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) {\n  return new ResultIterator(resit);\n}\n\nbool ResultIterator::ParagraphIsLtr() const {\n  return current_paragraph_is_ltr_;\n}\n\nbool ResultIterator::CurrentParagraphIsLtr() const {\n  if (!it_->word()) {\n    return true; // doesn't matter.\n  }\n  LTRResultIterator it(*this);\n  it.RestartParagraph();\n  // Try to figure out the ltr-ness of the paragraph.  The rules below\n  // make more sense in the context of a difficult paragraph example.\n  // Here we denote {ltr characters, RTL CHARACTERS}:\n  //\n  //   \"don't go in there!\" DAIS EH\n  //   EHT OTNI DEPMUJ FELSMIH NEHT DNA\n  //                  .GNIDLIUB GNINRUB\n  //\n  // On the first line, the left-most word is LTR and the rightmost word\n  // is RTL.  Thus, we are better off taking the majority direction for\n  // the whole paragraph contents.  So instead of \"the leftmost word is LTR\"\n  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs\n  // would not do:  Typically an RTL paragraph would *not* start with an LTR\n  // word.  So our heuristics are as follows:\n  //\n  // (1) If the first text line has an RTL word in the left-most position\n  //     it is RTL.\n  // (2) If the first text line has an LTR word in the right-most position\n  //     it is LTR.\n  // (3) If neither of the above is true, take the majority count for the\n  //     paragraph -- if there are more rtl words, it is RTL.  If there\n  //     are more LTR words, it's LTR.\n  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;\n  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;\n  int num_ltr, num_rtl;\n  num_rtl = leftmost_rtl ? 1 : 0;\n  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;\n  for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);\n       it.Next(RIL_WORD)) {\n    StrongScriptDirection dir = it.WordDirection();\n    rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);\n    num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;\n    num_ltr += rightmost_ltr ? 1 : 0;\n  }\n  if (leftmost_rtl) {\n    return false;\n  }\n  if (rightmost_ltr) {\n    return true;\n  }\n  // First line is ambiguous.  Take statistics on the whole paragraph.\n  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {\n    do {\n      StrongScriptDirection dir = it.WordDirection();\n      num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;\n      num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;\n    } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));\n  }\n  return num_ltr >= num_rtl;\n}\n\nconst int ResultIterator::kMinorRunStart = -1;\nconst int ResultIterator::kMinorRunEnd = -2;\nconst int ResultIterator::kComplexWord = -3;\n\nvoid ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {\n  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;\n  blob_indices->clear();\n  if (Empty(RIL_WORD)) {\n    return;\n  }\n  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {\n    // Easy! just return the blobs in order;\n    for (int i = 0; i < word_length_; i++) {\n      blob_indices->push_back(i);\n    }\n    return;\n  }\n\n  // The blobs are in left-to-right order, but the current reading context\n  // is right-to-left.\n  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;\n  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;\n  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;\n  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;\n  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;\n  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;\n  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;\n\n  // Step 1: Scan for and mark European Number sequences\n  //   [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*\n  std::vector<int> letter_types;\n  letter_types.reserve(word_length_);\n  for (int i = 0; i < word_length_; i++) {\n    letter_types.push_back(it_->word()->SymbolDirection(i));\n  }\n  // Convert a single separator sandwiched between two ENs into an EN.\n  for (int i = 0; i + 2 < word_length_; i++) {\n    if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&\n        (letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) {\n      letter_types[i + 1] = U_EURO_NUM;\n    }\n  }\n  // Scan for sequences of European Number Terminators around ENs and convert\n  // them to ENs.\n  for (int i = 0; i < word_length_; i++) {\n    if (letter_types[i] == U_EURO_NUM_TERM) {\n      int j = i + 1;\n      while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {\n        j++;\n      }\n      if (j < word_length_ && letter_types[j] == U_EURO_NUM) {\n        // The sequence [i..j] should be converted to all European Numbers.\n        for (int k = i; k < j; k++) {\n          letter_types[k] = U_EURO_NUM;\n        }\n      }\n      j = i - 1;\n      while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {\n        j--;\n      }\n      if (j > -1 && letter_types[j] == U_EURO_NUM) {\n        // The sequence [j..i] should be converted to all European Numbers.\n        for (int k = j; k <= i; k++) {\n          letter_types[k] = U_EURO_NUM;\n        }\n      }\n    }\n  }\n  // Step 2: Convert all remaining types to either L or R.\n  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.\n  // All other are R.\n  for (int i = 0; i < word_length_;) {\n    int ti = letter_types[i];\n    if (ti == U_LTR || ti == U_EURO_NUM) {\n      // Left to right sequence; scan to the end of it.\n      int last_good = i;\n      for (int j = i + 1; j < word_length_; j++) {\n        int tj = letter_types[j];\n        if (tj == U_LTR || tj == U_EURO_NUM) {\n          last_good = j;\n        } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {\n          // do nothing.\n        } else {\n          break;\n        }\n      }\n      // [i..last_good] is the L sequence\n      for (int k = i; k <= last_good; k++) {\n        letter_types[k] = U_LTR;\n      }\n      i = last_good + 1;\n    } else {\n      letter_types[i] = U_RTL;\n      i++;\n    }\n  }\n\n  // At this point, letter_types is entirely U_LTR or U_RTL.\n  for (int i = word_length_ - 1; i >= 0;) {\n    if (letter_types[i] == U_RTL) {\n      blob_indices->push_back(i);\n      i--;\n    } else {\n      // left to right sequence.  scan to the beginning.\n      int j = i - 1;\n      for (; j >= 0 && letter_types[j] != U_RTL; j--) {\n      } // pass\n      // Now (j, i] is LTR\n      for (int k = j + 1; k <= i; k++) {\n        blob_indices->push_back(k);\n      }\n      i = j;\n    }\n  }\n  ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_));\n}\n\nstatic void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {\n  for (auto dir : dirs) {\n    switch (dir) {\n      case DIR_NEUTRAL:\n        tprintf(\"N \");\n        break;\n      case DIR_LEFT_TO_RIGHT:\n        tprintf(\"L \");\n        break;\n      case DIR_RIGHT_TO_LEFT:\n        tprintf(\"R \");\n        break;\n      case DIR_MIX:\n        tprintf(\"Z \");\n        break;\n      default:\n        tprintf(\"? \");\n        break;\n    }\n  }\n  tprintf(\"\\n\");\n}\n\nvoid ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,\n                                            std::vector<int> *word_indices) const {\n  std::vector<StrongScriptDirection> directions;\n  CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);\n}\n\nvoid ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,\n                                            std::vector<StrongScriptDirection> *dirs_arg,\n                                            std::vector<int> *word_indices) const {\n  std::vector<StrongScriptDirection> dirs;\n  std::vector<StrongScriptDirection> *directions;\n  directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;\n  directions->clear();\n\n  // A LTRResultIterator goes strictly left-to-right word order.\n  LTRResultIterator ltr_it(resit);\n  ltr_it.RestartRow();\n  if (ltr_it.Empty(RIL_WORD)) {\n    return;\n  }\n  do {\n    directions->push_back(ltr_it.WordDirection());\n  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));\n\n  word_indices->clear();\n  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);\n}\n\nvoid ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,\n                                            const std::vector<StrongScriptDirection> &word_dirs,\n                                            std::vector<int> *reading_order) {\n  reading_order->clear();\n  if (word_dirs.empty()) {\n    return;\n  }\n\n  // Take all of the runs of minor direction words and insert them\n  // in reverse order.\n  int minor_direction, major_direction, major_step, start, end;\n  if (paragraph_is_ltr) {\n    start = 0;\n    end = word_dirs.size();\n    major_step = 1;\n    major_direction = DIR_LEFT_TO_RIGHT;\n    minor_direction = DIR_RIGHT_TO_LEFT;\n  } else {\n    start = word_dirs.size() - 1;\n    end = -1;\n    major_step = -1;\n    major_direction = DIR_RIGHT_TO_LEFT;\n    minor_direction = DIR_LEFT_TO_RIGHT;\n    // Special rule: if there are neutral words at the right most side\n    //   of a line adjacent to a left-to-right word in the middle of the\n    //   line, we interpret the end of the line as a single LTR sequence.\n    if (word_dirs[start] == DIR_NEUTRAL) {\n      int neutral_end = start;\n      while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {\n        neutral_end--;\n      }\n      if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {\n        // LTR followed by neutrals.\n        // Scan for the beginning of the minor left-to-right run.\n        int left = neutral_end;\n        for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {\n          if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {\n            left = i;\n          }\n        }\n        reading_order->push_back(kMinorRunStart);\n        for (unsigned i = left; i < word_dirs.size(); i++) {\n          reading_order->push_back(i);\n          if (word_dirs[i] == DIR_MIX) {\n            reading_order->push_back(kComplexWord);\n          }\n        }\n        reading_order->push_back(kMinorRunEnd);\n        start = left - 1;\n      }\n    }\n  }\n  for (int i = start; i != end;) {\n    if (word_dirs[i] == minor_direction) {\n      int j = i;\n      while (j != end && word_dirs[j] != major_direction) {\n        j += major_step;\n      }\n      if (j == end) {\n        j -= major_step;\n      }\n      while (j != i && word_dirs[j] != minor_direction) {\n        j -= major_step;\n      }\n      //  [j..i] is a minor direction run.\n      reading_order->push_back(kMinorRunStart);\n      for (int k = j; k != i; k -= major_step) {\n        reading_order->push_back(k);\n      }\n      reading_order->push_back(i);\n      reading_order->push_back(kMinorRunEnd);\n      i = j + major_step;\n    } else {\n      reading_order->push_back(i);\n      if (word_dirs[i] == DIR_MIX) {\n        reading_order->push_back(kComplexWord);\n      }\n      i += major_step;\n    }\n  }\n}\n\nint ResultIterator::LTRWordIndex() const {\n  int this_word_index = 0;\n  LTRResultIterator textline(*this);\n  textline.RestartRow();\n  while (!textline.PositionedAtSameWord(it_)) {\n    this_word_index++;\n    textline.Next(RIL_WORD);\n  }\n  return this_word_index;\n}\n\nvoid ResultIterator::MoveToLogicalStartOfWord() {\n  if (word_length_ == 0) {\n    BeginWord(0);\n    return;\n  }\n  std::vector<int> blob_order;\n  CalculateBlobOrder(&blob_order);\n  if (blob_order.empty() || blob_order[0] == 0) {\n    return;\n  }\n  BeginWord(blob_order[0]);\n}\n\nbool ResultIterator::IsAtFinalSymbolOfWord() const {\n  if (!it_->word()) {\n    return true;\n  }\n  std::vector<int> blob_order;\n  CalculateBlobOrder(&blob_order);\n  return blob_order.empty() || blob_order.back() == blob_index_;\n}\n\nbool ResultIterator::IsAtFirstSymbolOfWord() const {\n  if (!it_->word()) {\n    return true;\n  }\n  std::vector<int> blob_order;\n  CalculateBlobOrder(&blob_order);\n  return blob_order.empty() || blob_order[0] == blob_index_;\n}\n\nvoid ResultIterator::AppendSuffixMarks(std::string *text) const {\n  if (!it_->word()) {\n    return;\n  }\n  bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;\n  // scan forward to see what meta-information the word ordering algorithm\n  // left us.\n  // If this word is at the  *end* of a minor run, insert the other\n  // direction's mark;  else if this was a complex word, insert the\n  // current reading order's mark.\n  std::vector<int> textline_order;\n  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);\n  int this_word_index = LTRWordIndex();\n  size_t i = 0;\n  for (const auto word_index : textline_order) {\n    if (word_index == this_word_index) {\n      break;\n    }\n    i++;\n  }\n  if (i == textline_order.size()) {\n    return;\n  }\n\n  int last_non_word_mark = 0;\n  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {\n    last_non_word_mark = textline_order[i];\n  }\n  if (last_non_word_mark == kComplexWord) {\n    *text += reading_direction_is_ltr ? kLRM : kRLM;\n  } else if (last_non_word_mark == kMinorRunEnd) {\n    if (current_paragraph_is_ltr_) {\n      *text += kLRM;\n    } else {\n      *text += kRLM;\n    }\n  }\n}\n\nvoid ResultIterator::MoveToLogicalStartOfTextline() {\n  std::vector<int> word_indices;\n  RestartRow();\n  CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this),\n                         &word_indices);\n  unsigned i = 0;\n  for (; i < word_indices.size() && word_indices[i] < 0; i++) {\n    if (word_indices[i] == kMinorRunStart) {\n      in_minor_direction_ = true;\n    } else if (word_indices[i] == kMinorRunEnd) {\n      in_minor_direction_ = false;\n    }\n  }\n  if (in_minor_direction_) {\n    at_beginning_of_minor_run_ = true;\n  }\n  if (i >= word_indices.size()) {\n    return;\n  }\n  int first_word_index = word_indices[i];\n  for (int j = 0; j < first_word_index; j++) {\n    PageIterator::Next(RIL_WORD);\n  }\n  MoveToLogicalStartOfWord();\n}\n\nvoid ResultIterator::Begin() {\n  LTRResultIterator::Begin();\n  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();\n  in_minor_direction_ = false;\n  at_beginning_of_minor_run_ = false;\n  MoveToLogicalStartOfTextline();\n}\n\nbool ResultIterator::Next(PageIteratorLevel level) {\n  if (it_->block() == nullptr) {\n    return false; // already at end!\n  }\n  switch (level) {\n    case RIL_BLOCK: // explicit fall-through\n    case RIL_PARA:  // explicit fall-through\n    case RIL_TEXTLINE:\n      if (!PageIterator::Next(level)) {\n        return false;\n      }\n      if (IsWithinFirstTextlineOfParagraph()) {\n        // if we've advanced to a new paragraph,\n        // recalculate current_paragraph_is_ltr_\n        current_paragraph_is_ltr_ = CurrentParagraphIsLtr();\n      }\n      in_minor_direction_ = false;\n      MoveToLogicalStartOfTextline();\n      return it_->block() != nullptr;\n    case RIL_SYMBOL: {\n      std::vector<int> blob_order;\n      CalculateBlobOrder(&blob_order);\n      unsigned next_blob = 0;\n      while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {\n        next_blob++;\n      }\n      next_blob++;\n      if (next_blob < blob_order.size()) {\n        // we're in the same word; simply advance one blob.\n        BeginWord(blob_order[next_blob]);\n        at_beginning_of_minor_run_ = false;\n        return true;\n      }\n      level = RIL_WORD; // we've fallen through to the next word.\n    }\n      // Fall through.\n    case RIL_WORD: // explicit fall-through.\n    {\n      if (it_->word() == nullptr) {\n        return Next(RIL_BLOCK);\n      }\n      std::vector<int> word_indices;\n      int this_word_index = LTRWordIndex();\n      CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);\n      int final_real_index = word_indices.size() - 1;\n      while (final_real_index > 0 && word_indices[final_real_index] < 0) {\n        final_real_index--;\n      }\n      for (int i = 0; i < final_real_index; i++) {\n        if (word_indices[i] == this_word_index) {\n          int j = i + 1;\n          for (; j < final_real_index && word_indices[j] < 0; j++) {\n            if (word_indices[j] == kMinorRunStart) {\n              in_minor_direction_ = true;\n            }\n            if (word_indices[j] == kMinorRunEnd) {\n              in_minor_direction_ = false;\n            }\n          }\n          at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);\n          // awesome, we move to word_indices[j]\n          if (BidiDebug(3)) {\n            tprintf(\"Next(RIL_WORD): %d -> %d\\n\", this_word_index, word_indices[j]);\n          }\n          PageIterator::RestartRow();\n          for (int k = 0; k < word_indices[j]; k++) {\n            PageIterator::Next(RIL_WORD);\n          }\n          MoveToLogicalStartOfWord();\n          return true;\n        }\n      }\n      if (BidiDebug(3)) {\n        tprintf(\"Next(RIL_WORD): %d -> EOL\\n\", this_word_index);\n      }\n      // we're going off the end of the text line.\n      return Next(RIL_TEXTLINE);\n    }\n  }\n  ASSERT_HOST(false); // shouldn't happen.\n  return false;\n}\n\nbool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {\n  if (it_->block() == nullptr) {\n    return false; // Already at the end!\n  }\n  if (it_->word() == nullptr) {\n    return true; // In an image block.\n  }\n  if (level == RIL_SYMBOL) {\n    return true; // Always at beginning of a symbol.\n  }\n\n  bool at_word_start = IsAtFirstSymbolOfWord();\n  if (level == RIL_WORD) {\n    return at_word_start;\n  }\n\n  ResultIterator line_start(*this);\n  // move to the first word in the line...\n  line_start.MoveToLogicalStartOfTextline();\n\n  bool at_textline_start = at_word_start && *line_start.it_ == *it_;\n  if (level == RIL_TEXTLINE) {\n    return at_textline_start;\n  }\n\n  // now we move to the left-most word...\n  line_start.RestartRow();\n  bool at_block_start =\n      at_textline_start && line_start.it_->block() != line_start.it_->prev_block();\n  if (level == RIL_BLOCK) {\n    return at_block_start;\n  }\n\n  bool at_para_start =\n      at_block_start || (at_textline_start && line_start.it_->row()->row->para() !=\n                                                  line_start.it_->prev_row()->row->para());\n  if (level == RIL_PARA) {\n    return at_para_start;\n  }\n\n  ASSERT_HOST(false); // shouldn't happen.\n  return false;\n}\n\n/**\n * NOTE! 
This is an exact copy of PageIterator::IsAtFinalElement with the\n *   change that the variable next is now a ResultIterator instead of a\n *   PageIterator.\n */\nbool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {\n  if (Empty(element)) {\n    return true; // Already at the end!\n  }\n  // The result is true if we step forward by element and find we are\n  // at the end of the page or at beginning of *all* levels in:\n  // [level, element).\n  // When there is more than one level difference between element and level,\n  // we could for instance move forward one symbol and still be at the first\n  // word on a line, so we also have to be at the first symbol in a word.\n  ResultIterator next(*this);\n  next.Next(element);\n  if (next.Empty(element)) {\n    return true; // Reached the end of the page.\n  }\n  while (element > level) {\n    element = static_cast<PageIteratorLevel>(element - 1);\n    if (!next.IsAtBeginningOf(element)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Returns the number of blanks before the current word.\nint ResultIterator::BlanksBeforeWord() const {\n  if (CurrentParagraphIsLtr()) {\n    return LTRResultIterator::BlanksBeforeWord();\n  }\n  return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;\n}\n\n/**\n * Returns the null terminated UTF-8 encoded text string for the current\n * object at the given level. 
Use delete [] to free after use.\n */\nchar *ResultIterator::GetUTF8Text(PageIteratorLevel level) const {\n  if (it_->word() == nullptr) {\n    return nullptr; // Already at the end!\n  }\n  std::string text;\n  switch (level) {\n    case RIL_BLOCK: {\n      ResultIterator pp(*this);\n      do {\n        pp.AppendUTF8ParagraphText(&text);\n      } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());\n    } break;\n    case RIL_PARA:\n      AppendUTF8ParagraphText(&text);\n      break;\n    case RIL_TEXTLINE: {\n      ResultIterator it(*this);\n      it.MoveToLogicalStartOfTextline();\n      it.IterateAndAppendUTF8TextlineText(&text);\n    } break;\n    case RIL_WORD:\n      AppendUTF8WordText(&text);\n      break;\n    case RIL_SYMBOL: {\n      bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;\n      if (at_beginning_of_minor_run_) {\n        text += reading_direction_is_ltr ? kLRM : kRLM;\n      }\n      text = it_->word()->BestUTF8(blob_index_, false);\n      if (IsAtFinalSymbolOfWord()) {\n        AppendSuffixMarks(&text);\n      }\n    } break;\n  }\n  return copy_string(text);\n}\nstd::vector<std::vector<std::vector<std::pair<const char *, float>>>>\n    *ResultIterator::GetRawLSTMTimesteps() const {\n  if (it_->word() != nullptr) {\n    return &it_->word()->segmented_timesteps;\n  } else {\n    return nullptr;\n  }\n}\n\nstd::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()\n    const {\n  if (it_->word() != nullptr) {\n    return &it_->word()->CTC_symbol_choices;\n  } else {\n    return nullptr;\n  }\n}\n\nvoid ResultIterator::AppendUTF8WordText(std::string *text) const {\n  if (!it_->word()) {\n    return;\n  }\n  ASSERT_HOST(it_->word()->best_choice != nullptr);\n  bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;\n  if (at_beginning_of_minor_run_) {\n    *text += reading_direction_is_ltr ? 
kLRM : kRLM;\n  }\n\n  std::vector<int> blob_order;\n  CalculateBlobOrder(&blob_order);\n  for (int i : blob_order) {\n    *text += it_->word()->BestUTF8(i, false);\n  }\n  AppendSuffixMarks(text);\n}\n\nvoid ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {\n  if (Empty(RIL_WORD)) {\n    Next(RIL_WORD);\n    return;\n  }\n  if (BidiDebug(1)) {\n    std::vector<int> textline_order;\n    std::vector<StrongScriptDirection> dirs;\n    CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order);\n    tprintf(\"Strong Script dirs     [%p/P=%s]: \",\n            static_cast<void *>(it_->row()),\n            current_paragraph_is_ltr_ ? \"ltr\" : \"rtl\");\n    PrintScriptDirs(dirs);\n    tprintf(\"Logical textline order [%p/P=%s]: \",\n            static_cast<void *>(it_->row()),\n            current_paragraph_is_ltr_ ? \"ltr\" : \"rtl\");\n    for (int i : textline_order) {\n      tprintf(\"%d \", i);\n    }\n    tprintf(\"\\n\");\n  }\n\n  int words_appended = 0;\n  do {\n    int numSpaces = preserve_interword_spaces_ ? 
it_->word()->word->space() : (words_appended > 0);\n    for (int i = 0; i < numSpaces; ++i) {\n      *text += \" \";\n    }\n    AppendUTF8WordText(text);\n    words_appended++;\n    if (BidiDebug(2)) {\n      tprintf(\"Num spaces=%d, text=%s\\n\", numSpaces, text->c_str());\n    }\n  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));\n  if (BidiDebug(1)) {\n    tprintf(\"%d words printed\\n\", words_appended);\n  }\n  *text += line_separator_;\n  // If we just finished a paragraph, add an extra newline.\n  if (IsAtBeginningOf(RIL_PARA)) {\n    *text += paragraph_separator_;\n  }\n}\n\nvoid ResultIterator::AppendUTF8ParagraphText(std::string *text) const {\n  ResultIterator it(*this);\n  it.RestartParagraph();\n  it.MoveToLogicalStartOfTextline();\n  if (it.Empty(RIL_WORD)) {\n    return;\n  }\n  do {\n    it.IterateAndAppendUTF8TextlineText(text);\n  } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));\n}\n\nbool ResultIterator::BidiDebug(int min_level) const {\n  int debug_level = 1;\n  auto *p = ParamUtils::FindParam<IntParam>(\"bidi_debug\", GlobalParams()->int_params,\n                                            tesseract_->params()->int_params);\n  if (p != nullptr) {\n    debug_level = (int32_t)(*p);\n  }\n  return debug_level >= min_level;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/superscript.cpp",
    "content": "/******************************************************************\n * File:        superscript.cpp\n * Description: Correction pass to fix superscripts and subscripts.\n * Author:      David Eger\n *\n * (C) Copyright 2012, Google, Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"normalis.h\"\n#include \"tesseractclass.h\"\n\nnamespace tesseract {\n\nstatic int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {\n  int num_chopped = 0;\n  for (int i = 0; i < num_unichars; i++) {\n    num_chopped += word->best_state[i];\n  }\n  return num_chopped;\n}\n\nstatic int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {\n  int num_chopped = 0;\n  for (int i = 0; i < num_unichars; i++) {\n    num_chopped += word->best_state[word->best_state.size() - 1 - i];\n  }\n  return num_chopped;\n}\n\n/**\n * Given a recognized blob, see if a contiguous collection of sub-pieces\n * (chopped blobs) starting at its left might qualify as being a subscript\n * or superscript letter based only on y position.  
Also do this for the\n * right side.\n */\nstatic void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom,\n                           int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers,\n                           ScriptPos *trailing_pos, int *num_trailing_outliers) {\n  ScriptPos sp_unused1, sp_unused2;\n  int unused1, unused2;\n  if (!leading_pos) {\n    leading_pos = &sp_unused1;\n  }\n  if (!num_leading_outliers) {\n    num_leading_outliers = &unused1;\n  }\n  if (!trailing_pos) {\n    trailing_pos = &sp_unused2;\n  }\n  if (!num_trailing_outliers) {\n    num_trailing_outliers = &unused2;\n  }\n\n  *num_leading_outliers = *num_trailing_outliers = 0;\n  *leading_pos = *trailing_pos = SP_NORMAL;\n\n  int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);\n  int num_chopped_pieces = word->best_state[rebuilt_blob_index];\n  ScriptPos last_pos = SP_NORMAL;\n  int trailing_outliers = 0;\n  for (int i = 0; i < num_chopped_pieces; i++) {\n    TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();\n    ScriptPos pos = SP_NORMAL;\n    if (box.bottom() >= super_y_bottom) {\n      pos = SP_SUPERSCRIPT;\n    } else if (box.top() <= sub_y_top) {\n      pos = SP_SUBSCRIPT;\n    }\n    if (pos == SP_NORMAL) {\n      if (trailing_outliers == i) {\n        *num_leading_outliers = trailing_outliers;\n        *leading_pos = last_pos;\n      }\n      trailing_outliers = 0;\n    } else {\n      if (pos == last_pos) {\n        trailing_outliers++;\n      } else {\n        trailing_outliers = 1;\n      }\n    }\n    last_pos = pos;\n  }\n  *num_trailing_outliers = trailing_outliers;\n  *trailing_pos = last_pos;\n}\n\n/**\n * Attempt to split off any high (or low) bits at the ends of the word with poor\n * certainty and recognize them separately.  
If the certainty gets much better\n * and other sanity checks pass, accept.\n *\n * This superscript fix is meant to be called in the second pass of recognition\n * when we have tried once and already have a preliminary answer for word.\n *\n * @return Whether we modified the given word.\n */\nbool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {\n  if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {\n    return false;\n  }\n  int num_leading, num_trailing;\n  ScriptPos sp_leading, sp_trailing;\n  float leading_certainty, trailing_certainty;\n  float avg_certainty, unlikely_threshold;\n\n  // Calculate the number of whole suspicious characters at the edges.\n  GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,\n                                 &sp_trailing, &trailing_certainty, &avg_certainty,\n                                 &unlikely_threshold);\n\n  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? \"sub\" : \"super\";\n  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? 
\"sub\" : \"super\";\n\n  int num_blobs = word->best_choice->length();\n\n  // Calculate the remainder (partial characters) at the edges.\n  // This accounts for us having classified the best version of\n  // a word as [speaker?'] when it was instead [speaker.^{21}]\n  // (that is we accidentally thought the 2 was attached to the period).\n  int num_remainder_leading = 0, num_remainder_trailing = 0;\n  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {\n    int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;\n    int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;\n    int last_word_char = num_blobs - 1 - num_trailing;\n    float last_char_certainty = word->best_choice->certainty(last_word_char);\n    if (word->best_choice->unichar_id(last_word_char) != 0 &&\n        last_char_certainty <= unlikely_threshold) {\n      ScriptPos rpos;\n      YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,\n                     &num_remainder_trailing);\n      if (num_trailing > 0 && rpos != sp_trailing) {\n        num_remainder_trailing = 0;\n      }\n      if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {\n        trailing_certainty = last_char_certainty;\n      }\n    }\n    bool another_blob_available =\n        (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;\n    int first_char_certainty = word->best_choice->certainty(num_leading);\n    if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&\n        first_char_certainty <= unlikely_threshold) {\n      ScriptPos lpos;\n      YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,\n                     nullptr, nullptr);\n      if (num_leading > 0 && lpos != sp_leading) {\n        num_remainder_leading = 0;\n      }\n      if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {\n   
     leading_certainty = first_char_certainty;\n      }\n    }\n  }\n\n  // If nothing to do, bail now.\n  if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {\n    return false;\n  }\n\n  if (superscript_debug >= 1) {\n    tprintf(\"Candidate for superscript detection: %s (\",\n            word->best_choice->unichar_string().c_str());\n    if (num_leading || num_remainder_leading) {\n      tprintf(\"%d.%d %s-leading \", num_leading, num_remainder_leading, leading_pos);\n    }\n    if (num_trailing || num_remainder_trailing) {\n      tprintf(\"%d.%d %s-trailing \", num_trailing, num_remainder_trailing, trailing_pos);\n    }\n    tprintf(\")\\n\");\n  }\n  if (superscript_debug >= 3) {\n    word->best_choice->print();\n  }\n  if (superscript_debug >= 2) {\n    tprintf(\" Certainties -- Average: %.2f  Unlikely thresh: %.2f  \", avg_certainty,\n            unlikely_threshold);\n    if (num_leading) {\n      tprintf(\"Orig. leading (min): %.2f  \", leading_certainty);\n    }\n    if (num_trailing) {\n      tprintf(\"Orig. 
trailing (min): %.2f  \", trailing_certainty);\n    }\n    tprintf(\"\\n\");\n  }\n\n  // We've now calculated the number of rebuilt blobs we want to carve off.\n  // However, split_word() works from TBLOBs in chopped_word, so we need to\n  // convert to those.\n  int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;\n  int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;\n\n  int retry_leading = 0;\n  int retry_trailing = 0;\n  bool is_good = false;\n  WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,\n                                           num_chopped_trailing, trailing_certainty, sp_trailing,\n                                           word, &is_good, &retry_leading, &retry_trailing);\n  if (is_good) {\n    word->ConsumeWordResults(revised);\n  } else if (retry_leading || retry_trailing) {\n    int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);\n    int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);\n    WERD_RES *revised2 = TrySuperscriptSplits(\n        retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,\n        trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);\n    if (is_good) {\n      word->ConsumeWordResults(revised2);\n    }\n    delete revised2;\n  }\n  delete revised;\n  return is_good;\n}\n\n/**\n * Determine how many characters (rebuilt blobs) on each end of a given word\n * might plausibly be superscripts so SubAndSuperscriptFix can try to\n * re-recognize them.  Even if we find no whole blobs at either end,\n * we will set *unlikely_threshold to a certainty that might be used to\n * select \"bad enough\" outlier characters.  
If *unlikely_threshold is set to 0,\n * though, there's really no hope.\n *\n * @param[in]  word    The word to examine.\n * @param[out] num_rebuilt_leading   the number of rebuilt blobs at the start\n *                                   of the word which are all up or down and\n *                                   seem badly classified.\n * @param[out] leading_pos        \"super\" or \"sub\" (for debugging)\n * @param[out] leading_certainty  the worst certainty in the leading blobs.\n * @param[out] num_rebuilt_trailing   the number of rebuilt blobs at the end\n *                                    of the word which are all up or down and\n *                                    seem badly classified.\n * @param[out] trailing_pos        \"super\" or \"sub\" (for debugging)\n * @param[out] trailing_certainty  the worst certainty in the trailing blobs.\n * @param[out] avg_certainty       the average certainty of \"normal\" blobs in\n *                                 the word.\n * @param[out] unlikely_threshold  the threshold (on certainty) we used to\n *                                 select \"bad enough\" outlier characters.\n */\nvoid Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,\n                                               ScriptPos *leading_pos, float *leading_certainty,\n                                               int *num_rebuilt_trailing, ScriptPos *trailing_pos,\n                                               float *trailing_certainty, float *avg_certainty,\n                                               float *unlikely_threshold) {\n  *avg_certainty = *unlikely_threshold = 0.0f;\n  *num_rebuilt_leading = *num_rebuilt_trailing = 0;\n  *leading_certainty = *trailing_certainty = 0.0f;\n\n  int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;\n  int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;\n\n  // Step one: Get an average certainty for \"normally placed\" 
characters.\n\n  // Counts here are of blobs in the rebuild_word / unichars in best_choice.\n  *leading_pos = *trailing_pos = SP_NORMAL;\n  int leading_outliers = 0;\n  int trailing_outliers = 0;\n  int num_normal = 0;\n  float normal_certainty_total = 0.0f;\n  float worst_normal_certainty = 0.0f;\n  ScriptPos last_pos = SP_NORMAL;\n  int num_blobs = word->rebuild_word->NumBlobs();\n  for (int b = 0; b < num_blobs; ++b) {\n    TBOX box = word->rebuild_word->blobs[b]->bounding_box();\n    ScriptPos pos = SP_NORMAL;\n    if (box.bottom() >= super_y_bottom) {\n      pos = SP_SUPERSCRIPT;\n    } else if (box.top() <= sub_y_top) {\n      pos = SP_SUBSCRIPT;\n    }\n    if (pos == SP_NORMAL) {\n      if (word->best_choice->unichar_id(b) != 0) {\n        float char_certainty = word->best_choice->certainty(b);\n        if (char_certainty < worst_normal_certainty) {\n          worst_normal_certainty = char_certainty;\n        }\n        num_normal++;\n        normal_certainty_total += char_certainty;\n      }\n      if (trailing_outliers == b) {\n        leading_outliers = trailing_outliers;\n        *leading_pos = last_pos;\n      }\n      trailing_outliers = 0;\n    } else {\n      if (last_pos == pos) {\n        trailing_outliers++;\n      } else {\n        trailing_outliers = 1;\n      }\n    }\n    last_pos = pos;\n  }\n  *trailing_pos = last_pos;\n  if (num_normal >= 3) { // throw out the worst as an outlier.\n    num_normal--;\n    normal_certainty_total -= worst_normal_certainty;\n  }\n  if (num_normal > 0) {\n    *avg_certainty = normal_certainty_total / num_normal;\n    *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);\n  }\n  if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {\n    return;\n  }\n\n  // Step two: Try to split off bits of the word that are both outliers\n  //           and have much lower certainty than average\n  // Calculate num_leading and leading_certainty.\n  for (*leading_certainty = 0.0f, 
*num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;\n       (*num_rebuilt_leading)++) {\n    float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);\n    if (char_certainty > *unlikely_threshold) {\n      break;\n    }\n    if (char_certainty < *leading_certainty) {\n      *leading_certainty = char_certainty;\n    }\n  }\n\n  // Calculate num_trailing and trailing_certainty.\n  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;\n       *num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {\n    int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;\n    float char_certainty = word->best_choice->certainty(blob_idx);\n    if (char_certainty > *unlikely_threshold) {\n      break;\n    }\n    if (char_certainty < *trailing_certainty) {\n      *trailing_certainty = char_certainty;\n    }\n  }\n}\n\n/**\n * Try splitting off the given number of (chopped) blobs from the front and\n * back of the given word and recognizing the pieces.\n *\n * @param[in]  num_chopped_leading   how many chopped blobs from the left\n *                    end of the word to chop off and try recognizing as a\n *                    superscript (or subscript)\n * @param[in]  leading_certainty     the (minimum) certainty had by the\n *                    characters in the original leading section.\n * @param[in]  leading_pos    \"super\" or \"sub\" (for debugging)\n * @param[in]  num_chopped_trailing  how many chopped blobs from the right\n *                    end of the word to chop off and try recognizing as a\n *                    superscript (or subscript)\n * @param[in]  trailing_certainty    the (minimum) certainty had by the\n *                    characters in the original trailing section.\n * @param[in]  trailing_pos      \"super\" or \"sub\" (for debugging)\n * @param[in]  word              the word to try to chop up.\n * @param[out] is_good           do we believe our result?\n * @param[out] retry_rebuild_leading, 
retry_rebuild_trailing\n *         If non-zero, and !is_good, then the caller may have luck trying\n *         to split the returned word with this number of (rebuilt) leading\n *         and trailing blobs / unichars.\n * @return A word which is the result of re-recognizing as asked.\n */\nWERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,\n                                          ScriptPos leading_pos, int num_chopped_trailing,\n                                          float trailing_certainty, ScriptPos trailing_pos,\n                                          WERD_RES *word, bool *is_good, int *retry_rebuild_leading,\n                                          int *retry_rebuild_trailing) {\n  int num_chopped = word->chopped_word->NumBlobs();\n\n  *retry_rebuild_leading = *retry_rebuild_trailing = 0;\n\n  // Chop apart the word into up to three pieces.\n\n  BlamerBundle *bb0 = nullptr;\n  BlamerBundle *bb1 = nullptr;\n  WERD_RES *prefix = nullptr;\n  WERD_RES *core = nullptr;\n  WERD_RES *suffix = nullptr;\n  if (num_chopped_leading > 0) {\n    prefix = new WERD_RES(*word);\n    split_word(prefix, num_chopped_leading, &core, &bb0);\n  } else {\n    core = new WERD_RES(*word);\n  }\n\n  if (num_chopped_trailing > 0) {\n    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;\n    split_word(core, split_pt, &suffix, &bb1);\n  }\n\n  //  Recognize the pieces in turn.\n  int saved_cp_multiplier = classify_class_pruner_multiplier;\n  int saved_im_multiplier = classify_integer_matcher_multiplier;\n  if (prefix) {\n    // Turn off Tesseract's y-position penalties for the leading superscript.\n    classify_class_pruner_multiplier.set_value(0);\n    classify_integer_matcher_multiplier.set_value(0);\n\n    // Adjust our expectations about the baseline for this prefix.\n    if (superscript_debug >= 3) {\n      tprintf(\" recognizing first %d chopped blobs\\n\", num_chopped_leading);\n    }\n    
recog_word_recursive(prefix);\n    if (superscript_debug >= 2) {\n      tprintf(\" The leading bits look like %s %s\\n\", ScriptPosToString(leading_pos),\n              prefix->best_choice->unichar_string().c_str());\n    }\n\n    // Restore the normal y-position penalties.\n    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);\n    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);\n  }\n\n  if (superscript_debug >= 3) {\n    tprintf(\" recognizing middle %d chopped blobs\\n\",\n            num_chopped - num_chopped_leading - num_chopped_trailing);\n  }\n\n  if (suffix) {\n    // Turn off Tesseract's y-position penalties for the trailing superscript.\n    classify_class_pruner_multiplier.set_value(0);\n    classify_integer_matcher_multiplier.set_value(0);\n\n    if (superscript_debug >= 3) {\n      tprintf(\" recognizing last %d chopped blobs\\n\", num_chopped_trailing);\n    }\n    recog_word_recursive(suffix);\n    if (superscript_debug >= 2) {\n      tprintf(\" The trailing bits look like %s %s\\n\", ScriptPosToString(trailing_pos),\n              suffix->best_choice->unichar_string().c_str());\n    }\n\n    // Restore the normal y-position penalties.\n    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);\n    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);\n  }\n\n  // Evaluate whether we think the results are believably better\n  // than what we already had.\n  bool good_prefix =\n      !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,\n                                       superscript_bettered_certainty * leading_certainty,\n                                       retry_rebuild_leading, nullptr);\n  bool good_suffix =\n      !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,\n                                       superscript_bettered_certainty * trailing_certainty, nullptr,\n                                       retry_rebuild_trailing);\n\n  *is_good = 
good_prefix && good_suffix;\n  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {\n    // None of it is any good. Quit now.\n    delete core;\n    delete prefix;\n    delete suffix;\n    delete bb1;\n    return nullptr;\n  }\n  recog_word_recursive(core);\n\n  // Now paste the results together into core.\n  if (suffix) {\n    suffix->SetAllScriptPositions(trailing_pos);\n    join_words(core, suffix, bb1);\n  }\n  if (prefix) {\n    prefix->SetAllScriptPositions(leading_pos);\n    join_words(prefix, core, bb0);\n    core = prefix;\n    prefix = nullptr;\n  }\n\n  if (superscript_debug >= 1) {\n    tprintf(\"%s superscript fix: %s\\n\", *is_good ? \"ACCEPT\" : \"REJECT\",\n            core->best_choice->unichar_string().c_str());\n  }\n  return core;\n}\n\n/**\n * Return whether this is believable superscript or subscript text.\n *\n * We insist that:\n *   + there are no punctuation marks.\n *   + there are no italics.\n *   + no normal-sized character is smaller than superscript_scaledown_ratio\n *     of what it ought to be, and\n *   + each character is at least as certain as certainty_threshold.\n *\n *  @param[in]  debug  If true, spew debug output\n *  @param[in]  word   The word whose best_choice we're evaluating\n *  @param[in]  certainty_threshold   If any of the characters have less\n *                    certainty than this, reject.\n *  @param[out]  left_ok  How many left-side characters were ok?\n *  @param[out]  right_ok  How many right-side characters were ok?\n *  @return  Whether the complete best choice is believable as a superscript.\n */\nbool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,\n                                      int *left_ok, int *right_ok) const {\n  unsigned initial_ok_run_count = 0;\n  unsigned ok_run_count = 0;\n  float worst_certainty = 0.0f;\n  const WERD_CHOICE &wc = *word.best_choice;\n\n  const UnicityTable<FontInfo> &fontinfo_table = 
get_fontinfo_table();\n  for (unsigned i = 0; i < wc.length(); i++) {\n    TBLOB *blob = word.rebuild_word->blobs[i];\n    UNICHAR_ID unichar_id = wc.unichar_id(i);\n    float char_certainty = wc.certainty(i);\n    bool bad_certainty = char_certainty < certainty_threshold;\n    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);\n    bool is_italic = word.fontinfo && word.fontinfo->is_italic();\n    BLOB_CHOICE *choice = word.GetBlobChoice(i);\n    if (choice && fontinfo_table.size() > 0) {\n      // Get better information from the specific choice, if available.\n      int font_id1 = choice->fontinfo_id();\n      bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false;\n      int font_id2 = choice->fontinfo_id2();\n      is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic());\n    }\n\n    float height_fraction = 1.0f;\n    float char_height = blob->bounding_box().height();\n    float normal_height = char_height;\n    if (wc.unicharset()->top_bottom_useful()) {\n      int min_bot, max_bot, min_top, max_top;\n      wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);\n      float hi_height = max_top - max_bot;\n      float lo_height = min_top - min_bot;\n      normal_height = (hi_height + lo_height) / 2;\n      if (normal_height >= kBlnXHeight) {\n        // Only ding characters that we have decent information for because\n        // they're supposed to be normal sized, not tiny specks or dashes.\n        height_fraction = char_height / normal_height;\n      }\n    }\n    bool bad_height = height_fraction < superscript_scaledown_ratio;\n\n    if (debug) {\n      if (is_italic) {\n        tprintf(\" Rejecting: superscript is italic.\\n\");\n      }\n      if (is_punc) {\n        tprintf(\" Rejecting: punctuation present.\\n\");\n      }\n      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);\n      if (bad_certainty) {\n        tprintf(\n      
      \" Rejecting: don't believe character %s with certainty %.2f \"\n            \"which is less than threshold %.2f\\n\",\n            char_str, char_certainty, certainty_threshold);\n      }\n      if (bad_height) {\n        tprintf(\n            \" Rejecting: character %s seems too small @ %.2f versus \"\n            \"expected %.2f\\n\",\n            char_str, char_height, normal_height);\n      }\n    }\n    if (bad_certainty || bad_height || is_punc || is_italic) {\n      if (ok_run_count == i) {\n        initial_ok_run_count = ok_run_count;\n      }\n      ok_run_count = 0;\n    } else {\n      ok_run_count++;\n    }\n    if (char_certainty < worst_certainty) {\n      worst_certainty = char_certainty;\n    }\n  }\n  bool all_ok = ok_run_count == wc.length();\n  if (all_ok && debug) {\n    tprintf(\" Accept: worst revised certainty is %.2f\\n\", worst_certainty);\n  }\n  if (!all_ok) {\n    if (left_ok) {\n      *left_ok = initial_ok_run_count;\n    }\n    if (right_ok) {\n      *right_ok = ok_run_count;\n    }\n  }\n  return all_ok;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/tessbox.cpp",
    "content": "/**********************************************************************\n * File:        tessbox.cpp  (Formerly tessbox.c)\n * Description: Black boxed Tess for developing a resaljet.\n * Author:      Ray Smith\n * Created:     Thu Apr 23 11:03:36 BST 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"mfoutline.h\"\n#include \"tesseractclass.h\"\n\n/**\n * @name tess_segment_pass_n\n *\n * Segment a word using the pass_n conditions of the tess segmenter.\n * @param pass_n pass number\n * @param word word to do\n */\n\nnamespace tesseract {\nvoid Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {\n  int saved_enable_assoc = 0;\n  int saved_chop_enable = 0;\n\n  if (word->word->flag(W_DONT_CHOP)) {\n    saved_enable_assoc = wordrec_enable_assoc;\n    saved_chop_enable = chop_enable;\n    wordrec_enable_assoc.set_value(false);\n    chop_enable.set_value(false);\n  }\n  if (pass_n == 1) {\n    set_pass1();\n  } else {\n    set_pass2();\n  }\n  recog_word(word);\n  if (word->best_choice == nullptr) {\n    word->SetupFake(*word->uch_set);\n  }\n  if (word->word->flag(W_DONT_CHOP)) {\n    wordrec_enable_assoc.set_value(saved_enable_assoc);\n    chop_enable.set_value(saved_chop_enable);\n  }\n}\n\n/**\n * @name tess_acceptable_word\n *\n * @return true if the word is regarded as \"good enough\".\n 
* @param word_choice after context\n * @param raw_choice before context\n */\nbool Tesseract::tess_acceptable_word(WERD_RES *word) {\n  return getDict().AcceptableResult(word);\n}\n\n/**\n * @name tess_add_doc_word\n *\n * Add the given word to the document dictionary\n */\nvoid Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {\n  getDict().add_document_word(*word_choice);\n}\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/tessedit.cpp",
    "content": "/**********************************************************************\n * File:        tessedit.cpp  (Formerly tessedit.c)\n * Description: (Previously) Main program for merge of tess and editor.\n *              Now just code to load the language model and various\n *              engine-specific data files.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"control.h\"\n#include \"matchdefs.h\"\n#include \"pageres.h\"\n#include \"params.h\"\n#include \"stopper.h\"\n#include \"tesseractclass.h\"\n#include \"tessvars.h\"\n#include \"tprintf.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"chop.h\"\n#  include \"intmatcher.h\"\n#  include \"reject.h\"\n#endif\n#include \"lstmrecognizer.h\"\n\nnamespace tesseract {\n\n// Read a \"config\" file containing a set of variable, value pairs.\n// Searches the standard places: tessdata/configs, tessdata/tessconfigs\n// and also accepts a relative or absolute path name.\nvoid Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {\n  std::string path = datadir;\n  path += \"configs/\";\n  path += filename;\n  FILE *fp;\n  if ((fp = fopen(path.c_str(), \"rb\")) != nullptr) 
{\n    fclose(fp);\n  } else {\n    path = datadir;\n    path += \"tessconfigs/\";\n    path += filename;\n    if ((fp = fopen(path.c_str(), \"rb\")) != nullptr) {\n      fclose(fp);\n    } else {\n      path = filename;\n    }\n  }\n  ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());\n}\n\n// Returns false if a unicharset file for the specified language was not found\n// or was invalid.\n// This function initializes TessdataManager. After TessdataManager is\n// no longer needed, TessdataManager::End() should be called.\n//\n// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless\n// it is OEM_DEFAULT, in which case the value of the variable will be obtained\n// from the language-specific config file (stored in [lang].traineddata), from\n// the config files specified on the command line or left as the default\n// OEM_TESSERACT_ONLY if none of the configs specify this variable.\nbool Tesseract::init_tesseract_lang_data(const std::string &arg0,\n                                         const std::string &language, OcrEngineMode oem,\n                                         char **configs, int configs_size,\n                                         const std::vector<std::string> *vars_vec,\n                                         const std::vector<std::string> *vars_values,\n                                         bool set_only_non_debug_params, TessdataManager *mgr) {\n  // Set the language data path prefix\n  lang = !language.empty() ? 
language : \"eng\";\n  language_data_path_prefix = datadir;\n  language_data_path_prefix += lang;\n  language_data_path_prefix += \".\";\n\n  // Initialize TessdataManager.\n  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;\n  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {\n    tprintf(\"Error opening data file %s\\n\", tessdata_path.c_str());\n    tprintf(\n        \"Please make sure the TESSDATA_PREFIX environment variable is set\"\n        \" to your \\\"tessdata\\\" directory.\\n\");\n    return false;\n  }\n#ifdef DISABLED_LEGACY_ENGINE\n  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);\n#else\n  if (oem == OEM_DEFAULT) {\n    // Set the engine mode from availability, which can then be overridden by\n    // the config file when we read it below.\n    if (!mgr->IsLSTMAvailable()) {\n      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);\n    } else if (!mgr->IsBaseAvailable()) {\n      tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);\n    } else {\n      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  // If a language specific config file (lang.config) exists, load it in.\n  TFile fp;\n  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {\n    ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());\n  }\n\n  SetParamConstraint set_params_constraint =\n      set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;\n  // Load tesseract variables from config files. 
This is done after loading\n  // language-specific variables from [lang].traineddata file, so that custom\n  // config files can override values in [lang].traineddata file.\n  for (int i = 0; i < configs_size; ++i) {\n    read_config_file(configs[i], set_params_constraint);\n  }\n\n  // Set params specified in vars_vec (done after setting params from config\n  // files, so that params in vars_vec can override those from files).\n  if (vars_vec != nullptr && vars_values != nullptr) {\n    for (unsigned i = 0; i < vars_vec->size(); ++i) {\n      if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),\n                                set_params_constraint, this->params())) {\n        tprintf(\"Warning: The parameter '%s' was not found.\\n\", (*vars_vec)[i].c_str());\n      }\n    }\n  }\n\n  if (!tessedit_write_params_to_file.empty()) {\n    FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), \"wb\");\n    if (params_file != nullptr) {\n      ParamUtils::PrintParams(params_file, this->params());\n      fclose(params_file);\n    } else {\n      tprintf(\"Failed to open %s for writing params.\\n\", tessedit_write_params_to_file.c_str());\n    }\n  }\n\n#ifndef DISABLED_LEGACY_ENGINE\n  // Determine which ocr engine(s) should be loaded and used for recognition.\n  if (oem != OEM_DEFAULT) {\n    tessedit_ocr_engine_mode.set_value(oem);\n  }\n#endif\n\n  // If we are only loading the config file (and so not planning on doing any\n  // recognition) then there's nothing else do here.\n  if (tessedit_init_config_only) {\n    return true;\n  }\n\n// The various OcrEngineMode settings (see tesseract/publictypes.h) determine\n// which engine-specific data files need to be loaded. 
If LSTM_ONLY is\n// requested, the base Tesseract files are *Not* required.\n#ifdef DISABLED_LEGACY_ENGINE\n  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {\n#else\n  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||\n      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {\n#endif // ndef DISABLED_LEGACY_ENGINE\n    if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {\n      lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());\n      ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : \"\", mgr));\n    } else {\n      tprintf(\"Error: LSTM requested, but not present!! Loading tesseract.\\n\");\n      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);\n    }\n  }\n\n  // Load the unicharset\n  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {\n    // Avoid requiring a unicharset when we aren't running base tesseract.\n    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {\n    tprintf(\n        \"Error: Tesseract (legacy) engine requested, but components are \"\n        \"not present in %s!!\\n\",\n        tessdata_path.c_str());\n    return false;\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n  if (unicharset.size() > MAX_NUM_CLASSES) {\n    tprintf(\"Error: Size of unicharset is greater than MAX_NUM_CLASSES\\n\");\n    return false;\n  }\n  right_to_left_ = unicharset.major_right_to_left();\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n  // Setup initial unichar ambigs table and read universal ambigs.\n  UNICHARSET encoder_unicharset;\n  encoder_unicharset.CopyFrom(unicharset);\n  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);\n  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);\n\n  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {\n    unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, 
ambigs_debug_level,\n                                     use_ambigs_for_adaption, &unicharset);\n  }\n\n  // Init ParamsModel.\n  // Load pass1 and pass2 weights (for now these two sets are the same, but in\n  // the future separate sets of weights can be generated).\n  for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {\n    language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));\n    if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {\n      if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {\n        return false;\n      }\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  return true;\n}\n\n// Helper returns true if the given string is in the vector of strings.\nstatic bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {\n  for (const auto &i : str_list) {\n    if (i == str) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Parse a string of the form [~]<lang>[+[~]<lang>]*.\n// Langs with no prefix get appended to to_load, provided they\n// are not in there already.\n// Langs with ~ prefix get appended to not_to_load, provided they are not in\n// there already.\nvoid Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,\n                                    std::vector<std::string> *not_to_load) {\n  std::string remains(lang_str);\n  // Look whether the model file uses a prefix which must be applied to\n  // included model files as well.\n  std::string prefix;\n  size_t found = lang.find_last_of('/');\n  if (found != std::string::npos) {\n    // A prefix was found.\n    prefix = lang.substr(0, found + 1);\n  }\n  while (!remains.empty()) {\n    // Find the start of the lang code and which vector to add to.\n    const char *start = remains.c_str();\n    while (*start == '+') {\n      ++start;\n    }\n    std::vector<std::string> *target = to_load;\n    if (*start == '~') {\n      target = 
not_to_load;\n      ++start;\n    }\n    // Find the index of the end of the lang code in string start.\n    int end = strlen(start);\n    const char *plus = strchr(start, '+');\n    if (plus != nullptr && plus - start < end) {\n      end = plus - start;\n    }\n    std::string lang_code(start);\n    lang_code.resize(end);\n    std::string next(start + end);\n    remains = std::move(next);\n    lang_code = prefix + lang_code;\n    // Check whether lang_code is already in the target vector and add.\n    if (!IsStrInList(lang_code, *target)) {\n      target->push_back(lang_code);\n    }\n  }\n}\n\n// Initialize for potentially a set of languages defined by the language\n// string and recursively any additional languages required by any language\n// traineddata file (via tessedit_load_sublangs in its config) that is loaded.\n// See init_tesseract_internal for args.\nint Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,\n                              const std::string &language, OcrEngineMode oem, char **configs,\n                              int configs_size, const std::vector<std::string> *vars_vec,\n                              const std::vector<std::string> *vars_values,\n                              bool set_only_non_debug_params, TessdataManager *mgr) {\n  std::vector<std::string> langs_to_load;\n  std::vector<std::string> langs_not_to_load;\n  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);\n\n  for (auto *lang : sub_langs_) {\n    delete lang;\n  }\n\n  // Set the basename, compute the data directory.\n  main_setup(arg0, textbase);\n\n  sub_langs_.clear();\n  // Find the first loadable lang and load into this.\n  // Add any languages that this language requires\n  bool loaded_primary = false;\n  // Load the rest into sub_langs_.\n  // WARNING: A range based for loop does not work here because langs_to_load\n  // might be changed in the loop when a new submodel is found.\n  for (size_t lang_index = 0; 
lang_index < langs_to_load.size(); ++lang_index) {\n    auto &lang_to_load = langs_to_load[lang_index];\n    if (!IsStrInList(lang_to_load, langs_not_to_load)) {\n      const char *lang_str = lang_to_load.c_str();\n      Tesseract *tess_to_init;\n      if (!loaded_primary) {\n        tess_to_init = this;\n      } else {\n        tess_to_init = new Tesseract;\n        tess_to_init->main_setup(arg0, textbase);\n      }\n\n      int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,\n                                                         configs_size, vars_vec, vars_values,\n                                                         set_only_non_debug_params, mgr);\n      // Forget that language, but keep any reader we were given.\n      mgr->Clear();\n\n      if (!loaded_primary) {\n        if (result < 0) {\n          tprintf(\"Failed loading language '%s'\\n\", lang_str);\n        } else {\n          ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,\n                              &langs_not_to_load);\n          loaded_primary = true;\n        }\n      } else {\n        if (result < 0) {\n          tprintf(\"Failed loading language '%s'\\n\", lang_str);\n          delete tess_to_init;\n        } else {\n          sub_langs_.push_back(tess_to_init);\n          // Add any languages that this language requires\n          ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,\n                              &langs_not_to_load);\n        }\n      }\n    }\n  }\n  if (!loaded_primary && !langs_to_load.empty()) {\n    tprintf(\"Tesseract couldn't load any languages!\\n\");\n    return -1; // Couldn't load any language!\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  if (!sub_langs_.empty()) {\n    // In multilingual mode word ratings have to be directly comparable,\n    // so use the same language model weights for all languages:\n    // use the primary language's params model if\n    // 
tessedit_use_primary_params_model is set,\n    // otherwise use default language model weights.\n    if (tessedit_use_primary_params_model) {\n      for (auto &sub_lang : sub_langs_) {\n        sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());\n      }\n      tprintf(\"Using params model of the primary language\\n\");\n    } else {\n      this->language_model_->getParamsModel().Clear();\n      for (auto &sub_lang : sub_langs_) {\n        sub_lang->language_model_->getParamsModel().Clear();\n      }\n    }\n  }\n\n  SetupUniversalFontIds();\n#endif // ndef DISABLED_LEGACY_ENGINE\n  return 0;\n}\n\n// Common initialization for a single language.\n// arg0 is the datapath for the tessdata directory, which could be the\n// path of the tessdata directory with no trailing /, or (if tessdata\n// lives in the same directory as the executable, the path of the executable,\n// hence the name arg0.\n// textbase is an optional output file basename (used only for training)\n// language is the language code to load.\n// oem controls which engine(s) will operate on the image\n// configs (argv) is an array of config filenames to load variables from.\n// May be nullptr.\n// configs_size (argc) is the number of elements in configs.\n// vars_vec is an optional vector of variables to set.\n// vars_values is an optional corresponding vector of values for the variables\n// in vars_vec.\n// If set_only_non_debug_params is true, only params that do not contain\n// \"debug\" in the name will be set.\nint Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,\n                                       const std::string &language, OcrEngineMode oem,\n                                       char **configs, int configs_size,\n                                       const std::vector<std::string> *vars_vec,\n                                       const std::vector<std::string> *vars_values,\n                                     
  bool set_only_non_debug_params, TessdataManager *mgr) {\n  if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,\n                                vars_values, set_only_non_debug_params, mgr)) {\n    return -1;\n  }\n  if (tessedit_init_config_only) {\n    return 0;\n  }\n  // If only LSTM will be used, skip loading Tesseract classifier's\n  // pre-trained templates and dictionary.\n  bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;\n  program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);\n  return 0; // Normal exit\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n// Helper builds the all_fonts table by adding new fonts from new_fonts.\nstatic void CollectFonts(const UnicityTable<FontInfo> &new_fonts,\n                         UnicityTable<FontInfo> *all_fonts) {\n  for (int i = 0; i < new_fonts.size(); ++i) {\n    // UnicityTable uniques as we go.\n    all_fonts->push_back(new_fonts.at(i));\n  }\n}\n\n// Helper assigns an id to lang_fonts using the index in all_fonts table.\nstatic void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {\n  for (int i = 0; i < lang_fonts->size(); ++i) {\n    auto index = all_fonts.get_index(lang_fonts->at(i));\n    lang_fonts->at(i).universal_id = index;\n  }\n}\n\n// Set the universal_id member of each font to be unique among all\n// instances of the same font loaded.\nvoid Tesseract::SetupUniversalFontIds() {\n  // Note that we can get away with bitwise copying FontInfo in\n  // all_fonts, as it is a temporary structure and we avoid setting the\n  // delete callback.\n  UnicityTable<FontInfo> all_fonts;\n\n  // Create the universal ID table.\n  CollectFonts(get_fontinfo_table(), &all_fonts);\n  for (auto &sub_lang : sub_langs_) {\n    CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);\n  }\n  // Assign ids from the table to each font table.\n  AssignIds(all_fonts, &get_fontinfo_table());\n  for (auto &sub_lang : 
sub_langs_) {\n    AssignIds(all_fonts, &sub_lang->get_fontinfo_table());\n  }\n  font_table_size_ = all_fonts.size();\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\nvoid Tesseract::end_tesseract() {\n  end_recog();\n}\n\n/* Define command type identifiers */\n\nenum CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT };\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/tesseractclass.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tesseractclass.cpp\n// Description: The Tesseract class. It holds/owns everything needed\n//              to run Tesseract on a single language, and also a set of\n//              sub-Tesseracts to run sub-languages. For thread safety, *every*\n//              variable that was previously global or static (except for\n//              constant data, and some visual debugging flags) has been moved\n//              in here, directly, or indirectly.\n//              This makes it safe to run multiple Tesseracts in different\n//              threads in parallel, and keeps the different language\n//              instances separate.\n//              Some global functions remain, but they are isolated re-entrant\n//              functions that operate on their arguments. Functions that work\n//              on variable data have been moved to an appropriate class based\n//              mostly on the directory hierarchy. 
For more information see\n//              slide 6 of \"2ArchitectureAndDataStructures\" in\n// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing\n//              Some global data and related functions still exist in the\n//              training-related code, but they don't interfere with normal\n//              recognition operation.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"tesseractclass.h\"\n\n#include <allheaders.h>\n#include \"edgblob.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"equationdetect.h\"\n#endif\n#include \"lstmrecognizer.h\"\n#include \"thresholder.h\" // for ThresholdMethod\n\nnamespace tesseract {\n\nTesseract::Tesseract()\n    : BOOL_MEMBER(tessedit_resegment_from_boxes, false,\n                  \"Take segmentation and labeling from box file\", this->params())\n    , BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,\n                  \"Conversion of word/line box file to char box file\", this->params())\n    , BOOL_MEMBER(tessedit_train_from_boxes, false, \"Generate training data from boxed chars\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, \"Generate more 
boxes from boxed chars\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_train_line_recognizer, false,\n                  \"Break input into lines and remap boxes if present\", this->params())\n    , BOOL_MEMBER(tessedit_dump_pageseg_images, false,\n                  \"Dump intermediate images made during page segmentation\", this->params())\n    // TODO: remove deprecated tessedit_do_invert in release 6.\n    , BOOL_MEMBER(tessedit_do_invert, true,\n                  \"Try inverted line image if necessary (deprecated, will be \"\n                  \"removed in release 6, use the 'invert_threshold' parameter instead)\",\n                  this->params())\n    , double_MEMBER(invert_threshold, 0.7,\n                    \"For lines with a mean confidence below this value, OCR is also tried with an inverted image\",\n                    this->params())\n    ,\n    // The default for pageseg_mode is the old behaviour, so as not to\n    // upset anything that relies on that.\n    INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,\n               \"Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, \"\n               \"4=column,\"\n               \" 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,\"\n               \"11=sparse_text, 12=sparse_text+osd, 13=raw_line\"\n               \" (Values from PageSegMode enum in tesseract/publictypes.h)\",\n               this->params())\n    , INT_MEMBER(thresholding_method,\n                 static_cast<int>(ThresholdMethod::Otsu),\n                 \"Thresholding method: 0 = Otsu, 1 = LeptonicaOtsu, 2 = \"\n                 \"Sauvola\",\n                 this->params())\n    , BOOL_MEMBER(thresholding_debug, false,\n                  \"Debug the thresholding process\",\n                  this->params())\n    , double_MEMBER(thresholding_window_size, 0.33,\n                    \"Window size for measuring local statistics (to be \"\n                    \"multiplied by image DPI). 
\"\n                    \"This parameter is used by the Sauvola thresholding method\",\n                    this->params())\n    , double_MEMBER(thresholding_kfactor, 0.34,\n                    \"Factor for reducing threshold due to variance. \"\n                    \"This parameter is used by the Sauvola thresholding method.\"\n                    \" Normal range: 0.2-0.5\",\n                    this->params())\n    , double_MEMBER(thresholding_tile_size, 0.33,\n                    \"Desired tile size (to be multiplied by image DPI). \"\n                    \"This parameter is used by the LeptonicaOtsu thresholding \"\n                    \"method\",\n                    this->params())\n    , double_MEMBER(thresholding_smooth_kernel_size, 0.0,\n                    \"Size of convolution kernel applied to threshold array \"\n                    \"(to be multiplied by image DPI). Use 0 for no smoothing. \"\n                    \"This parameter is used by the LeptonicaOtsu thresholding \"\n                    \"method\",\n                    this->params())\n    , double_MEMBER(thresholding_score_fraction, 0.1,\n                    \"Fraction of the max Otsu score. \"\n                    \"This parameter is used by the LeptonicaOtsu thresholding \"\n                    \"method. 
\"\n                    \"For standard Otsu use 0.0, otherwise 0.1 is recommended\",\n                    this->params())\n    , INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,\n                      \"Which OCR engine(s) to run (Tesseract, LSTM, both).\"\n                      \" Defaults to loading and running the most accurate\"\n                      \" available.\",\n                      this->params())\n    , STRING_MEMBER(tessedit_char_blacklist, \"\", \"Blacklist of chars not to recognize\",\n                    this->params())\n    , STRING_MEMBER(tessedit_char_whitelist, \"\", \"Whitelist of chars to recognize\", this->params())\n    , STRING_MEMBER(tessedit_char_unblacklist, \"\",\n                    \"List of chars to override tessedit_char_blacklist\", this->params())\n    , BOOL_MEMBER(tessedit_ambigs_training, false, \"Perform training for ambiguities\",\n                  this->params())\n    , INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,\n                 \"Whether to use the top-line splitting process for Devanagari \"\n                 \"documents while performing page-segmentation.\",\n                 this->params())\n    , INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,\n                 \"Whether to use the top-line splitting process for Devanagari \"\n                 \"documents while performing ocr.\",\n                 this->params())\n    , STRING_MEMBER(tessedit_write_params_to_file, \"\", \"Write all parameters to the given file.\",\n                    this->params())\n    , BOOL_MEMBER(tessedit_adaption_debug, false,\n                  \"Generate and print debug\"\n                  \" information for adaption\",\n                  this->params())\n    , INT_MEMBER(bidi_debug, 0, \"Debug level for BiDi\", this->params())\n    , INT_MEMBER(applybox_debug, 1, \"Debug level\", this->params())\n    , INT_MEMBER(applybox_page, 0, \"Page number 
to apply boxes from\", this->params())\n    , STRING_MEMBER(applybox_exposure_pattern, \".exp\",\n                    \"Exposure value follows\"\n                    \" this pattern in the image filename. The name of the image\"\n                    \" files are expected to be in the form\"\n                    \" [lang].[fontname].exp[num].tif\",\n                    this->params())\n    , BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,\n                  \"Learn both character fragments (as is done in the\"\n                  \" special low exposure mode) as well as unfragmented\"\n                  \" characters.\",\n                  this->params())\n    , BOOL_MEMBER(applybox_learn_ngrams_mode, false,\n                  \"Each bounding box\"\n                  \" is assumed to contain ngrams. Only learn the ngrams\"\n                  \" whose outlines overlap horizontally.\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_display_outwords, false, \"Draw output words\", this->params())\n    , BOOL_MEMBER(tessedit_dump_choices, false, \"Dump char choices\", this->params())\n    , BOOL_MEMBER(tessedit_timing_debug, false, \"Print timing stats\", this->params())\n    , BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, \"Try to improve fuzzy spaces\", this->params())\n    , BOOL_MEMBER(tessedit_unrej_any_wd, false, \"Don't bother with word plausibility\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_fix_hyphens, true, \"Crunch double hyphens?\", this->params())\n    , BOOL_MEMBER(tessedit_enable_doc_dict, true, \"Add words to the document dictionary\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_debug_fonts, false, \"Output font info per char\", this->params())\n    , INT_MEMBER(tessedit_font_id, 0, \"Font ID to use or zero\", this->params())\n    , BOOL_MEMBER(tessedit_debug_block_rejection, false, \"Block and Row stats\", this->params())\n    , BOOL_MEMBER(tessedit_enable_bigram_correction, true,\n     
             \"Enable correction based on the word bigram dictionary.\", this->params())\n    , BOOL_MEMBER(tessedit_enable_dict_correction, false,\n                  \"Enable single word correction based on the dictionary.\", this->params())\n    , INT_MEMBER(tessedit_bigram_debug, 0, \"Amount of debug output for bigram correction.\",\n                 this->params())\n    , BOOL_MEMBER(enable_noise_removal, true,\n                  \"Remove and conditionally reassign small outlines when they\"\n                  \" confuse layout analysis, determining diacritics vs noise\",\n                  this->params())\n    , INT_MEMBER(debug_noise_removal, 0, \"Debug reassignment of small outlines\", this->params())\n    ,\n    // Worst (min) certainty, for which a diacritic is allowed to make the\n    // base\n    // character worse and still be included.\n    double_MEMBER(noise_cert_basechar, -8.0, \"Hingepoint for base char certainty\", this->params())\n    ,\n    // Worst (min) certainty, for which a non-overlapping diacritic is allowed\n    // to make the base character worse and still be included.\n    double_MEMBER(noise_cert_disjoint, -1.0, \"Hingepoint for disjoint certainty\", this->params())\n    ,\n    // Worst (min) certainty, for which a diacritic is allowed to make a new\n    // stand-alone blob.\n    double_MEMBER(noise_cert_punc, -3.0, \"Threshold for new punc char certainty\", this->params())\n    ,\n    // Factor of certainty margin for adding diacritics to not count as worse.\n    double_MEMBER(noise_cert_factor, 0.375, \"Scaling on certainty diff from Hingepoint\",\n                  this->params())\n    , INT_MEMBER(noise_maxperblob, 8, \"Max diacritics to apply to a blob\", this->params())\n    , INT_MEMBER(noise_maxperword, 16, \"Max diacritics to apply to a word\", this->params())\n    , INT_MEMBER(debug_x_ht_level, 0, \"Reestimate debug\", this->params())\n    , STRING_MEMBER(chs_leading_punct, \"('`\\\"\", \"Leading punctuation\", 
this->params())\n    , STRING_MEMBER(chs_trailing_punct1, \").,;:?!\", \"1st Trailing punctuation\", this->params())\n    , STRING_MEMBER(chs_trailing_punct2, \")'`\\\"\", \"2nd Trailing punctuation\", this->params())\n    , double_MEMBER(quality_rej_pc, 0.08, \"good_quality_doc lte rejection limit\", this->params())\n    , double_MEMBER(quality_blob_pc, 0.0, \"good_quality_doc gte good blobs limit\", this->params())\n    , double_MEMBER(quality_outline_pc, 1.0, \"good_quality_doc lte outline error limit\",\n                    this->params())\n    , double_MEMBER(quality_char_pc, 0.95, \"good_quality_doc gte good char limit\", this->params())\n    , INT_MEMBER(quality_min_initial_alphas_reqd, 2, \"alphas in a good word\", this->params())\n    , INT_MEMBER(tessedit_tess_adaption_mode, 0x27, \"Adaptation decision algorithm for tess\",\n                 this->params())\n    , BOOL_MEMBER(tessedit_minimal_rej_pass1, false, \"Do minimal rejection on pass 1 output\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_test_adaption, false, \"Test adaption criteria\", this->params())\n    , BOOL_MEMBER(test_pt, false, \"Test for point\", this->params())\n    , double_MEMBER(test_pt_x, 99999.99, \"xcoord\", this->params())\n    , double_MEMBER(test_pt_y, 99999.99, \"ycoord\", this->params())\n    , INT_MEMBER(multilang_debug_level, 0, \"Print multilang debug info.\", this->params())\n    , INT_MEMBER(paragraph_debug_level, 0, \"Print paragraph debug info.\", this->params())\n    , BOOL_MEMBER(paragraph_text_based, true,\n                  \"Run paragraph detection on the post-text-recognition \"\n                  \"(more accurate)\",\n                  this->params())\n    , BOOL_MEMBER(lstm_use_matrix, 1, \"Use ratings matrix/beam search with lstm\", this->params())\n    , STRING_MEMBER(outlines_odd, \"%| \", \"Non standard number of outlines\", this->params())\n    , STRING_MEMBER(outlines_2, \"ij!?%\\\":;\", \"Non standard number of outlines\", 
this->params())\n    , BOOL_MEMBER(tessedit_good_quality_unrej, true, \"Reduce rejection on good docs\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_use_reject_spaces, true, \"Reject spaces?\", this->params())\n    , double_MEMBER(tessedit_reject_doc_percent, 65.00, \"%rej allowed before rej whole doc\",\n                    this->params())\n    , double_MEMBER(tessedit_reject_block_percent, 45.00, \"%rej allowed before rej whole block\",\n                    this->params())\n    , double_MEMBER(tessedit_reject_row_percent, 40.00, \"%rej allowed before rej whole row\",\n                    this->params())\n    , double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,\n                    \"Number of row rejects in whole word rejects\"\n                    \" which prevents whole row rejection\",\n                    this->params())\n    , BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,\n                  \"Only rej partially rejected words in block rejection\", this->params())\n    , BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,\n                  \"Only rej partially rejected words in row rejection\", this->params())\n    , BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, \"Use word segmentation quality metric\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, \"Use word segmentation quality metric\",\n                  this->params())\n    , INT_MEMBER(tessedit_preserve_min_wd_len, 2, \"Only preserve wds longer than this\",\n                 this->params())\n    , BOOL_MEMBER(tessedit_row_rej_good_docs, true, \"Apply row rejection to good docs\",\n                  this->params())\n    , double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,\n                    \"rej good doc wd if more than this fraction rejected\", this->params())\n    , BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, \"Reject all bad quality wds\", this->params())\n    , 
BOOL_MEMBER(tessedit_debug_doc_rejection, false, \"Page stats\", this->params())\n    , BOOL_MEMBER(tessedit_debug_quality_metrics, false, \"Output data to debug file\",\n                  this->params())\n    , BOOL_MEMBER(bland_unrej, false, \"unrej potential with no checks\", this->params())\n    , double_MEMBER(quality_rowrej_pc, 1.1, \"good_quality_doc gte good char limit\", this->params())\n    , BOOL_MEMBER(unlv_tilde_crunching, false, \"Mark v.bad words for tilde crunch\", this->params())\n    , BOOL_MEMBER(hocr_font_info, false, \"Add font info to hocr output\", this->params())\n    , BOOL_MEMBER(hocr_char_boxes, false, \"Add coordinates for each character to hocr output\",\n                  this->params())\n    , BOOL_MEMBER(crunch_early_merge_tess_fails, true, \"Before word crunch?\", this->params())\n    , BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, \"Take out ~^ early?\", this->params())\n    , double_MEMBER(crunch_terrible_rating, 80.0, \"crunch rating lt this\", this->params())\n    , BOOL_MEMBER(crunch_terrible_garbage, true, \"As it says\", this->params())\n    , double_MEMBER(crunch_poor_garbage_cert, -9.0, \"crunch garbage cert lt this\", this->params())\n    , double_MEMBER(crunch_poor_garbage_rate, 60, \"crunch garbage rating lt this\", this->params())\n    , double_MEMBER(crunch_pot_poor_rate, 40, \"POTENTIAL crunch rating lt this\", this->params())\n    , double_MEMBER(crunch_pot_poor_cert, -8.0, \"POTENTIAL crunch cert lt this\", this->params())\n    , double_MEMBER(crunch_del_rating, 60, \"POTENTIAL crunch rating lt this\", this->params())\n    , double_MEMBER(crunch_del_cert, -10.0, \"POTENTIAL crunch cert lt this\", this->params())\n    , double_MEMBER(crunch_del_min_ht, 0.7, \"Del if word ht lt xht x this\", this->params())\n    , double_MEMBER(crunch_del_max_ht, 3.0, \"Del if word ht gt xht x this\", this->params())\n    , double_MEMBER(crunch_del_min_width, 3.0, \"Del if word width lt xht x this\", this->params())\n    , 
double_MEMBER(crunch_del_high_word, 1.5, \"Del if word gt xht x this above bl\", this->params())\n    , double_MEMBER(crunch_del_low_word, 0.5, \"Del if word gt xht x this below bl\", this->params())\n    , double_MEMBER(crunch_small_outlines_size, 0.6, \"Small if lt xht x this\", this->params())\n    , INT_MEMBER(crunch_rating_max, 10, \"For adj length in rating per ch\", this->params())\n    , INT_MEMBER(crunch_pot_indicators, 1, \"How many potential indicators needed\", this->params())\n    , BOOL_MEMBER(crunch_leave_ok_strings, true, \"Don't touch sensible strings\", this->params())\n    , BOOL_MEMBER(crunch_accept_ok, true, \"Use acceptability in okstring\", this->params())\n    , BOOL_MEMBER(crunch_leave_accept_strings, false, \"Don't pot crunch sensible strings\",\n                  this->params())\n    , BOOL_MEMBER(crunch_include_numerals, false, \"Fiddle alpha figures\", this->params())\n    , INT_MEMBER(crunch_leave_lc_strings, 4, \"Don't crunch words with long lower case strings\",\n                 this->params())\n    , INT_MEMBER(crunch_leave_uc_strings, 4, \"Don't crunch words with long lower case strings\",\n                 this->params())\n    , INT_MEMBER(crunch_long_repetitions, 3, \"Crunch words with long repetitions\", this->params())\n    , INT_MEMBER(crunch_debug, 0, \"As it says\", this->params())\n    , INT_MEMBER(fixsp_non_noise_limit, 1, \"How many non-noise blbs either side?\", this->params())\n    , double_MEMBER(fixsp_small_outlines_size, 0.28, \"Small if lt xht x this\", this->params())\n    , BOOL_MEMBER(tessedit_prefer_joined_punct, false, \"Reward punctuation joins\", this->params())\n    , INT_MEMBER(fixsp_done_mode, 1, \"What constitutes done for spacing\", this->params())\n    , INT_MEMBER(debug_fix_space_level, 0, \"Contextual fixspace debug\", this->params())\n    , STRING_MEMBER(numeric_punctuation, \".,\", \"Punct. 
chs expected WITHIN numbers\", this->params())\n    , INT_MEMBER(x_ht_acceptance_tolerance, 8,\n                 \"Max allowed deviation of blob top outside of font data\", this->params())\n    , INT_MEMBER(x_ht_min_change, 8, \"Min change in xht before actually trying it\", this->params())\n    , INT_MEMBER(superscript_debug, 0, \"Debug level for sub & superscript fixer\", this->params())\n    , double_MEMBER(superscript_worse_certainty, 2.0,\n                    \"How many times worse \"\n                    \"certainty does a superscript position glyph need to be for \"\n                    \"us to try classifying it as a char with a different \"\n                    \"baseline?\",\n                    this->params())\n    , double_MEMBER(superscript_bettered_certainty, 0.97,\n                    \"What reduction in \"\n                    \"badness do we think sufficient to choose a superscript \"\n                    \"over what we'd thought.  For example, a value of 0.6 means \"\n                    \"we want to reduce badness of certainty by at least 40%\",\n                    this->params())\n    , double_MEMBER(superscript_scaledown_ratio, 0.4,\n                    \"A superscript scaled down more than this is unbelievably \"\n                    \"small.  
For example, 0.3 means we expect the font size to \"\n                    \"be no smaller than 30% of the text line font size.\",\n                    this->params())\n    , double_MEMBER(subscript_max_y_top, 0.5,\n                    \"Maximum top of a character measured as a multiple of \"\n                    \"x-height above the baseline for us to reconsider whether \"\n                    \"it's a subscript.\",\n                    this->params())\n    , double_MEMBER(superscript_min_y_bottom, 0.3,\n                    \"Minimum bottom of a character measured as a multiple of \"\n                    \"x-height above the baseline for us to reconsider whether \"\n                    \"it's a superscript.\",\n                    this->params())\n    , BOOL_MEMBER(tessedit_write_block_separators, false, \"Write block separators in output\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_write_rep_codes, false, \"Write repetition char code\", this->params())\n    , BOOL_MEMBER(tessedit_write_unlv, false, \"Write .unlv output file\", this->params())\n    , BOOL_MEMBER(tessedit_create_txt, false, \"Write .txt output file\", this->params())\n    , BOOL_MEMBER(tessedit_create_hocr, false, \"Write .html hOCR output file\", this->params())\n    , BOOL_MEMBER(tessedit_create_alto, false, \"Write .xml ALTO file\", this->params())\n    , BOOL_MEMBER(tessedit_create_page_xml, false, \"Write .page.xml PAGE file\", this->params())\n    , BOOL_MEMBER(page_xml_polygon, true, \"Create the PAGE file with polygons instead of box values\", this->params())\n    , INT_MEMBER(page_xml_level, 0, \"Create the PAGE file on 0=line or 1=word level.\", this->params())\n    , BOOL_MEMBER(tessedit_create_lstmbox, false, \"Write .box file for LSTM training\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_create_tsv, false, \"Write .tsv output file\", this->params())\n    , BOOL_MEMBER(tessedit_create_wordstrbox, false, \"Write WordStr format .box output 
file\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_create_pdf, false, \"Write .pdf output file\", this->params())\n    , BOOL_MEMBER(textonly_pdf, false, \"Create PDF with only one invisible text layer\",\n                  this->params())\n    , INT_MEMBER(jpg_quality, 85, \"Set JPEG quality level\", this->params())\n    , INT_MEMBER(user_defined_dpi, 0, \"Specify DPI for input image\", this->params())\n    , INT_MEMBER(min_characters_to_try, 50, \"Specify minimum characters to try during OSD\",\n                 this->params())\n    , STRING_MEMBER(unrecognised_char, \"|\", \"Output char for unidentified blobs\", this->params())\n    , INT_MEMBER(suspect_level, 99, \"Suspect marker level\", this->params())\n    , INT_MEMBER(suspect_short_words, 2, \"Don't suspect dict wds longer than this\", this->params())\n    , BOOL_MEMBER(suspect_constrain_1Il, false, \"UNLV keep 1Il chars rejected\", this->params())\n    , double_MEMBER(suspect_rating_per_ch, 999.9, \"Don't touch bad rating limit\", this->params())\n    , double_MEMBER(suspect_accept_rating, -999.9, \"Accept good rating limit\", this->params())\n    , BOOL_MEMBER(tessedit_minimal_rejection, false, \"Only reject tess failures\", this->params())\n    , BOOL_MEMBER(tessedit_zero_rejection, false, \"Don't reject ANYTHING\", this->params())\n    , BOOL_MEMBER(tessedit_word_for_word, false, \"Make output have exactly one word per WERD\",\n                  this->params())\n    , BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, \"Don't reject ANYTHING AT ALL\",\n                  this->params())\n    , INT_MEMBER(tessedit_reject_mode, 0, \"Rejection algorithm\", this->params())\n    , BOOL_MEMBER(tessedit_rejection_debug, false, \"Adaption debug\", this->params())\n    , BOOL_MEMBER(tessedit_flip_0O, true, \"Contextual 0O O0 flips\", this->params())\n    , double_MEMBER(tessedit_lower_flip_hyphen, 1.5, \"Aspect ratio dot/hyphen test\", this->params())\n    , 
double_MEMBER(tessedit_upper_flip_hyphen, 1.8, \"Aspect ratio dot/hyphen test\", this->params())\n    , BOOL_MEMBER(rej_trust_doc_dawg, false, \"Use DOC dawg in 11l conf. detector\", this->params())\n    , BOOL_MEMBER(rej_1Il_use_dict_word, false, \"Use dictword test\", this->params())\n    , BOOL_MEMBER(rej_1Il_trust_permuter_type, true, \"Don't double check\", this->params())\n    , BOOL_MEMBER(rej_use_tess_accepted, true, \"Individual rejection control\", this->params())\n    , BOOL_MEMBER(rej_use_tess_blanks, true, \"Individual rejection control\", this->params())\n    , BOOL_MEMBER(rej_use_good_perm, true, \"Individual rejection control\", this->params())\n    , BOOL_MEMBER(rej_use_sensible_wd, false, \"Extend permuter check\", this->params())\n    , BOOL_MEMBER(rej_alphas_in_number_perm, false, \"Extend permuter check\", this->params())\n    , double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, \"if >this fract\", this->params())\n    , INT_MEMBER(tessedit_image_border, 2, \"Rej blbs near image edge limit\", this->params())\n    , STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, \"-?*\\075\", \"Allow NN to unrej\", this->params())\n    , STRING_MEMBER(conflict_set_I_l_1, \"Il1[]\", \"Il1 conflict set\", this->params())\n    , INT_MEMBER(min_sane_x_ht_pixels, 8, \"Reject any x-ht lt or eq than this\", this->params())\n    , BOOL_MEMBER(tessedit_create_boxfile, false, \"Output text with boxes\", this->params())\n    , INT_MEMBER(tessedit_page_number, -1, \"-1 -> All pages, else specific page to process\",\n                 this->params())\n    , BOOL_MEMBER(tessedit_write_images, false, \"Capture the image from the IPE\", this->params())\n    , BOOL_MEMBER(interactive_display_mode, false, \"Run interactively?\", this->params())\n    , STRING_MEMBER(file_type, \".tif\", \"Filename extension\", this->params())\n    , BOOL_MEMBER(tessedit_override_permuter, true, \"According to dict_word\", this->params())\n    , STRING_MEMBER(tessedit_load_sublangs, \"\", 
\"List of languages to load with this one\",\n                    this->params())\n    , BOOL_MEMBER(tessedit_use_primary_params_model, false,\n                  \"In multilingual mode use params model of the\"\n                  \" primary language\",\n                  this->params())\n    , double_MEMBER(min_orientation_margin, 7.0, \"Min acceptable orientation margin\",\n                    this->params())\n    , BOOL_MEMBER(textord_tabfind_show_vlines, false, \"Debug line finding\", this->params())\n    , BOOL_MEMBER(textord_use_cjk_fp_model, false, \"Use CJK fixed pitch model\", this->params())\n    , BOOL_MEMBER(poly_allow_detailed_fx, false,\n                  \"Allow feature extractors to see the original outline\", this->params())\n    , BOOL_INIT_MEMBER(tessedit_init_config_only, false,\n                       \"Only initialize with the config file. Useful if the \"\n                       \"instance is not going to be used for OCR but say only \"\n                       \"for layout analysis.\",\n                       this->params())\n#ifndef DISABLED_LEGACY_ENGINE\n    , BOOL_MEMBER(textord_equation_detect, false, \"Turn on equation detector\", this->params())\n#endif // ndef DISABLED_LEGACY_ENGINE\n    , BOOL_MEMBER(textord_tabfind_vertical_text, true, \"Enable vertical detection\", this->params())\n    , BOOL_MEMBER(textord_tabfind_force_vertical_text, false, \"Force using vertical text page mode\",\n                  this->params())\n    , double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,\n                    \"Fraction of textlines deemed vertical to use vertical page \"\n                    \"mode\",\n                    this->params())\n    , double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,\n                    \"Fraction of height used as a minimum gap for aligned blobs.\", this->params())\n    , INT_MEMBER(tessedit_parallelize, 0, \"Run in parallel where possible\", this->params())\n    , BOOL_MEMBER(preserve_interword_spaces, 
false, \"Preserve multiple interword spaces\",\n                  this->params())\n    , STRING_MEMBER(page_separator, \"\\f\", \"Page separator (default is form feed control character)\",\n                    this->params())\n    , INT_MEMBER(lstm_choice_mode, 0,\n                 \"Allows to include alternative symbols choices in the hOCR output. \"\n                 \"Valid input values are 0, 1 and 2. 0 is the default value. \"\n                 \"With 1 the alternative symbol choices per timestep are included. \"\n                 \"With 2 alternative symbol choices are extracted from the CTC \"\n                 \"process instead of the lattice. The choices are mapped per \"\n                 \"character.\",\n                 this->params())\n    , INT_MEMBER(lstm_choice_iterations, 5,\n                 \"Sets the number of cascading iterations for the Beamsearch in \"\n                 \"lstm_choice_mode. Note that lstm_choice_mode must be set to a \"\n                 \"value greater than 0 to produce results.\",\n                 this->params())\n    , double_MEMBER(lstm_rating_coefficient, 5,\n                    \"Sets the rating coefficient for the lstm choices. The smaller the \"\n                    \"coefficient, the better are the ratings for each choice and less \"\n                    \"information is lost due to the cut off at 0. 
The standard value is \"\n                    \"5\",\n                    this->params())\n    , BOOL_MEMBER(pageseg_apply_music_mask, false,\n                  \"Detect music staff and remove intersecting components\", this->params())\n    ,\n\n    backup_config_file_(nullptr)\n    , pix_binary_(nullptr)\n    , pix_grey_(nullptr)\n    , pix_original_(nullptr)\n    , pix_thresholds_(nullptr)\n    , source_resolution_(0)\n    , textord_(this)\n    , right_to_left_(false)\n    , scaled_color_(nullptr)\n    , scaled_factor_(-1)\n    , deskew_(1.0f, 0.0f)\n    , reskew_(1.0f, 0.0f)\n    , gradient_(0.0f)\n    , most_recently_used_(this)\n    , font_table_size_(0)\n#ifndef DISABLED_LEGACY_ENGINE\n    , equ_detect_(nullptr)\n#endif // ndef DISABLED_LEGACY_ENGINE\n    , lstm_recognizer_(nullptr)\n    , train_line_page_num_(0) {}\n\nTesseract::~Tesseract() {\n  Clear();\n  pix_original_.destroy();\n  end_tesseract();\n  for (auto *lang : sub_langs_) {\n    delete lang;\n  }\n  delete lstm_recognizer_;\n  lstm_recognizer_ = nullptr;\n}\n\nDict &Tesseract::getDict() {\n  if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang()) {\n    if (lstm_recognizer_ && lstm_recognizer_->GetDict()) {\n      return *lstm_recognizer_->GetDict();\n    }\n  }\n  return Classify::getDict();\n}\n\nvoid Tesseract::Clear() {\n  std::string debug_name = imagebasename + \"_debug.pdf\";\n  pixa_debug_.WritePDF(debug_name.c_str());\n  pix_binary_.destroy();\n  pix_grey_.destroy();\n  pix_thresholds_.destroy();\n  scaled_color_.destroy();\n  deskew_ = FCOORD(1.0f, 0.0f);\n  reskew_ = FCOORD(1.0f, 0.0f);\n  gradient_ = 0.0f;\n  splitter_.Clear();\n  scaled_factor_ = -1;\n  for (auto &sub_lang : sub_langs_) {\n    sub_lang->Clear();\n  }\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\nvoid Tesseract::SetEquationDetect(EquationDetect *detector) {\n  equ_detect_ = detector;\n  equ_detect_->SetLangTesseract(this);\n}\n\n// Clear all memory of adaption for this and all subclassifiers.\nvoid 
Tesseract::ResetAdaptiveClassifier() {\n  ResetAdaptiveClassifierInternal();\n  for (auto &sub_lang : sub_langs_) {\n    sub_lang->ResetAdaptiveClassifierInternal();\n  }\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n// Clear the document dictionary for this and all subclassifiers.\nvoid Tesseract::ResetDocumentDictionary() {\n  getDict().ResetDocumentDictionary();\n  for (auto &sub_lang : sub_langs_) {\n    sub_lang->getDict().ResetDocumentDictionary();\n  }\n}\n\nvoid Tesseract::SetBlackAndWhitelist() {\n  // Set the white and blacklists (if any)\n  unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),\n                                     tessedit_char_whitelist.c_str(),\n                                     tessedit_char_unblacklist.c_str());\n  if (lstm_recognizer_) {\n    UNICHARSET &lstm_unicharset = lstm_recognizer_->GetUnicharset();\n    lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),\n                                            tessedit_char_whitelist.c_str(),\n                                            tessedit_char_unblacklist.c_str());\n  }\n  // Black and white lists should apply to all loaded classifiers.\n  for (auto &sub_lang : sub_langs_) {\n    sub_lang->unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),\n                                                 tessedit_char_whitelist.c_str(),\n                                                 tessedit_char_unblacklist.c_str());\n    if (sub_lang->lstm_recognizer_) {\n      UNICHARSET &lstm_unicharset = sub_lang->lstm_recognizer_->GetUnicharset();\n      lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),\n                                              tessedit_char_whitelist.c_str(),\n                                              tessedit_char_unblacklist.c_str());\n    }\n  }\n}\n\n// Perform steps to prepare underlying binary image/other data structures for\n// page segmentation.\nvoid Tesseract::PrepareForPageseg() {\n  
textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);\n  // Find the max splitter strategy over all langs.\n  auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(\n      static_cast<int32_t>(pageseg_devanagari_split_strategy));\n  for (auto &sub_lang : sub_langs_) {\n    auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(\n        static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy));\n    if (pageseg_strategy > max_pageseg_strategy) {\n      max_pageseg_strategy = pageseg_strategy;\n    }\n    sub_lang->pix_binary_.destroy();\n    sub_lang->pix_binary_ = pix_binary().clone();\n  }\n  // Perform shiro-rekha (top-line) splitting and replace the current image by\n  // the newly split image.\n  splitter_.set_orig_pix(pix_binary());\n  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);\n  if (splitter_.Split(true, &pixa_debug_)) {\n    ASSERT_HOST(splitter_.splitted_image());\n    pix_binary_.destroy();\n    pix_binary_ = splitter_.splitted_image().clone();\n  }\n}\n\n// Perform steps to prepare underlying binary image/other data structures for\n// OCR. 
The current segmentation is required by this method.\n// Note that this method resets pix_binary_ to the original binarized image,\n// which may be different from the image actually used for OCR depending on the\n// value of devanagari_ocr_split_strategy.\nvoid Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) {\n  // Find the max splitter strategy over all langs.\n  auto max_ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(\n      static_cast<int32_t>(ocr_devanagari_split_strategy));\n  for (auto &sub_lang : sub_langs_) {\n    auto ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(\n        static_cast<int32_t>(sub_lang->ocr_devanagari_split_strategy));\n    if (ocr_strategy > max_ocr_strategy) {\n      max_ocr_strategy = ocr_strategy;\n    }\n  }\n  // Utilize the segmentation information available.\n  splitter_.set_segmentation_block_list(block_list);\n  splitter_.set_ocr_split_strategy(max_ocr_strategy);\n  // Run the splitter for OCR\n  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);\n  // Restore pix_binary to the binarized original pix for future reference.\n  ASSERT_HOST(splitter_.orig_pix());\n  pix_binary_.destroy();\n  pix_binary_ = splitter_.orig_pix().clone();\n  // If the pageseg and ocr strategies are different, refresh the block list\n  // (from the last SegmentImage call) with blobs from the real image to be used\n  // for OCR.\n  if (splitter_.HasDifferentSplitStrategies()) {\n    BLOCK block(\"\", true, 0, 0, 0, 0, pixGetWidth(pix_binary_), pixGetHeight(pix_binary_));\n    Image pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : splitter_.orig_pix();\n    extract_edges(pix_for_ocr, &block);\n    splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());\n  }\n  // The splitter isn't needed any more after this, so save memory by clearing.\n  splitter_.Clear();\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/tesseractclass.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tesseractclass.h\n// Description: The Tesseract class. It holds/owns everything needed\n//              to run Tesseract on a single language, and also a set of\n//              sub-Tesseracts to run sub-languages. For thread safety, *every*\n//              global variable goes in here, directly, or indirectly.\n//              This makes it safe to run multiple Tesseracts in different\n//              threads in parallel, and keeps the different language\n//              instances separate.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_\n#define TESSERACT_CCMAIN_TESSERACTCLASS_H_\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n\n#include \"control.h\"               // for ACCEPTABLE_WERD_TYPE\n#include \"debugpixa.h\"             // for DebugPixa\n#include \"devanagari_processing.h\" // for ShiroRekhaSplitter\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"docqual.h\" // for GARBAGE_LEVEL\n#endif\n#include \"genericvector.h\"   // for PointerVector\n#include \"pageres.h\"         // for WERD_RES (ptr only), PAGE_RES (pt...\n#include \"params.h\"          // for BOOL_VAR_H, BoolParam, DoubleParam\n#include \"points.h\"          // for 
FCOORD\n#include \"ratngs.h\"          // for ScriptPos, WERD_CHOICE (ptr only)\n#include \"tessdatamanager.h\" // for TessdataManager\n#include \"textord.h\"         // for Textord\n#include \"wordrec.h\"         // for Wordrec\n\n#include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...\n#include <tesseract/unichar.h>     // for UNICHAR_ID\n\n#include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe...\n\n#include <cstdint> // for int16_t, int32_t, uint16_t\n#include <cstdio>  // for FILE\n\nnamespace tesseract {\n\nclass BLOCK_LIST;\nclass ETEXT_DESC;\nstruct OSResults;\nclass PAGE_RES;\nclass PAGE_RES_IT;\nclass ROW;\nclass SVMenuNode;\nclass TBOX;\nclass TO_BLOCK_LIST;\nclass WERD;\nclass WERD_CHOICE;\nclass WERD_RES;\n\nclass ColumnFinder;\nclass DocumentData;\n#ifndef DISABLED_LEGACY_ENGINE\nclass EquationDetect;\n#endif // ndef DISABLED_LEGACY_ENGINE\nclass ImageData;\nclass LSTMRecognizer;\nclass Tesseract;\n\n// Top-level class for all tesseract global instance data.\n// This class either holds or points to all data used by an instance\n// of Tesseract, including the memory allocator. When this is\n// complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!\n//\n// NOTE to developers: Do not create cyclic dependencies through this class!\n// The directory dependency tree must remain a tree! 
To keep this clean,\n// lower-level code (eg in ccutil, the bottom level) must never need to\n// know about the content of a higher-level directory.\n// The following scheme will grant the easiest access to lower-level\n// global members without creating a cyclic dependency:\n//\n// Class Hierarchy (^ = inheritance):\n//\n//             CCUtil (ccutil/ccutil.h)\n//                         ^      Members include: UNICHARSET\n//           CCStruct (ccstruct/ccstruct.h)\n//                         ^       Members include: Image\n//           Classify (classify/classify.h)\n//                         ^       Members include: Dict\n//             WordRec (wordrec/wordrec.h)\n//                         ^       Members include: WERD*, DENORM*\n//        Tesseract (ccmain/tesseractclass.h)\n//                                 Members include: Pix*\n//\n// Other important classes:\n//\n//  TessBaseAPI (tesseract/baseapi.h)\n//                                 Members include: BLOCK_LIST*, PAGE_RES*,\n//                                 Tesseract*, ImageThresholder*\n//  Dict (dict/dict.h)\n//                                 Members include: Image* (private)\n//\n// NOTE: that each level contains members that correspond to global\n// data that is defined (and used) at that level, not necessarily where\n// the type is defined so for instance:\n// BOOL_VAR_H(textord_show_blobs);\n// goes inside the Textord class, not the cc_util class.\n\n// A collection of various variables for statistics and debugging.\nstruct TesseractStats {\n  TesseractStats()\n      : adaption_word_number(0)\n      , doc_blob_quality(0)\n      , doc_outline_errs(0)\n      , doc_char_quality(0)\n      , good_char_count(0)\n      , doc_good_char_quality(0)\n      , word_count(0)\n      , dict_words(0)\n      , tilde_crunch_written(false)\n      , last_char_was_newline(true)\n      , last_char_was_tilde(false)\n      , write_results_empty_block(true) {}\n\n  int32_t adaption_word_number;\n  int16_t 
doc_blob_quality;\n  int16_t doc_outline_errs;\n  int16_t doc_char_quality;\n  int16_t good_char_count;\n  int16_t doc_good_char_quality;\n  int32_t word_count;    // count of words in the document\n  int32_t dict_words;    // number of dictionary words in the document\n  std::string dump_words_str; // accumulator used by dump_words()\n  // Flags used by write_results()\n  bool tilde_crunch_written;\n  bool last_char_was_newline;\n  bool last_char_was_tilde;\n  bool write_results_empty_block;\n};\n\n// Struct to hold all the pointers to relevant data for processing a word.\nstruct WordData {\n  WordData() : word(nullptr), row(nullptr), block(nullptr), prev_word(nullptr) {}\n  explicit WordData(const PAGE_RES_IT &page_res_it)\n      : word(page_res_it.word())\n      , row(page_res_it.row()->row)\n      , block(page_res_it.block()->block)\n      , prev_word(nullptr) {}\n  WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)\n      : word(word_res), row(row_in), block(block_in), prev_word(nullptr) {}\n\n  WERD_RES *word;\n  ROW *row;\n  BLOCK *block;\n  WordData *prev_word;\n  PointerVector<WERD_RES> lang_words;\n};\n\n// Definition of a Tesseract WordRecognizer. The WordData provides the context\n// of row/block, in_word holds an initialized, possibly pre-classified word,\n// that the recognizer may or may not consume (but if so it sets\n// *in_word=nullptr) and produces one or more output words in out_words, which\n// may be the consumed in_word, or may be generated independently. 
This api\n// allows both a conventional tesseract classifier to work, or a line-level\n// classifier that generates multiple words from a merged input.\nusing WordRecognizer = void (Tesseract::*)(const WordData &, WERD_RES **,\n                                           PointerVector<WERD_RES> *);\n\nclass TESS_API Tesseract : public Wordrec {\npublic:\n  Tesseract();\n  ~Tesseract() override;\n\n  // Return appropriate dictionary\n  Dict &getDict() override;\n\n  // Clear as much used memory as possible without resetting the adaptive\n  // classifier or losing any other classifier data.\n  void Clear();\n  // Clear all memory of adaption for this and all subclassifiers.\n  void ResetAdaptiveClassifier();\n  // Clear the document dictionary for this and all subclassifiers.\n  void ResetDocumentDictionary();\n\n#ifndef DISABLED_LEGACY_ENGINE\n  // Set the equation detector.\n  void SetEquationDetect(EquationDetect *detector);\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  // Simple accessors.\n  const FCOORD &reskew() const {\n    return reskew_;\n  }\n  float gradient() const {\n    return gradient_;\n  }\n  // Destroy any existing pix and return a pointer to the pointer.\n  Image *mutable_pix_binary() {\n    pix_binary_.destroy();\n    return &pix_binary_;\n  }\n  Image pix_binary() const {\n    return pix_binary_;\n  }\n  Image pix_grey() const {\n    return pix_grey_;\n  }\n  void set_pix_grey(Image grey_pix) {\n    pix_grey_.destroy();\n    pix_grey_ = grey_pix;\n  }\n  Image pix_original() const {\n    return pix_original_;\n  }\n  // Takes ownership of the given original_pix.\n  void set_pix_original(Image original_pix) {\n    pix_original_.destroy();\n    pix_original_ = original_pix;\n    // Clone to sublangs as well.\n    for (auto &lang : sub_langs_) {\n      lang->set_pix_original(original_pix ? 
original_pix.clone() : nullptr);\n    }\n  }\n  // Returns a pointer to a Pix representing the best available resolution image\n  // of the page, with best available bit depth as second priority. Result can\n  // be of any bit depth, but never color-mapped, as that has always been\n  // removed. Note that in grey and color, 0 is black and 255 is\n  // white. If the input was binary, then black is 1 and white is 0.\n  // To tell the difference pixGetDepth() will return 32, 8 or 1.\n  // In any case, the return value is a borrowed Pix, and should not be\n  // deleted or pixDestroyed.\n  Image BestPix() const {\n    if (pixGetWidth(pix_original_) == ImageWidth()) {\n      return pix_original_;\n    } else if (pix_grey_ != nullptr) {\n      return pix_grey_;\n    } else {\n      return pix_binary_;\n    }\n  }\n  void set_pix_thresholds(Image thresholds) {\n    pix_thresholds_.destroy();\n    pix_thresholds_ = thresholds;\n  }\n  int source_resolution() const {\n    return source_resolution_;\n  }\n  void set_source_resolution(int ppi) {\n    source_resolution_ = ppi;\n  }\n  int ImageWidth() const {\n    return pixGetWidth(pix_binary_);\n  }\n  int ImageHeight() const {\n    return pixGetHeight(pix_binary_);\n  }\n  Image scaled_color() const {\n    return scaled_color_;\n  }\n  int scaled_factor() const {\n    return scaled_factor_;\n  }\n  void SetScaledColor(int factor, Image color) {\n    scaled_factor_ = factor;\n    scaled_color_ = color;\n  }\n  const Textord &textord() const {\n    return textord_;\n  }\n  Textord *mutable_textord() {\n    return &textord_;\n  }\n\n  bool right_to_left() const {\n    return right_to_left_;\n  }\n  int num_sub_langs() const {\n    return sub_langs_.size();\n  }\n  Tesseract *get_sub_lang(int index) const {\n    return sub_langs_[index];\n  }\n  // Returns true if any language uses Tesseract (as opposed to LSTM).\n  bool AnyTessLang() const {\n    if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {\n      return true;\n    }\n    
for (auto &lang : sub_langs_) {\n      if (lang->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {\n        return true;\n      }\n    }\n    return false;\n  }\n  // Returns true if any language uses the LSTM.\n  bool AnyLSTMLang() const {\n    if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {\n      return true;\n    }\n    for (auto &lang : sub_langs_) {\n      if (lang->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {\n        return true;\n      }\n    }\n    return false;\n  }\n\n  void SetBlackAndWhitelist();\n\n  // Perform steps to prepare underlying binary image/other data structures for\n  // page segmentation. Uses the strategy specified in the global variable\n  // pageseg_devanagari_split_strategy for perform splitting while preparing for\n  // page segmentation.\n  void PrepareForPageseg();\n\n  // Perform steps to prepare underlying binary image/other data structures for\n  // Tesseract OCR. The current segmentation is required by this method.\n  // Uses the strategy specified in the global variable\n  // ocr_devanagari_split_strategy for performing splitting while preparing for\n  // Tesseract ocr.\n  void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr);\n\n  int SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr);\n  void SetupWordScripts(BLOCK_LIST *blocks);\n  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,\n                  BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr);\n  ColumnFinder *SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks,\n                                                 Tesseract *osd_tess, OSResults *osr,\n                                                 TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix,\n                                                 Image *music_mask_pix);\n  // par_control.cpp\n  void PrerecAllWordsPar(const std::vector<WordData> &words);\n\n  
//// linerec.cpp\n  // Generates training data for training a line recognizer, eg LSTM.\n  // Breaks the page into lines, according to the boxes, and writes them to a\n  // serialized DocumentData based on output_basename.\n  // Return true if successful, false if an error occurred.\n  bool TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,\n                           BLOCK_LIST *block_list);\n  // Generates training data for training a line recognizer, eg LSTM.\n  // Breaks the boxes into lines, normalizes them, converts to ImageData and\n  // appends them to the given training_data.\n  void TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,\n                      BLOCK_LIST *block_list, DocumentData *training_data);\n\n  // Returns an Imagedata containing the image of the given textline,\n  // and ground truth boxes/truth text if available in the input.\n  // The image is not normalized in any way.\n  ImageData *GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,\n                         const std::vector<std::string> &texts, int start_box, int end_box,\n                         const BLOCK &block);\n  // Helper gets the image of a rectangle, using the block.re_rotation() if\n  // needed to get to the image, and rotating the result back to horizontal\n  // layout. (CJK characters will be on their left sides) The vertical text flag\n  // is set in the returned ImageData if the text was originally vertical, which\n  // can be used to invoke a different CJK recognition engine. 
The revised_box\n  // is also returned to enable calculation of output bounding boxes.\n  ImageData *GetRectImage(const TBOX &box, const BLOCK &block, int padding,\n                          TBOX *revised_box) const;\n  // Recognizes a word or group of words, converting to WERD_RES in *words.\n  // Analogous to classify_word_pass1, but can handle a group of words as well.\n  void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,\n                         PointerVector<WERD_RES> *words);\n  // Apply segmentation search to the given set of words, within the constraints\n  // of the existing ratings matrix. If there is already a best_choice on a word\n  // leaves it untouched and just sets the done/accepted etc flags.\n  void SearchWords(PointerVector<WERD_RES> *words);\n\n  //// control.h /////////////////////////////////////////////////////////\n  bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config,\n                         int pass);\n  // Sets up the words ready for whichever engine is to be run\n  void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,\n                          PAGE_RES *page_res, std::vector<WordData> *words);\n  // Sets up the single word ready for whichever engine is to be run.\n  void SetupWordPassN(int pass_n, WordData *word);\n  // Runs word recognition on all the words.\n  bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,\n                          std::vector<WordData> *words);\n  bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box,\n                       const char *word_config, int dopasses);\n  void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box,\n                        const char *word_config);\n  void bigram_correction_pass(PAGE_RES *page_res);\n  void blamer_pass(PAGE_RES *page_res);\n  // Sets script positions and detects smallcaps 
on all output words.\n  void script_pos_pass(PAGE_RES *page_res);\n  // Helper to recognize the word using the given (language-specific) tesseract.\n  // Returns positive if this recognizer found more new best words than the\n  // number kept from best_words.\n  int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,\n                        WERD_RES **in_word, PointerVector<WERD_RES> *best_words);\n  // Moves good-looking \"noise\"/diacritics from the reject list to the main\n  // blob list on the current word. Returns true if anything was done, and\n  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.\n  bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy);\n  // Attempts to put noise/diacritic outlines into the blobs that they overlap.\n  // Input: a set of noisy outlines that probably belong to the real_word.\n  // Output: outlines that overlapped blobs are set to nullptr and put back into\n  // the word, either in the blobs or in the reject list.\n  void AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,\n                                          WERD *real_word, PAGE_RES_IT *pr_it,\n                                          std::vector<bool> *word_wanted,\n                                          std::vector<bool> *overlapped_any_blob,\n                                          std::vector<C_BLOB *> *target_blobs);\n  // Attempts to assign non-overlapping outlines to their nearest blobs or\n  // make new blobs out of them.\n  void AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,\n                                  WERD *real_word, PAGE_RES_IT *pr_it,\n                                  std::vector<bool> *word_wanted,\n                                  std::vector<C_BLOB *> *target_blobs);\n  // Starting with ok_outlines set to indicate which outlines overlap the blob,\n  // chooses the optimal set 
(approximately) and returns true if any outlines\n  // are desired, in which case ok_outlines indicates which ones.\n  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,\n                                   C_BLOB *blob, const std::vector<C_OUTLINE *> &outlines,\n                                   int num_outlines, std::vector<bool> *ok_outlines);\n  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes\n  // the inclusion of the outlines, and returns the certainty of the raw choice.\n  float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,\n                                 const std::vector<C_OUTLINE *> &outlines, int pass_n,\n                                 PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str);\n  // Classifies the given blob (part of word_data->word->word) as an individual\n  // word, using languages, chopper etc, returning only the certainty of the\n  // best raw choice, and undoing all the work done to fake out the word.\n  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,\n                           float *c2);\n  void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data);\n  void classify_word_pass1(const WordData &word_data, WERD_RES **in_word,\n                           PointerVector<WERD_RES> *out_words);\n  void recog_pseudo_word(PAGE_RES *page_res, // blocks to check\n                         TBOX &selection_box);\n\n  void fix_rep_char(PAGE_RES_IT *page_res_it);\n\n  ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s,\n                                              const char *lengths);\n  void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block);\n  void classify_word_pass2(const WordData &word_data, WERD_RES **in_word,\n                           PointerVector<WERD_RES> *out_words);\n  void ReportXhtFixResult(bool accept_new_word, 
float new_x_ht, WERD_RES *word, WERD_RES *new_word);\n  bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row);\n  bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row);\n  // Runs recognition with the test baseline shift and x-height and returns true\n  // if there was an improvement in recognition result.\n  bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,\n                            WERD_RES *word, BLOCK *block, ROW *row);\n  bool recog_interactive(PAGE_RES_IT *pr_it);\n\n  // Set fonts of this word.\n  void set_word_fonts(WERD_RES *word);\n  void font_recognition_pass(PAGE_RES *page_res);\n  void dictionary_correction_pass(PAGE_RES *page_res);\n  bool check_debug_pt(WERD_RES *word, int location);\n\n  //// superscript.cpp ////////////////////////////////////////////////////\n  bool SubAndSuperscriptFix(WERD_RES *word_res);\n  void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,\n                                      ScriptPos *leading_pos, float *leading_certainty,\n                                      int *num_rebuilt_trailing, ScriptPos *trailing_pos,\n                                      float *trailing_certainty, float *avg_certainty,\n                                      float *unlikely_threshold);\n  WERD_RES *TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,\n                                 ScriptPos leading_pos, int num_chopped_trailing,\n                                 float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word,\n                                 bool *is_good, int *retry_leading, int *retry_trailing);\n  bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,\n                             int *left_ok, int *right_ok) const;\n\n  //// output.h //////////////////////////////////////////////////////////\n\n  void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);\n  void 
write_results(PAGE_RES_IT &page_res_it, // full info\n                     char newline_type,        // type of newline\n                     bool force_eol            // override tilde crunch?\n  );\n  void set_unlv_suspects(WERD_RES *word);\n  UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated?\n  bool acceptable_number_string(const char *s, const char *lengths);\n  int16_t count_alphanums(const WERD_CHOICE &word);\n  int16_t count_alphas(const WERD_CHOICE &word);\n\n  void read_config_file(const char *filename, SetParamConstraint constraint);\n  // Initialize for potentially a set of languages defined by the language\n  // string and recursively any additional languages required by any language\n  // traineddata file (via tessedit_load_sublangs in its config) that is loaded.\n  // See init_tesseract_internal for args.\n  int init_tesseract(const std::string &arg0, const std::string &textbase,\n                     const std::string &language, OcrEngineMode oem, char **configs,\n                     int configs_size, const std::vector<std::string> *vars_vec,\n                     const std::vector<std::string> *vars_values, bool set_only_non_debug_params,\n                     TessdataManager *mgr);\n  int init_tesseract(const std::string &datapath, const std::string &language, OcrEngineMode oem) {\n    TessdataManager mgr;\n    return init_tesseract(datapath, {}, language, oem, nullptr, 0, nullptr, nullptr, false, &mgr);\n  }\n  // Common initialization for a single language.\n  // arg0 is the datapath for the tessdata directory, which could be the\n  // path of the tessdata directory with no trailing /, or (if tessdata\n  // lives in the same directory as the executable, the path of the executable,\n  // hence the name arg0.\n  // textbase is an optional output file basename (used only for training)\n  // language is the language code to load.\n  // oem controls which engine(s) will operate on the image\n  // configs (argv) is an array of config 
filenames to load variables from.\n  // May be nullptr.\n  // configs_size (argc) is the number of elements in configs.\n  // vars_vec is an optional vector of variables to set.\n  // vars_values is an optional corresponding vector of values for the variables\n  // in vars_vec.\n  // If set_only_non_debug_params is true, only params that do not contain\n  // \"debug\" in the name will be set.\n  int init_tesseract_internal(const std::string &arg0, const std::string &textbase,\n                              const std::string &language, OcrEngineMode oem, char **configs,\n                              int configs_size, const std::vector<std::string> *vars_vec,\n                              const std::vector<std::string> *vars_values,\n                              bool set_only_non_debug_params, TessdataManager *mgr);\n\n  // Set the universal_id member of each font to be unique among all\n  // instances of the same font loaded.\n  void SetupUniversalFontIds();\n\n  void recognize_page(std::string &image_name);\n  void end_tesseract();\n\n  bool init_tesseract_lang_data(const std::string &arg0,\n                                const std::string &language, OcrEngineMode oem, char **configs,\n                                int configs_size, const std::vector<std::string> *vars_vec,\n                                const std::vector<std::string> *vars_values,\n                                bool set_only_non_debug_params, TessdataManager *mgr);\n\n  void ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,\n                           std::vector<std::string> *not_to_load);\n\n  //// pgedit.h //////////////////////////////////////////////////////////\n  SVMenuNode *build_menu_new();\n#ifndef GRAPHICS_DISABLED\n  void pgeditor_main(int width, int height, PAGE_RES *page_res);\n\n  void process_image_event( // action in image win\n      const SVEvent &event);\n  bool process_cmd_win_event( // UI command semantics\n      int32_t cmd_event,   
   // which menu item?\n      char *new_value         // any prompt data\n  );\n#endif // !GRAPHICS_DISABLED\n  void debug_word(PAGE_RES *page_res, const TBOX &selection_box);\n  void do_re_display(bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it));\n  bool word_display(PAGE_RES_IT *pr_it);\n  bool word_bln_display(PAGE_RES_IT *pr_it);\n  bool word_blank_and_set_display(PAGE_RES_IT *pr_its);\n  bool word_set_display(PAGE_RES_IT *pr_it);\n  // #ifndef GRAPHICS_DISABLED\n  bool word_dumper(PAGE_RES_IT *pr_it);\n  // #endif // !GRAPHICS_DISABLED\n  void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box);\n  //// reject.h //////////////////////////////////////////////////////////\n  // make rej map for word\n  void make_reject_map(WERD_RES *word, ROW *row, int16_t pass);\n  bool one_ell_conflict(WERD_RES *word_res, bool update_map);\n  int16_t first_alphanum_index(const char *word, const char *word_lengths);\n  int16_t first_alphanum_offset(const char *word, const char *word_lengths);\n  int16_t alpha_count(const char *word, const char *word_lengths);\n  bool word_contains_non_1_digit(const char *word, const char *word_lengths);\n  void dont_allow_1Il(WERD_RES *word);\n  int16_t count_alphanums( // how many alphanums\n      WERD_RES *word);\n  void flip_0O(WERD_RES *word);\n  bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id);\n  bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id);\n  bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row);\n  void nn_match_word( // Match a word\n      WERD_RES *word, ROW *row);\n  void nn_recover_rejects(WERD_RES *word, ROW *row);\n  void set_done( // set done flag\n      WERD_RES *word, int16_t pass);\n  int16_t safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict?\n  void flip_hyphens(WERD_RES *word);\n  void reject_I_1_L(WERD_RES *word);\n  void reject_edge_blobs(WERD_RES *word);\n  void reject_mostly_rejects(WERD_RES *word);\n  //// adaptions.h 
///////////////////////////////////////////////////////\n  bool word_adaptable( // should we adapt?\n      WERD_RES *word, uint16_t mode);\n\n  //// tfacepp.cpp ///////////////////////////////////////////////////////\n  void recog_word_recursive(WERD_RES *word);\n  void recog_word(WERD_RES *word);\n  void split_and_recog_word(WERD_RES *word);\n  void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,\n                  BlamerBundle **orig_blamer_bundle) const;\n  void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const;\n  //// fixspace.cpp ///////////////////////////////////////////////////////\n  bool digit_or_numeric_punct(WERD_RES *word, int char_position);\n  int16_t eval_word_spacing(WERD_RES_LIST &word_res_list);\n  void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block);\n  int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list);\n  void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block);\n  void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block);\n  void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block);\n  void fix_fuzzy_spaces(   // find fuzzy words\n      ETEXT_DESC *monitor, // progress monitor\n      int32_t word_count,  // count of words in doc\n      PAGE_RES *page_res);\n  void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved);\n  bool fixspace_thinks_word_done(WERD_RES *word);\n  int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);\n  float blob_noise_score(TBLOB *blob);\n  void break_noisiest_blob_word(WERD_RES_LIST &words);\n  //// docqual.cpp ////////////////////////////////////////////////////////\n#ifndef DISABLED_LEGACY_ENGINE\n  GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word);\n  bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word);\n#endif\n  void tilde_crunch(PAGE_RES_IT &page_res_it);\n  void unrej_good_quality_words( // unreject 
potential\n      PAGE_RES_IT &page_res_it);\n  void doc_and_block_rejection( // reject big chunks\n      PAGE_RES_IT &page_res_it, bool good_quality_doc);\n  void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc);\n  void convert_bad_unlv_chs(WERD_RES *word_res);\n  void tilde_delete(PAGE_RES_IT &page_res_it);\n  int16_t word_blob_quality(WERD_RES *word);\n  void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count);\n  void unrej_good_chs(WERD_RES *word);\n  int16_t count_outline_errs(char c, int16_t outline_count);\n  int16_t word_outline_errs(WERD_RES *word);\n#ifndef DISABLED_LEGACY_ENGINE\n  bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);\n#endif\n  CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode);\n  int16_t failure_count(WERD_RES *word);\n  bool noise_outlines(TWERD *word);\n  //// pagewalk.cpp ///////////////////////////////////////////////////////\n  void process_selected_words(PAGE_RES *page_res, // blocks to check\n                                                  // function to call\n                              TBOX &selection_box,\n                              bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it));\n  //// tessbox.cpp ///////////////////////////////////////////////////////\n  void tess_add_doc_word(      // test acceptability\n      WERD_CHOICE *word_choice // after context\n  );\n  void tess_segment_pass_n(int pass_n, WERD_RES *word);\n  bool tess_acceptable_word(WERD_RES *word);\n\n  //// applybox.cpp //////////////////////////////////////////////////////\n  // Applies the box file based on the image name filename, and resegments\n  // the words in the block_list (page), with:\n  // blob-mode: one blob per line in the box file, words as input.\n  // word/line-mode: one blob per space-delimited unit after the #, and one word\n  // per line in the box file. 
(See comment above for box file format.)\n  // If find_segmentation is true, (word/line mode) then the classifier is used\n  // to re-segment words/lines to match the space-delimited truth string for\n  // each box. In this case, the input box may be for a word or even a whole\n  // text line, and the output words will contain multiple blobs corresponding\n  // to the space-delimited input string.\n  // With find_segmentation false, no classifier is needed, but the chopper\n  // can still be used to correctly segment touching characters with the help\n  // of the input boxes.\n  // In the returned PAGE_RES, the WERD_RES are setup as they would be returned\n  // from normal classification, ie. with a word, chopped_word, rebuild_word,\n  // seam_array, denorm, box_word, and best_state, but NO best_choice or\n  // raw_choice, as they would require a UNICHARSET, which we aim to avoid.\n  // Instead, the correct_text member of WERD_RES is set, and this may be later\n  // converted to a best_choice using CorrectClassifyWords. 
CorrectClassifyWords\n  // is not required before calling ApplyBoxTraining.\n  PAGE_RES *ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list);\n\n  // Any row xheight that is significantly different from the median is set\n  // to the median.\n  void PreenXHeights(BLOCK_LIST *block_list);\n\n  // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:\n  // All fuzzy spaces are removed, and all the words are maximally chopped.\n  PAGE_RES *SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list);\n  // Tests the chopper by exhaustively running chop_one_blob.\n  // The word_res will contain filled chopped_word, seam_array, denorm,\n  // box_word and best_state for the maximally chopped word.\n  void MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,\n                         WERD_RES *word_res);\n  // Gather consecutive blobs that match the given box into the best_state\n  // and corresponding correct_text.\n  // Fights over which box owns which blobs are settled by pre-chopping and\n  // applying the blobs to box or next_box with the least non-overlap.\n  // Returns false if the box was in error, which can only be caused by\n  // failing to find an appropriate blob for a box.\n  // This means that occasionally, blobs may be incorrectly segmented if the\n  // chopper fails to find a suitable chop point.\n  bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,\n                        const TBOX *next_box, const char *correct_text);\n  // Consume all source blobs that strongly overlap the given box,\n  // putting them into a new word, with the correct_text label.\n  // Fights over which box owns which blobs are settled by\n  // applying the blobs to box or next_box with the least non-overlap.\n  // Returns false if the box was in error, which can only be caused by\n  // failing to find an overlapping blob for a box.\n  bool ResegmentWordBox(BLOCK_LIST 
*block_list, const TBOX &box, const TBOX *next_box,\n                        const char *correct_text);\n  // Resegments the words by running the classifier in an attempt to find the\n  // correct segmentation that produces the required string.\n  void ReSegmentByClassification(PAGE_RES *page_res);\n  // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.\n  // Returns false if an invalid UNICHAR_ID is encountered.\n  bool ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids);\n  // Resegments the word to achieve the target_text from the classifier.\n  // Returns false if the re-segmentation fails.\n  // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and\n  // applies a full search on the classifier results to find the best classified\n  // segmentation. As a compromise to obtain better recall, 1-1 ambiguity\n  // substitutions ARE used.\n  bool FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res);\n  // Recursive helper to find a match to the target_text (from text_index\n  // position) in the choices (from choices_pos position).\n  // Choices is an array of vectors of length choices_length, with each\n  // element representing a starting position in the word, and the\n  // vector holding classification results for a sequence of consecutive\n  // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.\n  void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,\n                     unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,\n                     unsigned text_index, float rating, std::vector<int> *segmentation,\n                     float *best_rating, std::vector<int> *best_segmentation);\n  // Counts up the labelled words and the blobs within.\n  // Deletes all unused or emptied words, counting the unused ones.\n  // Resets W_BOL and W_EOL flags correctly.\n  // Builds the rebuild_word and rebuilds 
the box_word.\n  void TidyUp(PAGE_RES *page_res);\n  // Logs a bad box by line in the box file and box coords.\n  void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg);\n  // Creates a fake best_choice entry in each WERD_RES with the correct text.\n  void CorrectClassifyWords(PAGE_RES *page_res);\n  // Call LearnWord to extract features for labelled blobs within each word.\n  // Features are stored in an internal buffer.\n  void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res);\n\n  //// fixxht.cpp ///////////////////////////////////////////////////////\n  // Returns the number of misfit blob tops in this word.\n  int CountMisfitTops(WERD_RES *word_res);\n  // Returns a new x-height in pixels (original image coords) that is\n  // maximally compatible with the result in word_res.\n  // Returns 0.0f if no x-height is found that is better than the current\n  // estimate.\n  float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift);\n  //// Data members ///////////////////////////////////////////////////////\n  // TODO(ocr-team): Find and remove obsolete parameters.\n  BOOL_VAR_H(tessedit_resegment_from_boxes);\n  BOOL_VAR_H(tessedit_resegment_from_line_boxes);\n  BOOL_VAR_H(tessedit_train_from_boxes);\n  BOOL_VAR_H(tessedit_make_boxes_from_boxes);\n  BOOL_VAR_H(tessedit_train_line_recognizer);\n  BOOL_VAR_H(tessedit_dump_pageseg_images);\n  // TODO: remove deprecated tessedit_do_invert in release 6.\n  BOOL_VAR_H(tessedit_do_invert);\n  double_VAR_H(invert_threshold);\n  INT_VAR_H(tessedit_pageseg_mode);\n  INT_VAR_H(thresholding_method);\n  BOOL_VAR_H(thresholding_debug);\n  double_VAR_H(thresholding_window_size);\n  double_VAR_H(thresholding_kfactor);\n  double_VAR_H(thresholding_tile_size);\n  double_VAR_H(thresholding_smooth_kernel_size);\n  double_VAR_H(thresholding_score_fraction);\n  INT_VAR_H(tessedit_ocr_engine_mode);\n  STRING_VAR_H(tessedit_char_blacklist);\n  
STRING_VAR_H(tessedit_char_whitelist);\n  STRING_VAR_H(tessedit_char_unblacklist);\n  BOOL_VAR_H(tessedit_ambigs_training);\n  INT_VAR_H(pageseg_devanagari_split_strategy);\n  INT_VAR_H(ocr_devanagari_split_strategy);\n  STRING_VAR_H(tessedit_write_params_to_file);\n  BOOL_VAR_H(tessedit_adaption_debug);\n  INT_VAR_H(bidi_debug);\n  INT_VAR_H(applybox_debug);\n  INT_VAR_H(applybox_page);\n  STRING_VAR_H(applybox_exposure_pattern);\n  BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode);\n  BOOL_VAR_H(applybox_learn_ngrams_mode);\n  BOOL_VAR_H(tessedit_display_outwords);\n  BOOL_VAR_H(tessedit_dump_choices);\n  BOOL_VAR_H(tessedit_timing_debug);\n  BOOL_VAR_H(tessedit_fix_fuzzy_spaces);\n  BOOL_VAR_H(tessedit_unrej_any_wd);\n  BOOL_VAR_H(tessedit_fix_hyphens);\n  BOOL_VAR_H(tessedit_enable_doc_dict);\n  BOOL_VAR_H(tessedit_debug_fonts);\n  INT_VAR_H(tessedit_font_id);\n  BOOL_VAR_H(tessedit_debug_block_rejection);\n  BOOL_VAR_H(tessedit_enable_bigram_correction);\n  BOOL_VAR_H(tessedit_enable_dict_correction);\n  INT_VAR_H(tessedit_bigram_debug);\n  BOOL_VAR_H(enable_noise_removal);\n  INT_VAR_H(debug_noise_removal);\n  // Worst (min) certainty, for which a diacritic is allowed to make the base\n  // character worse and still be included.\n  double_VAR_H(noise_cert_basechar);\n  // Worst (min) certainty, for which a non-overlapping diacritic is allowed to\n  // make the base character worse and still be included.\n  double_VAR_H(noise_cert_disjoint);\n  // Worst (min) certainty, for which a diacritic is allowed to make a new\n  // stand-alone blob.\n  double_VAR_H(noise_cert_punc);\n  // Factor of certainty margin for adding diacritics to not count as worse.\n  double_VAR_H(noise_cert_factor);\n  INT_VAR_H(noise_maxperblob);\n  INT_VAR_H(noise_maxperword);\n  INT_VAR_H(debug_x_ht_level);\n  STRING_VAR_H(chs_leading_punct);\n  STRING_VAR_H(chs_trailing_punct1);\n  STRING_VAR_H(chs_trailing_punct2);\n  double_VAR_H(quality_rej_pc);\n  double_VAR_H(quality_blob_pc);\n  
double_VAR_H(quality_outline_pc);\n  double_VAR_H(quality_char_pc);\n  INT_VAR_H(quality_min_initial_alphas_reqd);\n  INT_VAR_H(tessedit_tess_adaption_mode);\n  BOOL_VAR_H(tessedit_minimal_rej_pass1);\n  BOOL_VAR_H(tessedit_test_adaption);\n  BOOL_VAR_H(test_pt);\n  double_VAR_H(test_pt_x);\n  double_VAR_H(test_pt_y);\n  INT_VAR_H(multilang_debug_level);\n  INT_VAR_H(paragraph_debug_level);\n  BOOL_VAR_H(paragraph_text_based);\n  BOOL_VAR_H(lstm_use_matrix);\n  STRING_VAR_H(outlines_odd);\n  STRING_VAR_H(outlines_2);\n  BOOL_VAR_H(tessedit_good_quality_unrej);\n  BOOL_VAR_H(tessedit_use_reject_spaces);\n  double_VAR_H(tessedit_reject_doc_percent);\n  double_VAR_H(tessedit_reject_block_percent);\n  double_VAR_H(tessedit_reject_row_percent);\n  double_VAR_H(tessedit_whole_wd_rej_row_percent);\n  BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds);\n  BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds);\n  BOOL_VAR_H(tessedit_dont_blkrej_good_wds);\n  BOOL_VAR_H(tessedit_dont_rowrej_good_wds);\n  INT_VAR_H(tessedit_preserve_min_wd_len);\n  BOOL_VAR_H(tessedit_row_rej_good_docs);\n  double_VAR_H(tessedit_good_doc_still_rowrej_wd);\n  BOOL_VAR_H(tessedit_reject_bad_qual_wds);\n  BOOL_VAR_H(tessedit_debug_doc_rejection);\n  BOOL_VAR_H(tessedit_debug_quality_metrics);\n  BOOL_VAR_H(bland_unrej);\n  double_VAR_H(quality_rowrej_pc);\n  BOOL_VAR_H(unlv_tilde_crunching);\n  BOOL_VAR_H(hocr_font_info);\n  BOOL_VAR_H(hocr_char_boxes);\n  BOOL_VAR_H(crunch_early_merge_tess_fails);\n  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs);\n  double_VAR_H(crunch_terrible_rating);\n  BOOL_VAR_H(crunch_terrible_garbage);\n  double_VAR_H(crunch_poor_garbage_cert);\n  double_VAR_H(crunch_poor_garbage_rate);\n  double_VAR_H(crunch_pot_poor_rate);\n  double_VAR_H(crunch_pot_poor_cert);\n  double_VAR_H(crunch_del_rating);\n  double_VAR_H(crunch_del_cert);\n  double_VAR_H(crunch_del_min_ht);\n  double_VAR_H(crunch_del_max_ht);\n  double_VAR_H(crunch_del_min_width);\n  
double_VAR_H(crunch_del_high_word);\n  double_VAR_H(crunch_del_low_word);\n  double_VAR_H(crunch_small_outlines_size);\n  INT_VAR_H(crunch_rating_max);\n  INT_VAR_H(crunch_pot_indicators);\n  BOOL_VAR_H(crunch_leave_ok_strings);\n  BOOL_VAR_H(crunch_accept_ok);\n  BOOL_VAR_H(crunch_leave_accept_strings);\n  BOOL_VAR_H(crunch_include_numerals);\n  INT_VAR_H(crunch_leave_lc_strings);\n  INT_VAR_H(crunch_leave_uc_strings);\n  INT_VAR_H(crunch_long_repetitions);\n  INT_VAR_H(crunch_debug);\n  INT_VAR_H(fixsp_non_noise_limit);\n  double_VAR_H(fixsp_small_outlines_size);\n  BOOL_VAR_H(tessedit_prefer_joined_punct);\n  INT_VAR_H(fixsp_done_mode);\n  INT_VAR_H(debug_fix_space_level);\n  STRING_VAR_H(numeric_punctuation);\n  INT_VAR_H(x_ht_acceptance_tolerance);\n  INT_VAR_H(x_ht_min_change);\n  INT_VAR_H(superscript_debug);\n  double_VAR_H(superscript_worse_certainty);\n  double_VAR_H(superscript_bettered_certainty);\n  double_VAR_H(superscript_scaledown_ratio);\n  double_VAR_H(subscript_max_y_top);\n  double_VAR_H(superscript_min_y_bottom);\n  BOOL_VAR_H(tessedit_write_block_separators);\n  BOOL_VAR_H(tessedit_write_rep_codes);\n  BOOL_VAR_H(tessedit_write_unlv);\n  BOOL_VAR_H(tessedit_create_txt);\n  BOOL_VAR_H(tessedit_create_hocr);\n  BOOL_VAR_H(tessedit_create_alto);\n  BOOL_VAR_H(tessedit_create_page_xml);\n  BOOL_VAR_H(page_xml_polygon);\n  INT_VAR_H(page_xml_level);\n  BOOL_VAR_H(tessedit_create_lstmbox);\n  BOOL_VAR_H(tessedit_create_tsv);\n  BOOL_VAR_H(tessedit_create_wordstrbox);\n  BOOL_VAR_H(tessedit_create_pdf);\n  BOOL_VAR_H(textonly_pdf);\n  INT_VAR_H(jpg_quality);\n  INT_VAR_H(user_defined_dpi);\n  INT_VAR_H(min_characters_to_try);\n  STRING_VAR_H(unrecognised_char);\n  INT_VAR_H(suspect_level);\n  INT_VAR_H(suspect_short_words);\n  BOOL_VAR_H(suspect_constrain_1Il);\n  double_VAR_H(suspect_rating_per_ch);\n  double_VAR_H(suspect_accept_rating);\n  BOOL_VAR_H(tessedit_minimal_rejection);\n  BOOL_VAR_H(tessedit_zero_rejection);\n  
BOOL_VAR_H(tessedit_word_for_word);\n  BOOL_VAR_H(tessedit_zero_kelvin_rejection);\n  INT_VAR_H(tessedit_reject_mode);\n  BOOL_VAR_H(tessedit_rejection_debug);\n  BOOL_VAR_H(tessedit_flip_0O);\n  double_VAR_H(tessedit_lower_flip_hyphen);\n  double_VAR_H(tessedit_upper_flip_hyphen);\n  BOOL_VAR_H(rej_trust_doc_dawg);\n  BOOL_VAR_H(rej_1Il_use_dict_word);\n  BOOL_VAR_H(rej_1Il_trust_permuter_type);\n  BOOL_VAR_H(rej_use_tess_accepted);\n  BOOL_VAR_H(rej_use_tess_blanks);\n  BOOL_VAR_H(rej_use_good_perm);\n  BOOL_VAR_H(rej_use_sensible_wd);\n  BOOL_VAR_H(rej_alphas_in_number_perm);\n  double_VAR_H(rej_whole_of_mostly_reject_word_fract);\n  INT_VAR_H(tessedit_image_border);\n  STRING_VAR_H(ok_repeated_ch_non_alphanum_wds);\n  STRING_VAR_H(conflict_set_I_l_1);\n  INT_VAR_H(min_sane_x_ht_pixels);\n  BOOL_VAR_H(tessedit_create_boxfile);\n  INT_VAR_H(tessedit_page_number);\n  BOOL_VAR_H(tessedit_write_images);\n  BOOL_VAR_H(interactive_display_mode);\n  STRING_VAR_H(file_type);\n  BOOL_VAR_H(tessedit_override_permuter);\n  STRING_VAR_H(tessedit_load_sublangs);\n  BOOL_VAR_H(tessedit_use_primary_params_model);\n  // Min acceptable orientation margin (difference in scores between top and 2nd\n  // choice in OSResults::orientations) to believe the page orientation.\n  double_VAR_H(min_orientation_margin);\n  BOOL_VAR_H(textord_tabfind_show_vlines);\n  BOOL_VAR_H(textord_use_cjk_fp_model);\n  BOOL_VAR_H(poly_allow_detailed_fx);\n  BOOL_VAR_H(tessedit_init_config_only);\n#ifndef DISABLED_LEGACY_ENGINE\n  BOOL_VAR_H(textord_equation_detect);\n#endif // ndef DISABLED_LEGACY_ENGINE\n  BOOL_VAR_H(textord_tabfind_vertical_text);\n  BOOL_VAR_H(textord_tabfind_force_vertical_text);\n  double_VAR_H(textord_tabfind_vertical_text_ratio);\n  double_VAR_H(textord_tabfind_aligned_gap_fraction);\n  INT_VAR_H(tessedit_parallelize);\n  BOOL_VAR_H(preserve_interword_spaces);\n  STRING_VAR_H(page_separator);\n  INT_VAR_H(lstm_choice_mode);\n  INT_VAR_H(lstm_choice_iterations);\n  
double_VAR_H(lstm_rating_coefficient);\n  BOOL_VAR_H(pageseg_apply_music_mask);\n\n  //// ambigsrecog.cpp /////////////////////////////////////////////////////////\n  FILE *init_recog_training(const char *filename);\n  void recog_training_segmented(const char *filename, PAGE_RES *page_res,\n                                volatile ETEXT_DESC *monitor, FILE *output_file);\n  void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file);\n\nprivate:\n  // The filename of a backup config file. If not null, then we currently\n  // have a temporary debug config file loaded, and backup_config_file_\n  // will be loaded, and set to null when debug is complete.\n  const char *backup_config_file_;\n  // The filename of a config file to read when processing a debug word.\n  std::string word_config_;\n  // Image used for input to layout analysis and tesseract recognition.\n  // May be modified by the ShiroRekhaSplitter to eliminate the top-line.\n  Image pix_binary_;\n  // Grey-level input image if the input was not binary, otherwise nullptr.\n  Image pix_grey_;\n  // Original input image. Color if the input was color.\n  Image pix_original_;\n  // Thresholds that were used to generate the thresholded image from grey.\n  Image pix_thresholds_;\n  // Debug images. If non-empty, will be written on destruction.\n  DebugPixa pixa_debug_;\n  // Input image resolution after any scaling. 
The resolution is not well\n  // transmitted by operations on Pix, so we keep an independent record here.\n  int source_resolution_;\n  // The shiro-rekha splitter object which is used to split top-lines in\n  // Devanagari words to provide a better word and grapheme segmentation.\n  ShiroRekhaSplitter splitter_;\n  // Page segmentation/layout\n  Textord textord_;\n  // True if the primary language uses right_to_left reading order.\n  bool right_to_left_;\n  Image scaled_color_;\n  int scaled_factor_;\n  FCOORD deskew_;\n  FCOORD reskew_;\n  float gradient_;\n  TesseractStats stats_;\n  // Sub-languages to be tried in addition to this.\n  std::vector<Tesseract *> sub_langs_;\n  // Most recently used Tesseract out of this and sub_langs_. The default\n  // language for the next word.\n  Tesseract *most_recently_used_;\n  // The size of the font table, ie max possible font id + 1.\n  int font_table_size_;\n#ifndef DISABLED_LEGACY_ENGINE\n  // Equation detector. Note: this pointer is NOT owned by the class.\n  EquationDetect *equ_detect_;\n#endif // ndef DISABLED_LEGACY_ENGINE\n  // LSTM recognizer, if available.\n  LSTMRecognizer *lstm_recognizer_;\n  // Output \"page\" number (actually line number) using TrainLineRecognizer.\n  int train_line_page_num_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_\n"
  },
  {
    "path": "src/ccmain/tessvars.cpp",
    "content": "/**********************************************************************\n * File:        tessvars.cpp  (Formerly tessvars.c)\n * Description: Variables and other globals for tessedit.\n * Author:      Ray Smith\n * Created:     Mon Apr 13 13:13:23 BST 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include <cstdio>\n\n#include \"tessvars.h\"\n\nFILE *debug_fp = stderr; // write debug stuff here\n"
  },
  {
    "path": "src/ccmain/tessvars.h",
    "content": "/**********************************************************************\n * File:        tessvars.h  (Formerly tessvars.h)\n * Description: Variables and other globals for tessedit.\n * Author:      Ray Smith\n * Created:     Mon Apr 13 13:13:23 BST 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSVARS_H\n#define TESSVARS_H\n\n#include <cstdio>\n\nextern FILE *debug_fp; // write debug stuff here\n\n#endif\n"
  },
  {
    "path": "src/ccmain/tfacepp.cpp",
    "content": "/**********************************************************************\n * File:        tfacepp.cpp  (Formerly tface++.c)\n * Description: C++ side of the C/C++ Tess/Editor interface.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include <cmath>\n\n#include \"blamer.h\"\n#include \"errcode.h\"\n#include \"ratngs.h\"\n#include \"reject.h\"\n#include \"tesseractclass.h\"\n#include \"werd.h\"\n\n#define MAX_UNDIVIDED_LENGTH 24\n\n/**********************************************************************\n * recog_word\n *\n * Convert the word to tess form and pass it to the tess segmenter.\n * Convert the output back to editor form.\n **********************************************************************/\nnamespace tesseract {\nvoid Tesseract::recog_word(WERD_RES *word) {\n  if (wordrec_skip_no_truth_words &&\n      (word->blamer_bundle == nullptr ||\n       word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {\n    if (classify_debug_level) {\n      tprintf(\"No truth for word - skipping\\n\");\n    }\n    word->tess_failed = true;\n    return;\n  }\n  ASSERT_HOST(!word->chopped_word->blobs.empty());\n  recog_word_recursive(word);\n  word->SetupBoxWord();\n  ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length());\n  // Check that the 
ratings matrix size matches the sum of all the\n  // segmentation states.\n  if (!word->StatesAllValid()) {\n    tprintf(\"Not all words have valid states relative to ratings matrix!!\");\n    word->DebugWordChoices(true, nullptr);\n    ASSERT_HOST(word->StatesAllValid());\n  }\n  if (tessedit_override_permuter) {\n    /* Override the permuter type if a straight dictionary check disagrees. */\n    uint8_t perm_type = word->best_choice->permuter();\n    if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) &&\n        (perm_type != USER_DAWG_PERM)) {\n      uint8_t real_dict_perm_type = dict_word(*word->best_choice);\n      if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) ||\n           (real_dict_perm_type == USER_DAWG_PERM)) &&\n          (alpha_count(word->best_choice->unichar_string().c_str(),\n                       word->best_choice->unichar_lengths().c_str()) > 0)) {\n        word->best_choice->set_permuter(real_dict_perm_type); // use dict perm\n      }\n    }\n    if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {\n      tprintf(\"Permuter Type Flipped from %d to %d\\n\", perm_type, word->best_choice->permuter());\n    }\n  }\n  // Factored out from control.cpp\n  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));\n  if (word->best_choice == nullptr || word->best_choice->empty() ||\n      strspn(word->best_choice->unichar_string().c_str(), \" \") ==\n          word->best_choice->length()) {\n    word->tess_failed = true;\n    word->reject_map.initialise(word->box_word->length());\n    word->reject_map.rej_word_tess_failure();\n  } else {\n    word->tess_failed = false;\n  }\n}\n\n/**********************************************************************\n * recog_word_recursive\n *\n * Convert the word to tess form and pass it to the tess segmenter.\n * Convert the output back to editor form.\n 
**********************************************************************/\nvoid Tesseract::recog_word_recursive(WERD_RES *word) {\n  auto word_length = word->chopped_word->NumBlobs(); // no of blobs\n  if (word_length > MAX_UNDIVIDED_LENGTH) {\n    return split_and_recog_word(word);\n  }\n  cc_recog(word);\n  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.\n\n  // Do sanity checks and minor fixes on best_choice.\n  if (word->best_choice->length() > word_length) {\n    word->best_choice->make_bad(); // should never happen\n    tprintf(\n        \"recog_word: Discarded long string \\\"%s\\\"\"\n        \" (%d characters vs %d blobs)\\n\",\n        word->best_choice->unichar_string().c_str(), word->best_choice->length(), word_length);\n    tprintf(\"Word is at:\");\n    word->word->bounding_box().print();\n  }\n  if (word->best_choice->length() < word_length) {\n    UNICHAR_ID space_id = unicharset.unichar_to_id(\" \");\n    while (word->best_choice->length() < word_length) {\n      word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty());\n    }\n  }\n}\n\n/**********************************************************************\n * split_and_recog_word\n *\n * Split the word into 2 smaller pieces at the largest gap.\n * Recognize the pieces and stick the results back together.\n **********************************************************************/\nvoid Tesseract::split_and_recog_word(WERD_RES *word) {\n  // Find the biggest blob gap in the chopped_word.\n  int bestgap = -INT32_MAX;\n  int split_index = 0;\n  for (unsigned b = 1; b < word->chopped_word->NumBlobs(); ++b) {\n    TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();\n    TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();\n    int gap = blob_box.left() - prev_box.right();\n    if (gap > bestgap) {\n      bestgap = gap;\n      split_index = b;\n    }\n  }\n  ASSERT_HOST(split_index > 0);\n\n  WERD_RES *word2 = nullptr;\n  
BlamerBundle *orig_bb = nullptr;\n  split_word(word, split_index, &word2, &orig_bb);\n\n  // Recognize the first part of the word.\n  recog_word_recursive(word);\n  // Recognize the second part of the word.\n  recog_word_recursive(word2);\n\n  join_words(word, word2, orig_bb);\n}\n\n/**********************************************************************\n * split_word\n *\n * Split a given WERD_RES in place into two smaller words for recognition.\n * split_pt is the index of the first blob to go in the second word.\n * The underlying word is left alone, only the TWERD (and subsequent data)\n * are split up.  orig_blamer_bundle is set to the original blamer bundle,\n * and will now be owned by the caller.  New blamer bundles are forged for the\n * two pieces.\n **********************************************************************/\nvoid Tesseract::split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,\n                           BlamerBundle **orig_blamer_bundle) const {\n  ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());\n\n  // Save a copy of the blamer bundle so we can try to reconstruct it below.\n  BlamerBundle *orig_bb = word->blamer_bundle ? 
new BlamerBundle(*word->blamer_bundle) : nullptr;\n\n  auto *word2 = new WERD_RES(*word);\n\n  // blow away the copied chopped_word, as we want to work with\n  // the blobs from the input chopped_word so seam_arrays can be merged.\n  TWERD *chopped = word->chopped_word;\n  auto *chopped2 = new TWERD;\n  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);\n  for (auto i = split_pt; i < chopped->NumBlobs(); ++i) {\n    chopped2->blobs.push_back(chopped->blobs[i]);\n  }\n  chopped->blobs.resize(split_pt);\n  word->chopped_word = nullptr;\n  delete word2->chopped_word;\n  word2->chopped_word = nullptr;\n\n  const UNICHARSET &unicharset = *word->uch_set;\n  word->ClearResults();\n  word2->ClearResults();\n  word->chopped_word = chopped;\n  word2->chopped_word = chopped2;\n  word->SetupBasicsFromChoppedWord(unicharset);\n  word2->SetupBasicsFromChoppedWord(unicharset);\n\n  // Try to adjust the blamer bundle.\n  if (orig_bb != nullptr) {\n    // TODO(rays) Looks like a leak to me.\n    // orig_bb should take, rather than copy.\n    word->blamer_bundle = new BlamerBundle();\n    word2->blamer_bundle = new BlamerBundle();\n    orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),\n                         word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,\n                         word->blamer_bundle, word2->blamer_bundle);\n  }\n\n  *right_piece = word2;\n  *orig_blamer_bundle = orig_bb;\n}\n\n/**********************************************************************\n * join_words\n *\n * The opposite of split_word():\n *  join word2 (including any recognized data / seam array / etc)\n *  onto the right of word and then delete word2.\n *  Also, if orig_bb is provided, stitch it back into word.\n **********************************************************************/\nvoid Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const {\n  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();\n  
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();\n  // Tack the word2 outputs onto the end of the word outputs.\n  word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());\n  word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());\n  word2->chopped_word->blobs.clear();\n  word2->rebuild_word->blobs.clear();\n  TPOINT split_pt;\n  split_pt.x = (prev_box.right() + blob_box.left()) / 2;\n  split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;\n  // Move the word2 seams onto the end of the word1 seam_array.\n  // Since the seam list is one element short, an empty seam marking the\n  // end of the last blob in the first word is needed first.\n  word->seam_array.push_back(new SEAM(0.0f, split_pt));\n  word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());\n  word2->seam_array.clear();\n  // Fix widths and gaps.\n  word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());\n  word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());\n  // Fix the ratings matrix.\n  int rat1 = word->ratings->dimension();\n  int rat2 = word2->ratings->dimension();\n  word->ratings->AttachOnCorner(word2->ratings);\n  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);\n  word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());\n  // Append the word choices.\n  *word->raw_choice += *word2->raw_choice;\n\n  // How many alt choices from each should we try to get?\n  const int kAltsPerPiece = 2;\n  // When do we start throwing away extra alt choices?\n  const int kTooManyAltChoices = 100;\n\n  // Construct the cartesian product of the best_choices of word(1) and word2.\n  WERD_CHOICE_LIST joined_choices;\n  
WERD_CHOICE_IT jc_it(&joined_choices);\n  WERD_CHOICE_IT bc1_it(&word->best_choices);\n  WERD_CHOICE_IT bc2_it(&word2->best_choices);\n  int num_word1_choices = word->best_choices.length();\n  int total_joined_choices = num_word1_choices;\n  // Nota Bene: For the main loop here, we operate only on the 2nd and greater\n  // word2 choices, and put them in the joined_choices list. The 1st word2\n  // choice gets added to the original word1 choices in-place after we have\n  // finished with them.\n  int bc2_index = 1;\n  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {\n    if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {\n      break;\n    }\n    int bc1_index = 0;\n    for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {\n      if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {\n        break;\n      }\n      auto *wc = new WERD_CHOICE(*bc1_it.data());\n      *wc += *bc2_it.data();\n      jc_it.add_after_then_move(wc);\n      ++total_joined_choices;\n    }\n  }\n  // Now that we've filled in as many alternates as we want, paste the best\n  // choice for word2 onto the original word alt_choices.\n  bc1_it.move_to_first();\n  bc2_it.move_to_first();\n  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {\n    *bc1_it.data() += *bc2_it.data();\n  }\n  bc1_it.move_to_last();\n  bc1_it.add_list_after(&joined_choices);\n\n  // Restore the pointer to original blamer bundle and combine blamer\n  // information recorded in the splits.\n  if (orig_bb != nullptr) {\n    orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);\n    delete word->blamer_bundle;\n    word->blamer_bundle = orig_bb;\n  }\n  word->SetupBoxWord();\n  word->reject_map.initialise(word->box_word->length());\n  delete word2;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/thresholder.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        thresholder.cpp\n// Description: Base API for thresholding images in tesseract.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"otsuthr.h\"\n#include \"thresholder.h\"\n#include \"tprintf.h\" // for tprintf\n\n#include <allheaders.h>\n#include <tesseract/baseapi.h> // for api->GetIntVariable()\n\n#include <algorithm> // for std::max, std::min\n#include <cstdint>   // for uint32_t\n#include <cstring>\n#include <tuple>\n\nnamespace tesseract {\n\nImageThresholder::ImageThresholder()\n    : pix_(nullptr)\n    , image_width_(0)\n    , image_height_(0)\n    , pix_channels_(0)\n    , pix_wpl_(0)\n    , scale_(1)\n    , yres_(300)\n    , estimated_res_(300) {\n  SetRectangle(0, 0, 0, 0);\n}\n\nImageThresholder::~ImageThresholder() {\n  Clear();\n}\n\n// Destroy the Pix if there is one, freeing memory.\nvoid ImageThresholder::Clear() {\n  pix_.destroy();\n}\n\n// Return true if no image has been set.\nbool ImageThresholder::IsEmpty() const {\n  return pix_ == nullptr;\n}\n\n// SetImage makes a copy of all the image data, so it may be deleted\n// immediately after this call.\n// Greyscale of 8 and 
color of 24 or 32 bits per pixel may be given.\n// Palette color images will not work properly and must be converted to\n// 24 bit.\n// Binary images of 1 bit per pixel may also be given but they must be\n// byte packed with the MSB of the first byte being the first pixel, and a\n// one pixel is WHITE. For binary images set bytes_per_pixel=0.\nvoid ImageThresholder::SetImage(const unsigned char *imagedata, int width, int height,\n                                int bytes_per_pixel, int bytes_per_line) {\n  int bpp = bytes_per_pixel * 8;\n  if (bpp == 0) {\n    bpp = 1;\n  }\n  Image pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);\n  l_uint32 *data = pixGetData(pix);\n  int wpl = pixGetWpl(pix);\n  switch (bpp) {\n    case 1:\n      for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {\n        for (int x = 0; x < width; ++x) {\n          if (imagedata[x / 8] & (0x80 >> (x % 8))) {\n            CLEAR_DATA_BIT(data, x);\n          } else {\n            SET_DATA_BIT(data, x);\n          }\n        }\n      }\n      break;\n\n    case 8:\n      // Greyscale just copies the bytes in the right order.\n      for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {\n        for (int x = 0; x < width; ++x) {\n          SET_DATA_BYTE(data, x, imagedata[x]);\n        }\n      }\n      break;\n\n    case 24:\n      // Put the colors in the correct places in the line buffer.\n      for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {\n        for (int x = 0; x < width; ++x, ++data) {\n          SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);\n          SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);\n          SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);\n        }\n      }\n      break;\n\n    case 32:\n      // Maintain byte order consistency across different endianness.\n      for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {\n        for (int x = 0; x < width; ++x) {\n 
         data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |\n                    (imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];\n        }\n      }\n      break;\n\n    default:\n      tprintf(\"Cannot convert RAW image to Pix with bpp = %d\\n\", bpp);\n  }\n  SetImage(pix);\n  pix.destroy();\n}\n\n// Store the coordinates of the rectangle to process for later use.\n// Doesn't actually do any thresholding.\nvoid ImageThresholder::SetRectangle(int left, int top, int width, int height) {\n  rect_left_ = left;\n  rect_top_ = top;\n  rect_width_ = width;\n  rect_height_ = height;\n}\n\n// Get enough parameters to be able to rebuild bounding boxes in the\n// original image (not just within the rectangle).\n// Left and top are enough with top-down coordinates, but\n// the height of the rectangle and the image are needed for bottom-up.\nvoid ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,\n                                     int *imageheight) {\n  *left = rect_left_;\n  *top = rect_top_;\n  *width = rect_width_;\n  *height = rect_height_;\n  *imagewidth = image_width_;\n  *imageheight = image_height_;\n}\n\n// Pix vs raw, which to use? Pix is the preferred input for efficiency,\n// since raw buffers are copied.\n// SetImage for Pix clones its input, so the source pix may be pixDestroyed\n// immediately after, but may not go away until after the Thresholder has\n// finished with it.\nvoid ImageThresholder::SetImage(const Image pix) {\n  if (pix_ != nullptr) {\n    pix_.destroy();\n  }\n  Image src = pix;\n  int depth;\n  pixGetDimensions(src, &image_width_, &image_height_, &depth);\n  // Convert the image as necessary so it is one of binary, plain RGB, or\n  // 8 bit with no colormap. 
Guarantee that we always end up with our own copy,\n  // not just a clone of the input.\n  if (depth > 1 && depth < 8) {\n    pix_ = pixConvertTo8(src, false);\n  } else {\n    pix_ = src.copy();\n  }\n  depth = pixGetDepth(pix_);\n  pix_channels_ = depth / 8;\n  pix_wpl_ = pixGetWpl(pix_);\n  scale_ = 1;\n  estimated_res_ = yres_ = pixGetYRes(pix_);\n  Init();\n}\n\nstd::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(\n                                                      TessBaseAPI *api,\n                                                      ThresholdMethod method) {\n  Image pix_binary = nullptr;\n  Image pix_thresholds = nullptr;\n\n  if (pix_channels_ == 0) {\n    // We have a binary image, but it still has to be copied, as this API\n    // allows the caller to modify the output.\n    Image original = GetPixRect();\n    pix_binary = original.copy();\n    original.destroy();\n    return std::make_tuple(true, nullptr, pix_binary, nullptr);\n  }\n\n  auto pix_grey = GetPixRectGrey();\n\n  int r;\n\n  l_int32 pix_w, pix_h;\n  pixGetDimensions(pix_grey, &pix_w, &pix_h, nullptr);\n\n  bool thresholding_debug;\n  api->GetBoolVariable(\"thresholding_debug\", &thresholding_debug);\n  if (thresholding_debug) {\n    tprintf(\"\\nimage width: %d  height: %d  ppi: %d\\n\", pix_w, pix_h, yres_);\n  }\n\n  if (method == ThresholdMethod::Sauvola && pix_w > 6 && pix_h > 6) {\n    // pixSauvolaBinarizeTiled requires half_window_size >= 2.\n    // Therefore window_size must be at least 4 which requires\n    // pix_w and pix_h to be at least 7.\n    int window_size;\n    double window_size_factor;\n    api->GetDoubleVariable(\"thresholding_window_size\", &window_size_factor);\n    window_size = window_size_factor * yres_;\n    window_size = std::max(7, window_size);\n    window_size = std::min(pix_w < pix_h ? 
pix_w - 3 : pix_h - 3, window_size);\n    int half_window_size = window_size / 2;\n\n    // factor for image division into tiles; >= 1\n    l_int32 nx, ny;\n    // tiles size will be approx. 250 x 250 pixels\n    nx = std::max(1, (pix_w + 125) / 250);\n    ny = std::max(1, (pix_h + 125) / 250);\n    auto xrat = pix_w / nx;\n    auto yrat = pix_h / ny;\n    if (xrat < half_window_size + 2) {\n      nx = pix_w / (half_window_size + 2);\n    }\n    if (yrat < half_window_size + 2) {\n      ny = pix_h / (half_window_size + 2);\n    }\n\n    double kfactor;\n    api->GetDoubleVariable(\"thresholding_kfactor\", &kfactor);\n    kfactor = std::max(0.0, kfactor);\n\n    if (thresholding_debug) {\n      tprintf(\"window size: %d  kfactor: %.3f  nx:%d  ny: %d\\n\", window_size, kfactor, nx, ny);\n    }\n\n    r = pixSauvolaBinarizeTiled(pix_grey, half_window_size, kfactor, nx, ny,\n                               (PIX**)pix_thresholds,\n                                (PIX**)pix_binary);\n  } else { // if (method == ThresholdMethod::LeptonicaOtsu)\n    int tile_size;\n    double tile_size_factor;\n    api->GetDoubleVariable(\"thresholding_tile_size\", &tile_size_factor);\n    tile_size = tile_size_factor * yres_;\n    tile_size = std::max(16, tile_size);\n\n    int smooth_size;\n    double smooth_size_factor;\n    api->GetDoubleVariable(\"thresholding_smooth_kernel_size\",\n                         &smooth_size_factor);\n    smooth_size_factor = std::max(0.0, smooth_size_factor);\n    smooth_size = smooth_size_factor * yres_;\n    int half_smooth_size = smooth_size / 2;\n\n    double score_fraction;\n    api->GetDoubleVariable(\"thresholding_score_fraction\", &score_fraction);\n\n    if (thresholding_debug) {\n      tprintf(\"tile size: %d  smooth_size: %d  score_fraction: %.2f\\n\", tile_size, smooth_size, score_fraction);\n    }\n\n    r = pixOtsuAdaptiveThreshold(pix_grey, tile_size, tile_size,\n                                 half_smooth_size, half_smooth_size,\n          
                       score_fraction,\n                                 (PIX**)pix_thresholds,\n                                 (PIX**)pix_binary);\n  }\n\n  bool ok = (r == 0);\n  return std::make_tuple(ok, pix_grey, pix_binary, pix_thresholds);\n}\n\n// Threshold the source image as efficiently as possible to the output Pix.\n// Creates a Pix and sets pix to point to the resulting pointer.\n// Caller must use pixDestroy to free the created Pix.\n/// Returns false on error.\nbool ImageThresholder::ThresholdToPix(Image *pix) {\n  if (image_width_ > INT16_MAX || image_height_ > INT16_MAX) {\n    tprintf(\"Image too large: (%d, %d)\\n\", image_width_, image_height_);\n    return false;\n  }\n  // Handle binary image\n  if (pix_channels_ == 0) {\n    // We have a binary image, but it still has to be copied, as this API\n    // allows the caller to modify the output.\n    Image original = GetPixRect();\n    *pix = original.copy();\n    original.destroy();\n    return true;\n  }\n  // Handle colormaps\n  Image src = pix_;\n  if (pixGetColormap(src)) {\n    src = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC);\n  }\n  OtsuThresholdRectToPix(src, pix);\n  if (src != pix_) {\n    src.destroy();\n  }\n  return true;\n}\n\n// Gets a pix that contains an 8 bit threshold value at each pixel. The\n// returned pix may be an integer reduction of the binary image such that\n// the scale factor may be inferred from the ratio of the sizes, even down\n// to the extreme of a 1x1 pixel thresholds image.\n// Ideally the 8 bit threshold should be the exact threshold used to generate\n// the binary image in ThresholdToPix, but this is not a hard constraint.\n// Returns nullptr if the input is binary. 
PixDestroy after use.\nImage ImageThresholder::GetPixRectThresholds() {\n  if (IsBinary()) {\n    return nullptr;\n  }\n  Image pix_grey = GetPixRectGrey();\n  int width = pixGetWidth(pix_grey);\n  int height = pixGetHeight(pix_grey);\n  std::vector<int> thresholds;\n  std::vector<int> hi_values;\n  OtsuThreshold(pix_grey, 0, 0, width, height, thresholds, hi_values);\n  pix_grey.destroy();\n  Image pix_thresholds = pixCreate(width, height, 8);\n  int threshold = thresholds[0] > 0 ? thresholds[0] : 128;\n  pixSetAllArbitrary(pix_thresholds, threshold);\n  return pix_thresholds;\n}\n\n// Common initialization shared between SetImage methods.\nvoid ImageThresholder::Init() {\n  SetRectangle(0, 0, image_width_, image_height_);\n}\n\n// Get a clone/copy of the source image rectangle.\n// The returned Pix must be pixDestroyed.\n// This function will be used in the future by the page layout analysis, and\n// the layout analysis that uses it will only be available with Leptonica,\n// so there is no raw equivalent.\nImage ImageThresholder::GetPixRect() {\n  if (IsFullImage()) {\n    // Just clone the whole thing.\n    return pix_.clone();\n  } else {\n    // Crop to the given rectangle.\n    Box *box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);\n    Image cropped = pixClipRectangle(pix_, box, nullptr);\n    boxDestroy(&box);\n    return cropped;\n  }\n}\n\n// Get a clone/copy of the source image rectangle, reduced to greyscale,\n// and at the same resolution as the output binary.\n// The returned Pix must be pixDestroyed.\n// Provided to the classifier to extract features from the greyscale image.\nImage ImageThresholder::GetPixRectGrey() {\n  auto pix = GetPixRect(); // May have to be reduced to grey.\n  int depth = pixGetDepth(pix);\n  if (depth != 8 || pixGetColormap(pix)) {\n    if (depth == 24) {\n      auto tmp = pixConvert24To32(pix);\n      pix.destroy();\n      pix = tmp;\n    }\n    auto result = pixConvertTo8(pix, false);\n    pix.destroy();\n   
 return result;\n  }\n  return pix;\n}\n\n// Otsu thresholds the rectangle, taking the rectangle from *this.\nvoid ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const {\n  std::vector<int> thresholds;\n  std::vector<int> hi_values;\n\n  int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_, rect_height_,\n                                   thresholds, hi_values);\n  ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);\n}\n\n/// Threshold the rectangle, taking everything except the src_pix\n/// from the class, using thresholds/hi_values to the output pix.\n/// NOTE that num_channels is the size of the thresholds and hi_values\n// arrays and also the bytes per pixel in src_pix.\nvoid ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,\n                                          const std::vector<int> &hi_values, Image *pix) const {\n  *pix = pixCreate(rect_width_, rect_height_, 1);\n  uint32_t *pixdata = pixGetData(*pix);\n  int wpl = pixGetWpl(*pix);\n  int src_wpl = pixGetWpl(src_pix);\n  uint32_t *srcdata = pixGetData(src_pix);\n  pixSetXRes(*pix, pixGetXRes(src_pix));\n  pixSetYRes(*pix, pixGetYRes(src_pix));\n  for (int y = 0; y < rect_height_; ++y) {\n    const uint32_t *linedata = srcdata + (y + rect_top_) * src_wpl;\n    uint32_t *pixline = pixdata + y * wpl;\n    for (int x = 0; x < rect_width_; ++x) {\n      bool white_result = true;\n      for (int ch = 0; ch < num_channels; ++ch) {\n        int pixel = GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);\n        if (hi_values[ch] >= 0 && (pixel > thresholds[ch]) == (hi_values[ch] == 0)) {\n          white_result = false;\n          break;\n        }\n      }\n      if (white_result) {\n        CLEAR_DATA_BIT(pixline, x);\n      } else {\n        SET_DATA_BIT(pixline, x);\n      }\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccmain/thresholder.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        thresholder.h\n// Description: Base API for thresholding images in tesseract.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCMAIN_THRESHOLDER_H_\n#define TESSERACT_CCMAIN_THRESHOLDER_H_\n\n#include <tesseract/export.h>\n\n#include <vector> // for std::vector\n\nstruct Pix;\n\nnamespace tesseract {\n\nenum class ThresholdMethod {\n  Otsu,          // Tesseract's legacy Otsu\n  LeptonicaOtsu, // Leptonica's Otsu\n  Sauvola,       // Leptonica's Sauvola\n  Max,           // Number of Thresholding methods\n};\n\nclass TessBaseAPI;\n\n/// Base class for all tesseract image thresholding classes.\n/// Specific classes can add new thresholding methods by\n/// overriding ThresholdToPix.\n/// Each instance deals with a single image, but the design is intended to\n/// be useful for multiple calls to SetRectangle and ThresholdTo* if\n/// desired.\nclass TESS_API ImageThresholder {\npublic:\n  ImageThresholder();\n  virtual ~ImageThresholder();\n\n  /// Destroy the Pix if there is one, freeing memory.\n  virtual void Clear();\n\n  /// Return true if no image has been set.\n  bool IsEmpty() const;\n\n  /// SetImage makes a copy of all the image data, so it may be deleted\n  /// immediately after this call.\n  
/// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.\n  /// Palette color images will not work properly and must be converted to\n  /// 24 bit.\n  /// Binary images of 1 bit per pixel may also be given but they must be\n  /// byte packed with the MSB of the first byte being the first pixel, and a\n  /// one pixel is WHITE. For binary images set bytes_per_pixel=0.\n  void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel,\n                int bytes_per_line);\n\n  /// Store the coordinates of the rectangle to process for later use.\n  /// Doesn't actually do any thresholding.\n  void SetRectangle(int left, int top, int width, int height);\n\n  /// Get enough parameters to be able to rebuild bounding boxes in the\n  /// original image (not just within the rectangle).\n  /// Left and top are enough with top-down coordinates, but\n  /// the height of the rectangle and the image are needed for bottom-up.\n  virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,\n                             int *imageheight);\n\n  /// Return true if the source image is color.\n  bool IsColor() const {\n    return pix_channels_ >= 3;\n  }\n\n  /// Returns true if the source image is binary.\n  bool IsBinary() const {\n    return pix_channels_ == 0;\n  }\n\n  int GetScaleFactor() const {\n    return scale_;\n  }\n\n  // Set the resolution of the source image in pixels per inch.\n  // This should be called right after SetImage(), and will let us return\n  // appropriate font sizes for the text.\n  void SetSourceYResolution(int ppi) {\n    yres_ = ppi;\n    estimated_res_ = ppi;\n  }\n  int GetSourceYResolution() const {\n    return yres_;\n  }\n  int GetScaledYResolution() const {\n    return scale_ * yres_;\n  }\n  // Set the resolution of the source image in pixels per inch, as estimated\n  // by the thresholder from the text size found during thresholding.\n  // This value will be used to set internal 
size thresholds during recognition\n  // and will not influence the output \"point size.\" The default value is\n  // the same as the source resolution. (yres_)\n  void SetEstimatedResolution(int ppi) {\n    estimated_res_ = ppi;\n  }\n  // Returns the estimated resolution, including any active scaling.\n  // This value will be used to set internal size thresholds during recognition.\n  int GetScaledEstimatedResolution() const {\n    return scale_ * estimated_res_;\n  }\n\n  /// Pix vs raw, which to use? Pix is the preferred input for efficiency,\n  /// since raw buffers are copied.\n  /// SetImage for Pix clones its input, so the source pix may be pixDestroyed\n  /// immediately after, but may not go away until after the Thresholder has\n  /// finished with it.\n  void SetImage(const Image pix);\n\n  /// Threshold the source image as efficiently as possible to the output Pix.\n  /// Creates a Pix and sets pix to point to the resulting pointer.\n  /// Caller must use pixDestroy to free the created Pix.\n  /// Returns false on error.\n  virtual bool ThresholdToPix(Image *pix);\n\n  virtual std::tuple<bool, Image, Image, Image> Threshold(TessBaseAPI *api,\n                                                          ThresholdMethod method);\n\n  // Gets a pix that contains an 8 bit threshold value at each pixel. The\n  // returned pix may be an integer reduction of the binary image such that\n  // the scale factor may be inferred from the ratio of the sizes, even down\n  // to the extreme of a 1x1 pixel thresholds image.\n  // Ideally the 8 bit threshold should be the exact threshold used to generate\n  // the binary image in ThresholdToPix, but this is not a hard constraint.\n  // Returns nullptr if the input is binary. 
PixDestroy after use.\n  virtual Image GetPixRectThresholds();\n\n  /// Get a clone/copy of the source image rectangle.\n  /// The returned Pix must be pixDestroyed.\n  /// This function will be used in the future by the page layout analysis, and\n  /// the layout analysis that uses it will only be available with Leptonica,\n  /// so there is no raw equivalent.\n  Image GetPixRect();\n\n  // Get a clone/copy of the source image rectangle, reduced to greyscale,\n  // and at the same resolution as the output binary.\n  // The returned Pix must be pixDestroyed.\n  // Provided to the classifier to extract features from the greyscale image.\n  virtual Image GetPixRectGrey();\n\nprotected:\n  // ----------------------------------------------------------------------\n  // Utility functions that may be useful components for other thresholders.\n\n  /// Common initialization shared between SetImage methods.\n  virtual void Init();\n\n  /// Return true if we are processing the full image.\n  bool IsFullImage() const {\n    return rect_left_ == 0 && rect_top_ == 0 && rect_width_ == image_width_ &&\n           rect_height_ == image_height_;\n  }\n\n  // Otsu thresholds the rectangle, taking the rectangle from *this.\n  void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const;\n\n  /// Threshold the rectangle, taking everything except the src_pix\n  /// from the class, using thresholds/hi_values to the output pix.\n  /// NOTE that num_channels is the size of the thresholds and hi_values\n  // arrays and also the bytes per pixel in src_pix.\n  void ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,\n                          const std::vector <int> &hi_values, Image *pix) const;\n\nprotected:\n  /// Clone or other copy of the source Pix.\n  /// The pix will always be PixDestroy()ed on destruction of the class.\n  Image pix_;\n\n  int image_width_;  ///< Width of source pix_.\n  int image_height_; ///< Height of source pix_.\n  int 
pix_channels_; ///< Number of 8-bit channels in pix_.\n  int pix_wpl_;      ///< Words per line of pix_.\n  // Limits of image rectangle to be processed.\n  int scale_;         ///< Scale factor from original image.\n  int yres_;          ///< y pixels/inch in source image.\n  int estimated_res_; ///< Resolution estimate from text size.\n  int rect_left_;\n  int rect_top_;\n  int rect_width_;\n  int rect_height_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCMAIN_THRESHOLDER_H_\n"
  },
  {
    "path": "src/ccmain/werdit.cpp",
    "content": "/**********************************************************************\n * File:        werdit.cpp  (Formerly wordit.c)\n * Description: An iterator for passing over all the words in a document.\n * Author:      Ray Smith\n * Created:     Mon Apr 27 08:51:22 BST 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"werdit.h\"\n\n#include \"errcode.h\"  // for ASSERT_HOST\n#include \"pageres.h\"  // for PAGE_RES_IT, PAGE_RES (ptr only), WERD_RES\n#include \"stepblob.h\" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST\n#include \"werd.h\"     // for WERD\n\nnamespace tesseract {\n\n/**********************************************************************\n * make_pseudo_word\n *\n * Make all the blobs inside a selection into a single word.\n * The returned PAGE_RES_IT* it points to the new word. 
After use, call\n * it->DeleteCurrentWord() to delete the fake word, and then\n * delete it to get rid of the iterator itself.\n **********************************************************************/\n\nPAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box) {\n  PAGE_RES_IT pr_it(page_res);\n  C_BLOB_LIST new_blobs;              // list of gathered blobs\n  C_BLOB_IT new_blob_it = &new_blobs; // iterator\n\n  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {\n    WERD *word = word_res->word;\n    if (word->bounding_box().overlap(selection_box)) {\n      C_BLOB_IT blob_it(word->cblob_list());\n      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n        C_BLOB *blob = blob_it.data();\n        if (blob->bounding_box().overlap(selection_box)) {\n          new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));\n        }\n      }\n      if (!new_blobs.empty()) {\n        WERD *pseudo_word = new WERD(&new_blobs, 1, nullptr);\n        word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);\n        auto *it = new PAGE_RES_IT(page_res);\n        while (it->word() != word_res && it->word() != nullptr) {\n          it->forward();\n        }\n        ASSERT_HOST(it->word() == word_res);\n        return it;\n      }\n    }\n  }\n  return nullptr;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccmain/werdit.h",
    "content": "/**********************************************************************\n * File:        wordit.h\n * Description: An iterator for passing over all the words in a document.\n * Author:      Ray Smith\n * Created:     Mon Apr 27 08:51:22 BST 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef WERDIT_H\n#define WERDIT_H\n\n#include \"rect.h\" // for TBOX\n\nnamespace tesseract {\n\nclass PAGE_RES;\nclass PAGE_RES_IT;\n\nPAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/blamer.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        blamer.cpp\n// Description: Module allowing precise error causes to be allocated.\n// Author:      Rike Antonova\n// Refactored:  Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"blamer.h\"\n\n#include \"blobs.h\"   // for TPOINT, TWERD, TBLOB\n#include \"errcode.h\" // for ASSERT_HOST\n#if !defined(DISABLED_LEGACY_ENGINE)\n#  include \"lm_pain_points.h\" // for LMPainPoints\n#endif\n#include \"matrix.h\"     // for MATRIX\n#include \"normalis.h\"   // for DENORM\n#include \"pageres.h\"    // for WERD_RES\n#include \"unicharset.h\" // for UNICHARSET\n\n#include <cmath>   // for abs\n#include <cstdlib> // for abs\n\nnamespace tesseract {\n\n// Names for each value of IncorrectResultReason enum. 
Keep in sync.\nconst char kBlameCorrect[] = \"corr\";\nconst char kBlameClassifier[] = \"cl\";\nconst char kBlameChopper[] = \"chop\";\nconst char kBlameClassLMTradeoff[] = \"cl/LM\";\nconst char kBlamePageLayout[] = \"pglt\";\nconst char kBlameSegsearchHeur[] = \"ss_heur\";\nconst char kBlameSegsearchPP[] = \"ss_pp\";\nconst char kBlameClassOldLMTradeoff[] = \"cl/old_LM\";\nconst char kBlameAdaption[] = \"adapt\";\nconst char kBlameNoTruthSplit[] = \"no_tr_spl\";\nconst char kBlameNoTruth[] = \"no_tr\";\nconst char kBlameUnknown[] = \"unkn\";\n\nconst char *const kIncorrectResultReasonNames[] = {\n    kBlameCorrect,    kBlameClassifier,    kBlameChopper,     kBlameClassLMTradeoff,\n    kBlamePageLayout, kBlameSegsearchHeur, kBlameSegsearchPP, kBlameClassOldLMTradeoff,\n    kBlameAdaption,   kBlameNoTruthSplit,  kBlameNoTruth,     kBlameUnknown};\n\nconst char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {\n  return kIncorrectResultReasonNames[irr];\n}\n\nconst char *BlamerBundle::IncorrectReason() const {\n  return kIncorrectResultReasonNames[incorrect_result_reason_];\n}\n\n// Functions to setup the blamer.\n// Whole word string, whole word bounding box.\nvoid BlamerBundle::SetWordTruth(const UNICHARSET &unicharset, const char *truth_str,\n                                const TBOX &word_box) {\n  truth_word_.InsertBox(0, word_box);\n  truth_has_char_boxes_ = false;\n  // Encode the string as UNICHAR_IDs.\n  std::vector<UNICHAR_ID> encoding;\n  std::vector<char> lengths;\n  unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);\n  int total_length = 0;\n  for (size_t i = 0; i < encoding.size(); total_length += lengths[i++]) {\n    std::string uch(truth_str + total_length);\n    uch.resize(lengths[i] - total_length);\n    UNICHAR_ID id = encoding[i];\n    if (id != INVALID_UNICHAR_ID) {\n      uch = unicharset.get_normed_unichar(id);\n    }\n    truth_text_.push_back(uch);\n  }\n}\n\n// Single \"character\" string, \"character\" 
bounding box.\n// May be called multiple times to indicate the characters in a word.\nvoid BlamerBundle::SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str,\n                                  const TBOX &char_box) {\n  std::string symbol_str(char_str);\n  UNICHAR_ID id = unicharset.unichar_to_id(char_str);\n  if (id != INVALID_UNICHAR_ID) {\n    std::string normed_uch(unicharset.get_normed_unichar(id));\n    if (normed_uch.length() > 0) {\n      symbol_str = std::move(normed_uch);\n    }\n  }\n  int length = truth_word_.length();\n  truth_text_.push_back(symbol_str);\n  truth_word_.InsertBox(length, char_box);\n  if (length == 0) {\n    truth_has_char_boxes_ = true;\n  } else if (truth_word_.BlobBox(length - 1) == char_box) {\n    truth_has_char_boxes_ = false;\n  }\n}\n\n// Marks that there is something wrong with the truth text, like it contains\n// reject characters.\nvoid BlamerBundle::SetRejectedTruth() {\n  incorrect_result_reason_ = IRR_NO_TRUTH;\n  truth_has_char_boxes_ = false;\n}\n\n// Returns true if the provided word_choice is correct.\nbool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE *word_choice) const {\n  if (word_choice == nullptr) {\n    return false;\n  }\n  const UNICHARSET *uni_set = word_choice->unicharset();\n  std::string normed_choice_str;\n  for (unsigned i = 0; i < word_choice->length(); ++i) {\n    normed_choice_str += uni_set->get_normed_unichar(word_choice->unichar_id(i));\n  }\n  std::string truth_str = TruthString();\n  return truth_str == normed_choice_str;\n}\n\nvoid BlamerBundle::FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug) {\n  debug += \"Truth \";\n  for (auto &text : this->truth_text_) {\n    debug += text;\n  }\n  if (!this->truth_has_char_boxes_) {\n    debug += \" (no char boxes)\";\n  }\n  if (choice != nullptr) {\n    debug += \" Choice \";\n    std::string choice_str;\n    choice->string_and_lengths(&choice_str, nullptr);\n    debug += choice_str;\n  }\n  if 
(msg.length() > 0) {\n    debug += \"\\n\";\n    debug += msg;\n  }\n  debug += \"\\n\";\n}\n\n// Sets up the norm_truth_word from truth_word using the given DENORM.\nvoid BlamerBundle::SetupNormTruthWord(const DENORM &denorm) {\n  // TODO(rays) Is this the last use of denorm in WERD_RES and can it go?\n  norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();\n  TPOINT topleft;\n  TPOINT botright;\n  TPOINT norm_topleft;\n  TPOINT norm_botright;\n  for (unsigned b = 0; b < truth_word_.length(); ++b) {\n    const TBOX &box = truth_word_.BlobBox(b);\n    topleft.x = box.left();\n    topleft.y = box.top();\n    botright.x = box.right();\n    botright.y = box.bottom();\n    denorm.NormTransform(nullptr, topleft, &norm_topleft);\n    denorm.NormTransform(nullptr, botright, &norm_botright);\n    TBOX norm_box(norm_topleft.x, norm_botright.y, norm_botright.x, norm_topleft.y);\n    norm_truth_word_.InsertBox(b, norm_box);\n  }\n}\n\n// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty\n// bundles) where the right edge/ of the left-hand word is word1_right,\n// and the left edge of the right-hand word is word2_left.\nvoid BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,\n                               BlamerBundle *bundle2) const {\n  std::string debug_str;\n  // Find truth boxes that correspond to the split in the blobs.\n  unsigned begin2_truth_index = 0;\n  if (incorrect_result_reason_ != IRR_NO_TRUTH && truth_has_char_boxes_) {\n    debug_str = \"Looking for truth split at\";\n    debug_str += \" end1_x \" + std::to_string(word1_right);\n    debug_str += \" begin2_x \" + std::to_string(word2_left);\n    debug_str += \"\\nnorm_truth_word boxes:\\n\";\n    if (norm_truth_word_.length() > 1) {\n      norm_truth_word_.BlobBox(0).print_to_str(debug_str);\n      for (unsigned b = 1; b < norm_truth_word_.length(); ++b) {\n        norm_truth_word_.BlobBox(b).print_to_str(debug_str);\n        if 
((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) < norm_box_tolerance_) &&\n            (abs(word2_left - norm_truth_word_.BlobBox(b).left()) < norm_box_tolerance_)) {\n          begin2_truth_index = b;\n          debug_str += \"Split found\";\n          break;\n        }\n      }\n      debug_str += '\\n';\n    }\n  }\n  // Populate truth information in word and word2 with the first and second\n  // part of the original truth.\n  if (begin2_truth_index > 0) {\n    bundle1->truth_has_char_boxes_ = true;\n    bundle1->norm_box_tolerance_ = norm_box_tolerance_;\n    bundle2->truth_has_char_boxes_ = true;\n    bundle2->norm_box_tolerance_ = norm_box_tolerance_;\n    BlamerBundle *curr_bb = bundle1;\n    for (unsigned b = 0; b < norm_truth_word_.length(); ++b) {\n      if (b == begin2_truth_index) {\n        curr_bb = bundle2;\n      }\n      curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b));\n      curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b));\n      curr_bb->truth_text_.push_back(truth_text_[b]);\n    }\n  } else if (incorrect_result_reason_ == IRR_NO_TRUTH) {\n    bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;\n    bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;\n  } else {\n    debug_str += \"Truth split not found\";\n    debug_str += truth_has_char_boxes_ ? 
\"\\n\" : \" (no truth char boxes)\\n\";\n    bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);\n    bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);\n  }\n}\n\n// \"Joins\" the blames from bundle1 and bundle2 into *this.\nvoid BlamerBundle::JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2,\n                              bool debug) {\n  std::string debug_str;\n  IncorrectResultReason irr = incorrect_result_reason_;\n  if (irr != IRR_NO_TRUTH_SPLIT) {\n    debug_str = \"\";\n  }\n  if (bundle1.incorrect_result_reason_ != IRR_CORRECT &&\n      bundle1.incorrect_result_reason_ != IRR_NO_TRUTH &&\n      bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {\n    debug_str += \"Blame from part 1: \";\n    debug_str += bundle1.debug_;\n    irr = bundle1.incorrect_result_reason_;\n  }\n  if (bundle2.incorrect_result_reason_ != IRR_CORRECT &&\n      bundle2.incorrect_result_reason_ != IRR_NO_TRUTH &&\n      bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {\n    debug_str += \"Blame from part 2: \";\n    debug_str += bundle2.debug_;\n    if (irr == IRR_CORRECT) {\n      irr = bundle2.incorrect_result_reason_;\n    } else if (irr != bundle2.incorrect_result_reason_) {\n      irr = IRR_UNKNOWN;\n    }\n  }\n  incorrect_result_reason_ = irr;\n  if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {\n    SetBlame(irr, debug_str, nullptr, debug);\n  }\n}\n\n// If a blob with the same bounding box as one of the truth character\n// bounding boxes is not classified as the corresponding truth character\n// blames character classifier for incorrect answer.\nvoid BlamerBundle::BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,\n                                   const BLOB_CHOICE_LIST &choices, bool debug) {\n  if (!truth_has_char_boxes_ || incorrect_result_reason_ != IRR_CORRECT) {\n    return; // Nothing to do here.\n  }\n\n  for (unsigned b = 0; b < norm_truth_word_.length(); ++b) {\n    const TBOX 
&truth_box = norm_truth_word_.BlobBox(b);\n    // Note that we are more strict on the bounding box boundaries here\n    // than in other places (chopper, segmentation search), since we do\n    // not have the ability to check the previous and next bounding box.\n    if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_ / 2)) {\n      bool found = false;\n      bool incorrect_adapted = false;\n      UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;\n      const char *truth_str = truth_text_[b].c_str();\n      // We promise not to modify the list or its contents, using a\n      // const BLOB_CHOICE* below.\n      BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST *>(&choices));\n      for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) {\n        const BLOB_CHOICE *choice = choices_it.data();\n        if (strcmp(truth_str, unicharset.get_normed_unichar(choice->unichar_id())) == 0) {\n          found = true;\n          break;\n        } else if (choice->IsAdapted()) {\n          incorrect_adapted = true;\n          incorrect_adapted_id = choice->unichar_id();\n        }\n      } // end choices_it for loop\n      if (!found) {\n        std::string debug_str = \"unichar \";\n        debug_str += truth_str;\n        debug_str += \" not found in classification list\";\n        SetBlame(IRR_CLASSIFIER, debug_str, nullptr, debug);\n      } else if (incorrect_adapted) {\n        std::string debug_str = \"better rating for adapted \";\n        debug_str += unicharset.id_to_unichar(incorrect_adapted_id);\n        debug_str += \" than for correct \";\n        debug_str += truth_str;\n        SetBlame(IRR_ADAPTION, debug_str, nullptr, debug);\n      }\n      break;\n    }\n  } // end iterating over blamer_bundle->norm_truth_word\n}\n\n// Checks whether chops were made at all the character bounding box\n// boundaries in word->truth_word. 
If not - blames the chopper for an\n// incorrect answer.\nvoid BlamerBundle::SetChopperBlame(const WERD_RES *word, bool debug) {\n  if (NoTruth() || !truth_has_char_boxes_ || word->chopped_word->blobs.empty()) {\n    return;\n  }\n  bool missing_chop = false;\n  int num_blobs = word->chopped_word->blobs.size();\n  unsigned box_index = 0;\n  int blob_index = 0;\n  int16_t truth_x = -1;\n  while (box_index < truth_word_.length() && blob_index < num_blobs) {\n    truth_x = norm_truth_word_.BlobBox(box_index).right();\n    TBLOB *curr_blob = word->chopped_word->blobs[blob_index];\n    if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) {\n      ++blob_index;\n      continue; // encountered an extra chop, keep looking\n    } else if (curr_blob->bounding_box().right() > truth_x + norm_box_tolerance_) {\n      missing_chop = true;\n      break;\n    } else {\n      ++blob_index;\n    }\n  }\n  if (missing_chop || box_index < norm_truth_word_.length()) {\n    std::string debug_str;\n    if (missing_chop) {\n      debug_str += \"Detected missing chop (tolerance=\" + std::to_string(norm_box_tolerance_);\n      debug_str += \") at Bounding Box=\";\n      TBLOB *curr_blob = word->chopped_word->blobs[blob_index];\n      curr_blob->bounding_box().print_to_str(debug_str);\n      debug_str += \"\\nNo chop for truth at x=\" + std::to_string(truth_x);\n    } else {\n      debug_str += \"Missing chops for last \" + std::to_string(norm_truth_word_.length() - box_index);\n      debug_str += \" truth box(es)\";\n    }\n    debug_str += \"\\nMaximally chopped word boxes:\\n\";\n    for (blob_index = 0; blob_index < num_blobs; ++blob_index) {\n      TBLOB *curr_blob = word->chopped_word->blobs[blob_index];\n      curr_blob->bounding_box().print_to_str(debug_str);\n      debug_str += '\\n';\n    }\n    debug_str += \"Truth  bounding  boxes:\\n\";\n    for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) {\n      
norm_truth_word_.BlobBox(box_index).print_to_str(debug_str);\n      debug_str += '\\n';\n    }\n    SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);\n  }\n}\n\n// Blames the classifier or the language model if, after running only the\n// chopper, best_choice is incorrect and no blame has been yet set.\n// Blames the classifier if best_choice is classifier's top choice and is a\n// dictionary word (i.e. language model could not have helped).\n// Otherwise, blames the language model (formerly permuter word adjustment).\nvoid BlamerBundle::BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,\n                                              bool valid_permuter, bool debug) {\n  if (valid_permuter) {\n    // Find out whether best choice is a top choice.\n    best_choice_is_dict_and_top_choice_ = true;\n    for (unsigned i = 0; i < word->best_choice->length(); ++i) {\n      BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));\n      ASSERT_HOST(!blob_choice_it.empty());\n      BLOB_CHOICE *first_choice = nullptr;\n      for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();\n           blob_choice_it.forward()) { // find first non-fragment choice\n        if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) {\n          first_choice = blob_choice_it.data();\n          break;\n        }\n      }\n      ASSERT_HOST(first_choice != nullptr);\n      if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {\n        best_choice_is_dict_and_top_choice_ = false;\n        break;\n      }\n    }\n  }\n  std::string debug_str;\n  if (best_choice_is_dict_and_top_choice_) {\n    debug_str = \"Best choice is: incorrect, top choice, dictionary word\";\n    debug_str += \" with permuter \";\n    debug_str += word->best_choice->permuter_name();\n  } else {\n    debug_str = \"Classifier/Old LM tradeoff is to blame\";\n  }\n  SetBlame(best_choice_is_dict_and_top_choice_ ? 
IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF,\n           debug_str, word->best_choice, debug);\n}\n\n// Sets up the correct_segmentation_* to mark the correct bounding boxes.\nvoid BlamerBundle::SetupCorrectSegmentation(const TWERD *word, bool debug) {\n#ifndef DISABLED_LEGACY_ENGINE\n  params_training_bundle_.StartHypothesisList();\n#endif //  ndef DISABLED_LEGACY_ENGINE\n  if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_) {\n    return; // Nothing to do here.\n  }\n\n  std::string debug_str = \"Blamer computing correct_segmentation_cols\\n\";\n  int curr_box_col = 0;\n  int next_box_col = 0;\n  int num_blobs = word->NumBlobs();\n  if (num_blobs == 0) {\n    return; // No blobs to play with.\n  }\n  int blob_index = 0;\n  int16_t next_box_x = word->blobs[blob_index]->bounding_box().right();\n  for (unsigned truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.length();\n       ++blob_index) {\n    ++next_box_col;\n    int16_t curr_box_x = next_box_x;\n    if (blob_index + 1 < num_blobs) {\n      next_box_x = word->blobs[blob_index + 1]->bounding_box().right();\n    }\n    int16_t truth_x = norm_truth_word_.BlobBox(truth_idx).right();\n    debug_str += \"Box x coord vs. 
truth: \" + std::to_string(curr_box_x);\n    debug_str += \" \" + std::to_string(truth_x);\n    debug_str += \"\\n\";\n    if (curr_box_x > (truth_x + norm_box_tolerance_)) {\n      break;                                                  // failed to find a matching box\n    } else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched\n               (blob_index + 1 >= num_blobs ||                // next box can't be included\n                next_box_x > truth_x + norm_box_tolerance_)) {\n      correct_segmentation_cols_.push_back(curr_box_col);\n      correct_segmentation_rows_.push_back(next_box_col - 1);\n      ++truth_idx;\n      debug_str += \"col=\" + std::to_string(curr_box_col);\n      debug_str += \" row=\" + std::to_string(next_box_col - 1);\n      debug_str += \"\\n\";\n      curr_box_col = next_box_col;\n    }\n  }\n  if (blob_index < num_blobs || // trailing blobs\n      correct_segmentation_cols_.size() != norm_truth_word_.length()) {\n    debug_str +=\n        \"Blamer failed to find correct segmentation\"\n        \" (tolerance=\" +\n        std::to_string(norm_box_tolerance_);\n    if (blob_index >= num_blobs) {\n      debug_str += \" blob == nullptr\";\n    }\n    debug_str += \")\\n\";\n    debug_str += \" path length \" + std::to_string(correct_segmentation_cols_.size());\n    debug_str += \" vs. 
truth \" + std::to_string(norm_truth_word_.length());\n    debug_str += \"\\n\";\n    SetBlame(IRR_UNKNOWN, debug_str, nullptr, debug);\n    correct_segmentation_cols_.clear();\n    correct_segmentation_rows_.clear();\n  }\n}\n\n// Returns true if a guided segmentation search is needed.\nbool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {\n  return incorrect_result_reason_ == IRR_CORRECT && !segsearch_is_looking_for_blame_ &&\n         truth_has_char_boxes_ && !ChoiceIsCorrect(best_choice);\n}\n\n#if !defined(DISABLED_LEGACY_ENGINE)\n// Setup ready to guide the segmentation search to the correct segmentation.\nvoid BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings,\n                                    UNICHAR_ID wildcard_id, bool debug, std::string &debug_str,\n                                    tesseract::LMPainPoints *pain_points, double max_char_wh_ratio,\n                                    WERD_RES *word_res) {\n  segsearch_is_looking_for_blame_ = true;\n  if (debug) {\n    tprintf(\"segsearch starting to look for blame\\n\");\n  }\n  // Fill pain points for any unclassifed blob corresponding to the\n  // correct segmentation state.\n  debug_str += \"Correct segmentation:\\n\";\n  for (unsigned idx = 0; idx < correct_segmentation_cols_.size(); ++idx) {\n    debug_str += \"col=\" + std::to_string(correct_segmentation_cols_[idx]);\n    debug_str += \" row=\" + std::to_string(correct_segmentation_rows_[idx]);\n    debug_str += \"\\n\";\n    if (!ratings->Classified(correct_segmentation_cols_[idx], correct_segmentation_rows_[idx],\n                             wildcard_id) &&\n        !pain_points->GeneratePainPoint(\n            correct_segmentation_cols_[idx], correct_segmentation_rows_[idx],\n            tesseract::LM_PPTYPE_BLAMER, 0.0, false, max_char_wh_ratio, word_res)) {\n      segsearch_is_looking_for_blame_ = false;\n      debug_str += \"\\nFailed to insert pain point\\n\";\n      
SetBlame(IRR_SEGSEARCH_HEUR, debug_str, best_choice, debug);\n      break;\n    }\n  } // end for blamer_bundle->correct_segmentation_cols/rows\n}\n#endif // !defined(DISABLED_LEGACY_ENGINE)\n\n// Returns true if the guided segsearch is in progress.\nbool BlamerBundle::GuidedSegsearchStillGoing() const {\n  return segsearch_is_looking_for_blame_;\n}\n\n// The segmentation search has ended. Sets the blame appropriately.\nvoid BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str) {\n  // If we are still looking for blame (i.e. best_choice is incorrect, but a\n  // path representing the correct segmentation could be constructed), we can\n  // blame segmentation search pain point prioritization if the rating of the\n  // path corresponding to the correct segmentation is better than that of\n  // best_choice (i.e. language model would have done the correct thing, but\n  // because of poor pain point prioritization the correct segmentation was\n  // never explored). 
Otherwise we blame the tradeoff between the language model\n  // and the classifier, since even after exploring the path corresponding to\n  // the correct segmentation incorrect best_choice would have been chosen.\n  // One special case when we blame the classifier instead is when best choice\n  // is incorrect, but it is a dictionary word and it classifier's top choice.\n  if (segsearch_is_looking_for_blame_) {\n    segsearch_is_looking_for_blame_ = false;\n    if (best_choice_is_dict_and_top_choice_) {\n      debug_str = \"Best choice is: incorrect, top choice, dictionary word\";\n      debug_str += \" with permuter \";\n      debug_str += best_choice->permuter_name();\n      SetBlame(IRR_CLASSIFIER, debug_str, best_choice, debug);\n    } else if (best_correctly_segmented_rating_ < best_choice->rating()) {\n      debug_str += \"Correct segmentation state was not explored\";\n      SetBlame(IRR_SEGSEARCH_PP, debug_str, best_choice, debug);\n    } else {\n      if (best_correctly_segmented_rating_ >= WERD_CHOICE::kBadRating) {\n        debug_str += \"Correct segmentation paths were pruned by LM\\n\";\n      } else {\n        debug_str += \"Best correct segmentation rating \" +\n                                  std::to_string(best_correctly_segmented_rating_);\n        debug_str += \" vs. 
best choice rating \" + std::to_string(best_choice->rating());\n      }\n      SetBlame(IRR_CLASS_LM_TRADEOFF, debug_str, best_choice, debug);\n    }\n  }\n}\n\n// If the bundle is null or still does not indicate the correct result,\n// fix it and use some backup reason for the blame.\nvoid BlamerBundle::LastChanceBlame(bool debug, WERD_RES *word) {\n  if (word->blamer_bundle == nullptr) {\n    word->blamer_bundle = new BlamerBundle();\n    word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, \"LastChanceBlame\", word->best_choice, debug);\n  } else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {\n    word->blamer_bundle->SetBlame(IRR_NO_TRUTH, \"Rejected truth\", word->best_choice, debug);\n  } else {\n    bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice);\n    IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_;\n    if (irr == IRR_CORRECT && !correct) {\n      std::string debug_str = \"Choice is incorrect after recognition\";\n      word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice, debug);\n    } else if (irr != IRR_CORRECT && correct) {\n      if (debug) {\n        tprintf(\"Corrected %s\\n\", word->blamer_bundle->debug_.c_str());\n      }\n      word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT;\n      word->blamer_bundle->debug_ = \"\";\n    }\n  }\n}\n\n// Sets the misadaption debug if this word is incorrect, as this word is\n// being adapted to.\nvoid BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug) {\n  if (incorrect_result_reason_ != IRR_NO_TRUTH && !ChoiceIsCorrect(best_choice)) {\n    misadaption_debug_ = \"misadapt to word (\";\n    misadaption_debug_ += best_choice->permuter_name();\n    misadaption_debug_ += \"): \";\n    FillDebugString(\"\", best_choice, misadaption_debug_);\n    if (debug) {\n      tprintf(\"%s\\n\", misadaption_debug_.c_str());\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/blamer.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        blamer.h\n// Description: Module allowing precise error causes to be allocated.\n// Author:      Rike Antonova\n// Refactored:  Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCSTRUCT_BLAMER_H_\n#define TESSERACT_CCSTRUCT_BLAMER_H_\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n#include \"boxword.h\" // for BoxWord\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"params_training_featdef.h\" // for ParamsTrainingBundle, ParamsTra...\n#endif                                 //  ndef DISABLED_LEGACY_ENGINE\n#include \"ratngs.h\"                    // for BLOB_CHOICE_LIST (ptr only)\n#include \"rect.h\"                      // for TBOX\n#include \"tprintf.h\"                   // for tprintf\n\n#include <tesseract/unichar.h> // for UNICHAR_ID\n\n#include <cstdint> // for int16_t\n#include <cstring> // for memcpy\n#include <vector>  // for std::vector\n\nnamespace tesseract {\n\nclass DENORM;\nclass MATRIX;\nclass UNICHARSET;\nclass WERD_RES;\n\nstruct MATRIX_COORD;\nstruct TWERD;\n\nclass LMPainPoints;\n\nstatic const int16_t kBlamerBoxTolerance = 5;\n\n// Enum for expressing the source of error.\n// Note: Please update kIncorrectResultReasonNames when modifying this enum.\nenum 
IncorrectResultReason {\n  // The text recorded in best choice == truth text\n  IRR_CORRECT,\n  // Either: Top choice is incorrect and is a dictionary word (language model\n  // is unlikely to help correct such errors, so blame the classifier).\n  // Or: the correct unichar was not included in shortlist produced by the\n  // classifier at all.\n  IRR_CLASSIFIER,\n  // Chopper have not found one or more splits that correspond to the correct\n  // character bounding boxes recorded in BlamerBundle::truth_word.\n  IRR_CHOPPER,\n  // Classifier did include correct unichars for each blob in the correct\n  // segmentation, however its rating could have been too bad to allow the\n  // language model to pull out the correct choice. On the other hand the\n  // strength of the language model might have been too weak to favor the\n  // correct answer, this we call this case a classifier-language model\n  // tradeoff error.\n  IRR_CLASS_LM_TRADEOFF,\n  // Page layout failed to produce the correct bounding box. Blame page layout\n  // if the truth was not found for the word, which implies that the bounding\n  // box of the word was incorrect (no truth word had a similar bounding box).\n  IRR_PAGE_LAYOUT,\n  // SegSearch heuristic prevented one or more blobs from the correct\n  // segmentation state to be classified (e.g. the blob was too wide).\n  IRR_SEGSEARCH_HEUR,\n  // The correct segmentaiton state was not explored because of poor SegSearch\n  // pain point prioritization. We blame SegSearch pain point prioritization\n  // if the best rating of a choice constructed from correct segmentation is\n  // better than that of the best choice (i.e. 
if we got to explore the correct\n  // segmentation state, language model would have picked the correct choice).\n  IRR_SEGSEARCH_PP,\n  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,\n  // and thus use the old language model (permuters).\n  // TODO(antonova): integrate the new language mode with chopper\n  IRR_CLASS_OLD_LM_TRADEOFF,\n  // If there is an incorrect adaptive template match with a better score than\n  // a correct one (either pre-trained or adapted), mark this as adaption error.\n  IRR_ADAPTION,\n  // split_and_recog_word() failed to find a suitable split in truth.\n  IRR_NO_TRUTH_SPLIT,\n  // Truth is not available for this word (e.g. when words in corrected content\n  // file are turned into ~~~~ because an appropriate alignment was not found.\n  IRR_NO_TRUTH,\n  // The text recorded in best choice != truth text, but none of the above\n  // reasons are set.\n  IRR_UNKNOWN,\n\n  IRR_NUM_REASONS\n};\n\n// Blamer-related information to determine the source of errors.\nstruct BlamerBundle {\n  static const char *IncorrectReasonName(IncorrectResultReason irr);\n  BlamerBundle()\n      : truth_has_char_boxes_(false)\n      , incorrect_result_reason_(IRR_CORRECT)\n      , lattice_data_(nullptr) {\n    ClearResults();\n  }\n  BlamerBundle(const BlamerBundle &other) {\n    this->CopyTruth(other);\n    this->CopyResults(other);\n  }\n  ~BlamerBundle() {\n    delete[] lattice_data_;\n  }\n\n  // Accessors.\n  std::string TruthString() const {\n    std::string truth_str;\n    for (auto &text : truth_text_) {\n      truth_str += text;\n    }\n    return truth_str;\n  }\n  IncorrectResultReason incorrect_result_reason() const {\n    return incorrect_result_reason_;\n  }\n  bool NoTruth() const {\n    return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT;\n  }\n  bool HasDebugInfo() const {\n    return debug_.length() > 0 || misadaption_debug_.length() > 0;\n  }\n  const std::string &debug() 
const {\n    return debug_;\n  }\n  const std::string &misadaption_debug() const {\n    return misadaption_debug_;\n  }\n  void UpdateBestRating(float rating) {\n    if (rating < best_correctly_segmented_rating_) {\n      best_correctly_segmented_rating_ = rating;\n    }\n  }\n  int correct_segmentation_length() const {\n    return correct_segmentation_cols_.size();\n  }\n  // Returns true if the given ratings matrix col,row position is included\n  // in the correct segmentation path at the given index.\n  bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) {\n    return correct_segmentation_cols_[index] == coord.col &&\n           correct_segmentation_rows_[index] == coord.row;\n  }\n  void set_best_choice_is_dict_and_top_choice(bool value) {\n    best_choice_is_dict_and_top_choice_ = value;\n  }\n  const char *lattice_data() const {\n    return lattice_data_;\n  }\n  int lattice_size() const {\n    return lattice_size_; // size of lattice_data in bytes\n  }\n  void set_lattice_data(const char *data, int size) {\n    lattice_size_ = size;\n    delete[] lattice_data_;\n    lattice_data_ = new char[lattice_size_];\n    memcpy(lattice_data_, data, lattice_size_);\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  const tesseract::ParamsTrainingBundle &params_training_bundle() const {\n    return params_training_bundle_;\n  }\n  // Adds a new ParamsTrainingHypothesis to the current hypothesis list.\n  void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) {\n    params_training_bundle_.AddHypothesis(hypo);\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n  // Functions to setup the blamer.\n  // Whole word string, whole word bounding box.\n  void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box);\n  // Single \"character\" string, \"character\" bounding box.\n  // May be called multiple times to indicate the characters in a word.\n  void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX 
&char_box);\n  // Marks that there is something wrong with the truth text, like it contains\n  // reject characters.\n  void SetRejectedTruth();\n\n  // Returns true if the provided word_choice is correct.\n  bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const;\n\n  void ClearResults() {\n    norm_truth_word_.DeleteAllBoxes();\n    norm_box_tolerance_ = 0;\n    if (!NoTruth()) {\n      incorrect_result_reason_ = IRR_CORRECT;\n    }\n    debug_ = \"\";\n    segsearch_is_looking_for_blame_ = false;\n    best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;\n    correct_segmentation_cols_.clear();\n    correct_segmentation_rows_.clear();\n    best_choice_is_dict_and_top_choice_ = false;\n    delete[] lattice_data_;\n    lattice_data_ = nullptr;\n    lattice_size_ = 0;\n  }\n  void CopyTruth(const BlamerBundle &other) {\n    truth_has_char_boxes_ = other.truth_has_char_boxes_;\n    truth_word_ = other.truth_word_;\n    truth_text_ = other.truth_text_;\n    incorrect_result_reason_ = (other.NoTruth() ? 
other.incorrect_result_reason_ : IRR_CORRECT);\n  }\n  void CopyResults(const BlamerBundle &other) {\n    norm_truth_word_ = other.norm_truth_word_;\n    norm_box_tolerance_ = other.norm_box_tolerance_;\n    incorrect_result_reason_ = other.incorrect_result_reason_;\n    segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;\n    best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;\n    correct_segmentation_cols_ = other.correct_segmentation_cols_;\n    correct_segmentation_rows_ = other.correct_segmentation_rows_;\n    best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_;\n    if (other.lattice_data_ != nullptr) {\n      lattice_data_ = new char[other.lattice_size_];\n      memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);\n      lattice_size_ = other.lattice_size_;\n    } else {\n      lattice_data_ = nullptr;\n    }\n  }\n  const char *IncorrectReason() const;\n\n  // Appends choice and truth details to the given debug string.\n  void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug);\n\n  // Sets up the norm_truth_word from truth_word using the given DENORM.\n  void SetupNormTruthWord(const DENORM &denorm);\n\n  // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty\n  // bundles) where the right edge/ of the left-hand word is word1_right,\n  // and the left edge of the right-hand word is word2_left.\n  void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,\n                   BlamerBundle *bundle2) const;\n  // \"Joins\" the blames from bundle1 and bundle2 into *this.\n  void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug);\n\n  // If a blob with the same bounding box as one of the truth character\n  // bounding boxes is not classified as the corresponding truth character\n  // blames character classifier for incorrect answer.\n  void BlameClassifier(const 
UNICHARSET &unicharset, const TBOX &blob_box,\n                       const BLOB_CHOICE_LIST &choices, bool debug);\n\n  // Checks whether chops were made at all the character bounding box\n  // boundaries in word->truth_word. If not - blames the chopper for an\n  // incorrect answer.\n  void SetChopperBlame(const WERD_RES *word, bool debug);\n  // Blames the classifier or the language model if, after running only the\n  // chopper, best_choice is incorrect and no blame has been yet set.\n  // Blames the classifier if best_choice is classifier's top choice and is a\n  // dictionary word (i.e. language model could not have helped).\n  // Otherwise, blames the language model (formerly permuter word adjustment).\n  void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,\n                                  bool valid_permuter, bool debug);\n  // Sets up the correct_segmentation_* to mark the correct bounding boxes.\n  void SetupCorrectSegmentation(const TWERD *word, bool debug);\n\n  // Returns true if a guided segmentation search is needed.\n  bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;\n  // Setup ready to guide the segmentation search to the correct segmentation.\n  void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id,\n                        bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points,\n                        double max_char_wh_ratio, WERD_RES *word_res);\n  // Returns true if the guided segsearch is in progress.\n  bool GuidedSegsearchStillGoing() const;\n  // The segmentation search has ended. 
Sets the blame appropriately.\n  void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str);\n\n  // If the bundle is null or still does not indicate the correct result,\n  // fix it and use some backup reason for the blame.\n  static void LastChanceBlame(bool debug, WERD_RES *word);\n\n  // Sets the misadaption debug if this word is incorrect, as this word is\n  // being adapted to.\n  void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);\n\nprivate:\n  // Copy assignment operator (currently unused, therefore private).\n  BlamerBundle &operator=(const BlamerBundle &other) = delete;\n  void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice,\n                bool debug) {\n    incorrect_result_reason_ = irr;\n    debug_ = IncorrectReason();\n    debug_ += \" to blame: \";\n    FillDebugString(msg, choice, debug_);\n    if (debug) {\n      tprintf(\"SetBlame(): %s\", debug_.c_str());\n    }\n  }\n\nprivate:\n  // Set to true when bounding boxes for individual unichars are recorded.\n  bool truth_has_char_boxes_;\n  // Variables used by the segmentation search when looking for the blame.\n  // Set to true while segmentation search is continued after the usual\n  // termination condition in order to look for the blame.\n  bool segsearch_is_looking_for_blame_;\n  // Set to true if best choice is a dictionary word and\n  // classifier's top choice.\n  bool best_choice_is_dict_and_top_choice_;\n  // Tolerance for bounding box comparisons in normalized space.\n  int norm_box_tolerance_;\n  // The true_word (in the original image coordinate space) contains ground\n  // truth bounding boxes for this WERD_RES.\n  tesseract::BoxWord truth_word_;\n  // Same as above, but in normalized coordinates\n  // (filled in by WERD_RES::SetupForRecognition()).\n  tesseract::BoxWord norm_truth_word_;\n  // Contains ground truth unichar for each of the bounding boxes in truth_word.\n  std::vector<std::string> 
truth_text_;\n  // The reason for incorrect OCR result.\n  IncorrectResultReason incorrect_result_reason_;\n  // Debug text associated with the blame.\n  std::string debug_;\n  // Misadaption debug information (filled in if this word was misadapted to).\n  std::string misadaption_debug_;\n  // Vectors populated by SegSearch to indicate column and row indices that\n  // correspond to blobs with correct bounding boxes.\n  std::vector<int> correct_segmentation_cols_;\n  std::vector<int> correct_segmentation_rows_;\n  // Best rating for correctly segmented path\n  // (set and used by SegSearch when looking for blame).\n  float best_correctly_segmented_rating_;\n  int lattice_size_; // size of lattice_data in bytes\n  // Serialized segmentation search lattice.\n  char *lattice_data_;\n  // Information about hypotheses (paths) explored by the segmentation search.\n#ifndef DISABLED_LEGACY_ENGINE\n  tesseract::ParamsTrainingBundle params_training_bundle_;\n#endif // ndef DISABLED_LEGACY_ENGINE\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_BLAMER_H_\n"
  },
  {
    "path": "src/ccstruct/blobbox.cpp",
    "content": "/**********************************************************************\n * File:        blobbox.cpp  (Formerly blobnbox.c)\n * Description: Code for the textord blob class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"blobbox.h\"\n#include \"blobs.h\"   // for TPOINT\n#include \"coutln.h\"  // for C_OUTLINE_IT, C_OUTLINE, C_OUTLINE_LIST\n#include \"environ.h\" // for l_uint32\n#include \"host.h\"    // for NearlyEqual\n#include \"points.h\"  // for operator+=, ICOORD::rotate\n\n#include \"helpers.h\" // for UpdateRange, IntCastRounded\n\n#include <allheaders.h> // for pixGetHeight, pixGetPixel\n\n#include <algorithm> // for max, min\n#include <cmath>\n#include <cstdint>   // for INT32_MAX, INT16_MAX\n\n#define PROJECTION_MARGIN 10 // arbitrary\n\nnamespace tesseract {\n\n// Up to 30 degrees is allowed for rotations of diacritic blobs.\nconst double kCosSmallAngle = 0.866;\n// Min aspect ratio for a joined word to indicate an obvious flow direction.\nconst double kDefiniteAspectRatio = 2.0;\n// Multiple of short length in perimeter to make a joined word.\nconst double kComplexShapePerimeterRatio = 1.5;\n// Min multiple of linesize for 
medium-sized blobs in ReFilterBlobs.\nconst double kMinMediumSizeRatio = 0.25;\n// Max multiple of linesize for medium-sized blobs in ReFilterBlobs.\nconst double kMaxMediumSizeRatio = 4.0;\n\n// Rotates the box and the underlying blob.\nvoid BLOBNBOX::rotate(FCOORD rotation) {\n  cblob_ptr->rotate(rotation);\n  rotate_box(rotation);\n  compute_bounding_box();\n}\n\n// Reflect the box in the y-axis, leaving the underlying blob untouched.\nvoid BLOBNBOX::reflect_box_in_y_axis() {\n  int left = -box.right();\n  box.set_right(-box.left());\n  box.set_left(left);\n}\n\n// Rotates the box by the angle given by rotation.\n// If the blob is a diacritic, then only small rotations for skew\n// correction can be applied.\nvoid BLOBNBOX::rotate_box(FCOORD rotation) {\n  if (IsDiacritic()) {\n    ASSERT_HOST(rotation.x() >= kCosSmallAngle);\n    ICOORD top_pt((box.left() + box.right()) / 2, base_char_top_);\n    ICOORD bottom_pt(top_pt.x(), base_char_bottom_);\n    top_pt.rotate(rotation);\n    base_char_top_ = top_pt.y();\n    bottom_pt.rotate(rotation);\n    base_char_bottom_ = bottom_pt.y();\n    box.rotate(rotation);\n  } else {\n    box.rotate(rotation);\n    set_diacritic_box(box);\n  }\n}\n\n/**********************************************************************\n * BLOBNBOX::merge\n *\n * Merge this blob with the given blob, which should be after this.\n **********************************************************************/\nvoid BLOBNBOX::merge(  // merge blobs\n    BLOBNBOX *nextblob // blob to join with\n) {\n  box += nextblob->box; // merge boxes\n  set_diacritic_box(box);\n  nextblob->joined = true;\n}\n\n// Merge this with other, taking the outlines from other.\n// Other is not deleted, but left for the caller to handle.\nvoid BLOBNBOX::really_merge(BLOBNBOX *other) {\n  if (other->cblob_ptr != nullptr) {\n    C_OUTLINE_IT ol_it(cblob_ptr->out_list());\n    ol_it.add_list_after(other->cblob_ptr->out_list());\n  }\n  
compute_bounding_box();\n}\n\n/**********************************************************************\n * BLOBNBOX::chop\n *\n * Chop this blob into equal sized pieces using the x height as a guide.\n * The blob is not actually chopped. Instead, fake blobs are inserted\n * with the relevant bounding boxes.\n **********************************************************************/\n\nvoid BLOBNBOX::chop(       // chop blobs\n    BLOBNBOX_IT *start_it, // location of this\n    BLOBNBOX_IT *end_it,   // iterator\n    FCOORD rotation,       // for landscape\n    float xheight          // of line\n) {\n  int16_t blobcount;          // no of blobs\n  BLOBNBOX *newblob;          // fake blob\n  BLOBNBOX *blob;             // current blob\n  int16_t blobindex;          // number of chop\n  int16_t leftx;              // left edge of blob\n  float blobwidth;            // width of each\n  float rightx;               // right edge to scan\n  float ymin, ymax;           // limits of new blob\n  float test_ymin, test_ymax; // limits of part blob\n  ICOORD bl, tr;              // corners of box\n  BLOBNBOX_IT blob_it;        // blob iterator\n\n  // get no of chops\n  blobcount = static_cast<int16_t>(std::floor(box.width() / xheight));\n  if (blobcount > 1 && cblob_ptr != nullptr) {\n    // width of each\n    blobwidth = static_cast<float>(box.width() + 1) / blobcount;\n    for (blobindex = blobcount - 1, rightx = box.right(); blobindex >= 0;\n         blobindex--, rightx -= blobwidth) {\n      ymin = static_cast<float>(INT32_MAX);\n      ymax = static_cast<float>(-INT32_MAX);\n      blob_it = *start_it;\n      do {\n        blob = blob_it.data();\n        find_cblob_vlimits(blob->cblob_ptr, rightx - blobwidth, rightx,\n                           /*rotation, */ test_ymin, test_ymax);\n        blob_it.forward();\n        UpdateRange(test_ymin, test_ymax, &ymin, &ymax);\n      } while (blob != end_it->data());\n      if (ymin < ymax) {\n        leftx = 
static_cast<int16_t>(std::floor(rightx - blobwidth));\n        if (leftx < box.left()) {\n          leftx = box.left(); // clip to real box\n        }\n        bl = ICOORD(leftx, static_cast<int16_t>(std::floor(ymin)));\n        tr = ICOORD(static_cast<int16_t>(std::ceil(rightx)), static_cast<int16_t>(std::ceil(ymax)));\n        if (blobindex == 0) {\n          box = TBOX(bl, tr); // change box\n        } else {\n          newblob = new BLOBNBOX;\n          // box is all it has\n          newblob->box = TBOX(bl, tr);\n          // stay on current\n          newblob->base_char_top_ = tr.y();\n          newblob->base_char_bottom_ = bl.y();\n          end_it->add_after_stay_put(newblob);\n        }\n      }\n    }\n  }\n}\n\n// Returns the box gaps between this and its neighbours_ in an array\n// indexed by BlobNeighbourDir.\nvoid BLOBNBOX::NeighbourGaps(int gaps[BND_COUNT]) const {\n  for (int dir = 0; dir < BND_COUNT; ++dir) {\n    gaps[dir] = INT16_MAX;\n    BLOBNBOX *neighbour = neighbours_[dir];\n    if (neighbour != nullptr) {\n      const TBOX &n_box = neighbour->bounding_box();\n      if (dir == BND_LEFT || dir == BND_RIGHT) {\n        gaps[dir] = box.x_gap(n_box);\n      } else {\n        gaps[dir] = box.y_gap(n_box);\n      }\n    }\n  }\n}\n// Returns the min and max horizontal and vertical gaps (from NeighbourGaps)\n// modified so that if the max exceeds the max dimension of the blob, and\n// the min is less, the max is replaced with the min.\n// The objective is to catch cases where there is only a single neighbour\n// and avoid reporting the other gap as a ridiculously large number\nvoid BLOBNBOX::MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const {\n  int max_dimension = std::max(box.width(), box.height());\n  int gaps[BND_COUNT];\n  NeighbourGaps(gaps);\n  *h_min = std::min(gaps[BND_LEFT], gaps[BND_RIGHT]);\n  *h_max = std::max(gaps[BND_LEFT], gaps[BND_RIGHT]);\n  if (*h_max > max_dimension && *h_min < max_dimension) {\n    *h_max 
= *h_min;\n  }\n  *v_min = std::min(gaps[BND_ABOVE], gaps[BND_BELOW]);\n  *v_max = std::max(gaps[BND_ABOVE], gaps[BND_BELOW]);\n  if (*v_max > max_dimension && *v_min < max_dimension) {\n    *v_max = *v_min;\n  }\n}\n\n// Nulls out any neighbours that are DeletableNoise to remove references.\nvoid BLOBNBOX::CleanNeighbours() {\n  for (int dir = 0; dir < BND_COUNT; ++dir) {\n    BLOBNBOX *neighbour = neighbours_[dir];\n    if (neighbour != nullptr && neighbour->DeletableNoise()) {\n      neighbours_[dir] = nullptr;\n      good_stroke_neighbours_[dir] = false;\n    }\n  }\n}\n\n// Returns positive if there is at least one side neighbour that has a similar\n// stroke width and is not on the other side of a rule line.\nint BLOBNBOX::GoodTextBlob() const {\n  int score = 0;\n  for (int dir = 0; dir < BND_COUNT; ++dir) {\n    auto bnd = static_cast<BlobNeighbourDir>(dir);\n    if (good_stroke_neighbour(bnd)) {\n      ++score;\n    }\n  }\n  return score;\n}\n\n// Returns the number of side neighbours that are of type BRT_NOISE.\nint BLOBNBOX::NoisyNeighbours() const {\n  int count = 0;\n  for (int dir = 0; dir < BND_COUNT; ++dir) {\n    auto bnd = static_cast<BlobNeighbourDir>(dir);\n    BLOBNBOX *blob = neighbour(bnd);\n    if (blob != nullptr && blob->region_type() == BRT_NOISE) {\n      ++count;\n    }\n  }\n  return count;\n}\n\n// Returns true, and sets vert_possible/horz_possible if the blob has some\n// feature that makes it individually appear to flow one way.\n// eg if it has a high aspect ratio, yet has a complex shape, such as a\n// joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1 etc.\nbool BLOBNBOX::DefiniteIndividualFlow() {\n  if (cblob() == nullptr) {\n    return false;\n  }\n  int box_perimeter = 2 * (box.height() + box.width());\n  if (box.width() > box.height() * kDefiniteAspectRatio) {\n    // Attempt to distinguish a wide joined word from a dash.\n    // If it is a dash, then its perimeter is approximately\n    // 2 * (box width 
+ stroke width), but more if the outline is noisy,\n    // so perimeter - 2*(box width + stroke width) should be close to zero.\n    // A complex shape such as a joined word should have a much larger value.\n    int perimeter = cblob()->perimeter();\n    if (vert_stroke_width() > 0 || perimeter <= 0) {\n      perimeter -= 2 * vert_stroke_width();\n    } else {\n      perimeter -= 4 * cblob()->area() / perimeter;\n    }\n    perimeter -= 2 * box.width();\n    // Use a multiple of the box perimeter as a threshold.\n    if (perimeter > kComplexShapePerimeterRatio * box_perimeter) {\n      set_vert_possible(false);\n      set_horz_possible(true);\n      return true;\n    }\n  }\n  if (box.height() > box.width() * kDefiniteAspectRatio) {\n    // As above, but for a putative vertical word vs a I/1/l.\n    int perimeter = cblob()->perimeter();\n    if (horz_stroke_width() > 0 || perimeter <= 0) {\n      perimeter -= 2 * horz_stroke_width();\n    } else {\n      perimeter -= 4 * cblob()->area() / perimeter;\n    }\n    perimeter -= 2 * box.height();\n    if (perimeter > kComplexShapePerimeterRatio * box_perimeter) {\n      set_vert_possible(true);\n      set_horz_possible(false);\n      return true;\n    }\n  }\n  return false;\n}\n\n// Returns true if there is no tabstop violation in merging this and other.\nbool BLOBNBOX::ConfirmNoTabViolation(const BLOBNBOX &other) const {\n  if (box.left() < other.box.left() && box.left() < other.left_rule_) {\n    return false;\n  }\n  if (other.box.left() < box.left() && other.box.left() < left_rule_) {\n    return false;\n  }\n  if (box.right() > other.box.right() && box.right() > other.right_rule_) {\n    return false;\n  }\n  if (other.box.right() > box.right() && other.box.right() > right_rule_) {\n    return false;\n  }\n  return true;\n}\n\n// Returns true if other has a similar stroke width to this.\nbool BLOBNBOX::MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance,\n                                   
double constant_tolerance) const {\n  // The perimeter-based width is used as a backup in case there is\n  // no information in the blob.\n  double p_width = area_stroke_width();\n  double n_p_width = other.area_stroke_width();\n  float h_tolerance = horz_stroke_width_ * fractional_tolerance + constant_tolerance;\n  float v_tolerance = vert_stroke_width_ * fractional_tolerance + constant_tolerance;\n  double p_tolerance = p_width * fractional_tolerance + constant_tolerance;\n  bool h_zero = horz_stroke_width_ == 0.0f || other.horz_stroke_width_ == 0.0f;\n  bool v_zero = vert_stroke_width_ == 0.0f || other.vert_stroke_width_ == 0.0f;\n  bool h_ok = !h_zero && NearlyEqual(horz_stroke_width_, other.horz_stroke_width_, h_tolerance);\n  bool v_ok = !v_zero && NearlyEqual(vert_stroke_width_, other.vert_stroke_width_, v_tolerance);\n  bool p_ok = h_zero && v_zero && NearlyEqual(p_width, n_p_width, p_tolerance);\n  // For a match, at least one of the horizontal and vertical widths\n  // must match, and the other one must either match or be zero.\n  // Only if both are zero will we look at the perimeter metric.\n  return p_ok || ((v_ok || h_ok) && (h_ok || h_zero) && (v_ok || v_zero));\n}\n\n// Returns a bounding box of the outline contained within the\n// given horizontal range.\nTBOX BLOBNBOX::BoundsWithinLimits(int left, int right) {\n  FCOORD no_rotation(1.0f, 0.0f);\n  float top = box.top();\n  float bottom = box.bottom();\n  if (cblob_ptr != nullptr) {\n    find_cblob_limits(cblob_ptr, static_cast<float>(left), static_cast<float>(right), no_rotation,\n                      bottom, top);\n  }\n\n  if (top < bottom) {\n    top = box.top();\n    bottom = box.bottom();\n  }\n  FCOORD bot_left(left, bottom);\n  FCOORD top_right(right, top);\n  TBOX shrunken_box(bot_left);\n  TBOX shrunken_box2(top_right);\n  shrunken_box += shrunken_box2;\n  return shrunken_box;\n}\n\n// Estimates and stores the baseline position based on the shape of the\n// outline.\nvoid 
BLOBNBOX::EstimateBaselinePosition() {\n  baseline_y_ = box.bottom(); // The default.\n  if (cblob_ptr == nullptr) {\n    return;\n  }\n  baseline_y_ = cblob_ptr->EstimateBaselinePosition();\n}\n\n// Helper to call CleanNeighbours on all blobs on the list.\nvoid BLOBNBOX::CleanNeighbours(BLOBNBOX_LIST *blobs) {\n  BLOBNBOX_IT blob_it(blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    blob_it.data()->CleanNeighbours();\n  }\n}\n\n// Helper to delete all the deletable blobs on the list.\nvoid BLOBNBOX::DeleteNoiseBlobs(BLOBNBOX_LIST *blobs) {\n  BLOBNBOX_IT blob_it(blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    if (blob->DeletableNoise()) {\n      delete blob->remove_cblob();\n      delete blob_it.extract();\n    }\n  }\n}\n\n// Helper to compute edge offsets for  all the blobs on the list.\n// See coutln.h for an explanation of edge offsets.\nvoid BLOBNBOX::ComputeEdgeOffsets(Image thresholds, Image grey, BLOBNBOX_LIST *blobs) {\n  int grey_height = 0;\n  int thr_height = 0;\n  int scale_factor = 1;\n  if (thresholds != nullptr && grey != nullptr) {\n    grey_height = pixGetHeight(grey);\n    thr_height = pixGetHeight(thresholds);\n    scale_factor = IntCastRounded(static_cast<double>(grey_height) / thr_height);\n  }\n  BLOBNBOX_IT blob_it(blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    if (blob->cblob() != nullptr) {\n      // Get the threshold that applies to this blob.\n      l_uint32 threshold = 128;\n      if (thresholds != nullptr && grey != nullptr) {\n        const TBOX &box = blob->cblob()->bounding_box();\n        // Transform the coordinates if required.\n        TPOINT pt((box.left() + box.right()) / 2, (box.top() + box.bottom()) / 2);\n        pixGetPixel(thresholds, pt.x / scale_factor, thr_height - 1 - pt.y / scale_factor,\n                    
&threshold);\n      }\n      blob->cblob()->ComputeEdgeOffsets(threshold, grey);\n    }\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n// Helper to draw all the blobs on the list in the given body_colour,\n// with child outlines in the child_colour.\nvoid BLOBNBOX::PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,\n                         ScrollView::Color child_colour, ScrollView *win) {\n  BLOBNBOX_IT it(list);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->plot(win, body_colour, child_colour);\n  }\n}\n\n// Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the\n// given list in the given body_colour, with child outlines in the\n// child_colour.\nvoid BLOBNBOX::PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,\n                              ScrollView::Color child_colour, ScrollView *win) {\n  BLOBNBOX_IT it(list);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    if (blob->DeletableNoise()) {\n      blob->plot(win, body_colour, child_colour);\n    }\n  }\n}\n\nScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type) {\n  switch (region_type) {\n    case BRT_HLINE:\n      return ScrollView::BROWN;\n    case BRT_VLINE:\n      return ScrollView::DARK_GREEN;\n    case BRT_RECTIMAGE:\n      return ScrollView::RED;\n    case BRT_POLYIMAGE:\n      return ScrollView::ORANGE;\n    case BRT_UNKNOWN:\n      return flow_type == BTFT_NONTEXT ? 
ScrollView::CYAN : ScrollView::WHITE;\n    case BRT_VERT_TEXT:\n      if (flow_type == BTFT_STRONG_CHAIN || flow_type == BTFT_TEXT_ON_IMAGE) {\n        return ScrollView::GREEN;\n      }\n      if (flow_type == BTFT_CHAIN) {\n        return ScrollView::LIME_GREEN;\n      }\n      return ScrollView::YELLOW;\n    case BRT_TEXT:\n      if (flow_type == BTFT_STRONG_CHAIN) {\n        return ScrollView::BLUE;\n      }\n      if (flow_type == BTFT_TEXT_ON_IMAGE) {\n        return ScrollView::LIGHT_BLUE;\n      }\n      if (flow_type == BTFT_CHAIN) {\n        return ScrollView::MEDIUM_BLUE;\n      }\n      if (flow_type == BTFT_LEADER) {\n        return ScrollView::WHEAT;\n      }\n      if (flow_type == BTFT_NONTEXT) {\n        return ScrollView::PINK;\n      }\n      return ScrollView::MAGENTA;\n    default:\n      return ScrollView::GREY;\n  }\n}\n\n// Keep in sync with BlobRegionType.\nScrollView::Color BLOBNBOX::BoxColor() const {\n  return TextlineColor(region_type_, flow_);\n}\n\nvoid BLOBNBOX::plot(ScrollView *window,               // window to draw in\n                    ScrollView::Color blob_colour,    // for outer bits\n                    ScrollView::Color child_colour) { // for holes\n  if (cblob_ptr != nullptr) {\n    cblob_ptr->plot(window, blob_colour, child_colour);\n  }\n}\n#endif\n/**********************************************************************\n * find_cblob_limits\n *\n * Scan the outlines of the cblob to locate the y min and max\n * between the given x limits.\n **********************************************************************/\n\nvoid find_cblob_limits( // get y limits\n    C_BLOB *blob,       // blob to search\n    float leftx,        // x limits\n    float rightx,\n    FCOORD rotation, // for landscape\n    float &ymin,     // output y limits\n    float &ymax) {\n  int16_t stepindex;  // current point\n  ICOORD pos;         // current coords\n  ICOORD vec;         // rotated step\n  C_OUTLINE *outline; // current outline\n             
         // outlines\n  C_OUTLINE_IT out_it = blob->out_list();\n\n  ymin = static_cast<float>(INT32_MAX);\n  ymax = static_cast<float>(-INT32_MAX);\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    outline = out_it.data();\n    pos = outline->start_pos(); // get coords\n    pos.rotate(rotation);\n    for (stepindex = 0; stepindex < outline->pathlength(); stepindex++) {\n      // inside\n      if (pos.x() >= leftx && pos.x() <= rightx) {\n        UpdateRange(pos.y(), &ymin, &ymax);\n      }\n      vec = outline->step(stepindex);\n      vec.rotate(rotation);\n      pos += vec; // move to next\n    }\n  }\n}\n\n/**********************************************************************\n * find_cblob_vlimits\n *\n * Scan the outlines of the cblob to locate the y min and max\n * between the given x limits.\n **********************************************************************/\n\nvoid find_cblob_vlimits( // get y limits\n    C_BLOB *blob,        // blob to search\n    float leftx,         // x limits\n    float rightx,\n    float &ymin, // output y limits\n    float &ymax) {\n  int16_t stepindex;  // current point\n  ICOORD pos;         // current coords\n  ICOORD vec;         // rotated step\n  C_OUTLINE *outline; // current outline\n                      // outlines\n  C_OUTLINE_IT out_it = blob->out_list();\n\n  ymin = static_cast<float>(INT32_MAX);\n  ymax = static_cast<float>(-INT32_MAX);\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    outline = out_it.data();\n    pos = outline->start_pos(); // get coords\n    for (stepindex = 0; stepindex < outline->pathlength(); stepindex++) {\n      // inside\n      if (pos.x() >= leftx && pos.x() <= rightx) {\n        UpdateRange(pos.y(), &ymin, &ymax);\n      }\n      vec = outline->step(stepindex);\n      pos += vec; // move to next\n    }\n  }\n}\n\n/**********************************************************************\n * find_cblob_hlimits\n *\n * Scan the 
outlines of the cblob to locate the x min and max\n * between the given y limits.\n **********************************************************************/\n\nvoid find_cblob_hlimits( // get x limits\n    C_BLOB *blob,        // blob to search\n    float bottomy,       // y limits\n    float topy,\n    float &xmin, // output x limits\n    float &xmax) {\n  int16_t stepindex;  // current point\n  ICOORD pos;         // current coords\n  ICOORD vec;         // rotated step\n  C_OUTLINE *outline; // current outline\n                      // outlines\n  C_OUTLINE_IT out_it = blob->out_list();\n\n  xmin = static_cast<float>(INT32_MAX);\n  xmax = static_cast<float>(-INT32_MAX);\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    outline = out_it.data();\n    pos = outline->start_pos(); // get coords\n    for (stepindex = 0; stepindex < outline->pathlength(); stepindex++) {\n      // inside\n      if (pos.y() >= bottomy && pos.y() <= topy) {\n        UpdateRange(pos.x(), &xmin, &xmax);\n      }\n      vec = outline->step(stepindex);\n      pos += vec; // move to next\n    }\n  }\n}\n\n/**********************************************************************\n * crotate_cblob\n *\n * Rotate the copy by the given vector and return a C_BLOB.\n **********************************************************************/\n\nC_BLOB *crotate_cblob( // rotate it\n    C_BLOB *blob,      // blob to search\n    FCOORD rotation    // for landscape\n) {\n  C_OUTLINE_LIST out_list; // output outlines\n                           // input outlines\n  C_OUTLINE_IT in_it = blob->out_list();\n  // output outlines\n  C_OUTLINE_IT out_it = &out_list;\n\n  for (in_it.mark_cycle_pt(); !in_it.cycled_list(); in_it.forward()) {\n    out_it.add_after_then_move(new C_OUTLINE(in_it.data(), rotation));\n  }\n  return new C_BLOB(&out_list);\n}\n\n/**********************************************************************\n * box_next\n *\n * Compute the bounding box of this blob with 
merging of x overlaps\n * but no pre-chopping.\n * Then move the iterator on to the start of the next blob.\n **********************************************************************/\n\nTBOX box_next(      // get bounding box\n    BLOBNBOX_IT *it // iterator to blobds\n) {\n  BLOBNBOX *blob; // current blob\n  TBOX result;    // total box\n\n  blob = it->data();\n  result = blob->bounding_box();\n  do {\n    it->forward();\n    blob = it->data();\n    if (blob->cblob() == nullptr) {\n      // was pre-chopped\n      result += blob->bounding_box();\n    }\n  }\n  // until next real blob\n  while ((blob->cblob() == nullptr) || blob->joined_to_prev());\n  return result;\n}\n\n/**********************************************************************\n * box_next_pre_chopped\n *\n * Compute the bounding box of this blob with merging of x overlaps\n * but WITH pre-chopping.\n * Then move the iterator on to the start of the next pre-chopped blob.\n **********************************************************************/\n\nTBOX box_next_pre_chopped( // get bounding box\n    BLOBNBOX_IT *it        // iterator to blobds\n) {\n  BLOBNBOX *blob; // current blob\n  TBOX result;    // total box\n\n  blob = it->data();\n  result = blob->bounding_box();\n  do {\n    it->forward();\n    blob = it->data();\n  }\n  // until next real blob\n  while (blob->joined_to_prev());\n  return result;\n}\n\n/**********************************************************************\n * TO_ROW::TO_ROW\n *\n * Constructor to make a row from a blob.\n **********************************************************************/\n\nTO_ROW::TO_ROW(     // constructor\n    BLOBNBOX *blob, // first blob\n    float top,      // corrected top\n    float bottom,   // of row\n    float row_size  // ideal\n) {\n  clear();\n  y_min = bottom;\n  y_max = top;\n  initial_y_min = bottom;\n\n  float diff;              // in size\n  BLOBNBOX_IT it = &blobs; // list of blobs\n\n  it.add_to_end(blob);\n  diff = top - bottom - 
row_size;\n  if (diff > 0) {\n    y_max -= diff / 2;\n    y_min += diff / 2;\n  }\n  // very small object\n  else if ((top - bottom) * 3 < row_size) {\n    diff = row_size / 3 + bottom - top;\n    y_max += diff / 2;\n    y_min -= diff / 2;\n  }\n}\n\nvoid TO_ROW::print() const {\n  tprintf(\n      \"pitch=%d, fp=%g, fps=%g, fpns=%g, prs=%g, prns=%g,\"\n      \" spacing=%g xh=%g y_origin=%g xev=%d, asc=%g, desc=%g,\"\n      \" body=%g, minsp=%d maxnsp=%d, thr=%d kern=%g sp=%g\\n\",\n      pitch_decision, fixed_pitch, fp_space, fp_nonsp, pr_space, pr_nonsp, spacing, xheight,\n      y_origin, xheight_evidence, ascrise, descdrop, body_size, min_space, max_nonspace,\n      space_threshold, kern_size, space_size);\n}\n\n/**********************************************************************\n * TO_ROW:add_blob\n *\n * Add the blob to the end of the row.\n **********************************************************************/\n\nvoid TO_ROW::add_blob( // constructor\n    BLOBNBOX *blob,    // first blob\n    float top,         // corrected top\n    float bottom,      // of row\n    float row_size     // ideal\n) {\n  float allowed;           // allowed expansion\n  float available;         // expansion\n  BLOBNBOX_IT it = &blobs; // list of blobs\n\n  it.add_to_end(blob);\n  allowed = row_size + y_min - y_max;\n  if (allowed > 0) {\n    available = top > y_max ? 
top - y_max : 0;\n    if (bottom < y_min) {\n      // total available\n      available += y_min - bottom;\n    }\n    if (available > 0) {\n      available += available; // do it gradually\n      if (available < allowed) {\n        available = allowed;\n      }\n      if (bottom < y_min) {\n        y_min -= (y_min - bottom) * allowed / available;\n      }\n      if (top > y_max) {\n        y_max += (top - y_max) * allowed / available;\n      }\n    }\n  }\n}\n\n/**********************************************************************\n * TO_ROW:insert_blob\n *\n * Add the blob to the row in the correct position.\n **********************************************************************/\n\nvoid TO_ROW::insert_blob( // constructor\n    BLOBNBOX *blob        // first blob\n) {\n  BLOBNBOX_IT it = &blobs; // list of blobs\n\n  if (it.empty()) {\n    it.add_before_then_move(blob);\n  } else {\n    it.mark_cycle_pt();\n    while (!it.cycled_list() && it.data()->bounding_box().left() <= blob->bounding_box().left()) {\n      it.forward();\n    }\n    if (it.cycled_list()) {\n      it.add_to_end(blob);\n    } else {\n      it.add_before_stay_put(blob);\n    }\n  }\n}\n\n/**********************************************************************\n * TO_ROW::compute_vertical_projection\n *\n * Compute the vertical projection of a TO_ROW from its blobs.\n **********************************************************************/\n\nvoid TO_ROW::compute_vertical_projection() { // project whole row\n  TBOX row_box;                              // bound of row\n  BLOBNBOX *blob;                            // current blob\n  TBOX blob_box;                             // bounding box\n  BLOBNBOX_IT blob_it = blob_list();\n\n  if (blob_it.empty()) {\n    return;\n  }\n  row_box = blob_it.data()->bounding_box();\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    row_box += blob_it.data()->bounding_box();\n  }\n\n  projection.set_range(row_box.left() - 
PROJECTION_MARGIN, row_box.right() + PROJECTION_MARGIN - 1);\n  projection_left = row_box.left() - PROJECTION_MARGIN;\n  projection_right = row_box.right() + PROJECTION_MARGIN;\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    blob = blob_it.data();\n    if (blob->cblob() != nullptr) {\n      vertical_cblob_projection(blob->cblob(), &projection);\n    }\n  }\n}\n\n/**********************************************************************\n * TO_ROW::clear\n *\n * Zero out all scalar members.\n **********************************************************************/\nvoid TO_ROW::clear() {\n  all_caps = false;\n  used_dm_model = false;\n  projection_left = 0;\n  projection_right = 0;\n  pitch_decision = PITCH_DUNNO;\n  fixed_pitch = 0.0;\n  fp_space = 0.0;\n  fp_nonsp = 0.0;\n  pr_space = 0.0;\n  pr_nonsp = 0.0;\n  spacing = 0.0;\n  xheight = 0.0;\n  xheight_evidence = 0;\n  body_size = 0.0;\n  ascrise = 0.0;\n  descdrop = 0.0;\n  min_space = 0;\n  max_nonspace = 0;\n  space_threshold = 0;\n  kern_size = 0.0;\n  space_size = 0.0;\n  y_min = 0.0;\n  y_max = 0.0;\n  initial_y_min = 0.0;\n  m = 0.0;\n  c = 0.0;\n  error = 0.0;\n  para_c = 0.0;\n  para_error = 0.0;\n  y_origin = 0.0;\n  credibility = 0.0;\n  num_repeated_sets_ = -1;\n}\n\n/**********************************************************************\n * vertical_cblob_projection\n *\n * Compute the vertical projection of a cblob from its outlines\n * and add to the given STATS.\n **********************************************************************/\n\nvoid vertical_cblob_projection( // project outlines\n    C_BLOB *blob,               // blob to project\n    STATS *stats                // output\n) {\n  // outlines of blob\n  C_OUTLINE_IT out_it = blob->out_list();\n\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    vertical_coutline_projection(out_it.data(), stats);\n  }\n}\n\n/**********************************************************************\n 
* vertical_coutline_projection\n *\n * Compute the vertical projection of an outline from its outlines\n * and add to the given STATS.\n **********************************************************************/\n\nvoid vertical_coutline_projection( // project outlines\n    C_OUTLINE *outline,            // outline to project\n    STATS *stats                   // output\n) {\n  ICOORD pos;        // current point\n  ICOORD step;       // edge step\n  int32_t length;    // of outline\n  int16_t stepindex; // current step\n  C_OUTLINE_IT out_it = outline->child();\n\n  pos = outline->start_pos();\n  length = outline->pathlength();\n  for (stepindex = 0; stepindex < length; stepindex++) {\n    step = outline->step(stepindex);\n    if (step.x() > 0) {\n      stats->add(pos.x(), -pos.y());\n    } else if (step.x() < 0) {\n      stats->add(pos.x() - 1, pos.y());\n    }\n    pos += step;\n  }\n\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    vertical_coutline_projection(out_it.data(), stats);\n  }\n}\n\n/**********************************************************************\n * TO_BLOCK::TO_BLOCK\n *\n * Constructor to make a TO_BLOCK from a real block.\n **********************************************************************/\n\nTO_BLOCK::TO_BLOCK(  // make a block\n    BLOCK *src_block // real block\n) {\n  clear();\n  block = src_block;\n}\n\n/**********************************************************************\n * TO_BLOCK::clear\n *\n * Zero out all scalar members.\n **********************************************************************/\nvoid TO_BLOCK::clear() {\n  block = nullptr;\n  pitch_decision = PITCH_DUNNO;\n  line_spacing = 0.0;\n  line_size = 0.0;\n  max_blob_size = 0.0;\n  baseline_offset = 0.0;\n  xheight = 0.0;\n  fixed_pitch = 0.0;\n  kern_size = 0.0;\n  space_size = 0.0;\n  min_space = 0;\n  max_nonspace = 0;\n  fp_space = 0.0;\n  fp_nonsp = 0.0;\n  pr_space = 0.0;\n  pr_nonsp = 0.0;\n  key_row = 
nullptr;\n}\n\nTO_BLOCK::~TO_BLOCK() {\n  // Any residual BLOBNBOXes at this stage own their blobs, so delete them.\n  BLOBNBOX::clear_blobnboxes(&blobs);\n  BLOBNBOX::clear_blobnboxes(&underlines);\n  BLOBNBOX::clear_blobnboxes(&noise_blobs);\n  BLOBNBOX::clear_blobnboxes(&small_blobs);\n  BLOBNBOX::clear_blobnboxes(&large_blobs);\n}\n\n// Helper function to divide the input blobs over noise, small, medium\n// and large lists. Blobs small in height and (small in width or large in width)\n// go in the noise list. Dash (-) candidates go in the small list, and\n// medium and large are by height.\n// SIDE-EFFECT: reset all blobs to initial state by calling Init().\nstatic void SizeFilterBlobs(int min_height, int max_height, BLOBNBOX_LIST *src_list,\n                            BLOBNBOX_LIST *noise_list, BLOBNBOX_LIST *small_list,\n                            BLOBNBOX_LIST *medium_list, BLOBNBOX_LIST *large_list) {\n  BLOBNBOX_IT noise_it(noise_list);\n  BLOBNBOX_IT small_it(small_list);\n  BLOBNBOX_IT medium_it(medium_list);\n  BLOBNBOX_IT large_it(large_list);\n  for (BLOBNBOX_IT src_it(src_list); !src_it.empty(); src_it.forward()) {\n    BLOBNBOX *blob = src_it.extract();\n    blob->ReInit();\n    int width = blob->bounding_box().width();\n    int height = blob->bounding_box().height();\n    if (height < min_height && (width < min_height || width > max_height)) {\n      noise_it.add_after_then_move(blob);\n    } else if (height > max_height) {\n      large_it.add_after_then_move(blob);\n    } else if (height < min_height) {\n      small_it.add_after_then_move(blob);\n    } else {\n      medium_it.add_after_then_move(blob);\n    }\n  }\n}\n\n// Reorganize the blob lists with a different definition of small, medium\n// and large, compared to the original definition.\n// Height is still the primary filter key, but medium width blobs of small\n// height become small, and very wide blobs of small height stay noise, along\n// with small dot-shaped blobs.\nvoid 
TO_BLOCK::ReSetAndReFilterBlobs() {\n  int min_height = IntCastRounded(kMinMediumSizeRatio * line_size);\n  int max_height = IntCastRounded(kMaxMediumSizeRatio * line_size);\n  BLOBNBOX_LIST noise_list;\n  BLOBNBOX_LIST small_list;\n  BLOBNBOX_LIST medium_list;\n  BLOBNBOX_LIST large_list;\n  SizeFilterBlobs(min_height, max_height, &blobs, &noise_list, &small_list, &medium_list,\n                  &large_list);\n  SizeFilterBlobs(min_height, max_height, &large_blobs, &noise_list, &small_list, &medium_list,\n                  &large_list);\n  SizeFilterBlobs(min_height, max_height, &small_blobs, &noise_list, &small_list, &medium_list,\n                  &large_list);\n  SizeFilterBlobs(min_height, max_height, &noise_blobs, &noise_list, &small_list, &medium_list,\n                  &large_list);\n  BLOBNBOX_IT blob_it(&blobs);\n  blob_it.add_list_after(&medium_list);\n  blob_it.set_to_list(&large_blobs);\n  blob_it.add_list_after(&large_list);\n  blob_it.set_to_list(&small_blobs);\n  blob_it.add_list_after(&small_list);\n  blob_it.set_to_list(&noise_blobs);\n  blob_it.add_list_after(&noise_list);\n}\n\n// Deletes noise blobs from all lists where not owned by a ColPartition.\nvoid TO_BLOCK::DeleteUnownedNoise() {\n  BLOBNBOX::CleanNeighbours(&blobs);\n  BLOBNBOX::CleanNeighbours(&small_blobs);\n  BLOBNBOX::CleanNeighbours(&noise_blobs);\n  BLOBNBOX::CleanNeighbours(&large_blobs);\n  BLOBNBOX::DeleteNoiseBlobs(&blobs);\n  BLOBNBOX::DeleteNoiseBlobs(&small_blobs);\n  BLOBNBOX::DeleteNoiseBlobs(&noise_blobs);\n  BLOBNBOX::DeleteNoiseBlobs(&large_blobs);\n}\n\n// Computes and stores the edge offsets on each blob for use in feature\n// extraction, using greyscale if the supplied grey and thresholds pixes\n// are 8-bit or otherwise (if nullptr or not 8 bit) the original binary\n// edge step outlines.\n// Thresholds must either be the same size as grey or an integer down-scale\n// of grey.\n// See coutln.h for an explanation of edge offsets.\nvoid 
TO_BLOCK::ComputeEdgeOffsets(Image thresholds, Image grey) {\n  BLOBNBOX::ComputeEdgeOffsets(thresholds, grey, &blobs);\n  BLOBNBOX::ComputeEdgeOffsets(thresholds, grey, &small_blobs);\n  BLOBNBOX::ComputeEdgeOffsets(thresholds, grey, &noise_blobs);\n}\n\n#ifndef GRAPHICS_DISABLED\n// Draw the noise blobs from all lists in red.\nvoid TO_BLOCK::plot_noise_blobs(ScrollView *win) {\n  BLOBNBOX::PlotNoiseBlobs(&noise_blobs, ScrollView::RED, ScrollView::RED, win);\n  BLOBNBOX::PlotNoiseBlobs(&small_blobs, ScrollView::RED, ScrollView::RED, win);\n  BLOBNBOX::PlotNoiseBlobs(&large_blobs, ScrollView::RED, ScrollView::RED, win);\n  BLOBNBOX::PlotNoiseBlobs(&blobs, ScrollView::RED, ScrollView::RED, win);\n}\n\n// Draw the blobs on the various lists in the block in different colors.\nvoid TO_BLOCK::plot_graded_blobs(ScrollView *win) {\n  BLOBNBOX::PlotBlobs(&noise_blobs, ScrollView::CORAL, ScrollView::BLUE, win);\n  BLOBNBOX::PlotBlobs(&small_blobs, ScrollView::GOLDENROD, ScrollView::YELLOW, win);\n  BLOBNBOX::PlotBlobs(&large_blobs, ScrollView::DARK_GREEN, ScrollView::YELLOW, win);\n  BLOBNBOX::PlotBlobs(&blobs, ScrollView::WHITE, ScrollView::BROWN, win);\n}\n\n/**********************************************************************\n * plot_blob_list\n *\n * Draw a list of blobs.\n **********************************************************************/\n\nvoid plot_blob_list(ScrollView *win,                  // window to draw in\n                    BLOBNBOX_LIST *list,              // blob list\n                    ScrollView::Color body_colour,    // colour to draw\n                    ScrollView::Color child_colour) { // colour of child\n  BLOBNBOX_IT it = list;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->plot(win, body_colour, child_colour);\n  }\n}\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/blobbox.h",
    "content": "/**********************************************************************\n * File:        blobbox.h  (Formerly blobnbox.h)\n * Description: Code for the textord blob class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef BLOBBOX_H\n#define BLOBBOX_H\n\n#include \"elst.h\"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK\n#include \"elst2.h\"      // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK\n#include \"errcode.h\"    // for ASSERT_HOST\n#include \"ocrblock.h\"   // for BLOCK\n#include \"params.h\"     // for DoubleParam, double_VAR_H\n#include \"pdblock.h\"    // for PDBLK\n#include \"points.h\"     // for FCOORD, ICOORD, ICOORDELT_LIST\n#include \"quspline.h\"   // for QSPLINE\n#include \"rect.h\"       // for TBOX\n#include \"scrollview.h\" // for ScrollView, ScrollView::Color\n#include \"statistc.h\"   // for STATS\n#include \"stepblob.h\"   // for C_BLOB\n#include \"tprintf.h\"    // for tprintf\n#include \"werd.h\"       // for WERD_LIST\n\n#include <cinttypes> // for PRId32\n#include <cmath>     // for std::sqrt\n#include <cstdint>   // for int16_t, int32_t\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass C_OUTLINE;\n\nenum PITCH_TYPE {\n  PITCH_DUNNO,       // insufficient data\n  PITCH_DEF_FIXED,   // definitely fixed\n  PITCH_MAYBE_FIXED, // could be\n  
PITCH_DEF_PROP,\n  PITCH_MAYBE_PROP,\n  PITCH_CORR_FIXED,\n  PITCH_CORR_PROP\n};\n\n// The possible tab-stop types of each side of a BLOBNBOX.\n// The ordering is important, as it is used for deleting dead-ends in the\n// search. ALIGNED, CONFIRMED and VLINE should remain greater than the\n// non-aligned, unset, or deleted members.\nenum TabType {\n  TT_NONE,          // Not a tab.\n  TT_DELETED,       // Not a tab after detailed analysis.\n  TT_MAYBE_RAGGED,  // Initial designation of a tab-stop candidate.\n  TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.\n  TT_CONFIRMED,     // Aligned with neighbours.\n  TT_VLINE          // Detected as a vertical line.\n};\n\n// The possible region types of a BLOBNBOX.\n// Note: keep all the text types > BRT_UNKNOWN and all the image types less.\n// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the\n// *Type static functions below.\nenum BlobRegionType {\n  BRT_NOISE,     // Neither text nor image.\n  BRT_HLINE,     // Horizontal separator line.\n  BRT_VLINE,     // Vertical separator line.\n  BRT_RECTIMAGE, // Rectangular image.\n  BRT_POLYIMAGE, // Non-rectangular image.\n  BRT_UNKNOWN,   // Not determined yet.\n  BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.\n  BRT_TEXT,      // Convincing text.\n\n  BRT_COUNT // Number of possibilities.\n};\n\n// enum for elements of arrays that refer to neighbours.\n// NOTE: keep in this order, so ^2 can be used to flip direction.\nenum BlobNeighbourDir { BND_LEFT, BND_BELOW, BND_RIGHT, BND_ABOVE, BND_COUNT };\n\n// enum for special type of text characters, such as math symbol or italic.\nenum BlobSpecialTextType {\n  BSTT_NONE,    // No special.\n  BSTT_ITALIC,  // Italic style.\n  BSTT_DIGIT,   // Digit symbols.\n  BSTT_MATH,    // Mathematical symbols (not including digit).\n  BSTT_UNCLEAR, // Characters with low recognition rate.\n  BSTT_SKIP,    // Characters that we skip labeling (usually too small).\n  
BSTT_COUNT\n};\n\ninline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {\n  return static_cast<BlobNeighbourDir>(dir ^ 2);\n}\n\n// BlobTextFlowType indicates the quality of neighbouring information\n// related to a chain of connected components, either horizontally or\n// vertically. Also used by ColPartition for the collection of blobs\n// within, which should all have the same value in most cases.\nenum BlobTextFlowType {\n  BTFT_NONE,          // No text flow set yet.\n  BTFT_NONTEXT,       // Flow too poor to be likely text.\n  BTFT_NEIGHBOURS,    // Neighbours support flow in this direction.\n  BTFT_CHAIN,         // There is a weak chain of text in this direction.\n  BTFT_STRONG_CHAIN,  // There is a strong chain of text in this direction.\n  BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.\n  BTFT_LEADER,        // Leader dots/dashes etc.\n  BTFT_COUNT\n};\n\n// Returns true if type1 dominates type2 in a merge. Mostly determined by the\n// ordering of the enum, LEADER is weak and dominates nothing.\n// The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that\n// this cannot be true if t1 == t2, so the result is undefined.\ninline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {\n  // LEADER always loses.\n  if (type1 == BTFT_LEADER) {\n    return false;\n  }\n  if (type2 == BTFT_LEADER) {\n    return true;\n  }\n  // With those out of the way, the ordering of the enum determines the result.\n  return type1 >= type2;\n}\n\nclass ColPartition;\n\nclass BLOBNBOX;\nELISTIZEH(BLOBNBOX)\nclass BLOBNBOX : public ELIST<BLOBNBOX>::LINK {\npublic:\n  BLOBNBOX() {\n    ReInit();\n  }\n  explicit BLOBNBOX(C_BLOB *srcblob) {\n    box = srcblob->bounding_box();\n    ReInit();\n    cblob_ptr = srcblob;\n    area = static_cast<int>(srcblob->area());\n  }\n  ~BLOBNBOX() {\n    if (owns_cblob_) {\n      delete cblob_ptr;\n    }\n  }\n\n  static void clear_blobnboxes(BLOBNBOX_LIST *boxes) {\n    BLOBNBOX_IT it = 
boxes;\n    // A BLOBNBOX generally doesn't own its blobs, so if they do, you\n    // have to delete them explicitly.\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      BLOBNBOX *box = it.data();\n      // TODO: remove next line, currently still needed for resultiterator_test.\n      delete box->remove_cblob();\n    }\n  }\n\n  static BLOBNBOX *RealBlob(C_OUTLINE *outline) {\n    auto *blob = new C_BLOB(outline);\n    return new BLOBNBOX(blob);\n  }\n\n  // Rotates the box and the underlying blob.\n  void rotate(FCOORD rotation);\n\n  // Methods that act on the box without touching the underlying blob.\n  // Reflect the box in the y-axis, leaving the underlying blob untouched.\n  void reflect_box_in_y_axis();\n  // Rotates the box by the angle given by rotation.\n  // If the blob is a diacritic, then only small rotations for skew\n  // correction can be applied.\n  void rotate_box(FCOORD rotation);\n  // Moves just the box by the given vector.\n  void translate_box(ICOORD v) {\n    if (IsDiacritic()) {\n      box.move(v);\n      base_char_top_ += v.y();\n      base_char_bottom_ += v.y();\n    } else {\n      box.move(v);\n      set_diacritic_box(box);\n    }\n  }\n  void merge(BLOBNBOX *nextblob);\n  void really_merge(BLOBNBOX *other);\n  void chop(                 // fake chop blob\n      BLOBNBOX_IT *start_it, // location of this\n      BLOBNBOX_IT *blob_it,  // iterator\n      FCOORD rotation,       // for landscape\n      float xheight);        // line height\n\n  void NeighbourGaps(int gaps[BND_COUNT]) const;\n  void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const;\n  void CleanNeighbours();\n  // Returns positive if there is at least one side neighbour that has a\n  // similar stroke width and is not on the other side of a rule line.\n  int GoodTextBlob() const;\n  // Returns the number of side neighbours that are of type BRT_NOISE.\n  int NoisyNeighbours() const;\n\n  // Returns true if the blob is noise and has 
no owner.\n  bool DeletableNoise() const {\n    return owner() == nullptr && region_type() == BRT_NOISE;\n  }\n\n  // Returns true, and sets vert_possible/horz_possible if the blob has some\n  // feature that makes it individually appear to flow one way.\n  // eg if it has a high aspect ratio, yet has a complex shape, such as a\n  // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.\n  bool DefiniteIndividualFlow();\n\n  // Returns true if there is no tabstop violation in merging this and other.\n  bool ConfirmNoTabViolation(const BLOBNBOX &other) const;\n\n  // Returns true if other has a similar stroke width to this.\n  bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance,\n                           double constant_tolerance) const;\n\n  // Returns a bounding box of the outline contained within the\n  // given horizontal range.\n  TBOX BoundsWithinLimits(int left, int right);\n\n  // Estimates and stores the baseline position based on the shape of the\n  // outline.\n  void EstimateBaselinePosition();\n\n  // Simple accessors.\n  const TBOX &bounding_box() const {\n    return box;\n  }\n  // Set the bounding box. 
Use with caution.\n  // Normally use compute_bounding_box instead.\n  void set_bounding_box(const TBOX &new_box) {\n    box = new_box;\n    base_char_top_ = box.top();\n    base_char_bottom_ = box.bottom();\n  }\n  void compute_bounding_box() {\n    box = cblob_ptr->bounding_box();\n    base_char_top_ = box.top();\n    base_char_bottom_ = box.bottom();\n    baseline_y_ = box.bottom();\n  }\n  const TBOX &reduced_box() const {\n    return red_box;\n  }\n  void set_reduced_box(TBOX new_box) {\n    red_box = new_box;\n    reduced = true;\n  }\n  int32_t enclosed_area() const {\n    return area;\n  }\n  bool joined_to_prev() const {\n    return joined;\n  }\n  bool red_box_set() const {\n    return reduced;\n  }\n  int repeated_set() const {\n    return repeated_set_;\n  }\n  void set_repeated_set(int set_id) {\n    repeated_set_ = set_id;\n  }\n  C_BLOB *cblob() const {\n    return cblob_ptr;\n  }\n  C_BLOB *remove_cblob() {\n    auto blob = cblob_ptr;\n    cblob_ptr = nullptr;\n    owns_cblob_ = false;\n    return blob;\n  }\n  TabType left_tab_type() const {\n    return left_tab_type_;\n  }\n  void set_left_tab_type(TabType new_type) {\n    left_tab_type_ = new_type;\n  }\n  TabType right_tab_type() const {\n    return right_tab_type_;\n  }\n  void set_right_tab_type(TabType new_type) {\n    right_tab_type_ = new_type;\n  }\n  BlobRegionType region_type() const {\n    return region_type_;\n  }\n  void set_region_type(BlobRegionType new_type) {\n    region_type_ = new_type;\n  }\n  BlobSpecialTextType special_text_type() const {\n    return spt_type_;\n  }\n  void set_special_text_type(BlobSpecialTextType new_type) {\n    spt_type_ = new_type;\n  }\n  BlobTextFlowType flow() const {\n    return flow_;\n  }\n  void set_flow(BlobTextFlowType value) {\n    flow_ = value;\n  }\n  bool vert_possible() const {\n    return vert_possible_;\n  }\n  void set_vert_possible(bool value) {\n    vert_possible_ = value;\n  }\n  bool horz_possible() const {\n    return 
horz_possible_;\n  }\n  void set_horz_possible(bool value) {\n    horz_possible_ = value;\n  }\n  int left_rule() const {\n    return left_rule_;\n  }\n  void set_left_rule(int new_left) {\n    left_rule_ = new_left;\n  }\n  int right_rule() const {\n    return right_rule_;\n  }\n  void set_right_rule(int new_right) {\n    right_rule_ = new_right;\n  }\n  int left_crossing_rule() const {\n    return left_crossing_rule_;\n  }\n  void set_left_crossing_rule(int new_left) {\n    left_crossing_rule_ = new_left;\n  }\n  int right_crossing_rule() const {\n    return right_crossing_rule_;\n  }\n  void set_right_crossing_rule(int new_right) {\n    right_crossing_rule_ = new_right;\n  }\n  float horz_stroke_width() const {\n    return horz_stroke_width_;\n  }\n  void set_horz_stroke_width(float width) {\n    horz_stroke_width_ = width;\n  }\n  float vert_stroke_width() const {\n    return vert_stroke_width_;\n  }\n  void set_vert_stroke_width(float width) {\n    vert_stroke_width_ = width;\n  }\n  float area_stroke_width() const {\n    return area_stroke_width_;\n  }\n  tesseract::ColPartition *owner() const {\n    return owner_;\n  }\n  void set_owner(tesseract::ColPartition *new_owner) {\n    owner_ = new_owner;\n  }\n  bool leader_on_left() const {\n    return leader_on_left_;\n  }\n  void set_leader_on_left(bool flag) {\n    leader_on_left_ = flag;\n  }\n  bool leader_on_right() const {\n    return leader_on_right_;\n  }\n  void set_leader_on_right(bool flag) {\n    leader_on_right_ = flag;\n  }\n  BLOBNBOX *neighbour(BlobNeighbourDir n) const {\n    return neighbours_[n];\n  }\n  bool good_stroke_neighbour(BlobNeighbourDir n) const {\n    return good_stroke_neighbours_[n];\n  }\n  void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good) {\n    neighbours_[n] = neighbour;\n    good_stroke_neighbours_[n] = good;\n  }\n  bool IsDiacritic() const {\n    return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();\n  }\n  int base_char_top() const 
{\n    return base_char_top_;\n  }\n  int base_char_bottom() const {\n    return base_char_bottom_;\n  }\n  int baseline_position() const {\n    return baseline_y_;\n  }\n  int line_crossings() const {\n    return line_crossings_;\n  }\n  void set_line_crossings(int value) {\n    line_crossings_ = value;\n  }\n  void set_diacritic_box(const TBOX &diacritic_box) {\n    base_char_top_ = diacritic_box.top();\n    base_char_bottom_ = diacritic_box.bottom();\n  }\n  BLOBNBOX *base_char_blob() const {\n    return base_char_blob_;\n  }\n  void set_base_char_blob(BLOBNBOX *blob) {\n    base_char_blob_ = blob;\n  }\n  void set_owns_cblob(bool value) {\n    owns_cblob_ = value;\n  }\n\n  bool UniquelyVertical() const {\n    return vert_possible_ && !horz_possible_;\n  }\n  bool UniquelyHorizontal() const {\n    return horz_possible_ && !vert_possible_;\n  }\n\n  // Returns true if the region type is text.\n  static bool IsTextType(BlobRegionType type) {\n    return type == BRT_TEXT || type == BRT_VERT_TEXT;\n  }\n  // Returns true if the region type is image.\n  static bool IsImageType(BlobRegionType type) {\n    return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;\n  }\n  // Returns true if the region type is line.\n  static bool IsLineType(BlobRegionType type) {\n    return type == BRT_HLINE || type == BRT_VLINE;\n  }\n  // Returns true if the region type cannot be merged.\n  static bool UnMergeableType(BlobRegionType type) {\n    return IsLineType(type) || IsImageType(type);\n  }\n  // Helper to call CleanNeighbours on all blobs on the list.\n  static void CleanNeighbours(BLOBNBOX_LIST *blobs);\n  // Helper to delete all the deletable blobs on the list.\n  static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs);\n  // Helper to compute edge offsets for  all the blobs on the list.\n  // See coutln.h for an explanation of edge offsets.\n  static void ComputeEdgeOffsets(Image thresholds, Image grey, BLOBNBOX_LIST *blobs);\n\n#ifndef GRAPHICS_DISABLED\n  // Helper to draw all 
the blobs on the list in the given body_colour,\n  // with child outlines in the child_colour.\n  static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,\n                        ScrollView::Color child_colour, ScrollView *win);\n  // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the\n  // given list in the given body_colour, with child outlines in the\n  // child_colour.\n  static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,\n                             ScrollView::Color child_colour, ScrollView *win);\n\n  static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type);\n\n  // Keep in sync with BlobRegionType.\n  ScrollView::Color BoxColor() const;\n\n  void plot(ScrollView *window,              // window to draw in\n            ScrollView::Color blob_colour,   // for outer bits\n            ScrollView::Color child_colour); // for holes\n#endif\n\n  // Initializes members set by StrokeWidth and beyond, without discarding\n  // stored area and strokewidth values, which are expensive to calculate.\n  void ReInit() {\n    joined = false;\n    reduced = false;\n    repeated_set_ = 0;\n    left_tab_type_ = TT_NONE;\n    right_tab_type_ = TT_NONE;\n    region_type_ = BRT_UNKNOWN;\n    flow_ = BTFT_NONE;\n    spt_type_ = BSTT_SKIP;\n    left_rule_ = 0;\n    right_rule_ = 0;\n    left_crossing_rule_ = 0;\n    right_crossing_rule_ = 0;\n    if (area_stroke_width_ == 0.0f && area > 0 && cblob() != nullptr && cblob()->perimeter() != 0) {\n      area_stroke_width_ = 2.0f * area / cblob()->perimeter();\n    }\n    owner_ = nullptr;\n    base_char_top_ = box.top();\n    base_char_bottom_ = box.bottom();\n    baseline_y_ = box.bottom();\n    line_crossings_ = 0;\n    base_char_blob_ = nullptr;\n    horz_possible_ = false;\n    vert_possible_ = false;\n    leader_on_left_ = false;\n    leader_on_right_ = false;\n    ClearNeighbours();\n  }\n\n  void ClearNeighbours() {\n    for (int 
n = 0; n < BND_COUNT; ++n) {\n      neighbours_[n] = nullptr;\n      good_stroke_neighbours_[n] = false;\n    }\n  }\n\nprivate:\n  C_BLOB *cblob_ptr = nullptr;               // edgestep blob\n  TBOX box;                                  // bounding box\n  TBOX red_box;                              // bounding box\n  int32_t area = 0;                          // enclosed area\n  int32_t repeated_set_ = 0;                 // id of the set of repeated blobs\n  TabType left_tab_type_ = TT_NONE;          // Indicates tab-stop assessment\n  TabType right_tab_type_ = TT_NONE;         // Indicates tab-stop assessment\n  BlobRegionType region_type_ = BRT_UNKNOWN; // Type of region this blob belongs to\n  BlobTextFlowType flow_ = BTFT_NONE;        // Quality of text flow.\n  BlobSpecialTextType spt_type_;             // Special text type.\n  bool joined = false;                       // joined to prev\n  bool reduced = false;                      // reduced box set\n  int16_t left_rule_ = 0;                    // x-coord of nearest but not crossing rule line\n  int16_t right_rule_ = 0;                   // x-coord of nearest but not crossing rule line\n  int16_t left_crossing_rule_;               // x-coord of nearest or crossing rule line\n  int16_t right_crossing_rule_;              // x-coord of nearest or crossing rule line\n  int16_t base_char_top_;                    // y-coord of top/bottom of diacritic base,\n  int16_t base_char_bottom_;                 // if it exists else top/bottom of this blob.\n  int16_t baseline_y_;                       // Estimate of baseline position.\n  int32_t line_crossings_;                   // Number of line intersections touched.\n  BLOBNBOX *base_char_blob_;                 // The blob that was the base char.\n  tesseract::ColPartition *owner_;           // Who will delete me when I am not needed\n  BLOBNBOX *neighbours_[BND_COUNT];\n  float horz_stroke_width_ = 0.0f; // Median horizontal stroke width\n  float vert_stroke_width_ = 
0.0f; // Median vertical stroke width\n  float area_stroke_width_ = 0.0f; // Stroke width from area/perimeter ratio.\n  bool good_stroke_neighbours_[BND_COUNT];\n  bool horz_possible_;   // Could be part of horizontal flow.\n  bool vert_possible_;   // Could be part of vertical flow.\n  bool leader_on_left_;  // There is a leader to the left.\n  bool leader_on_right_; // There is a leader to the right.\n  // Iff true, then the destructor should delete the cblob_ptr.\n  // TODO(rays) migrate all uses to correctly setting this flag instead of\n  // deleting the C_BLOB before deleting the BLOBNBOX.\n  bool owns_cblob_ = false;\n};\n\nclass TO_ROW : public ELIST2<TO_ROW>::LINK {\npublic:\n  static const int kErrorWeight = 3;\n\n  TO_ROW() {\n    clear();\n  }                   // empty\n  TO_ROW(             // constructor\n      BLOBNBOX *blob, // from first blob\n      float top,      // of row //target height\n      float bottom, float row_size);\n\n  void print() const;\n  float max_y() const { // access function\n    return y_max;\n  }\n  float min_y() const {\n    return y_min;\n  }\n  float mean_y() const {\n    return (y_min + y_max) / 2.0f;\n  }\n  float initial_min_y() const {\n    return initial_y_min;\n  }\n  float line_m() const { // access to line fit\n    return m;\n  }\n  float line_c() const {\n    return c;\n  }\n  float line_error() const {\n    return error;\n  }\n  float parallel_c() const {\n    return para_c;\n  }\n  float parallel_error() const {\n    return para_error;\n  }\n  float believability() const { // baseline goodness\n    return credibility;\n  }\n  float intercept() const { // real parallel_c\n    return y_origin;\n  }\n  void add_blob(      // put in row\n      BLOBNBOX *blob, // blob to add\n      float top,      // of row //target height\n      float bottom, float row_size);\n  void insert_blob( // put in row in order\n      BLOBNBOX *blob);\n\n  BLOBNBOX_LIST *blob_list() { // get list\n    return &blobs;\n  }\n\n  void set_line( 
  // set line spec\n      float new_m, // line to set\n      float new_c, float new_error) {\n    m = new_m;\n    c = new_c;\n    error = new_error;\n  }\n  void set_parallel_line( // set fixed gradient line\n      float gradient,     // page gradient\n      float new_c, float new_error) {\n    para_c = new_c;\n    para_error = new_error;\n    credibility = blobs.length() - kErrorWeight * new_error;\n    y_origin = new_c / std::sqrt(1 + gradient * gradient);\n    // real intercept\n  }\n  void set_limits(     // set min,max\n      float new_min,   // bottom and\n      float new_max) { // top of row\n    y_min = new_min;\n    y_max = new_max;\n  }\n  void compute_vertical_projection();\n  // get projection\n\n  bool rep_chars_marked() const {\n    return num_repeated_sets_ != -1;\n  }\n  void clear_rep_chars_marked() {\n    num_repeated_sets_ = -1;\n  }\n  int num_repeated_sets() const {\n    return num_repeated_sets_;\n  }\n  void set_num_repeated_sets(int num_sets) {\n    num_repeated_sets_ = num_sets;\n  }\n\n  // true when dead\n  bool merged = false;\n  bool all_caps;             // had no ascenders\n  bool used_dm_model;        // in guessing pitch\n  int16_t projection_left;   // start of projection\n  int16_t projection_right;  // start of projection\n  PITCH_TYPE pitch_decision; // how strong is decision\n  float fixed_pitch;         // pitch or 0\n  float fp_space;            // sp if fixed pitch\n  float fp_nonsp;            // nonsp if fixed pitch\n  float pr_space;            // sp if prop\n  float pr_nonsp;            // non sp if prop\n  float spacing;             // to \"next\" row\n  float xheight;             // of line\n  int xheight_evidence;      // number of blobs of height xheight\n  float ascrise;             // ascenders\n  float descdrop;            // descenders\n  float body_size;           // of CJK characters.  
Assumed to be\n                             // xheight+ascrise for non-CJK text.\n  int32_t min_space;         // min size for real space\n  int32_t max_nonspace;      // max size of non-space\n  int32_t space_threshold;   // space vs nonspace\n  float kern_size;           // average non-space\n  float space_size;          // average space\n  WERD_LIST rep_words;       // repeated chars\n  ICOORDELT_LIST char_cells; // fixed pitch cells\n  QSPLINE baseline;          // curved baseline\n  STATS projection;          // vertical projection\n\nprivate:\n  void clear(); // clear all values to reasonable defaults\n\n  BLOBNBOX_LIST blobs; // blobs in row\n  float y_min;         // coords\n  float y_max;\n  float initial_y_min;\n  float m, c;   // line spec\n  float error;  // line error\n  float para_c; // constrained fit\n  float para_error;\n  float y_origin;         // rotated para_c;\n  float credibility;      // baseline believability\n  int num_repeated_sets_; // number of sets of repeated blobs\n                          // set to -1 if we have not searched\n                          // for repeated blobs in this row yet\n};\n\nELIST2IZEH(TO_ROW)\nclass TESS_API TO_BLOCK : public ELIST<TO_BLOCK>::LINK {\npublic:\n  TO_BLOCK() : pitch_decision(PITCH_DUNNO) {\n    clear();\n  }                      // empty\n  TO_BLOCK(              // constructor\n      BLOCK *src_block); // real block\n  ~TO_BLOCK();\n\n  void clear(); // clear all scalar members.\n\n  TO_ROW_LIST *get_rows() { // access function\n    return &row_list;\n  }\n\n  // Rotate all the blobnbox lists and the underlying block. 
Then update the\n  // median size statistic from the blobs list.\n  void rotate(const FCOORD &rotation) {\n    BLOBNBOX_LIST *blobnbox_list[] = {&blobs,       &underlines,  &noise_blobs,\n                                      &small_blobs, &large_blobs, nullptr};\n    for (BLOBNBOX_LIST **list = blobnbox_list; *list != nullptr; ++list) {\n      BLOBNBOX_IT it(*list);\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        it.data()->rotate(rotation);\n      }\n    }\n    // Rotate the block\n    ASSERT_HOST(block->pdblk.poly_block() != nullptr);\n    block->rotate(rotation);\n    // Update the median size statistic from the blobs list.\n    STATS widths(0, block->pdblk.bounding_box().width() - 1);\n    STATS heights(0, block->pdblk.bounding_box().height() - 1);\n    BLOBNBOX_IT blob_it(&blobs);\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      widths.add(blob_it.data()->bounding_box().width(), 1);\n      heights.add(blob_it.data()->bounding_box().height(), 1);\n    }\n    block->set_median_size(static_cast<int>(widths.median() + 0.5),\n                           static_cast<int>(heights.median() + 0.5));\n  }\n\n  void print_rows() { // debug info\n    TO_ROW_IT row_it = &row_list;\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      auto row = row_it.data();\n      tprintf(\"Row range (%g,%g), para_c=%g, blobcount=%\" PRId32 \"\\n\",\n              static_cast<double>(row->min_y()),\n              static_cast<double>(row->max_y()),\n              static_cast<double>(row->parallel_c()),\n              row->blob_list()->length());\n    }\n  }\n\n  // Reorganizes the blob lists with a different definition of small, medium\n  // and large, compared to the original definition.\n  // Height is still the primary filter key, but medium width blobs of small\n  // height become small, and very wide blobs of small height stay noise, along\n  // with small dot-shaped blobs.\n  void ReSetAndReFilterBlobs();\n\n  // Deletes noise 
blobs from all lists where not owned by a ColPartition.\n  void DeleteUnownedNoise();\n\n  // Computes and stores the edge offsets on each blob for use in feature\n  // extraction, using greyscale if the supplied grey and thresholds pixes\n  // are 8-bit or otherwise (if nullptr or not 8 bit) the original binary\n  // edge step outlines.\n  // Thresholds must either be the same size as grey or an integer down-scale\n  // of grey.\n  // See coutln.h for an explanation of edge offsets.\n  void ComputeEdgeOffsets(Image thresholds, Image grey);\n\n#ifndef GRAPHICS_DISABLED\n  // Draw the noise blobs from all lists in red.\n  void plot_noise_blobs(ScrollView *to_win);\n  // Draw the blobs on the various lists in the block in different colors.\n  void plot_graded_blobs(ScrollView *to_win);\n#endif\n\n  BLOBNBOX_LIST blobs;       // medium size\n  BLOBNBOX_LIST underlines;  // underline blobs\n  BLOBNBOX_LIST noise_blobs; // very small\n  BLOBNBOX_LIST small_blobs; // fairly small\n  BLOBNBOX_LIST large_blobs; // big blobs\n  BLOCK *block;              // real block\n  PITCH_TYPE pitch_decision; // how strong is decision\n  float line_spacing;        // estimate\n  // line_size is a lower-bound estimate of the font size in pixels of\n  // the text in the block (with ascenders and descenders), being a small\n  // (1.25) multiple of the median height of filtered blobs.\n  // In most cases the font size will be bigger, but it will be closer\n  // if the text is allcaps, or in a no-x-height script.\n  float line_size;       // estimate\n  float max_blob_size;   // line assignment limit\n  float baseline_offset; // phase shift\n  float xheight;         // median blob size\n  float fixed_pitch;     // pitch or 0\n  float kern_size;       // average non-space\n  float space_size;      // average space\n  int32_t min_space;     // min definite space\n  int32_t max_nonspace;  // max definite\n  float fp_space;        // sp if fixed pitch\n  float fp_nonsp;        // nonsp if fixed 
pitch\n  float pr_space;        // sp if prop\n  float pr_nonsp;        // non sp if prop\n  TO_ROW *key_row;       // starting row\n\nprivate:\n  TO_ROW_LIST row_list; // temporary rows\n};\n\nELISTIZEH(TO_BLOCK)\nvoid find_cblob_limits( // get y limits\n    C_BLOB *blob,       // blob to search\n    float leftx,        // x limits\n    float rightx,\n    FCOORD rotation, // for landscape\n    float &ymin,     // output y limits\n    float &ymax);\nvoid find_cblob_vlimits( // get y limits\n    C_BLOB *blob,        // blob to search\n    float leftx,         // x limits\n    float rightx,\n    float &ymin, // output y limits\n    float &ymax);\nvoid find_cblob_hlimits( // get x limits\n    C_BLOB *blob,        // blob to search\n    float bottomy,       // y limits\n    float topy,\n    float &xmin, // output x limits\n    float &xymax);\nC_BLOB *crotate_cblob( // rotate it\n    C_BLOB *blob,      // blob to search\n    FCOORD rotation    // for landscape\n);\nTBOX box_next(      // get bounding box\n    BLOBNBOX_IT *it // iterator to blobds\n);\nTBOX box_next_pre_chopped( // get bounding box\n    BLOBNBOX_IT *it        // iterator to blobds\n);\nvoid vertical_cblob_projection( // project outlines\n    C_BLOB *blob,               // blob to project\n    STATS *stats                // output\n);\nvoid vertical_coutline_projection( // project outlines\n    C_OUTLINE *outline,            // outline to project\n    STATS *stats                   // output\n);\n#ifndef GRAPHICS_DISABLED\nvoid plot_blob_list(ScrollView *win,                 // window to draw in\n                    BLOBNBOX_LIST *list,             // blob list\n                    ScrollView::Color body_colour,   // colour to draw\n                    ScrollView::Color child_colour); // colour of child\n#endif                                               // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/blobs.cpp",
    "content": "/******************************************************************************\n *\n * File:         blobs.cpp  (Formerly blobs.c)\n * Description:  Blob definition\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"blobs.h\"\n\n#include \"ccstruct.h\"\n#include \"clst.h\"\n#include \"linlsq.h\"\n#include \"normalis.h\"\n#include \"ocrblock.h\"\n#include \"ocrrow.h\"\n#include \"points.h\"\n#include \"polyaprx.h\"\n#include \"werd.h\"\n\n#include \"helpers.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// A Vector representing the \"vertical\" direction when measuring the\n// divisiblity of blobs into multiple blobs just by separating outlines.\n// See divisible_blob below for the use.\nconst TPOINT kDivisibleVerticalUpright(0, 1);\n// A vector representing the \"vertical\" direction for italic text for use\n// when separating outlines. 
Using it actually deteriorates final accuracy,\n// so it is only used for ApplyBoxes chopping to get a better segmentation.\nconst TPOINT kDivisibleVerticalItalic(1, 5);\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n\n// Returns true when the two line segments cross each other.\n// (Moved from outlines.cpp).\n// Finds where the projected lines would cross and then checks to see if the\n// point of intersection lies on both of the line segments. If it does\n// then these two segments cross.\n/* static */\nbool TPOINT::IsCrossed(const TPOINT &a0, const TPOINT &a1, const TPOINT &b0, const TPOINT &b1) {\n  TPOINT b0a1, b0a0, a1b1, b0b1, a1a0;\n\n  b0a1.x = a1.x - b0.x;\n  b0a0.x = a0.x - b0.x;\n  a1b1.x = b1.x - a1.x;\n  b0b1.x = b1.x - b0.x;\n  a1a0.x = a0.x - a1.x;\n  b0a1.y = a1.y - b0.y;\n  b0a0.y = a0.y - b0.y;\n  a1b1.y = b1.y - a1.y;\n  b0b1.y = b1.y - b0.y;\n  a1a0.y = a0.y - a1.y;\n\n  int b0a1xb0b1 = b0a1.cross(b0b1);\n  int b0b1xb0a0 = b0b1.cross(b0a0);\n  int a1b1xa1a0 = a1b1.cross(a1a0);\n  // For clarity, we want a1a0.cross(a1b0) here but we have b0a1 instead of a1b0\n  // so use -a1b0.cross(b0a1) instead, which is the same.\n  int a1a0xa1b0 = -a1a0.cross(b0a1);\n\n  return ((b0a1xb0b1 > 0 && b0b1xb0a0 > 0) || (b0a1xb0b1 < 0 && b0b1xb0a0 < 0)) &&\n         ((a1b1xa1a0 > 0 && a1a0xa1b0 > 0) || (a1b1xa1a0 < 0 && a1a0xa1b0 < 0));\n}\n\n// Consume the circular list of EDGEPTs to make a TESSLINE.\nTESSLINE *TESSLINE::BuildFromOutlineList(EDGEPT *outline) {\n  auto *result = new TESSLINE;\n  result->loop = outline;\n  if (outline->src_outline != nullptr) {\n    // ASSUMPTION: This function is only ever called from ApproximateOutline\n    // and therefore either all points have a src_outline or all do not.\n    // Just as SetupFromPos sets the vectors from the vertices, setup the\n    // step_count members to indicate the 
(positive) number of original\n    // C_OUTLINE steps to the next vertex.\n    EDGEPT *pt = outline;\n    do {\n      pt->step_count = pt->next->start_step - pt->start_step;\n      if (pt->step_count < 0) {\n        pt->step_count += pt->src_outline->pathlength();\n      }\n      pt = pt->next;\n    } while (pt != outline);\n  }\n  result->SetupFromPos();\n  return result;\n}\n\n// Copies the data and the outline, but leaves next untouched.\nvoid TESSLINE::CopyFrom(const TESSLINE &src) {\n  Clear();\n  topleft = src.topleft;\n  botright = src.botright;\n  start = src.start;\n  is_hole = src.is_hole;\n  if (src.loop != nullptr) {\n    EDGEPT *prevpt = nullptr;\n    EDGEPT *newpt = nullptr;\n    EDGEPT *srcpt = src.loop;\n    do {\n      newpt = new EDGEPT(*srcpt);\n      if (prevpt == nullptr) {\n        loop = newpt;\n      } else {\n        newpt->prev = prevpt;\n        prevpt->next = newpt;\n      }\n      prevpt = newpt;\n      srcpt = srcpt->next;\n    } while (srcpt != src.loop);\n    loop->prev = newpt;\n    newpt->next = loop;\n  }\n}\n\n// Deletes owned data.\nvoid TESSLINE::Clear() {\n  if (loop == nullptr) {\n    return;\n  }\n\n  EDGEPT *this_edge = loop;\n  do {\n    EDGEPT *next_edge = this_edge->next;\n    delete this_edge;\n    this_edge = next_edge;\n  } while (this_edge != loop);\n  loop = nullptr;\n}\n\n// Normalize in-place using the DENORM.\nvoid TESSLINE::Normalize(const DENORM &denorm) {\n  EDGEPT *pt = loop;\n  do {\n    denorm.LocalNormTransform(pt->pos, &pt->pos);\n    pt = pt->next;\n  } while (pt != loop);\n  SetupFromPos();\n}\n\n// Rotates by the given rotation in place.\nvoid TESSLINE::Rotate(const FCOORD rot) {\n  EDGEPT *pt = loop;\n  do {\n    int tmp = static_cast<int>(floor(pt->pos.x * rot.x() - pt->pos.y * rot.y() + 0.5));\n    pt->pos.y = static_cast<int>(floor(pt->pos.y * rot.x() + pt->pos.x * rot.y() + 0.5));\n    pt->pos.x = tmp;\n    pt = pt->next;\n  } while (pt != loop);\n  SetupFromPos();\n}\n\n// Moves by the given vec 
in place.\nvoid TESSLINE::Move(const ICOORD vec) {\n  EDGEPT *pt = loop;\n  do {\n    pt->pos.x += vec.x();\n    pt->pos.y += vec.y();\n    pt = pt->next;\n  } while (pt != loop);\n  SetupFromPos();\n}\n\n// Scales by the given factor in place.\nvoid TESSLINE::Scale(float factor) {\n  EDGEPT *pt = loop;\n  do {\n    pt->pos.x = static_cast<int>(floor(pt->pos.x * factor + 0.5));\n    pt->pos.y = static_cast<int>(floor(pt->pos.y * factor + 0.5));\n    pt = pt->next;\n  } while (pt != loop);\n  SetupFromPos();\n}\n\n// Sets up the start and vec members of the loop from the pos members.\nvoid TESSLINE::SetupFromPos() {\n  EDGEPT *pt = loop;\n  do {\n    pt->vec.x = pt->next->pos.x - pt->pos.x;\n    pt->vec.y = pt->next->pos.y - pt->pos.y;\n    pt = pt->next;\n  } while (pt != loop);\n  start = pt->pos;\n  ComputeBoundingBox();\n}\n\n// Recomputes the bounding box from the points in the loop.\nvoid TESSLINE::ComputeBoundingBox() {\n  int minx = INT32_MAX;\n  int miny = INT32_MAX;\n  int maxx = -INT32_MAX;\n  int maxy = -INT32_MAX;\n\n  // Find boundaries.\n  start = loop->pos;\n  EDGEPT *this_edge = loop;\n  do {\n    if (!this_edge->IsHidden() || !this_edge->prev->IsHidden()) {\n      if (this_edge->pos.x < minx) {\n        minx = this_edge->pos.x;\n      }\n      if (this_edge->pos.y < miny) {\n        miny = this_edge->pos.y;\n      }\n      if (this_edge->pos.x > maxx) {\n        maxx = this_edge->pos.x;\n      }\n      if (this_edge->pos.y > maxy) {\n        maxy = this_edge->pos.y;\n      }\n    }\n    this_edge = this_edge->next;\n  } while (this_edge != loop);\n  // Reset bounds.\n  topleft.x = minx;\n  topleft.y = maxy;\n  botright.x = maxx;\n  botright.y = miny;\n}\n\n// Computes the min and max cross product of the outline points with the\n// given vec and returns the results in min_xp and max_xp. 
Geometrically\n// this is the left and right edge of the outline perpendicular to the\n// given direction, but to get the distance units correct, you would\n// have to divide by the modulus of vec.\nvoid TESSLINE::MinMaxCrossProduct(const TPOINT vec, int *min_xp, int *max_xp) const {\n  *min_xp = INT32_MAX;\n  *max_xp = INT32_MIN;\n  EDGEPT *this_edge = loop;\n  do {\n    if (!this_edge->IsHidden() || !this_edge->prev->IsHidden()) {\n      int product = this_edge->pos.cross(vec);\n      UpdateRange(product, min_xp, max_xp);\n    }\n    this_edge = this_edge->next;\n  } while (this_edge != loop);\n}\n\nTBOX TESSLINE::bounding_box() const {\n  return TBOX(topleft.x, botright.y, botright.x, topleft.y);\n}\n\n#ifndef GRAPHICS_DISABLED\nvoid TESSLINE::plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color) {\n  if (is_hole) {\n    window->Pen(child_color);\n  } else {\n    window->Pen(color);\n  }\n  window->SetCursor(start.x, start.y);\n  EDGEPT *pt = loop;\n  do {\n    bool prev_hidden = pt->IsHidden();\n    pt = pt->next;\n    if (prev_hidden) {\n      window->SetCursor(pt->pos.x, pt->pos.y);\n    } else {\n      window->DrawTo(pt->pos.x, pt->pos.y);\n    }\n  } while (pt != loop);\n}\n#endif // !GRAPHICS_DISABLED\n\n// Returns the first non-hidden EDGEPT that has a different src_outline to\n// its predecessor, or, if all the same, the lowest indexed point.\nEDGEPT *TESSLINE::FindBestStartPt() const {\n  EDGEPT *best_start = loop;\n  int best_step = loop->start_step;\n  // Iterate the polygon.\n  EDGEPT *pt = loop;\n  do {\n    if (pt->IsHidden()) {\n      continue;\n    }\n    if (pt->prev->IsHidden() || pt->prev->src_outline != pt->src_outline) {\n      return pt; // Qualifies as the best.\n    }\n    if (pt->start_step < best_step) {\n      best_step = pt->start_step;\n      best_start = pt;\n    }\n  } while ((pt = pt->next) != loop);\n  return best_start;\n}\n\n// Iterate the given list of outlines, converting to TESSLINE by 
polygonal\n// approximation and recursively any children, returning the current tail\n// of the resulting list of TESSLINEs.\nstatic TESSLINE **ApproximateOutlineList(bool allow_detailed_fx, C_OUTLINE_LIST *outlines,\n                                         bool children, TESSLINE **tail) {\n  C_OUTLINE_IT ol_it(outlines);\n  for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {\n    C_OUTLINE *outline = ol_it.data();\n    if (outline->pathlength() > 0) {\n      TESSLINE *tessline = ApproximateOutline(allow_detailed_fx, outline);\n      tessline->is_hole = children;\n      *tail = tessline;\n      tail = &tessline->next;\n    }\n    if (!outline->child()->empty()) {\n      tail = ApproximateOutlineList(allow_detailed_fx, outline->child(), true, tail);\n    }\n  }\n  return tail;\n}\n\n// Factory to build a TBLOB from a C_BLOB with polygonal approximation along\n// the way. If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB\n// contain pointers to the input C_OUTLINEs that enable higher-resolution\n// feature extraction that does not use the polygonal approximation.\nTBLOB *TBLOB::PolygonalCopy(bool allow_detailed_fx, C_BLOB *src) {\n  auto *tblob = new TBLOB;\n  ApproximateOutlineList(allow_detailed_fx, src->out_list(), false, &tblob->outlines);\n  return tblob;\n}\n\n// Factory builds a blob with no outlines, but copies the other member data.\nTBLOB *TBLOB::ShallowCopy(const TBLOB &src) {\n  auto *blob = new TBLOB;\n  blob->denorm_ = src.denorm_;\n  return blob;\n}\n\n// Normalizes the blob for classification only if needed.\n// (Normally this means a non-zero classify rotation.)\n// If no Normalization is needed, then nullptr is returned, and the input blob\n// can be used directly. Otherwise a new TBLOB is returned which must be\n// deleted after use.\nTBLOB *TBLOB::ClassifyNormalizeIfNeeded() const {\n  TBLOB *rotated_blob = nullptr;\n  // If necessary, copy the blob and rotate it. 
The rotation is always\n  // +/- 90 degrees, as 180 was already taken care of.\n  if (denorm_.block() != nullptr && denorm_.block()->classify_rotation().y() != 0.0) {\n    TBOX box = bounding_box();\n    int x_middle = (box.left() + box.right()) / 2;\n    int y_middle = (box.top() + box.bottom()) / 2;\n    rotated_blob = new TBLOB(*this);\n    const FCOORD &rotation = denorm_.block()->classify_rotation();\n    // Move the rotated blob back to the same y-position so that we\n    // can still distinguish similar glyphs with different y-position.\n    float target_y =\n        kBlnBaselineOffset + (rotation.y() > 0 ? x_middle - box.left() : box.right() - x_middle);\n    rotated_blob->Normalize(nullptr, &rotation, &denorm_, x_middle, y_middle, 1.0f, 1.0f, 0.0f,\n                            target_y, denorm_.inverse(), denorm_.pix());\n  }\n  return rotated_blob;\n}\n\n// Copies the data and the outline, but leaves next untouched.\nvoid TBLOB::CopyFrom(const TBLOB &src) {\n  Clear();\n  TESSLINE *prev_outline = nullptr;\n  for (TESSLINE *srcline = src.outlines; srcline != nullptr; srcline = srcline->next) {\n    auto *new_outline = new TESSLINE(*srcline);\n    if (outlines == nullptr) {\n      outlines = new_outline;\n    } else {\n      prev_outline->next = new_outline;\n    }\n    prev_outline = new_outline;\n  }\n  denorm_ = src.denorm_;\n}\n\n// Deletes owned data.\nvoid TBLOB::Clear() {\n  for (TESSLINE *next_outline = nullptr; outlines != nullptr; outlines = next_outline) {\n    next_outline = outlines->next;\n    delete outlines;\n  }\n}\n\n// Sets up the built-in DENORM and normalizes the blob in-place.\n// For parameters see DENORM::SetupNormalization, plus the inverse flag for\n// this blob and the Pix for the full image.\nvoid TBLOB::Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor,\n                      float x_origin, float y_origin, float x_scale, float y_scale,\n                      float final_xshift, float final_yshift, 
bool inverse, Image pix) {\n  denorm_.SetupNormalization(block, rotation, predecessor, x_origin, y_origin, x_scale, y_scale,\n                             final_xshift, final_yshift);\n  denorm_.set_inverse(inverse);\n  denorm_.set_pix(pix);\n  // TODO(rays) outline->Normalize is more accurate, but breaks tests due\n  // the changes it makes. Reinstate this code with a retraining.\n  // The reason this change is troublesome is that it normalizes for the\n  // baseline value computed independently at each x-coord. If the baseline\n  // is not horizontal, this introduces shear into the normalized blob, which\n  // is useful on the rare occasions that the baseline is really curved, but\n  // the baselines need to be stabilized the rest of the time.\n#if 0\n  for (TESSLINE* outline = outlines; outline != nullptr; outline = outline->next) {\n    outline->Normalize(denorm_);\n  }\n#else\n  denorm_.LocalNormBlob(this);\n#endif\n}\n\n// Rotates by the given rotation in place.\nvoid TBLOB::Rotate(const FCOORD rotation) {\n  for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n    outline->Rotate(rotation);\n  }\n}\n\n// Moves by the given vec in place.\nvoid TBLOB::Move(const ICOORD vec) {\n  for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n    outline->Move(vec);\n  }\n}\n\n// Scales by the given factor in place.\nvoid TBLOB::Scale(float factor) {\n  for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n    outline->Scale(factor);\n  }\n}\n\n// Recomputes the bounding boxes of the outlines.\nvoid TBLOB::ComputeBoundingBoxes() {\n  for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n    outline->ComputeBoundingBox();\n  }\n}\n\n// Returns the number of outlines.\nint TBLOB::NumOutlines() const {\n  int result = 0;\n  for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n    ++result;\n  }\n  return 
result;\n}\n\n/**********************************************************************\n * TBLOB::bounding_box()\n *\n * Compute the bounding_box of a compound blob, defined to be the\n * bounding box of the union of all top-level outlines in the blob.\n **********************************************************************/\nTBOX TBLOB::bounding_box() const {\n  if (outlines == nullptr) {\n    return TBOX(0, 0, 0, 0);\n  }\n  TESSLINE *outline = outlines;\n  TBOX box = outline->bounding_box();\n  for (outline = outline->next; outline != nullptr; outline = outline->next) {\n    box += outline->bounding_box();\n  }\n  return box;\n}\n\n// Finds and deletes any duplicate outlines in this blob, without deleting\n// their EDGEPTs.\nvoid TBLOB::EliminateDuplicateOutlines() {\n  for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n    TESSLINE *last_outline = outline;\n    for (TESSLINE *other_outline = outline->next; other_outline != nullptr;\n         last_outline = other_outline, other_outline = other_outline->next) {\n      if (outline->SameBox(*other_outline)) {\n        last_outline->next = other_outline->next;\n        // This doesn't leak - the outlines share the EDGEPTs.\n        other_outline->loop = nullptr;\n        delete other_outline;\n        other_outline = last_outline;\n        // If it is part of a cut, then it can't be a hole any more.\n        outline->is_hole = false;\n      }\n    }\n  }\n}\n\n// Swaps the outlines of *this and next if needed to keep the centers in\n// increasing x.\nvoid TBLOB::CorrectBlobOrder(TBLOB *next) {\n  TBOX box = bounding_box();\n  TBOX next_box = next->bounding_box();\n  if (box.x_middle() > next_box.x_middle()) {\n    std::swap(outlines, next->outlines);\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\nvoid TBLOB::plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color) {\n  for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n    
outline->plot(window, color, child_color);\n  }\n}\n#endif // !GRAPHICS_DISABLED\n\n// Computes the center of mass and second moments for the old baseline and\n// 2nd moment normalizations. Returns the outline length.\n// The input denorm should be the normalizations that have been applied from\n// the image to the current state of this TBLOB.\nint TBLOB::ComputeMoments(FCOORD *center, FCOORD *second_moments) const {\n  // Compute 1st and 2nd moments of the original outline.\n  LLSQ accumulator;\n  TBOX box = bounding_box();\n  // Iterate the outlines, accumulating edges relative the box.botleft().\n  CollectEdges(box, nullptr, &accumulator, nullptr, nullptr);\n  *center = accumulator.mean_point() + box.botleft();\n  // The 2nd moments are just the standard deviation of the point positions.\n  double x2nd = sqrt(accumulator.x_variance());\n  double y2nd = sqrt(accumulator.y_variance());\n  if (x2nd < 1.0) {\n    x2nd = 1.0;\n  }\n  if (y2nd < 1.0) {\n    y2nd = 1.0;\n  }\n  second_moments->set_x(x2nd);\n  second_moments->set_y(y2nd);\n  return accumulator.count();\n}\n\n// Computes the precise bounding box of the coords that are generated by\n// GetEdgeCoords. 
This may be different from the bounding box of the polygon.\nvoid TBLOB::GetPreciseBoundingBox(TBOX *precise_box) const {\n  TBOX box = bounding_box();\n  *precise_box = TBOX();\n  CollectEdges(box, precise_box, nullptr, nullptr, nullptr);\n  precise_box->move(box.botleft());\n}\n\n// Adds edges to the given vectors.\n// For all the edge steps in all the outlines, or polygonal approximation\n// where there are no edge steps, collects the steps into x_coords/y_coords.\n// x_coords is a collection of the x-coords of vertical edges for each\n// y-coord starting at box.bottom().\n// y_coords is a collection of the y-coords of horizontal edges for each\n// x-coord starting at box.left().\n// Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.\n// Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.\nvoid TBLOB::GetEdgeCoords(const TBOX &box, std::vector<std::vector<int>> &x_coords,\n                          std::vector<std::vector<int>> &y_coords) const {\n  x_coords.clear();\n  x_coords.resize(box.height());\n  y_coords.clear();\n  y_coords.resize(box.width());\n  CollectEdges(box, nullptr, nullptr, &x_coords, &y_coords);\n  // Sort the output vectors.\n  for (auto &coord : x_coords) {\n    std::sort(coord.begin(), coord.end());\n  }\n  for (auto &coord : y_coords) {\n    std::sort(coord.begin(), coord.end());\n  }\n}\n\n// Accumulates the segment between pt1 and pt2 in the LLSQ, quantizing over\n// the integer coordinate grid to properly weight long vectors.\nstatic void SegmentLLSQ(const FCOORD &pt1, const FCOORD &pt2, LLSQ *accumulator) {\n  FCOORD step(pt2);\n  step -= pt1;\n  int xstart = IntCastRounded(std::min(pt1.x(), pt2.x()));\n  int xend = IntCastRounded(std::max(pt1.x(), pt2.x()));\n  int ystart = IntCastRounded(std::min(pt1.y(), pt2.y()));\n  int yend = IntCastRounded(std::max(pt1.y(), pt2.y()));\n  if (xstart == xend && ystart == yend) {\n    return; // Nothing to do.\n  }\n  double weight = step.length() / (xend - 
xstart + yend - ystart);\n  // Compute and save the y-position at the middle of each x-step.\n  for (int x = xstart; x < xend; ++x) {\n    double y = pt1.y() + step.y() * (x + 0.5 - pt1.x()) / step.x();\n    accumulator->add(x + 0.5, y, weight);\n  }\n  // Compute and save the x-position at the middle of each y-step.\n  for (int y = ystart; y < yend; ++y) {\n    double x = pt1.x() + step.x() * (y + 0.5 - pt1.y()) / step.y();\n    accumulator->add(x, y + 0.5, weight);\n  }\n}\n\n// Adds any edges from a single segment of outline between pt1 and pt2 to\n// the x_coords, y_coords vectors. pt1 and pt2 should be relative to the\n// bottom-left of the bounding box, hence indices to x_coords, y_coords\n// are clipped to ([0,x_limit], [0,y_limit]).\n// See GetEdgeCoords above for a description of x_coords, y_coords.\nstatic void SegmentCoords(const FCOORD &pt1, const FCOORD &pt2, int x_limit, int y_limit,\n                          std::vector<std::vector<int>> *x_coords,\n                          std::vector<std::vector<int>> *y_coords) {\n  FCOORD step(pt2);\n  step -= pt1;\n  int start = ClipToRange(IntCastRounded(std::min(pt1.x(), pt2.x())), 0, x_limit);\n  int end = ClipToRange(IntCastRounded(std::max(pt1.x(), pt2.x())), 0, x_limit);\n  for (int x = start; x < end; ++x) {\n    int y = IntCastRounded(pt1.y() + step.y() * (x + 0.5 - pt1.x()) / step.x());\n    (*y_coords)[x].push_back(y);\n  }\n  start = ClipToRange(IntCastRounded(std::min(pt1.y(), pt2.y())), 0, y_limit);\n  end = ClipToRange(IntCastRounded(std::max(pt1.y(), pt2.y())), 0, y_limit);\n  for (int y = start; y < end; ++y) {\n    int x = IntCastRounded(pt1.x() + step.x() * (y + 0.5 - pt1.y()) / step.y());\n    (*x_coords)[y].push_back(x);\n  }\n}\n\n// Adds any edges from a single segment of outline between pt1 and pt2 to\n// the bbox such that it guarantees to contain anything produced by\n// SegmentCoords.\nstatic void SegmentBBox(const FCOORD &pt1, const FCOORD &pt2, TBOX *bbox) {\n  FCOORD step(pt2);\n  
step -= pt1;\n  int x1 = IntCastRounded(std::min(pt1.x(), pt2.x()));\n  int x2 = IntCastRounded(std::max(pt1.x(), pt2.x()));\n  if (x2 > x1) {\n    int y1 = IntCastRounded(pt1.y() + step.y() * (x1 + 0.5 - pt1.x()) / step.x());\n    int y2 = IntCastRounded(pt1.y() + step.y() * (x2 - 0.5 - pt1.x()) / step.x());\n    TBOX point(x1, std::min(y1, y2), x2, std::max(y1, y2));\n    *bbox += point;\n  }\n  int y1 = IntCastRounded(std::min(pt1.y(), pt2.y()));\n  int y2 = IntCastRounded(std::max(pt1.y(), pt2.y()));\n  if (y2 > y1) {\n    int x1 = IntCastRounded(pt1.x() + step.x() * (y1 + 0.5 - pt1.y()) / step.y());\n    int x2 = IntCastRounded(pt1.x() + step.x() * (y2 - 0.5 - pt1.y()) / step.y());\n    TBOX point(std::min(x1, x2), y1, std::max(x1, x2), y2);\n    *bbox += point;\n  }\n}\n\n// Collects edges into the given bounding box, LLSQ accumulator and/or x_coords,\n// y_coords vectors.\n// For a description of x_coords/y_coords, see GetEdgeCoords above.\n// Startpt to lastpt, inclusive, MUST have the same src_outline member,\n// which may be nullptr. The vector from lastpt to its next is included in\n// the accumulation. 
Hidden edges should be excluded by the caller.\n// The input denorm should be the normalizations that have been applied from\n// the image to the current state of the TBLOB from which startpt, lastpt come.\n// box is the bounding box of the blob from which the EDGEPTs are taken and\n// indices into x_coords, y_coords are offset by box.botleft().\nstatic void CollectEdgesOfRun(const EDGEPT *startpt, const EDGEPT *lastpt, const DENORM &denorm,\n                              const TBOX &box, TBOX *bounding_box, LLSQ *accumulator,\n                              std::vector<std::vector<int>> *x_coords,\n                              std::vector<std::vector<int>> *y_coords) {\n  const C_OUTLINE *outline = startpt->src_outline;\n  int x_limit = box.width() - 1;\n  int y_limit = box.height() - 1;\n  if (outline != nullptr) {\n    // Use higher-resolution edge points stored on the outline.\n    // The outline coordinates may not match the binary image because of the\n    // rotation for vertical text lines, but the root_denorm IS the matching\n    // start of the DENORM chain.\n    const DENORM *root_denorm = denorm.RootDenorm();\n    int step_length = outline->pathlength();\n    int start_index = startpt->start_step;\n    // Note that if this run straddles the wrap-around point of the outline,\n    // that lastpt->start_step may have a lower index than startpt->start_step,\n    // and we want to use an end_index that allows us to use a positive\n    // increment, so we add step_length if necessary, but that may be beyond the\n    // bounds of the outline steps/ due to wrap-around, so we use % step_length\n    // everywhere, except for start_index.\n    int end_index = lastpt->start_step + lastpt->step_count;\n    if (end_index <= start_index) {\n      end_index += step_length;\n    }\n    // pos is the integer coordinates of the binary image steps.\n    ICOORD pos = outline->position_at_index(start_index);\n    FCOORD origin(box.left(), box.bottom());\n    // f_pos is a 
floating-point version of pos that offers improved edge\n    // positioning using greyscale information or smoothing of edge steps.\n    FCOORD f_pos = outline->sub_pixel_pos_at_index(pos, start_index);\n    // pos_normed is f_pos after the appropriate normalization, and relative\n    // to origin.\n    // prev_normed is the previous value of pos_normed.\n    FCOORD prev_normed;\n    denorm.NormTransform(root_denorm, f_pos, &prev_normed);\n    prev_normed -= origin;\n    for (int index = start_index; index < end_index; ++index) {\n      ICOORD step = outline->step(index % step_length);\n      // Only use the point if its edge strength is positive. This excludes\n      // points that don't provide useful information, eg\n      // ___________\n      //            |___________\n      // The vertical step provides only noisy, damaging information, as even\n      // with a greyscale image, the positioning of the edge there may be a\n      // fictitious extrapolation, so previous processing has eliminated it.\n      if (outline->edge_strength_at_index(index % step_length) > 0) {\n        FCOORD f_pos = outline->sub_pixel_pos_at_index(pos, index % step_length);\n        FCOORD pos_normed;\n        denorm.NormTransform(root_denorm, f_pos, &pos_normed);\n        pos_normed -= origin;\n        // Accumulate the information that is selected by the caller.\n        if (bounding_box != nullptr) {\n          SegmentBBox(pos_normed, prev_normed, bounding_box);\n        }\n        if (accumulator != nullptr) {\n          SegmentLLSQ(pos_normed, prev_normed, accumulator);\n        }\n        if (x_coords != nullptr && y_coords != nullptr) {\n          SegmentCoords(pos_normed, prev_normed, x_limit, y_limit, x_coords, y_coords);\n        }\n        prev_normed = pos_normed;\n      }\n      pos += step;\n    }\n  } else {\n    // There is no outline, so we are forced to use the polygonal approximation.\n    const EDGEPT *endpt = lastpt->next;\n    const EDGEPT *pt = startpt;\n    do 
{\n      FCOORD next_pos(pt->next->pos.x - box.left(), pt->next->pos.y - box.bottom());\n      FCOORD pos(pt->pos.x - box.left(), pt->pos.y - box.bottom());\n      if (bounding_box != nullptr) {\n        SegmentBBox(next_pos, pos, bounding_box);\n      }\n      if (accumulator != nullptr) {\n        SegmentLLSQ(next_pos, pos, accumulator);\n      }\n      if (x_coords != nullptr && y_coords != nullptr) {\n        SegmentCoords(next_pos, pos, x_limit, y_limit, x_coords, y_coords);\n      }\n    } while ((pt = pt->next) != endpt);\n  }\n}\n\n// For all the edge steps in all the outlines, or polygonal approximation\n// where there are no edge steps, collects the steps into the bounding_box,\n// llsq and/or the x_coords/y_coords. Both are used in different kinds of\n// normalization.\n// For a description of x_coords, y_coords, see GetEdgeCoords above.\nvoid TBLOB::CollectEdges(const TBOX &box, TBOX *bounding_box, LLSQ *llsq,\n                         std::vector<std::vector<int>> *x_coords,\n                         std::vector<std::vector<int>> *y_coords) const {\n  // Iterate the outlines.\n  for (const TESSLINE *ol = outlines; ol != nullptr; ol = ol->next) {\n    // Iterate the polygon.\n    EDGEPT *loop_pt = ol->FindBestStartPt();\n    EDGEPT *pt = loop_pt;\n    if (pt == nullptr) {\n      continue;\n    }\n    do {\n      if (pt->IsHidden()) {\n        continue;\n      }\n      // Find a run of equal src_outline.\n      EDGEPT *last_pt = pt;\n      do {\n        last_pt = last_pt->next;\n      } while (last_pt != loop_pt && !last_pt->IsHidden() &&\n               last_pt->src_outline == pt->src_outline);\n      last_pt = last_pt->prev;\n      CollectEdgesOfRun(pt, last_pt, denorm_, box, bounding_box, llsq, x_coords, y_coords);\n      pt = last_pt;\n    } while ((pt = pt->next) != loop_pt);\n  }\n}\n\n// Factory to build a TWERD from a (C_BLOB) WERD, with polygonal\n// approximation along the way.\nTWERD *TWERD::PolygonalCopy(bool allow_detailed_fx, WERD *src) {\n 
 auto *tessword = new TWERD;\n  tessword->latin_script = src->flag(W_SCRIPT_IS_LATIN);\n  C_BLOB_IT b_it(src->cblob_list());\n  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n    C_BLOB *blob = b_it.data();\n    TBLOB *tblob = TBLOB::PolygonalCopy(allow_detailed_fx, blob);\n    tessword->blobs.push_back(tblob);\n  }\n  return tessword;\n}\n\n// Baseline normalizes the blobs in-place, recording the normalization in the\n// DENORMs in the blobs.\nvoid TWERD::BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height,\n                        float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint,\n                        const TBOX *norm_box, DENORM *word_denorm) {\n  TBOX word_box = bounding_box();\n  if (norm_box != nullptr) {\n    word_box = *norm_box;\n  }\n  float word_middle = (word_box.left() + word_box.right()) / 2.0f;\n  float input_y_offset = 0.0f;\n  auto final_y_offset = static_cast<float>(kBlnBaselineOffset);\n  float scale = kBlnXHeight / x_height;\n  if (row == nullptr) {\n    word_middle = word_box.left();\n    input_y_offset = word_box.bottom();\n    final_y_offset = 0.0f;\n  } else {\n    input_y_offset = row->base_line(word_middle) + baseline_shift;\n  }\n  for (auto blob : blobs) {\n    TBOX blob_box = blob->bounding_box();\n    float mid_x = (blob_box.left() + blob_box.right()) / 2.0f;\n    float baseline = input_y_offset;\n    float blob_scale = scale;\n    if (numeric_mode) {\n      baseline = blob_box.bottom();\n      blob_scale = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()), scale, scale * 1.5f);\n    } else if (row != nullptr) {\n      baseline = row->base_line(mid_x) + baseline_shift;\n    }\n    // The image will be 8-bit grey if the input was grey or color. Note that in\n    // a grey image 0 is black and 255 is white. 
If the input was binary, then\n    // the pix will be binary and 0 is white, with 1 being black.\n    // To tell the difference pixGetDepth() will return 8 or 1.\n    // The inverse flag will be true iff the word has been determined to be\n    // white on black, and is independent of whether the pix is 8 bit or 1 bit.\n    blob->Normalize(block, nullptr, nullptr, word_middle, baseline, blob_scale, blob_scale, 0.0f,\n                    final_y_offset, inverse, pix);\n  }\n  if (word_denorm != nullptr) {\n    word_denorm->SetupNormalization(block, nullptr, nullptr, word_middle, input_y_offset, scale,\n                                    scale, 0.0f, final_y_offset);\n    word_denorm->set_inverse(inverse);\n    word_denorm->set_pix(pix);\n  }\n}\n\n// Copies the data and the blobs, but leaves next untouched.\nvoid TWERD::CopyFrom(const TWERD &src) {\n  Clear();\n  latin_script = src.latin_script;\n  for (auto blob : src.blobs) {\n    auto *new_blob = new TBLOB(*blob);\n    blobs.push_back(new_blob);\n  }\n}\n\n// Deletes owned data.\nvoid TWERD::Clear() {\n  for (auto blob : blobs) {\n    delete blob;\n  }\n  blobs.clear();\n}\n\n// Recomputes the bounding boxes of the blobs.\nvoid TWERD::ComputeBoundingBoxes() {\n  for (auto &blob : blobs) {\n    blob->ComputeBoundingBoxes();\n  }\n}\n\nTBOX TWERD::bounding_box() const {\n  TBOX result;\n  for (auto blob : blobs) {\n    TBOX box = blob->bounding_box();\n    result += box;\n  }\n  return result;\n}\n\n// Merges the blobs from start to end, not including end, and deletes\n// the blobs between start and end.\nvoid TWERD::MergeBlobs(unsigned start, unsigned end) {\n  if (end > blobs.size()) {\n    end = blobs.size();\n  }\n  if (start >= end) {\n    return; // Nothing to do.\n  }\n  TESSLINE *outline = blobs[start]->outlines;\n  for (auto i = start + 1; i < end; ++i) {\n    TBLOB *next_blob = blobs[i];\n    // Take the outlines from the next blob.\n    if (outline == nullptr) {\n      blobs[start]->outlines = 
next_blob->outlines;\n      outline = blobs[start]->outlines;\n    } else {\n      while (outline->next != nullptr) {\n        outline = outline->next;\n      }\n      outline->next = next_blob->outlines;\n      next_blob->outlines = nullptr;\n    }\n    // Delete the next blob and move on.\n    delete next_blob;\n    blobs[i] = nullptr;\n  }\n  // Remove dead blobs from the vector.\n  // TODO: optimize.\n  for (auto i = start + 1; i < end && start + 1 < blobs.size(); ++i) {\n    blobs.erase(blobs.begin() + start + 1);\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\nvoid TWERD::plot(ScrollView *window) {\n  ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);\n  for (auto &blob : blobs) {\n    blob->plot(window, color, ScrollView::BROWN);\n    color = WERD::NextColor(color);\n  }\n}\n#endif // !GRAPHICS_DISABLED\n\n/**********************************************************************\n * divisible_blob\n *\n * Returns true if the blob contains multiple outlines than can be\n * separated using divide_blobs. Sets the location to be used in the\n * call to divide_blobs.\n **********************************************************************/\nbool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location) {\n  if (blob->outlines == nullptr || blob->outlines->next == nullptr) {\n    return false; // Need at least 2 outlines for it to be possible.\n  }\n  int max_gap = 0;\n  TPOINT vertical = italic_blob ? 
kDivisibleVerticalItalic : kDivisibleVerticalUpright;\n  for (TESSLINE *outline1 = blob->outlines; outline1 != nullptr; outline1 = outline1->next) {\n    if (outline1->is_hole) {\n      continue; // Holes do not count as separable.\n    }\n    TPOINT mid_pt1((outline1->topleft.x + outline1->botright.x) / 2,\n                   (outline1->topleft.y + outline1->botright.y) / 2);\n    int mid_prod1 = mid_pt1.cross(vertical);\n    int min_prod1, max_prod1;\n    outline1->MinMaxCrossProduct(vertical, &min_prod1, &max_prod1);\n    for (TESSLINE *outline2 = outline1->next; outline2 != nullptr; outline2 = outline2->next) {\n      if (outline2->is_hole) {\n        continue; // Holes do not count as separable.\n      }\n      TPOINT mid_pt2((outline2->topleft.x + outline2->botright.x) / 2,\n                     (outline2->topleft.y + outline2->botright.y) / 2);\n      int mid_prod2 = mid_pt2.cross(vertical);\n      int min_prod2, max_prod2;\n      outline2->MinMaxCrossProduct(vertical, &min_prod2, &max_prod2);\n      int mid_gap = abs(mid_prod2 - mid_prod1);\n      int overlap = std::min(max_prod1, max_prod2) - std::max(min_prod1, min_prod2);\n      if (mid_gap - overlap / 4 > max_gap) {\n        max_gap = mid_gap - overlap / 4;\n        *location = mid_pt1;\n        *location += mid_pt2;\n        *location /= 2;\n      }\n    }\n  }\n  // Use the y component of the vertical vector as an approximation to its\n  // length.\n  return max_gap > vertical.y;\n}\n\n/**********************************************************************\n * divide_blobs\n *\n * Create two blobs by grouping the outlines in the appropriate blob.\n * The outlines that are beyond the location point are moved to the\n * other blob.  
The ones whose x location is less than that point are\n * retained in the original blob.\n **********************************************************************/\nvoid divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, const TPOINT &location) {\n  TPOINT vertical = italic_blob ? kDivisibleVerticalItalic : kDivisibleVerticalUpright;\n  TESSLINE *outline1 = nullptr;\n  TESSLINE *outline2 = nullptr;\n\n  TESSLINE *outline = blob->outlines;\n  blob->outlines = nullptr;\n  int location_prod = location.cross(vertical);\n\n  while (outline != nullptr) {\n    TPOINT mid_pt((outline->topleft.x + outline->botright.x) / 2,\n                  (outline->topleft.y + outline->botright.y) / 2);\n    int mid_prod = mid_pt.cross(vertical);\n    if (mid_prod < location_prod) {\n      // Outline is in left blob.\n      if (outline1) {\n        outline1->next = outline;\n      } else {\n        blob->outlines = outline;\n      }\n      outline1 = outline;\n    } else {\n      // Outline is in right blob.\n      if (outline2) {\n        outline2->next = outline;\n      } else {\n        other_blob->outlines = outline;\n      }\n      outline2 = outline;\n    }\n    outline = outline->next;\n  }\n\n  if (outline1) {\n    outline1->next = nullptr;\n  }\n  if (outline2) {\n    outline2->next = nullptr;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/blobs.h",
    "content": "/******************************************************************************\n *\n * File:        blobs.h\n * Description: Blob definition\n * Author:      Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef BLOBS_H\n#define BLOBS_H\n\n#include \"clst.h\"       // for CLIST_ITERATOR, CLISTIZEH\n#include \"normalis.h\"   // for DENORM\n#include \"points.h\"     // for FCOORD, ICOORD\n#include \"rect.h\"       // for TBOX\n#include \"scrollview.h\" // for ScrollView, ScrollView::Color\n\n#include <tesseract/publictypes.h> // for OcrEngineMode\n\n#include \"tesstypes.h\" // for TDimension\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass BLOCK;\nclass C_BLOB;\nclass C_OUTLINE;\nclass LLSQ;\nclass ROW;\nclass WERD;\n\n/*----------------------------------------------------------------------\n              T y p e s\n----------------------------------------------------------------------*/\n\nstruct TPOINT {\n  TPOINT() = default;\n  TPOINT(TDimension vx, TDimension vy) : x(vx), y(vy) {}\n  TPOINT(const ICOORD &ic) : x(ic.x()), y(ic.y()) {}\n\n  void operator+=(const TPOINT &other) {\n    x += other.x;\n    y += other.y;\n  }\n  void operator/=(int divisor) {\n    x /= divisor;\n    y /= divisor;\n  }\n  bool operator==(const TPOINT &other) const {\n    return x == other.x && 
y == other.y;\n  }\n  // Returns true when the two line segments cross each other.\n  // (Moved from outlines.cpp).\n  static bool IsCrossed(const TPOINT &a0, const TPOINT &a1, const TPOINT &b0, const TPOINT &b1);\n\n  // Assign the difference from point p1 to point p2.\n  void diff(const TPOINT &p1, const TPOINT &p2) {\n    x = p1.x - p2.x;\n    y = p1.y - p2.y;\n  }\n\n  // Return cross product.\n  int cross(const TPOINT &other) const {\n    return x * other.y - y * other.x;\n  }\n\n  // Return scalar or dot product.\n  int dot(const TPOINT &other) const {\n    return x * other.x + y * other.y;\n  }\n\n  // Calculate square of vector length.\n  int length2() const {\n    return x * x + y * y;\n  }\n\n  TDimension x = 0; // absolute x coord.\n  TDimension y = 0; // absolute y coord.\n};\n\nusing VECTOR = TPOINT; // structure for coordinates.\n\nstruct EDGEPT {\n  EDGEPT() = default;\n  EDGEPT(const EDGEPT &src) : next(nullptr), prev(nullptr) {\n    CopyFrom(src);\n  }\n  EDGEPT &operator=(const EDGEPT &src) {\n    CopyFrom(src);\n    return *this;\n  }\n  // Copies the data elements, but leaves the pointers untouched.\n  void CopyFrom(const EDGEPT &src) {\n    pos = src.pos;\n    vec = src.vec;\n    is_hidden = src.is_hidden;\n    runlength = src.runlength;\n    dir = src.dir;\n    fixed = src.fixed;\n    src_outline = src.src_outline;\n    start_step = src.start_step;\n    step_count = src.step_count;\n  }\n  // Returns the squared distance between the points, with the x-component\n  // weighted by x_factor.\n  int WeightedDistance(const EDGEPT &other, int x_factor) const {\n    int x_dist = pos.x - other.pos.x;\n    int y_dist = pos.y - other.pos.y;\n    return x_dist * x_dist * x_factor + y_dist * y_dist;\n  }\n  // Returns true if the positions are equal.\n  bool EqualPos(const EDGEPT &other) const {\n    return pos == other.pos;\n  }\n  // Returns the bounding box of the outline segment from *this to *end.\n  // Ignores hidden edge flags.\n  TBOX 
SegmentBox(const EDGEPT *end) const {\n    TBOX box(pos.x, pos.y, pos.x, pos.y);\n    const EDGEPT *pt = this;\n    do {\n      pt = pt->next;\n      if (pt->pos.x < box.left()) {\n        box.set_left(pt->pos.x);\n      }\n      if (pt->pos.x > box.right()) {\n        box.set_right(pt->pos.x);\n      }\n      if (pt->pos.y < box.bottom()) {\n        box.set_bottom(pt->pos.y);\n      }\n      if (pt->pos.y > box.top()) {\n        box.set_top(pt->pos.y);\n      }\n    } while (pt != end && pt != this);\n    return box;\n  }\n  // Returns the area of the outline segment from *this to *end.\n  // Ignores hidden edge flags.\n  int SegmentArea(const EDGEPT *end) const {\n    int area = 0;\n    const EDGEPT *pt = this->next;\n    do {\n      TPOINT origin_vec(pt->pos.x - pos.x, pt->pos.y - pos.y);\n      area += origin_vec.cross(pt->vec);\n      pt = pt->next;\n    } while (pt != end && pt != this);\n    return area;\n  }\n  // Returns true if the number of points in the outline segment from *this to\n  // *end is less that min_points and false if we get back to *this first.\n  // Ignores hidden edge flags.\n  bool ShortNonCircularSegment(int min_points, const EDGEPT *end) const {\n    int count = 0;\n    const EDGEPT *pt = this;\n    do {\n      if (pt == end) {\n        return true;\n      }\n      pt = pt->next;\n      ++count;\n    } while (pt != this && count <= min_points);\n    return false;\n  }\n\n  // Accessors to hide or reveal a cut edge from feature extractors.\n  void Hide() {\n    is_hidden = true;\n  }\n  void Reveal() {\n    is_hidden = false;\n  }\n  bool IsHidden() const {\n    return is_hidden;\n  }\n  void MarkChop() {\n    dir = 1;\n  }\n  bool IsChopPt() const {\n    return dir != 0;\n  }\n\n  TPOINT pos; // position\n  VECTOR vec; // vector to next point\n  bool is_hidden = false;\n  uint8_t runlength = 0;\n  int8_t dir = 0;\n  bool fixed = false;\n  EDGEPT *next = nullptr;           // anticlockwise element\n  EDGEPT *prev = nullptr;           // 
clockwise element\n  C_OUTLINE *src_outline = nullptr; // Outline it came from.\n  // The following fields are not used if src_outline is nullptr.\n  int start_step = 0; // Location of pos in src_outline.\n  int step_count = 0; // Number of steps used (may wrap around).\n};\n\n// For use in chop and findseam to keep a list of which EDGEPTs were inserted.\nCLISTIZEH(EDGEPT)\n\nstruct TESSLINE {\n  TESSLINE() : is_hole(false), loop(nullptr), next(nullptr) {}\n  TESSLINE(const TESSLINE &src) : loop(nullptr), next(nullptr) {\n    CopyFrom(src);\n  }\n  ~TESSLINE() {\n    Clear();\n  }\n  TESSLINE &operator=(const TESSLINE &src) {\n    CopyFrom(src);\n    return *this;\n  }\n  // Consume the circular list of EDGEPTs to make a TESSLINE.\n  static TESSLINE *BuildFromOutlineList(EDGEPT *outline);\n  // Copies the data and the outline, but leaves next untouched.\n  void CopyFrom(const TESSLINE &src);\n  // Deletes owned data.\n  void Clear();\n  // Normalize in-place using the DENORM.\n  void Normalize(const DENORM &denorm);\n  // Rotates by the given rotation in place.\n  void Rotate(const FCOORD rotation);\n  // Moves by the given vec in place.\n  void Move(const ICOORD vec);\n  // Scales by the given factor in place.\n  void Scale(float factor);\n  // Sets up the start and vec members of the loop from the pos members.\n  void SetupFromPos();\n  // Recomputes the bounding box from the points in the loop.\n  void ComputeBoundingBox();\n  // Computes the min and max cross product of the outline points with the\n  // given vec and returns the results in min_xp and max_xp. 
Geometrically\n  // this is the left and right edge of the outline perpendicular to the\n  // given direction, but to get the distance units correct, you would\n  // have to divide by the modulus of vec.\n  void MinMaxCrossProduct(const TPOINT vec, int *min_xp, int *max_xp) const;\n\n  TBOX bounding_box() const;\n  // Returns true if *this and other have equal bounding boxes.\n  bool SameBox(const TESSLINE &other) const {\n    return topleft == other.topleft && botright == other.botright;\n  }\n  // Returns true if the given line segment crosses any outline of this blob.\n  bool SegmentCrosses(const TPOINT &pt1, const TPOINT &pt2) const {\n    if (Contains(pt1) && Contains(pt2)) {\n      EDGEPT *pt = loop;\n      do {\n        if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) {\n          return true;\n        }\n        pt = pt->next;\n      } while (pt != loop);\n    }\n    return false;\n  }\n  // Returns true if the point is contained within the outline box.\n  bool Contains(const TPOINT &pt) const {\n    return topleft.x <= pt.x && pt.x <= botright.x && botright.y <= pt.y && pt.y <= topleft.y;\n  }\n\n#ifndef GRAPHICS_DISABLED\n  void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);\n#endif // !GRAPHICS_DISABLED\n\n  // Returns the first outline point that has a different src_outline to its\n  // predecessor, or, if all the same, the lowest indexed point.\n  EDGEPT *FindBestStartPt() const;\n\n  int BBArea() const {\n    return (botright.x - topleft.x) * (topleft.y - botright.y);\n  }\n\n  TPOINT topleft;  // Top left of loop.\n  TPOINT botright; // Bottom right of loop.\n  TPOINT start;    // Start of loop.\n  bool is_hole;    // True if this is a hole/child outline.\n  EDGEPT *loop;    // Edgeloop.\n  TESSLINE *next;  // Next outline in blob.\n};                 // Outline structure.\n\nstruct TBLOB {\n  TBLOB() : outlines(nullptr) {}\n  TBLOB(const TBLOB &src) : outlines(nullptr) {\n    CopyFrom(src);\n  }\n  ~TBLOB() 
{\n    Clear();\n  }\n  TBLOB &operator=(const TBLOB &src) {\n    CopyFrom(src);\n    return *this;\n  }\n  // Factory to build a TBLOB from a C_BLOB with polygonal approximation along\n  // the way. If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB\n  // contain pointers to the input C_OUTLINEs that enable higher-resolution\n  // feature extraction that does not use the polygonal approximation.\n  static TBLOB *PolygonalCopy(bool allow_detailed_fx, C_BLOB *src);\n  // Factory builds a blob with no outlines, but copies the other member data.\n  static TBLOB *ShallowCopy(const TBLOB &src);\n  // Normalizes the blob for classification only if needed.\n  // (Normally this means a non-zero classify rotation.)\n  // If no Normalization is needed, then nullptr is returned, and the input blob\n  // can be used directly. Otherwise a new TBLOB is returned which must be\n  // deleted after use.\n  TBLOB *ClassifyNormalizeIfNeeded() const;\n\n  // Copies the data and the outlines, but leaves next untouched.\n  void CopyFrom(const TBLOB &src);\n  // Deletes owned data.\n  void Clear();\n  // Sets up the built-in DENORM and normalizes the blob in-place.\n  // For parameters see DENORM::SetupNormalization, plus the inverse flag for\n  // this blob and the Pix for the full image.\n  void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor,\n                 float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift,\n                 float final_yshift, bool inverse, Image pix);\n  // Rotates by the given rotation in place.\n  void Rotate(const FCOORD rotation);\n  // Moves by the given vec in place.\n  void Move(const ICOORD vec);\n  // Scales by the given factor in place.\n  void Scale(float factor);\n  // Recomputes the bounding boxes of the outlines.\n  void ComputeBoundingBoxes();\n\n  // Returns the number of outlines.\n  int NumOutlines() const;\n\n  TBOX bounding_box() const;\n\n  // Returns true if the given 
line segment crosses any outline of this blob.\n  bool SegmentCrossesOutline(const TPOINT &pt1, const TPOINT &pt2) const {\n    for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n      if (outline->SegmentCrosses(pt1, pt2)) {\n        return true;\n      }\n    }\n    return false;\n  }\n  // Returns true if the point is contained within any of the outline boxes.\n  bool Contains(const TPOINT &pt) const {\n    for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n      if (outline->Contains(pt)) {\n        return true;\n      }\n    }\n    return false;\n  }\n\n  // Finds and deletes any duplicate outlines in this blob, without deleting\n  // their EDGEPTs.\n  void EliminateDuplicateOutlines();\n\n  // Swaps the outlines of *this and next if needed to keep the centers in\n  // increasing x.\n  void CorrectBlobOrder(TBLOB *next);\n\n  const DENORM &denorm() const {\n    return denorm_;\n  }\n\n#ifndef GRAPHICS_DISABLED\n  void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);\n#endif // !GRAPHICS_DISABLED\n\n  int BBArea() const {\n    int total_area = 0;\n    for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {\n      total_area += outline->BBArea();\n    }\n    return total_area;\n  }\n\n  // Computes the center of mass and second moments for the old baseline and\n  // 2nd moment normalizations. Returns the outline length.\n  // The input denorm should be the normalizations that have been applied from\n  // the image to the current state of this TBLOB.\n  int ComputeMoments(FCOORD *center, FCOORD *second_moments) const;\n  // Computes the precise bounding box of the coords that are generated by\n  // GetEdgeCoords. 
This may be different from the bounding box of the polygon.\n  void GetPreciseBoundingBox(TBOX *precise_box) const;\n  // Adds edges to the given vectors.\n  // For all the edge steps in all the outlines, or polygonal approximation\n  // where there are no edge steps, collects the steps into x_coords/y_coords.\n  // x_coords is a collection of the x-coords of vertical edges for each\n  // y-coord starting at box.bottom().\n  // y_coords is a collection of the y-coords of horizontal edges for each\n  // x-coord starting at box.left().\n  // Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.\n  // Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.\n  void GetEdgeCoords(const TBOX &box, std::vector<std::vector<int>> &x_coords,\n                     std::vector<std::vector<int>> &y_coords) const;\n\n  TESSLINE *outlines; // List of outlines in blob.\n\nprivate: // TODO(rays) Someday the data members will be private too.\n  // For all the edge steps in all the outlines, or polygonal approximation\n  // where there are no edge steps, collects the steps into the bounding_box,\n  // llsq and/or the x_coords/y_coords. 
Both are used in different kinds of\n  // normalization.\n  // For a description of x_coords, y_coords, see GetEdgeCoords above.\n  void CollectEdges(const TBOX &box, TBOX *bounding_box, LLSQ *llsq,\n                    std::vector<std::vector<int>> *x_coords,\n                    std::vector<std::vector<int>> *y_coords) const;\n\nprivate:\n  // DENORM indicating the transformations that this blob has undergone so far.\n  DENORM denorm_;\n}; // Blob structure.\n\nstruct TWERD {\n  TWERD() : latin_script(false) {}\n  TWERD(const TWERD &src) {\n    CopyFrom(src);\n  }\n  ~TWERD() {\n    Clear();\n  }\n  TWERD &operator=(const TWERD &src) {\n    CopyFrom(src);\n    return *this;\n  }\n  // Factory to build a TWERD from a (C_BLOB) WERD, with polygonal\n  // approximation along the way.\n  static TWERD *PolygonalCopy(bool allow_detailed_fx, WERD *src);\n  // Baseline normalizes the blobs in-place, recording the normalization in the\n  // DENORMs in the blobs.\n  void BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height,\n                   float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint,\n                   const TBOX *norm_box, DENORM *word_denorm);\n  // Copies the data and the blobs, but leaves next untouched.\n  void CopyFrom(const TWERD &src);\n  // Deletes owned data.\n  void Clear();\n  // Recomputes the bounding boxes of the blobs.\n  void ComputeBoundingBoxes();\n\n  // Returns the number of blobs in the word.\n  unsigned NumBlobs() const {\n    return blobs.size();\n  }\n  TBOX bounding_box() const;\n\n  // Merges the blobs from start to end, not including end, and deletes\n  // the blobs between start and end.\n  void MergeBlobs(unsigned start, unsigned end);\n\n#ifndef GRAPHICS_DISABLED\n  void plot(ScrollView *window);\n#endif // !GRAPHICS_DISABLED\n\n  std::vector<TBLOB *> blobs; // Blobs in word.\n  bool latin_script;          // This word is in a latin-based 
script.\n};\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n// TODO(rays) Make divisible_blob and divide_blobs members of TBLOB.\nbool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location);\n\nvoid divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, const TPOINT &location);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/blread.cpp",
    "content": "/**********************************************************************\n * File:        blread.cpp  (Formerly pdread.c)\n * Description: Friend function of BLOCK to read the uscan pd file.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"blread.h\"\n\n#include \"ocrblock.h\"  // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)\n#include \"scanutils.h\" // for tfscanf\n\n#include <cstdio> // for fclose, fopen, FILE\n\nnamespace tesseract {\n\n#define UNLV_EXT \".uzn\" // unlv zone file\n\n/**********************************************************************\n * read_unlv_file\n *\n * Read a whole unlv zone file to make a list of blocks.\n **********************************************************************/\n\nbool read_unlv_file(   // print list of sides\n    std::string &name, // basename of file\n    int32_t xsize,     // image size\n    int32_t ysize,     // image size\n    BLOCK_LIST *blocks // output list\n) {\n  FILE *pdfp;   // file pointer\n  BLOCK *block; // current block\n  int x;        // current top-down coords\n  int y;\n  int width; // of current block\n  int height;\n  BLOCK_IT block_it = blocks; // block iterator\n\n  name += UNLV_EXT; // add extension\n  if ((pdfp = fopen(name.c_str(), \"rb\")) == nullptr) {\n    return false; // didn't read one\n  } else {\n  
  while (tfscanf(pdfp, \"%d %d %d %d %*s\", &x, &y, &width, &height) >= 4) {\n      // make rect block\n      block = new BLOCK(name.c_str(), true, 0, 0, static_cast<int16_t>(x),\n                        static_cast<int16_t>(ysize - y - height), static_cast<int16_t>(x + width),\n                        static_cast<int16_t>(ysize - y));\n      // on end of list\n      block_it.add_to_end(block);\n    }\n    fclose(pdfp);\n  }\n  tprintf(\"UZN file %s loaded.\\n\", name.c_str());\n  return true;\n}\n\nvoid FullPageBlock(int width, int height, BLOCK_LIST *blocks) {\n  BLOCK_IT block_it(blocks);\n  auto *block = new BLOCK(\"\", true, 0, 0, 0, 0, width, height);\n  block_it.add_to_end(block);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/blread.h",
    "content": "/**********************************************************************\n * File:        blread.h  (Formerly pdread.h)\n * Description: Friend function of BLOCK to read the uscan pd file.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef BLREAD_H\n#define BLREAD_H\n\n#include <cstdint> // for int32_t\n#include <string>  // for std::string\n\nnamespace tesseract {\n\nclass BLOCK_LIST;\n\nbool read_unlv_file(   // print list of sides\n    std::string &name, // basename of file\n    int32_t xsize,     // image size\n    int32_t ysize,     // image size\n    BLOCK_LIST *blocks // output list\n);\n\nvoid FullPageBlock(int width, int height, BLOCK_LIST *blocks);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/boxread.cpp",
    "content": "/**********************************************************************\n * File:        boxread.cpp\n * Description: Read data from a box file.\n * Author:      Ray Smith\n *\n * (C) Copyright 2007, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"boxread.h\"\n\n#include \"errcode.h\" // for ERRCODE, TESSEXIT\n#include \"fileerr.h\" // for CANTOPENFILE\n#include \"rect.h\"    // for TBOX\n#include \"tprintf.h\" // for tprintf\n\n#include <tesseract/unichar.h> // for UNICHAR\n#include \"helpers.h\"           // for chomp_string\n\n#include <climits> // for INT_MAX\n#include <cstring> // for strchr, strcmp\n#include <fstream> // for std::ifstream\n#include <locale>  // for std::locale::classic\n#include <sstream> // for std::stringstream\n#include <string>  // for std::string\n\nnamespace tesseract {\n\n// Special char code used to identify multi-blob labels.\nstatic const char *kMultiBlobLabelCode = \"WordStr\";\n\n// Returns the box file name corresponding to the given image_filename.\nstatic std::string BoxFileName(const char *image_filename) {\n  std::string box_filename = image_filename;\n  size_t length = box_filename.length();\n  std::string last = (length > 8) ? 
box_filename.substr(length - 8) : \"\";\n  if (last == \".bin.png\" || last == \".nrm.png\" || last == \".raw.png\") {\n    box_filename.resize(length - 8);\n  } else {\n    size_t lastdot = box_filename.find_last_of('.');\n    if (lastdot < length) {\n      box_filename.resize(lastdot);\n    }\n  }\n  box_filename += \".box\";\n  return box_filename;\n}\n\n// Open the boxfile based on the given image filename.\nFILE *OpenBoxFile(const char *fname) {\n  std::string filename = BoxFileName(fname);\n  FILE *box_file = nullptr;\n  if (!(box_file = fopen(filename.c_str(), \"rb\"))) {\n    CANTOPENFILE.error(\"read_next_box\", TESSEXIT, \"Can't open box file %s\", filename.c_str());\n    tprintf(\"Can't open box file %s\", filename.c_str());\n  }\n  return box_file;\n}\n\n// Reads all boxes from the given filename.\n// Reads a specific target_page number if >= 0, or all pages otherwise.\n// Skips blanks if skip_blanks is true.\n// The UTF-8 label of the box is put in texts, and the full box definition as\n// a string is put in box_texts, with the corresponding page number in pages.\n// Each of the output vectors is optional (may be nullptr).\n// Returns false if no boxes are found.\nbool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,\n                  std::vector<std::string> *texts, std::vector<std::string> *box_texts,\n                  std::vector<int> *pages) {\n  std::ifstream input(BoxFileName(filename), std::ios::in | std::ios::binary);\n  if (input.fail()) {\n    tprintf(\"Cannot read box data from '%s'.\\n\", BoxFileName(filename).c_str());\n    tprintf(\"Does it exists?\\n\");\n    return false;\n  }\n  std::vector<char> box_data(std::istreambuf_iterator<char>(input), {});\n  if (box_data.empty()) {\n    tprintf(\"No box data found in '%s'.\\n\", BoxFileName(filename).c_str());\n    return false;\n  }\n  // Convert the array of bytes to a string, so it can be used by the parser.\n  box_data.push_back('\\0');\n  
return ReadMemBoxes(target_page, skip_blanks, &box_data[0],\n                      /*continue_on_failure*/ true, boxes, texts, box_texts, pages);\n}\n\n// Reads all boxes from the string. Otherwise, as ReadAllBoxes.\nbool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,\n                  std::vector<TBOX> *boxes, std::vector<std::string> *texts,\n                  std::vector<std::string> *box_texts, std::vector<int> *pages) {\n  std::string box_str(box_data);\n  std::vector<std::string> lines = split(box_str, '\\n');\n  if (lines.empty()) {\n    return false;\n  }\n  int num_boxes = 0;\n  for (auto &line : lines) {\n    int page = 0;\n    std::string utf8_str;\n    TBOX box;\n    if (!ParseBoxFileStr(line.c_str(), &page, utf8_str, &box)) {\n      if (continue_on_failure) {\n        continue;\n      } else {\n        return false;\n      }\n    }\n    if (skip_blanks && (utf8_str == \" \" || utf8_str == \"\\t\")) {\n      continue;\n    }\n    if (target_page >= 0 && page != target_page) {\n      continue;\n    }\n    if (boxes != nullptr) {\n      boxes->push_back(box);\n    }\n    if (texts != nullptr) {\n      texts->push_back(utf8_str);\n    }\n    if (box_texts != nullptr) {\n      std::string full_text;\n      MakeBoxFileStr(utf8_str.c_str(), box, target_page, full_text);\n      box_texts->push_back(full_text);\n    }\n    if (pages != nullptr) {\n      pages->push_back(page);\n    }\n    ++num_boxes;\n  }\n  return num_boxes > 0;\n}\n\n// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.\n// Box files are used ONLY DURING TRAINING, but by both processes of\n// creating tr files with tesseract, and unicharset_extractor.\n// ReadNextBox factors out the code to interpret a line of a box\n// file so that applybox and unicharset_extractor interpret the same way.\n// This function returns the next valid box file utf8 string and coords\n// and returns true, or false on eof (and closes the 
file).\n// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks\n// for valid utf-8 and allows space or tab between fields.\n// utf8_str is set with the unichar string, and bounding box with the box.\n// If there are page numbers in the file, it reads them all.\nbool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) {\n  return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);\n}\n\n// As ReadNextBox above, but get a specific page number. (0-based)\n// Use -1 to read any page number. Files without page number all\n// read as if they are page 0.\nbool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,\n                 TBOX *bounding_box) {\n  int page = 0;\n  char buff[kBoxReadBufSize]; // boxfile read buffer\n  char *buffptr = buff;\n\n  while (fgets(buff, sizeof(buff) - 1, box_file)) {\n    (*line_number)++;\n\n    buffptr = buff;\n    const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr);\n    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {\n      buffptr += 3; // Skip unicode file designation.\n    }\n    // Check for blank lines in box file\n    if (*buffptr == '\\n' || *buffptr == '\\0') {\n      continue;\n    }\n    // Skip blank boxes.\n    if (*buffptr == ' ' || *buffptr == '\\t') {\n      continue;\n    }\n    if (*buffptr != '\\0') {\n      if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) {\n        tprintf(\"Box file format error on line %i; ignored\\n\", *line_number);\n        continue;\n      }\n      if (target_page >= 0 && target_page != page) {\n        continue; // Not on the appropriate page.\n      }\n      return true; // Successfully read a box.\n    }\n  }\n  fclose(box_file);\n  return false; // EOF\n}\n\n// Parses the given box file string into a page_number, utf8_str, and\n// bounding_box. 
Returns true on a successful parse.\n// The box file is assumed to contain box definitions, one per line, of the\n// following format for blob-level boxes:\n//   <UTF8 str> <left> <bottom> <right> <top> <page id>\n// and for word/line-level boxes:\n//   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>\n// See applyybox.cpp for more information.\nbool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,\n                     TBOX *bounding_box) {\n  *bounding_box = TBOX(); // Initialize it to empty.\n  utf8_str = \"\";\n  char uch[kBoxReadBufSize];\n  const char *buffptr = boxfile_str;\n  // Read the unichar without messing up on Tibetan.\n  // According to issue 253 the utf-8 surrogates 85 and A0 are treated\n  // as whitespace by sscanf, so it is more reliable to just find\n  // ascii space and tab.\n  int uch_len = 0;\n  // Skip unicode file designation, if present.\n  const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr);\n  if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {\n    buffptr += 3;\n  }\n  // Allow a single blank as the UTF-8 string. Check for empty string and\n  // then blindly eat the first character.\n  if (*buffptr == '\\0') {\n    return false;\n  }\n  do {\n    uch[uch_len++] = *buffptr++;\n  } while (*buffptr != '\\0' && *buffptr != ' ' && *buffptr != '\\t' &&\n           uch_len < kBoxReadBufSize - 1);\n  uch[uch_len] = '\\0';\n  if (*buffptr != '\\0') {\n    ++buffptr;\n  }\n  int x_min = INT_MAX;\n  int y_min = INT_MAX;\n  int x_max = INT_MIN;\n  int y_max = INT_MIN;\n  *page_number = 0;\n  std::stringstream stream(buffptr);\n  stream.imbue(std::locale::classic());\n  stream >> x_min;\n  stream >> y_min;\n  stream >> x_max;\n  stream >> y_max;\n  stream >> *page_number;\n  if (x_max < x_min || y_max < y_min) {\n    tprintf(\"Bad box coordinates in boxfile string! 
%s\\n\", ubuf);\n    return false;\n  }\n  // Test for long space-delimited string label.\n  if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr, '#')) != nullptr) {\n    strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);\n    uch[kBoxReadBufSize - 1] = '\\0'; // Prevent buffer overrun.\n    chomp_string(uch);\n    uch_len = strlen(uch);\n  }\n  // Validate UTF8 by making unichars with it.\n  int used = 0;\n  while (used < uch_len) {\n    tesseract::UNICHAR ch(uch + used, uch_len - used);\n    int new_used = ch.utf8_len();\n    if (new_used == 0) {\n      tprintf(\"Bad UTF-8 str %s starts with 0x%02x at col %d\\n\", uch + used, uch[used], used + 1);\n      return false;\n    }\n    used += new_used;\n  }\n  utf8_str = uch;\n  if (x_min > x_max) {\n    std::swap(x_min, x_max);\n  }\n  if (y_min > y_max) {\n    std::swap(y_min, y_max);\n  }\n  bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);\n  return true; // Successfully read a box.\n}\n\n// Creates a box file string from a unichar string, TBOX and page number.\nvoid MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str) {\n  box_str = unichar_str;\n  box_str += \" \" + std::to_string(box.left());\n  box_str += \" \" + std::to_string(box.bottom());\n  box_str += \" \" + std::to_string(box.right());\n  box_str += \" \" + std::to_string(box.top());\n  box_str += \" \" + std::to_string(page_num);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/boxread.h",
    "content": "/**********************************************************************\n * File:        boxread.h\n * Description: Read data from a box file.\n * Author:      Ray Smith\n *\n * (C) Copyright 2007, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCUTIL_BOXREAD_H_\n#define TESSERACT_CCUTIL_BOXREAD_H_\n\n#include <cstdio> // for FILE\n#include <string> // for std::string\n#include <vector> // for std::vector\n\n#include <tesseract/export.h> // for TESS_API\n\nnamespace tesseract {\n\nclass TBOX;\n\n// Size of buffer used to read a line from a box file.\nconst int kBoxReadBufSize = 1024;\n\n// Open the boxfile based on the given image filename.\n// Returns nullptr if the box file cannot be opened.\nTESS_API\nFILE *OpenBoxFile(const char *filename);\n\n// Reads all boxes from the given filename.\n// Reads a specific target_page number if >= 0, or all pages otherwise.\n// Skips blanks if skip_blanks is true.\n// The UTF-8 label of the box is put in texts, and the full box definition as\n// a string is put in box_texts, with the corresponding page number in pages.\n// Each of the output vectors is optional (may be nullptr).\n// Returns false if no boxes are found.\nbool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,\n                  std::vector<std::string> *texts, 
std::vector<std::string> *box_texts,\n                  std::vector<int> *pages);\n\n// Reads all boxes from the string. Otherwise, as ReadAllBoxes.\n// continue_on_failure allows reading to continue even if an invalid box is\n// encountered and will return true if it succeeds in reading some boxes.\n// It otherwise gives up and returns false on encountering an invalid box.\nTESS_API\nbool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,\n                  std::vector<TBOX> *boxes, std::vector<std::string> *texts,\n                  std::vector<std::string> *box_texts, std::vector<int> *pages);\n\n// ReadNextBox factors out the code to interpret a line of a box\n// file so that applybox and unicharset_extractor interpret the same way.\n// This function returns the next valid box file utf8 string and coords\n// and returns true, or false on eof (and closes the file).\n// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks\n// for valid utf-8 and allows space or tab between fields.\n// utf8_str is set with the unichar string, and bounding box with the box.\n// If there are page numbers in the file, it reads them all.\nTESS_API\nbool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box);\n// As ReadNextBox above, but get a specific page number. (0-based)\n// Use -1 to read any page number. Files without page number all\n// read as if they are page 0.\nTESS_API\nbool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,\n                 TBOX *bounding_box);\n\n// Parses the given box file string into a page_number, utf8_str, and\n// bounding_box. 
Returns true on a successful parse.\nTESS_API\nbool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,\n                     TBOX *bounding_box);\n\n// Creates a box file string from a unichar string, TBOX and page number.\nTESS_API\nvoid MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str);\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_BOXREAD_H_\n"
  },
  {
    "path": "src/ccstruct/boxword.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        boxword.cpp\n// Description: Class to represent the bounding boxes of the output.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"boxword.h\"\n#include \"blobs.h\"\n#include \"host.h\" // for NearlyEqual\n#include \"normalis.h\"\n#include \"ocrblock.h\"\n#include \"pageres.h\"\n\nnamespace tesseract {\n\n// Clip output boxes to input blob boxes for bounds that are within this\n// tolerance. 
Otherwise, the blob may be chopped and we have to just use\n// the word bounding box.\nconst int kBoxClipTolerance = 2;\n\nBoxWord::BoxWord() : length_(0) {}\n\nBoxWord::BoxWord(const BoxWord &src) {\n  CopyFrom(src);\n}\n\nBoxWord &BoxWord::operator=(const BoxWord &src) {\n  CopyFrom(src);\n  return *this;\n}\n\nvoid BoxWord::CopyFrom(const BoxWord &src) {\n  bbox_ = src.bbox_;\n  length_ = src.length_;\n  boxes_.clear();\n  boxes_.reserve(length_);\n  for (unsigned i = 0; i < length_; ++i) {\n    boxes_.push_back(src.boxes_[i]);\n  }\n}\n\n// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to\n// switch back to original image coordinates.\nBoxWord *BoxWord::CopyFromNormalized(TWERD *tessword) {\n  auto *boxword = new BoxWord();\n  // Count the blobs.\n  boxword->length_ = tessword->NumBlobs();\n  // Allocate memory.\n  boxword->boxes_.reserve(boxword->length_);\n\n  for (unsigned b = 0; b < boxword->length_; ++b) {\n    TBLOB *tblob = tessword->blobs[b];\n    TBOX blob_box;\n    for (TESSLINE *outline = tblob->outlines; outline != nullptr;\n         outline = outline->next) {\n      EDGEPT *edgept = outline->loop;\n      // Iterate over the edges.\n      do {\n        if (!edgept->IsHidden() || !edgept->prev->IsHidden()) {\n          ICOORD pos(edgept->pos.x, edgept->pos.y);\n          TPOINT denormed;\n          tblob->denorm().DenormTransform(nullptr, edgept->pos, &denormed);\n          pos.set_x(denormed.x);\n          pos.set_y(denormed.y);\n          TBOX pt_box(pos, pos);\n          blob_box += pt_box;\n        }\n        edgept = edgept->next;\n      } while (edgept != outline->loop);\n    }\n    boxword->boxes_.push_back(blob_box);\n  }\n  boxword->ComputeBoundingBox();\n  return boxword;\n}\n\n// Clean up the bounding boxes from the polygonal approximation by\n// expanding slightly, then clipping to the blobs from the original_word\n// that overlap. 
If not null, the block provides the inverse rotation.\nvoid BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) {\n  for (unsigned i = 0; i < length_; ++i) {\n    TBOX box = boxes_[i];\n    // Expand by a single pixel, as the poly approximation error is 1 pixel.\n    box =\n        TBOX(box.left() - 1, box.bottom() - 1, box.right() + 1, box.top() + 1);\n    // Now find the original box that matches.\n    TBOX original_box;\n    C_BLOB_IT b_it(original_word->cblob_list());\n    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n      TBOX blob_box = b_it.data()->bounding_box();\n      if (block != nullptr) {\n        blob_box.rotate(block->re_rotation());\n      }\n      if (blob_box.major_overlap(box)) {\n        original_box += blob_box;\n      }\n    }\n    if (!original_box.null_box()) {\n      if (NearlyEqual<int>(original_box.left(), box.left(),\n                           kBoxClipTolerance)) {\n        box.set_left(original_box.left());\n      }\n      if (NearlyEqual<int>(original_box.right(), box.right(),\n                           kBoxClipTolerance)) {\n        box.set_right(original_box.right());\n      }\n      if (NearlyEqual<int>(original_box.top(), box.top(), kBoxClipTolerance)) {\n        box.set_top(original_box.top());\n      }\n      if (NearlyEqual<int>(original_box.bottom(), box.bottom(),\n                           kBoxClipTolerance)) {\n        box.set_bottom(original_box.bottom());\n      }\n    }\n    original_box = original_word->bounding_box();\n    if (block != nullptr) {\n      original_box.rotate(block->re_rotation());\n    }\n    boxes_[i] = box.intersection(original_box);\n  }\n  ComputeBoundingBox();\n}\n\n// Merges the boxes from start to end, not including end, and deletes\n// the boxes between start and end.\nvoid BoxWord::MergeBoxes(unsigned start, unsigned end) {\n  start = ClipToRange(start, 0U, length_);\n  end = ClipToRange(end, 0U, length_);\n  if (end <= start + 1) {\n    return;\n  }\n  
for (unsigned i = start + 1; i < end; ++i) {\n    boxes_[start] += boxes_[i];\n  }\n  int shrinkage = end - 1 - start;\n  length_ -= shrinkage;\n  for (unsigned i = start + 1; i < length_; ++i) {\n    boxes_[i] = boxes_[i + shrinkage];\n  }\n  boxes_.resize(length_);\n}\n\n// Inserts a new box before the given index.\n// Recomputes the bounding box.\nvoid BoxWord::InsertBox(unsigned index, const TBOX &box) {\n  if (index < length_) {\n    boxes_.insert(boxes_.begin() + index, box);\n  } else {\n    boxes_.push_back(box);\n  }\n  length_ = boxes_.size();\n  ComputeBoundingBox();\n}\n\n// Changes the box at the given index to the new box.\n// Recomputes the bounding box.\nvoid BoxWord::ChangeBox(unsigned index, const TBOX &box) {\n  boxes_[index] = box;\n  ComputeBoundingBox();\n}\n\n// Deletes the box with the given index, and shuffles up the rest.\n// Recomputes the bounding box.\nvoid BoxWord::DeleteBox(unsigned index) {\n  ASSERT_HOST(index < length_);\n  boxes_.erase(boxes_.begin() + index);\n  --length_;\n  ComputeBoundingBox();\n}\n\n// Deletes all the boxes stored in BoxWord.\nvoid BoxWord::DeleteAllBoxes() {\n  length_ = 0;\n  boxes_.clear();\n  bbox_ = TBOX();\n}\n\n// Computes the bounding box of the word.\nvoid BoxWord::ComputeBoundingBox() {\n  bbox_ = TBOX();\n  for (unsigned i = 0; i < length_; ++i) {\n    bbox_ += boxes_[i];\n  }\n}\n\n// This and other putatively are the same, so call the (permanent) callback\n// for each blob index where the bounding boxes match.\n// The callback is deleted on completion.\nvoid BoxWord::ProcessMatchedBlobs(const TWERD &other,\n                                  const std::function<void(int)> &cb) const {\n  for (unsigned i = 0; i < length_ && i < other.NumBlobs(); ++i) {\n    TBOX blob_box = other.blobs[i]->bounding_box();\n    if (blob_box == boxes_[i]) {\n      cb(i);\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccstruct/boxword.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        boxword.h\n// Description: Class to represent the bounding boxes of the output.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CSTRUCT_BOXWORD_H_\n#define TESSERACT_CSTRUCT_BOXWORD_H_\n\n#include \"rect.h\" // for TBOX\n\n#include <functional> // for std::function\n\nnamespace tesseract {\n\nclass BLOCK;\nclass WERD;\nstruct TWERD;\n\n// Class to hold an array of bounding boxes for an output word and\n// the bounding box of the whole word.\nclass BoxWord {\npublic:\n  BoxWord();\n  explicit BoxWord(const BoxWord &src);\n  ~BoxWord() = default;\n\n  BoxWord &operator=(const BoxWord &src);\n\n  void CopyFrom(const BoxWord &src);\n\n  // Factory to build a BoxWord from a TWERD using the DENORMs on each blob to\n  // switch back to original image coordinates.\n  static BoxWord *CopyFromNormalized(TWERD *tessword);\n\n  // Clean up the bounding boxes from the polygonal approximation by\n  // expanding slightly, then clipping to the blobs from the original_word\n  // that overlap. 
If not null, the block provides the inverse rotation.\n  void ClipToOriginalWord(const BLOCK *block, WERD *original_word);\n\n  // Merges the boxes from start to end, not including end, and deletes\n  // the boxes between start and end.\n  void MergeBoxes(unsigned start, unsigned end);\n\n  // Inserts a new box before the given index.\n  // Recomputes the bounding box.\n  void InsertBox(unsigned index, const TBOX &box);\n\n  // Changes the box at the given index to the new box.\n  // Recomputes the bounding box.\n  void ChangeBox(unsigned index, const TBOX &box);\n\n  // Deletes the box with the given index, and shuffles up the rest.\n  // Recomputes the bounding box.\n  void DeleteBox(unsigned index);\n\n  // Deletes all the boxes stored in BoxWord.\n  void DeleteAllBoxes();\n\n  // This and other putatively are the same, so call the (permanent) callback\n  // for each blob index where the bounding boxes match.\n  // The callback is deleted on completion.\n  void ProcessMatchedBlobs(const TWERD &other,\n                           const std::function<void(int)> &cb) const;\n\n  const TBOX &bounding_box() const {\n    return bbox_;\n  }\n  unsigned length() const {\n    return length_;\n  }\n  const TBOX &BlobBox(unsigned index) const {\n    return boxes_[index];\n  }\n\nprivate:\n  void ComputeBoundingBox();\n\n  TBOX bbox_;\n  unsigned length_;\n  std::vector<TBOX> boxes_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CSTRUCT_BOXWORD_H_\n"
  },
  {
    "path": "src/ccstruct/ccstruct.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ccstruct.cpp\n// Description: ccstruct class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"ccstruct.h\"\n\nnamespace tesseract {\n\n// APPROXIMATIONS of the fractions of the character cell taken by\n// the descenders, ascenders, and x-height.\nconst double CCStruct::kDescenderFraction = 0.25;\nconst double CCStruct::kXHeightFraction = 0.5;\nconst double CCStruct::kAscenderFraction = 0.25;\nconst double CCStruct::kXHeightCapRatio =\n    CCStruct::kXHeightFraction / (CCStruct::kXHeightFraction + CCStruct::kAscenderFraction);\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/ccstruct.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ccstruct.h\n// Description: ccstruct class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCSTRUCT_CCSTRUCT_H_\n#define TESSERACT_CCSTRUCT_CCSTRUCT_H_\n\n#include \"ccutil.h\" // for CCUtil\n\nnamespace tesseract {\n\nclass CCStruct : public CCUtil {\npublic:\n  // Globally accessible constants.\n  // APPROXIMATIONS of the fractions of the character cell taken by\n  // the descenders, ascenders, and x-height.\n  static const double kDescenderFraction; // = 0.25;\n  static const double kXHeightFraction;   // = 0.5;\n  static const double kAscenderFraction;  // = 0.25;\n  // Derived value giving the x-height as a fraction of cap-height.\n  static const double kXHeightCapRatio; // = XHeight/(XHeight + Ascender).\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_CCSTRUCT_H_\n"
  },
  {
    "path": "src/ccstruct/coutln.cpp",
    "content": "/**********************************************************************\n * File:        coutln.cpp  (Formerly coutline.c)\n * Description: Code for the C_OUTLINE class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"coutln.h\"\n\n#include \"arrayaccess.h\" // for GET_DATA_BYTE\n#include \"blobs.h\"       // for TPOINT\n#include \"crakedge.h\"    // for CRACKEDGE\n#include \"environ.h\"     // for l_uint32\n#include \"errcode.h\"     // for ASSERT_HOST\n#include \"normalis.h\"    // for DENORM\n\n#include \"helpers.h\" // for ClipToRange, IntCastRounded, Modulo\n\n#include <allheaders.h> // for pixSetPixel, pixGetData, pixRasterop, pixGe...\n#include \"pix.h\"        // for Pix (ptr only), PIX_DST, PIX_NOT\n\n#include <algorithm> // for max, min\n#include <cmath>     // for abs\n#include <cstdlib>   // for abs\n#include <cstring>   // for memset, memcpy, memmove\n\nnamespace tesseract {\n\nICOORD C_OUTLINE::step_coords[4] = {ICOORD(-1, 0), ICOORD(0, -1), ICOORD(1, 0), ICOORD(0, 1)};\n\n/**\n * @name C_OUTLINE::C_OUTLINE\n *\n * Constructor to build a C_OUTLINE from a CRACKEDGE LOOP.\n * @param startpt outline to convert\n * @param bot_left 
bounding box\n * @param top_right bounding box\n * @param length length of loop\n */\n\nC_OUTLINE::C_OUTLINE(CRACKEDGE *startpt, ICOORD bot_left, ICOORD top_right, int16_t length)\n    : box(bot_left, top_right), start(startpt->pos), offsets(nullptr) {\n  int16_t stepindex; // index to step\n  CRACKEDGE *edgept; // current point\n\n  stepcount = length; // no of steps\n  if (length == 0) {\n    return;\n  }\n  // get memory\n  steps.resize(step_mem());\n  edgept = startpt;\n\n  for (stepindex = 0; stepindex < length; stepindex++) {\n    // set compact step\n    set_step(stepindex, edgept->stepdir);\n    edgept = edgept->next;\n  }\n}\n\n/**\n * @name C_OUTLINE::C_OUTLINE\n *\n * Constructor to build a C_OUTLINE from a C_OUTLINE_FRAG.\n */\nC_OUTLINE::C_OUTLINE(\n    // constructor\n    // steps to copy\n    ICOORD startpt, DIR128 *new_steps,\n    int16_t length // length of loop\n    )\n    : start(startpt), offsets(nullptr) {\n  int8_t dirdiff;    // direction difference\n  DIR128 prevdir;    // previous direction\n  DIR128 dir;        // current direction\n  DIR128 lastdir;    // dir of last step\n  TBOX new_box;      // easy bounding\n  int16_t stepindex; // index to step\n  int16_t srcindex;  // source steps\n  ICOORD pos;        // current position\n\n  pos = startpt;\n  stepcount = length; // No. of steps.\n  ASSERT_HOST(length >= 0);\n  steps.resize(step_mem()); // Get memory.\n\n  lastdir = new_steps[length - 1];\n  prevdir = lastdir;\n  for (stepindex = 0, srcindex = 0; srcindex < length; stepindex++, srcindex++) {\n    new_box = TBOX(pos, pos);\n    box += new_box;\n    // copy steps\n    dir = new_steps[srcindex];\n    set_step(stepindex, dir);\n    dirdiff = dir - prevdir;\n    pos += step(stepindex);\n    if ((dirdiff == 64 || dirdiff == -64) && stepindex > 0) {\n      stepindex -= 2; // cancel there-and-back\n      prevdir = stepindex >= 0 ? 
step_dir(stepindex) : lastdir;\n    } else {\n      prevdir = dir;\n    }\n  }\n  ASSERT_HOST(pos.x() == startpt.x() && pos.y() == startpt.y());\n  do {\n    dirdiff = step_dir(stepindex - 1) - step_dir(0);\n    if (dirdiff == 64 || dirdiff == -64) {\n      start += step(0);\n      stepindex -= 2; // cancel there-and-back\n      for (int i = 0; i < stepindex; ++i) {\n        set_step(i, step_dir(i + 1));\n      }\n    }\n  } while (stepindex > 1 && (dirdiff == 64 || dirdiff == -64));\n  stepcount = stepindex;\n  ASSERT_HOST(stepcount >= 4);\n}\n\n/**\n * @name C_OUTLINE::C_OUTLINE\n *\n * Constructor to build a C_OUTLINE from a rotation of a C_OUTLINE.\n * @param srcline outline to rotate\n * @param rotation rotate to coord\n */\n\nC_OUTLINE::C_OUTLINE(C_OUTLINE *srcline, FCOORD rotation) : offsets(nullptr) {\n  TBOX new_box;      // easy bounding\n  int16_t stepindex; // index to step\n  int16_t dirdiff;   // direction change\n  ICOORD pos;        // current position\n  ICOORD prevpos;    // previous dest point\n\n  ICOORD destpos;                // destination point\n  int16_t destindex = INT16_MAX; // index to step\n  DIR128 dir;                    // coded direction\n  uint8_t new_step;\n\n  stepcount = srcline->stepcount * 2;\n  if (stepcount == 0) {\n    box = srcline->box;\n    box.rotate(rotation);\n    return;\n  }\n  // get memory\n  steps.resize(step_mem());\n\n  for (int iteration = 0; iteration < 2; ++iteration) {\n    DIR128 round1 = iteration == 0 ? 32 : 0;\n    DIR128 round2 = iteration != 0 ? 
32 : 0;\n    pos = srcline->start;\n    prevpos = pos;\n    prevpos.rotate(rotation);\n    start = prevpos;\n    box = TBOX(start, start);\n    destindex = 0;\n    for (stepindex = 0; stepindex < srcline->stepcount; stepindex++) {\n      pos += srcline->step(stepindex);\n      destpos = pos;\n      destpos.rotate(rotation);\n      //  tprintf(\"%i %i %i %i \", destpos.x(), destpos.y(), pos.x(), pos.y());\n      while (destpos.x() != prevpos.x() || destpos.y() != prevpos.y()) {\n        dir = DIR128(FCOORD(destpos - prevpos));\n        dir += 64; // turn to step style\n        new_step = dir.get_dir();\n        //  tprintf(\" %i\\n\", new_step);\n        if (new_step & 31) {\n          set_step(destindex++, dir + round1);\n          prevpos += step(destindex - 1);\n          if (destindex < 2 ||\n              ((dirdiff = step_dir(destindex - 1) - step_dir(destindex - 2)) != -64 &&\n               dirdiff != 64)) {\n            set_step(destindex++, dir + round2);\n            prevpos += step(destindex - 1);\n          } else {\n            prevpos -= step(destindex - 1);\n            destindex--;\n            prevpos -= step(destindex - 1);\n            set_step(destindex - 1, dir + round2);\n            prevpos += step(destindex - 1);\n          }\n        } else {\n          set_step(destindex++, dir);\n          prevpos += step(destindex - 1);\n        }\n        while (destindex >= 2 &&\n               ((dirdiff = step_dir(destindex - 1) - step_dir(destindex - 2)) == -64 ||\n                dirdiff == 64)) {\n          prevpos -= step(destindex - 1);\n          prevpos -= step(destindex - 2);\n          destindex -= 2; // Forget u turn\n        }\n        // ASSERT_HOST(prevpos.x() == destpos.x() && prevpos.y() ==\n        // destpos.y());\n        new_box = TBOX(destpos, destpos);\n        box += new_box;\n      }\n    }\n    ASSERT_HOST(destpos.x() == start.x() && destpos.y() == start.y());\n    while (destindex > 1) {\n      dirdiff = step_dir(destindex - 1) 
- step_dir(0);\n      if (dirdiff != 64 && dirdiff != -64) {\n        break;\n      }\n      start += step(0);\n      destindex -= 2;\n      for (int i = 0; i < destindex; ++i) {\n        set_step(i, step_dir(i + 1));\n      }\n    }\n    if (destindex >= 4) {\n      break;\n    }\n  }\n  ASSERT_HOST(destindex <= stepcount);\n  stepcount = destindex;\n  destpos = start;\n  for (stepindex = 0; stepindex < stepcount; stepindex++) {\n    destpos += step(stepindex);\n  }\n  ASSERT_HOST(destpos.x() == start.x() && destpos.y() == start.y());\n}\n\n// Build a fake outline, given just a bounding box and append to the list.\nvoid C_OUTLINE::FakeOutline(const TBOX &box, C_OUTLINE_LIST *outlines) {\n  C_OUTLINE_IT ol_it(outlines);\n  // Make a C_OUTLINE from the bounds. This is a bit of a hack,\n  // as there is no outline, just a bounding box, but it works nicely.\n  CRACKEDGE start;\n  start.pos = box.topleft();\n  auto *outline = new C_OUTLINE(&start, box.topleft(), box.botright(), 0);\n  ol_it.add_to_end(outline);\n}\n\n/**\n * @name C_OUTLINE::area\n *\n * Compute the area of the outline.\n */\n\nint32_t C_OUTLINE::area() const {\n  int stepindex;       // current step\n  int32_t total_steps; // steps to do\n  int32_t total;       // total area\n  ICOORD pos;          // position of point\n  ICOORD next_step;    // step to next pix\n  // We aren't going to modify the list, or its contents, but there is\n  // no const iterator.\n  C_OUTLINE_IT it(const_cast<C_OUTLINE_LIST *>(&children));\n\n  pos = start_pos();\n  total_steps = pathlength();\n  total = 0;\n  for (stepindex = 0; stepindex < total_steps; stepindex++) {\n    // all intersected\n    next_step = step(stepindex);\n    if (next_step.x() < 0) {\n      total += pos.y();\n    } else if (next_step.x() > 0) {\n      total -= pos.y();\n    }\n    pos += next_step;\n  }\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    total += it.data()->area(); // add areas of children\n  }\n\n  return 
total;\n}\n\n/**\n * @name C_OUTLINE::perimeter\n *\n * Compute the perimeter of the outline and its first level children.\n */\n\nint32_t C_OUTLINE::perimeter() const {\n  int32_t total_steps; // Return value.\n  // We aren't going to modify the list, or its contents, but there is\n  // no const iterator.\n  C_OUTLINE_IT it(const_cast<C_OUTLINE_LIST *>(&children));\n\n  total_steps = pathlength();\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    total_steps += it.data()->pathlength(); // Add perimeters of children.\n  }\n\n  return total_steps;\n}\n\n/**\n * @name C_OUTLINE::outer_area\n *\n * Compute the area of the outline.\n */\n\nint32_t C_OUTLINE::outer_area() const {\n  int stepindex;       // current step\n  int32_t total_steps; // steps to do\n  int32_t total;       // total area\n  ICOORD pos;          // position of point\n  ICOORD next_step;    // step to next pix\n\n  pos = start_pos();\n  total_steps = pathlength();\n  if (total_steps == 0) {\n    return box.area();\n  }\n  total = 0;\n  for (stepindex = 0; stepindex < total_steps; stepindex++) {\n    // all intersected\n    next_step = step(stepindex);\n    if (next_step.x() < 0) {\n      total += pos.y();\n    } else if (next_step.x() > 0) {\n      total -= pos.y();\n    }\n    pos += next_step;\n  }\n\n  return total;\n}\n\n/**\n * @name C_OUTLINE::count_transitions\n *\n * Compute the number of x and y maxes and mins in the outline.\n * @param threshold winding number on size\n */\n\nint32_t C_OUTLINE::count_transitions(int32_t threshold) {\n  bool first_was_max_x; // what was first\n  bool first_was_max_y;\n  bool looking_for_max_x; // what is next\n  bool looking_for_min_x;\n  bool looking_for_max_y; // what is next\n  bool looking_for_min_y;\n  int stepindex;       // current step\n  int32_t total_steps; // steps to do\n                       // current limits\n  int32_t max_x, min_x, max_y, min_y;\n  int32_t initial_x, initial_y; // initial limits\n  int32_t total;           
     // total changes\n  ICOORD pos;                   // position of point\n  ICOORD next_step;             // step to next pix\n\n  pos = start_pos();\n  total_steps = pathlength();\n  total = 0;\n  max_x = min_x = pos.x();\n  max_y = min_y = pos.y();\n  looking_for_max_x = true;\n  looking_for_min_x = true;\n  looking_for_max_y = true;\n  looking_for_min_y = true;\n  first_was_max_x = false;\n  first_was_max_y = false;\n  initial_x = pos.x();\n  initial_y = pos.y(); // stop uninit warning\n  for (stepindex = 0; stepindex < total_steps; stepindex++) {\n    // all intersected\n    next_step = step(stepindex);\n    pos += next_step;\n    if (next_step.x() < 0) {\n      if (looking_for_max_x && pos.x() < min_x) {\n        min_x = pos.x();\n      }\n      if (looking_for_min_x && max_x - pos.x() > threshold) {\n        if (looking_for_max_x) {\n          initial_x = max_x;\n          first_was_max_x = false;\n        }\n        total++;\n        looking_for_max_x = true;\n        looking_for_min_x = false;\n        min_x = pos.x(); // reset min\n      }\n    } else if (next_step.x() > 0) {\n      if (looking_for_min_x && pos.x() > max_x) {\n        max_x = pos.x();\n      }\n      if (looking_for_max_x && pos.x() - min_x > threshold) {\n        if (looking_for_min_x) {\n          initial_x = min_x; // remember first min\n          first_was_max_x = true;\n        }\n        total++;\n        looking_for_max_x = false;\n        looking_for_min_x = true;\n        max_x = pos.x();\n      }\n    } else if (next_step.y() < 0) {\n      if (looking_for_max_y && pos.y() < min_y) {\n        min_y = pos.y();\n      }\n      if (looking_for_min_y && max_y - pos.y() > threshold) {\n        if (looking_for_max_y) {\n          initial_y = max_y; // remember first max\n          first_was_max_y = false;\n        }\n        total++;\n        looking_for_max_y = true;\n        looking_for_min_y = false;\n        min_y = pos.y(); // reset min\n      }\n    } else {\n      if 
(looking_for_min_y && pos.y() > max_y) {\n        max_y = pos.y();\n      }\n      if (looking_for_max_y && pos.y() - min_y > threshold) {\n        if (looking_for_min_y) {\n          initial_y = min_y; // remember first min\n          first_was_max_y = true;\n        }\n        total++;\n        looking_for_max_y = false;\n        looking_for_min_y = true;\n        max_y = pos.y();\n      }\n    }\n  }\n  if (first_was_max_x && looking_for_min_x) {\n    if (max_x - initial_x > threshold) {\n      total++;\n    } else {\n      total--;\n    }\n  } else if (!first_was_max_x && looking_for_max_x) {\n    if (initial_x - min_x > threshold) {\n      total++;\n    } else {\n      total--;\n    }\n  }\n  if (first_was_max_y && looking_for_min_y) {\n    if (max_y - initial_y > threshold) {\n      total++;\n    } else {\n      total--;\n    }\n  } else if (!first_was_max_y && looking_for_max_y) {\n    if (initial_y - min_y > threshold) {\n      total++;\n    } else {\n      total--;\n    }\n  }\n\n  return total;\n}\n\n/**\n * @name C_OUTLINE::operator<\n *\n * @return true if the left operand is inside the right one.\n * @param other other outline\n */\n\nbool C_OUTLINE::operator<(const C_OUTLINE &other) const {\n  int16_t count = 0; // winding count\n  ICOORD pos;        // position of point\n  int32_t stepindex; // index to cstep\n\n  if (!box.overlap(other.box)) {\n    return false; // can't be contained\n  }\n  if (stepcount == 0) {\n    return other.box.contains(this->box);\n  }\n\n  pos = start;\n  for (stepindex = 0; stepindex < stepcount && (count = other.winding_number(pos)) == INTERSECTING;\n       stepindex++) {\n    pos += step(stepindex); // try all points\n  }\n  if (count == INTERSECTING) {\n    // all intersected\n    pos = other.start;\n    for (stepindex = 0;\n         stepindex < other.stepcount && (count = winding_number(pos)) == INTERSECTING;\n         stepindex++) {\n      // try other way round\n      pos += other.step(stepindex);\n    }\n    return 
count == INTERSECTING || count == 0;\n  }\n  return count != 0;\n}\n\n/**\n * @name C_OUTLINE::winding_number\n *\n * @return the winding number of the outline around the given point.\n * @param point point to wind around\n */\n\nint16_t C_OUTLINE::winding_number(ICOORD point) const {\n  int16_t stepindex; // index to cstep\n  int16_t count;     // winding count\n  ICOORD vec;        // to current point\n  ICOORD stepvec;    // step vector\n  int32_t cross;     // cross product\n\n  vec = start - point; // vector to it\n  count = 0;\n  for (stepindex = 0; stepindex < stepcount; stepindex++) {\n    stepvec = step(stepindex); // get the step\n                               // crossing the line\n    if (vec.y() <= 0 && vec.y() + stepvec.y() > 0) {\n      cross = vec * stepvec; // cross product\n      if (cross > 0) {\n        count++; // crossing right half\n      } else if (cross == 0) {\n        return INTERSECTING; // going through point\n      }\n    } else if (vec.y() > 0 && vec.y() + stepvec.y() <= 0) {\n      cross = vec * stepvec;\n      if (cross < 0) {\n        count--; // crossing back\n      } else if (cross == 0) {\n        return INTERSECTING; // illegal\n      }\n    }\n    vec += stepvec; // sum vectors\n  }\n  return count; // winding number\n}\n\n/**\n * C_OUTLINE::turn_direction\n *\n * @return the sum direction delta of the outline.\n */\n\nint16_t C_OUTLINE::turn_direction() const { // winding number\n  DIR128 prevdir;                           // previous direction\n  DIR128 dir;                               // current direction\n  int16_t stepindex;                        // index to cstep\n  int8_t dirdiff;                           // direction difference\n  int16_t count;                            // winding count\n\n  if (stepcount == 0) {\n    return 128;\n  }\n  count = 0;\n  prevdir = step_dir(stepcount - 1);\n  for (stepindex = 0; stepindex < stepcount; stepindex++) {\n    dir = step_dir(stepindex);\n    dirdiff = dir - prevdir;\n    
ASSERT_HOST(dirdiff == 0 || dirdiff == 32 || dirdiff == -32);\n    count += dirdiff;\n    prevdir = dir;\n  }\n  ASSERT_HOST(count == 128 || count == -128);\n  return count; // winding number\n}\n\n/**\n * @name C_OUTLINE::reverse\n *\n * Reverse the direction of an outline.\n */\n\nvoid C_OUTLINE::reverse() {      // reverse direction\n  DIR128 halfturn = MODULUS / 2; // amount to shift\n  DIR128 stepdir;                // direction of step\n  int16_t stepindex;             // index to cstep\n  int16_t farindex;              // index to other side\n  int16_t halfsteps;             // half of stepcount\n\n  halfsteps = (stepcount + 1) / 2;\n  for (stepindex = 0; stepindex < halfsteps; stepindex++) {\n    farindex = stepcount - stepindex - 1;\n    stepdir = step_dir(stepindex);\n    set_step(stepindex, step_dir(farindex) + halfturn);\n    set_step(farindex, stepdir + halfturn);\n  }\n}\n\n/**\n * @name C_OUTLINE::move\n *\n * Move C_OUTLINE by vector\n * @param vec vector to reposition OUTLINE by\n */\n\nvoid C_OUTLINE::move(const ICOORD vec) {\n  C_OUTLINE_IT it(&children); // iterator\n\n  box.move(vec);\n  start += vec;\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->move(vec); // move child outlines\n  }\n}\n\n/**\n * Returns true if *this and its children are legally nested.\n * The outer area of a child should have the opposite sign to the\n * parent. 
If not, it means we have discarded an outline in between\n * (probably due to excessive length).\n */\nbool C_OUTLINE::IsLegallyNested() const {\n  if (stepcount == 0) {\n    return true;\n  }\n  int64_t parent_area = outer_area();\n  // We aren't going to modify the list, or its contents, but there is\n  // no const iterator.\n  C_OUTLINE_IT child_it(const_cast<C_OUTLINE_LIST *>(&children));\n  for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) {\n    const C_OUTLINE *child = child_it.data();\n    if (child->outer_area() * parent_area > 0 || !child->IsLegallyNested()) {\n      return false;\n    }\n  }\n  return true;\n}\n\n/**\n * If this outline is smaller than the given min_size, delete this and\n * remove from its list, via *it, after checking that *it points to this.\n * Otherwise, if any children of this are too small, delete them.\n * On entry, *it must be an iterator pointing to this. If this gets deleted\n * then this is extracted from *it, so an iteration can continue.\n * @param min_size minimum size for outline\n * @param it outline iterator\n */\nvoid C_OUTLINE::RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it) {\n  if (box.width() < min_size || box.height() < min_size) {\n    ASSERT_HOST(this == it->data());\n    delete it->extract(); // Too small so get rid of it and any children.\n  } else if (!children.empty()) {\n    // Search the children of this, deleting any that are too small.\n    C_OUTLINE_IT child_it(&children);\n    for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) {\n      C_OUTLINE *child = child_it.data();\n      child->RemoveSmallRecursive(min_size, &child_it);\n    }\n  }\n}\n\n// Factored out helpers below are used only by ComputeEdgeOffsets to operate\n// on data from an 8-bit Pix, and assume that any input x and/or y are already\n// constrained to be legal Pix coordinates.\n\n/**\n * Helper computes the local 2-D gradient (dx, dy) from the 2x2 cell centered\n * on the given 
(x,y). If the cell would go outside the image, it is padded\n * with white.\n */\nstatic void ComputeGradient(const l_uint32 *data, int wpl, int x, int y, int width, int height,\n                            ICOORD *gradient) {\n  const l_uint32 *line = data + y * wpl;\n  int pix_x_y = x < width && y < height ? GET_DATA_BYTE(line, x) : 255;\n  int pix_x_prevy = x < width && y > 0 ? GET_DATA_BYTE(line - wpl, x) : 255;\n  int pix_prevx_prevy = x > 0 && y > 0 ? GET_DATA_BYTE(line - wpl, x - 1) : 255;\n  int pix_prevx_y = x > 0 && y < height ? GET_DATA_BYTE(line, x - 1) : 255;\n  gradient->set_x(pix_x_y + pix_x_prevy - (pix_prevx_y + pix_prevx_prevy));\n  gradient->set_y(pix_x_prevy + pix_prevx_prevy - (pix_x_y + pix_prevx_y));\n}\n\n/**\n * Helper evaluates a vertical difference, (x,y) - (x,y-1), returning true if\n * the difference, matches diff_sign and updating the best_diff, best_sum,\n * best_y if a new max.\n */\nstatic bool EvaluateVerticalDiff(const l_uint32 *data, int wpl, int diff_sign, int x, int y,\n                                 int height, int *best_diff, int *best_sum, int *best_y) {\n  if (y <= 0 || y >= height) {\n    return false;\n  }\n  const l_uint32 *line = data + y * wpl;\n  int pixel1 = GET_DATA_BYTE(line - wpl, x);\n  int pixel2 = GET_DATA_BYTE(line, x);\n  int diff = (pixel2 - pixel1) * diff_sign;\n  if (diff > *best_diff) {\n    *best_diff = diff;\n    *best_sum = pixel1 + pixel2;\n    *best_y = y;\n  }\n  return diff > 0;\n}\n\n/**\n * Helper evaluates a horizontal difference, (x,y) - (x-1,y), where y is implied\n * by the input image line, returning true if the difference matches diff_sign\n * and updating the best_diff, best_sum, best_x if a new max.\n */\nstatic bool EvaluateHorizontalDiff(const l_uint32 *line, int diff_sign, int x, int width,\n                                   int *best_diff, int *best_sum, int *best_x) {\n  if (x <= 0 || x >= width) {\n    return false;\n  }\n  int pixel1 = GET_DATA_BYTE(line, x - 1);\n  int pixel2 = 
GET_DATA_BYTE(line, x);\n  int diff = (pixel2 - pixel1) * diff_sign;\n  if (diff > *best_diff) {\n    *best_diff = diff;\n    *best_sum = pixel1 + pixel2;\n    *best_x = x;\n  }\n  return diff > 0;\n}\n\n/**\n * Adds sub-pixel resolution EdgeOffsets for the outline if the supplied\n * pix is 8-bit. Does nothing otherwise.\n * Operation: Consider the following near-horizontal line:\n * @verbatim\n *   _________\n *            |________\n *                     |________\n * @endverbatim\n * At *every* position along this line, the gradient direction will be close\n * to vertical. Extrapoaltion/interpolation of the position of the threshold\n * that was used to binarize the image gives a more precise vertical position\n * for each horizontal step, and the conflict in step direction and gradient\n * direction can be used to ignore the vertical steps.\n */\nvoid C_OUTLINE::ComputeEdgeOffsets(int threshold, Image pix) {\n  if (pixGetDepth(pix) != 8) {\n    return;\n  }\n  const l_uint32 *data = pixGetData(pix);\n  int wpl = pixGetWpl(pix);\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  bool negative = flag(COUT_INVERSE);\n  delete[] offsets;\n  offsets = new EdgeOffset[stepcount];\n  ICOORD pos = start;\n  ICOORD prev_gradient;\n  ComputeGradient(data, wpl, pos.x(), height - pos.y(), width, height, &prev_gradient);\n  for (int s = 0; s < stepcount; ++s) {\n    ICOORD step_vec = step(s);\n    TPOINT pt1(pos);\n    pos += step_vec;\n    TPOINT pt2(pos);\n    ICOORD next_gradient;\n    ComputeGradient(data, wpl, pos.x(), height - pos.y(), width, height, &next_gradient);\n    // Use the sum of the prev and next as the working gradient.\n    ICOORD gradient = prev_gradient + next_gradient;\n    // best_diff will be manipulated to be always positive.\n    int best_diff = 0;\n    // offset will be the extrapolation of the location of the greyscale\n    // threshold from the edge with the largest difference, relative to the\n    // location of the binary 
edge.\n    int offset = 0;\n    if (pt1.y == pt2.y && abs(gradient.y()) * 2 >= abs(gradient.x())) {\n      // Horizontal step. diff_sign == 1 indicates black above.\n      int diff_sign = (pt1.x > pt2.x) == negative ? 1 : -1;\n      int x = std::min(pt1.x, pt2.x);\n      int y = height - pt1.y;\n      int best_sum = 0;\n      int best_y = y;\n      EvaluateVerticalDiff(data, wpl, diff_sign, x, y, height, &best_diff, &best_sum, &best_y);\n      // Find the strongest edge.\n      int test_y = y;\n      do {\n        ++test_y;\n      } while (EvaluateVerticalDiff(data, wpl, diff_sign, x, test_y, height, &best_diff, &best_sum,\n                                    &best_y));\n      test_y = y;\n      do {\n        --test_y;\n      } while (EvaluateVerticalDiff(data, wpl, diff_sign, x, test_y, height, &best_diff, &best_sum,\n                                    &best_y));\n      offset = diff_sign * (best_sum / 2 - threshold) + (y - best_y) * best_diff;\n    } else if (pt1.x == pt2.x && abs(gradient.x()) * 2 >= abs(gradient.y())) {\n      // Vertical step. diff_sign == 1 indicates black on the left.\n      int diff_sign = (pt1.y > pt2.y) == negative ? 
1 : -1;\n      int x = pt1.x;\n      int y = height - std::max(pt1.y, pt2.y);\n      const l_uint32 *line = pixGetData(pix) + y * wpl;\n      int best_sum = 0;\n      int best_x = x;\n      EvaluateHorizontalDiff(line, diff_sign, x, width, &best_diff, &best_sum, &best_x);\n      // Find the strongest edge.\n      int test_x = x;\n      do {\n        ++test_x;\n      } while (\n          EvaluateHorizontalDiff(line, diff_sign, test_x, width, &best_diff, &best_sum, &best_x));\n      test_x = x;\n      do {\n        --test_x;\n      } while (\n          EvaluateHorizontalDiff(line, diff_sign, test_x, width, &best_diff, &best_sum, &best_x));\n      offset = diff_sign * (threshold - best_sum / 2) + (best_x - x) * best_diff;\n    }\n    offsets[s].offset_numerator = ClipToRange<int>(offset, -INT8_MAX, INT8_MAX);\n    offsets[s].pixel_diff = ClipToRange<int>(best_diff, 0, UINT8_MAX);\n    if (negative) {\n      gradient = -gradient;\n    }\n    // Compute gradient angle quantized to 256 directions, rotated by 64 (pi/2)\n    // to convert from gradient direction to edge direction.\n    offsets[s].direction = Modulo(FCOORD::binary_angle_plus_pi(gradient.angle()) + 64, 256);\n    prev_gradient = next_gradient;\n  }\n}\n\n/**\n * Adds sub-pixel resolution EdgeOffsets for the outline using only\n * a binary image source.\n *\n * Runs a sliding window of 5 edge steps over the outline, maintaining a count\n * of the number of steps in each of the 4 directions in the window, and a\n * sum of the x or y position of each step (as appropriate to its direction.)\n * Ignores single-count steps EXCEPT the sharp U-turn and smoothes out the\n * perpendicular direction. 
Eg\n * @verbatim\n * ___              ___       Chain code from the left:\n *    |___    ___   ___|      222122212223221232223000\n *        |___|  |_|          Corresponding counts of each direction:\n *                          0   00000000000000000123\n *                          1   11121111001111100000\n *                          2   44434443443333343321\n *                          3   00000001111111112111\n * Count of direction at center 41434143413313143313\n * Step gets used?              YNYYYNYYYNYYNYNYYYyY (y= U-turn exception)\n * Path redrawn showing only the used points:\n * ___              ___\n *     ___    ___   ___|\n *         ___    _\n * @endverbatim\n * Sub-pixel edge position cannot be shown well with ASCII-art, but each\n * horizontal step's y position is the mean of the y positions of the steps\n * in the same direction in the sliding window, which makes a much smoother\n * outline, without losing important detail.\n */\nvoid C_OUTLINE::ComputeBinaryOffsets() {\n  delete[] offsets;\n  offsets = new EdgeOffset[stepcount];\n  // Count of the number of steps in each direction in the sliding window.\n  int dir_counts[4];\n  // Sum of the positions (y for a horizontal step, x for vertical) in each\n  // direction in the sliding window.\n  int pos_totals[4];\n  memset(dir_counts, 0, sizeof(dir_counts));\n  memset(pos_totals, 0, sizeof(pos_totals));\n  ICOORD pos = start;\n  ICOORD tail_pos = pos;\n  // tail_pos is the trailing position, with the next point to be lost from\n  // the window.\n  tail_pos -= step(stepcount - 1);\n  tail_pos -= step(stepcount - 2);\n  // head_pos is the leading position, with the next point to be added to the\n  // window.\n  ICOORD head_pos = tail_pos;\n  // Set up the initial window with 4 points in [-2, 2)\n  for (int s = -2; s < 2; ++s) {\n    increment_step(s, 1, &head_pos, dir_counts, pos_totals);\n  }\n  for (int s = 0; s < stepcount; pos += step(s++)) {\n    // At step s, s in the middle of [s-2, s+2].\n    
increment_step(s + 2, 1, &head_pos, dir_counts, pos_totals);\n    int dir_index = chain_code(s);\n    ICOORD step_vec = step(s);\n    int best_diff = 0;\n    int offset = 0;\n    // Use only steps that have a count of >=2 OR the strong U-turn with a\n    // single d and 2 at d-1 and 2 at d+1 (mod 4).\n    if (dir_counts[dir_index] >= 2 ||\n        (dir_counts[dir_index] == 1 && dir_counts[Modulo(dir_index - 1, 4)] == 2 &&\n         dir_counts[Modulo(dir_index + 1, 4)] == 2)) {\n      // Valid step direction.\n      best_diff = dir_counts[dir_index];\n      int edge_pos = step_vec.x() == 0 ? pos.x() : pos.y();\n      // The offset proposes that the actual step should be positioned at\n      // the mean position of the steps in the window of the same direction.\n      // See ASCII art above.\n      offset = pos_totals[dir_index] - best_diff * edge_pos;\n    }\n    offsets[s].offset_numerator = ClipToRange<int>(offset, -INT8_MAX, INT8_MAX);\n    offsets[s].pixel_diff = ClipToRange<int>(best_diff, 0, UINT8_MAX);\n    // The direction is just the vector from start to end of the window.\n    FCOORD direction(head_pos.x() - tail_pos.x(), head_pos.y() - tail_pos.y());\n    offsets[s].direction = direction.to_direction();\n    increment_step(s - 2, -1, &tail_pos, dir_counts, pos_totals);\n  }\n}\n\n/**\n * Renders the outline to the given pix, with left and top being\n * the coords of the upper-left corner of the pix.\n */\nvoid C_OUTLINE::render(int left, int top, Image pix) const {\n  ICOORD pos = start;\n  for (int stepindex = 0; stepindex < stepcount; ++stepindex) {\n    ICOORD next_step = step(stepindex);\n    if (next_step.y() < 0) {\n      pixRasterop(pix, 0, top - pos.y(), pos.x() - left, 1, PIX_NOT(PIX_DST), nullptr, 0, 0);\n    } else if (next_step.y() > 0) {\n      pixRasterop(pix, 0, top - pos.y() - 1, pos.x() - left, 1, PIX_NOT(PIX_DST), nullptr, 0, 0);\n    }\n    pos += next_step;\n  }\n}\n\n/**\n * Renders just the outline to the given pix (no fill), with 
left and top\n * being the coords of the upper-left corner of the pix.\n * @param left coord\n * @param top coord\n * @param pix the pix to outline\n */\nvoid C_OUTLINE::render_outline(int left, int top, Image pix) const {\n  ICOORD pos = start;\n  for (int stepindex = 0; stepindex < stepcount; ++stepindex) {\n    ICOORD next_step = step(stepindex);\n    if (next_step.y() < 0) {\n      pixSetPixel(pix, pos.x() - left, top - pos.y(), 1);\n    } else if (next_step.y() > 0) {\n      pixSetPixel(pix, pos.x() - left - 1, top - pos.y() - 1, 1);\n    } else if (next_step.x() < 0) {\n      pixSetPixel(pix, pos.x() - left - 1, top - pos.y(), 1);\n    } else if (next_step.x() > 0) {\n      pixSetPixel(pix, pos.x() - left, top - pos.y() - 1, 1);\n    }\n    pos += next_step;\n  }\n}\n\n/**\n * @name C_OUTLINE::plot\n *\n * Draw the outline in the given colour.\n * @param window window to draw in\n * @param colour colour to draw in\n */\n\n#ifndef GRAPHICS_DISABLED\nvoid C_OUTLINE::plot(ScrollView *window, ScrollView::Color colour) const {\n  int16_t stepindex; // index to cstep\n  ICOORD pos;        // current position\n  DIR128 stepdir;    // direction of step\n\n  pos = start; // current position\n  window->Pen(colour);\n  if (stepcount == 0) {\n    window->Rectangle(box.left(), box.top(), box.right(), box.bottom());\n    return;\n  }\n  window->SetCursor(pos.x(), pos.y());\n\n  stepindex = 0;\n  while (stepindex < stepcount) {\n    pos += step(stepindex); // step to next\n    stepdir = step_dir(stepindex);\n    stepindex++; // count steps\n    // merge straight lines\n    while (stepindex < stepcount && stepdir.get_dir() == step_dir(stepindex).get_dir()) {\n      pos += step(stepindex);\n      stepindex++;\n    }\n    window->DrawTo(pos.x(), pos.y());\n  }\n}\n\n/**\n * Draws the outline in the given colour, normalized using the given denorm,\n * making use of sub-pixel accurate information if available.\n */\nvoid C_OUTLINE::plot_normed(const DENORM &denorm, 
ScrollView::Color colour,\n                            ScrollView *window) const {\n  window->Pen(colour);\n  if (stepcount == 0) {\n    window->Rectangle(box.left(), box.top(), box.right(), box.bottom());\n    return;\n  }\n  const DENORM *root_denorm = denorm.RootDenorm();\n  ICOORD pos = start; // current position\n  FCOORD f_pos = sub_pixel_pos_at_index(pos, 0);\n  FCOORD pos_normed;\n  denorm.NormTransform(root_denorm, f_pos, &pos_normed);\n  window->SetCursor(IntCastRounded(pos_normed.x()), IntCastRounded(pos_normed.y()));\n  for (int s = 0; s < stepcount; pos += step(s++)) {\n    int edge_weight = edge_strength_at_index(s);\n    if (edge_weight == 0) {\n      // This point has conflicting gradient and step direction, so ignore it.\n      continue;\n    }\n    FCOORD f_pos = sub_pixel_pos_at_index(pos, s);\n    FCOORD pos_normed;\n    denorm.NormTransform(root_denorm, f_pos, &pos_normed);\n    window->DrawTo(IntCastRounded(pos_normed.x()), IntCastRounded(pos_normed.y()));\n  }\n}\n#endif\n\n/**\n * @name C_OUTLINE::operator=\n *\n * Assignment - deep copy data\n * @param source assign from this\n */\n\nC_OUTLINE &C_OUTLINE::operator=(const C_OUTLINE &source) {\n  box = source.box;\n  start = source.start;\n  if (!children.empty()) {\n    children.clear();\n  }\n  children.deep_copy(&source.children, &deep_copy);\n  delete[] offsets;\n  offsets = nullptr;\n  stepcount = source.stepcount;\n  if (stepcount > 0) {\n    steps.resize(step_mem());\n    memmove(&steps[0], &source.steps[0], step_mem());\n    if (source.offsets != nullptr) {\n      offsets = new EdgeOffset[stepcount];\n      memcpy(offsets, source.offsets, stepcount * sizeof(*offsets));\n    }\n  }\n  return *this;\n}\n\n/**\n * Helper for ComputeBinaryOffsets. Increments pos, dir_counts, pos_totals\n * by the step, increment, and vertical step ? x : y position * increment\n * at step s Mod stepcount respectively. 
Used to add or subtract the\n * direction and position to/from accumulators of a small neighbourhood.\n */\nvoid C_OUTLINE::increment_step(int s, int increment, ICOORD *pos, int *dir_counts,\n                               int *pos_totals) const {\n  int step_index = Modulo(s, stepcount);\n  int dir_index = chain_code(step_index);\n  dir_counts[dir_index] += increment;\n  ICOORD step_vec = step(step_index);\n  if (step_vec.x() == 0) {\n    pos_totals[dir_index] += pos->x() * increment;\n  } else {\n    pos_totals[dir_index] += pos->y() * increment;\n  }\n  *pos += step_vec;\n}\n\nICOORD C_OUTLINE::chain_step(int chaindir) {\n  return step_coords[chaindir % 4];\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/coutln.h",
    "content": "/**********************************************************************\n * File:        coutln.h\n * Description: Code for the C_OUTLINE class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef COUTLN_H\n#define COUTLN_H\n\n#include \"elst.h\"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK\n#include \"mod128.h\"     // for DIR128, DIRBITS\n#include \"points.h\"     // for ICOORD, FCOORD\n#include \"rect.h\"       // for TBOX\n#include \"scrollview.h\" // for ScrollView, ScrollView::Color\n\n#include <tesseract/export.h> // for DLLSYM\n\n#include <cstdint> // for int16_t, int32_t\n#include <bitset>  // for std::bitset<16>\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass CRACKEDGE;\nclass DENORM;\n\n#define INTERSECTING INT16_MAX // no winding number\n\n// mask to get step\n#define STEP_MASK 3\n\nenum C_OUTLINE_FLAGS {\n  COUT_INVERSE // White on black blob\n};\n\n// Simple struct to hold the 3 values needed to compute a more precise edge\n// position and direction. The offset_numerator is the difference between the\n// grey threshold and the mean pixel value. pixel_diff is the difference between\n// the pixels in the edge. 
Consider the following row of pixels: p1 p2 p3 p4 p5\n// Say the image was thresholded  at threshold t, making p1, p2, p3 black\n// and p4, p5 white (p1, p2, p3 < t, and p4, p5 >= t), but suppose that\n// max(p[i+1] - p[i]) is p3 - p2. Then the extrapolated position of the edge,\n// based on the maximum gradient, is at the crack between p2 and p3 plus the\n// offset (t - (p2+p3)/2)/(p3 - p2). We store the pixel difference p3-p2\n// denominator in pixel_diff and the offset numerator, relative to the original\n// binary edge (t - (p2+p3)/2) - (p3 -p2) in offset_numerator.\n// The sign of offset_numerator and pixel_diff are manipulated to ensure\n// that the pixel_diff, which will be used as a weight, is always positive.\n// The direction stores the quantized feature direction for the given step\n// computed from the edge gradient. (Using binary_angle_plus_pi.)\n// If the pixel_diff is zero, it means that the direction of the gradient\n// is in conflict with the step direction, so this step is to be ignored.\nstruct EdgeOffset {\n  int8_t offset_numerator;\n  uint8_t pixel_diff;\n  uint8_t direction;\n};\n\nclass C_OUTLINE; // forward declaration\n\nELISTIZEH(C_OUTLINE)\nclass C_OUTLINE : public ELIST<C_OUTLINE>::LINK {\npublic:\n  C_OUTLINE() {\n    stepcount = 0;\n    offsets = nullptr;\n  }\n  C_OUTLINE(              // constructor\n      CRACKEDGE *startpt, // from edge detector\n      ICOORD bot_left,    // bounding box //length of loop\n      ICOORD top_right, int16_t length);\n  C_OUTLINE(ICOORD startpt,                       // start of loop\n            DIR128 *new_steps,                    // steps in loop\n            int16_t length);                      // length of loop\n                                                  // outline to copy\n  C_OUTLINE(C_OUTLINE *srcline, FCOORD rotation); // and rotate\n\n  // Build a fake outline, given just a bounding box and append to the list.\n  static void FakeOutline(const TBOX &box, C_OUTLINE_LIST *outlines);\n\n 
 ~C_OUTLINE() { // destructor\n    delete[] offsets;\n  }\n\n  bool flag(                        // test flag\n      C_OUTLINE_FLAGS mask) const { // flag to test\n    return flags[mask];\n  }\n  void set_flag(            // set flag value\n      C_OUTLINE_FLAGS mask, // flag to test\n      bool value) {         // value to set\n    flags.set(mask, value);\n  }\n\n  C_OUTLINE_LIST *child() { // get child list\n    return &children;\n  }\n\n  // access function\n  const TBOX &bounding_box() const {\n    return box;\n  }\n  void set_step(         // set a step\n      int16_t stepindex, // index of step\n      int8_t stepdir) {  // chain code\n    int shift = stepindex % 4 * 2;\n    uint8_t mask = 3 << shift;\n    steps[stepindex / 4] = ((stepdir << shift) & mask) | (steps[stepindex / 4] & ~mask);\n    // squeeze 4 into byte\n  }\n  void set_step(         // set a step\n      int16_t stepindex, // index of step\n      DIR128 stepdir) {  // direction\n    // clean it\n    int8_t chaindir = stepdir.get_dir() >> (DIRBITS - 2);\n    // difference\n    set_step(stepindex, chaindir);\n    // squeeze 4 into byte\n  }\n\n  int32_t pathlength() const { // get path length\n    return stepcount;\n  }\n  // Return step at a given index as a DIR128.\n  DIR128 step_dir(int index) const {\n    return DIR128(\n        static_cast<int16_t>(((steps[index / 4] >> (index % 4 * 2)) & STEP_MASK) << (DIRBITS - 2)));\n  }\n  // Return the step vector for the given outline position.\n  ICOORD step(int index) const { // index of step\n    return step_coords[chain_code(index)];\n  }\n  // get start position\n  const ICOORD &start_pos() const {\n    return start;\n  }\n  // Returns the position at the given index on the outline.\n  // NOT to be used lightly, as it has to iterate the outline to find out.\n  ICOORD position_at_index(int index) const {\n    ICOORD pos = start;\n    for (int i = 0; i < index; ++i) {\n      pos += step(i);\n    }\n    return pos;\n  }\n  // Returns the sub-pixel 
accurate position given the integer position pos\n  // at the given index on the outline. pos may be a return value of\n  // position_at_index, or computed by repeatedly adding step to the\n  // start_pos() in the usual way.\n  FCOORD sub_pixel_pos_at_index(const ICOORD &pos, int index) const {\n    const ICOORD &step_to_next(step(index));\n    FCOORD f_pos(pos.x() + step_to_next.x() / 2.0f, pos.y() + step_to_next.y() / 2.0f);\n    if (offsets != nullptr && offsets[index].pixel_diff > 0) {\n      float offset = offsets[index].offset_numerator;\n      offset /= offsets[index].pixel_diff;\n      if (step_to_next.x() != 0) {\n        f_pos.set_y(f_pos.y() + offset);\n      } else {\n        f_pos.set_x(f_pos.x() + offset);\n      }\n    }\n    return f_pos;\n  }\n  // Returns the step direction for the given index or -1 if there is none.\n  int direction_at_index(int index) const {\n    if (offsets != nullptr && offsets[index].pixel_diff > 0) {\n      return offsets[index].direction;\n    }\n    return -1;\n  }\n  // Returns the edge strength for the given index.\n  // If there are no recorded edge strengths, returns 1 (assuming the image\n  // is binary). 
Returns 0 if the gradient direction conflicts with the\n  // step direction, indicating that this position could be skipped.\n  int edge_strength_at_index(int index) const {\n    if (offsets != nullptr) {\n      return offsets[index].pixel_diff;\n    }\n    return 1;\n  }\n  // Return the step as a chain code (0-3) related to the standard feature\n  // direction of binary_angle_plus_pi by:\n  // chain_code * 64 = feature direction.\n  int chain_code(int index) const { // index of step\n    return (steps[index / 4] >> (index % 4 * 2)) & STEP_MASK;\n  }\n\n  int32_t area() const;       // Returns area of self and 1st level children.\n  int32_t perimeter() const;  // Total perimeter of self and 1st level children.\n  int32_t outer_area() const; // Returns area of self only.\n  int32_t count_transitions(  // count maxima\n      int32_t threshold);     // size threshold\n\n  bool operator<( // containment test\n      const C_OUTLINE &other) const;\n  bool operator>( // containment test\n      C_OUTLINE &other) const {\n    return other < *this; // use the < to do it\n  }\n  int16_t winding_number(   // get winding number\n      ICOORD testpt) const; // around this point\n                            // get direction\n  int16_t turn_direction() const;\n  void reverse(); // reverse direction\n\n  void move(             // reposition outline\n      const ICOORD vec); // by vector\n\n  // Returns true if *this and its children are legally nested.\n  // The outer area of a child should have the opposite sign to the\n  // parent. If not, it means we have discarded an outline in between\n  // (probably due to excessive length).\n  bool IsLegallyNested() const;\n\n  // If this outline is smaller than the given min_size, delete this and\n  // remove from its list, via *it, after checking that *it points to this.\n  // Otherwise, if any children of this are too small, delete them.\n  // On entry, *it must be an iterator pointing to this. 
If this gets deleted\n  // then this is extracted from *it, so an iteration can continue.\n  void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it);\n\n  // Adds sub-pixel resolution EdgeOffsets for the outline if the supplied\n  // pix is 8-bit. Does nothing otherwise.\n  void ComputeEdgeOffsets(int threshold, Image pix);\n  // Adds sub-pixel resolution EdgeOffsets for the outline using only\n  // a binary image source.\n  void ComputeBinaryOffsets();\n\n  // Renders the outline to the given pix, with left and top being\n  // the coords of the upper-left corner of the pix.\n  void render(int left, int top, Image pix) const;\n\n  // Renders just the outline to the given pix (no fill), with left and top\n  // being the coords of the upper-left corner of the pix.\n  void render_outline(int left, int top, Image pix) const;\n\n#ifndef GRAPHICS_DISABLED\n  void plot(                           // draw one\n      ScrollView *window,              // window to draw in\n      ScrollView::Color colour) const; // colour to draw it\n  // Draws the outline in the given colour, normalized using the given denorm,\n  // making use of sub-pixel accurate information if available.\n  void plot_normed(const DENORM &denorm, ScrollView::Color colour, ScrollView *window) const;\n#endif // !GRAPHICS_DISABLED\n\n  C_OUTLINE &operator=(const C_OUTLINE &source);\n\n  static C_OUTLINE *deep_copy(const C_OUTLINE *src) {\n    auto *outline = new C_OUTLINE;\n    *outline = *src;\n    return outline;\n  }\n\n  static ICOORD chain_step(int chaindir);\n\n  // The maximum length of any outline. The stepcount is stored as 16 bits,\n  // but it is probably not a good idea to increase this constant by much\n  // and switch to 32 bits, as it plays an important role in keeping huge\n  // outlines invisible, which prevents bad speed behavior.\n  static const int kMaxOutlineLength = 16000;\n\nprivate:\n  // Helper for ComputeBinaryOffsets. 
Increments pos, dir_counts, pos_totals\n  // by the step, increment, and vertical step ? x : y position * increment\n  // at step s Mod stepcount respectively. Used to add or subtract the\n  // direction and position to/from accumulators of a small neighbourhood.\n  void increment_step(int s, int increment, ICOORD *pos, int *dir_counts, int *pos_totals) const;\n  int step_mem() const {\n    return (stepcount + 3) / 4;\n  }\n\n  TBOX box;                // bounding box\n  ICOORD start;            // start coord\n  int16_t stepcount;       // no of steps\n  std::bitset<16> flags;   // flags about outline\n  std::vector<uint8_t> steps; // step array\n  EdgeOffset *offsets;     // Higher precision edge.\n  C_OUTLINE_LIST children; // child elements\n  static ICOORD step_coords[4];\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/crakedge.h",
    "content": "/**********************************************************************\n * File:        crakedge.h      (Formerly: crkedge.h)\n * Description: Structures for the Crack following edge detector.\n * Author:      Ray Smith\n * Created:     Fri Mar 22 16:06:38 GMT 1991\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef CRAKEDGE_H\n#define CRAKEDGE_H\n\n#include \"mod128.h\"\n#include \"points.h\"\n\nnamespace tesseract {\n\nclass CRACKEDGE {\npublic:\n  CRACKEDGE() = default;\n\n  ICOORD pos;   /*position of crack */\n  int8_t stepx; // edge step\n  int8_t stepy;\n  int8_t stepdir;  // chaincode\n  CRACKEDGE *prev; /*previous point */\n  CRACKEDGE *next; /*next point */\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/debugpixa.h",
    "content": "#ifndef TESSERACT_CCSTRUCT_DEBUGPIXA_H_\n#define TESSERACT_CCSTRUCT_DEBUGPIXA_H_\n\n#include \"image.h\"\n\n#include <allheaders.h>\n\nnamespace tesseract {\n\n// Class to hold a Pixa collection of debug images with captions and save them\n// to a PDF file.\nclass DebugPixa {\npublic:\n  // TODO(rays) add another constructor with size control.\n  DebugPixa() {\n    pixa_ = pixaCreate(0);\n#ifdef TESSERACT_DISABLE_DEBUG_FONTS\n    fonts_ = NULL;\n#else\n    fonts_ = bmfCreate(nullptr, 14);\n#endif\n  }\n  // If the filename_ has been set and there are any debug images, they are\n  // written to the set filename_.\n  ~DebugPixa() {\n    pixaDestroy(&pixa_);\n    bmfDestroy(&fonts_);\n  }\n\n  // Adds the given pix to the set of pages in the PDF file, with the given\n  // caption added to the top.\n  void AddPix(const Image pix, const char *caption) {\n    int depth = pixGetDepth(pix);\n    int color = depth < 8 ? 1 : (depth > 8 ? 0x00ff0000 : 0x80);\n    Image pix_debug =\n        pixAddSingleTextblock(pix, fonts_, caption, color, L_ADD_BELOW, nullptr);\n    pixaAddPix(pixa_, pix_debug, L_INSERT);\n  }\n\n  // Sets the destination filename and enables images to be written to a PDF\n  // on destruction.\n  void WritePDF(const char *filename) {\n    if (pixaGetCount(pixa_) > 0) {\n      pixaConvertToPdf(pixa_, 300, 1.0f, 0, 0, \"AllDebugImages\", filename);\n      pixaClear(pixa_);\n    }\n  }\n\nprivate:\n  // The collection of images to put in the PDF.\n  Pixa *pixa_;\n  // The fonts used to draw text captions.\n  L_Bmf *fonts_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_DEBUGPIXA_H_\n"
  },
  {
    "path": "src/ccstruct/detlinefit.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        detlinefit.cpp\n// Description: Deterministic least median squares line fitting.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"detlinefit.h\"\n#include \"helpers.h\"        // for IntCastRounded\n#include \"statistc.h\"\n#include \"tesserrstream.h\"  // for tesserr\n\n#include <algorithm>\n#include <cfloat> // for FLT_MAX\n\nnamespace tesseract {\n\n// The number of points to consider at each end.\nconst int kNumEndPoints = 3;\n// The minimum number of points at which to switch to number of points\n// for badly fitted lines.\n// To ensure a sensible error metric, kMinPointsForErrorCount should be at\n// least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in\n// ComputeUpperQuartileError.\nconst int kMinPointsForErrorCount = 16;\n// The maximum real distance to use before switching to number of\n// mis-fitted points, which will get square-rooted for true distance.\nconst int kMaxRealDistance = 2.0;\n\nDetLineFit::DetLineFit() : square_length_(0.0) {}\n\n// Delete all Added points.\nvoid DetLineFit::Clear() {\n  pts_.clear();\n  distances_.clear();\n}\n\n// Add a new point. 
Takes a copy - the pt doesn't need to stay in scope.\nvoid DetLineFit::Add(const ICOORD &pt) {\n  pts_.emplace_back(pt, 0);\n}\n// Associates a half-width with the given point if a point overlaps the\n// previous point by more than half the width, and its distance is further\n// than the previous point, then the more distant point is ignored in the\n// distance calculation. Useful for ignoring i dots and other diacritics.\nvoid DetLineFit::Add(const ICOORD &pt, int halfwidth) {\n  pts_.emplace_back(pt, halfwidth);\n}\n\n// Fits a line to the points, ignoring the skip_first initial points and the\n// skip_last final points, returning the fitted line as a pair of points,\n// and the upper quartile error.\ndouble DetLineFit::Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2) {\n  // Do something sensible with no points.\n  if (pts_.empty()) {\n    pt1->set_x(0);\n    pt1->set_y(0);\n    *pt2 = *pt1;\n    return 0.0;\n  }\n  // Count the points and find the first and last kNumEndPoints.\n  int pt_count = pts_.size();\n  ICOORD *starts[kNumEndPoints];\n  if (skip_first >= pt_count) {\n    skip_first = pt_count - 1;\n  }\n  int start_count = 0;\n  int end_i = std::min(skip_first + kNumEndPoints, pt_count);\n  for (int i = skip_first; i < end_i; ++i) {\n    starts[start_count++] = &pts_[i].pt;\n  }\n  ICOORD *ends[kNumEndPoints];\n  if (skip_last >= pt_count) {\n    skip_last = pt_count - 1;\n  }\n  int end_count = 0;\n  end_i = std::max(0, pt_count - kNumEndPoints - skip_last);\n  for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {\n    ends[end_count++] = &pts_[i].pt;\n  }\n  // 1 or 2 points need special treatment.\n  if (pt_count <= 2) {\n    *pt1 = *starts[0];\n    if (pt_count > 1) {\n      *pt2 = *ends[0];\n    } else {\n      *pt2 = *pt1;\n    }\n    return 0.0;\n  }\n  // Although with between 2 and 2*kNumEndPoints-1 points, there will be\n  // overlap in the starts, ends sets, this is OK and taken care of by the\n  // if (*start != *end) test 
below, which also tests for equal input points.\n  double best_uq = -1.0;\n  // Iterate each pair of points and find the best fitting line.\n  for (int i = 0; i < start_count; ++i) {\n    ICOORD *start = starts[i];\n    for (int j = 0; j < end_count; ++j) {\n      ICOORD *end = ends[j];\n      if (*start != *end) {\n        ComputeDistances(*start, *end);\n        // Compute the upper quartile error from the line.\n        double dist = EvaluateLineFit();\n        if (dist < best_uq || best_uq < 0.0) {\n          best_uq = dist;\n          *pt1 = *start;\n          *pt2 = *end;\n        }\n      }\n    }\n  }\n  // Finally compute the square root to return the true distance.\n  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;\n}\n\n// Constrained fit with a supplied direction vector. Finds the best line_pt,\n// that is one of the supplied points having the median cross product with\n// direction, ignoring points that have a cross product outside of the range\n// [min_dist, max_dist]. Returns the resulting error metric using the same\n// reduced set of points.\n// *Makes use of floating point arithmetic*\ndouble DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist,\n                                  bool debug, ICOORD *line_pt) {\n  ComputeConstrainedDistances(direction, min_dist, max_dist);\n  // Do something sensible with no points or computed distances.\n  if (pts_.empty() || distances_.empty()) {\n    line_pt->set_x(0);\n    line_pt->set_y(0);\n    return 0.0;\n  }\n  auto median_index = distances_.size() / 2;\n  std::nth_element(distances_.begin(), distances_.begin() + median_index, distances_.end());\n  *line_pt = distances_[median_index].data();\n  if (debug) {\n    tesserr << \"Constrained fit to dir \" << direction.x() << \", \"\n            << direction.y() << \" = \"\n            << line_pt->x() << \", \" << line_pt->y()\n            << \" :\" << distances_.size() << \" distances:\\n\";\n    for (unsigned i = 0; i < 
distances_.size(); ++i) {\n      tesserr << i << \": \"\n              << distances_[i].data().x() << \", \"\n              << distances_[i].data().y() << \" -> \"\n              << distances_[i].key() << '\\n';\n    }\n    tesserr << \"Result = \" << median_index << '\\n';\n  }\n  // Center distances on the fitted point.\n  double dist_origin = direction * *line_pt;\n  for (auto &distance : distances_) {\n    distance.key() -= dist_origin;\n  }\n  return sqrt(EvaluateLineFit());\n}\n\n// Returns true if there were enough points at the last call to Fit or\n// ConstrainedFit for the fitted points to be used on a badly fitted line.\nbool DetLineFit::SufficientPointsForIndependentFit() const {\n  return distances_.size() >= kMinPointsForErrorCount;\n}\n\n// Backwards compatible fit returning a gradient and constant.\n// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this\n// function in preference to the LMS class.\ndouble DetLineFit::Fit(float *m, float *c) {\n  ICOORD start, end;\n  double error = Fit(&start, &end);\n  if (end.x() != start.x()) {\n    *m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());\n    *c = start.y() - *m * start.x();\n  } else {\n    *m = 0.0f;\n    *c = 0.0f;\n  }\n  return error;\n}\n\n// Backwards compatible constrained fit with a supplied gradient.\n// Deprecated. 
Use ConstrainedFit(const FCOORD& direction) where possible\n// to avoid potential difficulties with infinite gradients.\ndouble DetLineFit::ConstrainedFit(double m, float *c) {\n  // Do something sensible with no points.\n  if (pts_.empty()) {\n    *c = 0.0f;\n    return 0.0;\n  }\n  double cos = 1.0 / sqrt(1.0 + m * m);\n  FCOORD direction(cos, m * cos);\n  ICOORD line_pt;\n  double error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &line_pt);\n  *c = line_pt.y() - line_pt.x() * m;\n  return error;\n}\n\n// Computes and returns the squared evaluation metric for a line fit.\ndouble DetLineFit::EvaluateLineFit() {\n  // Compute the upper quartile error from the line.\n  double dist = ComputeUpperQuartileError();\n  if (distances_.size() >= kMinPointsForErrorCount && dist > kMaxRealDistance * kMaxRealDistance) {\n    // Use the number of mis-fitted points as the error metric, as this\n    // gives a better measure of fit for badly fitted lines where more\n    // than a quarter are badly fitted.\n    double threshold = kMaxRealDistance * sqrt(square_length_);\n    dist = NumberOfMisfittedPoints(threshold);\n  }\n  return dist;\n}\n\n// Computes the absolute error distances of the points from the line,\n// and returns the squared upper-quartile error distance.\ndouble DetLineFit::ComputeUpperQuartileError() {\n  int num_errors = distances_.size();\n  if (num_errors == 0) {\n    return 0.0;\n  }\n  // Get the absolute values of the errors.\n  for (int i = 0; i < num_errors; ++i) {\n    if (distances_[i].key() < 0) {\n      distances_[i].key() = -distances_[i].key();\n    }\n  }\n  // Now get the upper quartile distance.\n  auto index = 3 * num_errors / 4;\n  std::nth_element(distances_.begin(), distances_.begin() + index, distances_.end());\n  double dist = distances_[index].key();\n  // The true distance is the square root of the dist squared / square_length.\n  // Don't bother with the square root. 
Just return the square distance.\n  return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0;\n}\n\n// Returns the number of sample points that have an error more than threshold.\nint DetLineFit::NumberOfMisfittedPoints(double threshold) const {\n  int num_misfits = 0;\n  int num_dists = distances_.size();\n  // Get the absolute values of the errors.\n  for (int i = 0; i < num_dists; ++i) {\n    if (distances_[i].key() > threshold) {\n      ++num_misfits;\n    }\n  }\n  return num_misfits;\n}\n\n// Computes all the cross product distances of the points from the line,\n// storing the actual (signed) cross products in distances.\n// Ignores distances of points that are further away than the previous point,\n// and overlaps the previous point by at least half.\nvoid DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) {\n  distances_.clear();\n  ICOORD line_vector = end;\n  line_vector -= start;\n  square_length_ = line_vector.sqlength();\n  int line_length = IntCastRounded(sqrt(square_length_));\n  // Compute the distance of each point from the line.\n  int prev_abs_dist = 0;\n  int prev_dot = 0;\n  for (unsigned i = 0; i < pts_.size(); ++i) {\n    ICOORD pt_vector = pts_[i].pt;\n    pt_vector -= start;\n    int dot = line_vector % pt_vector;\n    // Compute |line_vector||pt_vector|sin(angle between)\n    int dist = line_vector * pt_vector;\n    int abs_dist = dist < 0 ? 
-dist : dist;\n    if (abs_dist > prev_abs_dist && i > 0) {\n      // Ignore this point if it overlaps the previous one.\n      int separation = abs(dot - prev_dot);\n      if (separation < line_length * pts_[i].halfwidth ||\n          separation < line_length * pts_[i - 1].halfwidth) {\n        continue;\n      }\n    }\n    distances_.emplace_back(dist, pts_[i].pt);\n    prev_abs_dist = abs_dist;\n    prev_dot = dot;\n  }\n}\n\n// Computes all the cross product distances of the points perpendicular to\n// the given direction, ignoring distances outside of the give distance range,\n// storing the actual (signed) cross products in distances_.\nvoid DetLineFit::ComputeConstrainedDistances(const FCOORD &direction, double min_dist,\n                                             double max_dist) {\n  distances_.clear();\n  square_length_ = direction.sqlength();\n  // Compute the distance of each point from the line.\n  for (auto &pt : pts_) {\n    FCOORD pt_vector = pt.pt;\n    // Compute |line_vector||pt_vector|sin(angle between)\n    double dist = direction * pt_vector;\n    if (min_dist <= dist && dist <= max_dist) {\n      distances_.emplace_back(dist, pt.pt);\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccstruct/detlinefit.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        detlinefit.h\n// Description: Deterministic least upper-quartile squares line fitting.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCSTRUCT_DETLINEFIT_H_\n#define TESSERACT_CCSTRUCT_DETLINEFIT_H_\n\n#include \"kdpair.h\"\n#include \"points.h\"\n\nnamespace tesseract {\n\n// This class fits a line to a set of ICOORD points.\n// There is no restriction on the direction of the line, as it\n// uses a vector method, ie no concern over infinite gradients.\n// The fitted line has the least upper quartile of squares of perpendicular\n// distances of all source points from the line, subject to the constraint\n// that the line is made from one of the pairs of [{p1,p2,p3},{pn-2, pn-1, pn}]\n// i.e. 
the 9 combinations of one of the first 3 and last 3 points.\n// A fundamental assumption of this algorithm is that one of the first 3 and\n// one of the last 3 points are near the best line fit.\n// The points must be Added in line order for the algorithm to work properly.\n// No floating point calculations are needed* to make an accurate fit,\n// and no random numbers are needed** so the algorithm is deterministic,\n// architecture-stable, and compiler-stable as well as stable to minor\n// changes in the input.\n// *A single floating point division is used to compute each line's distance.\n// This is unlikely to result in choice of a different line, but if it does,\n// it would be easy to replace with a 64 bit integer calculation.\n// **Random numbers are used in the nth_item function, but the worst\n// non-determinism that can result is picking a different result among equals,\n// and that wouldn't make any difference to the end-result distance, so the\n// randomness does not affect the determinism of the algorithm. The random\n// numbers are only there to guarantee average linear time.\n// Fitting time is linear, but with a high constant, as it tries 9 different\n// lines and computes the distance of all points each time.\n// This class is aimed at replacing the LLSQ (linear least squares) and\n// LMS (least median of squares) classes that are currently used for most\n// of the line fitting in Tesseract.\nclass DetLineFit {\npublic:\n  DetLineFit();\n  ~DetLineFit() = default;\n\n  // Delete all Added points.\n  void Clear();\n\n  // Adds a new point. Takes a copy - the pt doesn't need to stay in scope.\n  // Add must be called on points in sequence along the line.\n  void Add(const ICOORD &pt);\n  // Associates a half-width with the given point if a point overlaps the\n  // previous point by more than half the width, and its distance is further\n  // than the previous point, then the more distant point is ignored in the\n  // distance calculation. 
Useful for ignoring i dots and other diacritics.\n  void Add(const ICOORD &pt, int halfwidth);\n\n  // Fits a line to the points, returning the fitted line as a pair of\n  // points, and the upper quartile error.\n  double Fit(ICOORD *pt1, ICOORD *pt2) {\n    return Fit(0, 0, pt1, pt2);\n  }\n  // Fits a line to the points, ignoring the skip_first initial points and the\n  // skip_last final points, returning the fitted line as a pair of points,\n  // and the upper quartile error.\n  double Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2);\n\n  // Constrained fit with a supplied direction vector. Finds the best line_pt,\n  // that is one of the supplied points having the median cross product with\n  // direction, ignoring points that have a cross product outside of the range\n  // [min_dist, max_dist]. Returns the resulting error metric using the same\n  // reduced set of points.\n  // *Makes use of floating point arithmetic*\n  double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug,\n                        ICOORD *line_pt);\n\n  // Returns true if there were enough points at the last call to Fit or\n  // ConstrainedFit for the fitted points to be used on a badly fitted line.\n  bool SufficientPointsForIndependentFit() const;\n\n  // Backwards compatible fit returning a gradient and constant.\n  // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this\n  // function in preference to the LMS class.\n  double Fit(float *m, float *c);\n\n  // Backwards compatible constrained fit with a supplied gradient.\n  // Deprecated. 
Use ConstrainedFit(const FCOORD& direction) where possible\n  // to avoid potential difficulties with infinite gradients.\n  double ConstrainedFit(double m, float *c);\n\nprivate:\n  // Simple struct to hold an ICOORD point and a halfwidth representing half\n  // the \"width\" (supposedly approximately parallel to the direction of the\n  // line) of each point, such that distant points can be discarded when they\n  // overlap nearer points. (Think i dot and other diacritics or noise.)\n  struct PointWidth {\n    PointWidth() : pt(ICOORD(0, 0)), halfwidth(0) {}\n    PointWidth(const ICOORD &pt0, int halfwidth0) : pt(pt0), halfwidth(halfwidth0) {}\n\n    ICOORD pt;\n    int halfwidth;\n  };\n  // Type holds the distance of each point from the fitted line and the point\n  // itself. Use of double allows integer distances from ICOORDs to be stored\n  // exactly, and also the floating point results from ConstrainedFit.\n  using DistPointPair = KDPairInc<double, ICOORD>;\n\n  // Computes and returns the squared evaluation metric for a line fit.\n  double EvaluateLineFit();\n\n  // Computes the absolute values of the precomputed distances_,\n  // and returns the squared upper-quartile error distance.\n  double ComputeUpperQuartileError();\n\n  // Returns the number of sample points that have an error more than threshold.\n  int NumberOfMisfittedPoints(double threshold) const;\n\n  // Computes all the cross product distances of the points from the line,\n  // storing the actual (signed) cross products in distances_.\n  // Ignores distances of points that are further away than the previous point,\n  // and overlaps the previous point by at least half.\n  void ComputeDistances(const ICOORD &start, const ICOORD &end);\n\n  // Computes all the cross product distances of the points perpendicular to\n  // the given direction, ignoring distances outside of the give distance range,\n  // storing the actual (signed) cross products in distances_.\n  void 
ComputeConstrainedDistances(const FCOORD &direction, double min_dist, double max_dist);\n\n  // Stores all the source points in the order they were given and their\n  // halfwidths, if any.\n  std::vector<PointWidth> pts_;\n  // Stores the computed perpendicular distances of (some of) the pts_ from a\n  // given vector (assuming it goes through the origin, making it a line).\n  // Since the distances may be a subset of the input points, and get\n  // re-ordered by the nth_item function, the original point is stored\n  // along side the distance.\n  std::vector<DistPointPair> distances_; // Distances of points.\n  // The squared length of the vector used to compute distances_.\n  double square_length_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCSTRUCT_DETLINEFIT_H_\n"
  },
  {
    "path": "src/ccstruct/dppoint.cpp",
    "content": "/**********************************************************************\n * File:        dppoint.cpp\n * Description: Simple generic dynamic programming class.\n * Author:      Ray Smith\n * Created:     Wed Mar 25 19:08:01 PDT 2009\n *\n * (C) Copyright 2009, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"dppoint.h\"\n#include \"errcode.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Solve the dynamic programming problem for the given array of points, with\n// the given size and cost function.\n// Steps backwards are limited to being between min_step and max_step\n// inclusive.\n// The return value is the tail of the best path.\nDPPoint *DPPoint::Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,\n                        DPPoint *points) {\n  if (size <= 0 || max_step < min_step || min_step >= size) {\n    return nullptr; // Degenerate, but not necessarily an error.\n  }\n  ASSERT_HOST(min_step > 0); // Infinite loop possible if this is not true.\n  if (debug) {\n    tprintf(\"min = %d, max=%d\\n\", min_step, max_step);\n  }\n  // Evaluate the total cost at each point.\n  for (int i = 0; i < size; ++i) {\n    for (int offset = min_step; offset <= max_step; ++offset) {\n      DPPoint *prev = offset <= i ? 
points + i - offset : nullptr;\n      int64_t new_cost = (points[i].*cost_func)(prev);\n      if (points[i].best_prev_ != nullptr && offset > min_step * 2 &&\n          new_cost > points[i].total_cost_) {\n        break; // Find only the first minimum if going over twice the min.\n      }\n    }\n    points[i].total_cost_ += points[i].local_cost_;\n    if (debug) {\n      tprintf(\"At point %d, local cost=%d, total_cost=%d, steps=%d\\n\", i, points[i].local_cost_,\n              points[i].total_cost_, points[i].total_steps_);\n    }\n  }\n  // Now find the end of the best path and return it.\n  int best_cost = points[size - 1].total_cost_;\n  int best_end = size - 1;\n  for (int end = best_end - 1; end >= size - min_step; --end) {\n    int cost = points[end].total_cost_;\n    if (cost < best_cost) {\n      best_cost = cost;\n      best_end = end;\n    }\n  }\n  return points + best_end;\n}\n\n// A CostFunc that takes the variance of step into account in the cost.\nint64_t DPPoint::CostWithVariance(const DPPoint *prev) {\n  if (prev == nullptr || prev == this) {\n    UpdateIfBetter(0, 1, nullptr, 0, 0, 0);\n    return 0;\n  }\n\n  int delta = this - prev;\n  int32_t n = prev->n_ + 1;\n  int32_t sig_x = prev->sig_x_ + delta;\n  int64_t sig_xsq = prev->sig_xsq_ + static_cast<int64_t>(delta) * delta;\n  int64_t cost = (sig_xsq - sig_x * sig_x / n) / n;\n  cost += prev->total_cost_;\n  UpdateIfBetter(cost, prev->total_steps_ + 1, prev, n, sig_x, sig_xsq);\n  return cost;\n}\n\n// Update the other members if the cost is lower.\nvoid DPPoint::UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n,\n                             int32_t sig_x, int64_t sig_xsq) {\n  if (cost < total_cost_) {\n    total_cost_ = cost;\n    total_steps_ = steps;\n    best_prev_ = prev;\n    n_ = n;\n    sig_x_ = sig_x;\n    sig_xsq_ = sig_xsq;\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccstruct/dppoint.h",
    "content": "/**********************************************************************\n * File:        dppoint.h\n * Description: Simple generic dynamic programming class.\n * Author:      Ray Smith\n * Created:     Wed Mar 25 18:57:01 PDT 2009\n *\n * (C) Copyright 2009, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCSTRUCT_DPPOINT_H_\n#define TESSERACT_CCSTRUCT_DPPOINT_H_\n\n#include <cstdint>\n\nnamespace tesseract {\n\n// A simple class to provide a dynamic programming solution to a class of\n// 1st-order problems in which the cost is dependent only on the current\n// step and the best cost to that step, with a possible special case\n// of using the variance of the steps, and only the top choice is required.\n// Useful for problems such as finding the optimal cut points in a fixed-pitch\n// (vertical or horizontal) situation.\n// Skeletal Example:\n// DPPoint* array = new DPPoint[width];\n// for (int i = 0; i < width; i++) {\n//   array[i].AddLocalCost(cost_at_i)\n// }\n// DPPoint* best_end = DPPoint::Solve(..., array);\n// while (best_end != nullptr) {\n//   int cut_index = best_end - array;\n//   best_end = best_end->best_prev();\n// }\n// delete [] array;\nclass DPPoint {\npublic:\n  // The cost function evaluates the total cost at this (excluding this's\n  // local_cost) and if it beats this's total_cost, then\n  // replace the 
appropriate values in this.\n  using CostFunc = int64_t (DPPoint::*)(const DPPoint *);\n\n  DPPoint()\n      : local_cost_(0)\n      , total_cost_(INT32_MAX)\n      , total_steps_(1)\n      , best_prev_(nullptr)\n      , n_(0)\n      , sig_x_(0)\n      , sig_xsq_(0) {}\n\n  // Solve the dynamic programming problem for the given array of points, with\n  // the given size and cost function.\n  // Steps backwards are limited to being between min_step and max_step\n  // inclusive.\n  // The return value is the tail of the best path.\n  static DPPoint *Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,\n                        DPPoint *points);\n\n  // A CostFunc that takes the variance of step into account in the cost.\n  int64_t CostWithVariance(const DPPoint *prev);\n\n  // Accessors.\n  int total_cost() const {\n    return total_cost_;\n  }\n  int Pathlength() const {\n    return total_steps_;\n  }\n  const DPPoint *best_prev() const {\n    return best_prev_;\n  }\n  void AddLocalCost(int new_cost) {\n    local_cost_ += new_cost;\n  }\n\nprivate:\n  // Code common to different cost functions.\n\n  // Update the other members if the cost is lower.\n  void UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n, int32_t sig_x,\n                      int64_t sig_xsq);\n\n  int32_t local_cost_;       // Cost of this point on its own.\n  int32_t total_cost_;       // Sum of all costs in best path to here.\n                             // During cost calculations local_cost is excluded.\n  int32_t total_steps_;      // Number of steps in best path to here.\n  const DPPoint *best_prev_; // Pointer to prev point in best path from here.\n  // Information for computing the variance part of the cost.\n  int32_t n_;       // Number of steps in best path to here for variance.\n  int32_t sig_x_;   // Sum of step sizes for computing variance.\n  int64_t sig_xsq_; // Sum of squares of steps for computing variance.\n};\n\n} // namespace 
tesseract.\n\n#endif // TESSERACT_CCSTRUCT_DPPOINT_H_\n"
  },
  {
    "path": "src/ccstruct/fontinfo.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        fontinfo.cpp\n// Description: Font information classes abstracted from intproto.h/cpp.\n// Author:      rays@google.com (Ray Smith)\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"fontinfo.h\"\n#include \"bitvector.h\"\n#include \"unicity_table.h\"\n\nnamespace tesseract {\n\n// Writes to the given file. Returns false in case of error.\nbool FontInfo::Serialize(FILE *fp) const {\n  if (!write_info(fp, *this)) {\n    return false;\n  }\n  if (!write_spacing_info(fp, *this)) {\n    return false;\n  }\n  return true;\n}\n// Reads from the given file. Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool FontInfo::DeSerialize(TFile *fp) {\n  if (!read_info(fp, this)) {\n    return false;\n  }\n  if (!read_spacing_info(fp, this)) {\n    return false;\n  }\n  return true;\n}\n\nFontInfoTable::FontInfoTable() {\n  using namespace std::placeholders; // for _1, _2\n  set_clear_callback(std::bind(FontInfoDeleteCallback, _1));\n}\n\nFontInfoTable::~FontInfoTable() = default;\n\n// Writes to the given file. Returns false in case of error.\nbool FontInfoTable::Serialize(FILE *fp) const {\n  return this->SerializeClasses(fp);\n}\n// Reads from the given file. 
Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool FontInfoTable::DeSerialize(TFile *fp) {\n  truncate(0);\n  return this->DeSerializeClasses(fp);\n}\n\n// Returns true if the given set of fonts includes one with the same\n// properties as font_id.\nbool FontInfoTable::SetContainsFontProperties(int font_id,\n                                              const std::vector<ScoredFont> &font_set) const {\n  uint32_t properties = at(font_id).properties;\n  for (auto &&f : font_set) {\n    if (at(f.fontinfo_id).properties == properties) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Returns true if the given set of fonts includes multiple properties.\nbool FontInfoTable::SetContainsMultipleFontProperties(\n    const std::vector<ScoredFont> &font_set) const {\n  if (font_set.empty()) {\n    return false;\n  }\n  int first_font = font_set[0].fontinfo_id;\n  uint32_t properties = at(first_font).properties;\n  for (unsigned f = 1; f < font_set.size(); ++f) {\n    if (at(font_set[f].fontinfo_id).properties != properties) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Moves any non-empty FontSpacingInfo entries from other to this.\nvoid FontInfoTable::MoveSpacingInfoFrom(FontInfoTable *other) {\n  using namespace std::placeholders; // for _1, _2\n  set_clear_callback(std::bind(FontInfoDeleteCallback, _1));\n  for (unsigned i = 0; i < other->size(); ++i) {\n    std::vector<FontSpacingInfo *> *spacing_vec = other->at(i).spacing_vec;\n    if (spacing_vec != nullptr) {\n      int target_index = get_index(other->at(i));\n      if (target_index < 0) {\n        // Bit copy the FontInfo and steal all the pointers.\n        push_back(other->at(i));\n        other->at(i).name = nullptr;\n      } else {\n        delete at(target_index).spacing_vec;\n        at(target_index).spacing_vec = other->at(i).spacing_vec;\n      }\n      other->at(i).spacing_vec = nullptr;\n    }\n  }\n}\n\n// Moves this to the target 
unicity table.\nvoid FontInfoTable::MoveTo(UnicityTable<FontInfo> *target) {\n  target->clear();\n  using namespace std::placeholders; // for _1, _2\n  target->set_clear_callback(std::bind(FontInfoDeleteCallback, _1));\n  for (unsigned i = 0; i < size(); ++i) {\n    // Bit copy the FontInfo and steal all the pointers.\n    target->push_back(at(i));\n    at(i).name = nullptr;\n    at(i).spacing_vec = nullptr;\n  }\n}\n\n// Callbacks for GenericVector.\nvoid FontInfoDeleteCallback(FontInfo f) {\n  if (f.spacing_vec != nullptr) {\n    for (auto data : *f.spacing_vec) {\n      delete data;\n    }\n    delete f.spacing_vec;\n    f.spacing_vec = nullptr;\n  }\n  delete[] f.name;\n  f.name = nullptr;\n}\n\n/*---------------------------------------------------------------------------*/\n// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.\nbool read_info(TFile *f, FontInfo *fi) {\n  uint32_t size;\n  if (!f->DeSerialize(&size)) {\n    return false;\n  }\n  char *font_name = new char[size + 1];\n  fi->name = font_name;\n  if (!f->DeSerialize(font_name, size)) {\n    return false;\n  }\n  font_name[size] = '\\0';\n  return f->DeSerialize(&fi->properties);\n}\n\nbool write_info(FILE *f, const FontInfo &fi) {\n  int32_t size = strlen(fi.name);\n  return tesseract::Serialize(f, &size) && tesseract::Serialize(f, &fi.name[0], size) &&\n         tesseract::Serialize(f, &fi.properties);\n}\n\nbool read_spacing_info(TFile *f, FontInfo *fi) {\n  int32_t vec_size, kern_size;\n  if (!f->DeSerialize(&vec_size)) {\n    return false;\n  }\n  ASSERT_HOST(vec_size >= 0);\n  if (vec_size == 0) {\n    return true;\n  }\n  fi->init_spacing(vec_size);\n  for (int i = 0; i < vec_size; ++i) {\n    auto *fs = new FontSpacingInfo();\n    if (!f->DeSerialize(&fs->x_gap_before) || !f->DeSerialize(&fs->x_gap_after) ||\n        !f->DeSerialize(&kern_size)) {\n      delete fs;\n      return false;\n    }\n    if (kern_size < 0) { // indication of a nullptr entry in 
fi->spacing_vec\n      delete fs;\n      continue;\n    }\n    if (kern_size > 0 &&\n        (!f->DeSerialize(fs->kerned_unichar_ids) || !f->DeSerialize(fs->kerned_x_gaps))) {\n      delete fs;\n      return false;\n    }\n    fi->add_spacing(i, fs);\n  }\n  return true;\n}\n\nbool write_spacing_info(FILE *f, const FontInfo &fi) {\n  int32_t vec_size = (fi.spacing_vec == nullptr) ? 0 : fi.spacing_vec->size();\n  if (!tesseract::Serialize(f, &vec_size)) {\n    return false;\n  }\n  int16_t x_gap_invalid = -1;\n  for (int i = 0; i < vec_size; ++i) {\n    FontSpacingInfo *fs = fi.spacing_vec->at(i);\n    int32_t kern_size = (fs == nullptr) ? -1 : fs->kerned_x_gaps.size();\n    if (fs == nullptr) {\n      // Writing two invalid x-gaps.\n      if (!tesseract::Serialize(f, &x_gap_invalid, 2) || !tesseract::Serialize(f, &kern_size)) {\n        return false;\n      }\n    } else {\n      if (!tesseract::Serialize(f, &fs->x_gap_before) ||\n          !tesseract::Serialize(f, &fs->x_gap_after) || !tesseract::Serialize(f, &kern_size)) {\n        return false;\n      }\n    }\n    if (kern_size > 0 &&\n        (!Serialize(f, fs->kerned_unichar_ids) || !Serialize(f, fs->kerned_x_gaps))) {\n      return false;\n    }\n  }\n  return true;\n}\n\nbool write_set(FILE *f, const FontSet &fs) {\n  int size = fs.size();\n  return tesseract::Serialize(f, &size) &&\n         (size > 0 ? tesseract::Serialize(f, &fs[0], size) : true);\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccstruct/fontinfo.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        fontinfo.h\n// Description: Font information classes abstracted from intproto.h/cpp.\n// Author:      rays@google.com (Ray Smith)\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCSTRUCT_FONTINFO_H_\n#define TESSERACT_CCSTRUCT_FONTINFO_H_\n\n#include \"errcode.h\"\n\n#include <tesseract/unichar.h>\n#include \"genericvector.h\"\n\n#include <cstdint> // for uint16_t, uint32_t\n#include <cstdio>  // for FILE\n#include <vector>\n\nnamespace tesseract {\n\ntemplate <typename T>\nclass UnicityTable;\n\n// Simple struct to hold a font and a score. The scores come from the low-level\n// integer matcher, so they are in the uint16_t range. 
Fonts are an index to\n// fontinfo_table.\n// These get copied around a lot, so best to keep them small.\nstruct ScoredFont {\n  ScoredFont() : fontinfo_id(-1), score(0) {}\n  ScoredFont(int font_id, uint16_t classifier_score)\n      : fontinfo_id(font_id), score(classifier_score) {}\n\n  // Index into fontinfo table, but inside the classifier, may be a shapetable\n  // index.\n  int32_t fontinfo_id;\n  // Raw score from the low-level classifier.\n  uint16_t score;\n};\n\n// Struct for information about spacing between characters in a particular font.\nstruct FontSpacingInfo {\n  int16_t x_gap_before;\n  int16_t x_gap_after;\n  std::vector<UNICHAR_ID> kerned_unichar_ids;\n  std::vector<int16_t> kerned_x_gaps;\n};\n\n/*\n * font_properties contains properties about boldness, italicness, fixed pitch,\n * serif, fraktur\n */\nstruct FontInfo {\n  FontInfo() : name(nullptr), properties(0), universal_id(0), spacing_vec(nullptr) {}\n  ~FontInfo() = default;\n\n  bool operator==(const FontInfo &rhs) const {\n    return strcmp(name, rhs.name) == 0;\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(TFile *fp);\n\n  // Reserves unicharset_size spots in spacing_vec.\n  void init_spacing(int unicharset_size) {\n    spacing_vec = new std::vector<FontSpacingInfo *>(unicharset_size);\n  }\n  // Adds the given pointer to FontSpacingInfo to spacing_vec member\n  // (FontInfo class takes ownership of the pointer).\n  // Note: init_spacing should be called before calling this function.\n  void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {\n    ASSERT_HOST(static_cast<size_t>(uch_id) < spacing_vec->size());\n    (*spacing_vec)[uch_id] = spacing_info;\n  }\n\n  // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID.\n  const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {\n    return (spacing_vec == nullptr || spacing_vec->size() <= static_cast<size_t>(uch_id)) ? nullptr\n                                                                     : (*spacing_vec)[uch_id];\n  }\n\n  // Fills spacing with the value of the x gap expected between the two given\n  // UNICHAR_IDs. 
Returns true on success.\n  bool get_spacing(UNICHAR_ID prev_uch_id, UNICHAR_ID uch_id, int *spacing) const {\n    const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);\n    const FontSpacingInfo *fsi = this->get_spacing(uch_id);\n    if (prev_fsi == nullptr || fsi == nullptr) {\n      return false;\n    }\n    size_t i = 0;\n    for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {\n      if (prev_fsi->kerned_unichar_ids[i] == uch_id) {\n        break;\n      }\n    }\n    if (i < prev_fsi->kerned_unichar_ids.size()) {\n      *spacing = prev_fsi->kerned_x_gaps[i];\n    } else {\n      *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;\n    }\n    return true;\n  }\n\n  bool is_italic() const {\n    return properties & 1;\n  }\n  bool is_bold() const {\n    return (properties & 2) != 0;\n  }\n  bool is_fixed_pitch() const {\n    return (properties & 4) != 0;\n  }\n  bool is_serif() const {\n    return (properties & 8) != 0;\n  }\n  bool is_fraktur() const {\n    return (properties & 16) != 0;\n  }\n\n  char *name;\n  uint32_t properties;\n  // The universal_id is a field reserved for the initialization process\n  // to assign a unique id number to all fonts loaded for the current\n  // combination of languages. This id will then be returned by\n  // ResultIterator::WordFontAttributes.\n  int32_t universal_id;\n  // Horizontal spacing between characters (indexed by UNICHAR_ID).\n  std::vector<FontSpacingInfo *> *spacing_vec;\n};\n\n// Every class (character) owns a FontSet that represents all the fonts that can\n// render this character.\n// Since almost all the characters from the same script share the same set of\n// fonts, the sets are shared over multiple classes (see\n// Classify::fontset_table_). Thus, a class only store an id to a set.\n// Because some fonts cannot render just one character of a set, there are a\n// lot of FontSet that differ only by one font. 
Rather than storing directly\n// the FontInfo in the FontSet structure, it's better to share FontInfos among\n// FontSets (Classify::fontinfo_table_).\nusing FontSet = std::vector<int>;\n\n// Class that adds a bit of functionality on top of GenericVector to\n// implement a table of FontInfo that replaces UniCityTable<FontInfo>.\n// TODO(rays) change all references once all existing traineddata files\n// are replaced.\nclass FontInfoTable : public GenericVector<FontInfo> {\npublic:\n  TESS_API // when you remove inheritance from GenericVector, move this on\n  // class level\n  FontInfoTable();\n  TESS_API\n  ~FontInfoTable();\n\n  // Writes to the given file. Returns false in case of error.\n  TESS_API\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  TESS_API\n  bool DeSerialize(TFile *fp);\n\n  // Returns true if the given set of fonts includes one with the same\n  // properties as font_id.\n  TESS_API\n  bool SetContainsFontProperties(int font_id, const std::vector<ScoredFont> &font_set) const;\n  // Returns true if the given set of fonts includes multiple properties.\n  TESS_API\n  bool SetContainsMultipleFontProperties(const std::vector<ScoredFont> &font_set) const;\n\n  // Moves any non-empty FontSpacingInfo entries from other to this.\n  TESS_API\n  void MoveSpacingInfoFrom(FontInfoTable *other);\n  // Moves this to the target unicity table.\n  TESS_API\n  void MoveTo(UnicityTable<FontInfo> *target);\n};\n\n// Deletion callbacks for GenericVector.\nvoid FontInfoDeleteCallback(FontInfo f);\n\n// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.\nbool read_info(TFile *f, FontInfo *fi);\nbool write_info(FILE *f, const FontInfo &fi);\nbool read_spacing_info(TFile *f, FontInfo *fi);\nbool write_spacing_info(FILE *f, const FontInfo &fi);\nbool write_set(FILE *f, const FontSet &fs);\n\n} // namespace tesseract.\n\n#endif /* 
THIRD_PARTY_TESSERACT_CCSTRUCT_FONTINFO_H_ */\n"
  },
  {
    "path": "src/ccstruct/image.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"image.h\"\n\n#include <allheaders.h>\n\nnamespace tesseract {\n\nImage Image::clone() const {\n  return pix_ ? pixClone(pix_) : nullptr;\n}\n\nImage Image::copy() const {\n  return pixCopy(nullptr, pix_);\n}\n\nvoid Image::destroy() {\n  pixDestroy(&pix_);\n}\n\nbool Image::isZero() const {\n  l_int32 r = 0;\n  pixZero(pix_, &r);\n  return r == 1;\n}\n\nImage Image::operator|(Image i) const {\n  return pixOr(nullptr, pix_, i);\n}\n\nImage &Image::operator|=(Image i) {\n  pixOr(pix_, pix_, i);\n  return *this;\n}\n\nImage Image::operator&(Image i) const {\n  return pixAnd(nullptr, pix_, i);\n}\n\nImage &Image::operator&=(Image i) {\n  pixAnd(pix_, pix_, i);\n  return *this;\n}\n\n}\n"
  },
  {
    "path": "src/ccstruct/image.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        image.h\n// Description: Image wrapper.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCSTRUCT_IMAGE_H_\n#define TESSERACT_CCSTRUCT_IMAGE_H_\n\n#include <tesseract/export.h>\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass TESS_API Image {\npublic:\n  Pix *pix_ = nullptr;\n\npublic:\n  Image() = default;\n  Image(Pix *pix) : pix_(pix) {}\n\n  // service\n  bool operator==(decltype(nullptr)) const { return pix_ == nullptr; }\n  bool operator!=(decltype(nullptr)) const { return pix_ != nullptr; }\n  explicit operator bool() const { return pix_ != nullptr; }\n  operator Pix *() const { return pix_; }\n  explicit operator Pix **() { return &pix_; }\n  Pix *operator->() const { return pix_; }\n\n  // api\n  Image clone() const; // increases refcount\n  Image copy() const;  // does full copy\n  void destroy();\n  bool isZero() const;\n\n  // ops\n  Image operator|(Image) const;\n  Image &operator|=(Image);\n  Image operator&(Image) const;\n  Image &operator&=(Image);\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_IMAGE_H_\n"
  },
  {
    "path": "src/ccstruct/imagedata.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        imagedata.cpp\n// Description: Class to hold information about a single multi-page tiff\n//              training file and its corresponding boxes or text file.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"imagedata.h\"\n\n#include \"boxread.h\"    // for ReadMemBoxes\n#include \"rect.h\"       // for TBOX\n#include \"scrollview.h\" // for ScrollView, ScrollView::CYAN, ScrollView::NONE\n#include \"tprintf.h\"    // for tprintf\n#include \"tesserrstream.h\" // for tesserr\n\n#include \"helpers.h\"  // for IntCastRounded, TRand, ClipToRange, Modulo\n#include \"serialis.h\" // for TFile\n\n#include <allheaders.h> // for pixDestroy, pixGetHeight, pixGetWidth, lept_...\n\n#include <cinttypes>    // for PRId64\n#include <fstream>      // for std::ifstream\n\nnamespace tesseract {\n\n// Number of documents to read ahead while training. 
Doesn't need to be very\n// large.\nconst int kMaxReadAhead = 8;\n\nImageData::ImageData() : page_number_(-1), vertical_text_(false) {}\n// Takes ownership of the pix and destroys it.\nImageData::ImageData(bool vertical, Image pix)\n    : page_number_(0), vertical_text_(vertical) {\n  SetPix(pix);\n}\nImageData::~ImageData() {\n#ifdef TESSERACT_IMAGEDATA_AS_PIX\n  internal_pix_.destroy();\n#endif\n}\n\n// Builds and returns an ImageData from the basic data. Note that imagedata,\n// truth_text, and box_text are all the actual file data, NOT filenames.\nImageData *ImageData::Build(const char *name, int page_number, const char *lang,\n                            const char *imagedata, int imagedatasize,\n                            const char *truth_text, const char *box_text) {\n  auto *image_data = new ImageData();\n  image_data->imagefilename_ = name;\n  image_data->page_number_ = page_number;\n  image_data->language_ = lang;\n  // Save the imagedata.\n  // TODO: optimize resize (no init).\n  image_data->image_data_.resize(imagedatasize);\n  memcpy(&image_data->image_data_[0], imagedata, imagedatasize);\n  if (!image_data->AddBoxes(box_text)) {\n    if (truth_text == nullptr || truth_text[0] == '\\0') {\n      tprintf(\"Error: No text corresponding to page %d from image %s!\\n\",\n              page_number, name);\n      delete image_data;\n      return nullptr;\n    }\n    image_data->transcription_ = truth_text;\n    // If we have no boxes, the transcription is in the 0th box_texts_.\n    image_data->box_texts_.emplace_back(truth_text);\n    // We will create a box for the whole image on PreScale, to save unpacking\n    // the image now.\n  } else if (truth_text != nullptr && truth_text[0] != '\\0' &&\n             image_data->transcription_ != truth_text) {\n    // Save the truth text as it is present and disagrees with the box text.\n    image_data->transcription_ = truth_text;\n  }\n  return image_data;\n}\n\n// Writes to the given file. 
Returns false in case of error.\nbool ImageData::Serialize(TFile *fp) const {\n  if (!fp->Serialize(imagefilename_)) {\n    return false;\n  }\n  if (!fp->Serialize(&page_number_)) {\n    return false;\n  }\n  if (!fp->Serialize(image_data_)) {\n    return false;\n  }\n  if (!fp->Serialize(language_)) {\n    return false;\n  }\n  if (!fp->Serialize(transcription_)) {\n    return false;\n  }\n  if (!fp->Serialize(boxes_)) {\n    return false;\n  }\n  if (!fp->Serialize(box_texts_)) {\n    return false;\n  }\n  int8_t vertical = vertical_text_;\n  return fp->Serialize(&vertical);\n}\n\n// Reads from the given file. Returns false in case of error.\nbool ImageData::DeSerialize(TFile *fp) {\n  if (!fp->DeSerialize(imagefilename_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&page_number_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(image_data_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(language_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(transcription_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(boxes_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(box_texts_)) {\n    return false;\n  }\n  int8_t vertical = 0;\n  if (!fp->DeSerialize(&vertical)) {\n    return false;\n  }\n  vertical_text_ = vertical != 0;\n  return true;\n}\n\n// As DeSerialize, but only seeks past the data - hence a static method.\nbool ImageData::SkipDeSerialize(TFile *fp) {\n  if (!fp->DeSerializeSkip()) {\n    return false;\n  }\n  int32_t page_number;\n  if (!fp->DeSerialize(&page_number)) {\n    return false;\n  }\n  if (!fp->DeSerializeSkip()) {\n    return false;\n  }\n  if (!fp->DeSerializeSkip()) {\n    return false;\n  }\n  if (!fp->DeSerializeSkip()) {\n    return false;\n  }\n  if (!fp->DeSerializeSkip(sizeof(TBOX))) {\n    return false;\n  }\n  int32_t number;\n  if (!fp->DeSerialize(&number)) {\n    return false;\n  }\n  for (int i = 0; i < number; i++) {\n    if (!fp->DeSerializeSkip()) {\n      return false;\n    }\n  }\n  int8_t vertical = 
0;\n  return fp->DeSerialize(&vertical);\n}\n\n// Saves the given Pix as a PNG-encoded string and destroys it.\n// In case of missing PNG support in Leptonica use PNM format,\n// which requires more memory.\nvoid ImageData::SetPix(Image pix) {\n#ifdef TESSERACT_IMAGEDATA_AS_PIX\n  internal_pix_ = pix;\n#else\n  SetPixInternal(pix, &image_data_);\n#endif\n}\n\n// Returns the Pix image for *this. Must be pixDestroyed after use.\nImage ImageData::GetPix() const {\n#ifdef TESSERACT_IMAGEDATA_AS_PIX\n#  ifdef GRAPHICS_DISABLED\n  /* The only caller of this is the scaling functions to prescale the\n   * source. Thus we can just return a new pointer to the same data. */\n  return internal_pix_.clone();\n#  else\n  /* pixCopy always does an actual copy, so the caller can modify the\n   * changed data. */\n  return internal_pix_.copy();\n#  endif\n#else\n  return GetPixInternal(image_data_);\n#endif\n}\n\n// Gets anything and everything with a non-nullptr pointer, prescaled to a\n// given target_height (if 0, then the original image height), and aligned.\n// Also returns (if not nullptr) the width and height of the scaled image.\n// The return value is the scaled Pix, which must be pixDestroyed after use,\n// and scale_factor (if not nullptr) is set to the scale factor that was applied\n// to the image to achieve the target_height.\nImage ImageData::PreScale(int target_height, int max_height,\n                          float *scale_factor, int *scaled_width,\n                          int *scaled_height, std::vector<TBOX> *boxes) const {\n  int input_width = 0;\n  int input_height = 0;\n  Image src_pix = GetPix();\n  ASSERT_HOST(src_pix != nullptr);\n  input_width = pixGetWidth(src_pix);\n  input_height = pixGetHeight(src_pix);\n  if (target_height == 0) {\n    target_height = std::min(input_height, max_height);\n  }\n  float im_factor = static_cast<float>(target_height) / input_height;\n  if (scaled_width != nullptr) {\n    *scaled_width = IntCastRounded(im_factor * 
input_width);\n  }\n  if (scaled_height != nullptr) {\n    *scaled_height = target_height;\n  }\n  // Get the scaled image.\n  Image pix = pixScale(src_pix, im_factor, im_factor);\n  if (pix == nullptr) {\n    tprintf(\"Scaling pix of size %d, %d by factor %g made null pix!!\\n\",\n            input_width, input_height, im_factor);\n    src_pix.destroy();\n    return nullptr;\n  }\n  if (scaled_width != nullptr) {\n    *scaled_width = pixGetWidth(pix);\n  }\n  if (scaled_height != nullptr) {\n    *scaled_height = pixGetHeight(pix);\n  }\n  src_pix.destroy();\n  if (boxes != nullptr) {\n    // Get the boxes.\n    boxes->clear();\n    for (auto box : boxes_) {\n      box.scale(im_factor);\n      boxes->push_back(box);\n    }\n    if (boxes->empty()) {\n      // Make a single box for the whole image.\n      TBOX box(0, 0, im_factor * input_width, target_height);\n      boxes->push_back(box);\n    }\n  }\n  if (scale_factor != nullptr) {\n    *scale_factor = im_factor;\n  }\n  return pix;\n}\n\nint ImageData::MemoryUsed() const {\n  return image_data_.size();\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Draws the data in a new window.\nvoid ImageData::Display() const {\n  const int kTextSize = 64;\n  // Draw the image.\n  Image pix = GetPix();\n  if (pix == nullptr) {\n    return;\n  }\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  auto *win = new ScrollView(\"Imagedata\", 100, 100, 2 * (width + 2 * kTextSize),\n                             2 * (height + 4 * kTextSize), width + 10,\n                             height + 3 * kTextSize, true);\n  win->Draw(pix, 0, height - 1);\n  pix.destroy();\n  // Draw the boxes.\n  win->Pen(ScrollView::RED);\n  win->Brush(ScrollView::NONE);\n  int text_size = kTextSize;\n  if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) {\n    text_size = boxes_[0].height() * 2;\n  }\n  win->TextAttributes(\"Arial\", text_size, false, false, false);\n  if (!boxes_.empty()) {\n    for (unsigned b = 0; b < boxes_.size(); 
++b) {\n      boxes_[b].plot(win);\n      win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());\n    }\n  } else {\n    // The full transcription.\n    win->Pen(ScrollView::CYAN);\n    win->Text(0, height + kTextSize * 2, transcription_.c_str());\n  }\n  win->Update();\n  win->Wait();\n}\n\n#endif\n\n// Adds the supplied boxes and transcriptions that correspond to the correct\n// page number.\nvoid ImageData::AddBoxes(const std::vector<TBOX> &boxes,\n                         const std::vector<std::string> &texts,\n                         const std::vector<int> &box_pages) {\n  // Copy the boxes and make the transcription.\n  for (unsigned i = 0; i < box_pages.size(); ++i) {\n    if (page_number_ >= 0 && box_pages[i] != page_number_) {\n      continue;\n    }\n    transcription_ += texts[i];\n    boxes_.push_back(boxes[i]);\n    box_texts_.push_back(texts[i]);\n  }\n}\n\n#ifndef TESSERACT_IMAGEDATA_AS_PIX\n// Saves the given Pix as a PNG-encoded string and destroys it.\n// In case of missing PNG support in Leptonica use PNM format,\n// which requires more memory.\nvoid ImageData::SetPixInternal(Image pix, std::vector<char> *image_data) {\n  l_uint8 *data;\n  size_t size;\n  l_int32 ret;\n  ret = pixWriteMem(&data, &size, pix, IFF_PNG);\n  if (ret) {\n    ret = pixWriteMem(&data, &size, pix, IFF_PNM);\n  }\n  pix.destroy();\n  // TODO: optimize resize (no init).\n  image_data->resize(size);\n  memcpy(&(*image_data)[0], data, size);\n  lept_free(data);\n}\n\n// Returns the Pix image for the image_data. 
Must be pixDestroyed after use.\nImage ImageData::GetPixInternal(const std::vector<char> &image_data) {\n  Image pix = nullptr;\n  if (!image_data.empty()) {\n    // Convert the array to an image.\n    const auto *u_data =\n        reinterpret_cast<const unsigned char *>(&image_data[0]);\n    pix = pixReadMem(u_data, image_data.size());\n  }\n  return pix;\n}\n#endif\n\n// Parses the text string as a box file and adds any discovered boxes that\n// match the page number. Returns false on error.\nbool ImageData::AddBoxes(const char *box_text) {\n  if (box_text != nullptr && box_text[0] != '\\0') {\n    std::vector<TBOX> boxes;\n    std::vector<std::string> texts;\n    std::vector<int> box_pages;\n    if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,\n                     /*continue_on_failure*/ true, &boxes, &texts, nullptr,\n                     &box_pages)) {\n      AddBoxes(boxes, texts, box_pages);\n      return true;\n    } else {\n      tprintf(\"Error: No boxes for page %d from image %s!\\n\", page_number_,\n              imagefilename_.c_str());\n    }\n  }\n  return false;\n}\n\nDocumentData::DocumentData(const std::string &name)\n    : document_name_(name),\n      pages_offset_(-1),\n      total_pages_(-1),\n      memory_used_(0),\n      max_memory_(0),\n      reader_(nullptr) {}\n\nDocumentData::~DocumentData() {\n  if (thread.joinable()) {\n    thread.join();\n  }\n  std::lock_guard<std::mutex> lock_p(pages_mutex_);\n  std::lock_guard<std::mutex> lock_g(general_mutex_);\n  for (auto data : pages_) {\n    delete data;\n  }\n}\n\n// Reads all the pages in the given lstmf filename to the cache. 
The reader\n// is used to read the file.\nbool DocumentData::LoadDocument(const char *filename, int start_page,\n                                int64_t max_memory, FileReader reader) {\n  SetDocument(filename, max_memory, reader);\n  pages_offset_ = start_page;\n  return ReCachePages();\n}\n\n// Sets up the document, without actually loading it.\nvoid DocumentData::SetDocument(const char *filename, int64_t max_memory,\n                               FileReader reader) {\n  std::lock_guard<std::mutex> lock_p(pages_mutex_);\n  std::lock_guard<std::mutex> lock(general_mutex_);\n  document_name_ = filename;\n  pages_offset_ = -1;\n  max_memory_ = max_memory;\n  reader_ = reader;\n}\n\n// Writes all the pages to the given filename. Returns false on error.\nbool DocumentData::SaveDocument(const char *filename, FileWriter writer) {\n  std::lock_guard<std::mutex> lock(pages_mutex_);\n  TFile fp;\n  fp.OpenWrite(nullptr);\n  if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) {\n    tprintf(\"Serialize failed: %s\\n\", filename);\n    return false;\n  }\n  return true;\n}\n\n// Adds the given page data to this document, counting up memory.\nvoid DocumentData::AddPageToDocument(ImageData *page) {\n  std::lock_guard<std::mutex> lock(pages_mutex_);\n  pages_.push_back(page);\n  set_memory_used(memory_used() + page->MemoryUsed());\n}\n\n// If the given index is not currently loaded, loads it using a separate\n// thread.\nvoid DocumentData::LoadPageInBackground(int index) {\n  ImageData *page = nullptr;\n  if (IsPageAvailable(index, &page)) {\n    return;\n  }\n  {\n    std::lock_guard<std::mutex> lock(pages_mutex_);\n    if (pages_offset_ == index) {\n      return;\n    }\n    pages_offset_ = index;\n    for (auto page : pages_) {\n      delete page;\n    }\n    pages_.clear();\n  }\n  if (thread.joinable()) {\n    thread.join();\n  }\n  // Don't run next statement asynchronously because that would\n  // create too many threads on Linux (see issue #3111).\n  
ReCachePages();\n}\n\n// Returns a pointer to the page with the given index, modulo the total\n// number of pages. Blocks until the background load is completed.\nconst ImageData *DocumentData::GetPage(int index) {\n  ImageData *page = nullptr;\n  while (!IsPageAvailable(index, &page)) {\n    // If there is no background load scheduled, schedule one now.\n    pages_mutex_.lock();\n    bool needs_loading = pages_offset_ != index;\n    pages_mutex_.unlock();\n    if (needs_loading) {\n      LoadPageInBackground(index);\n    }\n    // We can't directly load the page, or the background load will delete it\n    // while the caller is using it, so give it a chance to work.\n    std::this_thread::yield();\n  }\n  return page;\n}\n\n// Returns true if the requested page is available, and provides a pointer,\n// which may be nullptr if the document is empty. May block, even though it\n// doesn't guarantee to return true.\nbool DocumentData::IsPageAvailable(int index, ImageData **page) {\n  std::lock_guard<std::mutex> lock(pages_mutex_);\n  int num_pages = NumPages();\n  if (num_pages == 0 || index < 0) {\n    *page = nullptr; // Empty Document.\n    return true;\n  }\n  if (num_pages > 0) {\n    index = Modulo(index, num_pages);\n    if (pages_offset_ <= index &&\n        static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {\n      *page = pages_[index - pages_offset_]; // Page is available already.\n      return true;\n    }\n  }\n  return false;\n}\n\n// Removes all pages from memory and frees the memory, but does not forget\n// the document metadata.\nint64_t DocumentData::UnCache() {\n  std::lock_guard<std::mutex> lock(pages_mutex_);\n  int64_t memory_saved = memory_used();\n  for (auto page : pages_) {\n    delete page;\n  }\n  pages_.clear();\n  pages_offset_ = -1;\n  set_total_pages(-1);\n  set_memory_used(0);\n  tprintf(\"Unloaded document %s, saving %\" PRId64 \" memory\\n\",\n          document_name_.c_str(), memory_saved);\n  return 
memory_saved;\n}\n\n// Shuffles all the pages in the document.\nvoid DocumentData::Shuffle() {\n  TRand random;\n  // Different documents get shuffled differently, but the same for the same\n  // name.\n  std::hash<std::string> hasher;\n  random.set_seed(static_cast<uint64_t>(hasher(document_name_)));\n  int num_pages = pages_.size();\n  // Execute one random swap for each page in the document.\n  for (int i = 0; i < num_pages; ++i) {\n    int src = random.IntRand() % num_pages;\n    int dest = random.IntRand() % num_pages;\n    std::swap(pages_[src], pages_[dest]);\n  }\n}\n\n// Locks the pages_mutex_ and loads as many pages as will fit into max_memory_\n// starting at index pages_offset_.\nbool DocumentData::ReCachePages() {\n  std::lock_guard<std::mutex> lock(pages_mutex_);\n  // Read the file.\n  set_total_pages(0);\n  set_memory_used(0);\n  int loaded_pages = 0;\n  for (auto page : pages_) {\n    delete page;\n  }\n  pages_.clear();\n#if !defined(TESSERACT_IMAGEDATA_AS_PIX)\n  auto name_size = document_name_.size();\n  if (name_size > 4 && document_name_.substr(name_size - 4) == \".png\") {\n    // PNG image given instead of LSTMF file.\n    std::string gt_name = document_name_.substr(0, name_size - 3) + \"gt.txt\";\n    std::ifstream t(gt_name);\n    std::string line;\n    std::getline(t, line);\n    t.close();\n    ImageData *image_data = ImageData::Build(document_name_.c_str(), 0, \"\", nullptr, 0, line.c_str(), nullptr);\n    Image image = pixRead(document_name_.c_str());\n    image_data->SetPix(image);\n    pages_.push_back(image_data);\n    loaded_pages = 1;\n    pages_offset_ %= loaded_pages;\n    set_total_pages(loaded_pages);\n    set_memory_used(memory_used() + image_data->MemoryUsed());\n#if 0\n    tprintf(\"Loaded %zu/%d lines (%d-%zu) of document %s\\n\", pages_.size(),\n            loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),\n            document_name_.c_str());\n#endif\n    return !pages_.empty();\n  }\n#endif\n  TFile fp;\n  
if (!fp.Open(document_name_.c_str(), reader_) ||\n      !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) {\n    tprintf(\"Deserialize header failed: %s\\n\", document_name_.c_str());\n    return false;\n  }\n  pages_offset_ %= loaded_pages;\n  // Skip pages before the first one we want, and load the rest until max\n  // memory and skip the rest after that.\n  int page;\n  for (page = 0; page < loaded_pages; ++page) {\n    uint8_t non_null;\n    if (!fp.DeSerialize(&non_null)) {\n      break;\n    }\n    if (page < pages_offset_ ||\n        (max_memory_ > 0 && memory_used() > max_memory_)) {\n      if (non_null && !ImageData::SkipDeSerialize(&fp)) {\n        break;\n      }\n    } else {\n      ImageData *image_data = nullptr;\n      if (non_null) {\n        image_data = new ImageData;\n        if (!image_data->DeSerialize(&fp)) {\n          delete image_data;\n          break;\n        }\n      }\n      pages_.push_back(image_data);\n      if (image_data->imagefilename().empty()) {\n        image_data->set_imagefilename(document_name_);\n        image_data->set_page_number(page);\n      }\n      set_memory_used(memory_used() + image_data->MemoryUsed());\n    }\n  }\n  if (page < loaded_pages) {\n    tprintf(\"Deserialize failed: %s read %d/%d lines\\n\", document_name_.c_str(),\n            page, loaded_pages);\n    for (auto page : pages_) {\n      delete page;\n    }\n    pages_.clear();\n  } else if (loaded_pages > 1) {\n    // Avoid lots of messages for training with single line images.\n    tesserr << \"Loaded \" << pages_.size() << '/' << loaded_pages << \" lines (\"\n            << pages_offset_ + 1 << '-'\n            << pages_offset_ + pages_.size() << \") of document \"\n            << document_name_ << '\\n';\n  }\n  set_total_pages(loaded_pages);\n  return !pages_.empty();\n}\n\n// A collection of DocumentData that knows roughly how much memory it is using.\nDocumentCache::DocumentCache(int64_t max_memory) : max_memory_(max_memory) 
{}\n\nDocumentCache::~DocumentCache() {\n  for (auto *document : documents_) {\n    delete document;\n  }\n}\n\n// Adds all the documents in the list of filenames, counting memory.\n// The reader is used to read the files.\nbool DocumentCache::LoadDocuments(const std::vector<std::string> &filenames,\n                                  CachingStrategy cache_strategy,\n                                  FileReader reader) {\n  cache_strategy_ = cache_strategy;\n  int64_t fair_share_memory = 0;\n  // In the round-robin case, each DocumentData handles restricting its content\n  // to its fair share of memory. In the sequential case, DocumentCache\n  // determines which DocumentDatas are held entirely in memory.\n  if (cache_strategy_ == CS_ROUND_ROBIN) {\n    fair_share_memory = max_memory_ / filenames.size();\n  }\n  for (const auto &filename : filenames) {\n    auto *document = new DocumentData(filename);\n    document->SetDocument(filename.c_str(), fair_share_memory, reader);\n    AddToCache(document);\n  }\n  if (!documents_.empty()) {\n    // Try to get the first page now to verify the list of filenames.\n    if (GetPageBySerial(0) != nullptr) {\n      return true;\n    }\n    tprintf(\"Load of page 0 failed!\\n\");\n  }\n  return false;\n}\n\n// Adds document to the cache.\nbool DocumentCache::AddToCache(DocumentData *data) {\n  documents_.push_back(data);\n  return true;\n}\n\n// Finds and returns a document by name.\nDocumentData *DocumentCache::FindDocument(\n    const std::string &document_name) const {\n  for (auto *document : documents_) {\n    if (document->document_name() == document_name) {\n      return document;\n    }\n  }\n  return nullptr;\n}\n\n// Returns the total number of pages in an epoch. 
For CS_ROUND_ROBIN cache\n// strategy, could take a long time.\nint DocumentCache::TotalPages() {\n  if (cache_strategy_ == CS_SEQUENTIAL) {\n    // In sequential mode, we assume each doc has the same number of pages\n    // whether it is true or not.\n    if (num_pages_per_doc_ == 0) {\n      GetPageSequential(0);\n    }\n    return num_pages_per_doc_ * documents_.size();\n  }\n  int total_pages = 0;\n  for (auto *document : documents_) {\n    // We have to load a page to make NumPages() valid.\n    document->GetPage(0);\n    total_pages += document->NumPages();\n  }\n  return total_pages;\n}\n\n// Returns a page by serial number, selecting them in a round-robin fashion\n// from all the documents. Highly disk-intensive, but doesn't need samples\n// to be shuffled between files to begin with.\nconst ImageData *DocumentCache::GetPageRoundRobin(int serial) {\n  int num_docs = documents_.size();\n  int doc_index = serial % num_docs;\n  const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs);\n  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {\n    doc_index = (serial + offset) % num_docs;\n    int page = (serial + offset) / num_docs;\n    documents_[doc_index]->LoadPageInBackground(page);\n  }\n  return doc;\n}\n\n// Returns a page by serial number, selecting them in sequence from each file.\n// Requires the samples to be shuffled between the files to give a random or\n// uniform distribution of data. 
Less disk-intensive than GetPageRoundRobin.\nconst ImageData *DocumentCache::GetPageSequential(int serial) {\n  int num_docs = documents_.size();\n  ASSERT_HOST(num_docs > 0);\n  if (num_pages_per_doc_ == 0) {\n    // Use the pages in the first doc as the number of pages in each doc.\n    documents_[0]->GetPage(0);\n    num_pages_per_doc_ = documents_[0]->NumPages();\n    if (num_pages_per_doc_ == 0) {\n      tprintf(\"First document cannot be empty!!\\n\");\n      ASSERT_HOST(num_pages_per_doc_ > 0);\n    }\n    // Get rid of zero now if we don't need it.\n    if (serial / num_pages_per_doc_ % num_docs > 0) {\n      documents_[0]->UnCache();\n    }\n  }\n  int doc_index = serial / num_pages_per_doc_ % num_docs;\n  const ImageData *doc =\n      documents_[doc_index]->GetPage(serial % num_pages_per_doc_);\n  // Count up total memory. Background loading makes it more complicated to\n  // keep a running count.\n  int64_t total_memory = 0;\n  for (auto *document : documents_) {\n    total_memory += document->memory_used();\n  }\n  if (total_memory >= max_memory_) {\n    // Find something to un-cache.\n    // If there are more than 3 in front, then serial is from the back reader\n    // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then\n    // we create a hole between them and then un-caching the backmost occupied\n    // will work for both.\n    int num_in_front = CountNeighbourDocs(doc_index, 1);\n    for (int offset = num_in_front - 2;\n         offset > 1 && total_memory >= max_memory_; --offset) {\n      int next_index = (doc_index + offset) % num_docs;\n      total_memory -= documents_[next_index]->UnCache();\n    }\n    // If that didn't work, the best solution is to un-cache from the back. 
If\n    // we take away the document that a 2nd reader is using, it will put it\n    // back and make a hole between.\n    int num_behind = CountNeighbourDocs(doc_index, -1);\n    for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;\n         ++offset) {\n      int next_index = (doc_index + offset + num_docs) % num_docs;\n      total_memory -= documents_[next_index]->UnCache();\n    }\n  }\n  int next_index = (doc_index + 1) % num_docs;\n  if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {\n    documents_[next_index]->LoadPageInBackground(0);\n  }\n  return doc;\n}\n\n// Helper counts the number of adjacent cached neighbours of index looking in\n// direction dir, ie index+dir, index+2*dir etc.\nint DocumentCache::CountNeighbourDocs(int index, int dir) {\n  int num_docs = documents_.size();\n  for (int offset = dir; abs(offset) < num_docs; offset += dir) {\n    int offset_index = (index + offset + num_docs) % num_docs;\n    if (!documents_[offset_index]->IsCached()) {\n      return offset - dir;\n    }\n  }\n  return num_docs;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccstruct/imagedata.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        imagedata.h\n// Description: Class to hold information about a single image and its\n//              corresponding boxes or text file.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_IMAGE_IMAGEDATA_H_\n#define TESSERACT_IMAGE_IMAGEDATA_H_\n\n#include \"image.h\"\n#include \"points.h\" // for FCOORD\n\n#include <mutex>  // for std::mutex\n#include <thread> // for std::thread\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass TFile;\nclass ScrollView;\nclass TBOX;\n\n// Amount of padding to apply in output pixels in feature mode.\nconst int kFeaturePadding = 2;\n// Number of pixels to pad around text boxes.\nconst int kImagePadding = 4;\n\n// Enum to determine the caching and data sequencing strategy.\nenum CachingStrategy {\n  // Reads all of one file before moving on to the next. Requires samples to be\n  // shuffled across files. Uses the count of samples in the first file as\n  // the count in all the files to achieve high-speed random access. 
As a\n  // consequence, if subsequent files are smaller, they get entries used more\n  // than once, and if subsequent files are larger, some entries are not used.\n  // Best for larger data sets that don't fit in memory.\n  CS_SEQUENTIAL,\n  // Reads one sample from each file in rotation. Does not require shuffled\n  // samples, but is extremely disk-intensive. Samples in smaller files also\n  // get used more often than samples in larger files.\n  // Best for smaller data sets that mostly fit in memory.\n  CS_ROUND_ROBIN,\n};\n\n// Class to hold information on a single image:\n// Filename, cached image as a Pix*, character boxes, text transcription.\n// The text transcription is the ground truth UTF-8 text for the image.\n// Character boxes are optional and indicate the desired segmentation of\n// the text into recognition units.\nclass TESS_API ImageData {\npublic:\n  ImageData();\n  // Takes ownership of the pix.\n  ImageData(bool vertical, Image pix);\n  ~ImageData();\n\n  // Builds and returns an ImageData from the basic data. Note that imagedata,\n  // truth_text, and box_text are all the actual file data, NOT filenames.\n  static ImageData *Build(const char *name, int page_number, const char *lang,\n                          const char *imagedata, int imagedatasize, const char *truth_text,\n                          const char *box_text);\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  bool DeSerialize(TFile *fp);\n  // As DeSerialize, but only seeks past the data - hence a static method.\n  static bool SkipDeSerialize(TFile *fp);\n\n  // Other accessors.\n  const std::string &imagefilename() const {\n    return imagefilename_;\n  }\n  void set_imagefilename(const std::string &name) {\n    imagefilename_ = name;\n  }\n  int page_number() const {\n    return page_number_;\n  }\n  void set_page_number(int num) {\n    page_number_ = num;\n  }\n  const std::vector<char> &image_data() const {\n    return image_data_;\n  }\n  const std::string &language() const {\n    return language_;\n  }\n  void set_language(const std::string &lang) {\n    language_ = lang;\n  }\n  const std::string &transcription() const {\n    return transcription_;\n  }\n  const std::vector<TBOX> &boxes() const {\n    return boxes_;\n  }\n  const std::vector<std::string> &box_texts() const {\n    return box_texts_;\n  }\n  const std::string &box_text(int index) const {\n    return box_texts_[index];\n  }\n  // Saves the given Pix as a PNG-encoded string and destroys it.\n  // In case of missing PNG support in Leptonica use PNM format,\n  // which requires more memory.\n  void SetPix(Image pix);\n  // Returns the Pix image for *this. 
Must be pixDestroyed after use.\n  Image GetPix() const;\n  // Gets anything and everything with a non-nullptr pointer, prescaled to a\n  // given target_height (if 0, then the original image height), and aligned.\n  // Also returns (if not nullptr) the width and height of the scaled image.\n  // The return value is the scaled Pix, which must be pixDestroyed after use,\n  // and scale_factor (if not nullptr) is set to the scale factor that was\n  // applied to the image to achieve the target_height.\n  Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width,\n                int *scaled_height, std::vector<TBOX> *boxes) const;\n\n  int MemoryUsed() const;\n\n  // Draws the data in a new window.\n  void Display() const;\n\n  // Adds the supplied boxes and transcriptions that correspond to the correct\n  // page number.\n  void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,\n                const std::vector<int> &box_pages);\n\nprivate:\n  // Saves the given Pix as a PNG-encoded string and destroys it.\n  // In case of missing PNG support in Leptonica use PNM format,\n  // which requires more memory.\n  static void SetPixInternal(Image pix, std::vector<char> *image_data);\n  // Returns the Pix image for the image_data. Must be pixDestroyed after use.\n  static Image GetPixInternal(const std::vector<char> &image_data);\n  // Parses the text string as a box file and adds any discovered boxes that\n  // match the page number. 
Returns false on error.\n  bool AddBoxes(const char *box_text);\n\nprivate:\n  std::string imagefilename_; // File to read image from.\n  int32_t page_number_;  // Page number if multi-page tif or -1.\n  // see https://github.com/tesseract-ocr/tesseract/pull/2965\n  // EP: reconsider for tess6.0/opencv\n#ifdef TESSERACT_IMAGEDATA_AS_PIX\n  Image internal_pix_;\n#endif\n  std::vector<char> image_data_;  // PNG/PNM file data.\n  std::string language_;          // Language code for image.\n  std::string transcription_;     // UTF-8 ground truth of image.\n  std::vector<TBOX> boxes_;       // If non-empty boxes of the image.\n  std::vector<std::string> box_texts_; // String for text in each box.\n  bool vertical_text_;            // Image has been rotated from vertical.\n};\n\n// A collection of ImageData that knows roughly how much memory it is using.\nclass DocumentData {\npublic:\n  TESS_API\n  explicit DocumentData(const std::string &name);\n  TESS_API\n  ~DocumentData();\n\n  // Reads all the pages in the given lstmf filename to the cache. The reader\n  // is used to read the file.\n  TESS_API\n  bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader);\n  // Sets up the document, without actually loading it.\n  void SetDocument(const char *filename, int64_t max_memory, FileReader reader);\n  // Writes all the pages to the given filename. 
Returns false on error.\n  TESS_API\n  bool SaveDocument(const char *filename, FileWriter writer);\n\n  // Adds the given page data to this document, counting up memory.\n  TESS_API\n  void AddPageToDocument(ImageData *page);\n\n  const std::string &document_name() const {\n    std::lock_guard<std::mutex> lock(general_mutex_);\n    return document_name_;\n  }\n  int NumPages() const {\n    std::lock_guard<std::mutex> lock(general_mutex_);\n    return total_pages_;\n  }\n  size_t PagesSize() const {\n    return pages_.size();\n  }\n  int64_t memory_used() const {\n    std::lock_guard<std::mutex> lock(general_mutex_);\n    return memory_used_;\n  }\n  // If the given index is not currently loaded, loads it using a separate\n  // thread. Note: there are 4 cases:\n  // Document uncached: IsCached() returns false, total_pages_ < 0.\n  // Required page is available: IsPageAvailable returns true. In this case,\n  // total_pages_ > 0 and\n  // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()\n  // Pages are loaded, but the required one is not.\n  // The requested page is being loaded by LoadPageInBackground. In this case,\n  // index == pages_offset_. Once the loading starts, the pages lock is held\n  // until it completes, at which point IsPageAvailable will unblock and return\n  // true.\n  void LoadPageInBackground(int index);\n  // Returns a pointer to the page with the given index, modulo the total\n  // number of pages. Blocks until the background load is completed.\n  TESS_API\n  const ImageData *GetPage(int index);\n  // Returns true if the requested page is available, and provides a pointer,\n  // which may be nullptr if the document is empty. May block, even though it\n  // doesn't guarantee to return true.\n  bool IsPageAvailable(int index, ImageData **page);\n  // Takes ownership of the given page index. 
The page is made nullptr in *this.\n  ImageData *TakePage(int index) {\n    std::lock_guard<std::mutex> lock(pages_mutex_);\n    ImageData *page = pages_[index];\n    pages_[index] = nullptr;\n    return page;\n  }\n  // Returns true if the document is currently loaded or in the process of\n  // loading.\n  bool IsCached() const {\n    return NumPages() >= 0;\n  }\n  // Removes all pages from memory and frees the memory, but does not forget\n  // the document metadata. Returns the memory saved.\n  int64_t UnCache();\n  // Shuffles all the pages in the document.\n  void Shuffle();\n\nprivate:\n  // Sets the value of total_pages_ behind a mutex.\n  void set_total_pages(int total) {\n    std::lock_guard<std::mutex> lock(general_mutex_);\n    total_pages_ = total;\n  }\n  void set_memory_used(int64_t memory_used) {\n    std::lock_guard<std::mutex> lock(general_mutex_);\n    memory_used_ = memory_used;\n  }\n  // Locks the pages_mutex_ and loads as many pages as will fit into max_memory_\n  // starting at index pages_offset_.\n  bool ReCachePages();\n\nprivate:\n  // A name for this document.\n  std::string document_name_;\n  // A group of pages that corresponds in some loose way to a document.\n  std::vector<ImageData *> pages_;\n  // Page number of the first index in pages_.\n  int pages_offset_;\n  // Total number of pages in document (may exceed size of pages_.)\n  int total_pages_;\n  // Total of all pix sizes in the document.\n  int64_t memory_used_;\n  // Max memory to use at any time.\n  int64_t max_memory_;\n  // Saved reader from LoadDocument to allow re-caching.\n  FileReader reader_;\n  // Mutex that protects pages_ and pages_offset_ against multiple parallel\n  // loads, and provides a wait for page.\n  std::mutex pages_mutex_;\n  // Mutex that protects other data members that callers want to access without\n  // waiting for a load operation.\n  mutable std::mutex general_mutex_;\n\n  // Thread which loads document.\n  std::thread thread;\n};\n\n// A 
collection of DocumentData that knows roughly how much memory it is using.\n// Note that while it supports background read-ahead, it assumes that a single\n// thread is accessing documents, ie it is not safe for multiple threads to\n// access different documents in parallel, as one may de-cache the other's\n// content.\nclass DocumentCache {\npublic:\n  TESS_API\n  explicit DocumentCache(int64_t max_memory);\n  TESS_API\n  ~DocumentCache();\n\n  // Deletes all existing documents from the cache.\n  void Clear() {\n    for (auto *document : documents_) {\n      delete document;\n    }\n    documents_.clear();\n    num_pages_per_doc_ = 0;\n  }\n  // Adds all the documents in the list of filenames, counting memory.\n  // The reader is used to read the files.\n  TESS_API\n  bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy,\n                     FileReader reader);\n\n  // Adds document to the cache.\n  bool AddToCache(DocumentData *data);\n\n  // Finds and returns a document by name.\n  DocumentData *FindDocument(const std::string &document_name) const;\n\n  // Returns a page by serial number using the current cache_strategy_ to\n  // determine the mapping from serial number to page.\n  const ImageData *GetPageBySerial(int serial) {\n    if (cache_strategy_ == CS_SEQUENTIAL) {\n      return GetPageSequential(serial);\n    } else {\n      return GetPageRoundRobin(serial);\n    }\n  }\n\n  const std::vector<DocumentData *> &documents() const {\n    return documents_;\n  }\n  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache\n  // strategy, could take a long time.\n  TESS_API\n  int TotalPages();\n\nprivate:\n  // Returns a page by serial number, selecting them in a round-robin fashion\n  // from all the documents. 
Highly disk-intensive, but doesn't need samples\n  // to be shuffled between files to begin with.\n  TESS_API\n  const ImageData *GetPageRoundRobin(int serial);\n  // Returns a page by serial number, selecting them in sequence from each file.\n  // Requires the samples to be shuffled between the files to give a random or\n  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.\n  TESS_API\n  const ImageData *GetPageSequential(int serial);\n\n  // Helper counts the number of adjacent cached neighbour documents_ of index\n  // looking in direction dir, ie index+dir, index+2*dir etc.\n  int CountNeighbourDocs(int index, int dir);\n\n  // A group of pages that corresponds in some loose way to a document.\n  std::vector<DocumentData *> documents_;\n  // Strategy to use for caching and serializing data samples.\n  CachingStrategy cache_strategy_ = CS_SEQUENTIAL;\n  // Number of pages in the first document, used as a divisor in\n  // GetPageSequential to determine the document index.\n  int num_pages_per_doc_ = 0;\n  // Max memory allowed in this cache.\n  int64_t max_memory_ = 0;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_IMAGE_IMAGEDATA_H_\n"
  },
  {
    "path": "src/ccstruct/linlsq.cpp",
    "content": "/**********************************************************************\n * File:        linlsq.cpp  (Formerly llsq.c)\n * Description: Linear Least squares fitting code.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"linlsq.h\"\n#include <cmath> // for std::sqrt\n#include <cstdio>\n#include \"errcode.h\"\n\nnamespace tesseract {\n\nconstexpr ERRCODE EMPTY_LLSQ(\"Can't delete from an empty LLSQ\");\n\n/**********************************************************************\n * LLSQ::clear\n *\n * Function to initialize a LLSQ.\n **********************************************************************/\n\nvoid LLSQ::clear() {  // initialize\n  total_weight = 0.0; // no elements\n  sigx = 0.0;         // update accumulators\n  sigy = 0.0;\n  sigxx = 0.0;\n  sigxy = 0.0;\n  sigyy = 0.0;\n}\n\n/**********************************************************************\n * LLSQ::add\n *\n * Add an element to the accumulator.\n **********************************************************************/\n\nvoid LLSQ::add(double x, double y) { // add an element\n  total_weight++;                    // count elements\n  sigx += x;                         // update accumulators\n  sigy += y;\n  sigxx += x * x;\n  sigxy += x * y;\n  sigyy += y * y;\n}\n// Adds an element with a specified 
weight.\nvoid LLSQ::add(double x, double y, double weight) {\n  total_weight += weight;\n  sigx += x * weight; // update accumulators\n  sigy += y * weight;\n  sigxx += x * x * weight;\n  sigxy += x * y * weight;\n  sigyy += y * y * weight;\n}\n// Adds a whole LLSQ.\nvoid LLSQ::add(const LLSQ &other) {\n  total_weight += other.total_weight;\n  sigx += other.sigx; // update accumulators\n  sigy += other.sigy;\n  sigxx += other.sigxx;\n  sigxy += other.sigxy;\n  sigyy += other.sigyy;\n}\n\n/**********************************************************************\n * LLSQ::remove\n *\n * Delete an element from the acculuator.\n **********************************************************************/\n\nvoid LLSQ::remove(double x, double y) { // delete an element\n  if (total_weight <= 0.0) {            // illegal\n    EMPTY_LLSQ.error(\"LLSQ::remove\", ABORT);\n  }\n  total_weight--; // count elements\n  sigx -= x;      // update accumulators\n  sigy -= y;\n  sigxx -= x * x;\n  sigxy -= x * y;\n  sigyy -= y * y;\n}\n\n/**********************************************************************\n * LLSQ::m\n *\n * Return the gradient of the line fit.\n **********************************************************************/\n\ndouble LLSQ::m() const { // get gradient\n  double covar = covariance();\n  double x_var = x_variance();\n  if (x_var != 0.0) {\n    return covar / x_var;\n  } else {\n    return 0.0; // too little\n  }\n}\n\n/**********************************************************************\n * LLSQ::c\n *\n * Return the constant of the line fit.\n **********************************************************************/\n\ndouble LLSQ::c(double m) const { // get constant\n  if (total_weight > 0.0) {\n    return (sigy - m * sigx) / total_weight;\n  } else {\n    return 0; // too little\n  }\n}\n\n/**********************************************************************\n * LLSQ::rms\n *\n * Return the rms error of the fit.\n 
**********************************************************************/\n\ndouble LLSQ::rms(double m, double c) const { // get error\n  double error;                              // total error\n\n  if (total_weight > 0) {\n    error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (total_weight * c - 2 * sigy);\n    if (error >= 0) {\n      error = std::sqrt(error / total_weight); // sqrt of mean\n    } else {\n      error = 0;\n    }\n  } else {\n    error = 0; // too little\n  }\n  return error;\n}\n\n/**********************************************************************\n * LLSQ::pearson\n *\n * Return the pearson product moment correlation coefficient.\n **********************************************************************/\n\ndouble LLSQ::pearson() const { // get correlation\n  double r = 0.0;              // Correlation is 0 if insufficient data.\n\n  double covar = covariance();\n  if (covar != 0.0) {\n    double var_product = x_variance() * y_variance();\n    if (var_product > 0.0) {\n      r = covar / std::sqrt(var_product);\n    }\n  }\n  return r;\n}\n\n// Returns the x,y means as an FCOORD.\nFCOORD LLSQ::mean_point() const {\n  if (total_weight > 0.0) {\n    return FCOORD(sigx / total_weight, sigy / total_weight);\n  } else {\n    return FCOORD(0.0f, 0.0f);\n  }\n}\n\n// Returns the sqrt of the mean squared error measured perpendicular from the\n// line through mean_point() in the direction dir.\n//\n// Derivation:\n//   Lemma:  Let v and x_i (i=1..N) be a k-dimensional vectors (1xk matrices).\n//     Let % be dot product and ' be transpose.  Note that:\n//      Sum[i=1..N] (v % x_i)^2\n//         = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. 
x_N']' * v'\n//     If x_i have average 0 we have:\n//       = v * (N * COVARIANCE_MATRIX(X)) * v'\n//     Expanded for the case that k = 2, where we treat the dimensions\n//     as x_i and y_i, this is:\n//       = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'\n//  Now, we are trying to calculate the mean squared error, where v is\n//  perpendicular to our line of interest:\n//    Mean squared error\n//      = E [ (v % (x_i - x_avg))) ^2 ]\n//      = Sum (v % (x_i - x_avg))^2 / N\n//      = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'\n//      = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'\n//      = code below\ndouble LLSQ::rms_orth(const FCOORD &dir) const {\n  FCOORD v = !dir;\n  v.normalise();\n  return std::sqrt(x_variance() * v.x() * v.x() + 2 * covariance() * v.x() * v.y() +\n                   y_variance() * v.y() * v.y());\n}\n\n// Returns the direction of the fitted line as a unit vector, using the\n// least mean squared perpendicular distance. The line runs through the\n// mean_point, i.e. 
a point p on the line is given by:\n// p = mean_point() + lambda * vector_fit() for some real number lambda.\n// Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous\n// and may be negated without changing its meaning.\n// Fitting a line m + 𝜆v to a set of N points Pi = (xi, yi), where\n// m is the mean point (𝝁, 𝝂) and\n// v is the direction vector (cos𝜃, sin𝜃)\n// The perpendicular distance of each Pi from the line is:\n// (Pi - m) x v, where x is the scalar cross product.\n// Total squared error is thus:\n// E = ∑((xi - 𝝁)sin𝜃 - (yi - 𝝂)cos𝜃)²\n//   = ∑(xi - 𝝁)²sin²𝜃  - 2∑(xi - 𝝁)(yi - 𝝂)sin𝜃 cos𝜃 + ∑(yi - 𝝂)²cos²𝜃\n//   = NVar(xi)sin²𝜃  - 2NCovar(xi, yi)sin𝜃 cos𝜃  + NVar(yi)cos²𝜃   (Eq 1)\n// where Var(xi) is the variance of xi,\n// and Covar(xi, yi) is the covariance of xi, yi.\n// Taking the derivative wrt 𝜃 and setting to 0 to obtain the min/max:\n// 0 = 2NVar(xi)sin𝜃 cos𝜃 -2NCovar(xi, yi)(cos²𝜃 - sin²𝜃) -2NVar(yi)sin𝜃 cos𝜃\n// => Covar(xi, yi)(cos²𝜃 - sin²𝜃) = (Var(xi) - Var(yi))sin𝜃 cos𝜃\n// Using double angles:\n// 2Covar(xi, yi)cos2𝜃 = (Var(xi) - Var(yi))sin2𝜃   (Eq 2)\n// So 𝜃 = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi)) (Eq 3)\n\n// Because it involves 2𝜃 , Eq 2 has 2 solutions 90 degrees apart, but which\n// is the min and which is the max? 
From Eq1:\n// E/N = Var(xi)sin²𝜃  - 2Covar(xi, yi)sin𝜃 cos𝜃  + Var(yi)cos²𝜃\n// and 90 degrees away, using sin/cos equivalences:\n// E'/N = Var(xi)cos²𝜃  + 2Covar(xi, yi)sin𝜃 cos𝜃  + Var(yi)sin²𝜃\n// The second error is smaller (making it the minimum) iff\n// E'/N < E/N ie:\n// (Var(xi) - Var(yi))(cos²𝜃 - sin²𝜃) < -4Covar(xi, yi)sin𝜃 cos𝜃\n// Using double angles:\n// (Var(xi) - Var(yi))cos2𝜃  < -2Covar(xi, yi)sin2𝜃  (InEq 1)\n// But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2𝜃  such that:\n// sgn(cos2𝜃) = sgn(Var(xi) - Var(yi)) and sgn(sin2𝜃) = sgn(Covar(xi, yi))\n// so InEq1 can *never* be true, making the atan2 result *always* the min!\n// In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),\n// the 2 solutions have equal error and the inequality is still false.\n// Therefore the solution really is as trivial as Eq 3.\n\n// This is equivalent to returning the Principal Component in PCA, or the\n// eigenvector corresponding to the largest eigenvalue in the covariance\n// matrix.  However, atan2 is much simpler! The one reference I found that\n// uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but\n// that is still a much more complex derivation. It seems Pearson had already\n// found this simple solution in 1901.\n// http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559\nFCOORD LLSQ::vector_fit() const {\n  double x_var = x_variance();\n  double y_var = y_variance();\n  double covar = covariance();\n  double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);\n  FCOORD result(cos(theta), sin(theta));\n  return result;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/linlsq.h",
    "content": "/**********************************************************************\n * File:        linlsq.h  (Formerly llsq.h)\n * Description: Linear Least squares fitting code.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCSTRUCT_LINLSQ_H_\n#define TESSERACT_CCSTRUCT_LINLSQ_H_\n\n#include \"points.h\" // for FCOORD\n\n#include <algorithm> // for std::nth_element\n#include <cstdint> // for int32_t\n\nnamespace tesseract {\n\nclass TESS_API LLSQ {\npublic:\n  LLSQ() {   // constructor\n    clear(); // set to zeros\n  }\n  void clear(); // initialize\n\n  // Adds an element with a weight of 1.\n  void add(double x, double y);\n  // Adds an element with a specified weight.\n  void add(double x, double y, double weight);\n  // Adds a whole LLSQ.\n  void add(const LLSQ &other);\n  // Deletes an element with a weight of 1.\n  void remove(double x, double y);\n  int32_t count() const { // no of elements\n    return static_cast<int>(total_weight + 0.5);\n  }\n\n  double m() const;                     // get gradient\n  double c(double m) const;             // get constant\n  double rms(double m, double c) const; // get error\n  double pearson() const;               // get correlation coefficient.\n\n  // Returns the x,y means as an FCOORD.\n  FCOORD mean_point() const;\n\n  // 
Returns the average sum of squared perpendicular error from a line\n  // through mean_point() in the direction dir.\n  double rms_orth(const FCOORD &dir) const;\n\n  // Returns the direction of the fitted line as a unit vector, using the\n  // least mean squared perpendicular distance. The line runs through the\n  // mean_point, i.e. a point p on the line is given by:\n  // p = mean_point() + lambda * vector_fit() for some real number lambda.\n  // Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous\n  // and may be negated without changing its meaning, since a line is only\n  // unique to a range of pi radians.\n  // Modernists prefer to think of this as an Eigenvalue problem, but\n  // Pearson had the simple solution in 1901.\n  //\n  // Note that this is equivalent to returning the Principal Component in PCA,\n  // or the eigenvector corresponding to the largest eigenvalue in the\n  // covariance matrix.\n  FCOORD vector_fit() const;\n\n  // Returns the covariance.\n  double covariance() const {\n    if (total_weight > 0.0) {\n      return (sigxy - sigx * sigy / total_weight) / total_weight;\n    } else {\n      return 0.0;\n    }\n  }\n  double x_variance() const {\n    if (total_weight > 0.0) {\n      return (sigxx - sigx * sigx / total_weight) / total_weight;\n    } else {\n      return 0.0;\n    }\n  }\n  double y_variance() const {\n    if (total_weight > 0.0) {\n      return (sigyy - sigy * sigy / total_weight) / total_weight;\n    } else {\n      return 0.0;\n    }\n  }\n\nprivate:\n  double total_weight; // no of elements or sum of weights.\n  double sigx;         // sum of x\n  double sigy;         // sum of y\n  double sigxx;        // sum x squared\n  double sigxy;        // sum of xy\n  double sigyy;        // sum y squared\n};\n\n// Returns the median value of the vector, given that the values are\n// circular, with the given modulus. 
Values may be signed or unsigned,\n// eg range from -pi to pi (modulus 2pi) or from 0 to 2pi (modulus 2pi).\n// NOTE that the array is shuffled, but the time taken is linear.\n// An assumption is made that most of the values are spread over no more than\n// half the range, but wrap-around is accounted for if the median is near\n// the wrap-around point.\n// Cannot be a member of vector, as it makes heavy use of LLSQ.\n// T must be an integer or float/double type.\ntemplate <typename T>\nT MedianOfCircularValues(T modulus, std::vector<T> &v) {\n  LLSQ stats;\n  T halfrange = static_cast<T>(modulus / 2);\n  auto num_elements = v.size();\n  for (auto i : v) {\n    stats.add(i, i + halfrange);\n  }\n  bool offset_needed = stats.y_variance() < stats.x_variance();\n  if (offset_needed) {\n    for (auto i : v) {\n      i += halfrange;\n    }\n  }\n  auto median_index = num_elements / 2;\n  std::nth_element(v.begin(), v.begin() + median_index, v.end());\n  if (offset_needed) {\n    for (auto i : v) {\n      i -= halfrange;\n    }\n  }\n  return v[median_index];\n}\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_LINLSQ_H_\n"
  },
  {
    "path": "src/ccstruct/matrix.cpp",
    "content": "/******************************************************************************\n *\n * File:         matrix.cpp  (Formerly matrix.c)\n * Description:  Ratings matrix code. (Used by associator)\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1990, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n#include \"matrix.h\"\n\n#include \"ratngs.h\"\n#include \"tprintf.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nMATRIX::~MATRIX() = default;\n\n// Returns true if there are any real classification results.\nbool MATRIX::Classified(int col, int row, int wildcard_id) const {\n  if (get(col, row) == NOT_CLASSIFIED) {\n    return false;\n  }\n  BLOB_CHOICE_IT b_it(get(col, row));\n  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n    BLOB_CHOICE *choice = b_it.data();\n    if (choice->IsClassified()) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Expands the existing matrix in-place to make the band wider, without\n// losing any existing data.\nvoid 
MATRIX::IncreaseBandSize(int bandwidth) {\n  ResizeWithCopy(dimension(), bandwidth);\n}\n\n// Returns a bigger MATRIX with a new column and row in the matrix in order\n// to split the blob at the given (ind,ind) diagonal location.\n// Entries are relocated to the new MATRIX using the transformation defined\n// by MATRIX_COORD::MapForSplit.\n// Transfers the pointer data to the new MATRIX and deletes *this.\nMATRIX *MATRIX::ConsumeAndMakeBigger(int ind) {\n  int dim = dimension();\n  int band_width = bandwidth();\n  // Check to see if bandwidth needs expanding.\n  for (int col = ind; col >= 0 && col > ind - band_width; --col) {\n    if (array_[col * band_width + band_width - 1] != empty_) {\n      ++band_width;\n      break;\n    }\n  }\n  auto *result = new MATRIX(dim + 1, band_width);\n\n  for (int col = 0; col < dim; ++col) {\n    for (int row = col; row < dim && row < col + bandwidth(); ++row) {\n      MATRIX_COORD coord(col, row);\n      coord.MapForSplit(ind);\n      BLOB_CHOICE_LIST *choices = get(col, row);\n      if (choices != nullptr) {\n        // Correct matrix location on each choice.\n        BLOB_CHOICE_IT bc_it(choices);\n        for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {\n          BLOB_CHOICE *choice = bc_it.data();\n          choice->set_matrix_cell(coord.col, coord.row);\n        }\n        ASSERT_HOST(coord.Valid(*result));\n        result->put(coord.col, coord.row, choices);\n      }\n    }\n  }\n  delete this;\n  return result;\n}\n\n// Makes and returns a deep copy of *this, including all the BLOB_CHOICEs\n// on the lists, but not any LanguageModelState that may be attached to the\n// BLOB_CHOICEs.\nMATRIX *MATRIX::DeepCopy() const {\n  int dim = dimension();\n  int band_width = bandwidth();\n  auto *result = new MATRIX(dim, band_width);\n  for (int col = 0; col < dim; ++col) {\n    for (int row = col; row < dim && row < col + band_width; ++row) {\n      BLOB_CHOICE_LIST *choices = get(col, row);\n      if (choices 
!= nullptr) {\n        auto *copy_choices = new BLOB_CHOICE_LIST;\n        copy_choices->deep_copy(choices, &BLOB_CHOICE::deep_copy);\n        result->put(col, row, copy_choices);\n      }\n    }\n  }\n  return result;\n}\n\n// Print the best guesses out of the match rating matrix.\nvoid MATRIX::print(const UNICHARSET &unicharset) const {\n  tprintf(\"Ratings Matrix (top 3 choices)\\n\");\n  int dim = dimension();\n  int band_width = bandwidth();\n  int row, col;\n  for (col = 0; col < dim; ++col) {\n    for (row = col; row < dim && row < col + band_width; ++row) {\n      BLOB_CHOICE_LIST *rating = this->get(col, row);\n      if (rating == NOT_CLASSIFIED) {\n        continue;\n      }\n      BLOB_CHOICE_IT b_it(rating);\n      tprintf(\"col=%d row=%d \", col, row);\n      for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n        tprintf(\"%s rat=%g cert=%g \", unicharset.id_to_unichar(b_it.data()->unichar_id()),\n                b_it.data()->rating(), b_it.data()->certainty());\n      }\n      tprintf(\"\\n\");\n    }\n    tprintf(\"\\n\");\n  }\n  tprintf(\"\\n\");\n  for (col = 0; col < dim; ++col) {\n    tprintf(\"\\t%d\", col);\n  }\n  tprintf(\"\\n\");\n  for (row = 0; row < dim; ++row) {\n    for (col = 0; col <= row; ++col) {\n      if (col == 0) {\n        tprintf(\"%d\\t\", row);\n      }\n      if (row >= col + band_width) {\n        tprintf(\" \\t\");\n        continue;\n      }\n      BLOB_CHOICE_LIST *rating = this->get(col, row);\n      if (rating != NOT_CLASSIFIED) {\n        BLOB_CHOICE_IT b_it(rating);\n        int counter = 0;\n        for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n          tprintf(\"%s \", unicharset.id_to_unichar(b_it.data()->unichar_id()));\n          ++counter;\n          if (counter == 3) {\n            break;\n          }\n        }\n        tprintf(\"\\t\");\n      } else {\n        tprintf(\" \\t\");\n      }\n    }\n    tprintf(\"\\n\");\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/matrix.h",
    "content": "/******************************************************************************\n * File:         matrix.h\n * Description:  Generic 2-d array/matrix and banded triangular matrix class.\n * Author:       Ray Smith\n * TODO(rays) Separate from ratings matrix, which it also contains:\n *\n * Description:  Ratings matrix class (specialization of banded matrix).\n *               Segmentation search matrix of lists of BLOB_CHOICE.\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1990, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef TESSERACT_CCSTRUCT_MATRIX_H_\n#define TESSERACT_CCSTRUCT_MATRIX_H_\n\n#include \"errcode.h\" // for ASSERT_HOST\n#include \"helpers.h\" // for ReverseN, ClipToRange\n#include \"kdpair.h\"  // for KDPairInc\n#include \"points.h\"  // for ICOORD\n\n#include \"serialis.h\" // for TFile\n\n#include <algorithm> // for max, min\n#include <cmath>     // for sqrt, fabs, isfinite\n#include <cstdint>   // for int32_t\n#include <cstdio>    // for FILE\n#include <cstring>   // for memcpy\n\nnamespace tesseract {\n\nclass BLOB_CHOICE_LIST;\nclass UNICHARSET;\n\n#define NOT_CLASSIFIED static_cast<BLOB_CHOICE_LIST *>(nullptr)\n\n// A generic class to hold a 2-D matrix with entries of type T, but can also\n// act as a base class for other implementations, such as a triangular or\n// banded 
matrix.\ntemplate <class T>\nclass GENERIC_2D_ARRAY {\npublic:\n  // Initializes the array size, and empty element, but cannot allocate memory\n  // for the subclasses or initialize because calls to the num_elements\n  // member will be routed to the base class implementation. Subclasses can\n  // either pass the memory in, or allocate after by calling Resize().\n  GENERIC_2D_ARRAY(int dim1, int dim2, const T &empty, T *array)\n      : empty_(empty), dim1_(dim1), dim2_(dim2), array_(array) {\n    size_allocated_ = dim1 * dim2;\n  }\n  // Original constructor for a full rectangular matrix DOES allocate memory\n  // and initialize it to empty.\n  GENERIC_2D_ARRAY(int dim1, int dim2, const T &empty) : empty_(empty), dim1_(dim1), dim2_(dim2) {\n    int new_size = dim1 * dim2;\n    array_ = new T[new_size];\n    size_allocated_ = new_size;\n    for (int i = 0; i < size_allocated_; ++i) {\n      array_[i] = empty_;\n    }\n  }\n  // Default constructor for array allocation. Use Resize to set the size.\n  GENERIC_2D_ARRAY()\n      : array_(nullptr), empty_(static_cast<T>(0)), dim1_(0), dim2_(0), size_allocated_(0) {}\n  GENERIC_2D_ARRAY(const GENERIC_2D_ARRAY<T> &src)\n      : array_(nullptr), empty_(static_cast<T>(0)), dim1_(0), dim2_(0), size_allocated_(0) {\n    *this = src;\n  }\n  virtual ~GENERIC_2D_ARRAY() {\n    delete[] array_;\n  }\n\n  void operator=(const GENERIC_2D_ARRAY<T> &src) {\n    ResizeNoInit(src.dim1(), src.dim2());\n    int size = num_elements();\n    if (size > 0) {\n      memcpy(array_, src.array_, size * sizeof(array_[0]));\n    }\n  }\n\n  // Reallocates the array to the given size. 
Does not keep old data, but does\n  // not initialize the array either.\n  // The allocated memory is expanded on the end by pad, allowing deliberate\n  // access beyond the bounds of the array.\n  void ResizeNoInit(int size1, int size2, int pad = 0) {\n    int new_size = size1 * size2 + pad;\n    if (new_size > size_allocated_) {\n      delete[] array_;\n      array_ = new T[new_size];\n      size_allocated_ = new_size;\n    }\n    dim1_ = size1;\n    dim2_ = size2;\n    // Fill the padding data so it isn't uninitialized.\n    for (int i = size1 * size2; i < new_size; ++i) {\n      array_[i] = empty_;\n    }\n  }\n\n  // Reallocate the array to the given size. Does not keep old data.\n  void Resize(int size1, int size2, const T &empty) {\n    empty_ = empty;\n    ResizeNoInit(size1, size2);\n    Clear();\n  }\n\n  // Reallocate the array to the given size, keeping old data.\n  void ResizeWithCopy(int size1, int size2) {\n    if (size1 != dim1_ || size2 != dim2_) {\n      int new_size = size1 * size2;\n      T *new_array = new T[new_size];\n      for (int col = 0; col < size1; ++col) {\n        for (int row = 0; row < size2; ++row) {\n          int old_index = col * dim2() + row;\n          int new_index = col * size2 + row;\n          if (col < dim1_ && row < dim2_) {\n            new_array[new_index] = array_[old_index];\n          } else {\n            new_array[new_index] = empty_;\n          }\n        }\n      }\n      delete[] array_;\n      array_ = new_array;\n      dim1_ = size1;\n      dim2_ = size2;\n      size_allocated_ = new_size;\n    }\n  }\n\n  // Sets all the elements of the array to the empty value.\n  void Clear() {\n    int total_size = num_elements();\n    for (int i = 0; i < total_size; ++i) {\n      array_[i] = empty_;\n    }\n  }\n\n  // Writes to the given file. 
Returns false in case of error.\n  // Only works with bitwise-serializable types!\n  bool Serialize(FILE *fp) const {\n    if (!SerializeSize(fp)) {\n      return false;\n    }\n    if (!tesseract::Serialize(fp, &empty_)) {\n      return false;\n    }\n    int size = num_elements();\n    return tesseract::Serialize(fp, &array_[0], size);\n  }\n\n  bool Serialize(TFile *fp) const {\n    if (!SerializeSize(fp)) {\n      return false;\n    }\n    if (!fp->Serialize(&empty_)) {\n      return false;\n    }\n    int size = num_elements();\n    return fp->Serialize(&array_[0], size);\n  }\n\n  // Reads from the given file. Returns false in case of error.\n  // Only works with bitwise-serializable types!\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp) {\n    if (!DeSerializeSize(swap, fp)) {\n      return false;\n    }\n    if (!tesseract::DeSerialize(fp, &empty_)) {\n      return false;\n    }\n    if (swap) {\n      ReverseN(&empty_, sizeof(empty_));\n    }\n    int size = num_elements();\n    if (!tesseract::DeSerialize(fp, &array_[0], size)) {\n      return false;\n    }\n    if (swap) {\n      for (int i = 0; i < size; ++i) {\n        ReverseN(&array_[i], sizeof(array_[i]));\n      }\n    }\n    return true;\n  }\n\n  bool DeSerialize(TFile *fp) {\n    return DeSerializeSize(fp) && fp->DeSerialize(&empty_) &&\n           fp->DeSerialize(&array_[0], num_elements());\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  // Assumes a T::Serialize(FILE*) const function.\n  bool SerializeClasses(FILE *fp) const {\n    if (!SerializeSize(fp)) {\n      return false;\n    }\n    if (!empty_.Serialize(fp)) {\n      return false;\n    }\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      if (!array_[i].Serialize(fp)) {\n        return false;\n      }\n    }\n    return true;\n  }\n\n  // Reads from the given file. 
Returns false in case of error.\n  // Assumes a T::DeSerialize(bool swap, FILE*) function.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerializeClasses(bool swap, FILE *fp) {\n    if (!DeSerializeSize(swap, fp)) {\n      return false;\n    }\n    if (!empty_.DeSerialize(swap, fp)) {\n      return false;\n    }\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      if (!array_[i].DeSerialize(swap, fp)) {\n        return false;\n      }\n    }\n    return true;\n  }\n\n  // Provide the dimensions of this rectangular matrix.\n  int dim1() const {\n    return dim1_;\n  }\n  int dim2() const {\n    return dim2_;\n  }\n  // Returns the number of elements in the array.\n  // Banded/triangular matrices may override.\n  virtual int num_elements() const {\n    return dim1_ * dim2_;\n  }\n\n  // Expression to select a specific location in the matrix. The matrix is\n  // stored COLUMN-major, so the left-most index is the most significant.\n  // This allows [][] access to use indices in the same order as (,).\n  virtual int index(int column, int row) const {\n    return (column * dim2_ + row);\n  }\n\n  // Put a list element into the matrix at a specific location.\n  void put(ICOORD pos, const T &thing) {\n    array_[this->index(pos.x(), pos.y())] = thing;\n  }\n  void put(int column, int row, const T &thing) {\n    array_[this->index(column, row)] = thing;\n  }\n\n  // Get the item at a specified location from the matrix.\n  T get(ICOORD pos) const {\n    return array_[this->index(pos.x(), pos.y())];\n  }\n  T get(int column, int row) const {\n    return array_[this->index(column, row)];\n  }\n  // Return a reference to the element at the specified location.\n  const T &operator()(int column, int row) const {\n    return array_[this->index(column, row)];\n  }\n  T &operator()(int column, int row) {\n    return array_[this->index(column, row)];\n  }\n  // Allow access using array[column][row]. 
NOTE that the indices are\n  // in the same left-to-right order as the () indexing.\n  T *operator[](int column) {\n    return &array_[this->index(column, 0)];\n  }\n  const T *operator[](int column) const {\n    return &array_[this->index(column, 0)];\n  }\n\n  // Adds addend to *this, element-by-element.\n  void operator+=(const GENERIC_2D_ARRAY<T> &addend) {\n    if (dim2_ == addend.dim2_) {\n      // Faster if equal size in the major dimension.\n      int size = std::min(num_elements(), addend.num_elements());\n      for (int i = 0; i < size; ++i) {\n        array_[i] += addend.array_[i];\n      }\n    } else {\n      for (int x = 0; x < dim1_; x++) {\n        for (int y = 0; y < dim2_; y++) {\n          (*this)(x, y) += addend(x, y);\n        }\n      }\n    }\n  }\n  // Subtracts minuend from *this, element-by-element.\n  void operator-=(const GENERIC_2D_ARRAY<T> &minuend) {\n    if (dim2_ == minuend.dim2_) {\n      // Faster if equal size in the major dimension.\n      int size = std::min(num_elements(), minuend.num_elements());\n      for (int i = 0; i < size; ++i) {\n        array_[i] -= minuend.array_[i];\n      }\n    } else {\n      for (int x = 0; x < dim1_; x++) {\n        for (int y = 0; y < dim2_; y++) {\n          (*this)(x, y) -= minuend(x, y);\n        }\n      }\n    }\n  }\n  // Adds addend to all elements.\n  void operator+=(const T &addend) {\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      array_[i] += addend;\n    }\n  }\n  // Multiplies *this by factor, element-by-element.\n  void operator*=(const T &factor) {\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      array_[i] *= factor;\n    }\n  }\n  // Clips *this to the given range.\n  void Clip(const T &rangemin, const T &rangemax) {\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      array_[i] = ClipToRange(array_[i], rangemin, rangemax);\n    }\n  }\n  // Returns true if all elements of *this are within the 
given range.\n  // Only uses operator<\n  bool WithinBounds(const T &rangemin, const T &rangemax) const {\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      const T &value = array_[i];\n      if (value < rangemin || rangemax < value) {\n        return false;\n      }\n    }\n    return true;\n  }\n  // Normalize the whole array.\n  double Normalize() {\n    int size = num_elements();\n    if (size <= 0) {\n      return 0.0;\n    }\n    // Compute the mean.\n    double mean = 0.0;\n    for (int i = 0; i < size; ++i) {\n      mean += array_[i];\n    }\n    mean /= size;\n    // Subtract the mean and compute the standard deviation.\n    double sd = 0.0;\n    for (int i = 0; i < size; ++i) {\n      double normed = array_[i] - mean;\n      array_[i] = normed;\n      sd += normed * normed;\n    }\n    sd = sqrt(sd / size);\n    if (sd > 0.0) {\n      // Divide by the sd.\n      for (int i = 0; i < size; ++i) {\n        array_[i] /= sd;\n      }\n    }\n    return sd;\n  }\n\n  // Returns the maximum value of the array.\n  T Max() const {\n    int size = num_elements();\n    if (size <= 0) {\n      return empty_;\n    }\n    // Compute the max.\n    T max_value = array_[0];\n    for (int i = 1; i < size; ++i) {\n      const T &value = array_[i];\n      if (value > max_value) {\n        max_value = value;\n      }\n    }\n    return max_value;\n  }\n\n  // Returns the maximum absolute value of the array.\n  T MaxAbs() const {\n    int size = num_elements();\n    if (size <= 0) {\n      return empty_;\n    }\n    // Compute the max.\n    T max_abs = static_cast<T>(0);\n    for (int i = 0; i < size; ++i) {\n      T value = static_cast<T>(fabs(array_[i]));\n      if (value > max_abs) {\n        max_abs = value;\n      }\n    }\n    return max_abs;\n  }\n\n  // Accumulates the element-wise sums of squares of src into *this.\n  void SumSquares(const GENERIC_2D_ARRAY<T> &src, const T &decay_factor) {\n    T update_factor = 1 - decay_factor;\n    int 
size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      array_[i] = array_[i] * decay_factor + update_factor * src.array_[i] * src.array_[i];\n    }\n  }\n\n  // Scales each element using the adam algorithm, ie array_[i] by\n  // sqrt(sqsum[i] + epsilon)).\n  void AdamUpdate(const GENERIC_2D_ARRAY<T> &sum, const GENERIC_2D_ARRAY<T> &sqsum,\n                  const T &epsilon) {\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      array_[i] += sum.array_[i] / (sqrt(sqsum.array_[i]) + epsilon);\n    }\n  }\n\n  void AssertFinite() const {\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      ASSERT_HOST(isfinite(array_[i]));\n    }\n  }\n\n  // REGARDLESS OF THE CURRENT DIMENSIONS, treats the data as a\n  // num_dims-dimensional array/tensor with dimensions given by dims, (ordered\n  // from most significant to least significant, the same as standard C arrays)\n  // and moves src_dim to dest_dim, with the initial dest_dim and any dimensions\n  // in between shifted towards the hole left by src_dim. Example:\n  // Current data content: array_=[0, 1, 2, ....119]\n  //   perhaps *this may be of dim[40, 3], with values [[0, 1, 2][3, 4, 5]...\n  //   but the current dimensions are irrelevant.\n  // num_dims = 4, dims=[5, 4, 3, 2]\n  // src_dim=3, dest_dim=1\n  // tensor=[[[[0, 1][2, 3][4, 5]]\n  //          [[6, 7][8, 9][10, 11]]\n  //          [[12, 13][14, 15][16, 17]]\n  //          [[18, 19][20, 21][22, 23]]]\n  //         [[[24, 25]...\n  // output dims =[5, 2, 4, 3]\n  // output tensor=[[[[0, 2, 4][6, 8, 10][12, 14, 16][18, 20, 22]]\n  //                 [[1, 3, 5][7, 9, 11][13, 15, 17][19, 21, 23]]]\n  //                [[[24, 26, 28]...\n  // which is stored in the array_ as:\n  //   [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 1, 3, 5, 7, 9, 11, 13...]\n  // NOTE: the 2 stored matrix dimensions are simply copied from *this. 
To\n  // change the dimensions after the transpose, use ResizeNoInit.\n  // Higher dimensions above 2 are strictly the responsibility of the caller.\n  void RotatingTranspose(const int *dims, int num_dims, int src_dim, int dest_dim,\n                         GENERIC_2D_ARRAY<T> *result) const {\n    int max_d = std::max(src_dim, dest_dim);\n    int min_d = std::min(src_dim, dest_dim);\n    // In a tensor of shape [d0, d1... min_d, ... max_d, ... dn-2, dn-1], the\n    // ends outside of min_d and max_d are unaffected, with [max_d +1, dn-1]\n    // being contiguous blocks of data that will move together, and\n    // [d0, min_d -1] being replicas of the transpose operation.\n    // num_replicas represents the large dimensions unchanged by the operation.\n    // move_size represents the small dimensions unchanged by the operation.\n    // src_step represents the stride in the src between each adjacent group\n    // in the destination.\n    int num_replicas = 1, move_size = 1, src_step = 1;\n    for (int d = 0; d < min_d; ++d) {\n      num_replicas *= dims[d];\n    }\n    for (int d = max_d + 1; d < num_dims; ++d) {\n      move_size *= dims[d];\n    }\n    for (int d = src_dim + 1; d < num_dims; ++d) {\n      src_step *= dims[d];\n    }\n    if (src_dim > dest_dim) {\n      src_step *= dims[src_dim];\n    }\n    // wrap_size is the size of a single replica, being the amount that is\n    // handled num_replicas times.\n    int wrap_size = move_size;\n    for (int d = min_d; d <= max_d; ++d) {\n      wrap_size *= dims[d];\n    }\n    result->ResizeNoInit(dim1_, dim2_);\n    result->empty_ = empty_;\n    const T *src = array_;\n    T *dest = result->array_;\n    for (int replica = 0; replica < num_replicas; ++replica) {\n      for (int start = 0; start < src_step; start += move_size) {\n        for (int pos = start; pos < wrap_size; pos += src_step) {\n          memcpy(dest, src + pos, sizeof(*dest) * move_size);\n          dest += move_size;\n        }\n      }\n      src 
+= wrap_size;\n    }\n  }\n\n  // Delete objects pointed to by array_[i].\n  void delete_matrix_pointers() {\n    int size = num_elements();\n    for (int i = 0; i < size; ++i) {\n      T matrix_cell = array_[i];\n      if (matrix_cell != empty_) {\n        delete matrix_cell;\n      }\n    }\n  }\n\nprotected:\n  // Factored helper to serialize the size.\n  bool SerializeSize(FILE *fp) const {\n    uint32_t size = dim1_;\n    if (!tesseract::Serialize(fp, &size)) {\n      return false;\n    }\n    size = dim2_;\n    return tesseract::Serialize(fp, &size);\n  }\n  bool SerializeSize(TFile *fp) const {\n    uint32_t size = dim1_;\n    if (!fp->Serialize(&size)) {\n      return false;\n    }\n    size = dim2_;\n    return fp->Serialize(&size);\n  }\n  // Factored helper to deserialize the size.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerializeSize(bool swap, FILE *fp) {\n    uint32_t size1, size2;\n    if (!tesseract::DeSerialize(fp, &size1)) {\n      return false;\n    }\n    if (!tesseract::DeSerialize(fp, &size2)) {\n      return false;\n    }\n    if (swap) {\n      ReverseN(&size1, sizeof(size1));\n      ReverseN(&size2, sizeof(size2));\n    }\n    // Arbitrarily limit the number of elements to protect against bad data.\n    if (size1 > UINT16_MAX) {\n      return false;\n    }\n    if (size2 > UINT16_MAX) {\n      return false;\n    }\n    Resize(size1, size2, empty_);\n    return true;\n  }\n  bool DeSerializeSize(TFile *fp) {\n    int32_t size1, size2;\n    if (!fp->DeSerialize(&size1)) {\n      return false;\n    }\n    if (!fp->DeSerialize(&size2)) {\n      return false;\n    }\n    // Arbitrarily limit the number of elements to protect against bad data.\n    if (size1 > UINT16_MAX) {\n      return false;\n    }\n    if (size2 > UINT16_MAX) {\n      return false;\n    }\n    Resize(size1, size2, empty_);\n    return true;\n  }\n\n  T *array_;\n  T empty_;  // The unused cell.\n  int dim1_; // Size of the 1st dimension in 
indexing functions.\n  int dim2_; // Size of the 2nd dimension in indexing functions.\n  // The total size to which the array can be expanded before a realloc is\n  // needed. If Resize is used, memory is retained so it can be re-expanded\n  // without a further alloc, and this stores the allocated size.\n  int size_allocated_;\n};\n\n// A generic class to store a banded triangular matrix with entries of type T.\n// In this array, the nominally square matrix is dim1_ x dim1_, and dim2_ is\n// the number of bands, INCLUDING the diagonal. The storage is thus of size\n// dim1_ * dim2_ and index(col, row) = col * dim2_ + row - col, and an\n// assert will fail if row < col or row - col >= dim2.\ntemplate <class T>\nclass BandTriMatrix : public GENERIC_2D_ARRAY<T> {\npublic:\n  // Allocate a piece of memory to hold a 2d-array of the given dimension.\n  // Initialize all the elements of the array to empty instead of assuming\n  // that a default constructor can be used.\n  BandTriMatrix(int dim1, int dim2, const T &empty) : GENERIC_2D_ARRAY<T>(dim1, dim2, empty) {}\n  // The default destructor will do.\n\n  // Provide the dimensions of this matrix.\n  // dimension is the size of the nominally square matrix.\n  int dimension() const {\n    return this->dim1_;\n  }\n  // bandwidth is the number of bands in the matrix, INCLUDING the diagonal.\n  int bandwidth() const {\n    return this->dim2_;\n  }\n\n  // Expression to select a specific location in the matrix. 
The matrix is\n  // stored COLUMN-major, so the left-most index is the most significant.\n  // This allows [][] access to use indices in the same order as (,).\n  int index(int column, int row) const override {\n    ASSERT_HOST(row >= column);\n    ASSERT_HOST(row - column < this->dim2_);\n    return column * this->dim2_ + row - column;\n  }\n\n  // Appends array2 corner-to-corner to *this, making an array of dimension\n  // equal to the sum of the individual dimensions.\n  // array2 is not destroyed, but is left empty, as all elements are moved\n  // to *this.\n  void AttachOnCorner(BandTriMatrix<T> *array2) {\n    int new_dim1 = this->dim1_ + array2->dim1_;\n    int new_dim2 = std::max(this->dim2_, array2->dim2_);\n    T *new_array = new T[new_dim1 * new_dim2];\n    for (int col = 0; col < new_dim1; ++col) {\n      for (int j = 0; j < new_dim2; ++j) {\n        int new_index = col * new_dim2 + j;\n        if (col < this->dim1_ && j < this->dim2_) {\n          new_array[new_index] = this->get(col, col + j);\n        } else if (col >= this->dim1_ && j < array2->dim2_) {\n          new_array[new_index] = array2->get(col - this->dim1_, col - this->dim1_ + j);\n          array2->put(col - this->dim1_, col - this->dim1_ + j, nullptr);\n        } else {\n          new_array[new_index] = this->empty_;\n        }\n      }\n    }\n    delete[] this->array_;\n    this->array_ = new_array;\n    this->dim1_ = new_dim1;\n    this->dim2_ = new_dim2;\n  }\n};\n\nclass MATRIX : public BandTriMatrix<BLOB_CHOICE_LIST *> {\npublic:\n  MATRIX(int dimension, int bandwidth)\n      : BandTriMatrix<BLOB_CHOICE_LIST *>(dimension, bandwidth, NOT_CLASSIFIED) {}\n\n  ~MATRIX() override;\n\n  // Returns true if there are any real classification results.\n  bool Classified(int col, int row, int wildcard_id) const;\n\n  // Expands the existing matrix in-place to make the band wider, without\n  // losing any existing data.\n  void IncreaseBandSize(int bandwidth);\n\n  // Returns a bigger MATRIX 
with a new column and row in the matrix in order\n  // to split the blob at the given (ind,ind) diagonal location.\n  // Entries are relocated to the new MATRIX using the transformation defined\n  // by MATRIX_COORD::MapForSplit.\n  // Transfers the pointer data to the new MATRIX and deletes *this.\n  MATRIX *ConsumeAndMakeBigger(int ind);\n\n  // Makes and returns a deep copy of *this, including all the BLOB_CHOICEs\n  // on the lists, but not any LanguageModelState that may be attached to the\n  // BLOB_CHOICEs.\n  MATRIX *DeepCopy() const;\n\n  // Print a shortened version of the contents of the matrix.\n  void print(const UNICHARSET &unicharset) const;\n};\n\nstruct MATRIX_COORD {\n  static void Delete(void *arg) {\n    auto *c = static_cast<MATRIX_COORD *>(arg);\n    delete c;\n  }\n  // Default constructor required by GenericHeap.\n  MATRIX_COORD() : col(0), row(0) {}\n  MATRIX_COORD(int c, int r) : col(c), row(r) {}\n  ~MATRIX_COORD() = default;\n\n  bool Valid(const MATRIX &m) const {\n    return 0 <= col && col < m.dimension() && col <= row && row < col + m.bandwidth() &&\n           row < m.dimension();\n  }\n\n  // Remaps the col,row pair to split the blob at the given (ind,ind) diagonal\n  // location.\n  // Entries at (i,j) for i in [0,ind] and j in [ind,dim) move to (i,j+1),\n  // making a new row at ind.\n  // Entries at (i,j) for i in [ind+1,dim) and j in [i,dim) move to (i+1,j+1),\n  // making a new column at ind+1.\n  void MapForSplit(int ind) {\n    ASSERT_HOST(row >= col);\n    if (col > ind) {\n      ++col;\n    }\n    if (row >= ind) {\n      ++row;\n    }\n    ASSERT_HOST(row >= col);\n  }\n\n  int col;\n  int row;\n};\n\n// The MatrixCoordPair contains a MATRIX_COORD and its priority.\nusing MatrixCoordPair = KDPairInc<float, MATRIX_COORD>;\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_MATRIX_H_\n"
  },
  {
    "path": "src/ccstruct/mod128.cpp",
    "content": "/**********************************************************************\n * File:        mod128.cpp  (Formerly dir128.c)\n * Description: Code to convert a DIR128 to an ICOORD.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"mod128.h\"\n\nnamespace tesseract {\n\nstatic const TDimension idirtab[] = {\n    1000, 0,    998,  49,   995,  98,   989,  146,  980,   195,  970,  242,  956,  290,   941,\n    336,  923,  382,  903,  427,  881,  471,  857,  514,   831,  555,  803,  595,  773,   634,\n    740,  671,  707,  707,  671,  740,  634,  773,  595,   803,  555,  831,  514,  857,   471,\n    881,  427,  903,  382,  923,  336,  941,  290,  956,   242,  970,  195,  980,  146,   989,\n    98,   995,  49,   998,  0,    1000, -49,  998,  -98,   995,  -146, 989,  -195, 980,   -242,\n    970,  -290, 956,  -336, 941,  -382, 923,  -427, 903,   -471, 881,  -514, 857,  -555,  831,\n    -595, 803,  -634, 773,  -671, 740,  -707, 707,  -740,  671,  -773, 634,  -803, 595,   -831,\n    555,  -857, 514,  -881, 471,  -903, 427,  -923, 382,   -941, 336,  -956, 290,  -970,  242,\n    -980, 195,  -989, 146,  -995, 98,   -998, 49,   -1000, 0,    -998, -49,  -995, -98,   -989,\n    -146, -980, -195, -970, -242, -956, -290, -941, -336,  -923, -382, -903, -427, -881,  -471,\n    -857, -514, -831, -555, 
-803, -595, -773, -634, -740,  -671, -707, -707, -671, -740,  -634,\n    -773, -595, -803, -555, -831, -514, -857, -471, -881,  -427, -903, -382, -923, -336,  -941,\n    -290, -956, -242, -970, -195, -980, -146, -989, -98,   -995, -49,  -998, 0,    -1000, 49,\n    -998, 98,   -995, 146,  -989, 195,  -980, 242,  -970,  290,  -956, 336,  -941, 382,   -923,\n    427,  -903, 471,  -881, 514,  -857, 555,  -831, 595,   -803, 634,  -773, 671,  -740,  707,\n    -707, 740,  -671, 773,  -634, 803,  -595, 831,  -555,  857,  -514, 881,  -471, 903,   -427,\n    923,  -382, 941,  -336, 956,  -290, 970,  -242, 980,   -195, 989,  -146, 995,  -98,   998,\n    -49};\n\nstatic const ICOORD *dirtab = reinterpret_cast<const ICOORD *>(idirtab);\n\n/**********************************************************************\n * DIR128::DIR128\n *\n * Quantize the direction of an FCOORD to make a DIR128.\n **********************************************************************/\n\nDIR128::DIR128(     // from fcoord\n    const FCOORD fc // vector to quantize\n) {\n  int high, low, current; // binary search\n\n  low = 0;\n  if (fc.y() == 0) {\n    if (fc.x() >= 0) {\n      dir = 0;\n    } else {\n      dir = MODULUS / 2;\n    }\n    return;\n  }\n  high = MODULUS;\n  do {\n    current = (high + low) / 2;\n    if (dirtab[current] * fc >= 0) {\n      low = current;\n    } else {\n      high = current;\n    }\n  } while (high - low > 1);\n  dir = low;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/mod128.h",
    "content": "/**********************************************************************\n * File:        mod128.h  (Formerly dir128.h)\n * Description: Header for class which implements modulo arithmetic.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef MOD128_H\n#define MOD128_H\n\n#include \"points.h\"\n\nnamespace tesseract {\n\n#define MODULUS 128   /*range of directions */\n#define DIRBITS 7     // no of bits used\n#define DIRSCALE 1000 // length of vector\n\nclass DIR128 {\npublic:\n  DIR128() = default;\n\n  DIR128(              // constructor\n      int16_t value) { // value to assign\n    value %= MODULUS;  // modulo arithmetic\n    if (value < 0) {\n      value += MODULUS; // done properly\n    }\n    dir = static_cast<int8_t>(value);\n  }\n  DIR128(const FCOORD fc); // quantize vector\n\n  DIR128 &operator=(   // assign of int16_t\n      int16_t value) { // value to assign\n    value %= MODULUS;  // modulo arithmetic\n    if (value < 0) {\n      value += MODULUS; // done properly\n    }\n    dir = static_cast<int8_t>(value);\n    return *this;\n  }\n  int8_t operator-(              // subtraction\n      const DIR128 &minus) const // for signed result\n  {\n    // result\n    int16_t result = dir - minus.dir;\n\n    if (result > MODULUS / 2) {\n      result -= MODULUS; // get in 
range\n    } else if (result < -MODULUS / 2) {\n      result += MODULUS;\n    }\n    return static_cast<int8_t>(result);\n  }\n  DIR128 operator+(            // addition\n      const DIR128 &add) const // of itself\n  {\n    DIR128 result; // sum\n\n    result = dir + add.dir; // let = do the work\n    return result;\n  }\n  DIR128 &operator+=( // same as +\n      const DIR128 &add) {\n    *this = dir + add.dir; // let = do the work\n    return *this;\n  }\n  int8_t get_dir() const { // access function\n    return dir;\n  }\n\n  int8_t dir; // a direction\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/normalis.cpp",
    "content": "/**********************************************************************\n * File:        normalis.cpp  (Formerly denorm.c)\n * Description: Code for the DENORM class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"normalis.h\"\n\n#include <allheaders.h>\n#include \"blobs.h\"\n#include \"helpers.h\"\n#include \"matrix.h\"\n#include \"ocrblock.h\"\n#include \"unicharset.h\"\n#include \"werd.h\"\n\n#include <cfloat> // for FLT_MAX\n#include <cstdlib>\n\nnamespace tesseract {\n\n// Tolerance in pixels used for baseline and xheight on non-upper/lower scripts.\nconst int kSloppyTolerance = 4;\n// Final tolerance in pixels added to the computed xheight range.\nconst float kFinalPixelTolerance = 0.125f;\n\nDENORM::DENORM() {\n  Init();\n}\n\nDENORM::DENORM(const DENORM &src) {\n  rotation_ = nullptr;\n  x_map_ = nullptr;\n  y_map_ = nullptr;\n  *this = src;\n}\n\nDENORM &DENORM::operator=(const DENORM &src) {\n  Clear();\n  inverse_ = src.inverse_;\n  predecessor_ = src.predecessor_;\n  pix_ = src.pix_;\n  block_ = src.block_;\n  if (src.rotation_ == nullptr) {\n    rotation_ = nullptr;\n  } else {\n    rotation_ = new FCOORD(*src.rotation_);\n  }\n  x_origin_ = src.x_origin_;\n  y_origin_ = src.y_origin_;\n  x_scale_ = src.x_scale_;\n  y_scale_ = src.y_scale_;\n  final_xshift_ = 
src.final_xshift_;\n  final_yshift_ = src.final_yshift_;\n  return *this;\n}\n\nDENORM::~DENORM() {\n  Clear();\n}\n\n// Initializes the denorm for a transformation. For details see the large\n// comment in normalis.h.\n// Arguments:\n// block: if not nullptr, then this is the first transformation, and\n//        block->re_rotation() needs to be used after the Denorm\n//        transformation to get back to the image coords.\n// rotation: if not nullptr, apply this rotation after translation to the\n//           origin and scaling. (Usually a classify rotation.)\n// predecessor: if not nullptr, then predecessor has been applied to the\n//              input space and needs to be undone to complete the inverse.\n// The above pointers are not owned by this DENORM and are assumed to live\n// longer than this denorm, except rotation, which is deep copied on input.\n//\n// x_origin: The x origin which will be mapped to final_xshift in the result.\n// y_origin: The y origin which will be mapped to final_yshift in the result.\n//           Added to result of row->baseline(x) if not nullptr.\n//\n// x_scale: scale factor for the x-coordinate.\n// y_scale: scale factor for the y-coordinate. 
Ignored if segs is given.\n// Note that these scale factors apply to the same x and y system as the\n// x-origin and y-origin apply, ie after any block rotation, but before\n// the rotation argument is applied.\n//\n// final_xshift: The x component of the final translation.\n// final_yshift: The y component of the final translation.\nvoid DENORM::SetupNormalization(const BLOCK *block, const FCOORD *rotation,\n                                const DENORM *predecessor, float x_origin, float y_origin,\n                                float x_scale, float y_scale, float final_xshift,\n                                float final_yshift) {\n  Clear();\n  block_ = block;\n  if (rotation == nullptr) {\n    rotation_ = nullptr;\n  } else {\n    rotation_ = new FCOORD(*rotation);\n  }\n  predecessor_ = predecessor;\n  x_origin_ = x_origin;\n  y_origin_ = y_origin;\n  x_scale_ = x_scale;\n  y_scale_ = y_scale;\n  final_xshift_ = final_xshift;\n  final_yshift_ = final_yshift;\n}\n\n// Helper for SetupNonLinear computes an image of shortest run-lengths from\n// the x/y edges provided.\n// Based on \"A nonlinear normalization method for handprinted Kanji character\n// recognition -- line density equalization\" by Hiromitsu Yamada et al.\n// Eg below is an O in a 1-pixel margin-ed bounding box and the corresponding\n//  ______________     input x_coords and y_coords.\n// |  _________  |     <empty>\n// | |    _    | |     1, 6\n// | |   | |   | |     1, 3, 4, 6\n// | |   | |   | |     1, 3, 4, 6\n// | |   | |   | |     1, 3, 4, 6\n// | |   |_|   | |     1, 3, 4, 6\n// | |_________| |     1, 6\n// |_____________|     <empty>\n//  E 1 1 1 1 1 E\n//  m 7 7 2 7 7 m\n//  p     6     p\n//  t     7     t\n//  y           y\n// The output image contains the min of the x and y run-length (distance\n// between edges) at each coordinate in the image thus:\n//  ______________\n// |7 1_1_1_1_1 7|\n// |1|5 5 1 5 5|1|\n// |1|2 2|1|2 2|1|\n// |1|2 2|1|2 2|1|\n// |1|2 2|1|2 2|1|\n// |1|2 2|1|2 
2|1|\n// |1|5_5_1_5_5|1|\n// |7_1_1_1_1_1_7|\n// Note that the input coords are all integer, so all partial pixels are dealt\n// with elsewhere. Although it is nice for outlines to be properly connected\n// and continuous, there is no requirement that they be as such, so they could\n// have been derived from a flaky source, such as greyscale.\n// This function works only within the provided box, and it is assumed that the\n// input x_coords and y_coords have already been translated to have the bottom-\n// left of box as the origin. Although an output, the minruns should have been\n// pre-initialized to be the same size as box. Each element will contain the\n// minimum of x and y run-length as shown above.\nstatic void ComputeRunlengthImage(const TBOX &box,\n                                  const std::vector<std::vector<int>> &x_coords,\n                                  const std::vector<std::vector<int>> &y_coords,\n                                  GENERIC_2D_ARRAY<int> *minruns) {\n  int width = box.width();\n  int height = box.height();\n  ASSERT_HOST(minruns->dim1() == width);\n  ASSERT_HOST(minruns->dim2() == height);\n  // Set a 2-d image array to the run lengths at each pixel.\n  for (int ix = 0; ix < width; ++ix) {\n    int y = 0;\n    for (auto y_coord : y_coords[ix]) {\n      int y_edge = ClipToRange(y_coord, 0, height);\n      int gap = y_edge - y;\n      // Every pixel between the last and current edge get set to the gap.\n      while (y < y_edge) {\n        (*minruns)(ix, y) = gap;\n        ++y;\n      }\n    }\n    // Pretend there is a bounding box of edges all around the image.\n    int gap = height - y;\n    while (y < height) {\n      (*minruns)(ix, y) = gap;\n      ++y;\n    }\n  }\n  // Now set the image pixels the MIN of the x and y runlengths.\n  for (int iy = 0; iy < height; ++iy) {\n    int x = 0;\n    for (auto x_coord : x_coords[iy]) {\n      int x_edge = ClipToRange(x_coord, 0, width);\n      int gap = x_edge - x;\n      while (x < 
x_edge) {\n        if (gap < (*minruns)(x, iy)) {\n          (*minruns)(x, iy) = gap;\n        }\n        ++x;\n      }\n    }\n    int gap = width - x;\n    while (x < width) {\n      if (gap < (*minruns)(x, iy)) {\n        (*minruns)(x, iy) = gap;\n      }\n      ++x;\n    }\n  }\n}\n// Converts the run-length image (see above) to the edge density profiles used\n// for scaling, thus:\n//  ______________\n// |7 1_1_1_1_1 7|  = 5.28\n// |1|5 5 1 5 5|1|  = 3.8\n// |1|2 2|1|2 2|1|  = 5\n// |1|2 2|1|2 2|1|  = 5\n// |1|2 2|1|2 2|1|  = 5\n// |1|2 2|1|2 2|1|  = 5\n// |1|5_5_1_5_5|1|  = 3.8\n// |7_1_1_1_1_1_7|  = 5.28\n//  6 4 4 8 4 4 6\n//  . . . . . . .\n//  2 4 4 0 4 4 2\n//  8           8\n// Each profile is the sum of the reciprocals of the pixels in the image in\n// the appropriate row or column, and these are then normalized to sum to 1.\n// On output hx, hy contain an extra element, which will eventually be used\n// to guarantee that the top/right edge of the box (and anything beyond) always\n// gets mapped to the maximum target coordinate.\nstatic void ComputeEdgeDensityProfiles(const TBOX &box, const GENERIC_2D_ARRAY<int> &minruns,\n                                       std::vector<float> &hx, std::vector<float> &hy) {\n  int width = box.width();\n  int height = box.height();\n  hx.clear();\n  hx.resize(width + 1);\n  hy.clear();\n  hy.resize(height + 1);\n  double total = 0.0;\n  for (int iy = 0; iy < height; ++iy) {\n    for (int ix = 0; ix < width; ++ix) {\n      int run = minruns(ix, iy);\n      if (run == 0) {\n        run = 1;\n      }\n      float density = 1.0f / run;\n      hx[ix] += density;\n      hy[iy] += density;\n    }\n    total += hy[iy];\n  }\n  // Normalize each profile to sum to 1.\n  if (total > 0.0) {\n    for (int ix = 0; ix < width; ++ix) {\n      hx[ix] /= total;\n    }\n    for (int iy = 0; iy < height; ++iy) {\n      hy[iy] /= total;\n    }\n  }\n  // There is an extra element in each array, so initialize to 1.\n  hx[width] = 1.0f;\n  
hy[height] = 1.0f;\n}\n\n// Sets up the DENORM to execute a non-linear transformation based on\n// preserving an even distribution of stroke edges. The transformation\n// operates only within the given box.\n// x_coords is a collection of the x-coords of vertical edges for each\n// y-coord starting at box.bottom().\n// y_coords is a collection of the y-coords of horizontal edges for each\n// x-coord starting at box.left().\n// Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.\n// Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.\n// The second-level vectors must all be sorted in ascending order.\n// See comments on the helper functions above for more details.\nvoid DENORM::SetupNonLinear(const DENORM *predecessor, const TBOX &box, float target_width,\n                            float target_height, float final_xshift, float final_yshift,\n                            const std::vector<std::vector<int>> &x_coords,\n                            const std::vector<std::vector<int>> &y_coords) {\n  Clear();\n  predecessor_ = predecessor;\n  // x_map_ and y_map_ store a mapping from input x and y coordinate to output\n  // x and y coordinate, based on scaling to the supplied target_width and\n  // target_height.\n  x_map_ = new std::vector<float>;\n  y_map_ = new std::vector<float>;\n  // Set a 2-d image array to the run lengths at each pixel.\n  int width = box.width();\n  int height = box.height();\n  GENERIC_2D_ARRAY<int> minruns(width, height, 0);\n  ComputeRunlengthImage(box, x_coords, y_coords, &minruns);\n  // Edge density is the sum of the inverses of the run lengths. 
Compute\n  // edge density projection profiles.\n  ComputeEdgeDensityProfiles(box, minruns, *x_map_, *y_map_);\n  // Convert the edge density profiles to the coordinates by multiplying by\n  // the desired size and accumulating.\n  (*x_map_)[width] = target_width;\n  for (int x = width - 1; x >= 0; --x) {\n    (*x_map_)[x] = (*x_map_)[x + 1] - (*x_map_)[x] * target_width;\n  }\n  (*y_map_)[height] = target_height;\n  for (int y = height - 1; y >= 0; --y) {\n    (*y_map_)[y] = (*y_map_)[y + 1] - (*y_map_)[y] * target_height;\n  }\n  x_origin_ = box.left();\n  y_origin_ = box.bottom();\n  final_xshift_ = final_xshift;\n  final_yshift_ = final_yshift;\n}\n\n// Transforms the given coords one step forward to normalized space, without\n// using any block rotation or predecessor.\nvoid DENORM::LocalNormTransform(const TPOINT &pt, TPOINT *transformed) const {\n  FCOORD src_pt(pt.x, pt.y);\n  FCOORD float_result;\n  LocalNormTransform(src_pt, &float_result);\n  transformed->x = IntCastRounded(float_result.x());\n  transformed->y = IntCastRounded(float_result.y());\n}\nvoid DENORM::LocalNormTransform(const FCOORD &pt, FCOORD *transformed) const {\n  FCOORD translated(pt.x() - x_origin_, pt.y() - y_origin_);\n  if (x_map_ != nullptr && y_map_ != nullptr) {\n    int x = ClipToRange(IntCastRounded(translated.x()), 0, static_cast<int>(x_map_->size() - 1));\n    translated.set_x((*x_map_)[x]);\n    int y = ClipToRange(IntCastRounded(translated.y()), 0, static_cast<int>(y_map_->size() - 1));\n    translated.set_y((*y_map_)[y]);\n  } else {\n    translated.set_x(translated.x() * x_scale_);\n    translated.set_y(translated.y() * y_scale_);\n    if (rotation_ != nullptr) {\n      translated.rotate(*rotation_);\n    }\n  }\n  transformed->set_x(translated.x() + final_xshift_);\n  transformed->set_y(translated.y() + final_yshift_);\n}\n\n// Transforms the given coords forward to normalized space using the\n// full transformation sequence defined by the block rotation, the\n// 
predecessors, deepest first, and finally this. If first_norm is not nullptr,\n// then the first and deepest transformation used is first_norm, ending\n// with this, and the block rotation will not be applied.\nvoid DENORM::NormTransform(const DENORM *first_norm, const TPOINT &pt, TPOINT *transformed) const {\n  FCOORD src_pt(pt.x, pt.y);\n  FCOORD float_result;\n  NormTransform(first_norm, src_pt, &float_result);\n  transformed->x = IntCastRounded(float_result.x());\n  transformed->y = IntCastRounded(float_result.y());\n}\nvoid DENORM::NormTransform(const DENORM *first_norm, const FCOORD &pt, FCOORD *transformed) const {\n  FCOORD src_pt(pt);\n  if (first_norm != this) {\n    if (predecessor_ != nullptr) {\n      predecessor_->NormTransform(first_norm, pt, &src_pt);\n    } else if (block_ != nullptr) {\n      FCOORD fwd_rotation(block_->re_rotation().x(), -block_->re_rotation().y());\n      src_pt.rotate(fwd_rotation);\n    }\n  }\n  LocalNormTransform(src_pt, transformed);\n}\n\n// Transforms the given coords one step back to source space, without\n// using any block rotation or predecessor.\nvoid DENORM::LocalDenormTransform(const TPOINT &pt, TPOINT *original) const {\n  FCOORD src_pt(pt.x, pt.y);\n  FCOORD float_result;\n  LocalDenormTransform(src_pt, &float_result);\n  original->x = IntCastRounded(float_result.x());\n  original->y = IntCastRounded(float_result.y());\n}\n\nvoid DENORM::LocalDenormTransform(const FCOORD &pt, FCOORD *original) const {\n  FCOORD rotated(pt.x() - final_xshift_, pt.y() - final_yshift_);\n  if (x_map_ != nullptr && y_map_ != nullptr) {\n    auto pos = std::upper_bound(x_map_->begin(), x_map_->end(), rotated.x());\n    if (pos > x_map_->begin()) {\n      --pos;\n    }\n    auto x = pos - x_map_->begin();\n    original->set_x(x + x_origin_);\n    pos = std::upper_bound(y_map_->begin(), y_map_->end(), rotated.y());\n    if (pos > y_map_->begin()) {\n      --pos;\n    }\n    auto y = pos - y_map_->begin();\n    original->set_y(y + 
y_origin_);\n  } else {\n    if (rotation_ != nullptr) {\n      FCOORD inverse_rotation(rotation_->x(), -rotation_->y());\n      rotated.rotate(inverse_rotation);\n    }\n    original->set_x(rotated.x() / x_scale_ + x_origin_);\n    float y_scale = y_scale_;\n    original->set_y(rotated.y() / y_scale + y_origin_);\n  }\n}\n\n// Transforms the given coords all the way back to source image space using\n// the full transformation sequence defined by this and its predecessors\n// recursively, shallowest first, and finally any block re_rotation.\n// If last_denorm is not nullptr, then the last transformation used will\n// be last_denorm, and the block re_rotation will never be executed.\nvoid DENORM::DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const {\n  FCOORD src_pt(pt.x, pt.y);\n  FCOORD float_result;\n  DenormTransform(last_denorm, src_pt, &float_result);\n  original->x = IntCastRounded(float_result.x());\n  original->y = IntCastRounded(float_result.y());\n}\nvoid DENORM::DenormTransform(const DENORM *last_denorm, const FCOORD &pt, FCOORD *original) const {\n  LocalDenormTransform(pt, original);\n  if (last_denorm != this) {\n    if (predecessor_ != nullptr) {\n      predecessor_->DenormTransform(last_denorm, *original, original);\n    } else if (block_ != nullptr) {\n      original->rotate(block_->re_rotation());\n    }\n  }\n}\n\n// Normalize a blob using blob transformations. 
Less accurate, but\n// more accurately copies the old way.\nvoid DENORM::LocalNormBlob(TBLOB *blob) const {\n  ICOORD translation(-IntCastRounded(x_origin_), -IntCastRounded(y_origin_));\n  blob->Move(translation);\n  if (y_scale_ != 1.0f) {\n    blob->Scale(y_scale_);\n  }\n  if (rotation_ != nullptr) {\n    blob->Rotate(*rotation_);\n  }\n  translation.set_x(IntCastRounded(final_xshift_));\n  translation.set_y(IntCastRounded(final_yshift_));\n  blob->Move(translation);\n}\n\n// Fills in the x-height range accepted by the given unichar_id, given its\n// bounding box in the usual baseline-normalized coordinates, with some\n// initial crude x-height estimate (such as word size) and this denoting the\n// transformation that was used.\nvoid DENORM::XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox,\n                          float *min_xht, float *max_xht, float *yshift) const {\n  // Default return -- accept anything.\n  *yshift = 0.0f;\n  *min_xht = 0.0f;\n  *max_xht = FLT_MAX;\n\n  if (!unicharset.top_bottom_useful()) {\n    return;\n  }\n\n  // Clip the top and bottom to the limit of normalized feature space.\n  int top = ClipToRange<int>(bbox.top(), 0, kBlnCellHeight - 1);\n  int bottom = ClipToRange<int>(bbox.bottom(), 0, kBlnCellHeight - 1);\n  // A tolerance of yscale corresponds to 1 pixel in the image.\n  double tolerance = y_scale();\n  // If the script doesn't have upper and lower-case characters, widen the\n  // tolerance to allow sloppy baseline/x-height estimates.\n  if (!unicharset.script_has_upper_lower()) {\n    tolerance = y_scale() * kSloppyTolerance;\n  }\n\n  int min_bottom, max_bottom, min_top, max_top;\n  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);\n\n  // Calculate the scale factor we'll use to get to image y-pixels\n  double midx = (bbox.left() + bbox.right()) / 2.0;\n  double ydiff = (bbox.top() - bbox.bottom()) + 2.0;\n  FCOORD mid_bot(midx, bbox.bottom()), tmid_bot;\n  
FCOORD mid_high(midx, bbox.bottom() + ydiff), tmid_high;\n  DenormTransform(nullptr, mid_bot, &tmid_bot);\n  DenormTransform(nullptr, mid_high, &tmid_high);\n\n  // bln_y_measure * yscale = image_y_measure\n  double yscale = tmid_high.pt_to_pt_dist(tmid_bot) / ydiff;\n\n  // Calculate y-shift\n  int bln_yshift = 0, bottom_shift = 0, top_shift = 0;\n  if (bottom < min_bottom - tolerance) {\n    bottom_shift = bottom - min_bottom;\n  } else if (bottom > max_bottom + tolerance) {\n    bottom_shift = bottom - max_bottom;\n  }\n  if (top < min_top - tolerance) {\n    top_shift = top - min_top;\n  } else if (top > max_top + tolerance) {\n    top_shift = top - max_top;\n  }\n  if ((top_shift >= 0 && bottom_shift > 0) || (top_shift < 0 && bottom_shift < 0)) {\n    bln_yshift = (top_shift + bottom_shift) / 2;\n  }\n  *yshift = bln_yshift * yscale;\n\n  // To help very high cap/xheight ratio fonts accept the correct x-height,\n  // and to allow the large caps in small caps to accept the xheight of the\n  // small caps, add kBlnBaselineOffset to chars with a maximum max, and have\n  // a top already at a significantly high position.\n  if (max_top == kBlnCellHeight - 1 && top > kBlnCellHeight - kBlnBaselineOffset / 2) {\n    max_top += kBlnBaselineOffset;\n  }\n  top -= bln_yshift;\n  int height = top - kBlnBaselineOffset;\n  double min_height = min_top - kBlnBaselineOffset - tolerance;\n  double max_height = max_top - kBlnBaselineOffset + tolerance;\n\n  // We shouldn't try calculations if the characters are very short (for example\n  // for punctuation).\n  if (min_height > kBlnXHeight / 8 && height > 0) {\n    float result = height * kBlnXHeight * yscale / min_height;\n    *max_xht = result + kFinalPixelTolerance;\n    result = height * kBlnXHeight * yscale / max_height;\n    *min_xht = result - kFinalPixelTolerance;\n  }\n}\n\n// Prints the content of the DENORM for debug purposes.\nvoid DENORM::Print() const {\n  if (pix_ != nullptr) {\n    tprintf(\"Pix dimensions %d x 
%d x %d\\n\", pixGetWidth(pix_), pixGetHeight(pix_),\n            pixGetDepth(pix_));\n  }\n  if (inverse_) {\n    tprintf(\"Inverse\\n\");\n  }\n  if (block_ && block_->re_rotation().x() != 1.0f) {\n    tprintf(\"Block rotation %g, %g\\n\", block_->re_rotation().x(), block_->re_rotation().y());\n  }\n  tprintf(\"Input Origin = (%g, %g)\\n\", x_origin_, y_origin_);\n  if (x_map_ != nullptr && y_map_ != nullptr) {\n    tprintf(\"x map:\\n\");\n    for (auto x : *x_map_) {\n      tprintf(\"%g \", x);\n    }\n    tprintf(\"\\ny map:\\n\");\n    for (auto y : *y_map_) {\n      tprintf(\"%g \", y);\n    }\n    tprintf(\"\\n\");\n  } else {\n    tprintf(\"Scale = (%g, %g)\\n\", x_scale_, y_scale_);\n    if (rotation_ != nullptr) {\n      tprintf(\"Rotation = (%g, %g)\\n\", rotation_->x(), rotation_->y());\n    }\n  }\n  tprintf(\"Final Origin = (%g, %g)\\n\", final_xshift_, final_xshift_);\n  if (predecessor_ != nullptr) {\n    tprintf(\"Predecessor:\\n\");\n    predecessor_->Print();\n  }\n}\n\n// ============== Private Code ======================\n\n// Free allocated memory and clear pointers.\nvoid DENORM::Clear() {\n  delete x_map_;\n  x_map_ = nullptr;\n  delete y_map_;\n  y_map_ = nullptr;\n  delete rotation_;\n  rotation_ = nullptr;\n}\n\n// Setup default values.\nvoid DENORM::Init() {\n  inverse_ = false;\n  pix_ = nullptr;\n  block_ = nullptr;\n  rotation_ = nullptr;\n  predecessor_ = nullptr;\n  x_map_ = nullptr;\n  y_map_ = nullptr;\n  x_origin_ = 0.0f;\n  y_origin_ = 0.0f;\n  x_scale_ = 1.0f;\n  y_scale_ = 1.0f;\n  final_xshift_ = 0.0f;\n  final_yshift_ = static_cast<float>(kBlnBaselineOffset);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/normalis.h",
    "content": "/**********************************************************************\n * File:        normalis.h  (Formerly denorm.h)\n * Description: Code for the DENORM class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef NORMALIS_H\n#define NORMALIS_H\n\n#include \"image.h\"\n\n#include <tesseract/export.h>\n\n#include <vector>\n\nstruct Pix;\n\nnamespace tesseract {\n\nconst int kBlnCellHeight = 256;    // Full-height for baseline normalization.\nconst int kBlnXHeight = 128;       // x-height for baseline normalization.\nconst int kBlnBaselineOffset = 64; // offset for baseline normalization.\n\nclass BLOCK;\nclass FCOORD;\nclass TBOX;\nclass UNICHARSET;\n\nstruct TBLOB;\nstruct TPOINT;\n\n// Possible normalization methods. 
Use NEGATIVE values as these also\n// double up as markers for the last sub-classifier.\nenum NormalizationMode {\n  NM_BASELINE = -3,        // The original BL normalization mode.\n  NM_CHAR_ISOTROPIC = -2,  // Character normalization but isotropic.\n  NM_CHAR_ANISOTROPIC = -1 // The original CN normalization mode.\n};\n\nclass TESS_API DENORM {\npublic:\n  DENORM();\n\n  // Copying a DENORM is allowed.\n  DENORM(const DENORM &);\n  DENORM &operator=(const DENORM &);\n  ~DENORM();\n\n  // Setup the normalization transformation parameters.\n  // The normalizations applied to a blob are as follows:\n  // 1. An optional block layout rotation that was applied during layout\n  // analysis to make the textlines horizontal.\n  // 2. A normalization transformation (LocalNormTransform):\n  // Subtract the \"origin\"\n  // Apply an x,y scaling.\n  // Apply an optional rotation.\n  // Add back a final translation.\n  // The origin is in the block-rotated space, and is usually something like\n  // the x-middle of the word at the baseline.\n  // 3. 
Zero or more further normalization transformations that are applied\n  // in sequence, with a similar pattern to the first normalization transform.\n  //\n  // A DENORM holds the parameters of a single normalization, and can execute\n  // both the LocalNormTransform (a forwards normalization), and the\n  // LocalDenormTransform which is an inverse transform or de-normalization.\n  // A DENORM may point to a predecessor DENORM, which is actually the earlier\n  // normalization, so the full normalization sequence involves executing all\n  // predecessors first and then the transform in \"this\".\n  // Let x be image coordinates and that we have normalization classes A, B, C\n  // where we first apply A then B then C to get normalized x':\n  // x' = CBAx\n  // Then the backwards (to original coordinates) would be:\n  // x = A^-1 B^-1 C^-1 x'\n  // and A = B->predecessor_ and B = C->predecessor_\n  // NormTransform executes all predecessors recursively, and then this.\n  // NormTransform would be used to transform an image-based feature to\n  // normalized space for use in a classifier\n  // DenormTransform inverts this and then all predecessors. It can be\n  // used to get back to the original image coordinates from normalized space.\n  // The LocalNormTransform member executes just the transformation\n  // in \"this\" without the layout rotation or any predecessors. It would be\n  // used to run each successive normalization, eg the word normalization,\n  // and later the character normalization.\n\n  // Arguments:\n  // block: if not nullptr, then this is the first transformation, and\n  //        block->re_rotation() needs to be used after the Denorm\n  //        transformation to get back to the image coords.\n  // rotation: if not nullptr, apply this rotation after translation to the\n  //           origin and scaling. 
(Usually a classify rotation.)\n  // predecessor: if not nullptr, then predecessor has been applied to the\n  //              input space and needs to be undone to complete the inverse.\n  // The above pointers are not owned by this DENORM and are assumed to live\n  // longer than this denorm, except rotation, which is deep copied on input.\n  //\n  // x_origin: The x origin which will be mapped to final_xshift in the result.\n  // y_origin: The y origin which will be mapped to final_yshift in the result.\n  //           Added to result of row->baseline(x) if not nullptr.\n  //\n  // x_scale: scale factor for the x-coordinate.\n  // y_scale: scale factor for the y-coordinate. Ignored if segs is given.\n  // Note that these scale factors apply to the same x and y system as the\n  // x-origin and y-origin apply, ie after any block rotation, but before\n  // the rotation argument is applied.\n  //\n  // final_xshift: The x component of the final translation.\n  // final_yshift: The y component of the final translation.\n  //\n  // In theory, any of the commonly used normalizations can be setup here:\n  // * Traditional baseline normalization on a word:\n  // SetupNormalization(block, nullptr, nullptr,\n  //                    box.x_middle(), baseline,\n  //                    kBlnXHeight / x_height, kBlnXHeight / x_height,\n  //                    0, kBlnBaselineOffset);\n  // * \"Numeric mode\" baseline normalization on a word, in which the blobs\n  //   are positioned with the bottom as the baseline is achieved by making\n  //   a separate DENORM for each blob.\n  // SetupNormalization(block, nullptr, nullptr,\n  //                    box.x_middle(), box.bottom(),\n  //                    kBlnXHeight / x_height, kBlnXHeight / x_height,\n  //                    0, kBlnBaselineOffset);\n  // * Anisotropic character normalization used by IntFx.\n  // SetupNormalization(nullptr, nullptr, denorm,\n  //                    centroid_x, centroid_y,\n  //                    
51.2 / ry, 51.2 / rx, 128, 128);\n  // * Normalize blob height to x-height (current OSD):\n  // SetupNormalization(nullptr, &rotation, nullptr,\n  //                    box.rotational_x_middle(rotation),\n  //                    box.rotational_y_middle(rotation),\n  //                    kBlnXHeight / box.rotational_height(rotation),\n  //                    kBlnXHeight / box.rotational_height(rotation),\n  //                    0, kBlnBaselineOffset);\n  // * Secondary normalization for classification rotation (current):\n  // FCOORD rotation = block->classify_rotation();\n  // float target_height = kBlnXHeight / CCStruct::kXHeightCapRatio;\n  // SetupNormalization(nullptr, &rotation, denorm,\n  //                    box.rotational_x_middle(rotation),\n  //                    box.rotational_y_middle(rotation),\n  //                    target_height / box.rotational_height(rotation),\n  //                    target_height / box.rotational_height(rotation),\n  //                    0, kBlnBaselineOffset);\n  // * Proposed new normalizations for CJK: Between them there is then\n  // no need for further normalization at all, and the character fills the cell.\n  // ** Replacement for baseline normalization on a word:\n  // Scales height and width independently so that modal height and pitch\n  // fill the cell respectively.\n  // float cap_height = x_height / CCStruct::kXHeightCapRatio;\n  // SetupNormalization(block, nullptr, nullptr,\n  //                    box.x_middle(), cap_height / 2.0f,\n  //                    kBlnCellHeight / fixed_pitch,\n  //                    kBlnCellHeight / cap_height,\n  //                    0, 0);\n  // ** Secondary normalization for classification (with rotation) (proposed):\n  // Requires a simple translation to the center of the appropriate character\n  // cell, no further scaling and a simple rotation (or nothing) about the\n  // cell center.\n  // FCOORD rotation = block->classify_rotation();\n  // SetupNormalization(nullptr, 
&rotation, denorm,\n  //                    fixed_pitch_cell_center,\n  //                    0.0f,\n  //                    1.0f,\n  //                    1.0f,\n  //                    0, 0);\n  void SetupNormalization(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor,\n                          float x_origin, float y_origin, float x_scale, float y_scale,\n                          float final_xshift, float final_yshift);\n\n  // Sets up the DENORM to execute a non-linear transformation based on\n  // preserving an even distribution of stroke edges. The transformation\n  // operates only within the given box, scaling input coords within the box\n  // non-linearly to a box of target_width by target_height, with all other\n  // coords being clipped to the box edge. As with SetupNormalization above,\n  // final_xshift and final_yshift are applied after scaling, and the bottom-\n  // left of box is used as a pre-scaling origin.\n  // x_coords is a collection of the x-coords of vertical edges for each\n  // y-coord starting at box.bottom().\n  // y_coords is a collection of the y-coords of horizontal edges for each\n  // x-coord starting at box.left().\n  // Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.\n  // Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.\n  // The second-level vectors must all be sorted in ascending order.\n  void SetupNonLinear(const DENORM *predecessor, const TBOX &box, float target_width,\n                      float target_height, float final_xshift, float final_yshift,\n                      const std::vector<std::vector<int>> &x_coords,\n                      const std::vector<std::vector<int>> &y_coords);\n\n  // Transforms the given coords one step forward to normalized space, without\n  // using any block rotation or predecessor.\n  void LocalNormTransform(const TPOINT &pt, TPOINT *transformed) const;\n  void LocalNormTransform(const FCOORD &pt, FCOORD *transformed) 
const;\n  // Transforms the given coords forward to normalized space using the\n  // full transformation sequence defined by the block rotation, the\n  // predecessors, deepest first, and finally this. If first_norm is not\n  // nullptr, then the first and deepest transformation used is first_norm,\n  // ending with this, and the block rotation will not be applied.\n  void NormTransform(const DENORM *first_norm, const TPOINT &pt, TPOINT *transformed) const;\n  void NormTransform(const DENORM *first_norm, const FCOORD &pt, FCOORD *transformed) const;\n  // Transforms the given coords one step back to source space, without\n  // using to any block rotation or predecessor.\n  void LocalDenormTransform(const TPOINT &pt, TPOINT *original) const;\n  void LocalDenormTransform(const FCOORD &pt, FCOORD *original) const;\n  // Transforms the given coords all the way back to source image space using\n  // the full transformation sequence defined by this and its predecessors\n  // recursively, shallowest first, and finally any block re_rotation.\n  // If last_denorm is not nullptr, then the last transformation used will\n  // be last_denorm, and the block re_rotation will never be executed.\n  void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const;\n  void DenormTransform(const DENORM *last_denorm, const FCOORD &pt, FCOORD *original) const;\n\n  // Normalize a blob using blob transformations. 
Less accurate, but\n  // more accurately copies the old way.\n  void LocalNormBlob(TBLOB *blob) const;\n\n  // Fills in the x-height range accepted by the given unichar_id in blob\n  // coordinates, given its bounding box in the usual baseline-normalized\n  // coordinates, with some initial crude x-height estimate (such as word\n  // size) and this denoting the transformation that was used.\n  // Also returns the amount the character must have shifted up or down.\n  void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht,\n                    float *max_xht, float *yshift) const;\n\n  // Prints the content of the DENORM for debug purposes.\n  void Print() const;\n\n  Image pix() const {\n    return pix_;\n  }\n  void set_pix(Image pix) {\n    pix_ = pix;\n  }\n  bool inverse() const {\n    return inverse_;\n  }\n  void set_inverse(bool value) {\n    inverse_ = value;\n  }\n  const DENORM *RootDenorm() const {\n    if (predecessor_ != nullptr) {\n      return predecessor_->RootDenorm();\n    }\n    return this;\n  }\n  const DENORM *predecessor() const {\n    return predecessor_;\n  }\n  // Accessors - perhaps should not be needed.\n  float x_scale() const {\n    return x_scale_;\n  }\n  float y_scale() const {\n    return y_scale_;\n  }\n  const BLOCK *block() const {\n    return block_;\n  }\n  void set_block(const BLOCK *block) {\n    block_ = block;\n  }\n\nprivate:\n  // Free allocated memory and clear pointers.\n  void Clear();\n  // Setup default values.\n  void Init();\n\n  // Best available image.\n  Image pix_;\n  // True if the source image is white-on-black.\n  bool inverse_;\n  // Block the word came from. 
If not null, block->re_rotation() takes the\n  // \"untransformed\" coordinates even further back to the original image.\n  // Used only on the first DENORM in a chain.\n  const BLOCK *block_;\n  // Rotation to apply between translation to the origin and scaling.\n  const FCOORD *rotation_;\n  // Previous transformation in a chain.\n  const DENORM *predecessor_;\n  // Non-linear transformation maps directly from each integer offset from the\n  // origin to the corresponding x-coord. Owned by the DENORM.\n  std::vector<float> *x_map_;\n  // Non-linear transformation maps directly from each integer offset from the\n  // origin to the corresponding y-coord. Owned by the DENORM.\n  std::vector<float> *y_map_;\n  // x-coordinate to be mapped to final_xshift_ in the result.\n  float x_origin_;\n  // y-coordinate to be mapped to final_yshift_ in the result.\n  float y_origin_;\n  // Scale factors for x and y coords. Applied to pre-rotation system.\n  float x_scale_;\n  float y_scale_;\n  // Destination coords of the x_origin_ and y_origin_.\n  float final_xshift_;\n  float final_yshift_;\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/ocrblock.cpp",
    "content": "/**********************************************************************\n * File:        ocrblock.cpp  (Formerly block.c)\n * Description: BLOCK member functions and iterator functions.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"ocrblock.h\"\n\n#include \"stepblob.h\"\n#include \"tprintf.h\"\n\n#include <cstdlib>\n#include <memory> // std::unique_ptr\n\nnamespace tesseract {\n\n/**\n * BLOCK::BLOCK\n *\n * Constructor for a simple rectangular block.\n */\nBLOCK::BLOCK(const char *name, ///< filename\n             bool prop,        ///< proportional\n             int16_t kern,     ///< kerning\n             int16_t space,    ///< spacing\n             TDimension xmin,  ///< bottom left\n             TDimension ymin,\n             TDimension xmax,  ///< top right\n             TDimension ymax)\n    : pdblk(xmin, ymin, xmax, ymax)\n    , filename(name)\n    , re_rotation_(1.0f, 0.0f)\n    , classify_rotation_(1.0f, 0.0f)\n    , skew_(1.0f, 0.0f) {\n  ICOORDELT_IT left_it = &pdblk.leftside;\n  ICOORDELT_IT right_it = &pdblk.rightside;\n\n  proportional = prop;\n  kerning = kern;\n  spacing = space;\n  font_class = -1; // not assigned\n  cell_over_xheight_ = 2.0f;\n  pdblk.hand_poly = nullptr;\n  left_it.set_to_list(&pdblk.leftside);\n  
right_it.set_to_list(&pdblk.rightside);\n  // make default box\n  left_it.add_to_end(new ICOORDELT(xmin, ymin));\n  left_it.add_to_end(new ICOORDELT(xmin, ymax));\n  right_it.add_to_end(new ICOORDELT(xmax, ymin));\n  right_it.add_to_end(new ICOORDELT(xmax, ymax));\n}\n\n/**\n * decreasing_top_order\n *\n * Sort Comparator: Return <0 if row1 top < row2 top\n */\n\nstatic int decreasing_top_order(const ROW *row1, const ROW *row2) {\n  return row2->bounding_box().top() -\n         row1->bounding_box().top();\n}\n\n/**\n * BLOCK::rotate\n *\n * Rotate the polygon by the given rotation and recompute the bounding_box.\n */\nvoid BLOCK::rotate(const FCOORD &rotation) {\n  pdblk.poly_block()->rotate(rotation);\n  pdblk.box = *pdblk.poly_block()->bounding_box();\n}\n\n// Returns the bounding box including the desired combination of upper and\n// lower noise/diacritic elements.\nTBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const {\n  TBOX box;\n  // This is a read-only iteration of the rows in the block.\n  ROW_IT it(const_cast<ROW_LIST *>(&rows));\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    box += it.data()->restricted_bounding_box(upper_dots, lower_dots);\n  }\n  return box;\n}\n\n/**\n * BLOCK::reflect_polygon_in_y_axis\n *\n * Reflects the polygon in the y-axis and recompute the bounding_box.\n * Does nothing to any contained rows/words/blobs etc.\n */\nvoid BLOCK::reflect_polygon_in_y_axis() {\n  pdblk.poly_block()->reflect_in_y_axis();\n  pdblk.box = *pdblk.poly_block()->bounding_box();\n}\n\n/**\n * BLOCK::sort_rows\n *\n * Order rows so that they are in order of decreasing Y coordinate\n */\n\nvoid BLOCK::sort_rows() { // order on \"top\"\n  ROW_IT row_it(&rows);\n\n  row_it.sort(decreasing_top_order);\n}\n\n/**\n * BLOCK::compress\n *\n * Delete space between the rows. 
(And maybe one day, compress the rows)\n * Fill space of block from top down, left aligning rows.\n */\n\nvoid BLOCK::compress() { // squash it up\n#define ROW_SPACING 5\n\n  ROW_IT row_it(&rows);\n  ROW *row;\n  ICOORD row_spacing(0, ROW_SPACING);\n\n  ICOORDELT_IT icoordelt_it;\n\n  sort_rows();\n\n  pdblk.box = TBOX(pdblk.box.topleft(), pdblk.box.topleft());\n  pdblk.box.move_bottom_edge(ROW_SPACING);\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    row->move(pdblk.box.botleft() - row_spacing - row->bounding_box().topleft());\n    pdblk.box += row->bounding_box();\n  }\n\n  pdblk.leftside.clear();\n  icoordelt_it.set_to_list(&pdblk.leftside);\n  icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.bottom()));\n  icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.top()));\n  pdblk.rightside.clear();\n  icoordelt_it.set_to_list(&pdblk.rightside);\n  icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.bottom()));\n  icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.top()));\n}\n\n/**\n * BLOCK::check_pitch\n *\n * Check whether the block is fixed or prop, set the flag, and set\n * the pitch if it is fixed.\n */\n\nvoid BLOCK::check_pitch() { // check prop\n  //      tprintf(\"Missing FFT fixed pitch stuff!\\n\");\n  pitch = -1;\n}\n\n/**\n * BLOCK::compress\n *\n * Compress and move in a single operation.\n */\n\nvoid BLOCK::compress( // squash it up\n    const ICOORD vec  // and move\n) {\n  pdblk.box.move(vec);\n  compress();\n}\n\n/**\n * BLOCK::print\n *\n * Print the info on a block\n */\n\nvoid BLOCK::print( // print list of sides\n    FILE *,        ///< file to print on\n    bool dump      ///< print full detail\n) {\n  ICOORDELT_IT it = &pdblk.leftside; // iterator\n\n  pdblk.box.print();\n  tprintf(\"Proportional= %s\\n\", proportional ? 
\"TRUE\" : \"FALSE\");\n  tprintf(\"Kerning= %d\\n\", kerning);\n  tprintf(\"Spacing= %d\\n\", spacing);\n  tprintf(\"Fixed_pitch=%d\\n\", pitch);\n  tprintf(\"Filename= %s\\n\", filename.c_str());\n\n  if (dump) {\n    tprintf(\"Left side coords are:\\n\");\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      tprintf(\"(%d,%d) \", it.data()->x(), it.data()->y());\n    }\n    tprintf(\"\\n\");\n    tprintf(\"Right side coords are:\\n\");\n    it.set_to_list(&pdblk.rightside);\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      tprintf(\"(%d,%d) \", it.data()->x(), it.data()->y());\n    }\n    tprintf(\"\\n\");\n  }\n}\n\n/**\n * BLOCK::operator=\n *\n * Assignment - duplicate the block structure, but with an EMPTY row list.\n */\n\nBLOCK &BLOCK::operator=( // assignment\n    const BLOCK &source  // from this\n) {\n  this->ELIST<BLOCK>::LINK::operator=(source);\n  pdblk = source.pdblk;\n  proportional = source.proportional;\n  kerning = source.kerning;\n  spacing = source.spacing;\n  filename = source.filename; // STRINGs assign ok\n  if (!rows.empty()) {\n    rows.clear();\n  }\n  re_rotation_ = source.re_rotation_;\n  classify_rotation_ = source.classify_rotation_;\n  skew_ = source.skew_;\n  return *this;\n}\n\n// This function is for finding the approximate (horizontal) distance from\n// the x-coordinate of the left edge of a symbol to the left edge of the\n// text block which contains it.  
We are passed:\n//   segments - output of PB_LINE_IT::get_line() which contains x-coordinate\n//       intervals for the scan line going through the symbol's y-coordinate.\n//       Each element of segments is of the form (x()=start_x, y()=length).\n//   x - the x coordinate of the symbol we're interested in.\n//   margin - return value, the distance from x,y to the left margin of the\n//       block containing it.\n// If all segments were to the right of x, we return false and 0.\nstatic bool LeftMargin(ICOORDELT_LIST *segments, int x, int *margin) {\n  bool found = false;\n  *margin = 0;\n  if (segments->empty()) {\n    return found;\n  }\n  ICOORDELT_IT seg_it(segments);\n  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n    int cur_margin = x - seg_it.data()->x();\n    if (cur_margin >= 0) {\n      if (!found) {\n        *margin = cur_margin;\n      } else if (cur_margin < *margin) {\n        *margin = cur_margin;\n      }\n      found = true;\n    }\n  }\n  return found;\n}\n\n// This function is for finding the approximate (horizontal) distance from\n// the x-coordinate of the right edge of a symbol to the right edge of the\n// text block which contains it.  
We are passed:\n//   segments - output of PB_LINE_IT::get_line() which contains x-coordinate\n//       intervals for the scan line going through the symbol's y-coordinate.\n//       Each element of segments is of the form (x()=start_x, y()=length).\n//   x - the x coordinate of the symbol we're interested in.\n//   margin - return value, the distance from x,y to the right margin of the\n//       block containing it.\n// If all segments were to the left of x, we return false and 0.\nstatic bool RightMargin(ICOORDELT_LIST *segments, int x, int *margin) {\n  bool found = false;\n  *margin = 0;\n  if (segments->empty()) {\n    return found;\n  }\n  ICOORDELT_IT seg_it(segments);\n  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n    int cur_margin = seg_it.data()->x() + seg_it.data()->y() - x;\n    if (cur_margin >= 0) {\n      if (!found) {\n        *margin = cur_margin;\n      } else if (cur_margin < *margin) {\n        *margin = cur_margin;\n      }\n      found = true;\n    }\n  }\n  return found;\n}\n\n// Compute the distance from the left and right ends of each row to the\n// left and right edges of the block's polyblock.  Illustration:\n//  ____________________________   _______________________\n//  |  Howdy neighbor!         |  |rectangular blocks look|\n//  |  This text is  written to|  |more like stacked pizza|\n//  |illustrate how useful poly-  |boxes.                 |\n//  |blobs  are   in -----------  ------   The    polyblob|\n//  |dealing    with|     _________     |for a BLOCK  rec-|\n//  |harder   layout|   /===========\\   |ords the possibly|\n//  |issues.        
|    |  _    _  |   |skewed    pseudo-|\n//  |  You  see this|    | |_| \\|_| |   |rectangular      |\n//  |text is  flowed|    |      }   |   |boundary     that|\n//  |around  a  mid-|     \\   ____  |   |forms the  ideal-|\n//  |column portrait._____ \\       /  __|ized  text margin|\n//  |  Polyblobs     exist| \\    /   |from which we should|\n//  |to account for insets|  |   |   |measure    paragraph|\n//  |which make  otherwise|  -----   |indentation.        |\n//  -----------------------          ----------------------\n//\n// If we identify a drop-cap, we measure the left margin for the lines\n// below the first line relative to one space past the drop cap.  The\n// first line's margin and those past the drop cap area are measured\n// relative to the enclosing polyblock.\n//\n// TODO(rays): Before this will work well, we'll need to adjust the\n//             polyblob tighter around the text near images, as in:\n//             UNLV_AUTO:mag.3G0  page 2\n//             UNLV_AUTO:mag.3G4  page 16\nvoid BLOCK::compute_row_margins() {\n  if (row_list()->empty() || row_list()->singleton()) {\n    return;\n  }\n\n  // If Layout analysis was not called, default to this.\n  POLY_BLOCK rect_block(pdblk.bounding_box(), PT_FLOWING_TEXT);\n  POLY_BLOCK *pblock = &rect_block;\n  if (pdblk.poly_block() != nullptr) {\n    pblock = pdblk.poly_block();\n  }\n\n  // Step One: Determine if there is a drop-cap.\n  //           TODO(eger): Fix up drop cap code for RTL languages.\n  ROW_IT r_it(row_list());\n  ROW *first_row = r_it.data();\n  ROW *second_row = r_it.data_relative(1);\n\n  // initialize the bottom of a fictitious drop cap far above the first line.\n  int drop_cap_bottom = first_row->bounding_box().top() + first_row->bounding_box().height();\n  int drop_cap_right = first_row->bounding_box().left();\n  int mid_second_line = second_row->bounding_box().top() - second_row->bounding_box().height() / 2;\n  WERD_IT werd_it(r_it.data()->word_list()); // words of line one\n  
if (!werd_it.empty()) {\n    C_BLOB_IT cblob_it(werd_it.data()->cblob_list());\n    for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {\n      TBOX bbox = cblob_it.data()->bounding_box();\n      if (bbox.bottom() <= mid_second_line) {\n        // we found a real drop cap\n        first_row->set_has_drop_cap(true);\n        if (drop_cap_bottom > bbox.bottom()) {\n          drop_cap_bottom = bbox.bottom();\n        }\n        if (drop_cap_right < bbox.right()) {\n          drop_cap_right = bbox.right();\n        }\n      }\n    }\n  }\n\n  // Step Two: Calculate the margin from the text of each row to the block\n  //           (or drop-cap) boundaries.\n  PB_LINE_IT lines(pblock);\n  r_it.set_to_list(row_list());\n  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {\n    ROW *row = r_it.data();\n    TBOX row_box = row->bounding_box();\n    int left_y = row->base_line(row_box.left()) + row->x_height();\n    int left_margin;\n    const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_left(lines.get_line(left_y));\n    LeftMargin(segments_left.get(), row_box.left(), &left_margin);\n\n    if (row_box.top() >= drop_cap_bottom) {\n      int drop_cap_distance = row_box.left() - row->space() - drop_cap_right;\n      if (drop_cap_distance < 0) {\n        drop_cap_distance = 0;\n      }\n      if (drop_cap_distance < left_margin) {\n        left_margin = drop_cap_distance;\n      }\n    }\n\n    int right_y = row->base_line(row_box.right()) + row->x_height();\n    int right_margin;\n    const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_right(lines.get_line(right_y));\n    RightMargin(segments_right.get(), row_box.right(), &right_margin);\n    row->set_lmargin(left_margin);\n    row->set_rmargin(right_margin);\n  }\n}\n\n/**********************************************************************\n * PrintSegmentationStats\n *\n * Prints segmentation stats for the given block list.\n 
**********************************************************************/\n\nvoid PrintSegmentationStats(BLOCK_LIST *block_list) {\n  int num_blocks = 0;\n  int num_rows = 0;\n  int num_words = 0;\n  int num_blobs = 0;\n  BLOCK_IT block_it(block_list);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    BLOCK *block = block_it.data();\n    ++num_blocks;\n    ROW_IT row_it(block->row_list());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      ++num_rows;\n      ROW *row = row_it.data();\n      // Iterate over all werds in the row.\n      WERD_IT werd_it(row->word_list());\n      for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {\n        WERD *werd = werd_it.data();\n        ++num_words;\n        num_blobs += werd->cblob_list()->length();\n      }\n    }\n  }\n  tprintf(\"Block list stats:\\nBlocks = %d\\nRows = %d\\nWords = %d\\nBlobs = %d\\n\", num_blocks,\n          num_rows, num_words, num_blobs);\n}\n\n/**********************************************************************\n * ExtractBlobsFromSegmentation\n *\n * Extracts blobs from the given block list and adds them to the output list.\n * The block list must have been created by performing a page segmentation.\n **********************************************************************/\n\nvoid ExtractBlobsFromSegmentation(BLOCK_LIST *blocks, C_BLOB_LIST *output_blob_list) {\n  C_BLOB_IT return_list_it(output_blob_list);\n  BLOCK_IT block_it(blocks);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    BLOCK *block = block_it.data();\n    ROW_IT row_it(block->row_list());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      ROW *row = row_it.data();\n      // Iterate over all werds in the row.\n      WERD_IT werd_it(row->word_list());\n      for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {\n        WERD *werd = werd_it.data();\n        
return_list_it.move_to_last();\n        return_list_it.add_list_after(werd->cblob_list());\n        return_list_it.move_to_last();\n        return_list_it.add_list_after(werd->rej_cblob_list());\n      }\n    }\n  }\n}\n\n/**********************************************************************\n * RefreshWordBlobsFromNewBlobs()\n *\n * Refreshes the words in the block_list by using blobs in the\n * new_blobs list.\n * Block list must have word segmentation in it.\n * It consumes the blobs provided in the new_blobs list. The blobs leftover in\n * the new_blobs list after the call weren't matched to any blobs of the words\n * in block list.\n * The output not_found_blobs is a list of blobs from the original segmentation\n * in the block_list for which no corresponding new blobs were found.\n **********************************************************************/\n\nvoid RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs,\n                                  C_BLOB_LIST *not_found_blobs) {\n  // Now iterate over all the blobs in the segmentation_block_list_, and just\n  // replace the corresponding c-blobs inside the werds.\n  BLOCK_IT block_it(block_list);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    BLOCK *block = block_it.data();\n    if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {\n      continue; // Don't touch non-text blocks.\n    }\n    // Iterate over all rows in the block.\n    ROW_IT row_it(block->row_list());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      ROW *row = row_it.data();\n      // Iterate over all werds in the row.\n      WERD_IT werd_it(row->word_list());\n      WERD_LIST new_words;\n      WERD_IT new_words_it(&new_words);\n      for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {\n        WERD *werd = werd_it.extract();\n        WERD *new_werd = werd->ConstructWerdWithNewBlobs(new_blobs, 
not_found_blobs);\n        if (new_werd) {\n          // Insert this new werd into the actual row's werd-list. Remove the\n          // existing one.\n          new_words_it.add_after_then_move(new_werd);\n          delete werd;\n        } else {\n          // Reinsert the older word back, for lack of better options.\n          // This is critical since dropping the words messes up segmentation:\n          // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on.\n          new_words_it.add_after_then_move(werd);\n        }\n      }\n      // Get rid of the old word list & replace it with the new one.\n      row->word_list()->clear();\n      werd_it.move_to_first();\n      werd_it.add_list_after(&new_words);\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/ocrblock.h",
    "content": "/**********************************************************************\n * File:        ocrblock.h  (Formerly block.h)\n * Description: Page block class definition.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef OCRBLOCK_H\n#define OCRBLOCK_H\n\n#include \"ocrpara.h\"\n#include \"ocrrow.h\"\n#include \"pdblock.h\"\n\nnamespace tesseract {\n\nclass BLOCK; // forward decl\n\nELISTIZEH(BLOCK)\n\nclass TESS_API BLOCK : public ELIST<BLOCK>::LINK\n// page block\n{\n  friend class BLOCK_RECT_IT; // block iterator\npublic:\n  BLOCK() : re_rotation_(1.0f, 0.0f), classify_rotation_(1.0f, 0.0f), skew_(1.0f, 0.0f) {}\n  BLOCK(const char *name, ///< filename\n        bool prop,        ///< proportional\n        int16_t kern,     ///< kerning\n        int16_t space,    ///< spacing\n        TDimension xmin,  ///< bottom left\n        TDimension ymin,\n        TDimension xmax,  ///< top right\n        TDimension ymax);\n\n  ~BLOCK() = default;\n\n  /**\n   * set space size etc.\n   * @param prop proportional\n   * @param kern inter char size\n   * @param space inter word size\n   * @param ch_pitch pitch if fixed\n   */\n  void set_stats(bool prop, int16_t kern, int16_t space, int16_t ch_pitch) {\n    proportional = prop;\n    kerning = static_cast<int8_t>(kern);\n    spacing = space;\n    
pitch = ch_pitch;\n  }\n  /// set char size\n  void set_xheight(int32_t height) {\n    xheight = height;\n  }\n  /// set font class\n  void set_font_class(int16_t font) {\n    font_class = font;\n  }\n  /// return proportional\n  bool prop() const {\n    return proportional;\n  }\n  bool right_to_left() const {\n    return right_to_left_;\n  }\n  void set_right_to_left(bool value) {\n    right_to_left_ = value;\n  }\n  /// return pitch\n  int32_t fixed_pitch() const {\n    return pitch;\n  }\n  /// return kerning\n  int16_t kern() const {\n    return kerning;\n  }\n  /// return font class\n  int16_t font() const {\n    return font_class;\n  }\n  /// return spacing\n  int16_t space() const {\n    return spacing;\n  }\n  /// return filename\n  const char *name() const {\n    return filename.c_str();\n  }\n  /// return xheight\n  int32_t x_height() const {\n    return xheight;\n  }\n  float cell_over_xheight() const {\n    return cell_over_xheight_;\n  }\n  void set_cell_over_xheight(float ratio) {\n    cell_over_xheight_ = ratio;\n  }\n  /// get rows\n  ROW_LIST *row_list() {\n    return &rows;\n  }\n  // Compute the margins between the edges of each row and this block's\n  // polyblock, and store the results in the rows.\n  void compute_row_margins();\n\n  // get paragraphs\n  PARA_LIST *para_list() {\n    return &paras_;\n  }\n  /// get blobs\n  C_BLOB_LIST *blob_list() {\n    return &c_blobs;\n  }\n  C_BLOB_LIST *reject_blobs() {\n    return &rej_blobs;\n  }\n  FCOORD re_rotation() const {\n    return re_rotation_; // How to transform coords back to image.\n  }\n  void set_re_rotation(const FCOORD &rotation) {\n    re_rotation_ = rotation;\n  }\n  FCOORD classify_rotation() const {\n    return classify_rotation_; // Apply this before classifying.\n  }\n  void set_classify_rotation(const FCOORD &rotation) {\n    classify_rotation_ = rotation;\n  }\n  FCOORD skew() const {\n    return skew_; // Direction of true horizontal.\n  }\n  void set_skew(const FCOORD &skew) 
{\n    skew_ = skew;\n  }\n  const ICOORD &median_size() const {\n    return median_size_;\n  }\n  void set_median_size(int x, int y) {\n    median_size_.set_x(x);\n    median_size_.set_y(y);\n  }\n\n  Image render_mask(TBOX *mask_box) {\n    return pdblk.render_mask(re_rotation_, mask_box);\n  }\n\n  // Returns the bounding box including the desired combination of upper and\n  // lower noise/diacritic elements.\n  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;\n\n  // Reflects the polygon in the y-axis and recomputes the bounding_box.\n  // Does nothing to any contained rows/words/blobs etc.\n  void reflect_polygon_in_y_axis();\n\n  void rotate(const FCOORD &rotation);\n\n  /// decreasing y order\n  void sort_rows();\n\n  /// shrink white space\n  void compress();\n\n  /// check proportional\n  void check_pitch();\n\n  /// shrink white space and move by vector\n  void compress(const ICOORD vec);\n\n  /// dump whole table\n  void print(FILE *fp, bool dump);\n\n  BLOCK &operator=(const BLOCK &source);\n  PDBLK pdblk; ///< Page Description Block\n\nprivate:\n  bool proportional = false;       ///< proportional\n  bool right_to_left_ = false;     ///< major script is right to left.\n  int8_t kerning = 0;              ///< inter blob gap\n  int16_t spacing = 0;             ///< inter word gap\n  int16_t pitch = 0;               ///< pitch of non-props\n  int16_t font_class = 0;          ///< correct font class\n  int32_t xheight = 0;             ///< height of chars\n  float cell_over_xheight_ = 0.0f; ///< Ratio of cell height to xheight.\n  std::string filename;            ///< name of block\n  ROW_LIST rows;                   ///< rows in block\n  PARA_LIST paras_;                ///< paragraphs of block\n  C_BLOB_LIST c_blobs;             ///< before textord\n  C_BLOB_LIST rej_blobs;           ///< duff stuff\n  FCOORD re_rotation_;             ///< How to transform coords back to image.\n  FCOORD classify_rotation_;       ///< Apply this 
before classifying.\n  FCOORD skew_;                    ///< Direction of true horizontal.\n  ICOORD median_size_;             ///< Median size of blobs.\n};\n\n// A function to print segmentation stats for the given block list.\nvoid PrintSegmentationStats(BLOCK_LIST *block_list);\n\n// Extracts blobs from the given block list and adds them to the output list.\n// The block list must have been created by performing a page segmentation.\nvoid ExtractBlobsFromSegmentation(BLOCK_LIST *blocks, C_BLOB_LIST *output_blob_list);\n\n// Refreshes the words in the block_list by using blobs in the\n// new_blobs list.\n// Block list must have word segmentation in it.\n// It consumes the blobs provided in the new_blobs list. The blobs leftover in\n// the new_blobs list after the call weren't matched to any blobs of the words\n// in block list.\n// The output not_found_blobs is a list of blobs from the original segmentation\n// in the block_list for which no corresponding new blobs were found.\nvoid RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs,\n                                  C_BLOB_LIST *not_found_blobs);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/ocrpara.cpp",
    "content": "/////////////////////////////////////////////////////////////////////\n// File:        ocrpara.cpp\n// Description: OCR Paragraph Output Type\n// Author:      David Eger\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"ocrpara.h\"\n\n#include \"host.h\" // For NearlyEqual()\n\n#include <cstdio>\n\nnamespace tesseract {\n\nusing tesseract::JUSTIFICATION_CENTER;\nusing tesseract::JUSTIFICATION_LEFT;\nusing tesseract::JUSTIFICATION_RIGHT;\nusing tesseract::JUSTIFICATION_UNKNOWN;\n\nstatic const char *ParagraphJustificationToString(tesseract::ParagraphJustification justification) {\n  switch (justification) {\n    case JUSTIFICATION_LEFT:\n      return \"LEFT\";\n    case JUSTIFICATION_RIGHT:\n      return \"RIGHT\";\n    case JUSTIFICATION_CENTER:\n      return \"CENTER\";\n    default:\n      return \"UNKNOWN\";\n  }\n}\n\nbool ParagraphModel::ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const {\n  switch (justification_) {\n    case JUSTIFICATION_LEFT:\n      return NearlyEqual(lmargin + lindent, margin_ + first_indent_, tolerance_);\n    case JUSTIFICATION_RIGHT:\n      return NearlyEqual(rmargin + rindent, margin_ + first_indent_, tolerance_);\n    case JUSTIFICATION_CENTER:\n      return NearlyEqual(lindent, rindent, tolerance_ * 2);\n    default:\n      // shouldn't happen\n      return false;\n  
}\n}\n\nbool ParagraphModel::ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const {\n  switch (justification_) {\n    case JUSTIFICATION_LEFT:\n      return NearlyEqual(lmargin + lindent, margin_ + body_indent_, tolerance_);\n    case JUSTIFICATION_RIGHT:\n      return NearlyEqual(rmargin + rindent, margin_ + body_indent_, tolerance_);\n    case JUSTIFICATION_CENTER:\n      return NearlyEqual(lindent, rindent, tolerance_ * 2);\n    default:\n      // shouldn't happen\n      return false;\n  }\n}\n\nbool ParagraphModel::Comparable(const ParagraphModel &other) const {\n  if (justification_ != other.justification_) {\n    return false;\n  }\n  if (justification_ == JUSTIFICATION_CENTER || justification_ == JUSTIFICATION_UNKNOWN) {\n    return true;\n  }\n  int tolerance = (tolerance_ + other.tolerance_) / 4;\n  return NearlyEqual(margin_ + first_indent_, other.margin_ + other.first_indent_, tolerance) &&\n         NearlyEqual(margin_ + body_indent_, other.margin_ + other.body_indent_, tolerance);\n}\n\nstd::string ParagraphModel::ToString() const {\n  char buffer[200];\n  const char *alignment = ParagraphJustificationToString(justification_);\n  snprintf(buffer, sizeof(buffer), \"margin: %d, first_indent: %d, body_indent: %d, alignment: %s\",\n           margin_, first_indent_, body_indent_, alignment);\n  return std::string(buffer);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/ocrpara.h",
    "content": "/////////////////////////////////////////////////////////////////////\n// File:        ocrpara.h\n// Description: OCR Paragraph Output Type\n// Author:      David Eger\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCSTRUCT_OCRPARA_H_\n#define TESSERACT_CCSTRUCT_OCRPARA_H_\n\n#include \"elst.h\"\n\n#include <tesseract/publictypes.h>\n\nnamespace tesseract {\n\nclass ParagraphModel;\n\nstruct PARA : public ELIST<PARA>::LINK {\npublic:\n  PARA()\n      : model(nullptr)\n      , is_list_item(false)\n      , is_very_first_or_continuation(false)\n      , has_drop_cap(false) {}\n\n  // We do not own the model, we just reference it.\n  // model may be nullptr if there is not a good model for this paragraph.\n  const ParagraphModel *model;\n\n  bool is_list_item;\n\n  // The first paragraph on a page often lacks a first line indent, but should\n  // still be modeled by the same model as other body text paragraphs on the\n  // page.\n  bool is_very_first_or_continuation;\n\n  // Does this paragraph begin with a drop cap?\n  bool has_drop_cap;\n};\n\nELISTIZEH(PARA)\n\n// A geometric model of paragraph indentation and alignment.\n//\n// Measurements are in pixels. The meaning of the integer arguments changes\n// depending upon the value of justification.  
Distances less than or equal\n// to tolerance apart we take as \"equivalent\" for the purpose of model\n// matching, and in the examples below, we assume tolerance is zero.\n//\n// justification = LEFT:\n//   margin       the \"ignored\" margin to the left block edge.\n//   first_indent indent from the left margin to a typical first text line.\n//   body_indent  indent from the left margin of a typical body text line.\n//\n// justification = RIGHT:\n//   margin       the \"ignored\" margin to the right block edge.\n//   first_indent indent from the right margin to a typical first text line.\n//   body_indent  indent from the right margin of a typical body text line.\n//\n// justification = CENTER:\n//   margin       ignored\n//   first_indent ignored\n//   body_indent  ignored\n//\n//  ====== Extended example, assuming each letter is ten pixels wide: =======\n//\n// +--------------------------------+\n// |      Awesome                   | ParagraphModel(CENTER, 0, 0, 0)\n// |   Centered Title               |\n// | Paragraph Detection            |\n// |      OCR TEAM                  |\n// |  10 November 2010              |\n// |                                |\n// |  Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)\n// |This paragraph starts at the top|\n// |of the page and takes 3 lines.  |\n// |  Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)\n// |which indicates that the first  |\n// |paragraph is not a continuation |\n// |from a previous page, as it is  |\n// |indented just like this second  |\n// |paragraph.                      |\n// |   Here is a block quote. It    | ParagraphModel(LEFT, 30, 0, 0)\n// |   looks like the prior text    |\n// |   but it  is indented  more    |\n// |   and is fully justified.      |\n// |  So how does one deal with     | ParagraphModel(LEFT, 0, 20, 0)\n// |centered text, block quotes,    |\n// |normal paragraphs, and lists    |\n// |like what follows?              |\n// |1. Make a plan.             
    | ParagraphModel(LEFT, 0, 0, 30)\n// |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)\n// |   looking for lines where the  |\n// |   first word of the next line  |\n// |   would fit on the previous    |\n// |   line.                        |\n// |3. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)\n// |   Python and try it out.       |\n// |4. Determine how to fix the     | ParagraphModel(LEFT, 0, 0, 30)\n// |   mistakes.                    |\n// |5. Repeat.                      | ParagraphModel(LEFT, 0, 0, 30)\n// |  For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)\n// |you can try to identify source  |\n// |code.  Ouch!                    |\n// +--------------------------------+\nclass TESS_API ParagraphModel {\npublic:\n  ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent,\n                 int body_indent, int tolerance)\n      : justification_(justification)\n      , margin_(margin)\n      , first_indent_(first_indent)\n      , body_indent_(body_indent)\n      , tolerance_(tolerance) {\n    // Make sure one of {first_indent, body_indent} is 0.\n    int added_margin = first_indent;\n    if (body_indent < added_margin) {\n      added_margin = body_indent;\n    }\n    margin_ += added_margin;\n    first_indent_ -= added_margin;\n    body_indent_ -= added_margin;\n  }\n\n  ParagraphModel()\n      : justification_(tesseract::JUSTIFICATION_UNKNOWN)\n      , margin_(0)\n      , first_indent_(0)\n      , body_indent_(0)\n      , tolerance_(0) {}\n\n  // ValidFirstLine() and ValidBodyLine() take arguments describing a text line\n  // in a block of text which we are trying to model:\n  //   lmargin, lindent:  these add up to the distance from the leftmost ink\n  //                      in the text line to the surrounding text block's left\n  //                      edge.\n  //   rmargin, rindent:  these add up to the distance from the rightmost ink\n  //                      in the text 
 line to the surrounding text block's right\n  //                      edge.\n  // The caller determines the division between \"margin\" and \"indent\", which\n  // only actually affects whether we think the line may be centered.\n  //\n  // If the amount of whitespace matches the amount of whitespace expected on\n  // the relevant side of the line (within tolerance_) we say it matches.\n\n  // Return whether a given text line could be a first paragraph line according\n  // to this paragraph model.\n  bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;\n\n  // Return whether a given text line could be a body line according\n  // to this paragraph model.\n  bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;\n\n  tesseract::ParagraphJustification justification() const {\n    return justification_;\n  }\n  int margin() const {\n    return margin_;\n  }\n  int first_indent() const {\n    return first_indent_;\n  }\n  int body_indent() const {\n    return body_indent_;\n  }\n  int tolerance() const {\n    return tolerance_;\n  }\n  bool is_flush() const {\n    return (justification_ == tesseract::JUSTIFICATION_LEFT ||\n            justification_ == tesseract::JUSTIFICATION_RIGHT) &&\n           abs(first_indent_ - body_indent_) <= tolerance_;\n  }\n\n  // Return whether this model is likely to agree with the other model on most\n  // paragraphs they are marked.\n  bool Comparable(const ParagraphModel &other) const;\n\n  std::string ToString() const;\n\nprivate:\n  tesseract::ParagraphJustification justification_;\n  int margin_;\n  int first_indent_;\n  int body_indent_;\n  int tolerance_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_OCRPARA_H_\n"
  },
  {
    "path": "src/ccstruct/ocrrow.cpp",
    "content": "/**********************************************************************\n * File:        ocrrow.cpp  (Formerly row.c)\n * Description: Code for the ROW class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"blobbox.h\"\n#include \"ocrrow.h\"\n\nnamespace tesseract {\n\n/**********************************************************************\n * ROW::ROW\n *\n * Constructor to build a ROW. 
Only the stats stuff are given here.\n * The words are added directly.\n **********************************************************************/\nROW::ROW(                // constructor\n    int32_t spline_size, // no of segments\n    int32_t *xstarts,    // segment boundaries\n    double *coeffs,      // coefficients\n    float x_height,      // line height\n    float ascenders,     // ascender size\n    float descenders,    // descender drop\n    int16_t kern,        // char gap\n    int16_t space        // word gap\n    )\n    : baseline(spline_size, xstarts, coeffs), para_(nullptr) {\n  kerning = kern; // just store stuff\n  spacing = space;\n  xheight = x_height;\n  ascrise = ascenders;\n  bodysize = 0.0f;\n  descdrop = descenders;\n  has_drop_cap_ = false;\n  lmargin_ = 0;\n  rmargin_ = 0;\n}\n\n/**********************************************************************\n * ROW::ROW\n *\n * Constructor to build a ROW. Only the stats stuff are given here.\n * The words are added directly.\n **********************************************************************/\n\nROW::ROW(           // constructor\n    TO_ROW *to_row, // source row\n    int16_t kern,   // char gap\n    int16_t space   // word gap\n    )\n    : para_(nullptr) {\n  kerning = kern; // just store stuff\n  spacing = space;\n  xheight = to_row->xheight;\n  bodysize = to_row->body_size;\n  ascrise = to_row->ascrise;\n  descdrop = to_row->descdrop;\n  baseline = to_row->baseline;\n  has_drop_cap_ = false;\n  lmargin_ = 0;\n  rmargin_ = 0;\n}\n\n// Returns the bounding box including the desired combination of upper and\n// lower noise/diacritic elements.\nTBOX ROW::restricted_bounding_box(bool upper_dots, bool lower_dots) const {\n  TBOX box;\n  // This is a read-only iteration of the words in the row.\n  WERD_IT it(const_cast<WERD_LIST *>(&words));\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    box += it.data()->restricted_bounding_box(upper_dots, lower_dots);\n  }\n  return 
box;\n}\n\n/**********************************************************************\n * ROW::recalc_bounding_box\n *\n * Set the bounding box correctly\n **********************************************************************/\n\nvoid ROW::recalc_bounding_box() { // recalculate BB\n  WERD *word;                     // current word\n  WERD_IT it = &words;            // words of ROW\n  int16_t left;                   // of word\n  int16_t prev_left;              // old left\n\n  if (!it.empty()) {\n    word = it.data();\n    prev_left = word->bounding_box().left();\n    it.forward();\n    while (!it.at_first()) {\n      word = it.data();\n      left = word->bounding_box().left();\n      if (left < prev_left) {\n        it.move_to_first();\n        // words in BB order\n        it.sort(word_comparator);\n        break;\n      }\n      prev_left = left;\n      it.forward();\n    }\n  }\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    word = it.data();\n    if (it.at_first()) {\n      word->set_flag(W_BOL, true);\n    } else {\n      // not start of line\n      word->set_flag(W_BOL, false);\n    }\n    if (it.at_last()) {\n      word->set_flag(W_EOL, true);\n    } else {\n      // not end of line\n      word->set_flag(W_EOL, false);\n    }\n    // extend BB as reqd\n    bound_box += word->bounding_box();\n  }\n}\n\n/**********************************************************************\n * ROW::move\n *\n * Reposition row by vector\n **********************************************************************/\n\nvoid ROW::move(      // reposition row\n    const ICOORD vec // by vector\n) {\n  WERD_IT it(&words); // word iterator\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->move(vec);\n  }\n\n  bound_box.move(vec);\n  baseline.move(vec);\n}\n\n/**********************************************************************\n * ROW::print\n *\n * Display members\n 
**********************************************************************/\n\nvoid ROW::print( // print\n    FILE *fp     // file to print on\n) const {\n  tprintf(\"Kerning= %d\\n\", kerning);\n  tprintf(\"Spacing= %d\\n\", spacing);\n  bound_box.print();\n  tprintf(\"Xheight= %f\\n\", xheight);\n  tprintf(\"Ascrise= %f\\n\", ascrise);\n  tprintf(\"Descdrop= %f\\n\", descdrop);\n  tprintf(\"has_drop_cap= %d\\n\", has_drop_cap_);\n  tprintf(\"lmargin= %d, rmargin= %d\\n\", lmargin_, rmargin_);\n}\n\n/**********************************************************************\n * ROW::plot\n *\n * Draw the ROW in the given colour.\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid ROW::plot(              // draw it\n    ScrollView *window,      // window to draw in\n    ScrollView::Color colour // colour to draw in\n) {\n  WERD *word;          // current word\n  WERD_IT it = &words; // words of ROW\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    word = it.data();\n    word->plot(window, colour); // all in one colour\n  }\n}\n\n/**********************************************************************\n * ROW::plot\n *\n * Draw the ROW in rainbow colours.\n **********************************************************************/\n\nvoid ROW::plot(        // draw it\n    ScrollView *window // window to draw in\n) {\n  WERD *word;          // current word\n  WERD_IT it = &words; // words of ROW\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    word = it.data();\n    word->plot(window); // in rainbow colours\n  }\n}\n#endif // !GRAPHICS_DISABLED\n\n/**********************************************************************\n * ROW::operator=\n *\n * Assign rows by duplicating the row structure but NOT the WERDLIST\n **********************************************************************/\n\nROW &ROW::operator=(const ROW &source) {\n  this->ELIST<ROW>::LINK::operator=(source);\n  kerning = 
source.kerning;\n  spacing = source.spacing;\n  xheight = source.xheight;\n  bodysize = source.bodysize;\n  ascrise = source.ascrise;\n  descdrop = source.descdrop;\n  if (!words.empty()) {\n    words.clear();\n  }\n  baseline = source.baseline; // QSPLINES must do =\n  bound_box = source.bound_box;\n  has_drop_cap_ = source.has_drop_cap_;\n  lmargin_ = source.lmargin_;\n  rmargin_ = source.rmargin_;\n  para_ = source.para_;\n  return *this;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/ocrrow.h",
    "content": "/**********************************************************************\n * File:        ocrrow.h  (Formerly row.h)\n * Description: Code for the ROW class.\n * Author:      Ray Smith\n * Created:     Tue Oct 08 15:58:04 BST 1991\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef OCRROW_H\n#define OCRROW_H\n\n#include \"elst.h\"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK\n#include \"quspline.h\"   // for QSPLINE\n#include \"rect.h\"       // for TBOX\n#include \"scrollview.h\" // for ScrollView, ScrollView::Color\n#include \"werd.h\"       // for WERD_LIST\n\n#include <cstdint> // for int16_t, int32_t\n#include <cstdio>  // for FILE\n\nnamespace tesseract {\n\nclass ICOORD;\nclass TO_ROW;\n\nstruct PARA;\n\nclass ROW : public ELIST<ROW>::LINK {\n  friend void tweak_row_baseline(ROW *, double, double);\n\npublic:\n  ROW() = default;\n  ROW(                     // constructor\n      int32_t spline_size, // no of segments\n      int32_t *xstarts,    // segment boundaries\n      double *coeffs,      // coefficients //ascender size\n      float x_height, float ascenders,\n      float descenders, // descender size\n      int16_t kern,     // char gap\n      int16_t space);   // word gap\n  ROW(                  // constructor\n      TO_ROW *row,      // textord row\n      int16_t kern,     // char gap\n    
  int16_t space);   // word gap\n\n  WERD_LIST *word_list() { // get words\n    return &words;\n  }\n\n  float base_line(        // compute baseline\n      float xpos) const { // at the position\n    // get spline value\n    return static_cast<float>(baseline.y(xpos));\n  }\n  float x_height() const { // return x height\n    return xheight;\n  }\n  void set_x_height(float new_xheight) { // set x height\n    xheight = new_xheight;\n  }\n  int32_t kern() const { // return kerning\n    return kerning;\n  }\n  float body_size() const { // return body size\n    return bodysize;\n  }\n  void set_body_size(float new_size) { // set body size\n    bodysize = new_size;\n  }\n  int32_t space() const { // return spacing\n    return spacing;\n  }\n  float ascenders() const { // return size\n    return ascrise;\n  }\n  float descenders() const { // return size\n    return descdrop;\n  }\n  TBOX bounding_box() const { // return bounding box\n    return bound_box;\n  }\n  // Returns the bounding box including the desired combination of upper and\n  // lower noise/diacritic elements.\n  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;\n\n  void set_lmargin(int16_t lmargin) {\n    lmargin_ = lmargin;\n  }\n  void set_rmargin(int16_t rmargin) {\n    rmargin_ = rmargin;\n  }\n  int16_t lmargin() const {\n    return lmargin_;\n  }\n  int16_t rmargin() const {\n    return rmargin_;\n  }\n\n  void set_has_drop_cap(bool has) {\n    has_drop_cap_ = has;\n  }\n  bool has_drop_cap() const {\n    return has_drop_cap_;\n  }\n\n  void set_para(PARA *p) {\n    para_ = p;\n  }\n  PARA *para() const {\n    return para_;\n  }\n\n  void recalc_bounding_box(); // recalculate BB\n\n  void move(             // reposition row\n      const ICOORD vec); // by vector\n\n  void print(    // print\n      FILE *fp) const; // file to print on\n\n#ifndef GRAPHICS_DISABLED\n  void plot(                     // draw one\n      ScrollView *window,        // window to draw in\n      
ScrollView::Color colour); // uniform colour\n  void plot(                     // draw one\n      ScrollView *window);       // in rainbow colours\n\n  void plot_baseline(             // draw the baseline\n      ScrollView *window,         // window to draw in\n      ScrollView::Color colour) { // colour to draw\n    // draw it\n    baseline.plot(window, colour);\n  }\n#endif // !GRAPHICS_DISABLED\n  ROW &operator=(const ROW &source);\n\nprivate:\n  // Copy constructor (currently unused, therefore private).\n  ROW(const ROW &source) = delete;\n\n  int32_t kerning;  // inter char gap\n  int32_t spacing;  // inter word gap\n  TBOX bound_box;   // bounding box\n  float xheight;    // height of line\n  float ascrise;    // size of ascenders\n  float descdrop;   //-size of descenders\n  float bodysize;   // CJK character size. (equals to\n                    // xheight+ascrise by default)\n  WERD_LIST words;  // words\n  QSPLINE baseline; // baseline spline\n\n  // These get set after blocks have been determined.\n  bool has_drop_cap_;\n  int16_t lmargin_; // Distance to left polyblock margin.\n  int16_t rmargin_; // Distance to right polyblock margin.\n\n  // This gets set during paragraph analysis.\n  PARA *para_; // Paragraph of which this row is part.\n};\n\nELISTIZEH(ROW)\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/otsuthr.cpp",
    "content": "/**********************************************************************\n * File:        otsuthr.cpp\n * Description: Simple Otsu thresholding for binarizing images.\n * Author:      Ray Smith\n *\n * (C) Copyright 2008, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"otsuthr.h\"\n\n#include <allheaders.h>\n#include <cstring>\n#include \"helpers.h\"\n\nnamespace tesseract {\n\n// Computes the Otsu threshold(s) for the given image rectangle, making one\n// for each channel. Each channel is always one byte per pixel.\n// Returns an array of threshold values and an array of hi_values, such\n// that a pixel value >threshold[channel] is considered foreground if\n// hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates\n// that there is no apparent foreground. 
At least one hi_value will not be -1.\n// The return value is the number of channels in the input image, being\n// the size of the output thresholds and hi_values arrays.\nint OtsuThreshold(Image src_pix, int left, int top, int width, int height, std::vector<int> &thresholds,\n                  std::vector<int> &hi_values) {\n  int num_channels = pixGetDepth(src_pix) / 8;\n  // Of all channels with no good hi_value, keep the best so we can always\n  // produce at least one answer.\n  int best_hi_value = 1;\n  int best_hi_index = 0;\n  bool any_good_hivalue = false;\n  double best_hi_dist = 0.0;\n  thresholds.resize(num_channels);\n  hi_values.resize(num_channels);\n\n  for (int ch = 0; ch < num_channels; ++ch) {\n    thresholds[ch] = -1;\n    hi_values[ch] = -1;\n    // Compute the histogram of the image rectangle.\n    int histogram[kHistogramSize];\n    HistogramRect(src_pix, ch, left, top, width, height, histogram);\n    int H;\n    int best_omega_0;\n    int best_t = OtsuStats(histogram, &H, &best_omega_0);\n    if (best_omega_0 == 0 || best_omega_0 == H) {\n      // This channel is empty.\n      continue;\n    }\n    // To be a convincing foreground we must have a small fraction of H\n    // or to be a convincing background we must have a large fraction of H.\n    // In between we assume this channel contains no thresholding information.\n    int hi_value = best_omega_0 < H * 0.5;\n    thresholds[ch] = best_t;\n    if (best_omega_0 > H * 0.75) {\n      any_good_hivalue = true;\n      hi_values[ch] = 0;\n    } else if (best_omega_0 < H * 0.25) {\n      any_good_hivalue = true;\n      hi_values[ch] = 1;\n    } else {\n      // In case all channels are like this, keep the best of the bad lot.\n      double hi_dist = hi_value ? 
(H - best_omega_0) : best_omega_0;\n      if (hi_dist > best_hi_dist) {\n        best_hi_dist = hi_dist;\n        best_hi_value = hi_value;\n        best_hi_index = ch;\n      }\n    }\n  }\n\n  if (!any_good_hivalue) {\n    // Use the best of the ones that were not good enough.\n    hi_values[best_hi_index] = best_hi_value;\n  }\n  return num_channels;\n}\n\n// Computes the histogram for the given image rectangle, and the given\n// single channel. Each channel is always one byte per pixel.\n// Histogram is always a kHistogramSize(256) element array to count\n// occurrences of each pixel value.\nvoid HistogramRect(Image src_pix, int channel, int left, int top, int width, int height,\n                   int *histogram) {\n  int num_channels = pixGetDepth(src_pix) / 8;\n  channel = ClipToRange(channel, 0, num_channels - 1);\n  int bottom = top + height;\n  memset(histogram, 0, sizeof(*histogram) * kHistogramSize);\n  int src_wpl = pixGetWpl(src_pix);\n  l_uint32 *srcdata = pixGetData(src_pix);\n  for (int y = top; y < bottom; ++y) {\n    const l_uint32 *linedata = srcdata + y * src_wpl;\n    for (int x = 0; x < width; ++x) {\n      int pixel = GET_DATA_BYTE(linedata, (x + left) * num_channels + channel);\n      ++histogram[pixel];\n    }\n  }\n}\n\n// Computes the Otsu threshold(s) for the given histogram.\n// Also returns H = total count in histogram, and\n// omega0 = count of histogram below threshold.\nint OtsuStats(const int *histogram, int *H_out, int *omega0_out) {\n  int H = 0;\n  double mu_T = 0.0;\n  for (int i = 0; i < kHistogramSize; ++i) {\n    H += histogram[i];\n    mu_T += static_cast<double>(i) * histogram[i];\n  }\n\n  // Now maximize sig_sq_B over t.\n  // http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf\n  int best_t = -1;\n  int omega_0, omega_1;\n  int best_omega_0 = 0;\n  double best_sig_sq_B = 0.0;\n  double mu_0, mu_1, mu_t;\n  omega_0 = 0;\n  mu_t = 0.0;\n  for (int t = 0; t < kHistogramSize - 1; ++t) {\n    omega_0 += 
histogram[t];\n    mu_t += t * static_cast<double>(histogram[t]);\n    if (omega_0 == 0) {\n      continue;\n    }\n    omega_1 = H - omega_0;\n    if (omega_1 == 0) {\n      break;\n    }\n    mu_0 = mu_t / omega_0;\n    mu_1 = (mu_T - mu_t) / omega_1;\n    double sig_sq_B = mu_1 - mu_0;\n    sig_sq_B *= sig_sq_B * omega_0 * omega_1;\n    if (best_t < 0 || sig_sq_B > best_sig_sq_B) {\n      best_sig_sq_B = sig_sq_B;\n      best_t = t;\n      best_omega_0 = omega_0;\n    }\n  }\n  if (H_out != nullptr) {\n    *H_out = H;\n  }\n  if (omega0_out != nullptr) {\n    *omega0_out = best_omega_0;\n  }\n  return best_t;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccstruct/otsuthr.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        otsuthr.h\n// Description: Simple Otsu thresholding for binarizing images.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCMAIN_OTSUTHR_H_\n#define TESSERACT_CCMAIN_OTSUTHR_H_\n\n#include \"image.h\"\n\n#include <vector> // for std::vector\n\nstruct Pix;\n\nnamespace tesseract {\n\nconst int kHistogramSize = 256; // The size of a histogram of pixel values.\n\n// Computes the Otsu threshold(s) for the given image rectangle, making one\n// for each channel. Each channel is always one byte per pixel.\n// Returns an array of threshold values and an array of hi_values, such\n// that a pixel value >threshold[channel] is considered foreground if\n// hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates\n// that there is no apparent foreground. At least one hi_value will not be -1.\n// The return value is the number of channels in the input image, being\n// the size of the output thresholds and hi_values arrays.\nint OtsuThreshold(Image src_pix, int left, int top, int width, int height,\n                  std::vector<int> &thresholds,\n                  std::vector<int> &hi_values);\n\n// Computes the histogram for the given image rectangle, and the given\n// single channel. 
Each channel is always one byte per pixel.\n// Histogram is always a kHistogramSize(256) element array to count\n// occurrences of each pixel value.\nvoid HistogramRect(Image src_pix, int channel, int left, int top, int width, int height,\n                   int *histogram);\n\n// Computes the Otsu threshold(s) for the given histogram.\n// Also returns H = total count in histogram, and\n// omega0 = count of histogram below threshold.\nint OtsuStats(const int *histogram, int *H_out, int *omega0_out);\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCMAIN_OTSUTHR_H_\n"
  },
  {
    "path": "src/ccstruct/pageres.cpp",
    "content": "/**********************************************************************\n * File:        pageres.cpp  (Formerly page_res.c)\n * Description: Hierarchy of results classes from PAGE_RES to WERD_RES\n *              and an iterator class to iterate over the words.\n * Main purposes:\n *              Easy way to iterate over the words without a 3-nested loop.\n *              Holds data used during word recognition.\n *              Holds information about alternative spacing paths.\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"pageres.h\"\n\n#include \"blamer.h\"   // for BlamerBundle\n#include \"blobs.h\"    // for TWERD, TBLOB\n#include \"boxword.h\"  // for BoxWord\n#include \"errcode.h\"  // for ASSERT_HOST\n#include \"ocrblock.h\" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)\n#include \"ocrrow.h\"   // for ROW, ROW_IT\n#include \"pdblock.h\"  // for PDBLK\n#include \"polyblk.h\"  // for POLY_BLOCK\n#include \"seam.h\"     // for SEAM, start_seam_list\n#include \"stepblob.h\" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST\n#include \"tprintf.h\"  // for tprintf\n\n#include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY\n\n#include <cassert> // for assert\n#include <cstdint> // for INT32_MAX\n#include <cstring> // for strlen\n\nstruct Pix;\n\nnamespace tesseract 
{\n\n// Gain factor for computing thresholds that determine the ambiguity of a\n// word.\nstatic const double kStopperAmbiguityThresholdGain = 8.0;\n// Constant offset for computing thresholds that determine the ambiguity of a\n// word.\nstatic const double kStopperAmbiguityThresholdOffset = 1.5;\n// Max number of broken pieces to associate.\nconst int kWordrecMaxNumJoinChunks = 4;\n// Max ratio of word box height to line size to allow it to be processed as\n// a line with other words.\nconst double kMaxWordSizeRatio = 1.25;\n// Max ratio of line box height to line size to allow a new word to be added.\nconst double kMaxLineSizeRatio = 1.25;\n// Max ratio of word gap to line size to allow a new word to be added.\nconst double kMaxWordGapRatio = 2.0;\n\n// Computes and returns a threshold of certainty difference used to determine\n// which words to keep, based on the adjustment factors of the two words.\n// TODO(rays) This is horrible. Replace with an enhance params training model.\nstatic double StopperAmbigThreshold(double f1, double f2) {\n  return (f2 - f1) * kStopperAmbiguityThresholdGain -\n         kStopperAmbiguityThresholdOffset;\n}\n\n/*************************************************************************\n * PAGE_RES::PAGE_RES\n *\n * Constructor for page results\n *************************************************************************/\nPAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list,\n                   WERD_CHOICE **prev_word_best_choice_ptr) {\n  Init();\n  BLOCK_IT block_it(the_block_list);\n  BLOCK_RES_IT block_res_it(&block_res_list);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block_res_it.add_to_end(\n        new BLOCK_RES(merge_similar_words, block_it.data()));\n  }\n  prev_word_best_choice = prev_word_best_choice_ptr;\n}\n\n/*************************************************************************\n * BLOCK_RES::BLOCK_RES\n *\n * Constructor for BLOCK results\n 
*************************************************************************/\n\nBLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {\n  ROW_IT row_it(the_block->row_list());\n  ROW_RES_IT row_res_it(&row_res_list);\n\n  char_count = 0;\n  rej_count = 0;\n  font_class = -1; // not assigned\n  x_height = -1.0;\n  font_assigned = false;\n  row_count = 0;\n\n  block = the_block;\n\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));\n  }\n}\n\n/*************************************************************************\n * ROW_RES::ROW_RES\n *\n * Constructor for ROW results\n *************************************************************************/\n\nROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {\n  WERD_IT word_it(the_row->word_list());\n  WERD_RES_IT word_res_it(&word_res_list);\n  WERD_RES *combo = nullptr; // current combination of fuzzies\n  WERD *copy_word;\n\n  char_count = 0;\n  rej_count = 0;\n  whole_word_rej_count = 0;\n\n  row = the_row;\n  bool add_next_word = false;\n  TBOX union_box;\n  float line_height =\n      the_row->x_height() + the_row->ascenders() - the_row->descenders();\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    auto *word_res = new WERD_RES(word_it.data());\n    word_res->x_height = the_row->x_height();\n    if (add_next_word) {\n      ASSERT_HOST(combo != nullptr);\n      // We are adding this word to the combination.\n      word_res->part_of_combo = true;\n      combo->copy_on(word_res);\n    } else if (merge_similar_words) {\n      union_box = word_res->word->bounding_box();\n      add_next_word = !word_res->word->flag(W_REP_CHAR) &&\n                      union_box.height() <= line_height * kMaxWordSizeRatio;\n      word_res->odd_size = !add_next_word;\n    }\n    WERD *next_word = word_it.data_relative(1);\n    if (merge_similar_words) {\n      if (add_next_word && 
!next_word->flag(W_REP_CHAR)) {\n        // Next word will be added on if all of the following are true:\n        // Not a rep char.\n        // Box height small enough.\n        // Union box height small enough.\n        // Horizontal gap small enough.\n        TBOX next_box = next_word->bounding_box();\n        int prev_right = union_box.right();\n        union_box += next_box;\n        if (next_box.height() > line_height * kMaxWordSizeRatio ||\n            union_box.height() > line_height * kMaxLineSizeRatio ||\n            next_box.left() > prev_right + line_height * kMaxWordGapRatio) {\n          add_next_word = false;\n        }\n      }\n      next_word->set_flag(W_FUZZY_NON, add_next_word);\n    } else {\n      add_next_word = next_word->flag(W_FUZZY_NON);\n    }\n    if (add_next_word) {\n      if (combo == nullptr) {\n        copy_word = new WERD;\n        *copy_word = *(word_it.data()); // deep copy\n        combo = new WERD_RES(copy_word);\n        combo->x_height = the_row->x_height();\n        combo->combination = true;\n        word_res_it.add_to_end(combo);\n      }\n      word_res->part_of_combo = true;\n    } else {\n      combo = nullptr;\n    }\n    word_res_it.add_to_end(word_res);\n  }\n}\n\nWERD_RES &WERD_RES::operator=(const WERD_RES &source) {\n  this->ELIST<WERD_RES>::LINK::operator=(source);\n  Clear();\n  if (source.combination) {\n    word = new WERD;\n    *word = *(source.word); // deep copy\n  } else {\n    word = source.word; // pt to same word\n  }\n  if (source.bln_boxes != nullptr) {\n    bln_boxes = new tesseract::BoxWord(*source.bln_boxes);\n  }\n  if (source.chopped_word != nullptr) {\n    chopped_word = new TWERD(*source.chopped_word);\n  }\n  if (source.rebuild_word != nullptr) {\n    rebuild_word = new TWERD(*source.rebuild_word);\n  }\n  // TODO(rays) Do we ever need to copy the seam_array?\n  blob_row = source.blob_row;\n  denorm = source.denorm;\n  if (source.box_word != nullptr) {\n    box_word = new 
tesseract::BoxWord(*source.box_word);\n  }\n  best_state = source.best_state;\n  correct_text = source.correct_text;\n  blob_widths = source.blob_widths;\n  blob_gaps = source.blob_gaps;\n  // None of the uses of operator= require the ratings matrix to be copied,\n  // so don't as it would be really slow.\n\n  // Copy the cooked choices.\n  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));\n  WERD_CHOICE_IT wc_dest_it(&best_choices);\n  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {\n    const WERD_CHOICE *choice = wc_it.data();\n    wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));\n  }\n  if (!wc_dest_it.empty()) {\n    wc_dest_it.move_to_first();\n    best_choice = wc_dest_it.data();\n  } else {\n    best_choice = nullptr;\n  }\n\n  if (source.raw_choice != nullptr) {\n    raw_choice = new WERD_CHOICE(*source.raw_choice);\n  } else {\n    raw_choice = nullptr;\n  }\n  if (source.ep_choice != nullptr) {\n    ep_choice = new WERD_CHOICE(*source.ep_choice);\n  } else {\n    ep_choice = nullptr;\n  }\n  reject_map = source.reject_map;\n  combination = source.combination;\n  part_of_combo = source.part_of_combo;\n  CopySimpleFields(source);\n  if (source.blamer_bundle != nullptr) {\n    blamer_bundle = new BlamerBundle(*(source.blamer_bundle));\n  }\n  return *this;\n}\n\n// Copies basic fields that don't involve pointers that might be useful\n// to copy when making one WERD_RES from another.\nvoid WERD_RES::CopySimpleFields(const WERD_RES &source) {\n  tess_failed = source.tess_failed;\n  tess_accepted = source.tess_accepted;\n  tess_would_adapt = source.tess_would_adapt;\n  done = source.done;\n  unlv_crunch_mode = source.unlv_crunch_mode;\n  small_caps = source.small_caps;\n  odd_size = source.odd_size;\n  fontinfo = source.fontinfo;\n  fontinfo2 = source.fontinfo2;\n  fontinfo_id_count = source.fontinfo_id_count;\n  fontinfo_id2_count = source.fontinfo_id2_count;\n  x_height = source.x_height;\n  caps_height 
= source.caps_height;\n  baseline_shift = source.baseline_shift;\n  guessed_x_ht = source.guessed_x_ht;\n  guessed_caps_ht = source.guessed_caps_ht;\n  reject_spaces = source.reject_spaces;\n  uch_set = source.uch_set;\n  tesseract = source.tesseract;\n}\n\n// Initializes a blank (default constructed) WERD_RES from one that has\n// already been recognized.\n// Use SetupFor*Recognition afterwards to complete the setup and make\n// it ready for a retry recognition.\nvoid WERD_RES::InitForRetryRecognition(const WERD_RES &source) {\n  word = source.word;\n  CopySimpleFields(source);\n  if (source.blamer_bundle != nullptr) {\n    blamer_bundle = new BlamerBundle();\n    blamer_bundle->CopyTruth(*source.blamer_bundle);\n  }\n}\n\n// Sets up the members used in recognition: bln_boxes, chopped_word,\n// seam_array, denorm.  Returns false if\n// the word is empty and sets up fake results.  If use_body_size is\n// true and row->body_size is set, then body_size will be used for\n// blob normalization instead of xheight + ascrise. 
This flag is for\n// those languages that are using CJK pitch model and thus it has to\n// be true if and only if tesseract->textord_use_cjk_fp_model is\n// true.\n// If allow_detailed_fx is true, the feature extractor will receive fine\n// precision outline information, allowing smoother features and better\n// features on low resolution images.\n// The norm_mode_hint sets the default mode for normalization in absence\n// of any of the above flags.\n// norm_box is used to override the word bounding box to determine the\n// normalization scale and offset.\n// Returns false if the word is empty and sets up fake results.\nbool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in,\n                                   tesseract::Tesseract *tess, Image pix,\n                                   int norm_mode, const TBOX *norm_box,\n                                   bool numeric_mode, bool use_body_size,\n                                   bool allow_detailed_fx, ROW *row,\n                                   const BLOCK *block) {\n  auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode);\n  tesseract = tess;\n  POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;\n  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&\n       word->cblob_list()->empty()) ||\n      (pb != nullptr && !pb->IsText())) {\n    // Empty words occur when all the blobs have been moved to the rej_blobs\n    // list, which seems to occur frequently in junk.\n    SetupFake(unicharset_in);\n    word->set_flag(W_REP_CHAR, false);\n    return false;\n  }\n  ClearResults();\n  SetupWordScript(unicharset_in);\n  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);\n  float word_xheight =\n      use_body_size && row != nullptr && row->body_size() > 0.0f\n          ? 
row->body_size()\n          : x_height;\n  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),\n                            word_xheight, baseline_shift, numeric_mode,\n                            norm_mode_hint, norm_box, &denorm);\n  blob_row = row;\n  SetupBasicsFromChoppedWord(unicharset_in);\n  SetupBlamerBundle();\n  int num_blobs = chopped_word->NumBlobs();\n  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);\n  tess_failed = false;\n  return true;\n}\n\n// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty\n// accumulators from a made chopped word.  We presume the fields are already\n// empty.\nvoid WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {\n  bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);\n  start_seam_list(chopped_word, &seam_array);\n  SetupBlobWidthsAndGaps();\n  ClearWordChoices();\n}\n\n// Sets up the members used in recognition for an empty recognition result:\n// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.\nvoid WERD_RES::SetupFake(const UNICHARSET &unicharset_in) {\n  ClearResults();\n  SetupWordScript(unicharset_in);\n  chopped_word = new TWERD;\n  rebuild_word = new TWERD;\n  bln_boxes = new tesseract::BoxWord;\n  box_word = new tesseract::BoxWord;\n  int blob_count = word->cblob_list()->length();\n  if (blob_count > 0) {\n    auto **fake_choices = new BLOB_CHOICE *[blob_count];\n    // For non-text blocks, just pass any blobs through to the box_word\n    // and call the word failed with a fake classification.\n    C_BLOB_IT b_it(word->cblob_list());\n    int blob_id = 0;\n    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n      TBOX box = b_it.data()->bounding_box();\n      box_word->InsertBox(box_word->length(), box);\n      fake_choices[blob_id++] = new BLOB_CHOICE;\n    }\n    FakeClassifyWord(blob_count, fake_choices);\n    delete[] fake_choices;\n  } else {\n    auto *word = new 
WERD_CHOICE(&unicharset_in);\n    word->make_bad();\n    LogNewRawChoice(word);\n    // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.\n    LogNewCookedChoice(1, false, word);\n  }\n  tess_failed = true;\n  done = true;\n}\n\nvoid WERD_RES::SetupWordScript(const UNICHARSET &uch) {\n  uch_set = &uch;\n  int script = uch.default_sid();\n  word->set_script_id(script);\n  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());\n  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());\n}\n\n// Sets up the blamer_bundle if it is not null, using the initialized denorm.\nvoid WERD_RES::SetupBlamerBundle() {\n  if (blamer_bundle != nullptr) {\n    blamer_bundle->SetupNormTruthWord(denorm);\n  }\n}\n\n// Computes the blob_widths and blob_gaps from the chopped_word.\nvoid WERD_RES::SetupBlobWidthsAndGaps() {\n  blob_widths.clear();\n  blob_gaps.clear();\n  int num_blobs = chopped_word->NumBlobs();\n  for (int b = 0; b < num_blobs; ++b) {\n    TBLOB *blob = chopped_word->blobs[b];\n    TBOX box = blob->bounding_box();\n    blob_widths.push_back(box.width());\n    if (b + 1 < num_blobs) {\n      blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() -\n                          box.right());\n    }\n  }\n}\n\n// Updates internal data to account for a new SEAM (chop) at the given\n// blob_number. 
Fixes the ratings matrix and states in the choices, as well\n// as the blob widths and gaps.\nvoid WERD_RES::InsertSeam(int blob_number, SEAM *seam) {\n  // Insert the seam into the SEAMS array.\n  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);\n  seam_array.insert(seam_array.begin() + blob_number, seam);\n  if (ratings != nullptr) {\n    // Expand the ratings matrix.\n    ratings = ratings->ConsumeAndMakeBigger(blob_number);\n    // Fix all the segmentation states.\n    if (raw_choice != nullptr) {\n      raw_choice->UpdateStateForSplit(blob_number);\n    }\n    WERD_CHOICE_IT wc_it(&best_choices);\n    for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {\n      WERD_CHOICE *choice = wc_it.data();\n      choice->UpdateStateForSplit(blob_number);\n    }\n    SetupBlobWidthsAndGaps();\n  }\n}\n\n// Returns true if all the word choices except the first have adjust_factors\n// worse than the given threshold.\nbool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {\n  // The choices are not changed by this iteration.\n  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));\n  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {\n    WERD_CHOICE *choice = wc_it.data();\n    if (choice->adjust_factor() <= threshold) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Returns true if the current word is ambiguous (by number of answers or\n// by dangerous ambigs.)\nbool WERD_RES::IsAmbiguous() {\n  return !best_choices.singleton() || best_choice->dangerous_ambig_found();\n}\n\n// Returns true if the ratings matrix size matches the sum of each of the\n// segmentation states.\nbool WERD_RES::StatesAllValid() {\n  unsigned ratings_dim = ratings->dimension();\n  if (raw_choice->TotalOfStates() != ratings_dim) {\n    tprintf(\"raw_choice has total of states = %u vs ratings dim of %u\\n\",\n            raw_choice->TotalOfStates(), ratings_dim);\n    return false;\n  }\n  
WERD_CHOICE_IT it(&best_choices);\n  unsigned index = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {\n    WERD_CHOICE *choice = it.data();\n    if (choice->TotalOfStates() != ratings_dim) {\n      tprintf(\"Cooked #%u has total of states = %u vs ratings dim of %u\\n\",\n              index, choice->TotalOfStates(), ratings_dim);\n      return false;\n    }\n  }\n  return true;\n}\n\n// Prints a list of words found if debug is true or the word result matches\n// the word_to_debug.\nvoid WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) {\n  if (debug || (word_to_debug != nullptr && *word_to_debug != '\\0' &&\n                best_choice != nullptr &&\n                best_choice->unichar_string() == std::string(word_to_debug))) {\n    if (raw_choice != nullptr) {\n      raw_choice->print(\"\\nBest Raw Choice\");\n    }\n\n    WERD_CHOICE_IT it(&best_choices);\n    int index = 0;\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {\n      WERD_CHOICE *choice = it.data();\n      std::string label;\n      label += \"\\nCooked Choice #\" + std::to_string(index);\n      choice->print(label.c_str());\n    }\n  }\n}\n\n// Prints the top choice along with the accepted/done flags.\nvoid WERD_RES::DebugTopChoice(const char *msg) const {\n  tprintf(\"Best choice: accepted=%d, adaptable=%d, done=%d : \", tess_accepted,\n          tess_would_adapt, done);\n  if (best_choice == nullptr) {\n    tprintf(\"<Null choice>\\n\");\n  } else {\n    best_choice->print(msg);\n  }\n}\n\n// Removes from best_choices all choices which are not within a reasonable\n// range of the best choice.\n// TODO(rays) incorporate the information used here into the params training\n// re-ranker, in place of this heuristic that is based on the previous\n// adjustment factor.\nvoid WERD_RES::FilterWordChoices(int debug_level) {\n  if (best_choice == nullptr || best_choices.singleton()) {\n    return;\n  }\n\n  if (debug_level >= 2) {\n    
best_choice->print(\"\\nFiltering against best choice\");\n  }\n  WERD_CHOICE_IT it(&best_choices);\n  int index = 0;\n  for (it.forward(); !it.at_first(); it.forward(), ++index) {\n    WERD_CHOICE *choice = it.data();\n    float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),\n                                            choice->adjust_factor());\n    // i, j index the blob choice in choice, best_choice.\n    // chunk is an index into the chopped_word blobs (AKA chunks).\n    // Since the two words may use different segmentations of the chunks, we\n    // iterate over the chunks to find out whether a comparable blob\n    // classification is much worse than the best result.\n    unsigned i = 0, j = 0, chunk = 0;\n    // Each iteration of the while deals with 1 chunk. On entry choice_chunk\n    // and best_chunk are the indices of the first chunk in the NEXT blob,\n    // i.e. we don't have to increment i, j while chunk < choice_chunk and\n    // best_chunk respectively.\n    auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);\n    while (i < choice->length() && j < best_choice->length()) {\n      if (choice->unichar_id(i) != best_choice->unichar_id(j) &&\n          choice->certainty(i) - best_choice->certainty(j) < threshold) {\n        if (debug_level >= 2) {\n          choice->print(\"WorstCertaintyDiffWorseThan\");\n          tprintf(\n              \"i %u j %u Choice->Blob[i].Certainty %.4g\"\n              \" WorstOtherChoiceCertainty %g Threshold %g\\n\",\n              i, j, choice->certainty(i), best_choice->certainty(j), threshold);\n          tprintf(\"Discarding bad choice #%d\\n\", index);\n        }\n        delete it.extract();\n        break;\n      }\n      ++chunk;\n      // If needed, advance choice_chunk to keep up with chunk.\n      while (choice_chunk < chunk && ++i < choice->length()) {\n        choice_chunk += choice->state(i);\n      }\n      // If needed, advance best_chunk to keep up with chunk.\n      
while (best_chunk < chunk && ++j < best_choice->length()) {\n        best_chunk += best_choice->state(j);\n      }\n    }\n  }\n}\n\nvoid WERD_RES::ComputeAdaptionThresholds(float certainty_scale,\n                                         float min_rating, float max_rating,\n                                         float rating_margin,\n                                         float *thresholds) {\n  int chunk = 0;\n  int end_chunk = best_choice->state(0);\n  int end_raw_chunk = raw_choice->state(0);\n  int raw_blob = 0;\n  for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {\n    float avg_rating = 0.0f;\n    int num_error_chunks = 0;\n\n    // For each chunk in best choice blob i, count non-matching raw results.\n    while (chunk < end_chunk) {\n      if (chunk >= end_raw_chunk) {\n        ++raw_blob;\n        end_raw_chunk += raw_choice->state(raw_blob);\n      }\n      if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {\n        avg_rating += raw_choice->certainty(raw_blob);\n        ++num_error_chunks;\n      }\n      ++chunk;\n    }\n\n    if (num_error_chunks > 0) {\n      avg_rating /= num_error_chunks;\n      *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);\n    } else {\n      *thresholds = max_rating;\n    }\n\n    if (*thresholds > max_rating) {\n      *thresholds = max_rating;\n    }\n    if (*thresholds < min_rating) {\n      *thresholds = min_rating;\n    }\n  }\n}\n\n// Saves a copy of the word_choice if it has the best unadjusted rating.\n// Returns true if the word_choice was the new best.\nbool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) {\n  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {\n    delete raw_choice;\n    raw_choice = new WERD_CHOICE(*word_choice);\n    raw_choice->set_permuter(TOP_CHOICE_PERM);\n    return true;\n  }\n  return false;\n}\n\n// Consumes word_choice by adding it to best_choices, (taking ownership) if\n// the certainty for 
word_choice is some distance of the best choice in\n// best_choices, or by deleting the word_choice and returning false.\n// The best_choices list is kept in sorted order by rating. Duplicates are\n// removed, and the list is kept no longer than max_num_choices in length.\n// Returns true if the word_choice is still a valid pointer.\nbool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,\n                                  WERD_CHOICE *word_choice) {\n  if (best_choice != nullptr) {\n    // Throw out obviously bad choices to save some work.\n    // TODO(rays) Get rid of this! This piece of code produces different\n    // results according to the order in which words are found, which is an\n    // undesirable behavior. It would be better to keep all the choices and\n    // prune them later when more information is available.\n    float max_certainty_delta = StopperAmbigThreshold(\n        best_choice->adjust_factor(), word_choice->adjust_factor());\n    if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {\n      max_certainty_delta = -kStopperAmbiguityThresholdOffset;\n    }\n    if (word_choice->certainty() - best_choice->certainty() <\n        max_certainty_delta) {\n      if (debug) {\n        std::string bad_string;\n        word_choice->string_and_lengths(&bad_string, nullptr);\n        tprintf(\n            \"Discarding choice \\\"%s\\\" with an overly low certainty\"\n            \" %.3f vs best choice certainty %.3f (Threshold: %.3f)\\n\",\n            bad_string.c_str(), word_choice->certainty(),\n            best_choice->certainty(),\n            max_certainty_delta + best_choice->certainty());\n      }\n      delete word_choice;\n      return false;\n    }\n  }\n\n  // Insert in the list in order of increasing rating, but knock out worse\n  // string duplicates.\n  WERD_CHOICE_IT it(&best_choices);\n  const std::string &new_str = word_choice->unichar_string();\n  bool inserted = false;\n  int num_choices = 0;\n  if (!it.empty()) {\n   
 do {\n      WERD_CHOICE *choice = it.data();\n      if (choice->rating() > word_choice->rating() && !inserted) {\n        // Time to insert.\n        it.add_before_stay_put(word_choice);\n        inserted = true;\n        if (num_choices == 0) {\n          best_choice = word_choice; // This is the new best.\n        }\n        ++num_choices;\n      }\n      if (choice->unichar_string() == new_str) {\n        if (inserted) {\n          // New is better.\n          delete it.extract();\n        } else {\n          // Old is better.\n          if (debug) {\n            tprintf(\"Discarding duplicate choice \\\"%s\\\", rating %g vs %g\\n\",\n                    new_str.c_str(), word_choice->rating(), choice->rating());\n          }\n          delete word_choice;\n          return false;\n        }\n      } else {\n        ++num_choices;\n        if (num_choices > max_num_choices) {\n          delete it.extract();\n        }\n      }\n      it.forward();\n    } while (!it.at_first());\n  }\n  if (!inserted && num_choices < max_num_choices) {\n    it.add_to_end(word_choice);\n    inserted = true;\n    if (num_choices == 0) {\n      best_choice = word_choice; // This is the new best.\n    }\n  }\n  if (debug) {\n    if (inserted) {\n      tprintf(\"New %s\", best_choice == word_choice ? 
// Simple helper moves the ownership of the pointer data from src to dest,
// first deleting anything in dest, and nulling out src afterwards.
template <class T>
static void MovePointerData(T **dest, T **src) {
  delete *dest; // Discard whatever dest previously owned.
  T *transferred = *src;
  *src = nullptr; // src gives up ownership.
  *dest = transferred;
}
// Moves the results fields from word to this. This takes ownership of all
// the data, so src can be destructed.
// Owned pointer members are transferred with MovePointerData (which deletes
// any previous value held by this and nulls word's copy); container members
// are copied and then cleared on word so that word no longer owns anything
// moved here.
void WERD_RES::ConsumeWordResults(WERD_RES *word) {
  denorm = word->denorm;
  blob_row = word->blob_row;
  // Transfer ownership of the singly-owned pointers.
  MovePointerData(&chopped_word, &word->chopped_word);
  MovePointerData(&rebuild_word, &word->rebuild_word);
  MovePointerData(&box_word, &word->box_word);
  // Free our old seams before taking word's seam array.
  for (auto data : seam_array) {
    delete data;
  }
  seam_array = word->seam_array;
  word->seam_array.clear();
  // TODO: optimize moves.
  best_state = word->best_state;
  word->best_state.clear();
  correct_text = word->correct_text;
  word->correct_text.clear();
  blob_widths = word->blob_widths;
  word->blob_widths.clear();
  blob_gaps = word->blob_gaps;
  word->blob_gaps.clear();
  // The ratings matrix owns its cell lists: free ours before taking word's.
  if (ratings != nullptr) {
    ratings->delete_matrix_pointers();
  }
  MovePointerData(&ratings, &word->ratings);
  // best_choice is a borrowed pointer into best_choices (see
  // ClearWordChoices, which nulls it without deleting), so a plain copy is
  // correct; the list itself is spliced over just below.
  best_choice = word->best_choice;
  MovePointerData(&raw_choice, &word->raw_choice);
  best_choices.clear();
  WERD_CHOICE_IT wc_it(&best_choices);
  wc_it.add_list_after(&word->best_choices);
  reject_map = word->reject_map;
  if (word->blamer_bundle != nullptr) {
    assert(blamer_bundle != nullptr);
    blamer_bundle->CopyResults(*(word->blamer_bundle));
  }
  CopySimpleFields(*word);
}
// Builds the rebuild_word and sets the best_state from the chopped_word and
// the best_choice->state.
// For each blob of the best choice, the chunks it spans are temporarily
// joined, copied into rebuild_word, then broken apart again so that
// chopped_word is left unchanged.
void WERD_RES::RebuildBestState() {
  ASSERT_HOST(best_choice != nullptr);
  delete rebuild_word;
  rebuild_word = new TWERD;
  if (seam_array.empty()) {
    start_seam_list(chopped_word, &seam_array);
  }
  best_state.clear();
  int start = 0; // Index of the first chunk of the current best_choice blob.
  for (unsigned i = 0; i < best_choice->length(); ++i) {
    int length = best_choice->state(i); // Number of chunks in this blob.
    best_state.push_back(length);
    if (length > 1) {
      // Temporarily glue the chunks back into one blob...
      SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
                       start + length - 1);
    }
    TBLOB *blob = chopped_word->blobs[start];
    // ...copy the joined blob into the rebuild word...
    rebuild_word->blobs.push_back(new TBLOB(*blob));
    if (length > 1) {
      // ...and restore chopped_word to its chunked state.
      SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
                        start + length - 1);
    }
    start += length;
  }
}
// Classifies the word with some already-calculated BLOB_CHOICEs.
// The choices are an array of blob_count pointers to BLOB_CHOICE,
// providing a single classifier result for each blob.
// The BLOB_CHOICEs are consumed and the word takes ownership.
// The number of blobs in the box_word must match blob_count.
void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {
  // Setup the WERD_RES.
  ASSERT_HOST(box_word != nullptr);
  ASSERT_HOST(blob_count == box_word->length());
  ClearWordChoices();
  ClearRatings();
  ratings = new MATRIX(blob_count, 1);
  // One single-entry choice list per blob, stored on the matrix diagonal.
  for (unsigned c = 0; c < blob_count; ++c) {
    auto *choice_list = new BLOB_CHOICE_LIST;
    BLOB_CHOICE_IT choice_it(choice_list);
    choice_it.add_after_then_move(choices[c]); // The list takes ownership.
    ratings->put(c, c, choice_list);
  }
  FakeWordFromRatings(TOP_CHOICE_PERM);
  reject_map.initialise(blob_count);
  best_state.clear();
  best_state.resize(blob_count, 1); // One chunk per blob.
  done = true;
}
// Creates a WERD_CHOICE for the word using the top choices from the leading
// diagonal of the ratings matrix.
void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
  int num_blobs = ratings->dimension();
  auto *word_choice = new WERD_CHOICE(uch_set, num_blobs);
  word_choice->set_permuter(permuter);
  for (int b = 0; b < num_blobs; ++b) {
    // Defaults used when the diagonal cell has no choices.
    UNICHAR_ID unichar_id = UNICHAR_SPACE;
    // Initialize rating and certainty like in WERD_CHOICE::make_bad().
    float rating = WERD_CHOICE::kBadRating;
    float certainty = -FLT_MAX;
    BLOB_CHOICE_LIST *choices = ratings->get(b, b);
    if (choices != nullptr && !choices->empty()) {
      // Take the first (top) choice for this blob.
      BLOB_CHOICE_IT bc_it(choices);
      BLOB_CHOICE *choice = bc_it.data();
      unichar_id = choice->unichar_id();
      rating = choice->rating();
      certainty = choice->certainty();
    }
    word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
                                                   certainty);
  }
  LogNewRawChoice(word_choice);
  // Ownership of word_choice taken by word here.
  LogNewCookedChoice(1, false, word_choice);
}
// Utility function for fix_quotes.
// Return true if the next character in the string (given the UTF8 length in
// bytes) is a quote character.
static int is_simple_quote(const char *signed_str, int length) {
  const auto *str = reinterpret_cast<const unsigned char *>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return *str == '\'' || *str == '`';
  }
  if (length == 3 && str[0] == 0xe2 && str[1] == 0x80) {
    // UTF-8 3 byte curved quotes (U+2018 and U+2019).
    return str[2] == 0x98 || str[2] == 0x99;
  }
  return 0;
}
box2.left();\n}\n\n// Change pairs of hyphens to a single hyphen if the bounding boxes touch\n// Typically a long dash which has been segmented.\nvoid WERD_RES::fix_hyphens() {\n  if (!uch_set->contains_unichar(\"-\") ||\n      !uch_set->get_enabled(uch_set->unichar_to_id(\"-\"))) {\n    return; // Don't create it if it is disallowed.\n  }\n\n  using namespace std::placeholders; // for _1, _2\n  ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),\n                       std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));\n}\n\n// Callback helper for merge_tess_fails returns a space if both\n// arguments are space, otherwise INVALID_UNICHAR_ID.\nUNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {\n  if (id1 == id2 && id1 == uch_set->unichar_to_id(\" \")) {\n    return id1;\n  } else {\n    return INVALID_UNICHAR_ID;\n  }\n}\n\n// Change pairs of tess failures to a single one\nvoid WERD_RES::merge_tess_fails() {\n  using namespace std::placeholders; // for _1, _2\n  if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2),\n                           nullptr)) {\n    unsigned len = best_choice->length();\n    ASSERT_HOST(reject_map.length() == len);\n    ASSERT_HOST(box_word->length() == len);\n  }\n}\n\n// Returns true if the collection of count pieces, starting at start, are all\n// natural connected components, ie there are no real chops involved.\nbool WERD_RES::PiecesAllNatural(int start, int count) const {\n  // all seams must have no splits.\n  for (int index = start; index < start + count - 1; ++index) {\n    if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) {\n      SEAM *seam = seam_array[index];\n      if (seam != nullptr && seam->HasAnySplits()) {\n        return false;\n      }\n    }\n  }\n  return true;\n}\n\nWERD_RES::~WERD_RES() {\n  Clear();\n}\n\nvoid WERD_RES::Clear() {\n  if (combination) {\n    delete word;\n  }\n  word = nullptr;\n  delete blamer_bundle;\n  blamer_bundle = 
// Three-way comparison of page positions: returns -1 if *this precedes
// other on the page, 0 if they are at the same position (or both inside the
// same rowless/image block), and 1 if *this follows other. Both iterators
// must be over the same page_res. Position is resolved block-first, then
// row, then word, by scanning the corresponding list for whichever element
// appears first.
int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
  ASSERT_HOST(page_res == other.page_res);
  if (other.block_res == nullptr) {
    // other points to the end of the page.
    if (block_res == nullptr) {
      return 0;
    }
    return -1;
  }
  if (block_res == nullptr) {
    return 1; // we point to the end of the page.
  }
  if (block_res == other.block_res) {
    if (other.row_res == nullptr || row_res == nullptr) {
      // this should only happen if we hit an image block.
      return 0;
    }
    if (row_res == other.row_res) {
      // we point to the same block and row.
      ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
      if (word_res == other.word_res) {
        // we point to the same word!
        return 0;
      }

      // Whichever word occurs first in the row's list comes first.
      WERD_RES_IT word_res_it(&row_res->word_res_list);
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (word_res_it.data() == word_res) {
          return -1;
        } else if (word_res_it.data() == other.word_res) {
          return 1;
        }
      }
      ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
    }

    // we both point to the same block, but different rows.
    ROW_RES_IT row_res_it(&block_res->row_res_list);
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
         row_res_it.forward()) {
      if (row_res_it.data() == row_res) {
        return -1;
      } else if (row_res_it.data() == other.row_res) {
        return 1;
      }
    }
    ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
  }

  // We point to different blocks.
  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
       block_res_it.forward()) {
    if (block_res_it.data() == block_res) {
      return -1;
    } else if (block_res_it.data() == other.block_res) {
      return 1;
    }
  }
  // Shouldn't happen...
  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
  return 0;
}
// Inserts the new_word as a combination owned by a corresponding WERD_RES
// before the current position. The simple fields of the WERD_RES are copied
// from clone_res and the resulting WERD_RES is returned for further setup
// with best_choice etc.
WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res,
                                             WERD *new_word) {
  // Make a WERD_RES for the new_word.
  auto *new_res = new WERD_RES(new_word);
  new_res->CopySimpleFields(clone_res);
  // Marking it a combination makes the WERD_RES own new_word (see Clear()).
  new_res->combination = true;
  // Insert into the appropriate place in the ROW_RES.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
    WERD_RES *word = wr_it.data();
    if (word == word_res) {
      break;
    }
  }
  // The current word must exist in the row's list.
  ASSERT_HOST(!wr_it.cycled_list());
  wr_it.add_before_then_move(new_res);
  if (wr_it.at_first()) {
    // This is the new first word, so reset the member iterator so it
    // detects the cycled_list state correctly.
    ResetWordIterator();
  }
  return new_res;
}
The blob bounds\n// are likely very poor, if they come from LSTM, where it only outputs the\n// character at one pixel within it, so we find the midpoints between them.\nstatic void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,\n                            C_BLOB_LIST *next_word_blobs,\n                            std::vector<int> *blob_ends) {\n  C_BLOB_IT blob_it(word.word->cblob_list());\n  for (int length : word.best_state) {\n    // Get the bounding box of the fake blobs\n    TBOX blob_box = blob_it.data()->bounding_box();\n    blob_it.forward();\n    for (int b = 1; b < length; ++b) {\n      blob_box += blob_it.data()->bounding_box();\n      blob_it.forward();\n    }\n    // This blob_box is crap, so for now we are only looking for the\n    // boundaries between them.\n    int blob_end = INT32_MAX;\n    if (!blob_it.at_first() || next_word_blobs != nullptr) {\n      if (blob_it.at_first()) {\n        blob_it.set_to_list(next_word_blobs);\n      }\n      blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;\n    }\n    blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());\n    blob_ends->push_back(blob_end);\n  }\n  blob_ends->back() = clip_box.right();\n}\n\n// Helper computes the bounds of a word by restricting it to existing words\n// that significantly overlap.\nstatic TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,\n                              int w_index, TBOX prev_box, WERD_RES_IT w_it) {\n  constexpr int kSignificantOverlapFraction = 4;\n  TBOX clipped_box;\n  TBOX current_box = words[w_index]->word->bounding_box();\n  TBOX next_box;\n  if (static_cast<size_t>(w_index + 1) < words.size() &&\n      words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) {\n    next_box = words[w_index + 1]->word->bounding_box();\n  }\n  for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;\n       w_it.forward()) {\n    if (w_it.data() == nullptr || 
w_it.data()->word == nullptr) {\n      continue;\n    }\n    TBOX w_box = w_it.data()->word->bounding_box();\n    int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);\n    int width_limit = w_box.width() / kSignificantOverlapFraction;\n    int min_significant_overlap = std::max(height_limit, width_limit);\n    int overlap = w_box.intersection(current_box).width();\n    int prev_overlap = w_box.intersection(prev_box).width();\n    int next_overlap = w_box.intersection(next_box).width();\n    if (overlap > min_significant_overlap) {\n      if (prev_overlap > min_significant_overlap) {\n        // We have no choice but to use the LSTM word edge.\n        clipped_box.set_left(current_box.left());\n      } else if (next_overlap > min_significant_overlap) {\n        // We have no choice but to use the LSTM word edge.\n        clipped_box.set_right(current_box.right());\n      } else {\n        clipped_box += w_box;\n      }\n    }\n  }\n  if (clipped_box.height() <= 0) {\n    clipped_box.set_top(current_box.top());\n    clipped_box.set_bottom(current_box.bottom());\n  }\n  if (clipped_box.width() <= 0) {\n    clipped_box = current_box;\n  }\n  return clipped_box;\n}\n\n// Helper moves the blob from src to dest. 
// Helper moves the blob from src to dest. If it isn't contained by clip_box,
// the blob is replaced by a fake that is contained.
// Returns the bounding box of the moved (possibly faked) blob.
static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
                            const TBOX &clip_box) {
  C_BLOB *src_blob = src_it->extract();
  TBOX box = src_blob->bounding_box();
  if (!clip_box.contains(box)) {
    // The +/-1 offsets on the clip limits keep the clipped box at least one
    // unit wide and high, even when the original box lies entirely outside
    // clip_box.
    int left =
        ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
    int right =
        ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
    int top =
        ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
    int bottom =
        ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
    box = TBOX(left, bottom, right, top);
    // Replace the out-of-bounds blob with a fake of the clipped box.
    delete src_blob;
    src_blob = C_BLOB::FakeBlob(box);
  }
  dest_it->add_after_then_move(src_blob);
  return box;
}
If the input word_res is not a\n  // combination, then the final replacements will not be either, (although it\n  // is allowed for the input words to be combinations) and their words\n  // will get put on the row list. This maintains the ownership rules.\n  WERD_IT w_it(row()->row->word_list());\n  if (!input_word->combination) {\n    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {\n      WERD *word = w_it.data();\n      if (word == input_word->word) {\n        break;\n      }\n    }\n    // w_it is now set to the input_word's word.\n    ASSERT_HOST(!w_it.cycled_list());\n  }\n  // Insert into the appropriate place in the ROW_RES.\n  WERD_RES_IT wr_it(&row()->word_res_list);\n  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {\n    WERD_RES *word = wr_it.data();\n    if (word == input_word) {\n      break;\n    }\n  }\n  ASSERT_HOST(!wr_it.cycled_list());\n  // Since we only have an estimate of the bounds between blobs, use the blob\n  // x-middle as the determiner of where to put the blobs\n  C_BLOB_IT src_b_it(input_word->word->cblob_list());\n  src_b_it.sort(&C_BLOB::SortByXMiddle);\n  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());\n  rej_b_it.sort(&C_BLOB::SortByXMiddle);\n  TBOX clip_box;\n  for (size_t w = 0; w < words->size(); ++w) {\n    WERD_RES *word_w = (*words)[w];\n    clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);\n    // Compute blob boundaries.\n    std::vector<int> blob_ends;\n    C_BLOB_LIST *next_word_blobs =\n        w + 1 < words->size() ? 
(*words)[w + 1]->word->cblob_list() : nullptr;\n    ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);\n    // Remove the fake blobs on the current word, but keep safe for back-up if\n    // no blob can be found.\n    C_BLOB_LIST fake_blobs;\n    C_BLOB_IT fake_b_it(&fake_blobs);\n    fake_b_it.add_list_after(word_w->word->cblob_list());\n    fake_b_it.move_to_first();\n    word_w->word->cblob_list()->clear();\n    C_BLOB_IT dest_it(word_w->word->cblob_list());\n    // Build the box word as we move the blobs.\n    auto *box_word = new tesseract::BoxWord;\n    for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {\n      int end_x = blob_ends[i];\n      TBOX blob_box;\n      // Add the blobs up to end_x.\n      while (!src_b_it.empty() &&\n             src_b_it.data()->bounding_box().x_middle() < end_x) {\n        blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);\n        src_b_it.forward();\n      }\n      while (!rej_b_it.empty() &&\n             rej_b_it.data()->bounding_box().x_middle() < end_x) {\n        blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);\n        rej_b_it.forward();\n      }\n      if (blob_box.null_box()) {\n        // Use the original box as a back-up.\n        blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);\n      }\n      box_word->InsertBox(i, blob_box);\n    }\n    delete word_w->box_word;\n    word_w->box_word = box_word;\n    if (!input_word->combination) {\n      // Insert word_w->word into the ROW. It doesn't own its word, so the\n      // ROW needs to own it.\n      w_it.add_before_stay_put(word_w->word);\n      word_w->combination = false;\n    }\n    (*words)[w] = nullptr; // We are taking ownership.\n    wr_it.add_before_stay_put(word_w);\n  }\n  // We have taken ownership of the words.\n  words->clear();\n  // Delete the current word, which has been replaced. 
// Deletes the current WERD_RES and its underlying WERD.
void PAGE_RES_IT::DeleteCurrentWord() {
  // Check that this word is as we expect. part_of_combos are NEVER iterated
  // by the normal iterator, so we should never be trying to delete them.
  ASSERT_HOST(!word_res->part_of_combo);
  if (!word_res->combination) {
    // Combinations own their own word, so we won't find the word on the
    // row's word_list, but it is legitimate to try to delete them.
    // Delete word from the ROW when not a combination.
    WERD_IT w_it(row()->row->word_list());
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      if (w_it.data() == word_res->word) {
        break;
      }
    }
    ASSERT_HOST(!w_it.cycled_list());
    delete w_it.extract();
  }
  // Remove the WERD_RES for the new_word.
  // Remove the WORD_RES from the ROW_RES.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
    if (wr_it.data() == word_res) {
      // Null the member pointer before the object it points to is deleted
      // below.
      word_res = nullptr;
      break;
    }
  }
  ASSERT_HOST(!wr_it.cycled_list());
  delete wr_it.extract();
  // Re-sync the member iterator with the modified list.
  ResetWordIterator();
}
Updates\n// corresponding part of combo if required.\nvoid PAGE_RES_IT::MakeCurrentWordFuzzy() {\n  WERD *real_word = word_res->word;\n  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {\n    real_word->set_flag(W_FUZZY_SP, true);\n    if (word_res->combination) {\n      // The next word should be the corresponding part of combo, but we have\n      // already stepped past it, so find it by search.\n      WERD_RES_IT wr_it(&row()->word_res_list);\n      for (wr_it.mark_cycle_pt();\n           !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {\n      }\n      wr_it.forward();\n      ASSERT_HOST(wr_it.data()->part_of_combo);\n      real_word = wr_it.data()->word;\n      ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&\n                  !real_word->flag(W_FUZZY_NON));\n      real_word->set_flag(W_FUZZY_SP, true);\n    }\n  }\n}\n\n/*************************************************************************\n * PAGE_RES_IT::restart_page\n *\n * Set things up at the start of the page\n *************************************************************************/\n\nWERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {\n  block_res_it.set_to_list(&page_res->block_res_list);\n  block_res_it.mark_cycle_pt();\n  prev_block_res = nullptr;\n  prev_row_res = nullptr;\n  prev_word_res = nullptr;\n  block_res = nullptr;\n  row_res = nullptr;\n  word_res = nullptr;\n  next_block_res = nullptr;\n  next_row_res = nullptr;\n  next_word_res = nullptr;\n  internal_forward(true, empty_ok);\n  return internal_forward(false, empty_ok);\n}\n\n// Recovers from operations on the current word, such as in InsertCloneWord\n// and DeleteCurrentWord.\n// Resets the word_res_it so that it is one past the next_word_res, as\n// it should be after internal_forward. 
If next_row_res != row_res,\n// then the next_word_res is in the next row, so there is no need to do\n// anything to word_res_it, but it is still a good idea to reset the pointers\n// word_res and prev_word_res, which are still in the current row.\nvoid PAGE_RES_IT::ResetWordIterator() {\n  if (row_res == next_row_res) {\n    // Reset the member iterator so it can move forward and detect the\n    // cycled_list state correctly.\n    word_res_it.move_to_first();\n    for (word_res_it.mark_cycle_pt();\n         !word_res_it.cycled_list() && word_res_it.data() != next_word_res;\n         word_res_it.forward()) {\n      if (!word_res_it.data()->part_of_combo) {\n        if (prev_row_res == row_res) {\n          prev_word_res = word_res;\n        }\n        word_res = word_res_it.data();\n      }\n    }\n    ASSERT_HOST(!word_res_it.cycled_list());\n    wr_it_of_next_word = word_res_it;\n    word_res_it.forward();\n  } else {\n    // word_res_it is OK, but reset word_res and prev_word_res if needed.\n    WERD_RES_IT wr_it(&row_res->word_res_list);\n    for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {\n      if (!wr_it.data()->part_of_combo) {\n        if (prev_row_res == row_res) {\n          prev_word_res = word_res;\n        }\n        word_res = wr_it.data();\n      }\n    }\n  }\n}\n\n/*************************************************************************\n * PAGE_RES_IT::internal_forward\n *\n * Find the next word on the page. If empty_ok is true, then non-text blocks\n * and text blocks with no text are visited as if they contain a single\n * imaginary word in a single imaginary row. (word() and row() both return\n *nullptr in such a block and the return value is nullptr.) If empty_ok is\n *false, the old behaviour is maintained. Each real word is visited and empty\n *and non-text blocks and rows are skipped. new_block is used to initialize the\n *iterators for a new block. 
The iterator maintains pointers to block, row and\n *word for the previous, current and next words.  These are correct, regardless\n *of block/row boundaries. nullptr values denote start and end of the page.\n *************************************************************************/\n\nWERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {\n  bool new_row = false;\n\n  prev_block_res = block_res;\n  prev_row_res = row_res;\n  prev_word_res = word_res;\n  block_res = next_block_res;\n  row_res = next_row_res;\n  word_res = next_word_res;\n  wr_it_of_current_word = wr_it_of_next_word;\n  next_block_res = nullptr;\n  next_row_res = nullptr;\n  next_word_res = nullptr;\n\n  while (!block_res_it.cycled_list()) {\n    if (new_block) {\n      new_block = false;\n      row_res_it.set_to_list(&block_res_it.data()->row_res_list);\n      row_res_it.mark_cycle_pt();\n      if (row_res_it.empty() && empty_ok) {\n        next_block_res = block_res_it.data();\n        break;\n      }\n      new_row = true;\n    }\n    while (!row_res_it.cycled_list()) {\n      if (new_row) {\n        new_row = false;\n        word_res_it.set_to_list(&row_res_it.data()->word_res_list);\n        word_res_it.mark_cycle_pt();\n      }\n      // Skip any part_of_combo words.\n      while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) {\n        word_res_it.forward();\n      }\n      if (!word_res_it.cycled_list()) {\n        next_block_res = block_res_it.data();\n        next_row_res = row_res_it.data();\n        next_word_res = word_res_it.data();\n        wr_it_of_next_word = word_res_it;\n        word_res_it.forward();\n        goto foundword;\n      }\n      // end of row reached\n      row_res_it.forward();\n      new_row = true;\n    }\n    // end of block reached\n    block_res_it.forward();\n    new_block = true;\n  }\nfoundword:\n  // Update prev_word_best_choice pointer.\n  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {\n    
*page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr)\n                                           ? nullptr\n                                           : prev_word_res->best_choice;\n  }\n  return word_res;\n}\n\n/*************************************************************************\n * PAGE_RES_IT::restart_row()\n *\n * Move to the beginning (leftmost word) of the current row.\n *************************************************************************/\nWERD_RES *PAGE_RES_IT::restart_row() {\n  ROW_RES *row = this->row();\n  if (!row) {\n    return nullptr;\n  }\n  for (restart_page(); this->row() != row; forward()) {\n    // pass\n  }\n  return word();\n}\n\n/*************************************************************************\n * PAGE_RES_IT::forward_paragraph\n *\n * Move to the beginning of the next paragraph, allowing empty blocks.\n *************************************************************************/\n\nWERD_RES *PAGE_RES_IT::forward_paragraph() {\n  while (block_res == next_block_res &&\n         (next_row_res != nullptr && next_row_res->row != nullptr &&\n          row_res->row->para() == next_row_res->row->para())) {\n    internal_forward(false, true);\n  }\n  return internal_forward(false, true);\n}\n\n/*************************************************************************\n * PAGE_RES_IT::forward_block\n *\n * Move to the beginning of the next block, allowing empty blocks.\n *************************************************************************/\n\nWERD_RES *PAGE_RES_IT::forward_block() {\n  while (block_res == next_block_res) {\n    internal_forward(false, true);\n  }\n  return internal_forward(false, true);\n}\n\nvoid PAGE_RES_IT::rej_stat_word() {\n  int16_t chars_in_word;\n  int16_t rejects_in_word = 0;\n\n  chars_in_word = word_res->reject_map.length();\n  page_res->char_count += chars_in_word;\n  block_res->char_count += chars_in_word;\n  row_res->char_count += chars_in_word;\n\n  rejects_in_word = 
word_res->reject_map.reject_count();\n\n  page_res->rej_count += rejects_in_word;\n  block_res->rej_count += rejects_in_word;\n  row_res->rej_count += rejects_in_word;\n  if (chars_in_word == rejects_in_word) {\n    row_res->whole_word_rej_count += rejects_in_word;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/pageres.h",
    "content": "/**********************************************************************\n * File:        pageres.h  (Formerly page_res.h)\n * Description: Results classes used by control.c\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef PAGERES_H\n#define PAGERES_H\n\n#include \"blamer.h\"        // for BlamerBundle (ptr only), IRR_NUM_REASONS\n#include \"clst.h\"          // for CLIST_ITERATOR, CLISTIZEH\n#include \"elst.h\"          // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH\n#include \"genericvector.h\" // for PointerVector\n#include \"matrix.h\"        // for MATRIX\n#include \"normalis.h\"      // for DENORM\n#include \"ratngs.h\"        // for WERD_CHOICE, BLOB_CHOICE (ptr only)\n#include \"rect.h\"          // for TBOX\n#include \"rejctmap.h\"      // for REJMAP\n#include \"unicharset.h\"    // for UNICHARSET, UNICHARSET::Direction, UNI...\n#include \"werd.h\"          // for WERD, W_BOL, W_EOL\n\n#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID\n\n#include <cstdint>    // for int32_t, int16_t\n#include <functional> // for std::function\n#include <set>        // for std::pair\n#include <vector>     // for std::vector\n\n#include <sys/types.h> // for int8_t\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass BLOCK;\nclass BLOCK_LIST;\nclass BLOCK_RES;\nclass 
ROW;\nclass ROW_RES;\nclass SEAM;\nclass WERD_RES;\n\nstruct TWERD;\n\nclass BoxWord;\nclass Tesseract;\nstruct FontInfo;\n\n/* Forward declarations */\n\nclass BLOCK_RES;\n\nELISTIZEH(BLOCK_RES)\nCLISTIZEH(BLOCK_RES)\nclass ROW_RES;\n\nELISTIZEH(ROW_RES)\nclass WERD_RES;\n\nELISTIZEH(WERD_RES)\n\n/*************************************************************************\n * PAGE_RES - Page results\n *************************************************************************/\nclass PAGE_RES { // page result\npublic:\n  int32_t char_count;\n  int32_t rej_count;\n  BLOCK_RES_LIST block_res_list;\n  bool rejected;\n  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to\n  // the next word. This pointer is not owned by PAGE_RES class.\n  WERD_CHOICE **prev_word_best_choice;\n  // Sums of blame reasons computed by the blamer.\n  std::vector<int> blame_reasons;\n  // Debug information about all the misadaptions on this page.\n  // Each BlamerBundle contains an index into this vector, so that words that\n  // caused misadaption could be marked. 
However, since words could be\n  // deleted/split/merged, the log is stored on the PAGE_RES level.\n  std::vector<std::string> misadaption_log;\n\n  inline void Init() {\n    char_count = 0;\n    rej_count = 0;\n    rejected = false;\n    prev_word_best_choice = nullptr;\n    blame_reasons.clear();\n    blame_reasons.resize(IRR_NUM_REASONS);\n  }\n\n  PAGE_RES() {\n    Init();\n  } // empty constructor\n\n  PAGE_RES(bool merge_similar_words,\n           BLOCK_LIST *block_list, // real blocks\n           WERD_CHOICE **prev_word_best_choice_ptr);\n\n  ~PAGE_RES() = default;\n};\n\n/*************************************************************************\n * BLOCK_RES - Block results\n *************************************************************************/\n\nclass BLOCK_RES : public ELIST<BLOCK_RES>::LINK {\npublic:\n  BLOCK *block;       // real block\n  int32_t char_count; // chars in block\n  int32_t rej_count;  // rejected chars\n  int16_t font_class; //\n  int16_t row_count;\n  float x_height;\n  bool font_assigned; // block already\n  //      processed\n\n  ROW_RES_LIST row_res_list;\n\n  BLOCK_RES() = default;\n\n  BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block\n\n  ~BLOCK_RES() = default;\n};\n\n/*************************************************************************\n * ROW_RES - Row results\n *************************************************************************/\n\nclass ROW_RES : public ELIST<ROW_RES>::LINK {\npublic:\n  ROW *row;                     // real row\n  int32_t char_count;           // chars in block\n  int32_t rej_count;            // rejected chars\n  int32_t whole_word_rej_count; // rejs in total rej wds\n  WERD_RES_LIST word_res_list;\n\n  ROW_RES() = default;\n\n  ROW_RES(bool merge_similar_words, ROW *the_row); // real row\n\n  ~ROW_RES() = default;\n};\n\n/*************************************************************************\n * WERD_RES - Word results\n 
*************************************************************************/\nenum CRUNCH_MODE { CR_NONE, CR_KEEP_SPACE, CR_LOOSE_SPACE, CR_DELETE };\n\n// WERD_RES is a collection of publicly accessible members that gathers\n// information about a word result.\nclass TESS_API WERD_RES : public ELIST<WERD_RES>::LINK {\npublic:\n  // Which word is which?\n  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,\n  // the original image coordinate space, and the BLN space in which the\n  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,\n  // and the x-middle of the word is at 0.\n  // In the rotated pixel space, coordinates correspond to the input image,\n  // but may be rotated about the origin by a multiple of 90 degrees,\n  // and may therefore be negative.\n  // In any case a rotation by denorm.block()->re_rotation() will take them\n  // back to the original image.\n  // The other differences between words all represent different stages of\n  // processing during recognition.\n\n  // ---------------------------INPUT-------------------------------------\n\n  // The word is the input C_BLOBs in the rotated pixel space.\n  // word is NOT owned by the WERD_RES unless combination is true.\n  // All the other word pointers ARE owned by the WERD_RES.\n  WERD *word = nullptr; // Input C_BLOB word.\n\n  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------\n\n  // The bln_boxes contains the bounding boxes (only) of the input word, in the\n  // BLN space. The lengths of word and bln_boxes\n  // match as they are both before any chopping.\n  // TODO(rays) determine if docqual does anything useful and delete bln_boxes\n  // if it doesn't.\n  tesseract::BoxWord *bln_boxes = nullptr; // BLN input bounding boxes.\n  // The ROW that this word sits in. 
NOT owned by the WERD_RES.\n  ROW *blob_row = nullptr;\n  // The denorm provides the transformation to get back to the rotated image\n  // coords from the chopped_word/rebuild_word BLN coords, but each blob also\n  // has its own denorm.\n  DENORM denorm; // For use on chopped_word.\n  // Unicharset used by the classifier output in best_choice and raw_choice.\n  const UNICHARSET *uch_set = nullptr; // For converting back to utf8.\n\n  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----\n  // ----Setup to a (different!) state expected by the various classifiers----\n  // TODO(rays) Tidy and make more consistent.\n\n  // The chopped_word is also in BLN space, and represents the fully chopped\n  // character fragments that make up the word.\n  // The length of chopped_word matches length of seam_array + 1 (if set).\n  TWERD *chopped_word = nullptr; // BLN chopped fragments output.\n  // Vector of SEAM* holding chopping points matching chopped_word.\n  std::vector<SEAM *> seam_array;\n  // Widths of blobs in chopped_word.\n  std::vector<int> blob_widths;\n  // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between\n  // blob i and blob i+1.\n  std::vector<int> blob_gaps;\n  // Stores the lstm choices of every timestep\n  std::vector<std::vector<std::pair<const char *, float>>> timesteps;\n  // Stores the lstm choices of every timestep segmented by character\n  std::vector<std::vector<std::vector<std::pair<const char *, float>>>>\n      segmented_timesteps;\n  // Symbolchoices acquired during CTC\n  std::vector<std::vector<std::pair<const char *, float>>> CTC_symbol_choices;\n  // Stores if the timestep vector starts with a space\n  bool leading_space = false;\n  // Stores value when the word ends\n  int end = 0;\n  // Ratings matrix contains classifier choices for each classified combination\n  // of blobs. 
The dimension is the same as the number of blobs in chopped_word\n  // and the leading diagonal corresponds to classifier results of the blobs\n  // in chopped_word. The state_ members of best_choice, raw_choice and\n  // best_choices all correspond to this ratings matrix and allow extraction\n  // of the blob choices for any given WERD_CHOICE.\n  MATRIX *ratings = nullptr; // Owned pointer.\n  // Pointer to the first WERD_CHOICE in best_choices. This is the result that\n  // will be output from Tesseract. Note that this is now a borrowed pointer\n  // and should NOT be deleted.\n  WERD_CHOICE *best_choice = nullptr; // Borrowed pointer.\n  // The best raw_choice found during segmentation search. Differs from the\n  // best_choice by being the best result according to just the character\n  // classifier, not taking any language model information into account.\n  // Unlike best_choice, the pointer IS owned by this WERD_RES.\n  WERD_CHOICE *raw_choice = nullptr; // Owned pointer.\n  // Alternative results found during chopping/segmentation search stages.\n  // Note that being an ELIST, best_choices owns the WERD_CHOICEs.\n  WERD_CHOICE_LIST best_choices;\n\n  // Truth bounding boxes, text and incorrect choice reason.\n  BlamerBundle *blamer_bundle = nullptr;\n\n  // --------------OUTPUT FROM RECOGNITION-------------------------------\n  // --------------Not all fields are necessarily set.-------------------\n  // ---best_choice, raw_choice *must* end up set, with a box_word-------\n  // ---In complete output, the number of blobs in rebuild_word matches---\n  // ---the number of boxes in box_word, the number of unichar_ids in---\n  // ---best_choice, the number of ints in best_state, and the number---\n  // ---of strings in correct_text--------------------------------------\n  // ---SetupFake Sets everything to appropriate values if the word is---\n  // ---known to be bad before recognition.------------------------------\n\n  // The rebuild_word is also in BLN space, 
but represents the final best\n  // segmentation of the word. Its length is therefore the same as box_word.\n  TWERD *rebuild_word = nullptr; // BLN best segmented word.\n  // The box_word is in the original image coordinate space. It is the\n  // bounding boxes of the rebuild_word, after denormalization.\n  // The length of box_word matches rebuild_word, best_state (if set) and\n  // correct_text (if set), as well as best_choice and represents the\n  // number of classified units in the output.\n  tesseract::BoxWord *box_word = nullptr; // Denormalized output boxes.\n  // The Tesseract that was used to recognize this word. Just a borrowed\n  // pointer. Note: Tesseract's class definition is in a higher-level library.\n  // We avoid introducing a cyclic dependency by not using the Tesseract\n  // within WERD_RES. We are just storing it to provide access to it\n  // for the top-level multi-language controller, and maybe for output of\n  // the recognized language.\n  // tesseract points to data owned elsewhere.\n  tesseract::Tesseract *tesseract = nullptr;\n  // The best_state stores the relationship between chopped_word and\n  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]\n  // adjacent blobs in chopped_word. The seams in seam_array are hidden\n  // within a rebuild_word blob and revealed between them.\n  std::vector<int> best_state; // Number of blobs in each best blob.\n  // The correct_text is used during training and adaption to carry the\n  // text to the training system without the need for a unicharset. 
There\n  // is one entry in the vector for each blob in rebuild_word and box_word.\n  std::vector<std::string> correct_text;\n\n  // Less-well documented members.\n  // TODO(rays) Add more documentation here.\n  WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this.\n  REJMAP reject_map;                // best_choice rejects\n  bool tess_failed = false;\n  /*\n  If tess_failed is true, one of the following tests failed when Tess\n  returned:\n  - The outword blob list was not the same length as the best_choice string;\n  - The best_choice string contained ALL blanks;\n  - The best_choice string was zero length\n*/\n  bool tess_accepted = false;    // Tess thinks its ok?\n  bool tess_would_adapt = false; // Tess would adapt?\n  bool done = false;             // ready for output?\n  bool small_caps = false;       // word appears to be small caps\n  bool odd_size = false;         // word is bigger than line or leader dots.\n  // The fontinfos are pointers to data owned by the classifier.\n  const FontInfo *fontinfo = nullptr;\n  const FontInfo *fontinfo2 = nullptr;\n  int8_t fontinfo_id_count = 0;  // number of votes\n  int8_t fontinfo_id2_count = 0; // number of votes\n  bool guessed_x_ht = true;\n  bool guessed_caps_ht = true;\n  CRUNCH_MODE unlv_crunch_mode = CR_NONE;\n  float x_height = 0.0f;       // post match estimate\n  float caps_height = 0.0f;    // post match estimate\n  float baseline_shift = 0.0f; // post match estimate.\n  // Certainty score for the spaces either side of this word (LSTM mode).\n  // MIN this value with the actual word certainty.\n  float space_certainty = 0.0f;\n\n  /*\n  To deal with fuzzy spaces we need to be able to combine \"words\" to form\n  combinations when we suspect that the gap is a non-space. 
The (new) text\n  ord code generates separate words for EVERY fuzzy gap - flags in the word\n  indicate whether the gap is below the threshold (fuzzy kern) and is thus\n  NOT a real word break by default, or above the threshold (fuzzy space) and\n  this is a real word break by default.\n\n  The WERD_RES list contains all these words PLUS \"combination\" words built\n  out of (copies of) the words split by fuzzy kerns. The separate parts have\n  their \"part_of_combo\" flag set true and should be IGNORED on a default\n  reading of the list.\n\n  Combination words are FOLLOWED by the sequence of part_of_combo words\n  which they combine.\n*/\n  bool combination = false;   // of two fuzzy gap wds\n  bool part_of_combo = false; // part of a combo\n  bool reject_spaces = false; // Reject spacing?\n\n  WERD_RES() = default;\n\n  WERD_RES(WERD *the_word) {\n    word = the_word;\n  }\n  // Deep copies everything except the ratings MATRIX.\n  // To get that use deep_copy below.\n  WERD_RES(const WERD_RES &source) : ELIST<WERD_RES>::LINK(source) {\n    // combination is used in function Clear which is called from operator=.\n    combination = false;\n    *this = source; // see operator=\n  }\n\n  ~WERD_RES();\n\n  // Returns the UTF-8 string for the given blob index in the best_choice word,\n  // given that we know whether we are in a right-to-left reading context.\n  // This matters for mirrorable characters such as parentheses.  
We recognize\n  // characters purely based on their shape on the page, and by default produce\n  // the corresponding unicode for a left-to-right context.\n  const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const {\n    if (best_choice == nullptr || blob_index >= best_choice->length()) {\n      return nullptr;\n    }\n    UNICHAR_ID id = best_choice->unichar_id(blob_index);\n    if (static_cast<unsigned>(id) >= uch_set->size()) {\n      return nullptr;\n    }\n    UNICHAR_ID mirrored = uch_set->get_mirror(id);\n    if (in_rtl_context && mirrored > 0) {\n      id = mirrored;\n    }\n    return uch_set->id_to_unichar_ext(id);\n  }\n  // Returns the UTF-8 string for the given blob index in the raw_choice word.\n  const char *RawUTF8(unsigned blob_index) const {\n    if (blob_index >= raw_choice->length()) {\n      return nullptr;\n    }\n    UNICHAR_ID id = raw_choice->unichar_id(blob_index);\n    if (static_cast<unsigned>(id) >= uch_set->size()) {\n      return nullptr;\n    }\n    return uch_set->id_to_unichar(id);\n  }\n\n  UNICHARSET::Direction SymbolDirection(unsigned blob_index) const {\n    if (best_choice == nullptr || blob_index >= best_choice->length()) {\n      return UNICHARSET::U_OTHER_NEUTRAL;\n    }\n    return uch_set->get_direction(best_choice->unichar_id(blob_index));\n  }\n\n  bool AnyRtlCharsInWord() const {\n    if (uch_set == nullptr || best_choice == nullptr ||\n        best_choice->length() < 1) {\n      return false;\n    }\n    for (unsigned id = 0; id < best_choice->length(); id++) {\n      unsigned unichar_id = best_choice->unichar_id(id);\n      if (unichar_id >= uch_set->size()) {\n        continue; // Ignore illegal chars.\n      }\n      UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);\n      if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||\n          dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {\n        return true;\n      }\n    }\n    return false;\n  }\n\n  bool AnyLtrCharsInWord() const {\n    if (uch_set == 
nullptr || best_choice == nullptr ||\n        best_choice->length() < 1) {\n      return false;\n    }\n    for (unsigned id = 0; id < best_choice->length(); id++) {\n      unsigned unichar_id = best_choice->unichar_id(id);\n      if (unichar_id >= uch_set->size()) {\n        continue; // Ignore illegal chars.\n      }\n      UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);\n      if (dir == UNICHARSET::U_LEFT_TO_RIGHT ||\n          dir == UNICHARSET::U_ARABIC_NUMBER) {\n        return true;\n      }\n    }\n    return false;\n  }\n\n  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine\n  // that gave us the unichars in reading order (as opposed to strict left\n  // to right).\n  bool UnicharsInReadingOrder() const {\n    return best_choice->unichars_in_script_order();\n  }\n\n  void Clear();\n  void ClearResults();\n  void ClearWordChoices();\n  void ClearRatings();\n\n  // Deep copies everything except the ratings MATRIX.\n  // To get that use deep_copy below.\n  WERD_RES &operator=(const WERD_RES &source); // from this\n\n  void CopySimpleFields(const WERD_RES &source);\n\n  // Initializes a blank (default constructed) WERD_RES from one that has\n  // already been recognized.\n  // Use SetupFor*Recognition afterwards to complete the setup and make\n  // it ready for a retry recognition.\n  void InitForRetryRecognition(const WERD_RES &source);\n\n  // Sets up the members used in recognition: bln_boxes, chopped_word,\n  // seam_array, denorm.  Returns false if\n  // the word is empty and sets up fake results.  If use_body_size is\n  // true and row->body_size is set, then body_size will be used for\n  // blob normalization instead of xheight + ascrise. 
This flag is for\n  // those languages that are using CJK pitch model and thus it has to\n  // be true if and only if tesseract->textord_use_cjk_fp_model is\n  // true.\n  // If allow_detailed_fx is true, the feature extractor will receive fine\n  // precision outline information, allowing smoother features and better\n  // features on low resolution images.\n  // The norm_mode sets the default mode for normalization in absence\n  // of any of the above flags. It should really be a tesseract::OcrEngineMode\n  // but is declared as int for ease of use with tessedit_ocr_engine_mode.\n  // Returns false if the word is empty and sets up fake results.\n  bool SetupForRecognition(const UNICHARSET &unicharset_in,\n                           tesseract::Tesseract *tesseract, Image pix,\n                           int norm_mode, const TBOX *norm_box,\n                           bool numeric_mode, bool use_body_size,\n                           bool allow_detailed_fx, ROW *row,\n                           const BLOCK *block);\n\n  // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty\n  // accumulators from a made chopped word.  We presume the fields are already\n  // empty.\n  void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);\n\n  // Sets up the members used in recognition for an empty recognition result:\n  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.\n  void SetupFake(const UNICHARSET &uch);\n\n  // Set the word as having the script of the input unicharset.\n  void SetupWordScript(const UNICHARSET &unicharset_in);\n\n  // Sets up the blamer_bundle if it is not null, using the initialized denorm.\n  void SetupBlamerBundle();\n\n  // Computes the blob_widths and blob_gaps from the chopped_word.\n  void SetupBlobWidthsAndGaps();\n\n  // Updates internal data to account for a new SEAM (chop) at the given\n  // blob_number. 
Fixes the ratings matrix and states in the choices, as well\n  // as the blob widths and gaps.\n  void InsertSeam(int blob_number, SEAM *seam);\n\n  // Returns true if all the word choices except the first have adjust_factors\n  // worse than the given threshold.\n  bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;\n\n  // Returns true if the current word is ambiguous (by number of answers or\n  // by dangerous ambigs.)\n  bool IsAmbiguous();\n\n  // Returns true if the ratings matrix size matches the sum of each of the\n  // segmentation states.\n  bool StatesAllValid();\n\n  // Prints a list of words found if debug is true or the word result matches\n  // the word_to_debug.\n  void DebugWordChoices(bool debug, const char *word_to_debug);\n\n  // Prints the top choice along with the accepted/done flags.\n  void DebugTopChoice(const char *msg) const;\n\n  // Removes from best_choices all choices which are not within a reasonable\n  // range of the best choice.\n  void FilterWordChoices(int debug_level);\n\n  // Computes a set of distance thresholds used to control adaption.\n  // Compares the best choice for the current word to the best raw choice\n  // to determine which characters were classified incorrectly by the\n  // classifier. Then places a separate threshold into thresholds for each\n  // character in the word. If the classifier was correct, max_rating is placed\n  // into thresholds. If the classifier was incorrect, the mean match rating\n  // (error percentage) of the classifier's incorrect choice minus some margin\n  // is placed into thresholds. This can then be used by the caller to try to\n  // create a new template for the desired class that will classify the\n  // character with a rating better than the threshold value. 
The match rating\n  // placed into thresholds is never allowed to be below min_rating in order to\n  // prevent trying to make overly tight templates.\n  // min_rating limits how tight to make a template.\n  // max_rating limits how loose to make a template.\n  // rating_margin denotes the amount of margin to put in template.\n  void ComputeAdaptionThresholds(float certainty_scale, float min_rating,\n                                 float max_rating, float rating_margin,\n                                 float *thresholds);\n\n  // Saves a copy of the word_choice if it has the best unadjusted rating.\n  // Returns true if the word_choice was the new best.\n  bool LogNewRawChoice(WERD_CHOICE *word_choice);\n  // Consumes word_choice by adding it to best_choices, (taking ownership) if\n  // the certainty for word_choice is some distance of the best choice in\n  // best_choices, or by deleting the word_choice and returning false.\n  // The best_choices list is kept in sorted order by rating. Duplicates are\n  // removed, and the list is kept no longer than max_num_choices in length.\n  // Returns true if the word_choice is still a valid pointer.\n  bool LogNewCookedChoice(int max_num_choices, bool debug,\n                          WERD_CHOICE *word_choice);\n\n  // Prints a brief list of all the best choices.\n  void PrintBestChoices() const;\n\n  // Returns the sum of the widths of the blob between start_blob and last_blob\n  // inclusive.\n  int GetBlobsWidth(int start_blob, int last_blob) const;\n  // Returns the width of a gap between the specified blob and the next one.\n  int GetBlobsGap(unsigned blob_index) const;\n\n  // Returns the BLOB_CHOICE corresponding to the given index in the\n  // best choice word taken from the appropriate cell in the ratings MATRIX.\n  // Borrowed pointer, so do not delete. 
May return nullptr if there is no\n  // BLOB_CHOICE matching the unichar_id at the given index.\n  BLOB_CHOICE *GetBlobChoice(unsigned index) const;\n\n  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the\n  // best choice word taken from the appropriate cell in the ratings MATRIX.\n  // Borrowed pointer, so do not delete.\n  BLOB_CHOICE_LIST *GetBlobChoices(int index) const;\n\n  // Moves the results fields from word to this. This takes ownership of all\n  // the data, so src can be destructed.\n  // word1.ConsumeWordResult(word);\n  // delete word;\n  // is simpler and faster than:\n  // word1 = *word;\n  // delete word;\n  // as it doesn't need to copy and reallocate anything.\n  void ConsumeWordResults(WERD_RES *word);\n\n  // Replace the best choice and rebuild box word.\n  // choice must be from the current best_choices list.\n  void ReplaceBestChoice(WERD_CHOICE *choice);\n\n  // Builds the rebuild_word and sets the best_state from the chopped_word and\n  // the best_choice->state.\n  void RebuildBestState();\n\n  // Copies the chopped_word to the rebuild_word, faking a best_state as well.\n  // Also sets up the output box_word.\n  void CloneChoppedToRebuild();\n\n  // Sets/replaces the box_word with one made from the rebuild_word.\n  void SetupBoxWord();\n\n  // Sets up the script positions in the best_choice using the best_choice\n  // to get the unichars, and the unicharset to get the target positions.\n  void SetScriptPositions();\n  // Sets all the blobs in all the words (best choice and alternates) to be\n  // the given position. 
(When a sub/superscript is recognized as a separate\n  // word, it falls victim to the rule that a whole word cannot be sub or\n  // superscript, so this function overrides that problem.)\n  void SetAllScriptPositions(tesseract::ScriptPos position);\n\n  // Classifies the word with some already-calculated BLOB_CHOICEs.\n  // The choices are an array of blob_count pointers to BLOB_CHOICE,\n  // providing a single classifier result for each blob.\n  // The BLOB_CHOICEs are consumed and the word takes ownership.\n  // The number of blobs in the box_word must match blob_count.\n  void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices);\n\n  // Creates a WERD_CHOICE for the word using the top choices from the leading\n  // diagonal of the ratings matrix.\n  void FakeWordFromRatings(PermuterType permuter);\n\n  // Copies the best_choice strings to the correct_text for adaption/training.\n  void BestChoiceToCorrectText();\n\n  // Merges 2 adjacent blobs in the result if the permanent callback\n  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent\n  // callback box_cb is nullptr or returns true, setting the merged blob\n  // result to the class returned from class_cb.\n  // Returns true if anything was merged.\n  bool ConditionalBlobMerge(\n      const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,\n      const std::function<bool(const TBOX &, const TBOX &)> &box_cb);\n\n  // Merges 2 adjacent blobs in the result (index and index+1) and corrects\n  // all the data to account for the change.\n  void MergeAdjacentBlobs(unsigned index);\n\n  // Callback helper for fix_quotes returns a double quote if both\n  // arguments are quote, otherwise INVALID_UNICHAR_ID.\n  UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);\n  void fix_quotes();\n\n  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both\n  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.\n  UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);\n  // 
Callback helper for fix_hyphens returns true if box1 and box2 overlap\n  // (assuming both on the same textline, are in order and a chopped em dash.)\n  bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2);\n  void fix_hyphens();\n\n  // Callback helper for merge_tess_fails returns a space if both\n  // arguments are space, otherwise INVALID_UNICHAR_ID.\n  UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);\n  void merge_tess_fails();\n\n  // Returns a really deep copy of *src, including the ratings MATRIX.\n  static WERD_RES *deep_copy(const WERD_RES *src) {\n    auto *result = new WERD_RES(*src);\n    // That didn't copy the ratings, but we want a copy if there is one to\n    // begin with.\n    if (src->ratings != nullptr) {\n      result->ratings = src->ratings->DeepCopy();\n    }\n    return result;\n  }\n\n  // Copy blobs from word_res onto this word (eliminating spaces between).\n  // Since this may be called bidirectionally OR both the BOL and EOL flags.\n  void copy_on(WERD_RES *word_res) { // from this word\n    word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));\n    word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));\n    word->copy_on(word_res->word);\n  }\n\n  // Returns true if the collection of count pieces, starting at start, are all\n  // natural connected components, ie there are no real chops involved.\n  bool PiecesAllNatural(int start, int count) const;\n};\n\n/*************************************************************************\n * PAGE_RES_IT - Page results iterator\n *************************************************************************/\n\nclass TESS_API PAGE_RES_IT {\npublic:\n  PAGE_RES *page_res; // page being iterated\n\n  PAGE_RES_IT() = default;\n\n  PAGE_RES_IT(PAGE_RES *the_page_res) { // page result\n    page_res = the_page_res;\n    restart_page(); // ready to scan\n  }\n\n  // Do two PAGE_RES_ITs point at the same word?\n  // This is much cheaper than cmp().\n  bool 
operator==(const PAGE_RES_IT &other) const {\n    return word_res == other.word_res && row_res == other.row_res &&\n           block_res == other.block_res;\n  }\n\n  bool operator!=(const PAGE_RES_IT &other) const {\n    return !(*this == other);\n  }\n\n  // Given another PAGE_RES_IT to the same page,\n  //  this before other:     -1\n  //  this equal to other:    0\n  //  this later than other:  1\n  int cmp(const PAGE_RES_IT &other) const;\n\n  WERD_RES *restart_page() {\n    return start_page(false); // Skip empty blocks.\n  }\n  WERD_RES *restart_page_with_empties() {\n    return start_page(true); // Allow empty blocks.\n  }\n  WERD_RES *start_page(bool empty_ok);\n\n  WERD_RES *restart_row();\n\n  // ============ Methods that mutate the underling structures ===========\n  // Note that these methods will potentially invalidate other PAGE_RES_ITs\n  // and are intended to be used only while a single PAGE_RES_IT is  active.\n  // This problem needs to be taken into account if these mutation operators\n  // are ever provided to PageIterator or its subclasses.\n\n  // Inserts the new_word and a corresponding WERD_RES before the current\n  // position. The simple fields of the WERD_RES are copied from clone_res and\n  // the resulting WERD_RES is returned for further setup with best_choice etc.\n  WERD_RES *InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word);\n\n  // Replaces the current WERD/WERD_RES with the given words. The given words\n  // contain fake blobs that indicate the position of the characters. These are\n  // replaced with real blobs from the current word as much as possible.\n  void ReplaceCurrentWord(PointerVector<WERD_RES> *words);\n\n  // Deletes the current WERD_RES and its underlying WERD.\n  void DeleteCurrentWord();\n\n  // Makes the current word a fuzzy space if not already fuzzy. 
Updates\n  // corresponding part of combo if required.\n  void MakeCurrentWordFuzzy();\n\n  WERD_RES *forward() { // Get next word.\n    return internal_forward(false, false);\n  }\n  // Move forward, but allow empty blocks to show as single nullptr words.\n  WERD_RES *forward_with_empties() {\n    return internal_forward(false, true);\n  }\n\n  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph\n  WERD_RES *forward_block();     // get first word in next non-empty block\n\n  WERD_RES *prev_word() const { // previous word\n    return prev_word_res;\n  }\n  ROW_RES *prev_row() const { // row of prev word\n    return prev_row_res;\n  }\n  BLOCK_RES *prev_block() const { // block of prev word\n    return prev_block_res;\n  }\n  WERD_RES *word() const { // current word\n    return word_res;\n  }\n  ROW_RES *row() const { // row of current word\n    return row_res;\n  }\n  BLOCK_RES *block() const { // block of cur. word\n    return block_res;\n  }\n  WERD_RES *next_word() const { // next word\n    return next_word_res;\n  }\n  ROW_RES *next_row() const { // row of next word\n    return next_row_res;\n  }\n  BLOCK_RES *next_block() const { // block of next word\n    return next_block_res;\n  }\n  void rej_stat_word(); // for page/block/row\n  void ResetWordIterator();\n\nprivate:\n  WERD_RES *internal_forward(bool new_block, bool empty_ok);\n\n  WERD_RES *prev_word_res;   // previous word\n  ROW_RES *prev_row_res;     // row of prev word\n  BLOCK_RES *prev_block_res; // block of prev word\n\n  WERD_RES *word_res;   // current word\n  ROW_RES *row_res;     // row of current word\n  BLOCK_RES *block_res; // block of cur. 
word\n\n  WERD_RES *next_word_res;   // next word\n  ROW_RES *next_row_res;     // row of next word\n  BLOCK_RES *next_block_res; // block of next word\n\n  BLOCK_RES_IT block_res_it; // iterators\n  ROW_RES_IT row_res_it;\n  WERD_RES_IT word_res_it;\n  // Iterators used to get the state of word_res_it for the current word.\n  // Since word_res_it is 2 words further on, this is otherwise hard to do.\n  WERD_RES_IT wr_it_of_current_word;\n  WERD_RES_IT wr_it_of_next_word;\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/params_training_featdef.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        params_training_featdef.cpp\n// Description: Utility functions for params training features.\n// Author:      David Eger\n// Created:     Mon Jun 11 11:26:42 PDT 2012\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <cstring>\n\n#include \"params_training_featdef.h\"\n\nnamespace tesseract {\n\nint ParamsTrainingFeatureByName(const char *name) {\n  if (name == nullptr) {\n    return -1;\n  }\n  int array_size =\n      sizeof(kParamsTrainingFeatureTypeName) / sizeof(kParamsTrainingFeatureTypeName[0]);\n  for (int i = 0; i < array_size; i++) {\n    if (kParamsTrainingFeatureTypeName[i] == nullptr) {\n      continue;\n    }\n    if (strcmp(name, kParamsTrainingFeatureTypeName[i]) == 0) {\n      return i;\n    }\n  }\n  return -1;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/params_training_featdef.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        params_training_featdef.h\n// Description: Feature definitions for params training.\n// Author:      Rika Antonova\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_\n#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_\n\n#include <cstring> // for memset\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\n// Maximum number of unichars in the small and medium sized words\nstatic const int kMaxSmallWordUnichars = 3;\nstatic const int kMaxMediumWordUnichars = 6;\n\n// Raw features extracted from a single OCR hypothesis.\n// The features are normalized (by outline length or number of unichars as\n// appropriate) real-valued quantities with unbounded range and\n// unknown distribution.\n// Normalization / binarization of these features is done at a later stage.\n// Note: when adding new fields to this enum make sure to modify\n// kParamsTrainingFeatureTypeName\nenum kParamsTrainingFeatureType {\n  // Digits\n  PTRAIN_DIGITS_SHORT, // 0\n  PTRAIN_DIGITS_MED,   // 1\n  PTRAIN_DIGITS_LONG,  // 2\n  // Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)\n  PTRAIN_NUM_SHORT, // 3\n  PTRAIN_NUM_MED,   // 4\n  PTRAIN_NUM_LONG,  // 5\n  // Document word (DOC_DAWG_PERM)\n  PTRAIN_DOC_SHORT, // 
6\n  PTRAIN_DOC_MED,   // 7\n  PTRAIN_DOC_LONG,  // 8\n  // Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)\n  PTRAIN_DICT_SHORT, // 9\n  PTRAIN_DICT_MED,   // 10\n  PTRAIN_DICT_LONG,  // 11\n  // Frequent word (FREQ_DAWG_PERM)\n  PTRAIN_FREQ_SHORT,          // 12\n  PTRAIN_FREQ_MED,            // 13\n  PTRAIN_FREQ_LONG,           // 14\n  PTRAIN_SHAPE_COST_PER_CHAR, // 15\n  PTRAIN_NGRAM_COST_PER_CHAR, // 16\n  PTRAIN_NUM_BAD_PUNC,        // 17\n  PTRAIN_NUM_BAD_CASE,        // 18\n  PTRAIN_XHEIGHT_CONSISTENCY, // 19\n  PTRAIN_NUM_BAD_CHAR_TYPE,   // 20\n  PTRAIN_NUM_BAD_SPACING,     // 21\n  PTRAIN_NUM_BAD_FONT,        // 22\n  PTRAIN_RATING_PER_CHAR,     // 23\n\n  PTRAIN_NUM_FEATURE_TYPES\n};\n\nstatic const char *const kParamsTrainingFeatureTypeName[] = {\n    \"PTRAIN_DIGITS_SHORT\",        // 0\n    \"PTRAIN_DIGITS_MED\",          // 1\n    \"PTRAIN_DIGITS_LONG\",         // 2\n    \"PTRAIN_NUM_SHORT\",           // 3\n    \"PTRAIN_NUM_MED\",             // 4\n    \"PTRAIN_NUM_LONG\",            // 5\n    \"PTRAIN_DOC_SHORT\",           // 6\n    \"PTRAIN_DOC_MED\",             // 7\n    \"PTRAIN_DOC_LONG\",            // 8\n    \"PTRAIN_DICT_SHORT\",          // 9\n    \"PTRAIN_DICT_MED\",            // 10\n    \"PTRAIN_DICT_LONG\",           // 11\n    \"PTRAIN_FREQ_SHORT\",          // 12\n    \"PTRAIN_FREQ_MED\",            // 13\n    \"PTRAIN_FREQ_LONG\",           // 14\n    \"PTRAIN_SHAPE_COST_PER_CHAR\", // 15\n    \"PTRAIN_NGRAM_COST_PER_CHAR\", // 16\n    \"PTRAIN_NUM_BAD_PUNC\",        // 17\n    \"PTRAIN_NUM_BAD_CASE\",        // 18\n    \"PTRAIN_XHEIGHT_CONSISTENCY\", // 19\n    \"PTRAIN_NUM_BAD_CHAR_TYPE\",   // 20\n    \"PTRAIN_NUM_BAD_SPACING\",     // 21\n    \"PTRAIN_NUM_BAD_FONT\",        // 22\n    \"PTRAIN_RATING_PER_CHAR\",     // 23\n};\n\n// Returns the index of the given feature (by name),\n// or -1 meaning the feature is unknown.\nint ParamsTrainingFeatureByName(const char *name);\n\n// Entry with features extracted from a 
single OCR hypothesis for a word.\nstruct ParamsTrainingHypothesis {\n  ParamsTrainingHypothesis() : cost(0.0) {\n    memset(features, 0, sizeof(features));\n  }\n  ParamsTrainingHypothesis(const ParamsTrainingHypothesis &other) {\n    memcpy(features, other.features, sizeof(features));\n    str = other.str;\n    cost = other.cost;\n  }\n  ParamsTrainingHypothesis &operator=(const ParamsTrainingHypothesis &other) {\n    memcpy(features, other.features, sizeof(features));\n    str = other.str;\n    cost = other.cost;\n    return *this;\n  }\n  std::string str; // string corresponding to word hypothesis (for debugging)\n  float features[PTRAIN_NUM_FEATURE_TYPES];\n  float cost; // path cost computed by segsearch\n};\n\n// A list of hypotheses explored during one run of segmentation search.\nusing ParamsTrainingHypothesisList = std::vector<ParamsTrainingHypothesis>;\n\n// A bundle that accumulates all of the hypothesis lists explored during all\n// of the runs of segmentation search on a word (e.g. a list of hypotheses\n// explored on PASS1, PASS2, fix xheight pass, etc).\nclass ParamsTrainingBundle {\npublic:\n  ParamsTrainingBundle() = default;\n  // Starts a new hypothesis list.\n  // Should be called at the beginning of a new run of the segmentation search.\n  void StartHypothesisList() {\n    hyp_list_vec.emplace_back();\n  }\n  // Adds a new ParamsTrainingHypothesis to the current hypothesis list\n  // and returns the reference to the newly added entry.\n  ParamsTrainingHypothesis &AddHypothesis(const ParamsTrainingHypothesis &other) {\n    if (hyp_list_vec.empty()) {\n      StartHypothesisList();\n    }\n    hyp_list_vec.back().push_back(ParamsTrainingHypothesis(other));\n    return hyp_list_vec.back().back();\n  }\n\n  std::vector<ParamsTrainingHypothesisList> hyp_list_vec;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_\n"
  },
  {
    "path": "src/ccstruct/pdblock.cpp",
    "content": "/**********************************************************************\n * File:        pdblock.cpp\n * Description: PDBLK member functions and iterator functions.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"pdblock.h\"\n\n#include <allheaders.h>\n\n#include <cinttypes> // for PRId32\n#include <cstdlib>\n#include <memory> // std::unique_ptr\n\nnamespace tesseract {\n\n#define BLOCK_LABEL_HEIGHT 150 // char height of block id\n\nconstexpr ERRCODE BADBLOCKLINE(\"Y coordinate in block out of bounds\");\nconstexpr ERRCODE LOSTBLOCKLINE(\"Can't find rectangle for line\");\n\n/**********************************************************************\n * PDBLK::PDBLK\n *\n * Constructor for a simple rectangular block.\n **********************************************************************/\nPDBLK::PDBLK(                   // rectangular block\n    TDimension xmin,            // bottom left\n    TDimension ymin,\n    TDimension xmax,            // top right\n    TDimension ymax)\n    : box(ICOORD(xmin, ymin), ICOORD(xmax, ymax)) {\n  // boundaries\n  ICOORDELT_IT left_it = &leftside;\n  ICOORDELT_IT right_it = &rightside;\n\n  hand_poly = 
nullptr;\n  left_it.set_to_list(&leftside);\n  right_it.set_to_list(&rightside);\n  // make default box\n  left_it.add_to_end(new ICOORDELT(xmin, ymin));\n  left_it.add_to_end(new ICOORDELT(xmin, ymax));\n  right_it.add_to_end(new ICOORDELT(xmax, ymin));\n  right_it.add_to_end(new ICOORDELT(xmax, ymax));\n  index_ = 0;\n}\n\n/**********************************************************************\n * PDBLK::set_sides\n *\n * Sets left and right vertex lists\n **********************************************************************/\n\nvoid PDBLK::set_sides(    // set vertex lists\n    ICOORDELT_LIST *left, // left vertices\n    ICOORDELT_LIST *right // right vertices\n) {\n  // boundaries\n  ICOORDELT_IT left_it = &leftside;\n  ICOORDELT_IT right_it = &rightside;\n\n  leftside.clear();\n  left_it.move_to_first();\n  left_it.add_list_before(left);\n  rightside.clear();\n  right_it.move_to_first();\n  right_it.add_list_before(right);\n}\n\n/**********************************************************************\n * PDBLK::contains\n *\n * Return true if the given point is within the block.\n **********************************************************************/\n\nbool PDBLK::contains( // test containment\n    ICOORD pt         // point to test\n) {\n  BLOCK_RECT_IT it = this; // rectangle iterator\n  ICOORD bleft, tright;    // corners of rectangle\n\n  for (it.start_block(); !it.cycled_rects(); it.forward()) {\n    // get rectangle\n    it.bounding_box(bleft, tright);\n    // inside rect\n    if (pt.x() >= bleft.x() && pt.x() <= tright.x() && pt.y() >= bleft.y() &&\n        pt.y() <= tright.y()) {\n      return true; // is inside\n    }\n  }\n  return false; // not inside\n}\n\n/**********************************************************************\n * PDBLK::move\n *\n * Reposition block\n **********************************************************************/\n\nvoid PDBLK::move(    // reposition block\n    const ICOORD vec // by vector\n) {\n  ICOORDELT_IT 
it(&leftside);\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    *(it.data()) += vec;\n  }\n\n  it.set_to_list(&rightside);\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    *(it.data()) += vec;\n  }\n\n  box.move(vec);\n}\n\n// Returns a binary Pix mask with a 1 pixel for every pixel within the\n// block. Rotates the coordinate system by rerotation prior to rendering.\nImage PDBLK::render_mask(const FCOORD &rerotation, TBOX *mask_box) {\n  TBOX rotated_box(box);\n  rotated_box.rotate(rerotation);\n  Image pix = pixCreate(rotated_box.width(), rotated_box.height(), 1);\n  if (hand_poly != nullptr) {\n    // We are going to rotate, so get a deep copy of the points and\n    // make a new POLY_BLOCK with it.\n    ICOORDELT_LIST polygon;\n    polygon.deep_copy(hand_poly->points(), ICOORDELT::deep_copy);\n    POLY_BLOCK image_block(&polygon, hand_poly->isA());\n    image_block.rotate(rerotation);\n    // Block outline is a polygon, so use a PB_LINE_IT to get the\n    // rasterized interior. 
(Runs of interior pixels on a line.)\n    auto *lines = new PB_LINE_IT(&image_block);\n    for (int y = box.bottom(); y < box.top(); ++y) {\n      const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments(lines->get_line(y));\n      if (!segments->empty()) {\n        ICOORDELT_IT s_it(segments.get());\n        // Each element of segments is a start x and x size of the\n        // run of interior pixels.\n        for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) {\n          int start = s_it.data()->x();\n          int xext = s_it.data()->y();\n          // Set the run of pixels to 1.\n          pixRasterop(pix, start - rotated_box.left(),\n                      rotated_box.height() - 1 - (y - rotated_box.bottom()), xext, 1, PIX_SET,\n                      nullptr, 0, 0);\n        }\n      }\n    }\n    delete lines;\n  } else {\n    // Just fill the whole block as there is only a bounding box.\n    pixRasterop(pix, 0, 0, rotated_box.width(), rotated_box.height(), PIX_SET, nullptr, 0, 0);\n  }\n  if (mask_box != nullptr) {\n    *mask_box = rotated_box;\n  }\n  return pix;\n}\n\n/**********************************************************************\n * PDBLK::plot\n *\n * Plot the outline of a block in the given colour.\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid PDBLK::plot(            // draw outline\n    ScrollView *window,      // window to draw in\n    int32_t serial,          // serial number\n    ScrollView::Color colour // colour to draw in\n) {\n  ICOORD startpt;              // start of outline\n  ICOORD endpt;                // end of outline\n  ICOORD prevpt;               // previous point\n  ICOORDELT_IT it = &leftside; // iterator\n\n  // set the colour\n  window->Pen(colour);\n  window->TextAttributes(\"Times\", BLOCK_LABEL_HEIGHT, false, false, false);\n\n  if (hand_poly != nullptr) {\n    hand_poly->plot(window, serial);\n  } else if (!leftside.empty()) {\n    startpt 
= *(it.data()); // bottom left corner\n    //              tprintf(\"Block %d bottom left is (%d,%d)\\n\",\n    //                      serial,startpt.x(),startpt.y());\n    char temp_buff[34];\n#  if !defined(_WIN32) || defined(__MINGW32__)\n    snprintf(temp_buff, sizeof(temp_buff), \"%\" PRId32, serial);\n#  else\n    _ultoa(serial, temp_buff, 10);\n#  endif\n    window->Text(startpt.x(), startpt.y(), temp_buff);\n\n    window->SetCursor(startpt.x(), startpt.y());\n    do {\n      prevpt = *(it.data()); // previous point\n      it.forward();          // move to next point\n                             // draw round corner\n      window->DrawTo(prevpt.x(), it.data()->y());\n      window->DrawTo(it.data()->x(), it.data()->y());\n    } while (!it.at_last()); // until end of list\n    endpt = *(it.data());    // end point\n\n    // other side of boundary\n    window->SetCursor(startpt.x(), startpt.y());\n    it.set_to_list(&rightside);\n    prevpt = startpt;\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      // draw round corner\n      window->DrawTo(prevpt.x(), it.data()->y());\n      window->DrawTo(it.data()->x(), it.data()->y());\n      prevpt = *(it.data()); // previous point\n    }\n    // close boundary\n    window->DrawTo(endpt.x(), endpt.y());\n  }\n}\n#endif\n\n/**********************************************************************\n * PDBLK::operator=\n *\n * Assignment - duplicate the block structure, but with an EMPTY row list.\n **********************************************************************/\n\nPDBLK &PDBLK::operator=( // assignment\n    const PDBLK &source  // from this\n) {\n  //      this->ELIST_LINK::operator=(source);\n  if (!leftside.empty()) {\n    leftside.clear();\n  }\n  if (!rightside.empty()) {\n    rightside.clear();\n  }\n  leftside.deep_copy(&source.leftside, &ICOORDELT::deep_copy);\n  rightside.deep_copy(&source.rightside, &ICOORDELT::deep_copy);\n  box = source.box;\n  return 
*this;\n}\n\n/**********************************************************************\n * BLOCK_RECT_IT::BLOCK_RECT_IT\n *\n * Construct a block rectangle iterator.\n **********************************************************************/\n\nBLOCK_RECT_IT::BLOCK_RECT_IT(\n    // iterate rectangles\n    PDBLK *blkptr // from block\n    )\n    : left_it(&blkptr->leftside), right_it(&blkptr->rightside) {\n  block = blkptr; // remember block\n                  // non empty list\n  if (!blkptr->leftside.empty()) {\n    start_block(); // ready for iteration\n  }\n}\n\n/**********************************************************************\n * BLOCK_RECT_IT::set_to_block\n *\n * Start a new block.\n **********************************************************************/\n\nvoid BLOCK_RECT_IT::set_to_block( // start (new) block\n    PDBLK *blkptr) {              // block to start\n  block = blkptr;                 // remember block\n                                  // set iterators\n  left_it.set_to_list(&blkptr->leftside);\n  right_it.set_to_list(&blkptr->rightside);\n  if (!blkptr->leftside.empty()) {\n    start_block(); // ready for iteration\n  }\n}\n\n/**********************************************************************\n * BLOCK_RECT_IT::start_block\n *\n * Restart a block.\n **********************************************************************/\n\nvoid BLOCK_RECT_IT::start_block() { // start (new) block\n  left_it.move_to_first();\n  right_it.move_to_first();\n  left_it.mark_cycle_pt();\n  right_it.mark_cycle_pt();\n  ymin = left_it.data()->y(); // bottom of first box\n  ymax = left_it.data_relative(1)->y();\n  if (right_it.data_relative(1)->y() < ymax) {\n    // smallest step\n    ymax = right_it.data_relative(1)->y();\n  }\n}\n\n/**********************************************************************\n * BLOCK_RECT_IT::forward\n *\n * Move to the next rectangle in the block.\n **********************************************************************/\n\nvoid 
BLOCK_RECT_IT::forward() { // next rectangle\n  if (!left_it.empty()) {       // non-empty list\n    if (left_it.data_relative(1)->y() == ymax) {\n      left_it.forward(); // move to meet top\n    }\n    if (right_it.data_relative(1)->y() == ymax) {\n      right_it.forward();\n    }\n    // last is special\n    if (left_it.at_last() || right_it.at_last()) {\n      left_it.move_to_first(); // restart\n      right_it.move_to_first();\n      // now at bottom\n      ymin = left_it.data()->y();\n    } else {\n      ymin = ymax; // new bottom\n    }\n    // next point\n    ymax = left_it.data_relative(1)->y();\n    if (right_it.data_relative(1)->y() < ymax) {\n      // least step forward\n      ymax = right_it.data_relative(1)->y();\n    }\n  }\n}\n\n/**********************************************************************\n * BLOCK_LINE_IT::get_line\n *\n * Get the start and width of a line in the block.\n **********************************************************************/\n\nTDimension BLOCK_LINE_IT::get_line( // get a line\n    TDimension y,                   // line to get\n    TDimension &xext                // output extent\n) {\n  ICOORD bleft;  // bounding box\n  ICOORD tright; // of block & rect\n\n  // get block box\n  block->bounding_box(bleft, tright);\n  if (y < bleft.y() || y >= tright.y()) {\n    //              block->print(stderr,false);\n    BADBLOCKLINE.error(\"BLOCK_LINE_IT::get_line\", ABORT, \"Y=%d\", y);\n  }\n\n  // get rectangle box\n  rect_it.bounding_box(bleft, tright);\n  // inside rectangle\n  if (y >= bleft.y() && y < tright.y()) {\n    // width of line\n    xext = tright.x() - bleft.x();\n    return bleft.x(); // start of line\n  }\n  for (rect_it.start_block(); !rect_it.cycled_rects(); rect_it.forward()) {\n    // get rectangle box\n    rect_it.bounding_box(bleft, tright);\n    // inside rectangle\n    if (y >= bleft.y() && y < tright.y()) {\n      // width of line\n      xext = tright.x() - bleft.x();\n      return bleft.x(); // start 
of line\n    }\n  }\n  LOSTBLOCKLINE.error(\"BLOCK_LINE_IT::get_line\", ABORT, \"Y=%d\", y);\n  return 0; // dummy to stop warning\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/pdblock.h",
    "content": "/**********************************************************************\n * File:        pdblock.h  (Formerly pdblk.h)\n * Description: Page block class definition.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef PDBLOCK_H\n#define PDBLOCK_H\n\n#include \"clst.h\"\n#include \"polyblk.h\"\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass PDBLK; // forward decl\n\nCLISTIZEH(PDBLK)\n/// page block\nclass PDBLK {\n  friend class BLOCK_RECT_IT; ///< block iterator\n  friend class BLOCK;         ///< Page Block\n\npublic:\n  /// empty constructor\n  PDBLK() {\n    hand_poly = nullptr;\n    index_ = 0;\n  }\n  /// simple constructor\n  PDBLK(TDimension xmin, ///< bottom left\n        TDimension ymin,\n        TDimension xmax, ///< top right\n        TDimension ymax);\n\n  /// set vertex lists\n  ///@param left list of left vertices\n  ///@param right list of right vertices\n  void set_sides(ICOORDELT_LIST *left, ICOORDELT_LIST *right);\n\n  /// destructor\n  ~PDBLK() {\n    delete hand_poly;\n  }\n\n  POLY_BLOCK *poly_block() const {\n    return hand_poly;\n  }\n  /// set the poly block\n  void set_poly_block(POLY_BLOCK *blk) {\n    hand_poly = blk;\n  }\n  /// get box\n  void bounding_box(ICOORD &bottom_left,       // bottom left\n                    ICOORD &top_right) const { // 
topright\n    bottom_left = box.botleft();\n    top_right = box.topright();\n  }\n  /// get real box\n  const TBOX &bounding_box() const {\n    return box;\n  }\n\n  int index() const {\n    return index_;\n  }\n  void set_index(int value) {\n    index_ = value;\n  }\n\n  /// is pt inside block\n  bool contains(ICOORD pt);\n\n  /// reposition block\n  void move(const ICOORD vec); // by vector\n\n  // Returns a binary Pix mask with a 1 pixel for every pixel within the\n  // block. Rotates the coordinate system by rerotation prior to rendering.\n  // If not nullptr, mask_box is filled with the position box of the returned\n  // mask image.\n  Image render_mask(const FCOORD &rerotation, TBOX *mask_box);\n\n#ifndef GRAPHICS_DISABLED\n  /// draw histogram\n  ///@param window window to draw in\n  ///@param serial serial number\n  ///@param colour colour to draw in\n  void plot(ScrollView *window, int32_t serial, ScrollView::Color colour);\n#endif // !GRAPHICS_DISABLED\n\n  /// assignment\n  ///@param source from this\n  PDBLK &operator=(const PDBLK &source);\n\nprotected:\n  POLY_BLOCK *hand_poly;    ///< weird as well\n  ICOORDELT_LIST leftside;  ///< left side vertices\n  ICOORDELT_LIST rightside; ///< right side vertices\n  TBOX box;                 ///< bounding box\n  int index_;               ///< Serial number of this block.\n};\n\nclass BLOCK_RECT_IT // rectangle iterator\n{\npublic:\n  /// constructor\n  ///@param blkptr block to iterate\n  BLOCK_RECT_IT(PDBLK *blkptr);\n\n  /// start (new) block\n  void set_to_block(PDBLK *blkptr); // block to iterate\n\n  /// start iteration\n  void start_block();\n\n  /// next rectangle\n  void forward();\n\n  /// test end\n  bool cycled_rects() const {\n    return left_it.cycled_list() && right_it.cycled_list();\n  }\n\n  /// current rectangle\n  ///@param bleft bottom left\n  ///@param tright top right\n  void bounding_box(ICOORD &bleft, ICOORD &tright) {\n    // bottom left\n    bleft = ICOORD(left_it.data()->x(), ymin);\n 
   // top right\n    tright = ICOORD(right_it.data()->x(), ymax);\n  }\n\nprivate:\n  TDimension ymin = 0;    ///< bottom of rectangle\n  TDimension ymax = 0;    ///< top of rectangle\n  PDBLK *block = nullptr; ///< block to iterate\n  ICOORDELT_IT left_it;   ///< boundary iterators\n  ICOORDELT_IT right_it;\n};\n\n/// rectangle iterator\nclass BLOCK_LINE_IT {\npublic:\n  /// constructor\n  ///@param blkptr from block\n  BLOCK_LINE_IT(PDBLK *blkptr) : rect_it(blkptr) {\n    block = blkptr; // remember block\n  }\n\n  /// start (new) block\n  ///@param blkptr block to start\n  void set_to_block(PDBLK *blkptr) {\n    block = blkptr; // remember block\n                    // set iterator\n    rect_it.set_to_block(blkptr);\n  }\n\n  /// get a line\n  ///@param y line to get\n  ///@param xext output extent\n  TDimension get_line(TDimension y, TDimension &xext);\n\nprivate:\n  PDBLK *block;          ///< block to iterate\n  BLOCK_RECT_IT rect_it; ///< rectangle iterator\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/points.cpp",
    "content": "/**********************************************************************\n * File:        points.cpp  (Formerly coords.c)\n * Description: Member functions for coordinate classes.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#define _USE_MATH_DEFINES // for M_PI\n\n#include \"points.h\"\n\n#include \"helpers.h\"\n#include \"serialis.h\"\n\n#include <algorithm>\n#include <cmath> // for M_PI\n#include <cstdlib>\n\nnamespace tesseract {\n\nbool FCOORD::normalise() { // Convert to unit vec\n  float len = length();\n\n  if (len < 0.0000000001) {\n    return false;\n  }\n  xcoord /= len;\n  ycoord /= len;\n  return true;\n}\n\nbool ICOORD::DeSerialize(TFile *f) {\n  return f->DeSerialize(&xcoord) && f->DeSerialize(&ycoord);\n}\n\nbool ICOORD::Serialize(TFile *f) const {\n  return f->Serialize(&xcoord) && f->Serialize(&ycoord);\n}\n\n// Set from the given x,y, shrinking the vector to fit if needed.\nvoid ICOORD::set_with_shrink(int x, int y) {\n  // Fit the vector into an ICOORD, which is 16 bit.\n  int factor = 1;\n  int max_extent = std::max(abs(x), abs(y));\n  if (max_extent > INT16_MAX) {\n    factor = max_extent / INT16_MAX + 1;\n  }\n  xcoord = x / factor;\n  ycoord = y / factor;\n}\n\n// The fortran/basic sgn function returns -1, 0, 1 if x < 0, x == 0, x > 0\n// respectively.\nstatic 
int sign(int x) {\n  if (x < 0) {\n    return -1;\n  } else {\n    return x > 0 ? 1 : 0;\n  }\n}\n\n// Writes to the given file. Returns false in case of error.\nbool ICOORD::Serialize(FILE *fp) const {\n  return tesseract::Serialize(fp, &xcoord) && tesseract::Serialize(fp, &ycoord);\n}\n// Reads from the given file. Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool ICOORD::DeSerialize(bool swap, FILE *fp) {\n  if (!tesseract::DeSerialize(fp, &xcoord)) {\n    return false;\n  }\n  if (!tesseract::DeSerialize(fp, &ycoord)) {\n    return false;\n  }\n  if (swap) {\n    ReverseN(&xcoord, sizeof(xcoord));\n    ReverseN(&ycoord, sizeof(ycoord));\n  }\n  return true;\n}\n\n// Setup for iterating over the pixels in a vector by the well-known\n// Bresenham rendering algorithm.\n// Starting with major/2 in the accumulator, on each step add major_step,\n// and then add minor to the accumulator. When the accumulator >= major\n// subtract major and step a minor step.\n\nvoid ICOORD::setup_render(ICOORD *major_step, ICOORD *minor_step, int *major, int *minor) const {\n  int abs_x = abs(xcoord);\n  int abs_y = abs(ycoord);\n  if (abs_x >= abs_y) {\n    // X-direction is major.\n    major_step->xcoord = sign(xcoord);\n    major_step->ycoord = 0;\n    minor_step->xcoord = 0;\n    minor_step->ycoord = sign(ycoord);\n    *major = abs_x;\n    *minor = abs_y;\n  } else {\n    // Y-direction is major.\n    major_step->xcoord = 0;\n    major_step->ycoord = sign(ycoord);\n    minor_step->xcoord = sign(xcoord);\n    minor_step->ycoord = 0;\n    *major = abs_y;\n    *minor = abs_x;\n  }\n}\n\n// Returns the standard feature direction corresponding to this.\n// See binary_angle_plus_pi below for a description of the direction.\nuint8_t FCOORD::to_direction() const {\n  return binary_angle_plus_pi(angle());\n}\n// Sets this with a unit vector in the given standard feature direction.\nvoid FCOORD::from_direction(uint8_t direction) {\n  
double radians = angle_from_direction(direction);\n  xcoord = cos(radians);\n  ycoord = sin(radians);\n}\n\n// Converts an angle in radians (from ICOORD::angle or FCOORD::angle) to a\n// standard feature direction as an unsigned angle in 256ths of a circle\n// measured anticlockwise from (-1, 0).\nuint8_t FCOORD::binary_angle_plus_pi(double radians) {\n  return Modulo(IntCastRounded((radians + M_PI) * 128.0 / M_PI), 256);\n}\n// Inverse of binary_angle_plus_pi returns an angle in radians for the\n// given standard feature direction.\ndouble FCOORD::angle_from_direction(uint8_t direction) {\n  return direction * M_PI / 128.0 - M_PI;\n}\n\n// Returns the point on the given line nearest to this, ie the point such\n// that the vector point->this is perpendicular to the line.\n// The line is defined as a line_point and a dir_vector for its direction.\nFCOORD FCOORD::nearest_pt_on_line(const FCOORD &line_point, const FCOORD &dir_vector) const {\n  FCOORD point_vector(*this - line_point);\n  // The dot product (%) is |dir_vector||point_vector|cos theta, so dividing by\n  // the square of the length of dir_vector gives us the fraction of dir_vector\n  // to add to line1 to get the appropriate point, so\n  // result = line1 + lambda dir_vector.\n  double lambda = point_vector % dir_vector / dir_vector.sqlength();\n  return line_point + (dir_vector * lambda);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/points.h",
    "content": "/**********************************************************************\n * File:        points.h  (Formerly coords.h)\n * Description: Coordinate class definitions.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef POINTS_H\n#define POINTS_H\n\n#include \"elst.h\"\n#include \"errcode.h\" // for ASSERT_HOST\n#include \"tesstypes.h\" // for TDimension\n\n#include <tesseract/export.h> // for DLLSYM\n\n#include <cmath> // for sqrt, atan2\n#include <cstdio>\n\nnamespace tesseract {\n\nclass FCOORD;\n\n/// integer coordinate\nclass ICOORD {\n  friend class FCOORD;\n\npublic:\n  /// empty constructor\n  ICOORD() {\n    xcoord = ycoord = 0; // default zero\n  }\n  /// constructor\n  ///@param xin x value\n  ///@param yin y value\n  ICOORD(TDimension xin, TDimension yin) {\n    xcoord = xin;\n    ycoord = yin;\n  }\n  /// destructor\n  ~ICOORD() = default;\n\n  bool DeSerialize(TFile *f);\n  bool Serialize(TFile *f) const;\n\n  /// access function\n  TDimension x() const {\n    return xcoord;\n  }\n  /// access_function\n  TDimension y() const {\n    return ycoord;\n  }\n\n  /// rewrite function\n  void set_x(TDimension xin) {\n    xcoord = xin; // write new value\n  }\n  /// rewrite function\n  void set_y(TDimension yin) { // value to set\n    ycoord = yin;\n  }\n\n  /// Set from the 
given x,y, shrinking the vector to fit if needed.\n  void set_with_shrink(int x, int y);\n\n  /// find sq length\n  float sqlength() const {\n    return static_cast<float>(xcoord * xcoord + ycoord * ycoord);\n  }\n\n  /// find length\n  float length() const {\n    return std::sqrt(sqlength());\n  }\n\n  /// sq dist between pts\n  float pt_to_pt_sqdist(const ICOORD &pt) const {\n    ICOORD gap;\n\n    gap.xcoord = xcoord - pt.xcoord;\n    gap.ycoord = ycoord - pt.ycoord;\n    return gap.sqlength();\n  }\n\n  /// Distance between pts\n  float pt_to_pt_dist(const ICOORD &pt) const {\n    return std::sqrt(pt_to_pt_sqdist(pt));\n  }\n\n  /// find angle\n  float angle() const {\n    return std::atan2(static_cast<float>(ycoord), static_cast<float>(xcoord));\n  }\n\n  /// test equality\n  bool operator==(const ICOORD &other) const {\n    return xcoord == other.xcoord && ycoord == other.ycoord;\n  }\n  /// test inequality\n  bool operator!=(const ICOORD &other) const {\n    return xcoord != other.xcoord || ycoord != other.ycoord;\n  }\n  /// rotate 90 deg anti\n  friend ICOORD operator!(const ICOORD &);\n  /// unary minus\n  friend ICOORD operator-(const ICOORD &);\n  /// add\n  friend ICOORD operator+(const ICOORD &, const ICOORD &);\n  /// add\n  friend ICOORD &operator+=(ICOORD &, const ICOORD &);\n  /// subtract\n  friend ICOORD operator-(const ICOORD &, const ICOORD &);\n  /// subtract\n  friend ICOORD &operator-=(ICOORD &, const ICOORD &);\n  /// scalar product\n  friend int32_t operator%(const ICOORD &, const ICOORD &);\n  /// cross product\n  friend int32_t operator*(const ICOORD &, const ICOORD &);\n  /// multiply\n  friend ICOORD operator*(const ICOORD &, TDimension);\n  /// multiply\n  friend ICOORD operator*(TDimension, const ICOORD &);\n  /// multiply\n  friend ICOORD &operator*=(ICOORD &, TDimension);\n  /// divide\n  friend ICOORD operator/(const ICOORD &, TDimension);\n  /// divide\n  friend ICOORD &operator/=(ICOORD &, TDimension);\n  /// rotate\n  
///@param vec by vector\n  void rotate(const FCOORD &vec);\n\n  /// Setup for iterating over the pixels in a vector by the well-known\n  /// Bresenham rendering algorithm.\n  /// Starting with major/2 in the accumulator, on each step move by\n  /// major_step, and then add minor to the accumulator. When\n  /// accumulator >= major subtract major and also move by minor_step.\n  void setup_render(ICOORD *major_step, ICOORD *minor_step, int *major, int *minor) const;\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp);\n\nprotected:\n  TDimension xcoord; ///< x value\n  TDimension ycoord; ///< y value\n};\n\nclass ICOORDELT : public ELIST<ICOORDELT>::LINK,\n                  public ICOORD\n// embedded coord list\n{\npublic:\n  /// empty constructor\n  ICOORDELT() = default;\n  /// constructor from ICOORD\n  ICOORDELT(ICOORD icoord) : ICOORD(icoord) {}\n  /// constructor\n  ///@param xin x value\n  ///@param yin y value\n  ICOORDELT(TDimension xin, TDimension yin) {\n    xcoord = xin;\n    ycoord = yin;\n  }\n\n  static ICOORDELT *deep_copy(const ICOORDELT *src) {\n    auto *elt = new ICOORDELT;\n    *elt = *src;\n    return elt;\n  }\n};\n\nELISTIZEH(ICOORDELT)\n\nclass TESS_API FCOORD {\npublic:\n  /// empty constructor\n  FCOORD() = default;\n  /// constructor\n  ///@param xvalue x value\n  ///@param yvalue y value\n  FCOORD(float xvalue, float yvalue) {\n    xcoord = xvalue; // set coords\n    ycoord = yvalue;\n  }\n  FCOORD(              // make from ICOORD\n      ICOORD icoord) { // coords to set\n    xcoord = icoord.xcoord;\n    ycoord = icoord.ycoord;\n  }\n\n  float x() const { // get coords\n    return xcoord;\n  }\n  float y() const {\n    return ycoord;\n  }\n  /// rewrite function\n  void set_x(float xin) {\n    xcoord = xin; // write new 
value\n  }\n  /// rewrite function\n  void set_y(float yin) { // value to set\n    ycoord = yin;\n  }\n\n  /// find sq length\n  float sqlength() const {\n    return xcoord * xcoord + ycoord * ycoord;\n  }\n\n  /// find length\n  float length() const {\n    return std::sqrt(sqlength());\n  }\n\n  /// sq dist between pts\n  float pt_to_pt_sqdist(const FCOORD &pt) const {\n    FCOORD gap;\n\n    gap.xcoord = xcoord - pt.xcoord;\n    gap.ycoord = ycoord - pt.ycoord;\n    return gap.sqlength();\n  }\n\n  /// Distance between pts\n  float pt_to_pt_dist(const FCOORD &pt) const {\n    return std::sqrt(pt_to_pt_sqdist(pt));\n  }\n\n  /// find angle\n  float angle() const {\n    return std::atan2(ycoord, xcoord);\n  }\n  // Returns the standard feature direction corresponding to this.\n  // See binary_angle_plus_pi below for a description of the direction.\n  uint8_t to_direction() const;\n  // Sets this with a unit vector in the given standard feature direction.\n  void from_direction(uint8_t direction);\n\n  // Converts an angle in radians (from ICOORD::angle or FCOORD::angle) to a\n  // standard feature direction as an unsigned angle in 256ths of a circle\n  // measured anticlockwise from (-1, 0).\n  static uint8_t binary_angle_plus_pi(double angle);\n  // Inverse of binary_angle_plus_pi returns an angle in radians for the\n  // given standard feature direction.\n  static double angle_from_direction(uint8_t direction);\n  // Returns the point on the given line nearest to this, ie the point such\n  // that the vector point->this is perpendicular to the line.\n  // The line is defined as a line_point and a dir_vector for its direction.\n  // dir_vector need not be a unit vector.\n  FCOORD nearest_pt_on_line(const FCOORD &line_point, const FCOORD &dir_vector) const;\n\n  /// Convert to unit vec\n  bool normalise();\n\n  /// test equality\n  bool operator==(const FCOORD &other) const {\n    return xcoord == other.xcoord && ycoord == other.ycoord;\n  }\n  /// test 
inequality\n  bool operator!=(const FCOORD &other) const {\n    return xcoord != other.xcoord || ycoord != other.ycoord;\n  }\n  /// rotate 90 deg anti\n  friend FCOORD operator!(const FCOORD &);\n  /// unary minus\n  friend FCOORD operator-(const FCOORD &);\n  /// add\n  friend FCOORD operator+(const FCOORD &, const FCOORD &);\n  /// add\n  friend FCOORD &operator+=(FCOORD &, const FCOORD &);\n  /// subtract\n  friend FCOORD operator-(const FCOORD &, const FCOORD &);\n  /// subtract\n  friend FCOORD &operator-=(FCOORD &, const FCOORD &);\n  /// scalar product\n  friend float operator%(const FCOORD &, const FCOORD &);\n  /// cross product\n  friend float operator*(const FCOORD &, const FCOORD &);\n  /// multiply\n  friend FCOORD operator*(const FCOORD &, float);\n  /// multiply\n  friend FCOORD operator*(float, const FCOORD &);\n\n  /// multiply\n  friend FCOORD &operator*=(FCOORD &, float);\n  /// divide\n  friend FCOORD operator/(const FCOORD &, float);\n  /// rotate\n  ///@param vec by vector\n  void rotate(const FCOORD vec);\n  // unrotate - undo a rotate(vec)\n  // @param vec by vector\n  void unrotate(const FCOORD &vec);\n  /// divide\n  friend FCOORD &operator/=(FCOORD &, float);\n\nprivate:\n  float xcoord; // 2 floating coords\n  float ycoord;\n};\n\n/**********************************************************************\n * operator!\n *\n * Rotate an ICOORD 90 degrees anticlockwise.\n **********************************************************************/\n\ninline ICOORD operator!( // rotate 90 deg anti\n    const ICOORD &src    // thing to rotate\n) {\n  ICOORD result; // output\n\n  result.xcoord = -src.ycoord;\n  result.ycoord = src.xcoord;\n  return result;\n}\n\n/**********************************************************************\n * operator-\n *\n * Unary minus of an ICOORD.\n **********************************************************************/\n\ninline ICOORD operator-( // unary minus\n    const ICOORD &src    // thing to minus\n) {\n  
ICOORD result; // output\n\n  result.xcoord = -src.xcoord;\n  result.ycoord = -src.ycoord;\n  return result;\n}\n\n/**********************************************************************\n * operator+\n *\n * Add 2 ICOORDS.\n **********************************************************************/\n\ninline ICOORD operator+( // sum vectors\n    const ICOORD &op1,   // operands\n    const ICOORD &op2) {\n  ICOORD sum; // result\n\n  sum.xcoord = op1.xcoord + op2.xcoord;\n  sum.ycoord = op1.ycoord + op2.ycoord;\n  return sum;\n}\n\n/**********************************************************************\n * operator+=\n *\n * Add 2 ICOORDS.\n **********************************************************************/\n\ninline ICOORD &operator+=( // sum vectors\n    ICOORD &op1,           // operands\n    const ICOORD &op2) {\n  op1.xcoord += op2.xcoord;\n  op1.ycoord += op2.ycoord;\n  return op1;\n}\n\n/**********************************************************************\n * operator-\n *\n * Subtract 2 ICOORDS.\n **********************************************************************/\n\ninline ICOORD operator-( // subtract vectors\n    const ICOORD &op1,   // operands\n    const ICOORD &op2) {\n  ICOORD sum; // result\n\n  sum.xcoord = op1.xcoord - op2.xcoord;\n  sum.ycoord = op1.ycoord - op2.ycoord;\n  return sum;\n}\n\n/**********************************************************************\n * operator-=\n *\n * Subtract 2 ICOORDS.\n **********************************************************************/\n\ninline ICOORD &operator-=( // subtract vectors\n    ICOORD &op1,           // operands\n    const ICOORD &op2) {\n  op1.xcoord -= op2.xcoord;\n  op1.ycoord -= op2.ycoord;\n  return op1;\n}\n\n/**********************************************************************\n * operator%\n *\n * Scalar product of 2 ICOORDS.\n **********************************************************************/\n\ninline int32_t operator%( // scalar product\n    const ICOORD &op1,    // 
operands\n    const ICOORD &op2) {\n  return op1.xcoord * op2.xcoord + op1.ycoord * op2.ycoord;\n}\n\n/**********************************************************************\n * operator*\n *\n * Cross product of 2 ICOORDS.\n **********************************************************************/\n\ninline int32_t operator*( // cross product\n    const ICOORD &op1,    // operands\n    const ICOORD &op2) {\n  return op1.xcoord * op2.ycoord - op1.ycoord * op2.xcoord;\n}\n\n/**********************************************************************\n * operator*\n *\n * Scalar multiply of an ICOORD.\n **********************************************************************/\n\ninline ICOORD operator*( // scalar multiply\n    const ICOORD &op1,   // operands\n    TDimension scale) {\n  ICOORD result; // output\n\n  result.xcoord = op1.xcoord * scale;\n  result.ycoord = op1.ycoord * scale;\n  return result;\n}\n\ninline ICOORD operator*( // scalar multiply\n    TDimension scale,\n    const ICOORD &op1 // operands\n) {\n  ICOORD result; // output\n\n  result.xcoord = op1.xcoord * scale;\n  result.ycoord = op1.ycoord * scale;\n  return result;\n}\n\n/**********************************************************************\n * operator*=\n *\n * Scalar multiply of an ICOORD.\n **********************************************************************/\n\ninline ICOORD &operator*=( // scalar multiply\n    ICOORD &op1,           // operands\n    TDimension scale) {\n  op1.xcoord *= scale;\n  op1.ycoord *= scale;\n  return op1;\n}\n\n/**********************************************************************\n * operator/\n *\n * Scalar divide of an ICOORD.\n **********************************************************************/\n\ninline ICOORD operator/( // scalar divide\n    const ICOORD &op1,   // operands\n    TDimension scale) {\n  ICOORD result; // output\n\n  result.xcoord = op1.xcoord / scale;\n  result.ycoord = op1.ycoord / scale;\n  return 
result;\n}\n\n/**********************************************************************\n * operator/=\n *\n * Scalar divide of an ICOORD.\n **********************************************************************/\n\ninline ICOORD &operator/=( // scalar divide\n    ICOORD &op1,           // operands\n    TDimension scale) {\n  op1.xcoord /= scale;\n  op1.ycoord /= scale;\n  return op1;\n}\n\n/**********************************************************************\n * ICOORD::rotate\n *\n * Rotate an ICOORD by the given (normalized) (cos,sin) vector.\n **********************************************************************/\n\ninline void ICOORD::rotate( // rotate by vector\n    const FCOORD &vec) {\n  auto tmp = static_cast<TDimension>(std::floor(xcoord * vec.x() - ycoord * vec.y() + 0.5f));\n  ycoord = static_cast<TDimension>(std::floor(ycoord * vec.x() + xcoord * vec.y() + 0.5f));\n  xcoord = tmp;\n}\n\n/**********************************************************************\n * operator!\n *\n * Rotate an FCOORD 90 degrees anticlockwise.\n **********************************************************************/\n\ninline FCOORD operator!( // rotate 90 deg anti\n    const FCOORD &src    // thing to rotate\n) {\n  FCOORD result; // output\n\n  result.xcoord = -src.ycoord;\n  result.ycoord = src.xcoord;\n  return result;\n}\n\n/**********************************************************************\n * operator-\n *\n * Unary minus of an FCOORD.\n **********************************************************************/\n\ninline FCOORD operator-( // unary minus\n    const FCOORD &src    // thing to minus\n) {\n  FCOORD result; // output\n\n  result.xcoord = -src.xcoord;\n  result.ycoord = -src.ycoord;\n  return result;\n}\n\n/**********************************************************************\n * operator+\n *\n * Add 2 FCOORDS.\n **********************************************************************/\n\ninline FCOORD operator+( // sum vectors\n    const FCOORD &op1,   
// operands\n    const FCOORD &op2) {\n  FCOORD sum; // result\n\n  sum.xcoord = op1.xcoord + op2.xcoord;\n  sum.ycoord = op1.ycoord + op2.ycoord;\n  return sum;\n}\n\n/**********************************************************************\n * operator+=\n *\n * Add 2 FCOORDS.\n **********************************************************************/\n\ninline FCOORD &operator+=( // sum vectors\n    FCOORD &op1,           // operands\n    const FCOORD &op2) {\n  op1.xcoord += op2.xcoord;\n  op1.ycoord += op2.ycoord;\n  return op1;\n}\n\n/**********************************************************************\n * operator-\n *\n * Subtract 2 FCOORDS.\n **********************************************************************/\n\ninline FCOORD operator-( // subtract vectors\n    const FCOORD &op1,   // operands\n    const FCOORD &op2) {\n  FCOORD sum; // result\n\n  sum.xcoord = op1.xcoord - op2.xcoord;\n  sum.ycoord = op1.ycoord - op2.ycoord;\n  return sum;\n}\n\n/**********************************************************************\n * operator-=\n *\n * Subtract 2 FCOORDS.\n **********************************************************************/\n\ninline FCOORD &operator-=( // subtract vectors\n    FCOORD &op1,           // operands\n    const FCOORD &op2) {\n  op1.xcoord -= op2.xcoord;\n  op1.ycoord -= op2.ycoord;\n  return op1;\n}\n\n/**********************************************************************\n * operator%\n *\n * Scalar product of 2 FCOORDS.\n **********************************************************************/\n\ninline float operator%( // scalar product\n    const FCOORD &op1,  // operands\n    const FCOORD &op2) {\n  return op1.xcoord * op2.xcoord + op1.ycoord * op2.ycoord;\n}\n\n/**********************************************************************\n * operator*\n *\n * Cross product of 2 FCOORDS.\n **********************************************************************/\n\ninline float operator*( // cross product\n    const FCOORD &op1,  // 
operands\n    const FCOORD &op2) {\n  return op1.xcoord * op2.ycoord - op1.ycoord * op2.xcoord;\n}\n\n/**********************************************************************\n * operator*\n *\n * Scalar multiply of an FCOORD.\n **********************************************************************/\n\ninline FCOORD operator*( // scalar multiply\n    const FCOORD &op1,   // operands\n    float scale) {\n  FCOORD result; // output\n\n  result.xcoord = op1.xcoord * scale;\n  result.ycoord = op1.ycoord * scale;\n  return result;\n}\n\ninline FCOORD operator*( // scalar multiply\n    float scale,\n    const FCOORD &op1 // operands\n) {\n  FCOORD result; // output\n\n  result.xcoord = op1.xcoord * scale;\n  result.ycoord = op1.ycoord * scale;\n  return result;\n}\n\n/**********************************************************************\n * operator*=\n *\n * Scalar multiply of an FCOORD.\n **********************************************************************/\n\ninline FCOORD &operator*=( // scalar multiply\n    FCOORD &op1,           // operands\n    float scale) {\n  op1.xcoord *= scale;\n  op1.ycoord *= scale;\n  return op1;\n}\n\n/**********************************************************************\n * operator/\n *\n * Scalar divide of an FCOORD.\n **********************************************************************/\n\ninline FCOORD operator/( // scalar divide\n    const FCOORD &op1,   // operands\n    float scale) {\n  FCOORD result; // output\n  ASSERT_HOST(scale != 0.0f);\n  result.xcoord = op1.xcoord / scale;\n  result.ycoord = op1.ycoord / scale;\n  return result;\n}\n\n/**********************************************************************\n * operator/=\n *\n * Scalar divide of an FCOORD.\n **********************************************************************/\n\ninline FCOORD &operator/=( // scalar divide\n    FCOORD &op1,           // operands\n    float scale) {\n  ASSERT_HOST(scale != 0.0f);\n  op1.xcoord /= scale;\n  op1.ycoord /= scale;\n  
return op1;\n}\n\n/**********************************************************************\n * rotate\n *\n * Rotate an FCOORD by the given (normalized) (cos,sin) vector.\n **********************************************************************/\n\ninline void FCOORD::rotate( // rotate by vector\n    const FCOORD vec) {\n  float tmp;\n\n  tmp = xcoord * vec.x() - ycoord * vec.y();\n  ycoord = ycoord * vec.x() + xcoord * vec.y();\n  xcoord = tmp;\n}\n\ninline void FCOORD::unrotate(const FCOORD &vec) {\n  rotate(FCOORD(vec.x(), -vec.y()));\n}\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/polyaprx.cpp",
    "content": "/**********************************************************************\n * File:        polyaprx.cpp\n * Description: Code for polygonal approximation from old edgeprog.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"polyaprx.h\"\n\n#include \"blobs.h\"   // for EDGEPT, TPOINT, VECTOR, TESSLINE\n#include \"coutln.h\"  // for C_OUTLINE\n#include \"errcode.h\" // for ASSERT_HOST\n#include \"mod128.h\"  // for DIR128\n#include \"params.h\"  // for BoolParam, BOOL_VAR\n#include \"points.h\"  // for ICOORD\n#include \"rect.h\"    // for TBOX\n#include \"tprintf.h\" // for tprintf\n\n#include <cstdint> // for INT16_MAX, int8_t\n\nnamespace tesseract {\n\n#define FASTEDGELENGTH 256\n\nstatic BOOL_VAR(poly_debug, false, \"Debug old poly\");\nstatic BOOL_VAR(poly_wide_objects_better, true,\n                \"More accurate approx on wide things\");\n\n#define fixed_dist 20  // really an int_variable\n#define approx_dist 15 // really an int_variable\n\nconst int par1 = 4500 / (approx_dist * approx_dist);\nconst int par2 = 6750 / (approx_dist * approx_dist);\n\n/**********************************************************************\n *cutline(first,last,area) straightens out a line by partitioning\n *and joining the ends by a straight line*\n 
**********************************************************************/\n\nstatic void cutline(       // recursive refine\n    EDGEPT *first,         // ends of line\n    EDGEPT *last, int area // area of object\n) {\n  EDGEPT *edge;     // current edge\n  TPOINT vecsum;    // vector sum\n  int vlen;         // approx length of vecsum\n  TPOINT vec;       // accumulated vector\n  EDGEPT *maxpoint; // worst point\n  int maxperp;      // max deviation\n  int perp;         // perp distance\n  int ptcount;      // no of points\n  int squaresum;    // sum of perps\n\n  edge = first; // start of line\n  if (edge->next == last) {\n    return; // simple line\n  }\n\n  // vector sum\n  vecsum.x = last->pos.x - edge->pos.x;\n  vecsum.y = last->pos.y - edge->pos.y;\n  if (vecsum.x == 0 && vecsum.y == 0) {\n    // special case\n    vecsum.x = -edge->prev->vec.x;\n    vecsum.y = -edge->prev->vec.y;\n  }\n  // absolute value\n  vlen = vecsum.x > 0 ? vecsum.x : -vecsum.x;\n  if (vecsum.y > vlen) {\n    vlen = vecsum.y; // maximum\n  } else if (-vecsum.y > vlen) {\n    vlen = -vecsum.y; // absolute value\n  }\n\n  vec.x = edge->vec.x; // accumulated vector\n  vec.y = edge->vec.y;\n  maxperp = 0; // none yet\n  squaresum = ptcount = 0;\n  edge = edge->next; // move to actual point\n  maxpoint = edge;   // in case there isn't one\n  do {\n    perp = vec.cross(vecsum); // get perp distance\n    if (perp != 0) {\n      perp *= perp; // squared deviation\n    }\n    squaresum += perp; // sum squares\n    ptcount++;         // count points\n    if (poly_debug) {\n      tprintf(\"Cutline:Final perp=%d\\n\", perp);\n    }\n    if (perp > maxperp) {\n      maxperp = perp;\n      maxpoint = edge; // find greatest deviation\n    }\n    vec.x += edge->vec.x; // accumulate vectors\n    vec.y += edge->vec.y;\n    edge = edge->next;\n  } while (edge != last); // test all line\n\n  perp = vecsum.length2();\n  ASSERT_HOST(perp != 0);\n\n  if (maxperp < 256 * INT16_MAX) {\n    maxperp <<= 8;\n    
maxperp /= perp; // true max perp\n  } else {\n    maxperp /= perp;\n    maxperp <<= 8; // avoid overflow\n  }\n  if (squaresum < 256 * INT16_MAX) {\n    // mean squared perp\n    perp = (squaresum << 8) / (perp * ptcount);\n  } else {\n    // avoid overflow\n    perp = (squaresum / perp << 8) / ptcount;\n  }\n\n  if (poly_debug) {\n    tprintf(\"Cutline:A=%d, max=%.2f(%.2f%%), msd=%.2f(%.2f%%)\\n\", area,\n            maxperp / 256.0, maxperp * 200.0 / area, perp / 256.0,\n            perp * 300.0 / area);\n  }\n  if (maxperp * par1 >= 10 * area || perp * par2 >= 10 * area || vlen >= 126) {\n    maxpoint->fixed = true;\n    // partitions\n    cutline(first, maxpoint, area);\n    cutline(maxpoint, last, area);\n  }\n}\n\n/**********************************************************************\n * edgesteps_to_edgepts\n *\n * Convert a C_OUTLINE to EDGEPTs.\n **********************************************************************/\n\nstatic EDGEPT *edgesteps_to_edgepts( // convert outline\n    C_OUTLINE *c_outline,            // input\n    EDGEPT edgepts[]                 // output is array\n) {\n  int32_t length;    // steps in path\n  ICOORD pos;        // current coords\n  int32_t stepindex; // current step\n  int32_t stepinc;   // increment\n  int32_t epindex;   // current EDGEPT\n  ICOORD vec;        // for this 8 step\n  ICOORD prev_vec;\n  int8_t epdir;   // of this step\n  DIR128 prevdir; // previous dir\n  DIR128 dir;     // of this step\n\n  pos = c_outline->start_pos(); // start of loop\n  length = c_outline->pathlength();\n  stepindex = 0;\n  epindex = 0;\n  prevdir = -1;\n  // repeated steps\n  uint32_t count = 0;\n  int prev_stepindex = 0;\n  do {\n    dir = c_outline->step_dir(stepindex);\n    vec = c_outline->step(stepindex);\n    if (stepindex < length - 1 &&\n        c_outline->step_dir(stepindex + 1) - dir == -32) {\n      dir += 128 - 16;\n      vec += c_outline->step(stepindex + 1);\n      stepinc = 2;\n    } else {\n      stepinc = 1;\n    }\n    
if (count == 0) {\n      prevdir = dir;\n      prev_vec = vec;\n    }\n    if (prevdir.get_dir() != dir.get_dir()) {\n      edgepts[epindex].pos.x = pos.x();\n      edgepts[epindex].pos.y = pos.y();\n      prev_vec *= count;\n      edgepts[epindex].vec.x = prev_vec.x();\n      edgepts[epindex].vec.y = prev_vec.y();\n      pos += prev_vec;\n      edgepts[epindex].runlength = count;\n      edgepts[epindex].prev = &edgepts[epindex - 1];\n      // TODO: reset is_hidden, too?\n      edgepts[epindex].fixed = false;\n      edgepts[epindex].next = &edgepts[epindex + 1];\n      prevdir += 64;\n      epdir = DIR128(0) - prevdir;\n      epdir >>= 4;\n      epdir &= 7;\n      edgepts[epindex].dir = epdir;\n      edgepts[epindex].src_outline = c_outline;\n      edgepts[epindex].start_step = prev_stepindex;\n      edgepts[epindex].step_count = stepindex - prev_stepindex;\n      epindex++;\n      prevdir = dir;\n      prev_vec = vec;\n      count = 1;\n      prev_stepindex = stepindex;\n    } else {\n      count++;\n    }\n    stepindex += stepinc;\n  } while (stepindex < length);\n  edgepts[epindex].pos.x = pos.x();\n  edgepts[epindex].pos.y = pos.y();\n  prev_vec *= count;\n  edgepts[epindex].vec.x = prev_vec.x();\n  edgepts[epindex].vec.y = prev_vec.y();\n  pos += prev_vec;\n  edgepts[epindex].runlength = count;\n  // TODO: reset is_hidden, too?\n  edgepts[epindex].fixed = false;\n  edgepts[epindex].src_outline = c_outline;\n  edgepts[epindex].start_step = prev_stepindex;\n  edgepts[epindex].step_count = stepindex - prev_stepindex;\n  edgepts[epindex].prev = &edgepts[epindex - 1];\n  edgepts[epindex].next = &edgepts[0];\n  prevdir += 64;\n  epdir = DIR128(0) - prevdir;\n  epdir >>= 4;\n  epdir &= 7;\n  edgepts[epindex].dir = epdir;\n  edgepts[0].prev = &edgepts[epindex];\n  ASSERT_HOST(pos.x() == c_outline->start_pos().x() &&\n              pos.y() == c_outline->start_pos().y());\n  return 
&edgepts[0];\n}\n\n/**********************************************************************\n *fix2(start,area) fixes points on the outline according to a trial method*\n **********************************************************************/\n\nstatic void fix2(  // polygonal approx\n    EDGEPT *start, // loop to approximate\n    int area) {\n  EDGEPT *edgept; // current point\n  EDGEPT *edgept1;\n  EDGEPT *loopstart; // modified start of loop\n  EDGEPT *linestart; // start of line segment\n  int fixed_count;   // no of fixed points\n  int8_t dir;\n  int d01, d12, d23, gapmin;\n  TPOINT d01vec, d12vec, d23vec;\n  EDGEPT *edgefix, *startfix;\n  EDGEPT *edgefix0, *edgefix1, *edgefix2, *edgefix3;\n\n  edgept = start; // start of loop\n  while (((edgept->dir - edgept->prev->dir + 1) & 7) < 3 &&\n         (dir = (edgept->prev->dir - edgept->next->dir) & 7) != 2 && dir != 6) {\n    edgept = edgept->next; // find suitable start\n  }\n  loopstart = edgept; // remember start\n\n  // completed flag\n  bool stopped = false;\n  edgept->fixed = true; // fix it\n  do {\n    linestart = edgept;      // possible start of line\n    auto dir1 = edgept->dir; // first direction\n    // length of dir1\n    auto sum1 = edgept->runlength;\n    edgept = edgept->next;\n    auto dir2 = edgept->dir; // 2nd direction\n    // length in dir2\n    auto sum2 = edgept->runlength;\n    if (((dir1 - dir2 + 1) & 7) < 3) {\n      while (edgept->prev->dir == edgept->next->dir) {\n        edgept = edgept->next; // look at next\n        if (edgept->dir == dir1) {\n          // sum lengths\n          sum1 += edgept->runlength;\n        } else {\n          sum2 += edgept->runlength;\n        }\n      }\n\n      if (edgept == loopstart) {\n        // finished\n        stopped = true;\n      }\n      if (sum2 + sum1 > 2 && linestart->prev->dir == dir2 &&\n          (linestart->prev->runlength > linestart->runlength || sum2 > sum1)) {\n        // start is back one\n        linestart = linestart->prev;\n       
 linestart->fixed = true;\n      }\n\n      if (((edgept->next->dir - edgept->dir + 1) & 7) >= 3 ||\n          (edgept->dir == dir1 && sum1 >= sum2) ||\n          ((edgept->prev->runlength < edgept->runlength ||\n            (edgept->dir == dir2 && sum2 >= sum1)) &&\n           linestart->next != edgept)) {\n        edgept = edgept->next;\n      }\n    }\n    // sharp bend\n    edgept->fixed = true;\n  }\n  // do whole loop\n  while (edgept != loopstart && !stopped);\n\n  edgept = start;\n  do {\n    if (((edgept->runlength >= 8) && (edgept->dir != 2) &&\n         (edgept->dir != 6)) ||\n        ((edgept->runlength >= 8) &&\n         ((edgept->dir == 2) || (edgept->dir == 6)))) {\n      edgept->fixed = true;\n      edgept1 = edgept->next;\n      edgept1->fixed = true;\n    }\n    edgept = edgept->next;\n  } while (edgept != start);\n\n  edgept = start;\n  do {\n    // single fixed step\n    if (edgept->fixed &&\n        edgept->runlength == 1\n        // and neighbours free\n        && edgept->next->fixed &&\n        !edgept->prev->fixed\n        // same pair of dirs\n        && !edgept->next->next->fixed &&\n        edgept->prev->dir == edgept->next->dir &&\n        edgept->prev->prev->dir == edgept->next->next->dir &&\n        ((edgept->prev->dir - edgept->dir + 1) & 7) < 3) {\n      // unfix it\n      edgept->fixed = false;\n      edgept->next->fixed = false;\n    }\n    edgept = edgept->next;   // do all points\n  } while (edgept != start); // until finished\n\n  stopped = false;\n  if (area < 450) {\n    area = 450;\n  }\n\n  gapmin = area * fixed_dist * fixed_dist / 44000;\n\n  edgept = start;\n  fixed_count = 0;\n  do {\n    if (edgept->fixed) {\n      fixed_count++;\n    }\n    edgept = edgept->next;\n  } while (edgept != start);\n  while (!edgept->fixed) {\n    edgept = edgept->next;\n  }\n  edgefix0 = edgept;\n\n  edgept = edgept->next;\n  while (!edgept->fixed) {\n    edgept = edgept->next;\n  }\n  edgefix1 = edgept;\n\n  edgept = edgept->next;\n  while 
(!edgept->fixed) {\n    edgept = edgept->next;\n  }\n  edgefix2 = edgept;\n\n  edgept = edgept->next;\n  while (!edgept->fixed) {\n    edgept = edgept->next;\n  }\n  edgefix3 = edgept;\n\n  startfix = edgefix2;\n\n  do {\n    if (fixed_count <= 3) {\n      break; // already too few\n    }\n    d12vec.diff(edgefix1->pos, edgefix2->pos);\n    d12 = d12vec.length2();\n    // TODO(rays) investigate this change:\n    // Only unfix a point if it is part of a low-curvature section\n    // of outline and the total angle change of the outlines is\n    // less than 90 degrees, ie the scalar product is positive.\n    // if (d12 <= gapmin && edgefix0->vec.dot(edgefix2->vec) > 0) {\n    if (d12 <= gapmin) {\n      d01vec.diff(edgefix0->pos, edgefix1->pos);\n      d01 = d01vec.length2();\n      d23vec.diff(edgefix2->pos, edgefix3->pos);\n      d23 = d23vec.length2();\n      if (d01 > d23) {\n        edgefix2->fixed = false;\n        fixed_count--;\n      } else {\n        edgefix1->fixed = false;\n        fixed_count--;\n        edgefix1 = edgefix2;\n      }\n    } else {\n      edgefix0 = edgefix1;\n      edgefix1 = edgefix2;\n    }\n    edgefix2 = edgefix3;\n    edgept = edgept->next;\n    while (!edgept->fixed) {\n      if (edgept == startfix) {\n        stopped = true;\n      }\n      edgept = edgept->next;\n    }\n    edgefix3 = edgept;\n    edgefix = edgefix2;\n  } while ((edgefix != startfix) && (!stopped));\n}\n\n/**********************************************************************\n *poly2(startpt,area,path) applies a second approximation to the outline\n *using the points which have been fixed by the first approximation*\n **********************************************************************/\n\nstatic EDGEPT *poly2( // second poly\n    EDGEPT *startpt,  // start of loop\n    int area          // area of blob box\n) {\n  EDGEPT *edgept;    // current outline point\n  EDGEPT *loopstart; // starting point\n  EDGEPT *linestart; // start of line\n  int edgesum;       // 
correction count\n\n  if (area < 1200) {\n    area = 1200; // minimum value\n  }\n\n  loopstart = nullptr; // not found it yet\n  edgept = startpt;    // start of loop\n\n  do {\n    // current point fixed and next not\n    if (edgept->fixed && !edgept->next->fixed) {\n      loopstart = edgept; // start of repoly\n      break;\n    }\n    edgept = edgept->next;     // next point\n  } while (edgept != startpt); // until found or finished\n\n  if (loopstart == nullptr && !startpt->fixed) {\n    // fixed start of loop\n    startpt->fixed = true;\n    loopstart = startpt; // or start of loop\n  }\n  if (loopstart) {\n    do {\n      edgept = loopstart; // first to do\n      do {\n        linestart = edgept;\n        edgesum = 0; // sum of lengths\n        do {\n          // sum lengths\n          edgesum += edgept->runlength;\n          edgept = edgept->next; // move on\n        } while (!edgept->fixed && edgept != loopstart && edgesum < 126);\n        if (poly_debug) {\n          tprintf(\"Poly2:starting at (%d,%d)+%d=(%d,%d),%d to (%d,%d)\\n\",\n                  linestart->pos.x, linestart->pos.y, linestart->dir,\n                  linestart->vec.x, linestart->vec.y, edgesum, edgept->pos.x,\n                  edgept->pos.y);\n        }\n        // reapproximate\n        cutline(linestart, edgept, area);\n\n        while (edgept->next->fixed && edgept != loopstart) {\n          edgept = edgept->next; // look for next non-fixed\n        }\n      }\n      // do all the loop\n      while (edgept != loopstart);\n      edgesum = 0;\n      do {\n        if (edgept->fixed) {\n          edgesum++;\n        }\n        edgept = edgept->next;\n      }\n      // count fixed pts\n      while (edgept != loopstart);\n      if (edgesum < 3) {\n        area /= 2; // must have 3 pts\n      }\n    } while (edgesum < 3);\n    do {\n      linestart = edgept;\n      do {\n        edgept = edgept->next;\n      } while (!edgept->fixed);\n      linestart->next = edgept;\n      edgept->prev = 
linestart;\n      linestart->vec.x = edgept->pos.x - linestart->pos.x;\n      linestart->vec.y = edgept->pos.y - linestart->pos.y;\n    } while (edgept != loopstart);\n  } else {\n    edgept = startpt; // start of loop\n  }\n\n  loopstart = edgept; // new start\n  return loopstart;   // correct exit\n}\n\n/**********************************************************************\n * tesspoly_outline\n *\n * Approximate an outline from chain codes form using the old tess algorithm.\n * If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB\n * contain pointers to the input C_OUTLINEs that enable higher-resolution\n * feature extraction that does not use the polygonal approximation.\n **********************************************************************/\n\nTESSLINE *ApproximateOutline(bool allow_detailed_fx, C_OUTLINE *c_outline) {\n  EDGEPT stack_edgepts[FASTEDGELENGTH]; // converted path\n  EDGEPT *edgepts = stack_edgepts;\n\n  // Use heap memory if the stack buffer is not big enough.\n  if (c_outline->pathlength() > FASTEDGELENGTH) {\n    edgepts = new EDGEPT[c_outline->pathlength()];\n  }\n\n  // bounding box\n  const auto &loop_box = c_outline->bounding_box();\n  int32_t area = loop_box.height();\n  if (!poly_wide_objects_better && loop_box.width() > area) {\n    area = loop_box.width();\n  }\n  area *= area;\n  edgesteps_to_edgepts(c_outline, edgepts);\n  fix2(edgepts, area);\n  EDGEPT *edgept = poly2(edgepts, area); // 2nd approximation.\n  EDGEPT *startpt = edgept;\n  EDGEPT *result = nullptr;\n  EDGEPT *prev_result = nullptr;\n  do {\n    auto *new_pt = new EDGEPT;\n    new_pt->pos = edgept->pos;\n    new_pt->prev = prev_result;\n    if (prev_result == nullptr) {\n      result = new_pt;\n    } else {\n      prev_result->next = new_pt;\n      new_pt->prev = prev_result;\n    }\n    if (allow_detailed_fx) {\n      new_pt->src_outline = edgept->src_outline;\n      new_pt->start_step = edgept->start_step;\n      new_pt->step_count = edgept->step_count;\n 
   }\n    prev_result = new_pt;\n    edgept = edgept->next;\n  } while (edgept != startpt);\n  prev_result->next = result;\n  result->prev = prev_result;\n  if (edgepts != stack_edgepts) {\n    delete[] edgepts;\n  }\n  return TESSLINE::BuildFromOutlineList(result);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/polyaprx.h",
    "content": "/**********************************************************************\n * File:        polyaprx.h\n * Description: Code for polygonal approximation from old edgeprog.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef POLYAPRX_H\n#define POLYAPRX_H\n\nnamespace tesseract {\n\nclass C_OUTLINE;\nstruct TESSLINE;\n\n// convert a chain-coded input to the old OUTLINE approximation\nTESSLINE *ApproximateOutline(bool allow_detailed_fx, C_OUTLINE *c_outline);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/polyblk.cpp",
    "content": "/**********************************************************************\n * File:        polyblk.cpp  (Formerly poly_block.c)\n * Description: Polygonal blocks\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"polyblk.h\"\n\n#include \"elst.h\"\n\n#include <cctype>\n#include <cinttypes> // PRId32\n#include <cmath>\n#include <cstdio>\n#include <memory> // std::unique_ptr\n\nnamespace tesseract {\n\n#define INTERSECTING INT16_MAX\n\nPOLY_BLOCK::POLY_BLOCK(ICOORDELT_LIST *points, PolyBlockType t) {\n  ICOORDELT_IT v = &vertices;\n\n  vertices.clear();\n  v.move_to_first();\n  v.add_list_before(points);\n  compute_bb();\n  type = t;\n}\n\n// Initialize from box coordinates.\nPOLY_BLOCK::POLY_BLOCK(const TBOX &tbox, PolyBlockType t) {\n  vertices.clear();\n  ICOORDELT_IT v = &vertices;\n  v.move_to_first();\n  v.add_to_end(new ICOORDELT(tbox.left(), tbox.top()));\n  v.add_to_end(new ICOORDELT(tbox.left(), tbox.bottom()));\n  v.add_to_end(new ICOORDELT(tbox.right(), tbox.bottom()));\n  v.add_to_end(new ICOORDELT(tbox.right(), tbox.top()));\n  compute_bb();\n  type = t;\n}\n\n/**\n * @name POLY_BLOCK::compute_bb\n *\n * Compute the bounding box from the outline points.\n 
*/\n\nvoid POLY_BLOCK::compute_bb() { // constructor\n  ICOORD ibl, itr;              // integer bb\n  ICOORD botleft;               // bounding box\n  ICOORD topright;\n  ICOORD pos;                   // current pos;\n  ICOORDELT_IT pts = &vertices; // iterator\n\n  botleft = *pts.data();\n  topright = botleft;\n  do {\n    pos = *pts.data();\n    if (pos.x() < botleft.x()) {\n      // get bounding box\n      botleft = ICOORD(pos.x(), botleft.y());\n    }\n    if (pos.y() < botleft.y()) {\n      botleft = ICOORD(botleft.x(), pos.y());\n    }\n    if (pos.x() > topright.x()) {\n      topright = ICOORD(pos.x(), topright.y());\n    }\n    if (pos.y() > topright.y()) {\n      topright = ICOORD(topright.x(), pos.y());\n    }\n    pts.forward();\n  } while (!pts.at_first());\n  ibl = ICOORD(botleft.x(), botleft.y());\n  itr = ICOORD(topright.x(), topright.y());\n  box = TBOX(ibl, itr);\n}\n\n/**\n * @name POLY_BLOCK::winding_number\n *\n * Return the winding number of the outline around the given point.\n * @param point point to wind around\n */\n\nint16_t POLY_BLOCK::winding_number(const ICOORD &point) {\n  int16_t count;               // winding count\n  ICOORD pt;                   // current point\n  ICOORD vec;                  // point to current point\n  ICOORD vvec;                 // current point to next point\n  int32_t cross;               // cross product\n  ICOORDELT_IT it = &vertices; // iterator\n\n  count = 0;\n  do {\n    pt = *it.data();\n    vec = pt - point;\n    vvec = *it.data_relative(1) - pt;\n    // crossing the line\n    if (vec.y() <= 0 && vec.y() + vvec.y() > 0) {\n      cross = vec * vvec; // cross product\n      if (cross > 0) {\n        count++; // crossing right half\n      } else if (cross == 0) {\n        return INTERSECTING; // going through point\n      }\n    } else if (vec.y() > 0 && vec.y() + vvec.y() <= 0) {\n      cross = vec * vvec;\n      if (cross < 0) {\n        count--; // crossing back\n      } else if (cross == 0) {\n     
   return INTERSECTING; // illegal\n      }\n    } else if (vec.y() == 0 && vec.x() == 0) {\n      return INTERSECTING;\n    }\n    it.forward();\n  } while (!it.at_first());\n  return count; // winding number\n}\n\n/// @return true if other is inside this.\nbool POLY_BLOCK::contains(POLY_BLOCK *other) {\n  int16_t count;               // winding count\n  ICOORDELT_IT it = &vertices; // iterator\n  ICOORD vertex;\n\n  if (!box.overlap(*(other->bounding_box()))) {\n    return false; // can't be contained\n  }\n\n  /* check that no vertex of this is inside other */\n\n  do {\n    vertex = *it.data();\n    // get winding number\n    count = other->winding_number(vertex);\n    if (count != INTERSECTING) {\n      if (count != 0) {\n        return false;\n      }\n    }\n    it.forward();\n  } while (!it.at_first());\n\n  /* check that all vertices of other are inside this */\n\n  // switch lists\n  it.set_to_list(other->points());\n  do {\n    vertex = *it.data();\n    // try other way round\n    count = winding_number(vertex);\n    if (count != INTERSECTING) {\n      if (count == 0) {\n        return false;\n      }\n    }\n    it.forward();\n  } while (!it.at_first());\n  return true;\n}\n\n/**\n * @name POLY_BLOCK::rotate\n *\n * Rotate the POLY_BLOCK.\n * @param rotation cos, sin of angle\n */\n\nvoid POLY_BLOCK::rotate(FCOORD rotation) {\n  FCOORD pos;                   // current pos;\n  ICOORDELT *pt;                // current point\n  ICOORDELT_IT pts = &vertices; // iterator\n\n  do {\n    pt = pts.data();\n    pos.set_x(pt->x());\n    pos.set_y(pt->y());\n    pos.rotate(rotation);\n    pt->set_x(static_cast<TDimension>(floor(pos.x() + 0.5)));\n    pt->set_y(static_cast<TDimension>(floor(pos.y() + 0.5)));\n    pts.forward();\n  } while (!pts.at_first());\n  compute_bb();\n}\n\n/**\n * @name POLY_BLOCK::reflect_in_y_axis\n *\n * Reflect the coords of the polygon in the y-axis. 
(Flip the sign of x.)\n */\n\nvoid POLY_BLOCK::reflect_in_y_axis() {\n  ICOORDELT *pt;                // current point\n  ICOORDELT_IT pts = &vertices; // Iterator.\n\n  do {\n    pt = pts.data();\n    pt->set_x(-pt->x());\n    pts.forward();\n  } while (!pts.at_first());\n  compute_bb();\n}\n\n/**\n * POLY_BLOCK::move\n *\n * Move the POLY_BLOCK.\n * @param shift x,y translation vector\n */\n\nvoid POLY_BLOCK::move(ICOORD shift) {\n  ICOORDELT *pt;                // current point\n  ICOORDELT_IT pts = &vertices; // iterator\n\n  do {\n    pt = pts.data();\n    *pt += shift;\n    pts.forward();\n  } while (!pts.at_first());\n  compute_bb();\n}\n\n#ifndef GRAPHICS_DISABLED\nvoid POLY_BLOCK::plot(ScrollView *window, int32_t num) {\n  ICOORDELT_IT v = &vertices;\n\n  window->Pen(ColorForPolyBlockType(type));\n\n  v.move_to_first();\n\n  if (num > 0) {\n    window->TextAttributes(\"Times\", 80, false, false, false);\n    char temp_buff[34];\n#  if !defined(_WIN32) || defined(__MINGW32__)\n    snprintf(temp_buff, sizeof(temp_buff), \"%\" PRId32, num);\n#  else\n    _ltoa(num, temp_buff, 10);\n#  endif\n    window->Text(v.data()->x(), v.data()->y(), temp_buff);\n  }\n\n  window->SetCursor(v.data()->x(), v.data()->y());\n  for (v.mark_cycle_pt(); !v.cycled_list(); v.forward()) {\n    window->DrawTo(v.data()->x(), v.data()->y());\n  }\n  v.move_to_first();\n  window->DrawTo(v.data()->x(), v.data()->y());\n}\n\nvoid POLY_BLOCK::fill(ScrollView *window, ScrollView::Color colour) {\n  ICOORDELT_IT s_it;\n\n  std::unique_ptr<PB_LINE_IT> lines(new PB_LINE_IT(this));\n  window->Pen(colour);\n\n  for (auto y = this->bounding_box()->bottom(); y <= this->bounding_box()->top(); y++) {\n    const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments(lines->get_line(y));\n    if (!segments->empty()) {\n      s_it.set_to_list(segments.get());\n      for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) {\n        // Note different use of ICOORDELT, x coord is x coord of 
pixel\n        // at the start of line segment, y coord is length of line segment\n        // Last pixel is start pixel + length.\n        auto width = s_it.data()->y();\n        window->SetCursor(s_it.data()->x(), y);\n        window->DrawTo(s_it.data()->x() + static_cast<float>(width), y);\n      }\n    }\n  }\n}\n#endif\n\n/// @return true if the polygons of other and this overlap.\nbool POLY_BLOCK::overlap(POLY_BLOCK *other) {\n  int16_t count;               // winding count\n  ICOORDELT_IT it = &vertices; // iterator\n  ICOORD vertex;\n\n  if (!box.overlap(*(other->bounding_box()))) {\n    return false; // can't be any overlap.\n  }\n\n  /* see if a vertex of this is inside other */\n\n  do {\n    vertex = *it.data();\n    // get winding number\n    count = other->winding_number(vertex);\n    if (count != INTERSECTING) {\n      if (count != 0) {\n        return true;\n      }\n    }\n    it.forward();\n  } while (!it.at_first());\n\n  /* see if a vertex of other is inside this */\n\n  // switch lists\n  it.set_to_list(other->points());\n  do {\n    vertex = *it.data();\n    // try other way round\n    count = winding_number(vertex);\n    if (count != INTERSECTING) {\n      if (count != 0) {\n        return true;\n      }\n    }\n    it.forward();\n  } while (!it.at_first());\n  return false;\n}\n\nICOORDELT_LIST *PB_LINE_IT::get_line(TDimension y) {\n  ICOORDELT_IT v, r;\n  ICOORDELT_LIST *result;\n  ICOORDELT *x, *current, *previous;\n  float fy = y + 0.5f;\n  result = new ICOORDELT_LIST();\n  r.set_to_list(result);\n  v.set_to_list(block->points());\n\n  for (v.mark_cycle_pt(); !v.cycled_list(); v.forward()) {\n    if (((v.data_relative(-1)->y() > y) && (v.data()->y() <= y)) ||\n        ((v.data_relative(-1)->y() <= y) && (v.data()->y() > y))) {\n      previous = v.data_relative(-1);\n      current = v.data();\n      float fx =\n          0.5f + previous->x() +\n          (current->x() - previous->x()) * (fy - previous->y()) / (current->y() - 
previous->y());\n      x = new ICOORDELT(static_cast<TDimension>(fx), 0);\n      r.add_to_end(x);\n    }\n  }\n\n  if (!r.empty()) {\n    r.sort([](const ICOORDELT *p1, const ICOORDELT *p2) {\n      if (p1->x() < p2->x()) {\n        return (-1);\n      } else if (p1->x() > p2->x()) {\n        return (1);\n      } else {\n        return (0);\n      }\n      });\n    for (r.mark_cycle_pt(); !r.cycled_list(); r.forward()) {\n      x = r.data();\n    }\n    for (r.mark_cycle_pt(); !r.cycled_list(); r.forward()) {\n      r.data()->set_y(r.data_relative(1)->x() - r.data()->x());\n      r.forward();\n      delete (r.extract());\n    }\n  }\n\n  return result;\n}\n\n#ifndef GRAPHICS_DISABLED\n/// Returns a color to draw the given type.\nScrollView::Color POLY_BLOCK::ColorForPolyBlockType(PolyBlockType type) {\n  // Keep kPBColors in sync with PolyBlockType.\n  const ScrollView::Color kPBColors[PT_COUNT] = {\n      ScrollView::WHITE,       // Type is not yet known. Keep as the 1st element.\n      ScrollView::BLUE,        // Text that lives inside a column.\n      ScrollView::CYAN,        // Text that spans more than one column.\n      ScrollView::MEDIUM_BLUE, // Text that is in a cross-column pull-out\n                               // region.\n      ScrollView::AQUAMARINE,  // Partition belonging to an equation region.\n      ScrollView::SKY_BLUE,    // Partition belonging to an inline equation\n                               // region.\n      ScrollView::MAGENTA,     // Partition belonging to a table region.\n      ScrollView::GREEN,       // Text-line runs vertically.\n      ScrollView::LIGHT_BLUE,  // Text that belongs to an image.\n      ScrollView::RED,         // Image that lives inside a column.\n      ScrollView::YELLOW,      // Image that spans more than one column.\n      ScrollView::ORANGE,      // Image in a cross-column pull-out region.\n      ScrollView::BROWN,       // Horizontal Line.\n      ScrollView::DARK_GREEN,  // Vertical Line.\n      ScrollView::GREY 
        // Lies outside of any column.\n  };\n  if (type < PT_COUNT) {\n    return kPBColors[type];\n  }\n  return ScrollView::WHITE;\n}\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/polyblk.h",
    "content": "/**********************************************************************\n * File:        polyblk.h  (Formerly poly_block.h)\n * Description: Polygonal blocks\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef POLYBLK_H\n#define POLYBLK_H\n\n#include \"elst.h\"\n#include \"points.h\"\n#include \"rect.h\"\n#include \"scrollview.h\"\n\n#include <tesseract/publictypes.h>\n\nnamespace tesseract {\n\nclass TESS_API POLY_BLOCK {\npublic:\n  POLY_BLOCK() = default;\n  // Initialize from box coordinates.\n  POLY_BLOCK(const TBOX &tbox, PolyBlockType type);\n  POLY_BLOCK(ICOORDELT_LIST *points, PolyBlockType type);\n  ~POLY_BLOCK() = default;\n\n  TBOX *bounding_box() { // access function\n    return &box;\n  }\n\n  ICOORDELT_LIST *points() { // access function\n    return &vertices;\n  }\n\n  void compute_bb();\n\n  PolyBlockType isA() const {\n    return type;\n  }\n\n  bool IsText() const {\n    return PTIsTextType(type);\n  }\n\n  // Rotate about the origin by the given rotation. (Analogous to\n  // multiplying by a complex number.\n  void rotate(FCOORD rotation);\n  // Reflect the coords of the polygon in the y-axis. 
(Flip the sign of x.)\n  void reflect_in_y_axis();\n  // Move by adding shift to all coordinates.\n  void move(ICOORD shift);\n\n#ifndef GRAPHICS_DISABLED\n\n  void plot(ScrollView *window, int32_t num);\n\n  void fill(ScrollView *window, ScrollView::Color colour);\n#endif // !GRAPHICS_DISABLED\n\n  // Returns true if other is inside this.\n  bool contains(POLY_BLOCK *other);\n\n  // Returns true if the polygons of other and this overlap.\n  bool overlap(POLY_BLOCK *other);\n\n  // Returns the winding number of this around the test_pt.\n  // Positive for anticlockwise, negative for clockwise, and zero for\n  // test_pt outside this.\n  int16_t winding_number(const ICOORD &test_pt);\n\n#ifndef GRAPHICS_DISABLED\n  // Static utility functions to handle the PolyBlockType.\n  // Returns a color to draw the given type.\n  static ScrollView::Color ColorForPolyBlockType(PolyBlockType type);\n#endif // !GRAPHICS_DISABLED\n\nprivate:\n  ICOORDELT_LIST vertices; // vertices\n  TBOX box;                // bounding box\n  PolyBlockType type;      // Type of this region.\n};\n\n// Class to iterate the scanlines of a polygon.\nclass PB_LINE_IT {\npublic:\n  PB_LINE_IT(POLY_BLOCK *blkptr) {\n    block = blkptr;\n  }\n\n  void set_to_block(POLY_BLOCK *blkptr) {\n    block = blkptr;\n  }\n\n  // Returns a list of runs of pixels for the given y coord.\n  // Each element of the returned list is the start (x) and extent(y) of\n  // a run inside the region.\n  // Delete the returned list after use.\n  ICOORDELT_LIST *get_line(TDimension y);\n\nprivate:\n  POLY_BLOCK *block;\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/quadlsq.cpp",
    "content": "/**********************************************************************\n * File:        quadlsq.cpp  (Formerly qlsq.c)\n * Description: Code for least squares approximation of quadratics.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"quadlsq.h\"\n\n#include \"tprintf.h\"\n\n#include <cmath>\n#include <cstdio>\n\nnamespace tesseract {\n\n// Minimum variance in least squares before backing off to a lower degree.\nconst long double kMinVariance = 1.0L / 1024;\n\n/**********************************************************************\n * QLSQ::clear\n *\n * Function to initialize a QLSQ.\n **********************************************************************/\n\nvoid QLSQ::clear() { // initialize\n  a = 0.0;\n  b = 0.0;\n  c = 0.0;\n  n = 0;      // No elements.\n  sigx = 0.0; // Zero accumulators.\n  sigy = 0.0;\n  sigxx = 0.0;\n  sigxy = 0.0;\n  sigyy = 0.0;\n  sigxxx = 0.0;\n  sigxxy = 0.0;\n  sigxxxx = 0.0;\n}\n\n/**********************************************************************\n * QLSQ::add\n *\n * Add an element to the accumulator.\n **********************************************************************/\n\nvoid QLSQ::add(double x, double y) {\n  n++;       // Count elements.\n  sigx += x; // Update accumulators.\n  sigy += y;\n  sigxx += x * x;\n  sigxy += x * 
y;\n  sigyy += y * y;\n  sigxxx += static_cast<long double>(x) * x * x;\n  sigxxy += static_cast<long double>(x) * x * y;\n  sigxxxx += static_cast<long double>(x) * x * x * x;\n}\n\n/**********************************************************************\n * QLSQ::remove\n *\n * Delete an element from the accumulator.\n **********************************************************************/\n\nvoid QLSQ::remove(double x, double y) {\n  if (n <= 0) {\n    tprintf(\"Can't remove an element from an empty QLSQ accumulator!\\n\");\n    return;\n  }\n  n--;       // Count elements.\n  sigx -= x; // Update accumulators.\n  sigy -= y;\n  sigxx -= x * x;\n  sigxy -= x * y;\n  sigyy -= y * y;\n  sigxxx -= static_cast<long double>(x) * x * x;\n  sigxxy -= static_cast<long double>(x) * x * y;\n  sigxxxx -= static_cast<long double>(x) * x * x * x;\n}\n\n/**********************************************************************\n * QLSQ::fit\n *\n * Fit the given degree of polynomial and store the result.\n * This creates a quadratic of the form axx + bx + c, but limited to\n * the given degree.\n **********************************************************************/\n\nvoid QLSQ::fit(int degree) {\n  long double x_variance =\n      static_cast<long double>(sigxx) * n - static_cast<long double>(sigx) * sigx;\n\n  // Note: for computational efficiency, we do not normalize the variance,\n  // covariance and cube variance here as they are in the same order in both\n  // numerators and denominators. 
However, we need to be careful in value range\n  // check.\n  if (x_variance < kMinVariance * n * n || degree < 1 || n < 2) {\n    // We cannot calculate b reliably so forget a and b, and just work on c.\n    a = b = 0.0;\n    if (n >= 1 && degree >= 0) {\n      c = sigy / n;\n    } else {\n      c = 0.0;\n    }\n    return;\n  }\n  long double top96 = 0.0;    // Accurate top.\n  long double bottom96 = 0.0; // Accurate bottom.\n  long double cubevar = sigxxx * n - static_cast<long double>(sigxx) * sigx;\n  long double covariance =\n      static_cast<long double>(sigxy) * n - static_cast<long double>(sigx) * sigy;\n\n  if (n >= 4 && degree >= 2) {\n    top96 = cubevar * covariance;\n    top96 += x_variance * (static_cast<long double>(sigxx) * sigy - sigxxy * n);\n\n    bottom96 = cubevar * cubevar;\n    bottom96 -= x_variance * (sigxxxx * n - static_cast<long double>(sigxx) * sigxx);\n  }\n  if (bottom96 >= kMinVariance * n * n * n * n) {\n    // Denominators looking good\n    a = top96 / bottom96;\n    top96 = covariance - cubevar * a;\n    b = top96 / x_variance;\n  } else {\n    // Forget a, and concentrate on b.\n    a = 0.0;\n    b = covariance / x_variance;\n  }\n  c = (sigy - a * sigxx - b * sigx) / n;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/quadlsq.h",
    "content": "/**********************************************************************\n * File:        quadlsq.h  (Formerly qlsq.h)\n * Description: Code for least squares approximation of quadratics.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef QUADLSQ_H\n#define QUADLSQ_H\n\n#include \"points.h\"\n\nnamespace tesseract {\n\nclass QLSQ {\npublic:\n  QLSQ() {   // constructor\n    clear(); // set to zeros\n  }\n  void clear(); // initialize\n\n  void add(     // add element\n      double x, // coords to add\n      double y);\n  void remove(  // delete element\n      double x, // coords to delete\n      double y);\n  int32_t count() { // no of elements\n    return n;\n  }\n\n  void fit(        // fit the given\n      int degree); // return actual\n  double get_a() const { // get x squard\n    return a;\n  }\n  double get_b() const { // get x squard\n    return b;\n  }\n  double get_c() const { // get x squard\n    return c;\n  }\n\nprivate:\n  int32_t n;           // no of elements\n  double a, b, c;      // result\n  double sigx;         // sum of x\n  double sigy;         // sum of y\n  double sigxx;        // sum x squared\n  double sigxy;        // sum of xy\n  double sigyy;        // sum y squared\n  long double sigxxx;  // sum x cubed\n  long double sigxxy;  // sum xsquared y\n  long 
double sigxxxx; // sum x fourth\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/quadratc.h",
    "content": "/**********************************************************************\n * File:        quadratc.h  (Formerly quadrtic.h)\n * Description: Code for the QUAD_COEFFS class.\n * Author:      Ray Smith\n * Created:     Tue Oct 08 17:24:40 BST 1991\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef QUADRATC_H\n#define QUADRATC_H\n\n#include \"points.h\"\n\nnamespace tesseract {\n\nclass QUAD_COEFFS {\npublic:\n  QUAD_COEFFS() = default;\n  QUAD_COEFFS(    // constructor\n      double xsq, // coefficients\n      float x, float constant) {\n    a = xsq;\n    b = x;\n    c = constant;\n  }\n\n  float y(             // evaluate\n      float x) const { // at x\n    return static_cast<float>((a * x + b) * x + c);\n  }\n\n  void move(        // reposition word\n      ICOORD vec) { // by vector\n    /************************************************************\n  y - q = a (x - p)^2 + b (x - p) + c\n  y - q = ax^2 - 2apx + ap^2 + bx - bp + c\n    y = ax^2 + (b - 2ap)x + (c - bp + ap^2 + q)\n************************************************************/\n    int16_t p = vec.x();\n    int16_t q = vec.y();\n\n    c = static_cast<float>(c - b * p + a * p * p + q);\n    b = static_cast<float>(b - 2 * a * p);\n  }\n\n  double a; // x squared\n  float b;  // x\n  float c;  // constant\nprivate:\n};\n\n} // namespace 
tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/quspline.cpp",
    "content": "/**********************************************************************\n * File:        quspline.cpp  (Formerly qspline.c)\n * Description: Code for the QSPLINE class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"quspline.h\"\n\n#include \"points.h\"   // for ICOORD\n#include \"quadlsq.h\"  // for QLSQ\n#include \"quadratc.h\" // for QUAD_COEFFS\n\n#include <allheaders.h> // for pixRenderPolyline, pixGetDepth, pixGetHeight\n#include \"pix.h\"        // for L_CLEAR_PIXELS, L_SET_PIXELS, Pix (ptr only)\n\nnamespace tesseract {\n\n#define QSPLINE_PRECISION 16 // no of steps to draw\n\n/**********************************************************************\n * QSPLINE::QSPLINE\n *\n * Constructor to build a QSPLINE given the components used in the old code.\n **********************************************************************/\n\nQSPLINE::QSPLINE(     // constructor\n    int32_t count,    // no of segments\n    int32_t *xstarts, // start coords\n    double *coeffs    // coefficients\n) {\n  int32_t index; // segment index\n\n  // get memory\n  xcoords = new int32_t[count + 1];\n  quadratics = new QUAD_COEFFS[count];\n  segments = 
count;\n  for (index = 0; index < segments; index++) {\n    // copy them\n    xcoords[index] = xstarts[index];\n    quadratics[index] =\n        QUAD_COEFFS(coeffs[index * 3], coeffs[index * 3 + 1], coeffs[index * 3 + 2]);\n  }\n  // right edge\n  xcoords[index] = xstarts[index];\n}\n\n/**********************************************************************\n * QSPLINE::QSPLINE\n *\n * Constructor to build a QSPLINE by approximation of points.\n **********************************************************************/\n\nQSPLINE::QSPLINE(               // constructor\n    int xstarts[],              // spline boundaries\n    int segcount,               // no of segments\n    int xpts[],                 // points to fit\n    int ypts[], int pointcount, // no of pts\n    int degree                  // fit required\n) {\n  int pointindex;    /*no along text line */\n  int segment;       /*segment no */\n  int32_t *ptcounts; // no in each segment\n  QLSQ qlsq;         /*accumulator */\n\n  segments = segcount;\n  xcoords = new int32_t[segcount + 1];\n  ptcounts = new int32_t[segcount + 1];\n  quadratics = new QUAD_COEFFS[segcount];\n  memmove(xcoords, xstarts, (segcount + 1) * sizeof(int32_t));\n  ptcounts[0] = 0; /*none in any yet */\n  for (segment = 0, pointindex = 0; pointindex < pointcount; pointindex++) {\n    while (segment < segcount && xpts[pointindex] >= xstarts[segment]) {\n      segment++; /*try next segment */\n                 /*cumulative counts */\n      ptcounts[segment] = ptcounts[segment - 1];\n    }\n    ptcounts[segment]++; /*no in previous partition */\n  }\n  while (segment < segcount) {\n    segment++;\n    /*zero the rest */\n    ptcounts[segment] = ptcounts[segment - 1];\n  }\n\n  for (segment = 0; segment < segcount; segment++) {\n    qlsq.clear();\n    /*first blob */\n    pointindex = ptcounts[segment];\n    if (pointindex > 0 && xpts[pointindex] != xpts[pointindex - 1] &&\n        xpts[pointindex] != xstarts[segment]) {\n      
qlsq.add(xstarts[segment],\n               ypts[pointindex - 1] + (ypts[pointindex] - ypts[pointindex - 1]) *\n                                          (xstarts[segment] - xpts[pointindex - 1]) /\n                                          (xpts[pointindex] - xpts[pointindex - 1]));\n    }\n    for (; pointindex < ptcounts[segment + 1]; pointindex++) {\n      qlsq.add(xpts[pointindex], ypts[pointindex]);\n    }\n    if (pointindex > 0 && pointindex < pointcount && xpts[pointindex] != xstarts[segment + 1]) {\n      qlsq.add(xstarts[segment + 1],\n               ypts[pointindex - 1] + (ypts[pointindex] - ypts[pointindex - 1]) *\n                                          (xstarts[segment + 1] - xpts[pointindex - 1]) /\n                                          (xpts[pointindex] - xpts[pointindex - 1]));\n    }\n    qlsq.fit(degree);\n    quadratics[segment].a = qlsq.get_a();\n    quadratics[segment].b = qlsq.get_b();\n    quadratics[segment].c = qlsq.get_c();\n  }\n  delete[] ptcounts;\n}\n\n/**********************************************************************\n * QSPLINE::QSPLINE\n *\n * Constructor to build a QSPLINE from another.\n **********************************************************************/\n\nQSPLINE::QSPLINE( // constructor\n    const QSPLINE &src) {\n  segments = 0;\n  xcoords = nullptr;\n  quadratics = nullptr;\n  *this = src;\n}\n\n/**********************************************************************\n * QSPLINE::~QSPLINE\n *\n * Destroy a QSPLINE.\n **********************************************************************/\n\nQSPLINE::~QSPLINE() {\n  delete[] xcoords;\n  delete[] quadratics;\n}\n\n/**********************************************************************\n * QSPLINE::operator=\n *\n * Copy a QSPLINE\n **********************************************************************/\n\nQSPLINE &QSPLINE::operator=( // assignment\n    const QSPLINE &source) {\n  delete[] xcoords;\n  delete[] quadratics;\n\n  segments = source.segments;\n  
xcoords = new int32_t[segments + 1];\n  quadratics = new QUAD_COEFFS[segments];\n  memmove(xcoords, source.xcoords, (segments + 1) * sizeof(int32_t));\n  memmove(quadratics, source.quadratics, segments * sizeof(QUAD_COEFFS));\n  return *this;\n}\n\n/**********************************************************************\n * QSPLINE::step\n *\n * Return the total of the step functions between the given coords.\n **********************************************************************/\n\ndouble QSPLINE::step( // find step functions\n    double x1,        // between coords\n    double x2) {\n  int index1, index2; // indices of coords\n  double total;       /*total steps */\n\n  index1 = spline_index(x1);\n  index2 = spline_index(x2);\n  total = 0;\n  while (index1 < index2) {\n    total += static_cast<double>(quadratics[index1 + 1].y(static_cast<float>(xcoords[index1 + 1])));\n    total -= static_cast<double>(quadratics[index1].y(static_cast<float>(xcoords[index1 + 1])));\n    index1++; /*next segment */\n  }\n  return total; /*total steps */\n}\n\n/**********************************************************************\n * QSPLINE::y\n *\n * Return the y value at the given x value.\n **********************************************************************/\n\ndouble QSPLINE::y( // evaluate\n    double x       // coord to evaluate at\n    ) const {\n  int32_t index; // segment index\n\n  index = spline_index(x);\n  return quadratics[index].y(x); // in correct segment\n}\n\n/**********************************************************************\n * QSPLINE::spline_index\n *\n * Return the index to the largest xcoord not greater than x.\n **********************************************************************/\n\nint32_t QSPLINE::spline_index( // evaluate\n    double x                   // coord to evaluate at\n    ) const {\n  int32_t index;  // segment index\n  int32_t bottom; // bottom of range\n  int32_t top;    // top of range\n\n  bottom = 0;\n  top = segments;\n  while 
(top - bottom > 1) {\n    index = (top + bottom) / 2; // centre of range\n    if (x >= xcoords[index]) {\n      bottom = index; // new min\n    } else {\n      top = index; // new max\n    }\n  }\n  return bottom;\n}\n\n/**********************************************************************\n * QSPLINE::move\n *\n * Reposition spline by vector\n **********************************************************************/\n\nvoid QSPLINE::move( // reposition spline\n    ICOORD vec      // by vector\n) {\n  int32_t segment; // index of segment\n  int16_t x_shift = vec.x();\n\n  for (segment = 0; segment < segments; segment++) {\n    xcoords[segment] += x_shift;\n    quadratics[segment].move(vec);\n  }\n  xcoords[segment] += x_shift;\n}\n\n/**********************************************************************\n * QSPLINE::overlap\n *\n * Return true if spline2 overlaps this by no more than fraction less\n * than the bounds of this.\n **********************************************************************/\n\nbool QSPLINE::overlap( // test overlap\n    QSPLINE *spline2,  // 2 cannot be smaller\n    double fraction    // by more than this\n) {\n  int leftlimit = xcoords[1];             /*common left limit */\n  int rightlimit = xcoords[segments - 1]; /*common right limit */\n                                          /*or too non-overlap */\n  return !(spline2->segments < 3 ||\n           spline2->xcoords[1] > leftlimit + fraction * (rightlimit - leftlimit) ||\n           spline2->xcoords[spline2->segments - 1] <\n               rightlimit - fraction * (rightlimit - leftlimit));\n}\n\n/**********************************************************************\n * extrapolate_spline\n *\n * Extrapolates the spline linearly using the same gradient as the\n * quadratic has at either end.\n **********************************************************************/\n\nvoid QSPLINE::extrapolate( // linear extrapolation\n    double gradient,       // gradient to use\n    int xmin,          
    // new left edge\n    int xmax               // new right edge\n) {\n  int segment;        /*current segment of spline */\n  int dest_segment;   // dest index\n  int32_t *xstarts;   // new boundaries\n  QUAD_COEFFS *quads; // new ones\n  int increment;      // in size\n\n  increment = xmin < xcoords[0] ? 1 : 0;\n  if (xmax > xcoords[segments]) {\n    increment++;\n  }\n  if (increment == 0) {\n    return;\n  }\n  xstarts = new int32_t[segments + 1 + increment];\n  quads = new QUAD_COEFFS[segments + increment];\n  if (xmin < xcoords[0]) {\n    xstarts[0] = xmin;\n    quads[0].a = 0;\n    quads[0].b = gradient;\n    quads[0].c = y(xcoords[0]) - quads[0].b * xcoords[0];\n    dest_segment = 1;\n  } else {\n    dest_segment = 0;\n  }\n  for (segment = 0; segment < segments; segment++) {\n    xstarts[dest_segment] = xcoords[segment];\n    quads[dest_segment] = quadratics[segment];\n    dest_segment++;\n  }\n  xstarts[dest_segment] = xcoords[segment];\n  if (xmax > xcoords[segments]) {\n    quads[dest_segment].a = 0;\n    quads[dest_segment].b = gradient;\n    quads[dest_segment].c = y(xcoords[segments]) - quads[dest_segment].b * xcoords[segments];\n    dest_segment++;\n    xstarts[dest_segment] = xmax + 1;\n  }\n  segments = dest_segment;\n  delete[] xcoords;\n  delete[] quadratics;\n  xcoords = xstarts;\n  quadratics = quads;\n}\n\n/**********************************************************************\n * QSPLINE::plot\n *\n * Draw the QSPLINE in the given colour.\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid QSPLINE::plot(          // draw it\n    ScrollView *window,      // window to draw in\n    ScrollView::Color colour // colour to draw in\n    ) const {\n  int32_t segment;  // index of segment\n  int16_t step;     // index of poly piece\n  double increment; // x increment\n  double x;         // x coord\n\n  window->Pen(colour);\n  for (segment = 0; segment < segments; segment++) {\n    increment = 
static_cast<double>(xcoords[segment + 1] - xcoords[segment]) / QSPLINE_PRECISION;\n    x = xcoords[segment];\n    for (step = 0; step <= QSPLINE_PRECISION; step++) {\n      if (segment == 0 && step == 0) {\n        window->SetCursor(x, quadratics[segment].y(x));\n      } else {\n        window->DrawTo(x, quadratics[segment].y(x));\n      }\n      x += increment;\n    }\n  }\n}\n#endif\n\nvoid QSPLINE::plot(Image pix) const {\n  if (pix == nullptr) {\n    return;\n  }\n\n  int32_t segment;  // Index of segment\n  int16_t step;     // Index of poly piece\n  double increment; // x increment\n  double x;         // x coord\n  auto height = static_cast<double>(pixGetHeight(pix));\n  Pta *points = ptaCreate(QSPLINE_PRECISION * segments);\n  const int kLineWidth = 5;\n\n  for (segment = 0; segment < segments; segment++) {\n    increment = static_cast<double>((xcoords[segment + 1] - xcoords[segment])) / QSPLINE_PRECISION;\n    x = xcoords[segment];\n    for (step = 0; step <= QSPLINE_PRECISION; step++) {\n      double y = height - quadratics[segment].y(x);\n      ptaAddPt(points, x, y);\n      x += increment;\n    }\n  }\n\n  switch (pixGetDepth(pix)) {\n    case 1:\n      pixRenderPolyline(pix, points, kLineWidth, L_SET_PIXELS, 1);\n      break;\n    case 32:\n      pixRenderPolylineArb(pix, points, kLineWidth, 255, 0, 0, 1);\n      break;\n    default:\n      pixRenderPolyline(pix, points, kLineWidth, L_CLEAR_PIXELS, 1);\n      break;\n  }\n  ptaDestroy(&points);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/quspline.h",
    "content": "/**********************************************************************\n * File:        quspline.h  (Formerly qspline.h)\n * Description: Code for the QSPLINE class.\n * Author:      Ray Smith\n * Created:     Tue Oct 08 17:16:12 BST 1991\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef QUSPLINE_H\n#define QUSPLINE_H\n\n#include \"scrollview.h\" // for ScrollView, ScrollView::Color\n\n#include <cstdint> // for int32_t\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass ICOORD;\nclass QUAD_COEFFS;\nclass ROW;\nclass TBOX;\n\nclass TESS_API QSPLINE {\n  friend void make_first_baseline(TBOX *, int, int *, int *, QSPLINE *, QSPLINE *, float);\n  friend void make_holed_baseline(TBOX *, int, QSPLINE *, QSPLINE *, float);\n  friend void tweak_row_baseline(ROW *, double, double);\n\npublic:\n  QSPLINE() { // empty constructor\n    segments = 0;\n    xcoords = nullptr; // everything empty\n    quadratics = nullptr;\n  }\n  QSPLINE( // copy constructor\n      const QSPLINE &src);\n  QSPLINE(                          // constructor\n      int32_t count,                // number of segments\n      int32_t *xstarts,             // segment starts\n      double *coeffs);              // coefficients\n  ~QSPLINE();                       // destructor\n  QSPLINE(                          // least squares fit\n      int 
xstarts[],                // spline boundaries\n      int segcount,                 // no of segments\n      int xcoords[],                // points to fit\n      int ycoords[], int blobcount, // no of coords\n      int degree);                  // function\n\n  double step(   // step change\n      double x1, // between coords\n      double x2);\n  double y(            // evaluate\n      double x) const; // at x\n\n  void move(            // reposition spline\n      ICOORD vec);      // by vector\n  bool overlap(         // test overlap\n      QSPLINE *spline2, // 2 cannot be smaller\n      double fraction); // by more than this\n  void extrapolate(     // linear extrapolation\n      double gradient,  // gradient to use\n      int left,         // new left edge\n      int right);       // new right edge\n\n#ifndef GRAPHICS_DISABLED\n  void plot(                           // draw it\n      ScrollView *window,              // in window\n      ScrollView::Color colour) const; // in colour\n#endif\n\n  // Paint the baseline over pix. If pix has depth of 32, then the line will\n  // be painted in red. Otherwise it will be painted in black.\n  void plot(Image pix) const;\n\n  QSPLINE &operator=(const QSPLINE &source); // from this\n\nprivate:\n  int32_t spline_index(    // binary search\n      double x) const;     // for x\n  int32_t segments;        // no of segments\n  int32_t *xcoords;        // no of coords\n  QUAD_COEFFS *quadratics; // spline pieces\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/ratngs.cpp",
    "content": "/**********************************************************************\n * File: ratngs.cpp  (Formerly ratings.c)\n * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.\n * Author: Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"ratngs.h\"\n\n#include \"blobs.h\"\n#include \"matrix.h\"\n#include \"normalis.h\" // kBlnBaselineOffset.\n#include \"unicharset.h\"\n\n#include <algorithm>\n#include <cmath>\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\nconst float WERD_CHOICE::kBadRating = 100000.0;\n// Min offset in baseline-normalized coords to make a character a subscript.\nconst int kMinSubscriptOffset = 20;\n// Min offset in baseline-normalized coords to make a character a superscript.\nconst int kMinSuperscriptOffset = 20;\n// Max y of bottom of a drop-cap blob.\nconst int kMaxDropCapBottom = -128;\n// Max fraction of x-height to use as denominator in measuring x-height overlap.\nconst double kMaxOverlapDenominator = 0.125;\n// Min fraction of x-height range that should be in agreement for matching\n// x-heights.\nconst double kMinXHeightMatch = 0.5;\n// Max tolerance on baseline position as a fraction of x-height for matching\n// baselines.\nconst double kMaxBaselineDrift = 
0.0625;\n\nstatic const char kPermuterTypeNoPerm[] = \"None\";\nstatic const char kPermuterTypePuncPerm[] = \"Punctuation\";\nstatic const char kPermuterTypeTopPerm[] = \"Top Choice\";\nstatic const char kPermuterTypeLowerPerm[] = \"Top Lower Case\";\nstatic const char kPermuterTypeUpperPerm[] = \"Top Upper Case\";\nstatic const char kPermuterTypeNgramPerm[] = \"Ngram\";\nstatic const char kPermuterTypeNumberPerm[] = \"Number\";\nstatic const char kPermuterTypeUserPatPerm[] = \"User Pattern\";\nstatic const char kPermuterTypeSysDawgPerm[] = \"System Dictionary\";\nstatic const char kPermuterTypeDocDawgPerm[] = \"Document Dictionary\";\nstatic const char kPermuterTypeUserDawgPerm[] = \"User Dictionary\";\nstatic const char kPermuterTypeFreqDawgPerm[] = \"Frequent Words Dictionary\";\nstatic const char kPermuterTypeCompoundPerm[] = \"Compound\";\n\nstatic const char *const kPermuterTypeNames[] = {\n    kPermuterTypeNoPerm,       // 0\n    kPermuterTypePuncPerm,     // 1\n    kPermuterTypeTopPerm,      // 2\n    kPermuterTypeLowerPerm,    // 3\n    kPermuterTypeUpperPerm,    // 4\n    kPermuterTypeNgramPerm,    // 5\n    kPermuterTypeNumberPerm,   // 6\n    kPermuterTypeUserPatPerm,  // 7\n    kPermuterTypeSysDawgPerm,  // 8\n    kPermuterTypeDocDawgPerm,  // 9\n    kPermuterTypeUserDawgPerm, // 10\n    kPermuterTypeFreqDawgPerm, // 11\n    kPermuterTypeCompoundPerm  // 12\n};\n\n/**\n * BLOB_CHOICE::BLOB_CHOICE\n *\n * Constructor to build a BLOB_CHOICE from a char, rating and certainty.\n */\nBLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id\n                         float src_rating,          // rating\n                         float src_cert,            // certainty\n                         int src_script_id,         // script\n                         float min_xheight,         // min xheight allowed\n                         float max_xheight,         // max xheight by this char\n                         float yshift,              // yshift 
out of position\n                         BlobChoiceClassifier c) {  // adapted match or other\n  unichar_id_ = src_unichar_id;\n  rating_ = src_rating;\n  certainty_ = src_cert;\n  fontinfo_id_ = -1;\n  fontinfo_id2_ = -1;\n  script_id_ = src_script_id;\n  min_xheight_ = min_xheight;\n  max_xheight_ = max_xheight;\n  yshift_ = yshift;\n  classifier_ = c;\n}\n\n/**\n * BLOB_CHOICE::BLOB_CHOICE\n *\n * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE.\n */\nBLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST<BLOB_CHOICE>::LINK(other) {\n  unichar_id_ = other.unichar_id();\n  rating_ = other.rating();\n  certainty_ = other.certainty();\n  fontinfo_id_ = other.fontinfo_id();\n  fontinfo_id2_ = other.fontinfo_id2();\n  script_id_ = other.script_id();\n  matrix_cell_ = other.matrix_cell_;\n  min_xheight_ = other.min_xheight_;\n  max_xheight_ = other.max_xheight_;\n  yshift_ = other.yshift();\n  classifier_ = other.classifier_;\n#ifndef DISABLED_LEGACY_ENGINE\n  fonts_ = other.fonts_;\n#endif // ndef DISABLED_LEGACY_ENGINE\n}\n\n// Copy assignment operator.\nBLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) {\n  ELIST<BLOB_CHOICE>::LINK::operator=(other);\n  unichar_id_ = other.unichar_id();\n  rating_ = other.rating();\n  certainty_ = other.certainty();\n  fontinfo_id_ = other.fontinfo_id();\n  fontinfo_id2_ = other.fontinfo_id2();\n  script_id_ = other.script_id();\n  matrix_cell_ = other.matrix_cell_;\n  min_xheight_ = other.min_xheight_;\n  max_xheight_ = other.max_xheight_;\n  yshift_ = other.yshift();\n  classifier_ = other.classifier_;\n#ifndef DISABLED_LEGACY_ENGINE\n  fonts_ = other.fonts_;\n#endif // ndef DISABLED_LEGACY_ENGINE\n  return *this;\n}\n\n// Returns true if *this and other agree on the baseline and x-height\n// to within some tolerance based on a given estimate of the x-height.\nbool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const {\n  double baseline_diff = std::fabs(yshift() - 
other.yshift());\n  if (baseline_diff > kMaxBaselineDrift * x_height) {\n    if (debug) {\n      tprintf(\"Baseline diff %g for %d v %d\\n\", baseline_diff, unichar_id_, other.unichar_id_);\n    }\n    return false;\n  }\n  double this_range = max_xheight() - min_xheight();\n  double other_range = other.max_xheight() - other.min_xheight();\n  double denominator =\n      ClipToRange(std::min(this_range, other_range), 1.0, kMaxOverlapDenominator * x_height);\n  double overlap =\n      std::min(max_xheight(), other.max_xheight()) - std::max(min_xheight(), other.min_xheight());\n  overlap /= denominator;\n  if (debug) {\n    tprintf(\"PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\\n\", unichar_id_,\n            other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap);\n  }\n\n  return overlap >= kMinXHeightMatch;\n}\n\n// Helper to find the BLOB_CHOICE in the bc_list that matches the given\n// unichar_id, or nullptr if there is no match.\nBLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list) {\n  // Find the corresponding best BLOB_CHOICE.\n  BLOB_CHOICE_IT choice_it(bc_list);\n  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {\n    BLOB_CHOICE *choice = choice_it.data();\n    if (choice->unichar_id() == char_id) {\n      return choice;\n    }\n  }\n  return nullptr;\n}\n\nconst char *WERD_CHOICE::permuter_name(uint8_t permuter) {\n  return kPermuterTypeNames[permuter];\n}\n\nconst char *ScriptPosToString(enum ScriptPos script_pos) {\n  switch (script_pos) {\n    case SP_NORMAL:\n      return \"NORM\";\n    case SP_SUBSCRIPT:\n      return \"SUB\";\n    case SP_SUPERSCRIPT:\n      return \"SUPER\";\n    case SP_DROPCAP:\n      return \"DROPC\";\n  }\n  return \"SP_UNKNOWN\";\n}\n\n/**\n * WERD_CHOICE::WERD_CHOICE\n *\n * Constructor to build a WERD_CHOICE from the given string.\n * The function assumes that src_string is not nullptr.\n */\nWERD_CHOICE::WERD_CHOICE(const 
char *src_string, const UNICHARSET &unicharset)\n    : unicharset_(&unicharset) {\n  std::vector<UNICHAR_ID> encoding;\n  std::vector<char> lengths;\n  std::string cleaned = unicharset.CleanupString(src_string);\n  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) {\n    lengths.push_back('\\0');\n    std::string src_lengths = &lengths[0];\n    this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);\n  } else { // There must have been an invalid unichar in the string.\n    this->init(8);\n    this->make_bad();\n  }\n}\n\n/**\n * WERD_CHOICE::init\n *\n * Helper function to build a WERD_CHOICE from the given string,\n * fragment lengths, rating, certainty and permuter.\n *\n * The function assumes that src_string is not nullptr.\n * src_lengths argument could be nullptr, in which case the unichars\n * in src_string are assumed to all be of length 1.\n */\nvoid WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating,\n                       float src_certainty, uint8_t src_permuter) {\n  int src_string_len = strlen(src_string);\n  if (src_string_len == 0) {\n    this->init(8);\n  } else {\n    this->init(src_lengths ? strlen(src_lengths) : src_string_len);\n    length_ = reserved_;\n    int offset = 0;\n    for (unsigned i = 0; i < length_; ++i) {\n      int unichar_length = src_lengths ? 
src_lengths[i] : 1;\n      unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);\n      state_[i] = 1;\n      certainties_[i] = src_certainty;\n      offset += unichar_length;\n    }\n  }\n  adjust_factor_ = 1.0f;\n  rating_ = src_rating;\n  certainty_ = src_certainty;\n  permuter_ = src_permuter;\n  dangerous_ambig_found_ = false;\n}\n\n/**\n * WERD_CHOICE::~WERD_CHOICE\n */\nWERD_CHOICE::~WERD_CHOICE() = default;\n\nconst char *WERD_CHOICE::permuter_name() const {\n  return kPermuterTypeNames[permuter_];\n}\n\n// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,\n// taken from the appropriate cell in the ratings MATRIX.\n// Borrowed pointer, so do not delete.\nBLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const {\n  MATRIX_COORD coord = MatrixCoord(index);\n  BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);\n  if (result == nullptr) {\n    result = new BLOB_CHOICE_LIST;\n    ratings->put(coord.col, coord.row, result);\n  }\n  return result;\n}\n\n// Returns the MATRIX_COORD corresponding to the location in the ratings\n// MATRIX for the given index into the word.\nMATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const {\n  int col = 0;\n  for (unsigned i = 0; i < index; ++i) {\n    col += state_[i];\n  }\n  int row = col + state_[index] - 1;\n  return MATRIX_COORD(col, row);\n}\n\n// Sets the entries for the given index from the BLOB_CHOICE, assuming\n// unit fragment lengths, but setting the state for this index to blob_count.\nvoid WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) {\n  unichar_ids_[index] = blob_choice->unichar_id();\n  script_pos_[index] = tesseract::SP_NORMAL;\n  state_[index] = blob_count;\n  certainties_[index] = blob_choice->certainty();\n}\n\n/**\n * contains_unichar_id\n *\n * Returns true if unichar_ids_ contain the given unichar_id, false otherwise.\n */\nbool 
WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {\n  for (unsigned i = 0; i < length_; ++i) {\n    if (unichar_ids_[i] == unichar_id) {\n      return true;\n    }\n  }\n  return false;\n}\n\n/**\n * remove_unichar_ids\n *\n * Removes num unichar ids starting from index start from unichar_ids_\n * and updates length_ and fragment_lengths_ to reflect this change.\n * Note: this function does not modify rating_ and certainty_.\n */\nvoid WERD_CHOICE::remove_unichar_ids(unsigned start, int num) {\n  ASSERT_HOST(start + num <= length_);\n  // Accumulate the states to account for the merged blobs.\n  for (int i = 0; i < num; ++i) {\n    if (start > 0) {\n      state_[start - 1] += state_[start + i];\n    } else if (start + num < length_) {\n      state_[start + num] += state_[start + i];\n    }\n  }\n  for (unsigned i = start; i + num < length_; ++i) {\n    unichar_ids_[i] = unichar_ids_[i + num];\n    script_pos_[i] = script_pos_[i + num];\n    state_[i] = state_[i + num];\n    certainties_[i] = certainties_[i + num];\n  }\n  length_ -= num;\n}\n\n/**\n * reverse_and_mirror_unichar_ids\n *\n * Reverses and mirrors unichars in unichar_ids.\n */\nvoid WERD_CHOICE::reverse_and_mirror_unichar_ids() {\n  for (unsigned i = 0; i < length_ / 2; ++i) {\n    UNICHAR_ID tmp_id = unichar_ids_[i];\n    unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);\n    unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);\n  }\n  if (length_ % 2 != 0) {\n    unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]);\n  }\n}\n\n/**\n * punct_stripped\n *\n * Returns the half-open interval of unichar_id indices [start, end) which\n * enclose the core portion of this word -- the part after stripping\n * punctuation from the left and right.\n */\nvoid WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const {\n  *start = 0;\n  *end = length();\n  while (*start < length() && 
unicharset()->get_ispunctuation(unichar_id(*start))) {\n    (*start)++;\n  }\n  while (*end > *start && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {\n    (*end)--;\n  }\n}\n\nvoid WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {\n  int end = length();\n  while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) &&\n         BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {\n    end--;\n  }\n  int start = 0;\n  while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) &&\n         BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {\n    start++;\n  }\n  *pstart = start;\n  *pend = end;\n}\n\nWERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const {\n  ASSERT_HOST(start <= length_);\n  ASSERT_HOST(end <= length_);\n  if (end < start) {\n    end = start;\n  }\n  WERD_CHOICE retval(unicharset_, end - start);\n  for (auto i = start; i < end; i++) {\n    retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]);\n  }\n  return retval;\n}\n\n/**\n * has_rtl_unichar_id\n *\n * Returns true if unichar_ids contain at least one \"strongly\" RTL unichar.\n */\nbool WERD_CHOICE::has_rtl_unichar_id() const {\n  for (unsigned i = 0; i < length_; ++i) {\n    UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);\n    if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {\n      return true;\n    }\n  }\n  return false;\n}\n\n/**\n * string_and_lengths\n *\n * Populates the given word_str with unichars from unichar_ids and\n * and word_lengths_str with the corresponding unichar lengths.\n */\nvoid WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_lengths_str) const {\n  *word_str = \"\";\n  if (word_lengths_str != nullptr) {\n    *word_lengths_str = \"\";\n  }\n  for (unsigned i = 0; i < length_; ++i) {\n    const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);\n    *word_str += ch;\n    if 
(word_lengths_str != nullptr) {\n      *word_lengths_str += (char)strlen(ch);\n    }\n  }\n}\n\n/**\n * append_unichar_id\n *\n * Make sure there is enough space in the word for the new unichar id\n * and call append_unichar_id_space_allocated().\n */\nvoid WERD_CHOICE::append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating,\n                                    float certainty) {\n  if (length_ == reserved_) {\n    this->double_the_size();\n  }\n  this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty);\n}\n\n/**\n * WERD_CHOICE::operator+=\n *\n * Cat a second word rating on the end of this current one.\n * The ratings are added and the confidence is the min.\n * If the permuters are NOT the same the permuter is set to COMPOUND_PERM\n */\nWERD_CHOICE &WERD_CHOICE::operator+=(const WERD_CHOICE &second) {\n  ASSERT_HOST(unicharset_ == second.unicharset_);\n  while (reserved_ < length_ + second.length()) {\n    this->double_the_size();\n  }\n  const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();\n  for (unsigned i = 0; i < second.length(); ++i) {\n    unichar_ids_[length_ + i] = other_unichar_ids[i];\n    state_[length_ + i] = second.state_[i];\n    certainties_[length_ + i] = second.certainties_[i];\n    script_pos_[length_ + i] = second.BlobPosition(i);\n  }\n  length_ += second.length();\n  if (second.adjust_factor_ > adjust_factor_) {\n    adjust_factor_ = second.adjust_factor_;\n  }\n  rating_ += second.rating();            // add ratings\n  if (second.certainty() < certainty_) { // take min\n    certainty_ = second.certainty();\n  }\n  if (second.dangerous_ambig_found_) {\n    dangerous_ambig_found_ = true;\n  }\n  if (permuter_ == NO_PERM) {\n    permuter_ = second.permuter();\n  } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) {\n    permuter_ = COMPOUND_PERM;\n  }\n  return *this;\n}\n\n/**\n * WERD_CHOICE::operator=\n *\n * Allocate enough memory to hold a copy of source 
and copy over\n * all the information from source to this WERD_CHOICE.\n */\nWERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) {\n  while (reserved_ < source.length()) {\n    this->double_the_size();\n  }\n\n  unicharset_ = source.unicharset_;\n  const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();\n  for (unsigned i = 0; i < source.length(); ++i) {\n    unichar_ids_[i] = other_unichar_ids[i];\n    state_[i] = source.state_[i];\n    certainties_[i] = source.certainties_[i];\n    script_pos_[i] = source.BlobPosition(i);\n  }\n  length_ = source.length();\n  adjust_factor_ = source.adjust_factor_;\n  rating_ = source.rating();\n  certainty_ = source.certainty();\n  min_x_height_ = source.min_x_height();\n  max_x_height_ = source.max_x_height();\n  permuter_ = source.permuter();\n  dangerous_ambig_found_ = source.dangerous_ambig_found_;\n  return *this;\n}\n\n// Sets up the script_pos_ member using the blobs_list to get the bln\n// bounding boxes, *this to get the unichars, and this->unicharset\n// to get the target positions. If small_caps is true, sub/super are not\n// considered, but dropcaps are.\n// NOTE: blobs_list should be the chopped_word blobs. 
(Fully segmented.)\nvoid WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {\n  // Initialize to normal.\n  for (unsigned i = 0; i < length_; ++i) {\n    script_pos_[i] = tesseract::SP_NORMAL;\n  }\n  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {\n    return;\n  }\n\n  unsigned position_counts[4] = {0, 0, 0, 0};\n\n  int chunk_index = 0;\n  for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {\n    TBLOB *tblob = word->blobs[chunk_index];\n    int uni_id = unichar_id(blob_index);\n    TBOX blob_box = tblob->bounding_box();\n    if (!state_.empty()) {\n      for (int i = 1; i < state_[blob_index]; ++i) {\n        ++chunk_index;\n        tblob = word->blobs[chunk_index];\n        blob_box += tblob->bounding_box();\n      }\n    }\n    script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id);\n    if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {\n      script_pos_[blob_index] = tesseract::SP_NORMAL;\n    }\n    position_counts[script_pos_[blob_index]]++;\n  }\n  // If almost everything looks like a superscript or subscript,\n  // we most likely just got the baseline wrong.\n  if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ ||\n      4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) {\n    if (debug >= 2) {\n      tprintf(\n          \"Most characters of %s are subscript or superscript.\\n\"\n          \"That seems wrong, so I'll assume we got the baseline wrong\\n\",\n          unichar_string().c_str());\n    }\n    for (unsigned i = 0; i < length_; i++) {\n      ScriptPos sp = script_pos_[i];\n      if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {\n        ASSERT_HOST(position_counts[sp] > 0);\n        position_counts[sp]--;\n        position_counts[tesseract::SP_NORMAL]++;\n        script_pos_[i] = tesseract::SP_NORMAL;\n      }\n    }\n  }\n\n  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] 
< length_) || debug >= 2) {\n    tprintf(\"SetScriptPosition on %s\\n\", unichar_string().c_str());\n    int chunk_index = 0;\n    for (unsigned blob_index = 0; blob_index < length_; ++blob_index) {\n      if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {\n        TBLOB *tblob = word->blobs[chunk_index];\n        ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));\n      }\n      chunk_index += state_.empty() ? 1 : state_[blob_index];\n    }\n  }\n}\n\n// Sets all the script_pos_ positions to the given position.\nvoid WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {\n  for (unsigned i = 0; i < length_; ++i) {\n    script_pos_[i] = position;\n  }\n}\n\n/* static */\nScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,\n                                        const TBOX &blob_box, UNICHAR_ID unichar_id) {\n  ScriptPos retval = tesseract::SP_NORMAL;\n  int top = blob_box.top();\n  int bottom = blob_box.bottom();\n  int min_bottom, max_bottom, min_top, max_top;\n  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);\n\n  int sub_thresh_top = min_top - kMinSubscriptOffset;\n  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;\n  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;\n  if (bottom <= kMaxDropCapBottom) {\n    retval = tesseract::SP_DROPCAP;\n  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {\n    retval = tesseract::SP_SUBSCRIPT;\n  } else if (bottom > sup_thresh_bot) {\n    retval = tesseract::SP_SUPERSCRIPT;\n  }\n\n  if (print_debug) {\n    const char *pos = ScriptPosToString(retval);\n    tprintf(\n        \"%s Character %s[bot:%d top: %d]  \"\n        \"bot_range[%d,%d]  top_range[%d, %d] \"\n        \"sub_thresh[bot:%d top:%d]  sup_thresh_bot %d\\n\",\n        pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top,\n        max_top, sub_thresh_bot, 
sub_thresh_top, sup_thresh_bot);\n  }\n  return retval;\n}\n\n// Returns the script-id (eg Han) of the dominant script in the word.\nint WERD_CHOICE::GetTopScriptID() const {\n  unsigned max_script = unicharset_->get_script_table_size();\n  std::vector<unsigned> sid(max_script);\n  for (unsigned x = 0; x < length_; ++x) {\n    int script_id = unicharset_->get_script(unichar_id(x));\n    sid[script_id]++;\n  }\n  if (unicharset_->han_sid() != unicharset_->null_sid()) {\n    // Add the Hiragana & Katakana counts to Han and zero them out.\n    if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {\n      sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];\n      sid[unicharset_->hiragana_sid()] = 0;\n    }\n    if (unicharset_->katakana_sid() != unicharset_->null_sid()) {\n      sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];\n      sid[unicharset_->katakana_sid()] = 0;\n    }\n  }\n  // Note that high script ID overrides lower one on a tie, thus biasing\n  // towards non-Common script (if sorted that way in unicharset file).\n  unsigned max_sid = 0;\n  for (unsigned x = 1; x < max_script; x++) {\n    if (sid[x] >= sid[max_sid]) {\n      max_sid = x;\n    }\n  }\n  if (sid[max_sid] < length_ / 2) {\n    max_sid = unicharset_->null_sid();\n  }\n  return max_sid;\n}\n\n// Fixes the state_ for a chop at the given blob_position.\nvoid WERD_CHOICE::UpdateStateForSplit(int blob_position) {\n  int total_chunks = 0;\n  for (unsigned i = 0; i < length_; ++i) {\n    total_chunks += state_[i];\n    if (total_chunks > blob_position) {\n      ++state_[i];\n      return;\n    }\n  }\n}\n\n// Returns the sum of all the state elements, being the total number of blobs.\nunsigned WERD_CHOICE::TotalOfStates() const {\n  unsigned total_chunks = 0;\n  for (unsigned i = 0; i < length_; ++i) {\n    total_chunks += state_[i];\n  }\n  return total_chunks;\n}\n\n/**\n * WERD_CHOICE::print\n *\n * Print WERD_CHOICE to stdout.\n */\nvoid 
WERD_CHOICE::print(const char *msg) const {\n  tprintf(\"%s : \", msg);\n  for (unsigned i = 0; i < length_; ++i) {\n    tprintf(\"%s\", unicharset_->id_to_unichar(unichar_ids_[i]));\n  }\n  tprintf(\" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\\n\", rating_, certainty_,\n          adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);\n  tprintf(\"pos\");\n  for (unsigned i = 0; i < length_; ++i) {\n    tprintf(\"\\t%s\", ScriptPosToString(script_pos_[i]));\n  }\n  tprintf(\"\\nstr\");\n  for (unsigned i = 0; i < length_; ++i) {\n    tprintf(\"\\t%s\", unicharset_->id_to_unichar(unichar_ids_[i]));\n  }\n  tprintf(\"\\nstate:\");\n  for (unsigned i = 0; i < length_; ++i) {\n    tprintf(\"\\t%d \", state_[i]);\n  }\n  tprintf(\"\\nC\");\n  for (unsigned i = 0; i < length_; ++i) {\n    tprintf(\"\\t%.3f\", certainties_[i]);\n  }\n  tprintf(\"\\n\");\n}\n\n// Prints the segmentation state with an introductory message.\nvoid WERD_CHOICE::print_state(const char *msg) const {\n  tprintf(\"%s\", msg);\n  for (unsigned i = 0; i < length_; ++i) {\n    tprintf(\" %d\", state_[i]);\n  }\n  tprintf(\"\\n\");\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Displays the segmentation state of *this (if not the same as the last\n// one displayed) and waits for a click in the window.\nvoid WERD_CHOICE::DisplaySegmentation(TWERD *word) {\n  // Number of different colors to draw with.\n  const int kNumColors = 6;\n  static ScrollView *segm_window = nullptr;\n  // Check the state against the static prev_drawn_state.\n  static std::vector<int> prev_drawn_state;\n  bool already_done = prev_drawn_state.size() == length_;\n  if (!already_done) {\n    prev_drawn_state.clear();\n    prev_drawn_state.resize(length_);\n  }\n  for (unsigned i = 0; i < length_; ++i) {\n    if (prev_drawn_state[i] != state_[i]) {\n      already_done = false;\n    }\n    prev_drawn_state[i] = state_[i];\n  }\n  if (already_done || word->blobs.empty()) {\n    return;\n  }\n\n  // Create 
the window if needed.\n  if (segm_window == nullptr) {\n    segm_window = new ScrollView(\"Segmentation\", 5, 10, 500, 256, 2000.0, 256.0, true);\n  } else {\n    segm_window->Clear();\n  }\n\n  TBOX bbox;\n  int blob_index = 0;\n  for (unsigned c = 0; c < length_; ++c) {\n    auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);\n    for (int i = 0; i < state_[c]; ++i, ++blob_index) {\n      TBLOB *blob = word->blobs[blob_index];\n      bbox += blob->bounding_box();\n      blob->plot(segm_window, color, color);\n    }\n  }\n  segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom());\n  segm_window->Update();\n  segm_window->Wait();\n}\n\n#endif // !GRAPHICS_DISABLED\n\nbool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2) {\n  const UNICHARSET *uchset = word1.unicharset();\n  if (word2.unicharset() != uchset) {\n    return false;\n  }\n  unsigned w1start, w1end;\n  word1.punct_stripped(&w1start, &w1end);\n  unsigned w2start, w2end;\n  word2.punct_stripped(&w2start, &w2end);\n  if (w1end - w1start != w2end - w2start) {\n    return false;\n  }\n  for (unsigned i = 0; i < w1end - w1start; i++) {\n    if (uchset->to_lower(word1.unichar_id(w1start + i)) !=\n        uchset->to_lower(word2.unichar_id(w2start + i))) {\n      return false;\n    }\n  }\n  return true;\n}\n\n/**\n * print_ratings_list\n *\n * Send all the ratings out to the logfile.\n *\n * @param msg intro message\n * @param ratings list of ratings\n * @param current_unicharset unicharset that can be used\n * for id-to-unichar conversion\n */\nvoid print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings,\n                        const UNICHARSET &current_unicharset) {\n  if (ratings->empty()) {\n    tprintf(\"%s:<none>\\n\", msg);\n    return;\n  }\n  if (*msg != '\\0') {\n    tprintf(\"%s\\n\", msg);\n  }\n  BLOB_CHOICE_IT c_it;\n  c_it.set_to_list(ratings);\n  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {\n 
   c_it.data()->print(&current_unicharset);\n    if (!c_it.at_last()) {\n      tprintf(\"\\n\");\n    }\n  }\n  tprintf(\"\\n\");\n  fflush(stdout);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/ratngs.h",
    "content": "/**********************************************************************\n * File:        ratngs.h  (Formerly ratings.h)\n * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef RATNGS_H\n#define RATNGS_H\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n\n#include \"clst.h\"\n#include \"elst.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"fontinfo.h\"\n#endif // undef DISABLED_LEGACY_ENGINE\n#include \"matrix.h\"\n#include \"unicharset.h\"\n#include \"werd.h\"\n\n#include <tesseract/unichar.h>\n\n#include <cassert>\n#include <cfloat> // for FLT_MAX\n\nnamespace tesseract {\n\nclass MATRIX;\nstruct TBLOB;\nstruct TWERD;\n\n// Enum to describe the source of a BLOB_CHOICE to make it possible to determine\n// whether a blob has been classified by inspecting the BLOB_CHOICEs.\nenum BlobChoiceClassifier {\n  BCC_STATIC_CLASSIFIER,  // From the char_norm classifier.\n  BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier.\n  BCC_SPECKLE_CLASSIFIER, // Backup for failed classification.\n  BCC_AMBIG,              // Generated by ambiguity detection.\n  BCC_FAKE,               // From some other process.\n};\n\nclass BLOB_CHOICE : public ELIST<BLOB_CHOICE>::LINK {\npublic:\n  
BLOB_CHOICE() {\n    unichar_id_ = UNICHAR_SPACE;\n    fontinfo_id_ = -1;\n    fontinfo_id2_ = -1;\n    rating_ = 10.0f;\n    certainty_ = -1.0f;\n    script_id_ = -1;\n    min_xheight_ = 0.0f;\n    max_xheight_ = 0.0f;\n    yshift_ = 0.0f;\n    classifier_ = BCC_FAKE;\n  }\n  BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id\n              float src_rating,          // rating\n              float src_cert,            // certainty\n              int script_id,             // script\n              float min_xheight,         // min xheight in image pixel units\n              float max_xheight,         // max xheight allowed by this char\n              float yshift,              // the larger of y shift (top or bottom)\n              BlobChoiceClassifier c);   // adapted match or other\n  BLOB_CHOICE(const BLOB_CHOICE &other);\n  ~BLOB_CHOICE() = default;\n\n  UNICHAR_ID unichar_id() const {\n    return unichar_id_;\n  }\n  float rating() const {\n    return rating_;\n  }\n  float certainty() const {\n    return certainty_;\n  }\n  int16_t fontinfo_id() const {\n    return fontinfo_id_;\n  }\n  int16_t fontinfo_id2() const {\n    return fontinfo_id2_;\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  const std::vector<ScoredFont> &fonts() const {\n    return fonts_;\n  }\n  void set_fonts(const std::vector<ScoredFont> &fonts) {\n    fonts_ = fonts;\n    int score1 = 0, score2 = 0;\n    fontinfo_id_ = -1;\n    fontinfo_id2_ = -1;\n    for (auto &f : fonts_) {\n      if (f.score > score1) {\n        score2 = score1;\n        fontinfo_id2_ = fontinfo_id_;\n        score1 = f.score;\n        fontinfo_id_ = f.fontinfo_id;\n      } else if (f.score > score2) {\n        score2 = f.score;\n        fontinfo_id2_ = f.fontinfo_id;\n      }\n    }\n  }\n#endif // ndef DISABLED_LEGACY_ENGINE\n  int script_id() const {\n    return script_id_;\n  }\n  const MATRIX_COORD &matrix_cell() {\n    return matrix_cell_;\n  }\n  float min_xheight() const {\n    return min_xheight_;\n  }\n  float 
max_xheight() const {\n    return max_xheight_;\n  }\n  float yshift() const {\n    return yshift_;\n  }\n  BlobChoiceClassifier classifier() const {\n    return classifier_;\n  }\n  bool IsAdapted() const {\n    return classifier_ == BCC_ADAPTED_CLASSIFIER;\n  }\n  bool IsClassified() const {\n    return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER ||\n           classifier_ == BCC_SPECKLE_CLASSIFIER;\n  }\n\n  void set_unichar_id(UNICHAR_ID newunichar_id) {\n    unichar_id_ = newunichar_id;\n  }\n  void set_rating(float newrat) {\n    rating_ = newrat;\n  }\n  void set_certainty(float newrat) {\n    certainty_ = newrat;\n  }\n  void set_script(int newscript_id) {\n    script_id_ = newscript_id;\n  }\n  void set_matrix_cell(int col, int row) {\n    matrix_cell_.col = col;\n    matrix_cell_.row = row;\n  }\n  void set_classifier(BlobChoiceClassifier classifier) {\n    classifier_ = classifier;\n  }\n  static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) {\n    auto *choice = new BLOB_CHOICE;\n    *choice = *src;\n    return choice;\n  }\n  // Returns true if *this and other agree on the baseline and x-height\n  // to within some tolerance based on a given estimate of the x-height.\n  bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const;\n\n  void print(const UNICHARSET *unicharset) const {\n    tprintf(\"r%.2f c%.2f x[%g,%g]: %d %s\",\n            static_cast<double>(rating_),\n            static_cast<double>(certainty_),\n            static_cast<double>(min_xheight_),\n            static_cast<double>(max_xheight_),\n            unichar_id_, (unicharset == nullptr) ? 
\"\" : unicharset->debug_str(unichar_id_).c_str());\n  }\n  void print_full() const {\n    print(nullptr);\n    tprintf(\" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\\n\", script_id_, fontinfo_id_,\n            fontinfo_id2_, static_cast<double>(yshift_), classifier_);\n  }\n  // Sort function for sorting BLOB_CHOICEs in increasing order of rating.\n  static int SortByRating(const void *p1, const void *p2) {\n    const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1);\n    const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2);\n    return (bc1->rating_ < bc2->rating_) ? -1 : 1;\n  }\n\nprivate:\n  // Copy assignment operator.\n  BLOB_CHOICE &operator=(const BLOB_CHOICE &other);\n\n  UNICHAR_ID unichar_id_; // unichar id\n#ifndef DISABLED_LEGACY_ENGINE\n  // Fonts and scores. Allowed to be empty.\n  std::vector<ScoredFont> fonts_;\n#endif                   // ndef DISABLED_LEGACY_ENGINE\n  int16_t fontinfo_id_;  // char font information\n  int16_t fontinfo_id2_; // 2nd choice font information\n  // Rating is the classifier distance weighted by the length of the outline\n  // in the blob. In terms of probability, classifier distance is -klog p such\n  // that the resulting distance is in the range [0, 1] and then\n  // rating = w (-k log p) where w is the weight for the length of the outline.\n  // Sums of ratings may be compared meaningfully for words of different\n  // segmentation.\n  float rating_; // size related\n  // Certainty is a number in [-20, 0] indicating the classifier certainty\n  // of the choice. 
In terms of probability, certainty = 20 (k log p) where\n  // k is defined as above to normalize -klog p to the range [0, 1].\n  float certainty_; // absolute\n  int script_id_;\n  // Holds the position of this choice in the ratings matrix.\n  // Used to location position in the matrix during path backtracking.\n  MATRIX_COORD matrix_cell_;\n  // X-height range (in image pixels) that this classification supports.\n  float min_xheight_;\n  float max_xheight_;\n  // yshift_ - The vertical distance (in image pixels) the character is\n  //           shifted (up or down) from an acceptable y position.\n  float yshift_;\n  BlobChoiceClassifier classifier_; // What generated *this.\n};\n\n// Make BLOB_CHOICE listable.\nELISTIZEH(BLOB_CHOICE)\n\n// Return the BLOB_CHOICE in bc_list matching a given unichar_id,\n// or nullptr if there is no match.\nBLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);\n\n// Permuter codes used in WERD_CHOICEs.\nenum PermuterType {\n  NO_PERM,           // 0\n  PUNC_PERM,         // 1\n  TOP_CHOICE_PERM,   // 2\n  LOWER_CASE_PERM,   // 3\n  UPPER_CASE_PERM,   // 4\n  NGRAM_PERM,        // 5\n  NUMBER_PERM,       // 6\n  USER_PATTERN_PERM, // 7\n  SYSTEM_DAWG_PERM,  // 8\n  DOC_DAWG_PERM,     // 9\n  USER_DAWG_PERM,    // 10\n  FREQ_DAWG_PERM,    // 11\n  COMPOUND_PERM,     // 12\n\n  NUM_PERMUTER_TYPES\n};\n\n// ScriptPos tells whether a character is subscript, superscript or normal.\nenum ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP };\n\nconst char *ScriptPosToString(ScriptPos script_pos);\n\nclass TESS_API WERD_CHOICE : public ELIST<WERD_CHOICE>::LINK {\npublic:\n  static const float kBadRating;\n  static const char *permuter_name(uint8_t permuter);\n\n  WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) {\n    this->init(8);\n  }\n  WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) {\n    this->init(reserved);\n  }\n  WERD_CHOICE(const char 
*src_string, const char *src_lengths, float src_rating,\n              float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)\n      : unicharset_(&unicharset) {\n    this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter);\n  }\n  WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);\n  WERD_CHOICE(const WERD_CHOICE &word) : ELIST<WERD_CHOICE>::LINK(word), unicharset_(word.unicharset_) {\n    this->init(word.length());\n    this->operator=(word);\n  }\n  ~WERD_CHOICE();\n\n  const UNICHARSET *unicharset() const {\n    return unicharset_;\n  }\n  bool empty() const {\n    return length_ == 0;\n  }\n  inline unsigned length() const {\n    return length_;\n  }\n  float adjust_factor() const {\n    return adjust_factor_;\n  }\n  void set_adjust_factor(float factor) {\n    adjust_factor_ = factor;\n  }\n  inline const std::vector<UNICHAR_ID> &unichar_ids() const {\n    return unichar_ids_;\n  }\n  inline UNICHAR_ID unichar_id(unsigned index) const {\n    assert(index < length_);\n    return unichar_ids_[index];\n  }\n  inline unsigned state(unsigned index) const {\n    return state_[index];\n  }\n  ScriptPos BlobPosition(unsigned index) const {\n    if (index >= length_) {\n      return SP_NORMAL;\n    }\n    return script_pos_[index];\n  }\n  inline float rating() const {\n    return rating_;\n  }\n  inline float certainty() const {\n    return certainty_;\n  }\n  inline float certainty(unsigned index) const {\n    return certainties_[index];\n  }\n  inline float min_x_height() const {\n    return min_x_height_;\n  }\n  inline float max_x_height() const {\n    return max_x_height_;\n  }\n  inline void set_x_heights(float min_height, float max_height) {\n    min_x_height_ = min_height;\n    max_x_height_ = max_height;\n  }\n  inline uint8_t permuter() const {\n    return permuter_;\n  }\n  const char *permuter_name() const;\n  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,\n  // taken 
from the appropriate cell in the ratings MATRIX.\n  // Borrowed pointer, so do not delete.\n  BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const;\n\n  // Returns the MATRIX_COORD corresponding to the location in the ratings\n  // MATRIX for the given index into the word.\n  MATRIX_COORD MatrixCoord(unsigned index) const;\n\n  inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) {\n    assert(index < length_);\n    unichar_ids_[index] = unichar_id;\n  }\n  bool dangerous_ambig_found() const {\n    return dangerous_ambig_found_;\n  }\n  void set_dangerous_ambig_found_(bool value) {\n    dangerous_ambig_found_ = value;\n  }\n  inline void set_rating(float new_val) {\n    rating_ = new_val;\n  }\n  inline void set_certainty(float new_val) {\n    certainty_ = new_val;\n  }\n  inline void set_permuter(uint8_t perm) {\n    permuter_ = perm;\n  }\n  // Note: this function should only be used if all the fields\n  // are populated manually with set_* functions (rather than\n  // (copy)constructors and append_* functions).\n  inline void set_length(unsigned len) {\n    ASSERT_HOST(reserved_ >= len);\n    length_ = len;\n  }\n\n  /// Make more space in unichar_id_ and fragment_lengths_ arrays.\n  inline void double_the_size() {\n    if (reserved_ > 0) {\n      reserved_ *= 2;\n    } else {\n      reserved_ = 1;\n    }\n    unichar_ids_.resize(reserved_);\n    script_pos_.resize(reserved_);\n    state_.resize(reserved_);\n    certainties_.resize(reserved_);\n  }\n\n  /// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and\n  /// fragment_length_ arrays. 
Sets other values to default (blank) values.\n  inline void init(unsigned reserved) {\n    reserved_ = reserved;\n    if (reserved > 0) {\n      unichar_ids_.resize(reserved);\n      script_pos_.resize(reserved);\n      state_.resize(reserved);\n      certainties_.resize(reserved);\n    } else {\n      unichar_ids_.clear();\n      script_pos_.clear();\n      state_.clear();\n      certainties_.clear();\n    }\n    length_ = 0;\n    adjust_factor_ = 1.0f;\n    rating_ = 0.0;\n    certainty_ = FLT_MAX;\n    min_x_height_ = 0.0f;\n    max_x_height_ = FLT_MAX;\n    permuter_ = NO_PERM;\n    unichars_in_script_order_ = false; // Tesseract is strict left-to-right.\n    dangerous_ambig_found_ = false;\n  }\n\n  /// Helper function to build a WERD_CHOICE from the given string,\n  /// fragment lengths, rating, certainty and permuter.\n  /// The function assumes that src_string is not nullptr.\n  /// src_lengths argument could be nullptr, in which case the unichars\n  /// in src_string are assumed to all be of length 1.\n  void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty,\n            uint8_t src_permuter);\n\n  /// Set the fields in this choice to be default (bad) values.\n  inline void make_bad() {\n    length_ = 0;\n    rating_ = kBadRating;\n    certainty_ = -FLT_MAX;\n  }\n\n  /// This function assumes that there is enough space reserved\n  /// in the WERD_CHOICE for adding another unichar.\n  /// This is an efficient alternative to append_unichar_id().\n  inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating,\n                                                float certainty) {\n    assert(reserved_ > length_);\n    length_++;\n    this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1);\n  }\n\n  void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty);\n\n  inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, 
float rating, float certainty,\n                             unsigned index) {\n    assert(index < length_);\n    unichar_ids_[index] = unichar_id;\n    state_[index] = blob_count;\n    certainties_[index] = certainty;\n    script_pos_[index] = SP_NORMAL;\n    rating_ += rating;\n    if (certainty < certainty_) {\n      certainty_ = certainty;\n    }\n  }\n  // Sets the entries for the given index from the BLOB_CHOICE, assuming\n  // unit fragment lengths, but setting the state for this index to blob_count.\n  void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice);\n\n  bool contains_unichar_id(UNICHAR_ID unichar_id) const;\n  void remove_unichar_ids(unsigned index, int num);\n  inline void remove_last_unichar_id() {\n    --length_;\n  }\n  inline void remove_unichar_id(unsigned index) {\n    this->remove_unichar_ids(index, 1);\n  }\n  bool has_rtl_unichar_id() const;\n  void reverse_and_mirror_unichar_ids();\n\n  // Returns the half-open interval of unichar_id indices [start, end) which\n  // enclose the core portion of this word -- the part after stripping\n  // punctuation from the left and right.\n  void punct_stripped(unsigned *start_core, unsigned *end_core) const;\n\n  // Returns the indices [start, end) containing the core of the word, stripped\n  // of any superscript digits on either side. (i.e., the non-footnote part\n  // of the word). 
There is no guarantee that the output range is non-empty.\n  void GetNonSuperscriptSpan(int *start, int *end) const;\n\n  // Return a copy of this WERD_CHOICE with the choices [start, end).\n  // The result is useful only for checking against a dictionary.\n  WERD_CHOICE shallow_copy(unsigned start, unsigned end) const;\n\n  void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const;\n  std::string debug_string() const {\n    std::string word_str;\n    for (unsigned i = 0; i < length_; ++i) {\n      word_str += unicharset_->debug_str(unichar_ids_[i]);\n      word_str += \" \";\n    }\n    return word_str;\n  }\n  // Returns true if any unichar_id in the word is a non-space-delimited char.\n  bool ContainsAnyNonSpaceDelimited() const {\n    for (unsigned i = 0; i < length_; ++i) {\n      if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {\n        return true;\n      }\n    }\n    return false;\n  }\n  // Returns true if the word is all spaces.\n  bool IsAllSpaces() const {\n    for (unsigned i = 0; i < length_; ++i) {\n      if (unichar_ids_[i] != UNICHAR_SPACE) {\n        return false;\n      }\n    }\n    return true;\n  }\n\n  // Call this to override the default (strict left to right graphemes)\n  // with the fact that some engine produces a \"reading order\" set of\n  // Graphemes for each word.\n  bool set_unichars_in_script_order(bool in_script_order) {\n    return unichars_in_script_order_ = in_script_order;\n  }\n\n  bool unichars_in_script_order() const {\n    return unichars_in_script_order_;\n  }\n\n  // Returns a UTF-8 string equivalent to the current choice\n  // of UNICHAR IDs.\n  std::string &unichar_string() {\n    this->string_and_lengths(&unichar_string_, &unichar_lengths_);\n    return unichar_string_;\n  }\n\n  // Returns a UTF-8 string equivalent to the current choice\n  // of UNICHAR IDs.\n  const std::string &unichar_string() const {\n    this->string_and_lengths(&unichar_string_, &unichar_lengths_);\n    return 
unichar_string_;\n  }\n\n  // Returns the lengths, one byte each, representing the number of bytes\n  // required in the unichar_string for each UNICHAR_ID.\n  const std::string &unichar_lengths() const {\n    this->string_and_lengths(&unichar_string_, &unichar_lengths_);\n    return unichar_lengths_;\n  }\n\n  // Sets up the script_pos_ member using the blobs_list to get the bln\n  // bounding boxes, *this to get the unichars, and this->unicharset\n  // to get the target positions. If small_caps is true, sub/super are not\n  // considered, but dropcaps are.\n  // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)\n  void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0);\n  // Sets all the script_pos_ positions to the given position.\n  void SetAllScriptPositions(ScriptPos position);\n\n  static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,\n                                    const TBOX &blob_box, UNICHAR_ID unichar_id);\n\n  // Returns the \"dominant\" script ID for the word.  By \"dominant\", the script\n  // must account for at least half the characters.  
Otherwise, it returns 0.\n  // Note that for Japanese, Hiragana and Katakana are simply treated as Han.\n  int GetTopScriptID() const;\n\n  // Fixes the state_ for a chop at the given blob_position.\n  void UpdateStateForSplit(int blob_position);\n\n  // Returns the sum of all the state elements, being the total number of blobs.\n  unsigned TotalOfStates() const;\n\n  void print() const {\n    this->print(\"\");\n  }\n  void print(const char *msg) const;\n  // Prints the segmentation state with an introductory message.\n  void print_state(const char *msg) const;\n\n  // Displays the segmentation state of *this (if not the same as the last\n  // one displayed) and waits for a click in the window.\n  void DisplaySegmentation(TWERD *word);\n\n  WERD_CHOICE &operator+=(        // concatenate\n      const WERD_CHOICE &second); // second on first\n\n  WERD_CHOICE &operator=(const WERD_CHOICE &source);\n\nprivate:\n  const UNICHARSET *unicharset_;\n  // TODO(rays) Perhaps replace the multiple arrays with an array of structs?\n  // unichar_ids_ is an array of classifier \"results\" that make up a word.\n  // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position\n  // of each unichar_id.\n  // state_[i] indicates the number of blobs in WERD_RES::chopped_word that\n  // were put together to make the classification results in the ith position\n  // in unichar_ids_, and certainties_[i] is the certainty of the choice that\n  // was used in this word.\n  // == Change from before ==\n  // Previously there was fragment_lengths_ that allowed a word to be\n  // artificially composed of multiple fragment results. 
Since the new\n  // segmentation search doesn't do fragments, treatment of fragments has\n  // been moved to a lower level, augmenting the ratings matrix with the\n  // combined fragments, and allowing the language-model/segmentation-search\n  // to deal with only the combined unichar_ids.\n  std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word\n  std::vector<ScriptPos> script_pos_;   // Normal/Sub/Superscript of each unichar.\n  std::vector<int> state_;              // Number of blobs in each unichar.\n  std::vector<float> certainties_;      // Certainty of each unichar.\n  unsigned reserved_;            // size of the above arrays\n  unsigned length_;              // word length\n  // Factor that was used to adjust the rating.\n  float adjust_factor_;\n  // Rating is the sum of the ratings of the individual blobs in the word.\n  float rating_; // size related\n  // certainty is the min (worst) certainty of the individual blobs in the word.\n  float certainty_; // absolute\n  // xheight computed from the result, or 0 if inconsistent.\n  float min_x_height_;\n  float max_x_height_;\n  uint8_t permuter_; // permuter code\n\n  // Normally, the ratings_ matrix represents the recognition results in order\n  // from left-to-right.  
However, some engines (say Cube) may return\n  // recognition results in the order of the script's major reading direction\n  // (for Arabic, that is right-to-left).\n  bool unichars_in_script_order_;\n  // True if NoDangerousAmbig found an ambiguity.\n  bool dangerous_ambig_found_;\n\n  // The following variables are populated and passed by reference any\n  // time unichar_string() or unichar_lengths() are called.\n  mutable std::string unichar_string_;\n  mutable std::string unichar_lengths_;\n};\n\n// Make WERD_CHOICE listable.\nELISTIZEH(WERD_CHOICE)\nusing BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>;\n\n// Utilities for comparing WERD_CHOICEs\n\nbool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2);\n\n// Utilities for debug printing.\nvoid print_ratings_list(const char *msg,                     // intro message\n                        BLOB_CHOICE_LIST *ratings,           // list of results\n                        const UNICHARSET &current_unicharset // unicharset that can be used\n                                                             // for id-to-unichar conversion\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/rect.cpp",
    "content": "/**********************************************************************\n * File:        rect.cpp  (Formerly box.c)\n * Description: Bounding box class definition.\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"rect.h\"\n\n#include \"serialis.h\" // for TFile\n\nnamespace tesseract {\n\n/**********************************************************************\n * TBOX::TBOX()  Constructor from 2 ICOORDS\n *\n **********************************************************************/\n\nTBOX::TBOX(           // constructor\n    const ICOORD pt1, // one corner\n    const ICOORD pt2  // the other corner\n) {\n  if (pt1.x() <= pt2.x()) {\n    if (pt1.y() <= pt2.y()) {\n      bot_left = pt1;\n      top_right = pt2;\n    } else {\n      bot_left = ICOORD(pt1.x(), pt2.y());\n      top_right = ICOORD(pt2.x(), pt1.y());\n    }\n  } else {\n    if (pt1.y() <= pt2.y()) {\n      bot_left = ICOORD(pt2.x(), pt1.y());\n      top_right = ICOORD(pt1.x(), pt2.y());\n    } else {\n      bot_left = pt2;\n      top_right = pt1;\n    }\n  }\n}\n\nbool TBOX::DeSerialize(TFile *f) {\n  return bot_left.DeSerialize(f) && top_right.DeSerialize(f);\n}\n\nbool 
TBOX::Serialize(TFile *f) const {\n  return bot_left.Serialize(f) && top_right.Serialize(f);\n}\n\n// rotate_large constructs the containing bounding box of all 4\n// corners after rotating them. It therefore guarantees that all\n// original content is contained within, but also slightly enlarges the box.\nvoid TBOX::rotate_large(const FCOORD &vec) {\n  ICOORD top_left(bot_left.x(), top_right.y());\n  ICOORD bottom_right(top_right.x(), bot_left.y());\n  top_left.rotate(vec);\n  bottom_right.rotate(vec);\n  rotate(vec);\n  TBOX box2(top_left, bottom_right);\n  *this += box2;\n}\n\n/**********************************************************************\n * TBOX::intersection()  Build the largest box contained in both boxes\n *\n **********************************************************************/\n\nTBOX TBOX::intersection( // shared area box\n    const TBOX &box) const {\n  TDimension left;\n  TDimension bottom;\n  TDimension right;\n  TDimension top;\n  if (overlap(box)) {\n    if (box.bot_left.x() > bot_left.x()) {\n      left = box.bot_left.x();\n    } else {\n      left = bot_left.x();\n    }\n\n    if (box.top_right.x() < top_right.x()) {\n      right = box.top_right.x();\n    } else {\n      right = top_right.x();\n    }\n\n    if (box.bot_left.y() > bot_left.y()) {\n      bottom = box.bot_left.y();\n    } else {\n      bottom = bot_left.y();\n    }\n\n    if (box.top_right.y() < top_right.y()) {\n      top = box.top_right.y();\n    } else {\n      top = top_right.y();\n    }\n  } else {\n    left = INT16_MAX;\n    bottom = INT16_MAX;\n    top = -INT16_MAX;\n    right = -INT16_MAX;\n  }\n  return TBOX(left, bottom, right, top);\n}\n\n/**********************************************************************\n * TBOX::bounding_union()  Build the smallest box containing both boxes\n *\n **********************************************************************/\n\nTBOX TBOX::bounding_union( // box enclosing both\n    const TBOX &box) const {\n  ICOORD bl; // bottom 
left\n  ICOORD tr; // top right\n\n  if (box.bot_left.x() < bot_left.x()) {\n    bl.set_x(box.bot_left.x());\n  } else {\n    bl.set_x(bot_left.x());\n  }\n\n  if (box.top_right.x() > top_right.x()) {\n    tr.set_x(box.top_right.x());\n  } else {\n    tr.set_x(top_right.x());\n  }\n\n  if (box.bot_left.y() < bot_left.y()) {\n    bl.set_y(box.bot_left.y());\n  } else {\n    bl.set_y(bot_left.y());\n  }\n\n  if (box.top_right.y() > top_right.y()) {\n    tr.set_y(box.top_right.y());\n  } else {\n    tr.set_y(top_right.y());\n  }\n  return TBOX(bl, tr);\n}\n\n/**********************************************************************\n * TBOX::plot()  Paint a box using specified settings\n *\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid TBOX::plot(                    // paint box\n    ScrollView *fd,                 // where to paint\n    ScrollView::Color fill_colour,  // colour for inside\n    ScrollView::Color border_colour // colour for border\n    ) const {\n  fd->Brush(fill_colour);\n  fd->Pen(border_colour);\n  plot(fd);\n}\n#endif\n\n// Appends the bounding box as (%d,%d)->(%d,%d) to a string.\nvoid TBOX::print_to_str(std::string &str) const {\n  // \"(%d,%d)->(%d,%d)\", left(), bottom(), right(), top()\n  str += \"(\" + std::to_string(left());\n  str += \",\" + std::to_string(bottom());\n  str += \")->(\" + std::to_string(right());\n  str += \",\" + std::to_string(top());\n  str += ')';\n}\n\n// Writes to the given file. Returns false in case of error.\nbool TBOX::Serialize(FILE *fp) const {\n  if (!bot_left.Serialize(fp)) {\n    return false;\n  }\n  if (!top_right.Serialize(fp)) {\n    return false;\n  }\n  return true;\n}\n// Reads from the given file. 
Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool TBOX::DeSerialize(bool swap, FILE *fp) {\n  if (!bot_left.DeSerialize(swap, fp)) {\n    return false;\n  }\n  if (!top_right.DeSerialize(swap, fp)) {\n    return false;\n  }\n  return true;\n}\n\n/**********************************************************************\n * operator+=\n *\n * Extend one box to include the other  (In place union)\n **********************************************************************/\n\nTBOX &operator+=( // bounding bounding bx\n    TBOX &op1,    // operands\n    const TBOX &op2) {\n  if (op2.bot_left.x() < op1.bot_left.x()) {\n    op1.bot_left.set_x(op2.bot_left.x());\n  }\n\n  if (op2.top_right.x() > op1.top_right.x()) {\n    op1.top_right.set_x(op2.top_right.x());\n  }\n\n  if (op2.bot_left.y() < op1.bot_left.y()) {\n    op1.bot_left.set_y(op2.bot_left.y());\n  }\n\n  if (op2.top_right.y() > op1.top_right.y()) {\n    op1.top_right.set_y(op2.top_right.y());\n  }\n\n  return op1;\n}\n\n/**********************************************************************\n * operator&=\n *\n * Reduce one box to intersection with the other  (In place intersection)\n **********************************************************************/\n\nTBOX &operator&=(TBOX &op1, const TBOX &op2) {\n  if (op1.overlap(op2)) {\n    if (op2.bot_left.x() > op1.bot_left.x()) {\n      op1.bot_left.set_x(op2.bot_left.x());\n    }\n\n    if (op2.top_right.x() < op1.top_right.x()) {\n      op1.top_right.set_x(op2.top_right.x());\n    }\n\n    if (op2.bot_left.y() > op1.bot_left.y()) {\n      op1.bot_left.set_y(op2.bot_left.y());\n    }\n\n    if (op2.top_right.y() < op1.top_right.y()) {\n      op1.top_right.set_y(op2.top_right.y());\n    }\n  } else {\n    op1.bot_left.set_x(INT16_MAX);\n    op1.bot_left.set_y(INT16_MAX);\n    op1.top_right.set_x(-INT16_MAX);\n    op1.top_right.set_y(-INT16_MAX);\n  }\n  return op1;\n}\n\nbool TBOX::x_almost_equal(const TBOX &box, int 
tolerance) const {\n  return (abs(left() - box.left()) <= tolerance && abs(right() - box.right()) <= tolerance);\n}\n\nbool TBOX::almost_equal(const TBOX &box, int tolerance) const {\n  return (abs(left() - box.left()) <= tolerance && abs(right() - box.right()) <= tolerance &&\n          abs(top() - box.top()) <= tolerance && abs(bottom() - box.bottom()) <= tolerance);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/rect.h",
    "content": "/**********************************************************************\n * File:        rect.h  (Formerly box.h)\n * Description: Bounding box class definition.\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef RECT_H\n#define RECT_H\n\n#include \"points.h\"     // for ICOORD, FCOORD\n#include \"scrollview.h\" // for ScrollView, ScrollView::Color\n#include \"tesstypes.h\"  // for TDimension\n#include \"tprintf.h\"    // for tprintf\n\n#include <tesseract/export.h> // for DLLSYM\n\n#include <algorithm> // for std::max, std::min\n#include <cmath>     // for std::ceil, std::floor\n#include <cstdint>   // for INT16_MAX\n#include <cstdio>    // for FILE\n#include <string>    // for std::string\n\nnamespace tesseract {\n\nclass TESS_API TBOX { // bounding box\npublic:\n  TBOX()\n      : // empty constructor making a null box\n      bot_left(INT16_MAX, INT16_MAX)\n      , top_right(-INT16_MAX, -INT16_MAX) {}\n\n  TBOX(                  // constructor\n      const ICOORD pt1,  // one corner\n      const ICOORD pt2); // the other corner\n\n  //*********************************************************************\n  // TBOX::TBOX()  Constructor from 4 integer values.\n  //  Note: It is caller's responsibility to provide values\n  //        in the right order.\n  
//*********************************************************************\n  TBOX( // constructor\n      TDimension left, TDimension bottom, TDimension right, TDimension top)\n      : bot_left(left, bottom), top_right(right, top) {}\n\n  TBOX( // box around FCOORD\n      const FCOORD pt);\n\n  bool null_box() const { // Is box null\n    return ((left() >= right()) || (top() <= bottom()));\n  }\n\n  bool operator==(const TBOX &other) const {\n    return bot_left == other.bot_left && top_right == other.top_right;\n  }\n\n  TDimension top() const { // coord of top\n    return top_right.y();\n  }\n  void set_top(int y) {\n    top_right.set_y(y);\n  }\n\n  TDimension bottom() const { // coord of bottom\n    return bot_left.y();\n  }\n  void set_bottom(int y) {\n    bot_left.set_y(y);\n  }\n\n  TDimension left() const { // coord of left\n    return bot_left.x();\n  }\n  void set_left(int x) {\n    bot_left.set_x(x);\n  }\n\n  TDimension right() const { // coord of right\n    return top_right.x();\n  }\n  void set_right(int x) {\n    top_right.set_x(x);\n  }\n  int x_middle() const {\n    return (bot_left.x() + top_right.x()) / 2;\n  }\n  int y_middle() const {\n    return (bot_left.y() + top_right.y()) / 2;\n  }\n\n  const ICOORD &botleft() const { // access function\n    return bot_left;\n  }\n\n  ICOORD botright() const { // ~ access function\n    return ICOORD(top_right.x(), bot_left.y());\n  }\n\n  ICOORD topleft() const { // ~ access function\n    return ICOORD(bot_left.x(), top_right.y());\n  }\n\n  const ICOORD &topright() const { // access function\n    return top_right;\n  }\n\n  TDimension height() const { // how high is it?\n    if (!null_box()) {\n      return top_right.y() - bot_left.y();\n    } else {\n      return 0;\n    }\n  }\n\n  TDimension width() const { // how wide is it?\n    if (!null_box()) {\n      return top_right.x() - bot_left.x();\n    } else {\n      return 0;\n    }\n  }\n\n  int32_t area() const { // what is the area?\n    if (!null_box()) 
{\n      return width() * height();\n    } else {\n      return 0;\n    }\n  }\n\n  // Pads the box on either side by the supplied x,y pad amounts.\n  // NO checks for exceeding any bounds like 0 or an image size.\n  void pad(int xpad, int ypad) {\n    ICOORD pad(xpad, ypad);\n    bot_left -= pad;\n    top_right += pad;\n  }\n\n  void move_bottom_edge( // move one edge\n      const TDimension y) { // by +/- y\n    bot_left += ICOORD(0, y);\n  }\n\n  void move_left_edge(   // move one edge\n      const TDimension x) { // by +/- x\n    bot_left += ICOORD(x, 0);\n  }\n\n  void move_right_edge(  // move one edge\n      const TDimension x) { // by +/- x\n    top_right += ICOORD(x, 0);\n  }\n\n  void move_top_edge(    // move one edge\n      const TDimension y) { // by +/- y\n    top_right += ICOORD(0, y);\n  }\n\n  void move(              // move box\n      const ICOORD vec) { // by vector\n    bot_left += vec;\n    top_right += vec;\n  }\n\n  void move(              // move box\n      const FCOORD vec) { // by float vector\n    bot_left.set_x(static_cast<TDimension>(std::floor(bot_left.x() + vec.x())));\n    // round left\n    bot_left.set_y(static_cast<TDimension>(std::floor(bot_left.y() + vec.y())));\n    // round down\n    top_right.set_x(static_cast<TDimension>(std::ceil(top_right.x() + vec.x())));\n    // round right\n    top_right.set_y(static_cast<TDimension>(std::ceil(top_right.y() + vec.y())));\n    // round up\n  }\n\n  void scale(          // scale box\n      const float f) { // by multiplier\n    // round left\n    bot_left.set_x(static_cast<TDimension>(std::floor(bot_left.x() * f)));\n    // round down\n    bot_left.set_y(static_cast<TDimension>(std::floor(bot_left.y() * f)));\n    // round right\n    top_right.set_x(static_cast<TDimension>(std::ceil(top_right.x() * f)));\n    // round up\n    top_right.set_y(static_cast<TDimension>(std::ceil(top_right.y() * f)));\n  }\n  void scale(             // scale box\n      const FCOORD vec) { // by float vector\n  
  bot_left.set_x(static_cast<TDimension>(std::floor(bot_left.x() * vec.x())));\n    bot_left.set_y(static_cast<TDimension>(std::floor(bot_left.y() * vec.y())));\n    top_right.set_x(static_cast<TDimension>(std::ceil(top_right.x() * vec.x())));\n    top_right.set_y(static_cast<TDimension>(std::ceil(top_right.y() * vec.y())));\n  }\n\n  // rotate doesn't enlarge the box - it just rotates the bottom-left\n  // and top-right corners. Use rotate_large if you want to guarantee\n  // that all content is contained within the rotated box.\n  void rotate(const FCOORD &vec) { // by vector\n    bot_left.rotate(vec);\n    top_right.rotate(vec);\n    *this = TBOX(bot_left, top_right);\n  }\n  // rotate_large constructs the containing bounding box of all 4\n  // corners after rotating them. It therefore guarantees that all\n  // original content is contained within, but also slightly enlarges the box.\n  void rotate_large(const FCOORD &vec);\n\n  bool contains( // is pt inside box\n      const FCOORD pt) const;\n\n  bool contains( // is box inside box\n      const TBOX &box) const;\n\n  bool overlap( // do boxes overlap\n      const TBOX &box) const;\n\n  bool major_overlap( // do boxes overlap more than half\n      const TBOX &box) const;\n\n  // Do boxes overlap on x axis.\n  bool x_overlap(const TBOX &box) const;\n\n  // Return the horizontal gap between the boxes. If the boxes\n  // overlap horizontally then the return value is negative, indicating\n  // the amount of the overlap.\n  int x_gap(const TBOX &box) const {\n    return std::max(bot_left.x(), box.bot_left.x()) - std::min(top_right.x(), box.top_right.x());\n  }\n\n  // Return the vertical gap between the boxes. 
If the boxes\n  // overlap vertically then the return value is negative, indicating\n  // the amount of the overlap.\n  int y_gap(const TBOX &box) const {\n    return std::max(bot_left.y(), box.bot_left.y()) - std::min(top_right.y(), box.top_right.y());\n  }\n\n  // Do boxes overlap on x axis by more than\n  // half of the width of the narrower box.\n  bool major_x_overlap(const TBOX &box) const;\n\n  // Do boxes overlap on y axis.\n  bool y_overlap(const TBOX &box) const;\n\n  // Do boxes overlap on y axis by more than\n  // half of the height of the shorter box.\n  bool major_y_overlap(const TBOX &box) const;\n\n  // fraction of current box's area covered by other\n  double overlap_fraction(const TBOX &box) const;\n\n  // fraction of the current box's projected area covered by the other's\n  double x_overlap_fraction(const TBOX &box) const;\n\n  // fraction of the current box's projected area covered by the other's\n  double y_overlap_fraction(const TBOX &box) const;\n\n  // Returns true if the boxes are almost equal on x axis.\n  bool x_almost_equal(const TBOX &box, int tolerance) const;\n\n  // Returns true if the boxes are almost equal\n  bool almost_equal(const TBOX &box, int tolerance) const;\n\n  TBOX intersection( // shared area box\n      const TBOX &box) const;\n\n  TBOX bounding_union( // box enclosing both\n      const TBOX &box) const;\n\n  // Sets the box boundaries to the given coordinates.\n  void set_to_given_coords(int x_min, int y_min, int x_max, int y_max) {\n    bot_left.set_x(x_min);\n    bot_left.set_y(y_min);\n    top_right.set_x(x_max);\n    top_right.set_y(y_max);\n  }\n\n  void print() const { // print\n    tprintf(\"Bounding box=(%d,%d)->(%d,%d)\\n\", left(), bottom(), right(), top());\n  }\n  // Appends the bounding box as (%d,%d)->(%d,%d) to a string.\n  void print_to_str(std::string &str) const;\n\n#ifndef GRAPHICS_DISABLED\n  void plot(                  // use current settings\n      ScrollView *fd) const { // where to paint\n    
fd->Rectangle(bot_left.x(), bot_left.y(), top_right.x(), top_right.y());\n  }\n\n  void plot(                                  // paint box\n      ScrollView *fd,                         // where to paint\n      ScrollView::Color fill_colour,          // colour for inside\n      ScrollView::Color border_colour) const; // colour for border\n#endif\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  bool Serialize(TFile *fp) const;\n\n  // Reads from the given file. Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp);\n  bool DeSerialize(TFile *fp);\n\n  friend TBOX &operator+=(TBOX &, const TBOX &);\n  // in place union\n  friend TBOX &operator&=(TBOX &, const TBOX &);\n  // in place intersection\n\nprivate:\n  ICOORD bot_left;  // bottom left corner\n  ICOORD top_right; // top right corner\n};\n\n/**********************************************************************\n * TBOX::TBOX()  Constructor from 1 FCOORD\n *\n **********************************************************************/\n\ninline TBOX::TBOX(  // constructor\n    const FCOORD pt // floating centre\n) {\n  bot_left =\n      ICOORD(static_cast<TDimension>(std::floor(pt.x())), static_cast<TDimension>(std::floor(pt.y())));\n  top_right =\n      ICOORD(static_cast<TDimension>(std::ceil(pt.x())), static_cast<TDimension>(std::ceil(pt.y())));\n}\n\n/**********************************************************************\n * TBOX::contains()  Is point within box\n *\n **********************************************************************/\n\ninline bool TBOX::contains(const FCOORD pt) const {\n  return ((pt.x() >= bot_left.x()) && (pt.x() <= top_right.x()) && (pt.y() >= bot_left.y()) &&\n          (pt.y() <= top_right.y()));\n}\n\n/**********************************************************************\n * TBOX::contains()  Is box within box\n *\n 
**********************************************************************/\n\ninline bool TBOX::contains(const TBOX &box) const {\n  return (contains(box.bot_left) && contains(box.top_right));\n}\n\n/**********************************************************************\n * TBOX::overlap()  Do two boxes overlap?\n *\n **********************************************************************/\n\ninline bool TBOX::overlap( // do boxes overlap\n    const TBOX &box) const {\n  return ((box.bot_left.x() <= top_right.x()) && (box.top_right.x() >= bot_left.x()) &&\n          (box.bot_left.y() <= top_right.y()) && (box.top_right.y() >= bot_left.y()));\n}\n\n/**********************************************************************\n * TBOX::major_overlap()  Do two boxes overlap by at least half of the smallest?\n *\n **********************************************************************/\n\ninline bool TBOX::major_overlap( // Do boxes overlap more that half.\n    const TBOX &box) const {\n  int overlap = std::min(box.top_right.x(), top_right.x());\n  overlap -= std::max(box.bot_left.x(), bot_left.x());\n  overlap += overlap;\n  if (overlap < std::min(box.width(), width())) {\n    return false;\n  }\n  overlap = std::min(box.top_right.y(), top_right.y());\n  overlap -= std::max(box.bot_left.y(), bot_left.y());\n  overlap += overlap;\n  if (overlap < std::min(box.height(), height())) {\n    return false;\n  }\n  return true;\n}\n\n/**********************************************************************\n * TBOX::overlap_fraction()  Fraction of area covered by the other box\n *\n **********************************************************************/\n\ninline double TBOX::overlap_fraction(const TBOX &box) const {\n  double fraction = 0.0;\n  if (this->area()) {\n    fraction = this->intersection(box).area() * 1.0 / this->area();\n  }\n  return fraction;\n}\n\n/**********************************************************************\n * TBOX::x_overlap()  Do two boxes overlap on x-axis\n 
*\n **********************************************************************/\n\ninline bool TBOX::x_overlap(const TBOX &box) const {\n  return ((box.bot_left.x() <= top_right.x()) && (box.top_right.x() >= bot_left.x()));\n}\n\n/**********************************************************************\n * TBOX::major_x_overlap()  Do two boxes overlap by more than half the\n *                          width of the narrower box on the x-axis\n *\n **********************************************************************/\n\ninline bool TBOX::major_x_overlap(const TBOX &box) const {\n  TDimension overlap = box.width();\n  if (this->left() > box.left()) {\n    overlap -= this->left() - box.left();\n  }\n  if (this->right() < box.right()) {\n    overlap -= box.right() - this->right();\n  }\n  return (overlap >= box.width() / 2 || overlap >= this->width() / 2);\n}\n\n/**********************************************************************\n * TBOX::y_overlap()  Do two boxes overlap on y-axis\n *\n **********************************************************************/\n\ninline bool TBOX::y_overlap(const TBOX &box) const {\n  return ((box.bot_left.y() <= top_right.y()) && (box.top_right.y() >= bot_left.y()));\n}\n\n/**********************************************************************\n * TBOX::major_y_overlap()  Do two boxes overlap by more than half the\n *                          height of the shorter box on the y-axis\n *\n **********************************************************************/\n\ninline bool TBOX::major_y_overlap(const TBOX &box) const {\n  TDimension overlap = box.height();\n  if (this->bottom() > box.bottom()) {\n    overlap -= this->bottom() - box.bottom();\n  }\n  if (this->top() < box.top()) {\n    overlap -= box.top() - this->top();\n  }\n  return (overlap >= box.height() / 2 || overlap >= this->height() / 2);\n}\n\n/**********************************************************************\n * TBOX::x_overlap_fraction() Calculates the horizontal overlap 
of the\n *                            given boxes as a fraction of this boxes\n *                            width.\n *\n **********************************************************************/\n\ninline double TBOX::x_overlap_fraction(const TBOX &other) const {\n  int low = std::max(left(), other.left());\n  int high = std::min(right(), other.right());\n  int width = right() - left();\n  if (width == 0) {\n    int x = left();\n    if (other.left() <= x && x <= other.right()) {\n      return 1.0;\n    } else {\n      return 0.0;\n    }\n  } else {\n    return std::max(0.0, static_cast<double>(high - low) / width);\n  }\n}\n\n/**********************************************************************\n * TBOX::y_overlap_fraction() Calculates the vertical overlap of the\n *                            given boxes as a fraction of this boxes\n *                            height.\n *\n **********************************************************************/\n\ninline double TBOX::y_overlap_fraction(const TBOX &other) const {\n  int low = std::max(bottom(), other.bottom());\n  int high = std::min(top(), other.top());\n  int height = top() - bottom();\n  if (height == 0) {\n    int y = bottom();\n    if (other.bottom() <= y && y <= other.top()) {\n      return 1.0;\n    } else {\n      return 0.0;\n    }\n  } else {\n    return std::max(0.0, static_cast<double>(high - low) / height);\n  }\n}\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/rejctmap.cpp",
    "content": "/**********************************************************************\n * File:        rejctmap.cpp  (Formerly rejmap.c)\n * Description: REJ and REJMAP class functions.\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"rejctmap.h\"\n\n#include <memory>\n\n#include \"params.h\"\n\nnamespace tesseract {\n\nvoid REJ::full_print(FILE *fp) const {\n  fprintf(fp, \"R_TESS_FAILURE: %s\\n\", flag(R_TESS_FAILURE) ? \"T\" : \"F\");\n  fprintf(fp, \"R_SMALL_XHT: %s\\n\", flag(R_SMALL_XHT) ? \"T\" : \"F\");\n  fprintf(fp, \"R_EDGE_CHAR: %s\\n\", flag(R_EDGE_CHAR) ? \"T\" : \"F\");\n  fprintf(fp, \"R_1IL_CONFLICT: %s\\n\", flag(R_1IL_CONFLICT) ? \"T\" : \"F\");\n  fprintf(fp, \"R_POSTNN_1IL: %s\\n\", flag(R_POSTNN_1IL) ? \"T\" : \"F\");\n  fprintf(fp, \"R_REJ_CBLOB: %s\\n\", flag(R_REJ_CBLOB) ? \"T\" : \"F\");\n  fprintf(fp, \"R_MM_REJECT: %s\\n\", flag(R_MM_REJECT) ? \"T\" : \"F\");\n  fprintf(fp, \"R_BAD_REPETITION: %s\\n\", flag(R_BAD_REPETITION) ? \"T\" : \"F\");\n  fprintf(fp, \"R_POOR_MATCH: %s\\n\", flag(R_POOR_MATCH) ? \"T\" : \"F\");\n  fprintf(fp, \"R_NOT_TESS_ACCEPTED: %s\\n\",\n          flag(R_NOT_TESS_ACCEPTED) ? \"T\" : \"F\");\n  fprintf(fp, \"R_CONTAINS_BLANKS: %s\\n\", flag(R_CONTAINS_BLANKS) ? 
\"T\" : \"F\");\n  fprintf(fp, \"R_BAD_PERMUTER: %s\\n\", flag(R_BAD_PERMUTER) ? \"T\" : \"F\");\n  fprintf(fp, \"R_HYPHEN: %s\\n\", flag(R_HYPHEN) ? \"T\" : \"F\");\n  fprintf(fp, \"R_DUBIOUS: %s\\n\", flag(R_DUBIOUS) ? \"T\" : \"F\");\n  fprintf(fp, \"R_NO_ALPHANUMS: %s\\n\", flag(R_NO_ALPHANUMS) ? \"T\" : \"F\");\n  fprintf(fp, \"R_MOSTLY_REJ: %s\\n\", flag(R_MOSTLY_REJ) ? \"T\" : \"F\");\n  fprintf(fp, \"R_XHT_FIXUP: %s\\n\", flag(R_XHT_FIXUP) ? \"T\" : \"F\");\n  fprintf(fp, \"R_BAD_QUALITY: %s\\n\", flag(R_BAD_QUALITY) ? \"T\" : \"F\");\n  fprintf(fp, \"R_DOC_REJ: %s\\n\", flag(R_DOC_REJ) ? \"T\" : \"F\");\n  fprintf(fp, \"R_BLOCK_REJ: %s\\n\", flag(R_BLOCK_REJ) ? \"T\" : \"F\");\n  fprintf(fp, \"R_ROW_REJ: %s\\n\", flag(R_ROW_REJ) ? \"T\" : \"F\");\n  fprintf(fp, \"R_UNLV_REJ: %s\\n\", flag(R_UNLV_REJ) ? \"T\" : \"F\");\n  fprintf(fp, \"R_HYPHEN_ACCEPT: %s\\n\", flag(R_HYPHEN_ACCEPT) ? \"T\" : \"F\");\n  fprintf(fp, \"R_NN_ACCEPT: %s\\n\", flag(R_NN_ACCEPT) ? \"T\" : \"F\");\n  fprintf(fp, \"R_MM_ACCEPT: %s\\n\", flag(R_MM_ACCEPT) ? \"T\" : \"F\");\n  fprintf(fp, \"R_QUALITY_ACCEPT: %s\\n\", flag(R_QUALITY_ACCEPT) ? \"T\" : \"F\");\n  fprintf(fp, \"R_MINIMAL_REJ_ACCEPT: %s\\n\",\n          flag(R_MINIMAL_REJ_ACCEPT) ? 
\"T\" : \"F\");\n}\n\nREJMAP &REJMAP::operator=(const REJMAP &source) {\n  initialise(source.len);\n  for (unsigned i = 0; i < len; i++) {\n    ptr[i] = source.ptr[i];\n  }\n  return *this;\n}\n\nvoid REJMAP::initialise(uint16_t length) {\n  ptr = std::make_unique<REJ[]>(length);\n  len = length;\n}\n\nint16_t REJMAP::accept_count() const { // How many accepted?\n  int16_t count = 0;\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      count++;\n    }\n  }\n  return count;\n}\n\nbool REJMAP::recoverable_rejects() const { // Any non perm rejs?\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].recoverable()) {\n      return true;\n    }\n  }\n  return false;\n}\n\nbool REJMAP::quality_recoverable_rejects() const { // Any potential rejs?\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accept_if_good_quality()) {\n      return true;\n    }\n  }\n  return false;\n}\n\nvoid REJMAP::remove_pos( // Cut out an element\n    uint16_t pos         // element to remove\n) {\n  ASSERT_HOST(pos < len);\n  ASSERT_HOST(len > 0);\n\n  len--;\n  for (; pos < len; pos++) {\n    ptr[pos] = ptr[pos + 1];\n  }\n}\n\nvoid REJMAP::print(FILE *fp) const {\n  fputc('\"', fp);\n  for (unsigned i = 0; i < len; i++) {\n    fputc( ptr[i].display_char(), fp);\n  }\n  fputc('\"', fp);\n}\n\nvoid REJMAP::full_print(FILE *fp) const {\n  for (unsigned i = 0; i < len; i++) {\n    ptr[i].full_print(fp);\n    fprintf(fp, \"\\n\");\n  }\n}\n\nvoid REJMAP::rej_word_small_xht() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    ptr[i].setrej_small_xht();\n  }\n}\n\nvoid REJMAP::rej_word_tess_failure() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    ptr[i].setrej_tess_failure();\n  }\n}\n\nvoid REJMAP::rej_word_not_tess_accepted() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_not_tess_accepted();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_contains_blanks() { // Reject 
whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_contains_blanks();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_bad_permuter() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_bad_permuter();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_xht_fixup() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_xht_fixup();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_no_alphanums() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_no_alphanums();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_mostly_rej() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_mostly_rej();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_bad_quality() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_bad_quality();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_doc_rej() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_doc_rej();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_block_rej() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_block_rej();\n    }\n  }\n}\n\nvoid REJMAP::rej_word_row_rej() { // Reject whole word\n  for (unsigned i = 0; i < len; i++) {\n    if (ptr[i].accepted()) {\n      ptr[i].setrej_row_rej();\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/rejctmap.h",
    "content": "/**********************************************************************\n * File:        rejctmap.h  (Formerly rejmap.h)\n * Description: REJ and REJMAP class functions.\n * Author:    Phil Cheatle\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n\nThis module may look unnecessarily verbose, but here's the philosophy...\n\nALL processing of the reject map is done in this module. There are lots of\nseparate calls to set reject/accept flags. These have DELIBERATELY been kept\ndistinct so that this module can decide what to do.\n\nBasically, there is a flag for each sort of rejection or acceptance. This\nprovides a history of what has happened to EACH character.\n\nDetermining whether a character is CURRENTLY rejected depends on implicit\nunderstanding of the SEQUENCE of possible calls. The flags are defined and\ngrouped in the REJ_FLAGS enum. These groupings are used in determining a\ncharacters CURRENT rejection status. 
Basically, a character is ACCEPTED if\n\n    none of the permanent rej flags are set\n  AND (    the character has never been rejected\n      OR an accept flag is set which is LATER than the latest reject flag )\n\nIT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE\nOF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!\n**********************************************************************/\n\n#ifndef REJCTMAP_H\n#define REJCTMAP_H\n\n#include \"errcode.h\"\n#include \"params.h\"\n\n#include <bitset>\n#include <memory>\n\nnamespace tesseract {\n\nenum REJ_FLAGS {\n  /* Reject modes which are NEVER overridden */\n  R_TESS_FAILURE,   // PERM Tess didn't classify\n  R_SMALL_XHT,      // PERM Xht too small\n  R_EDGE_CHAR,      // PERM Too close to edge of image\n  R_1IL_CONFLICT,   // PERM 1Il confusion\n  R_POSTNN_1IL,     // PERM 1Il unrejected by NN\n  R_REJ_CBLOB,      // PERM Odd blob\n  R_MM_REJECT,      // PERM Matrix match rejection (m's)\n  R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend\n\n  /* Initial reject modes (pre NN_ACCEPT) */\n  R_POOR_MATCH,        // TEMP Ray's original heuristic (Not used)\n  R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD\n  R_CONTAINS_BLANKS,   // TEMP Tess failed on other chs in WERD\n  R_BAD_PERMUTER,      // POTENTIAL Bad permuter for WERD\n\n  /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */\n  R_HYPHEN,       // TEMP Post NN dodgy hyphen or full stop\n  R_DUBIOUS,      // TEMP Post NN dodgy chars\n  R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN\n  R_MOSTLY_REJ,   // TEMP Most of word rejected so rej the rest\n  R_XHT_FIXUP,    // TEMP Xht tests unsure\n\n  /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */\n  R_BAD_QUALITY, // TEMP Quality metrics bad for WERD\n\n  /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accept */\n  R_DOC_REJ,   // TEMP Document rejection\n  R_BLOCK_REJ, // TEMP Block rejection\n  
R_ROW_REJ,   // TEMP Row rejection\n  R_UNLV_REJ,  // TEMP ~ turned to - or ^ turned to space\n\n  /* Accept modes which occur between the above rejection groups */\n  R_NN_ACCEPT,         // NN acceptance\n  R_HYPHEN_ACCEPT,     // Hyphen acceptance\n  R_MM_ACCEPT,         // Matrix match acceptance\n  R_QUALITY_ACCEPT,    // Accept word in good quality doc\n  R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess failures\n};\n\n/* REJECT MAP VALUES */\n\n#define MAP_ACCEPT '1'\n#define MAP_REJECT_PERM '0'\n#define MAP_REJECT_TEMP '2'\n#define MAP_REJECT_POTENTIAL '3'\n\nclass REJ {\n  std::bitset<32> flags;\n\n  void set_flag(REJ_FLAGS rej_flag) {\n    flags.set(rej_flag);\n  }\n\npublic:\n  REJ() = default;\n\n  REJ( // classwise copy\n      const REJ &source) {\n    flags = source.flags;\n  }\n\n  REJ &operator=( // assign REJ\n      const REJ &source) = default;\n\n  bool flag(REJ_FLAGS rej_flag) const {\n    return flags[rej_flag];\n  }\n\n  char display_char() const {\n    if (perm_rejected()) {\n      return MAP_REJECT_PERM;\n    } else if (accept_if_good_quality()) {\n      return MAP_REJECT_POTENTIAL;\n    } else if (rejected()) {\n      return MAP_REJECT_TEMP;\n    } else {\n      return MAP_ACCEPT;\n    }\n  }\n\n  bool perm_rejected() const { // Is char perm reject?\n    return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) ||\n            flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) ||\n            flag(R_BAD_REPETITION) || flag(R_MM_REJECT));\n  }\n\nprivate:\n  bool rej_before_nn_accept() const {\n    return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) ||\n           flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER);\n  }\n\n  bool rej_between_nn_and_mm() const {\n    return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) ||\n           flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP);\n  }\n\n  bool rej_between_mm_and_quality_accept() const {\n    return flag(R_BAD_QUALITY);\n  }\n\n  bool 
rej_between_quality_and_minimal_rej_accept() const {\n    return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) ||\n           flag(R_UNLV_REJ);\n  }\n\n  bool rej_before_mm_accept() const {\n    return rej_between_nn_and_mm() ||\n           (rej_before_nn_accept() && !flag(R_NN_ACCEPT) &&\n            !flag(R_HYPHEN_ACCEPT));\n  }\n\n  bool rej_before_quality_accept() const {\n    return rej_between_mm_and_quality_accept() ||\n           (!flag(R_MM_ACCEPT) && rej_before_mm_accept());\n  }\n\npublic:\n  bool rejected() const { // Is char rejected?\n    if (flag(R_MINIMAL_REJ_ACCEPT)) {\n      return false;\n    } else {\n      return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() ||\n              (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept()));\n    }\n  }\n\n  bool accept_if_good_quality() const { // potential rej?\n    return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) &&\n            !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) &&\n            !flag(R_CONTAINS_BLANKS) &&\n            (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() &&\n             !rej_between_quality_and_minimal_rej_accept()));\n  }\n\n  void setrej_tess_failure() { // Tess generated blank\n    set_flag(R_TESS_FAILURE);\n  }\n\n  void setrej_small_xht() { // Small xht char/wd\n    set_flag(R_SMALL_XHT);\n  }\n\n  void setrej_edge_char() { // Close to image edge\n    set_flag(R_EDGE_CHAR);\n  }\n\n  void setrej_1Il_conflict() { // Initial reject map\n    set_flag(R_1IL_CONFLICT);\n  }\n\n  void setrej_postNN_1Il() { // 1Il after NN\n    set_flag(R_POSTNN_1IL);\n  }\n\n  void setrej_rej_cblob() { // Insert duff blob\n    set_flag(R_REJ_CBLOB);\n  }\n\n  void setrej_mm_reject() { // Matrix matcher\n    set_flag(R_MM_REJECT);\n  }\n\n  void setrej_bad_repetition() { // Odd repeated char\n    set_flag(R_BAD_REPETITION);\n  }\n\n  void setrej_poor_match() { // Failed Rays heuristic\n    set_flag(R_POOR_MATCH);\n  }\n\n  void 
setrej_not_tess_accepted() {\n    // TEMP reject_word\n    set_flag(R_NOT_TESS_ACCEPTED);\n  }\n\n  void setrej_contains_blanks() {\n    // TEMP reject_word\n    set_flag(R_CONTAINS_BLANKS);\n  }\n\n  void setrej_bad_permuter() { // POTENTIAL reject_word\n    set_flag(R_BAD_PERMUTER);\n  }\n\n  void setrej_hyphen() { // PostNN dubious hyphen or .\n    set_flag(R_HYPHEN);\n  }\n\n  void setrej_dubious() { // PostNN dubious limit\n    set_flag(R_DUBIOUS);\n  }\n\n  void setrej_no_alphanums() { // TEMP reject_word\n    set_flag(R_NO_ALPHANUMS);\n  }\n\n  void setrej_mostly_rej() { // TEMP reject_word\n    set_flag(R_MOSTLY_REJ);\n  }\n\n  void setrej_xht_fixup() { // xht fixup\n    set_flag(R_XHT_FIXUP);\n  }\n\n  void setrej_bad_quality() { // TEMP reject_word\n    set_flag(R_BAD_QUALITY);\n  }\n\n  void setrej_doc_rej() { // TEMP reject_word\n    set_flag(R_DOC_REJ);\n  }\n\n  void setrej_block_rej() { // TEMP reject_word\n    set_flag(R_BLOCK_REJ);\n  }\n\n  void setrej_row_rej() { // TEMP reject_word\n    set_flag(R_ROW_REJ);\n  }\n\n  void setrej_unlv_rej() { // TEMP reject_word\n    set_flag(R_UNLV_REJ);\n  }\n\n  void setrej_hyphen_accept() { // NN Flipped a char\n    set_flag(R_HYPHEN_ACCEPT);\n  }\n\n  void setrej_nn_accept() { // NN Flipped a char\n    set_flag(R_NN_ACCEPT);\n  }\n\n  void setrej_mm_accept() { // Matrix matcher\n    set_flag(R_MM_ACCEPT);\n  }\n\n  void setrej_quality_accept() { // Quality flip a char\n    set_flag(R_QUALITY_ACCEPT);\n  }\n\n  void setrej_minimal_rej_accept() {\n    // Accept all except blank\n    set_flag(R_MINIMAL_REJ_ACCEPT);\n  }\n\n  bool accepted() const { // Is char accepted?\n    return !rejected();\n  }\n\n  bool recoverable() const {\n    return (rejected() && !perm_rejected());\n  }\n\n  void full_print(FILE *fp) const;\n};\n\nclass REJMAP {\n  std::unique_ptr<REJ[]> ptr; // ptr to the chars\n  uint16_t len = 0;           // Number of chars\n\npublic:\n  REJMAP() = default;\n\n  REJMAP(const REJMAP &rejmap) {\n    
*this = rejmap;\n  }\n\n  REJMAP &operator=(const REJMAP &source);\n\n  // Sets up the ptr array to length, whatever it was before.\n  void initialise(uint16_t length);\n\n  REJ &operator[](         // access function\n      uint16_t index) const // map index\n  {\n    ASSERT_HOST(index < len);\n    return ptr[index]; // no bounds checks\n  }\n\n  uint16_t length() const { // map length\n    return len;\n  }\n\n  int16_t accept_count() const; // How many accepted?\n\n  int16_t reject_count() const { // How many rejects?\n    return len - accept_count();\n  }\n\n  // Cut out an element.\n  void remove_pos(uint16_t pos);\n\n  void print(FILE *fp) const;\n\n  void full_print(FILE *fp) const;\n\n  bool recoverable_rejects() const; // Any non perm rejs?\n\n  bool quality_recoverable_rejects() const;\n  // Any potential rejs?\n\n  void rej_word_small_xht(); // Reject whole word\n                             // Reject whole word\n  void rej_word_tess_failure();\n  void rej_word_not_tess_accepted();\n  // Reject whole word\n  // Reject whole word\n  void rej_word_contains_blanks();\n  // Reject whole word\n  void rej_word_bad_permuter();\n  void rej_word_xht_fixup(); // Reject whole word\n                             // Reject whole word\n  void rej_word_no_alphanums();\n  void rej_word_mostly_rej();  // Reject whole word\n  void rej_word_bad_quality(); // Reject whole word\n  void rej_word_doc_rej();     // Reject whole word\n  void rej_word_block_rej();   // Reject whole word\n  void rej_word_row_rej();     // Reject whole word\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/seam.cpp",
    "content": "/******************************************************************************\n *\n * File:         seam.cpp  (Formerly seam.c)\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n#include \"seam.h\"\n\n#include \"blobs.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------\n        Public Function Code\n----------------------------------------------------------------------*/\n\n// Returns the bounding box of all the points in the seam.\nTBOX SEAM::bounding_box() const {\n  TBOX box(location_.x, location_.y, location_.x, location_.y);\n  for (int s = 0; s < num_splits_; ++s) {\n    box += splits_[s].bounding_box();\n  }\n  return box;\n}\n\n// Returns true if the splits in *this SEAM appear OK in the sense that they\n// do not cross any outlines and do not chop off any ridiculously small\n// pieces.\nbool SEAM::IsHealthy(const TBLOB &blob, int min_points, int min_area) const {\n  // TODO(rays) Try testing all the splits. 
Duplicating original code for now,\n  // which tested only the first.\n  return num_splits_ == 0 || splits_[0].IsHealthy(blob, min_points, min_area);\n}\n\n// Computes the widthp_/widthn_ range for all existing SEAMs and for *this\n// seam, which is about to be inserted at insert_index. Returns false if\n// any of the computations fails, as this indicates an invalid chop.\n// widthn_/widthp_ are only changed if modify is true.\nbool SEAM::PrepareToInsertSeam(const std::vector<SEAM *> &seams,\n                               const std::vector<TBLOB *> &blobs, int insert_index, bool modify) {\n  for (int s = 0; s < insert_index; ++s) {\n    if (!seams[s]->FindBlobWidth(blobs, s, modify)) {\n      return false;\n    }\n  }\n  if (!FindBlobWidth(blobs, insert_index, modify)) {\n    return false;\n  }\n  for (unsigned s = insert_index; s < seams.size(); ++s) {\n    if (!seams[s]->FindBlobWidth(blobs, s + 1, modify)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Computes the widthp_/widthn_ range. Returns false if not all the splits\n// are accounted for. 
widthn_/widthp_ are only changed if modify is true.\nbool SEAM::FindBlobWidth(const std::vector<TBLOB *> &blobs, int index, bool modify) {\n  int num_found = 0;\n  if (modify) {\n    widthp_ = 0;\n    widthn_ = 0;\n  }\n  for (int s = 0; s < num_splits_; ++s) {\n    const SPLIT &split = splits_[s];\n    bool found_split = split.ContainedByBlob(*blobs[index]);\n    // Look right.\n    for (unsigned b = index + 1; !found_split && b < blobs.size(); ++b) {\n      found_split = split.ContainedByBlob(*blobs[b]);\n      if (found_split && b - index > widthp_ && modify) {\n        widthp_ = b - index;\n      }\n    }\n    // Look left.\n    for (int b = index - 1; !found_split && b >= 0; --b) {\n      found_split = split.ContainedByBlob(*blobs[b]);\n      if (found_split && index - b > widthn_ && modify) {\n        widthn_ = index - b;\n      }\n    }\n    if (found_split) {\n      ++num_found;\n    }\n  }\n  return num_found == num_splits_;\n}\n\n// Splits this blob into two blobs by applying the splits included in\n// *this SEAM\nvoid SEAM::ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const {\n  for (int s = 0; s < num_splits_; ++s) {\n    splits_[s].SplitOutlineList(blob->outlines);\n  }\n  blob->ComputeBoundingBoxes();\n\n  divide_blobs(blob, other_blob, italic_blob, location_);\n\n  blob->EliminateDuplicateOutlines();\n  other_blob->EliminateDuplicateOutlines();\n\n  blob->CorrectBlobOrder(other_blob);\n}\n\n// Undoes ApplySeam by removing the seam between these two blobs.\n// Produces one blob as a result, and deletes other_blob.\nvoid SEAM::UndoSeam(TBLOB *blob, TBLOB *other_blob) const {\n  if (blob->outlines == nullptr) {\n    blob->outlines = other_blob->outlines;\n    other_blob->outlines = nullptr;\n  }\n\n  TESSLINE *outline = blob->outlines;\n  while (outline->next) {\n    outline = outline->next;\n  }\n  outline->next = other_blob->outlines;\n  other_blob->outlines = nullptr;\n  delete other_blob;\n\n  for (int s = 0; s < num_splits_; ++s) {\n 
   splits_[s].UnsplitOutlineList(blob);\n  }\n  blob->ComputeBoundingBoxes();\n  blob->EliminateDuplicateOutlines();\n}\n\n// Prints everything in *this SEAM.\nvoid SEAM::Print(const char *label) const {\n  tprintf(\"%s\", label);\n  tprintf(\" %6.2f @ (%d,%d), p=%u, n=%u \", priority_, location_.x, location_.y, widthp_, widthn_);\n  for (int s = 0; s < num_splits_; ++s) {\n    splits_[s].Print();\n    if (s + 1 < num_splits_) {\n      tprintf(\",   \");\n    }\n  }\n  tprintf(\"\\n\");\n}\n\n// Prints a collection of SEAMs.\n/* static */\nvoid SEAM::PrintSeams(const char *label, const std::vector<SEAM *> &seams) {\n  if (!seams.empty()) {\n    tprintf(\"%s\\n\", label);\n    for (unsigned x = 0; x < seams.size(); ++x) {\n      tprintf(\"%2u:   \", x);\n      seams[x]->Print(\"\");\n    }\n    tprintf(\"\\n\");\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n// Draws the seam in the given window.\nvoid SEAM::Mark(ScrollView *window) const {\n  for (int s = 0; s < num_splits_; ++s) {\n    splits_[s].Mark(window);\n  }\n}\n#endif\n\n// Break up the blobs in this chain so that they are all independent.\n// This operation should undo the affect of join_pieces.\n/* static */\nvoid SEAM::BreakPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,\n                       int first, int last) {\n  for (int x = first; x < last; ++x) {\n    seams[x]->Reveal();\n  }\n\n  TESSLINE *outline = blobs[first]->outlines;\n  int next_blob = first + 1;\n\n  while (outline != nullptr && next_blob <= last) {\n    if (outline->next == blobs[next_blob]->outlines) {\n      outline->next = nullptr;\n      outline = blobs[next_blob]->outlines;\n      ++next_blob;\n    } else {\n      outline = outline->next;\n    }\n  }\n}\n\n// Join a group of base level pieces into a single blob that can then\n// be classified.\n/* static */\nvoid SEAM::JoinPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,\n                      int first, int last) {\n  TESSLINE 
*outline = blobs[first]->outlines;\n  if (!outline) {\n    return;\n  }\n\n  for (int x = first; x < last; ++x) {\n    SEAM *seam = seams[x];\n    if (x - seam->widthn_ >= first && x + seam->widthp_ < last) {\n      seam->Hide();\n    }\n    while (outline->next) {\n      outline = outline->next;\n    }\n    outline->next = blobs[x + 1]->outlines;\n  }\n}\n\n// Hides the seam so the outlines appear not to be cut by it.\nvoid SEAM::Hide() const {\n  for (int s = 0; s < num_splits_; ++s) {\n    splits_[s].Hide();\n  }\n}\n\n// Undoes hide, so the outlines are cut by the seam.\nvoid SEAM::Reveal() const {\n  for (int s = 0; s < num_splits_; ++s) {\n    splits_[s].Reveal();\n  }\n}\n\n// Computes and returns, but does not set, the full priority of *this SEAM.\nfloat SEAM::FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth,\n                         double center_knob, double width_change_knob) const {\n  if (num_splits_ == 0) {\n    return 0.0f;\n  }\n  for (int s = 1; s < num_splits_; ++s) {\n    splits_[s].SplitOutline();\n  }\n  float full_priority =\n      priority_ + splits_[0].FullPriority(xmin, xmax, overlap_knob, centered_maxwidth, center_knob,\n                                          width_change_knob);\n  for (int s = num_splits_ - 1; s >= 1; --s) {\n    splits_[s].UnsplitOutlines();\n  }\n  return full_priority;\n}\n\n/**\n * @name start_seam_list\n *\n * Initialize a list of seams that match the original number of blobs\n * present in the starting segmentation.  
Each of the seams created\n * by this routine have location information only.\n */\nvoid start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array) {\n  seam_array->clear();\n  TPOINT location;\n\n  for (unsigned b = 1; b < word->NumBlobs(); ++b) {\n    TBOX bbox = word->blobs[b - 1]->bounding_box();\n    TBOX nbox = word->blobs[b]->bounding_box();\n    location.x = (bbox.right() + nbox.left()) / 2;\n    location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4;\n    seam_array->push_back(new SEAM(0.0f, location));\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/seam.h",
    "content": "/******************************************************************************\n *\n * File:        seam.h\n * Author:      Mark Seaman, SW Productivity\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n#ifndef SEAM_H\n#define SEAM_H\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"blobs.h\"\n#include \"split.h\"\n\nnamespace tesseract {\n\nusing PRIORITY = float; /*  PRIORITY  */\n\nclass SEAM {\npublic:\n  // A seam with no splits\n  SEAM(float priority, const TPOINT &location)\n      : priority_(priority), location_(location), num_splits_(0) {}\n  // A seam with a single split point.\n  SEAM(float priority, const TPOINT &location, const SPLIT &split)\n      : priority_(priority), location_(location), num_splits_(1) {\n    splits_[0] = split;\n  }\n  // Default copy constructor, operator= and destructor are OK!\n\n  // Accessors.\n  float priority() const {\n    return priority_;\n  }\n  void set_priority(float priority) {\n    priority_ = priority;\n  }\n  bool HasAnySplits() const {\n    return num_splits_ > 0;\n  }\n\n  // Returns the bounding box of all the points in the seam.\n  TBOX bounding_box() const;\n\n  // Returns true if other can be combined into *this.\n  bool CombineableWith(const SEAM 
&other, int max_x_dist, float max_total_priority) const {\n    int dist = location_.x - other.location_.x;\n    return -max_x_dist < dist && dist < max_x_dist &&\n           num_splits_ + other.num_splits_ <= kMaxNumSplits &&\n           priority_ + other.priority_ < max_total_priority && !OverlappingSplits(other) &&\n           !SharesPosition(other);\n  }\n\n  // Combines other into *this. Only works if CombinableWith returned true.\n  void CombineWith(const SEAM &other) {\n    priority_ += other.priority_;\n    location_ += other.location_;\n    location_ /= 2;\n\n    for (uint8_t s = 0; s < other.num_splits_ && num_splits_ < kMaxNumSplits; ++s) {\n      splits_[num_splits_++] = other.splits_[s];\n    }\n  }\n\n  // Returns true if the given blob contains all splits of *this SEAM.\n  bool ContainedByBlob(const TBLOB &blob) const {\n    for (int s = 0; s < num_splits_; ++s) {\n      if (!splits_[s].ContainedByBlob(blob)) {\n        return false;\n      }\n    }\n    return true;\n  }\n\n  // Returns true if the given EDGEPT is used by this SEAM, checking only\n  // the EDGEPT pointer, not the coordinates.\n  bool UsesPoint(const EDGEPT *point) const {\n    for (int s = 0; s < num_splits_; ++s) {\n      if (splits_[s].UsesPoint(point)) {\n        return true;\n      }\n    }\n    return false;\n  }\n  // Returns true if *this and other share any common point, by coordinates.\n  bool SharesPosition(const SEAM &other) const {\n    for (int s = 0; s < num_splits_; ++s) {\n      for (int t = 0; t < other.num_splits_; ++t) {\n        if (splits_[s].SharesPosition(other.splits_[t])) {\n          return true;\n        }\n      }\n    }\n    return false;\n  }\n  // Returns true if *this and other have any vertically overlapping splits.\n  bool OverlappingSplits(const SEAM &other) const {\n    for (int s = 0; s < num_splits_; ++s) {\n      TBOX split1_box = splits_[s].bounding_box();\n      for (int t = 0; t < other.num_splits_; ++t) {\n        TBOX split2_box = 
other.splits_[t].bounding_box();\n        if (split1_box.y_overlap(split2_box)) {\n          return true;\n        }\n      }\n    }\n    return false;\n  }\n\n  // Marks the edgepts used by the seam so the segments made by the cut\n  // never get split further by another seam in the future.\n  void Finalize() {\n    for (int s = 0; s < num_splits_; ++s) {\n      splits_[s].point1->MarkChop();\n      splits_[s].point2->MarkChop();\n    }\n  }\n\n  // Returns true if the splits in *this SEAM appear OK in the sense that they\n  // do not cross any outlines and do not chop off any ridiculously small\n  // pieces.\n  bool IsHealthy(const TBLOB &blob, int min_points, int min_area) const;\n\n  // Computes the widthp_/widthn_ range for all existing SEAMs and for *this\n  // seam, which is about to be inserted at insert_index. Returns false if\n  // any of the computations fails, as this indicates an invalid chop.\n  // widthn_/widthp_ are only changed if modify is true.\n  bool PrepareToInsertSeam(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,\n                           int insert_index, bool modify);\n  // Computes the widthp_/widthn_ range. Returns false if not all the splits\n  // are accounted for. 
widthn_/widthp_ are only changed if modify is true.\n  bool FindBlobWidth(const std::vector<TBLOB *> &blobs, int index, bool modify);\n\n  // Splits this blob into two blobs by applying the splits included in\n  // *this SEAM\n  void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const;\n  // Undoes ApplySeam by removing the seam between these two blobs.\n  // Produces one blob as a result, and deletes other_blob.\n  void UndoSeam(TBLOB *blob, TBLOB *other_blob) const;\n\n  // Prints everything in *this SEAM.\n  void Print(const char *label) const;\n  // Prints a collection of SEAMs.\n  static void PrintSeams(const char *label, const std::vector<SEAM *> &seams);\n#ifndef GRAPHICS_DISABLED\n  // Draws the seam in the given window.\n  void Mark(ScrollView *window) const;\n#endif\n\n  // Break up the blobs in this chain so that they are all independent.\n  // This operation should undo the affect of join_pieces.\n  static void BreakPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,\n                          int first, int last);\n  // Join a group of base level pieces into a single blob that can then\n  // be classified.\n  static void JoinPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,\n                         int first, int last);\n\n  // Hides the seam so the outlines appear not to be cut by it.\n  void Hide() const;\n  // Undoes hide, so the outlines are cut by the seam.\n  void Reveal() const;\n\n  // Computes and returns, but does not set, the full priority of *this SEAM.\n  // The arguments here are config parameters defined in Wordrec. Add chop_\n  // to the beginning of the name.\n  float FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth,\n                     double center_knob, double width_change_knob) const;\n\nprivate:\n  // Maximum number of splits that a SEAM can hold.\n  static const uint8_t kMaxNumSplits = 3;\n  // Priority of this split. 
Lower is better.\n  float priority_;\n  // Position of the middle of the seam.\n  TPOINT location_;\n  // A range such that all splits in *this SEAM are contained within blobs in\n  // the range [index - widthn_,index + widthp_] where index is the index of\n  // this SEAM in the seams vector.\n  uint8_t widthp_ = 0;\n  uint8_t widthn_ = 0;\n  // Number of splits_ that are used.\n  uint8_t num_splits_;\n  // Set of pairs of points that are the ends of each split in the SEAM.\n  SPLIT splits_[kMaxNumSplits];\n};\n\nvoid start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/split.cpp",
    "content": "/******************************************************************************\n *\n * File:         split.cpp  (Formerly split.c)\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *************************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"split.h\"\n\n#include \"coutln.h\"\n#include \"tprintf.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------\n              V a r i a b l e s\n----------------------------------------------------------------------*/\n// Limit on the amount of penalty for the chop being off-center.\nconst int kCenterGradeCap = 25;\n// Ridiculously large priority for splits that are no use.\nconst double kBadPriority = 999.0;\n\nBOOL_VAR(wordrec_display_splits, 0, \"Display splits\");\n\n// Hides the SPLIT so the outlines appear not to be cut by it.\nvoid SPLIT::Hide() const {\n  EDGEPT *edgept = point1;\n  do {\n    edgept->Hide();\n    edgept = edgept->next;\n  } while (!edgept->EqualPos(*point2) && edgept != point1);\n  edgept = point2;\n  do {\n    edgept->Hide();\n    edgept = edgept->next;\n  } while (!edgept->EqualPos(*point1) && edgept != point2);\n}\n\n// Undoes hide, so the 
outlines are cut by the SPLIT.\nvoid SPLIT::Reveal() const {\n  EDGEPT *edgept = point1;\n  do {\n    edgept->Reveal();\n    edgept = edgept->next;\n  } while (!edgept->EqualPos(*point2) && edgept != point1);\n  edgept = point2;\n  do {\n    edgept->Reveal();\n    edgept = edgept->next;\n  } while (!edgept->EqualPos(*point1) && edgept != point2);\n}\n\n// Compute a split priority based on the bounding boxes of the parts.\n// The arguments here are config parameters defined in Wordrec. Add chop_\n// to the beginning of the name.\nfloat SPLIT::FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth,\n                          double center_knob, double width_change_knob) const {\n  TBOX box1 = Box12();\n  TBOX box2 = Box21();\n  int min_left = std::min(box1.left(), box2.left());\n  int max_right = std::max(box1.right(), box2.right());\n  if (xmin < min_left && xmax > max_right) {\n    return kBadPriority;\n  }\n\n  float grade = 0.0f;\n  // grade_overlap.\n  int width1 = box1.width();\n  int width2 = box2.width();\n  int min_width = std::min(width1, width2);\n  int overlap = -box1.x_gap(box2);\n  if (overlap == min_width) {\n    grade += 100.0f; // Total overlap.\n  } else {\n    if (2 * overlap > min_width) {\n      overlap += 2 * overlap - min_width;\n    }\n    if (overlap > 0) {\n      grade += overlap_knob * overlap;\n    }\n  }\n  // grade_center_of_blob.\n  if (width1 <= centered_maxwidth || width2 <= centered_maxwidth) {\n    grade += std::min(static_cast<double>(kCenterGradeCap), center_knob * abs(width1 - width2));\n  }\n  // grade_width_change.\n  float width_change_grade = 20 - (max_right - min_left - std::max(width1, width2));\n  if (width_change_grade > 0.0f) {\n    grade += width_change_grade * width_change_knob;\n  }\n  return grade;\n}\n\n// Returns true if *this SPLIT appears OK in the sense that it does not cross\n// any outlines and does not chop off any ridiculously small pieces.\nbool SPLIT::IsHealthy(const TBLOB &blob, int 
min_points, int min_area) const {\n  return !IsLittleChunk(min_points, min_area) &&\n         !blob.SegmentCrossesOutline(point1->pos, point2->pos);\n}\n\n// Returns true if the split generates a small chunk in terms of either area\n// or number of points.\nbool SPLIT::IsLittleChunk(int min_points, int min_area) const {\n  if (point1->ShortNonCircularSegment(min_points, point2) &&\n      point1->SegmentArea(point2) < min_area) {\n    return true;\n  }\n  if (point2->ShortNonCircularSegment(min_points, point1) &&\n      point2->SegmentArea(point1) < min_area) {\n    return true;\n  }\n  return false;\n}\n\n/**********************************************************************\n * make_edgept\n *\n * Create an EDGEPT and hook it into an existing list of edge points.\n **********************************************************************/\nEDGEPT *make_edgept(TDimension x, TDimension y, EDGEPT *next, EDGEPT *prev) {\n  EDGEPT *this_edgept;\n  /* Create point */\n  this_edgept = new EDGEPT;\n  this_edgept->pos.x = x;\n  this_edgept->pos.y = y;\n  // Now deal with the src_outline steps.\n  C_OUTLINE *prev_ol = prev->src_outline;\n  if (prev_ol != nullptr && prev->next == next) {\n    // Compute the fraction of the segment that is being cut.\n    FCOORD segment_vec(next->pos.x - prev->pos.x, next->pos.y - prev->pos.y);\n    FCOORD target_vec(x - prev->pos.x, y - prev->pos.y);\n    double cut_fraction = target_vec.length() / segment_vec.length();\n    // Get the start and end at the step level.\n    ICOORD step_start = prev_ol->position_at_index(prev->start_step);\n    int end_step = prev->start_step + prev->step_count;\n    int step_length = prev_ol->pathlength();\n    ICOORD step_end = prev_ol->position_at_index(end_step % step_length);\n    ICOORD step_vec = step_end - step_start;\n    double target_length = step_vec.length() * cut_fraction;\n    // Find the point on the segment that gives the length nearest to target.\n    int best_step = prev->start_step;\n    
ICOORD total_step(0, 0);\n    double best_dist = target_length;\n    for (int s = prev->start_step; s < end_step; ++s) {\n      total_step += prev_ol->step(s % step_length);\n      double dist = fabs(target_length - total_step.length());\n      if (dist < best_dist) {\n        best_dist = dist;\n        best_step = s + 1;\n      }\n    }\n    // The new point is an intermediate point.\n    this_edgept->src_outline = prev_ol;\n    this_edgept->step_count = end_step - best_step;\n    this_edgept->start_step = best_step % step_length;\n    prev->step_count = best_step - prev->start_step;\n  } else {\n    // The new point is poly only.\n    this_edgept->src_outline = nullptr;\n    this_edgept->step_count = 0;\n    this_edgept->start_step = 0;\n  }\n  /* Hook it up */\n  this_edgept->next = next;\n  this_edgept->prev = prev;\n  prev->next = this_edgept;\n  next->prev = this_edgept;\n  /* Set up vec entries */\n  this_edgept->vec.x = this_edgept->next->pos.x - x;\n  this_edgept->vec.y = this_edgept->next->pos.y - y;\n  this_edgept->prev->vec.x = x - this_edgept->prev->pos.x;\n  this_edgept->prev->vec.y = y - this_edgept->prev->pos.y;\n  return this_edgept;\n}\n\n/**********************************************************************\n * remove_edgept\n *\n * Remove a given EDGEPT from its list and delete it.\n **********************************************************************/\nvoid remove_edgept(EDGEPT *point) {\n  EDGEPT *prev = point->prev;\n  EDGEPT *next = point->next;\n  // Add point's steps onto prev's steps if they are from the same outline.\n  if (prev->src_outline == point->src_outline && prev->src_outline != nullptr) {\n    prev->step_count += point->step_count;\n  }\n  prev->next = next;\n  next->prev = prev;\n  prev->vec.x = next->pos.x - prev->pos.x;\n  prev->vec.y = next->pos.y - prev->pos.y;\n  delete point;\n}\n\n/**********************************************************************\n * Print\n *\n * Shows the coordinates of both points in a 
split.\n **********************************************************************/\nvoid SPLIT::Print() const {\n  tprintf(\"(%d,%d)--(%d,%d)\", point1->pos.x, point1->pos.y, point2->pos.x, point2->pos.y);\n}\n\n#ifndef GRAPHICS_DISABLED\n// Draws the split in the given window.\nvoid SPLIT::Mark(ScrollView *window) const {\n  window->Pen(ScrollView::GREEN);\n  window->Line(point1->pos.x, point1->pos.y, point2->pos.x, point2->pos.y);\n  window->UpdateWindow();\n}\n#endif\n\n// Creates two outlines out of one by splitting the original one in half.\n// Inserts the resulting outlines into the given list.\nvoid SPLIT::SplitOutlineList(TESSLINE *outlines) const {\n  SplitOutline();\n  while (outlines->next != nullptr) {\n    outlines = outlines->next;\n  }\n\n  outlines->next = new TESSLINE;\n  outlines->next->loop = point1;\n  outlines->next->ComputeBoundingBox();\n\n  outlines = outlines->next;\n\n  outlines->next = new TESSLINE;\n  outlines->next->loop = point2;\n  outlines->next->ComputeBoundingBox();\n\n  outlines->next->next = nullptr;\n}\n\n// Makes a split between these two edge points, but does not affect the\n// outlines to which they belong.\nvoid SPLIT::SplitOutline() const {\n  EDGEPT *temp2 = point2->next;\n  EDGEPT *temp1 = point1->next;\n  /* Create two new points */\n  EDGEPT *new_point1 = make_edgept(point1->pos.x, point1->pos.y, temp1, point2);\n  EDGEPT *new_point2 = make_edgept(point2->pos.x, point2->pos.y, temp2, point1);\n  // point1 and 2 are now cross-over points, so they must have nullptr\n  // src_outlines and give their src_outline information their new\n  // replacements.\n  new_point1->src_outline = point1->src_outline;\n  new_point1->start_step = point1->start_step;\n  new_point1->step_count = point1->step_count;\n  new_point2->src_outline = point2->src_outline;\n  new_point2->start_step = point2->start_step;\n  new_point2->step_count = point2->step_count;\n  point1->src_outline = nullptr;\n  point1->start_step = 0;\n  point1->step_count = 
0;\n  point2->src_outline = nullptr;\n  point2->start_step = 0;\n  point2->step_count = 0;\n}\n\n// Undoes the effect of SplitOutlineList, correcting the outlines for undoing\n// the split, but possibly leaving some duplicate outlines.\nvoid SPLIT::UnsplitOutlineList(TBLOB *blob) const {\n  /* Modify edge points */\n  UnsplitOutlines();\n\n  auto *outline1 = new TESSLINE;\n  outline1->next = blob->outlines;\n  blob->outlines = outline1;\n  outline1->loop = point1;\n\n  auto *outline2 = new TESSLINE;\n  outline2->next = blob->outlines;\n  blob->outlines = outline2;\n  outline2->loop = point2;\n}\n\n// Removes the split that was put between these two points.\nvoid SPLIT::UnsplitOutlines() const {\n  EDGEPT *tmp1 = point1->next;\n  EDGEPT *tmp2 = point2->next;\n\n  tmp1->next->prev = point2;\n  tmp2->next->prev = point1;\n\n  // tmp2 is coincident with point1. point1 takes tmp2's place as tmp2 is\n  // deleted.\n  point1->next = tmp2->next;\n  point1->src_outline = tmp2->src_outline;\n  point1->start_step = tmp2->start_step;\n  point1->step_count = tmp2->step_count;\n  // Likewise point2 takes tmp1's place.\n  point2->next = tmp1->next;\n  point2->src_outline = tmp1->src_outline;\n  point2->start_step = tmp1->start_step;\n  point2->step_count = tmp1->step_count;\n\n  delete tmp1;\n  delete tmp2;\n\n  point1->vec.x = point1->next->pos.x - point1->pos.x;\n  point1->vec.y = point1->next->pos.y - point1->pos.y;\n\n  point2->vec.x = point2->next->pos.x - point2->pos.x;\n  point2->vec.y = point2->next->pos.y - point2->pos.y;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/split.h",
    "content": "/******************************************************************************\n *\n * File:        split.h\n * Author:      Mark Seaman, SW Productivity\n * Status:      Reusable Software Component\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n#ifndef SPLIT_H\n#define SPLIT_H\n\n#include \"blobs.h\"  // for EDGEPT, TBLOB, TESSLINE\n#include \"params.h\" // for BOOL_VAR_H, BoolParam\n#include \"rect.h\"   // for TBOX\n\nnamespace tesseract {\n\nclass ScrollView;\n\n/*----------------------------------------------------------------------\n              T y p e s\n----------------------------------------------------------------------*/\nstruct SPLIT {\n  SPLIT() : point1(nullptr), point2(nullptr) {}\n  SPLIT(EDGEPT *pt1, EDGEPT *pt2) : point1(pt1), point2(pt2) {}\n\n  // Returns the bounding box of all the points in the split.\n  TBOX bounding_box() const {\n    return TBOX(std::min(point1->pos.x, point2->pos.x), std::min(point1->pos.y, point2->pos.y),\n                std::max(point1->pos.x, point2->pos.x), std::max(point1->pos.y, point2->pos.y));\n  }\n\n  // Returns the bounding box of the outline from point1 to point2.\n  TBOX Box12() const {\n    return point1->SegmentBox(point2);\n  }\n  // Returns the bounding box of the outline from point1 to point1.\n  TBOX Box21() const {\n    return 
point2->SegmentBox(point1);\n  }\n  // Returns the bounding box of the out\n\n  // Hides the SPLIT so the outlines appear not to be cut by it.\n  void Hide() const;\n  // Undoes hide, so the outlines are cut by the SPLIT.\n  void Reveal() const;\n\n  // Returns true if the given EDGEPT is used by this SPLIT, checking only\n  // the EDGEPT pointer, not the coordinates.\n  bool UsesPoint(const EDGEPT *point) const {\n    return point1 == point || point2 == point;\n  }\n  // Returns true if the other SPLIT has any position shared with *this.\n  bool SharesPosition(const SPLIT &other) const {\n    return point1->EqualPos(*other.point1) || point1->EqualPos(*other.point2) ||\n           point2->EqualPos(*other.point1) || point2->EqualPos(*other.point2);\n  }\n  // Returns true if both points are contained within the blob.\n  bool ContainedByBlob(const TBLOB &blob) const {\n    return blob.Contains(point1->pos) && blob.Contains(point2->pos);\n  }\n  // Returns true if both points are contained within the outline.\n  bool ContainedByOutline(const TESSLINE &outline) const {\n    return outline.Contains(point1->pos) && outline.Contains(point2->pos);\n  }\n  // Compute a split priority based on the bounding boxes of the parts.\n  // The arguments here are config parameters defined in Wordrec. 
Add chop_\n  // to the beginning of the name.\n  float FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth,\n                     double center_knob, double width_change_knob) const;\n  // Returns true if *this SPLIT appears OK in the sense that it does not cross\n  // any outlines and does not chop off any ridiculously small pieces.\n  bool IsHealthy(const TBLOB &blob, int min_points, int min_area) const;\n  // Returns true if the split generates a small chunk in terms of either area\n  // or number of points.\n  bool IsLittleChunk(int min_points, int min_area) const;\n\n  void Print() const;\n#ifndef GRAPHICS_DISABLED\n  // Draws the split in the given window.\n  void Mark(ScrollView *window) const;\n#endif\n\n  // Creates two outlines out of one by splitting the original one in half.\n  // Inserts the resulting outlines into the given list.\n  void SplitOutlineList(TESSLINE *outlines) const;\n  // Makes a split between these two edge points, but does not affect the\n  // outlines to which they belong.\n  void SplitOutline() const;\n  // Undoes the effect of SplitOutlineList, correcting the outlines for undoing\n  // the split, but possibly leaving some duplicate outlines.\n  void UnsplitOutlineList(TBLOB *blob) const;\n  // Removes the split that was put between these two points.\n  void UnsplitOutlines() const;\n\n  EDGEPT *point1;\n  EDGEPT *point2;\n};\n\n/*----------------------------------------------------------------------\n              V a r i a b l e s\n----------------------------------------------------------------------*/\n\nextern BOOL_VAR_H(wordrec_display_splits);\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\nEDGEPT *make_edgept(TDimension x, TDimension y, EDGEPT *next, EDGEPT *prev);\n\nvoid remove_edgept(EDGEPT *point);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/statistc.cpp",
    "content": "/**********************************************************************\n * File:        statistc.cpp  (Formerly stats.c)\n * Description: Simple statistical package for integer values.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"statistc.h\"\n\n#include \"errcode.h\"\n#include \"scrollview.h\"\n#include \"tprintf.h\"\n\n#include \"helpers.h\"\n\n#include <cmath>\n#include <cstdlib>\n#include <cstring>\n\nnamespace tesseract {\n\n/**********************************************************************\n * STATS::STATS\n *\n * Construct a new stats element by allocating and zeroing the memory.\n **********************************************************************/\nSTATS::STATS(int32_t min_bucket_value, int32_t max_bucket_value) {\n  if (max_bucket_value < min_bucket_value) {\n    min_bucket_value = 0;\n    max_bucket_value = 1;\n  }\n  rangemin_ = min_bucket_value; // setup\n  rangemax_ = max_bucket_value;\n  buckets_ = new int32_t[1 + rangemax_ - rangemin_];\n  clear();\n}\n\n/**********************************************************************\n * STATS::set_range\n *\n * Alter the range on an existing stats element.\n 
**********************************************************************/\nbool STATS::set_range(int32_t min_bucket_value, int32_t max_bucket_value) {\n  if (max_bucket_value < min_bucket_value) {\n    return false;\n  }\n  if (rangemax_ - rangemin_ != max_bucket_value - min_bucket_value) {\n    delete[] buckets_;\n    buckets_ = new int32_t[1 + max_bucket_value - min_bucket_value];\n  }\n  rangemin_ = min_bucket_value; // setup\n  rangemax_ = max_bucket_value;\n  clear(); // zero it\n  return true;\n}\n\n/**********************************************************************\n * STATS::clear\n *\n * Clear out the STATS class by zeroing all the buckets.\n **********************************************************************/\nvoid STATS::clear() { // clear out buckets\n  total_count_ = 0;\n  if (buckets_ != nullptr) {\n    memset(buckets_, 0, (1 + rangemax_ - rangemin_) * sizeof(buckets_[0]));\n  }\n}\n\n/**********************************************************************\n * STATS::~STATS\n *\n * Destructor for a stats class.\n **********************************************************************/\nSTATS::~STATS() {\n  delete[] buckets_;\n}\n\n/**********************************************************************\n * STATS::add\n *\n * Add a set of samples to (or delete from) a pile.\n **********************************************************************/\nvoid STATS::add(int32_t value, int32_t count) {\n  if (buckets_ != nullptr) {\n    value = ClipToRange(value, rangemin_, rangemax_);\n    buckets_[value - rangemin_] += count;\n    total_count_ += count; // keep count of total\n  }\n}\n\n/**********************************************************************\n * STATS::mode\n *\n * Find the mode of a stats class.\n **********************************************************************/\nint32_t STATS::mode() const { // get mode of samples\n  if (buckets_ == nullptr) {\n    return rangemin_;\n  }\n  int32_t max = buckets_[0]; // max cell count\n  int32_t 
maxindex = 0;      // index of max\n  for (int index = rangemax_ - rangemin_; index > 0; --index) {\n    if (buckets_[index] > max) {\n      max = buckets_[index]; // find biggest\n      maxindex = index;\n    }\n  }\n  return maxindex + rangemin_; // index of biggest\n}\n\n/**********************************************************************\n * STATS::mean\n *\n * Find the mean of a stats class.\n **********************************************************************/\ndouble STATS::mean() const { // get mean of samples\n  if (buckets_ == nullptr || total_count_ <= 0) {\n    return static_cast<double>(rangemin_);\n  }\n  int64_t sum = 0;\n  for (int index = rangemax_ - rangemin_; index >= 0; --index) {\n    sum += static_cast<int64_t>(index) * buckets_[index];\n  }\n  return static_cast<double>(sum) / total_count_ + rangemin_;\n}\n\n/**********************************************************************\n * STATS::sd\n *\n * Find the standard deviation of a stats class.\n **********************************************************************/\ndouble STATS::sd() const { // standard deviation\n  if (buckets_ == nullptr || total_count_ <= 0) {\n    return 0.0;\n  }\n  int64_t sum = 0;\n  double sqsum = 0.0;\n  for (int index = rangemax_ - rangemin_; index >= 0; --index) {\n    sum += static_cast<int64_t>(index) * buckets_[index];\n    sqsum += static_cast<double>(index) * index * buckets_[index];\n  }\n  double variance = static_cast<double>(sum) / total_count_;\n  variance = sqsum / total_count_ - variance * variance;\n  if (variance > 0.0) {\n    return sqrt(variance);\n  }\n  return 0.0;\n}\n\n/**********************************************************************\n * STATS::ile\n *\n * Returns the fractile value such that frac fraction (in [0,1]) of samples\n * has a value less than the return value.\n **********************************************************************/\ndouble STATS::ile(double frac) const {\n  if (buckets_ == nullptr || total_count_ == 
0) {\n    return static_cast<double>(rangemin_);\n  }\n#if 0\n  // TODO(rays) The existing code doesn't seem to be doing the right thing\n  // with target a double but this substitute crashes the code that uses it.\n  // Investigate and fix properly.\n  int target = IntCastRounded(frac * total_count_);\n  target = ClipToRange(target, 1, total_count_);\n#else\n  double target = frac * total_count_;\n  target = ClipToRange(target, 1.0, static_cast<double>(total_count_));\n#endif\n  int sum = 0;\n  int index = 0;\n  for (index = 0; index <= rangemax_ - rangemin_ && sum < target; sum += buckets_[index++]) {\n    ;\n  }\n  if (index > 0) {\n    ASSERT_HOST(buckets_[index - 1] > 0);\n    return rangemin_ + index - static_cast<double>(sum - target) / buckets_[index - 1];\n  } else {\n    return static_cast<double>(rangemin_);\n  }\n}\n\n/**********************************************************************\n * STATS::min_bucket\n *\n * Find REAL minimum bucket - ile(0.0) isn't necessarily correct\n **********************************************************************/\nint32_t STATS::min_bucket() const { // Find min\n  if (buckets_ == nullptr || total_count_ == 0) {\n    return rangemin_;\n  }\n  int32_t min = 0;\n  for (min = 0; (min <= rangemax_ - rangemin_) && (buckets_[min] == 0); min++) {\n    ;\n  }\n  return rangemin_ + min;\n}\n\n/**********************************************************************\n * STATS::max_bucket\n *\n * Find REAL maximum bucket - ile(1.0) isn't necessarily correct\n **********************************************************************/\n\nint32_t STATS::max_bucket() const { // Find max\n  if (buckets_ == nullptr || total_count_ == 0) {\n    return rangemin_;\n  }\n  int32_t max;\n  for (max = rangemax_ - rangemin_; max > 0 && buckets_[max] == 0; max--) {\n    ;\n  }\n  return rangemin_ + max;\n}\n\n/**********************************************************************\n * STATS::median\n *\n * Finds a more useful estimate of median 
than ile(0.5).\n *\n * Overcomes a problem with ile() - if the samples are, for example,\n * 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway\n * between 6 and 13 = 9.5\n **********************************************************************/\ndouble STATS::median() const { // get median\n  if (buckets_ == nullptr) {\n    return static_cast<double>(rangemin_);\n  }\n  double median = ile(0.5);\n  int median_pile = static_cast<int>(floor(median));\n  if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {\n    int32_t min_pile;\n    int32_t max_pile;\n    /* Find preceding non zero pile */\n    for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--) {\n      ;\n    }\n    /* Find following non zero pile */\n    for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++) {\n      ;\n    }\n    median = (min_pile + max_pile) / 2.0;\n  }\n  return median;\n}\n\n/**********************************************************************\n * STATS::local_min\n *\n * Return true if this point is a local min.\n **********************************************************************/\nbool STATS::local_min(int32_t x) const {\n  if (buckets_ == nullptr) {\n    return false;\n  }\n  x = ClipToRange(x, rangemin_, rangemax_) - rangemin_;\n  if (buckets_[x] == 0) {\n    return true;\n  }\n  int32_t index; // table index\n  for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index) {\n    ;\n  }\n  if (index >= 0 && buckets_[index] < buckets_[x]) {\n    return false;\n  }\n  for (index = x + 1; index <= rangemax_ - rangemin_ && buckets_[index] == buckets_[x]; ++index) {\n    ;\n  }\n  if (index <= rangemax_ - rangemin_ && buckets_[index] < buckets_[x]) {\n    return false;\n  } else {\n    return true;\n  }\n}\n\n/**********************************************************************\n * STATS::smooth\n *\n * Apply a triangular smoothing filter to the stats.\n * This makes the modes a bit more useful.\n * The factor gives 
the height of the triangle, i.e. the weight of the\n * centre.\n **********************************************************************/\nvoid STATS::smooth(int32_t factor) {\n  if (buckets_ == nullptr || factor < 2) {\n    return;\n  }\n  STATS result(rangemin_, rangemax_);\n  int entrycount = 1 + rangemax_ - rangemin_;\n  for (int entry = 0; entry < entrycount; entry++) {\n    // centre weight\n    int count = buckets_[entry] * factor;\n    for (int offset = 1; offset < factor; offset++) {\n      if (entry - offset >= 0) {\n        count += buckets_[entry - offset] * (factor - offset);\n      }\n      if (entry + offset < entrycount) {\n        count += buckets_[entry + offset] * (factor - offset);\n      }\n    }\n    result.add(entry + rangemin_, count);\n  }\n  total_count_ = result.total_count_;\n  memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));\n}\n\n/**********************************************************************\n * STATS::cluster\n *\n * Cluster the samples into max_cluster clusters.\n * Each call runs one iteration. 
The array of clusters must be\n * max_clusters+1 in size as cluster 0 is used to indicate which samples\n * have been used.\n * The return value is the current number of clusters.\n **********************************************************************/\n\nint32_t STATS::cluster(float lower, // thresholds\n                       float upper,\n                       float multiple,       // distance threshold\n                       int32_t max_clusters, // max no to make\n                       STATS *clusters) {    // array of clusters\n  bool new_cluster;                          // added one\n  float *centres;                            // cluster centres\n  int32_t entry;                             // bucket index\n  int32_t cluster;                           // cluster index\n  int32_t best_cluster;                      // one to assign to\n  int32_t new_centre = 0;                    // residual mode\n  int32_t new_mode;                          // pile count of new_centre\n  int32_t count;                             // pile to place\n  float dist;                                // from cluster\n  float min_dist;                            // from best_cluster\n  int32_t cluster_count;                     // no of clusters\n\n  if (buckets_ == nullptr || max_clusters < 1) {\n    return 0;\n  }\n  centres = new float[max_clusters + 1];\n  for (cluster_count = 1;\n       cluster_count <= max_clusters && clusters[cluster_count].buckets_ != nullptr &&\n       clusters[cluster_count].total_count_ > 0;\n       cluster_count++) {\n    centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));\n    new_centre = clusters[cluster_count].mode();\n    for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&\n                                 pile_count(entry) <= pile_count(entry + 1);\n         entry--) {\n      count = pile_count(entry) - clusters[0].pile_count(entry);\n      if (count > 0) {\n        
clusters[cluster_count].add(entry, count);\n        clusters[0].add(entry, count);\n      }\n    }\n    for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry <= rangemax_ &&\n                                 pile_count(entry) <= pile_count(entry - 1);\n         entry++) {\n      count = pile_count(entry) - clusters[0].pile_count(entry);\n      if (count > 0) {\n        clusters[cluster_count].add(entry, count);\n        clusters[0].add(entry, count);\n      }\n    }\n  }\n  cluster_count--;\n\n  if (cluster_count == 0) {\n    clusters[0].set_range(rangemin_, rangemax_);\n  }\n  do {\n    new_cluster = false;\n    new_mode = 0;\n    for (entry = 0; entry <= rangemax_ - rangemin_; entry++) {\n      count = buckets_[entry] - clusters[0].buckets_[entry];\n      // remaining pile\n      if (count > 0) { // any to handle\n        min_dist = static_cast<float>(INT32_MAX);\n        best_cluster = 0;\n        for (cluster = 1; cluster <= cluster_count; cluster++) {\n          dist = entry + rangemin_ - centres[cluster];\n          // find distance\n          if (dist < 0) {\n            dist = -dist;\n          }\n          if (dist < min_dist) {\n            min_dist = dist; // find least\n            best_cluster = cluster;\n          }\n        }\n        if (min_dist > upper // far enough for new\n            && (best_cluster == 0 || entry + rangemin_ > centres[best_cluster] * multiple ||\n                entry + rangemin_ < centres[best_cluster] / multiple)) {\n          if (count > new_mode) {\n            new_mode = count;\n            new_centre = entry + rangemin_;\n          }\n        }\n      }\n    }\n    // need new and room\n    if (new_mode > 0 && cluster_count < max_clusters) {\n      cluster_count++;\n      new_cluster = true;\n      if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {\n        delete[] centres;\n        return 0;\n      }\n      centres[cluster_count] = static_cast<float>(new_centre);\n      
clusters[cluster_count].add(new_centre, new_mode);\n      clusters[0].add(new_centre, new_mode);\n      for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&\n                                   pile_count(entry) <= pile_count(entry + 1);\n           entry--) {\n        count = pile_count(entry) - clusters[0].pile_count(entry);\n        if (count > 0) {\n          clusters[cluster_count].add(entry, count);\n          clusters[0].add(entry, count);\n        }\n      }\n      for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry <= rangemax_ &&\n                                   pile_count(entry) <= pile_count(entry - 1);\n           entry++) {\n        count = pile_count(entry) - clusters[0].pile_count(entry);\n        if (count > 0) {\n          clusters[cluster_count].add(entry, count);\n          clusters[0].add(entry, count);\n        }\n      }\n      centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));\n    }\n  } while (new_cluster && cluster_count < max_clusters);\n  delete[] centres;\n  return cluster_count;\n}\n\n// Helper tests that the current index is still part of the peak and gathers\n// the data into the peak, returning false when the peak is ended.\n// src_buckets[index] - used_buckets[index] is the unused part of the histogram.\n// prev_count is the histogram count of the previous index on entry and is\n// updated to the current index on return.\n// total_count and total_value are accumulating the mean of the peak.\nstatic bool GatherPeak(int index, const int *src_buckets, int *used_buckets, int *prev_count,\n                       int *total_count, double *total_value) {\n  int pile_count = src_buckets[index] - used_buckets[index];\n  if (pile_count <= *prev_count && pile_count > 0) {\n    // Accumulate count and index.count product.\n    *total_count += pile_count;\n    *total_value += index * pile_count;\n    // Mark this index as used\n    
used_buckets[index] = src_buckets[index];\n    *prev_count = pile_count;\n    return true;\n  } else {\n    return false;\n  }\n}\n\n// Finds (at most) the top max_modes modes, well actually the whole peak around\n// each mode, returning them in the given modes vector as a <mean of peak,\n// total count of peak> pair in order of decreasing total count.\n// Since the mean is the key and the count the data in the pair, a single call\n// to sort on the output will re-sort by increasing mean of peak if that is\n// more useful than decreasing total count.\n// Returns the actual number of modes found.\nint STATS::top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const {\n  if (max_modes <= 0) {\n    return 0;\n  }\n  int src_count = 1 + rangemax_ - rangemin_;\n  // Used copies the counts in buckets_ as they get used.\n  STATS used(rangemin_, rangemax_);\n  modes.clear();\n  // Total count of the smallest peak found so far.\n  int least_count = 1;\n  // Mode that is used as a seed for each peak\n  int max_count = 0;\n  do {\n    // Find an unused mode.\n    max_count = 0;\n    int max_index = 0;\n    for (int src_index = 0; src_index < src_count; src_index++) {\n      int pile_count = buckets_[src_index] - used.buckets_[src_index];\n      if (pile_count > max_count) {\n        max_count = pile_count;\n        max_index = src_index;\n      }\n    }\n    if (max_count > 0) {\n      // Copy the bucket count to used so it doesn't get found again.\n      used.buckets_[max_index] = max_count;\n      // Get the entire peak.\n      double total_value = max_index * max_count;\n      int total_count = max_count;\n      int prev_pile = max_count;\n      for (int offset = 1; max_index + offset < src_count; ++offset) {\n        if (!GatherPeak(max_index + offset, buckets_, used.buckets_, &prev_pile, &total_count,\n                        &total_value)) {\n          break;\n        }\n      }\n      prev_pile = buckets_[max_index];\n      for (int offset = 1; 
max_index - offset >= 0; ++offset) {\n        if (!GatherPeak(max_index - offset, buckets_, used.buckets_, &prev_pile, &total_count,\n                        &total_value)) {\n          break;\n        }\n      }\n      if (total_count > least_count || modes.size() < static_cast<size_t>(max_modes)) {\n        // We definitely want this mode, so if we have enough discard the least.\n        if (modes.size() == static_cast<size_t>(max_modes)) {\n          modes.resize(max_modes - 1);\n        }\n        size_t target_index = 0;\n        // Linear search for the target insertion point.\n        while (target_index < modes.size() && modes[target_index].data() >= total_count) {\n          ++target_index;\n        }\n        auto peak_mean = static_cast<float>(total_value / total_count + rangemin_);\n        modes.insert(modes.begin() + target_index, KDPairInc<float, int>(peak_mean, total_count));\n        least_count = modes.back().data();\n      }\n    }\n  } while (max_count > 0);\n  return modes.size();\n}\n\n/**********************************************************************\n * STATS::print\n *\n * Prints a summary and table of the histogram.\n **********************************************************************/\nvoid STATS::print() const {\n  if (buckets_ == nullptr) {\n    return;\n  }\n  int32_t min = min_bucket() - rangemin_;\n  int32_t max = max_bucket() - rangemin_;\n\n  int num_printed = 0;\n  for (int index = min; index <= max; index++) {\n    if (buckets_[index] != 0) {\n      tprintf(\"%4d:%-3d \", rangemin_ + index, buckets_[index]);\n      if (++num_printed % 8 == 0) {\n        tprintf(\"\\n\");\n      }\n    }\n  }\n  tprintf(\"\\n\");\n  print_summary();\n}\n\n/**********************************************************************\n * STATS::print_summary\n *\n * Print a summary of the stats.\n **********************************************************************/\nvoid STATS::print_summary() const {\n  if (buckets_ == nullptr) {\n    
return;\n  }\n  int32_t min = min_bucket();\n  int32_t max = max_bucket();\n  tprintf(\"Total count=%d\\n\", total_count_);\n  tprintf(\"Min=%.2f Really=%d\\n\", ile(0.0), min);\n  tprintf(\"Lower quartile=%.2f\\n\", ile(0.25));\n  tprintf(\"Median=%.2f, ile(0.5)=%.2f\\n\", median(), ile(0.5));\n  tprintf(\"Upper quartile=%.2f\\n\", ile(0.75));\n  tprintf(\"Max=%.2f Really=%d\\n\", ile(1.0), max);\n  tprintf(\"Range=%d\\n\", max + 1 - min);\n  tprintf(\"Mean= %.2f\\n\", mean());\n  tprintf(\"SD= %.2f\\n\", sd());\n}\n\n/**********************************************************************\n * STATS::plot\n *\n * Draw a histogram of the stats table.\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid STATS::plot(ScrollView *window, // to draw in\n                 float xorigin,      // bottom left\n                 float yorigin,\n                 float xscale,                     // one x unit\n                 float yscale,                     // one y unit\n                 ScrollView::Color colour) const { // colour to draw in\n  if (buckets_ == nullptr) {\n    return;\n  }\n  window->Pen(colour);\n\n  for (int index = 0; index <= rangemax_ - rangemin_; index++) {\n    window->Rectangle(xorigin + xscale * index, yorigin, xorigin + xscale * (index + 1),\n                      yorigin + yscale * buckets_[index]);\n  }\n}\n#endif\n\n/**********************************************************************\n * STATS::plotline\n *\n * Draw a histogram of the stats table. 
(Line only)\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid STATS::plotline(ScrollView *window, // to draw in\n                     float xorigin,      // bottom left\n                     float yorigin,\n                     float xscale,                     // one x unit\n                     float yscale,                     // one y unit\n                     ScrollView::Color colour) const { // colour to draw in\n  if (buckets_ == nullptr) {\n    return;\n  }\n  window->Pen(colour);\n  window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);\n  for (int index = 0; index <= rangemax_ - rangemin_; index++) {\n    window->DrawTo(xorigin + xscale * index, yorigin + yscale * buckets_[index]);\n  }\n}\n#endif\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/statistc.h",
    "content": "/**********************************************************************\n * File:        statistc.h  (Formerly stats.h)\n * Description: Class description for STATS class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCSTRUCT_STATISTC_H_\n#define TESSERACT_CCSTRUCT_STATISTC_H_\n\n#include <cstdio>\n#include \"kdpair.h\"\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n// Simple histogram-based statistics for integer values in a known\n// range, such that the range is small compared to the number of samples.\nclass TESS_API STATS {\npublic:\n  // The histogram buckets are in the range\n  // [min_bucket_value, max_bucket_value].\n  // Any data under min_bucket value is silently mapped to min_bucket_value,\n  // and likewise, any data over max_bucket_value is silently mapped to\n  // max_bucket_value.\n  // In the internal array, min_bucket_value maps to 0 and\n  // 1 + max_bucket_value - min_bucket_value to the array size.\n  STATS(int32_t min_bucket_value, int32_t max_bucket_value);\n  STATS() = default; // empty for arrays\n\n  ~STATS();\n\n  // (Re)Sets the range and clears the counts.\n  // See the constructor for info on max and min values.\n  bool set_range(int32_t min_bucket_value, int32_t max_bucket_value);\n\n  void clear(); // empty buckets\n\n  void 
add(int32_t value, int32_t count);\n\n  // \"Accessors\" return various statistics on the data.\n  int32_t mode() const; // get mode of samples\n  double mean() const;  // get mean of samples\n  double sd() const;    // standard deviation\n  // Returns the fractile value such that frac fraction (in [0,1]) of samples\n  // has a value less than the return value.\n  double ile(double frac) const;\n  // Returns the minimum used entry in the histogram (ie the minimum of the\n  // data, NOT the minimum of the supplied range, nor is it an index.)\n  // Would normally be called min(), but that is a reserved word in VC++.\n  int32_t min_bucket() const; // Find min\n  // Returns the maximum used entry in the histogram (ie the maximum of the\n  // data, NOT the maximum of the supplied range, nor is it an index.)\n  int32_t max_bucket() const; // Find max\n  // Finds a more useful estimate of median than ile(0.5).\n  // Overcomes a problem with ile() - if the samples are, for example,\n  // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway\n  // between 6 and 13 = 9.5\n  double median() const; // get median of samples\n  // Returns the count of the given value.\n  int32_t pile_count(int32_t value) const {\n    if (buckets_ == nullptr) {\n      return 0;\n    }\n    if (value <= rangemin_) {\n      return buckets_[0];\n    }\n    if (value >= rangemax_) {\n      return buckets_[rangemax_ - rangemin_];\n    }\n    return buckets_[value - rangemin_];\n  }\n  // Returns the total count of all buckets.\n  int32_t get_total() const {\n    return total_count_; // total of all piles\n  }\n  // Returns true if x is a local min.\n  bool local_min(int32_t x) const;\n\n  // Apply a triangular smoothing filter to the stats.\n  // This makes the modes a bit more useful.\n  // The factor gives the height of the triangle, i.e. 
the weight of the\n  // centre.\n  void smooth(int32_t factor);\n\n  // Cluster the samples into max_cluster clusters.\n  // Each call runs one iteration. The array of clusters must be\n  // max_clusters+1 in size as cluster 0 is used to indicate which samples\n  // have been used.\n  // The return value is the current number of clusters.\n  int32_t cluster(float lower, // thresholds\n                  float upper,\n                  float multiple,       // distance threshold\n                  int32_t max_clusters, // max no to make\n                  STATS *clusters);     // array of clusters\n\n  // Finds (at most) the top max_modes modes, well actually the whole peak\n  // around each mode, returning them in the given modes vector as a <mean of\n  // peak, total count of peak> pair in order of decreasing total count. Since\n  // the mean is the key and the count the data in the pair, a single call to\n  // sort on the output will re-sort by increasing mean of peak if that is more\n  // useful than decreasing total count. 
Returns the actual number of modes\n  // found.\n  int top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const;\n\n  // Prints a summary and table of the histogram.\n  void print() const;\n  // Prints summary stats only of the histogram.\n  void print_summary() const;\n\n#ifndef GRAPHICS_DISABLED\n  // Draws the histogram as a series of rectangles.\n  void plot(ScrollView *window,              // window to draw in\n            float xorigin,                   // origin of histo\n            float yorigin,                   // gram\n            float xscale,                    // size of one unit\n            float yscale,                    // size of one uint\n            ScrollView::Color colour) const; // colour to draw in\n\n  // Draws a line graph of the histogram.\n  void plotline(ScrollView *window,              // window to draw in\n                float xorigin,                   // origin of histo\n                float yorigin,                   // gram\n                float xscale,                    // size of one unit\n                float yscale,                    // size of one uint\n                ScrollView::Color colour) const; // colour to draw in\n#endif                                           // !GRAPHICS_DISABLED\n\nprivate:\n  int32_t rangemin_ = 0; // min of range\n  int32_t rangemax_ = 0;       // max of range\n  int32_t total_count_ = 0;    // no of samples\n  int32_t *buckets_ = nullptr; // array of cells\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCSTRUCT_STATISTC_H_\n"
  },
  {
    "path": "src/ccstruct/stepblob.cpp",
    "content": "/**********************************************************************\n * File:        stepblob.cpp  (Formerly cblob.c)\n * Description: Code for C_BLOB class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"stepblob.h\"\n\n#include \"points.h\" // for operator+=, FCOORD, ICOORD\n\n#include <allheaders.h> // for pixCreate, pixGetDepth\n#include <vector>       // for std::vector\n\nnamespace tesseract {\n\nclass DENORM;\n\n// Max perimeter to width ratio for a baseline position above box bottom.\nconst double kMaxPerimeterWidthRatio = 8.0;\n\n/**********************************************************************\n * position_outline\n *\n * Position the outline in the given list at the relevant place\n * according to its nesting.\n **********************************************************************/\nstatic void position_outline( // put in place\n    C_OUTLINE *outline,       // thing to place\n    C_OUTLINE_LIST *destlist  // destination list\n) {\n  C_OUTLINE_IT it = destlist; // iterator\n                              // iterator on children\n  C_OUTLINE_IT child_it = outline->child();\n\n  if (!it.empty()) {\n    do {\n      // 
outline from dest list\n      C_OUTLINE *dest_outline = it.data(); // get destination\n                                // encloses dest\n      if (*dest_outline < *outline) {\n        // take off list\n        dest_outline = it.extract();\n        // put this in place\n        it.add_after_then_move(outline);\n        // make it a child\n        child_it.add_to_end(dest_outline);\n        while (!it.at_last()) {\n          it.forward(); // do rest of list\n                        // check for other children\n          dest_outline = it.data();\n          if (*dest_outline < *outline) {\n            // take off list\n            dest_outline = it.extract();\n            child_it.add_to_end(dest_outline);\n            // make it a child\n            if (it.empty()) {\n              break;\n            }\n          }\n        }\n        return; // finished\n      }\n      // enclosed by dest\n      else if (*outline < *dest_outline) {\n        position_outline(outline, dest_outline->child());\n        // place in child list\n        return; // finished\n      }\n      it.forward();\n    } while (!it.at_first());\n  }\n  it.add_to_end(outline); // at outer level\n}\n\n/**********************************************************************\n * plot_outline_list\n *\n * Draw a list of outlines in the given colour and their children\n * in the child colour.\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nstatic void plot_outline_list(     // draw outlines\n    C_OUTLINE_LIST *list,          // outline to draw\n    ScrollView *window,            // window to draw in\n    ScrollView::Color colour,      // colour to use\n    ScrollView::Color child_colour // colour of children\n) {\n  C_OUTLINE *outline;     // current outline\n  C_OUTLINE_IT it = list; // iterator\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    outline = it.data();\n    // draw it\n    outline->plot(window, colour);\n    if 
(!outline->child()->empty()) {\n      plot_outline_list(outline->child(), window, child_colour, child_colour);\n    }\n  }\n}\n// Draws the outlines in the given colour, and child_colour, normalized\n// using the given denorm, making use of sub-pixel accurate information\n// if available.\nstatic void plot_normed_outline_list(const DENORM &denorm, C_OUTLINE_LIST *list,\n                                     ScrollView::Color colour, ScrollView::Color child_colour,\n                                     ScrollView *window) {\n  C_OUTLINE_IT it(list);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    outline->plot_normed(denorm, colour, window);\n    if (!outline->child()->empty()) {\n      plot_normed_outline_list(denorm, outline->child(), child_colour, child_colour, window);\n    }\n  }\n}\n#endif\n\n/**********************************************************************\n * reverse_outline_list\n *\n * Reverse a list of outlines and their children.\n **********************************************************************/\n\nstatic void reverse_outline_list(C_OUTLINE_LIST *list) {\n  C_OUTLINE_IT it = list; // iterator\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    outline->reverse(); // reverse it\n    outline->set_flag(COUT_INVERSE, true);\n    if (!outline->child()->empty()) {\n      reverse_outline_list(outline->child());\n    }\n  }\n}\n\n/**********************************************************************\n * C_BLOB::C_BLOB\n *\n * Constructor to build a C_BLOB from a list of C_OUTLINEs.\n * The C_OUTLINEs are not copied so the source list is emptied.\n * The C_OUTLINEs are nested correctly in the blob.\n **********************************************************************/\n\nC_BLOB::C_BLOB(C_OUTLINE_LIST *outline_list) {\n  for (C_OUTLINE_IT ol_it(outline_list); !ol_it.empty(); ol_it.forward()) {\n    C_OUTLINE *outline = ol_it.extract();\n  
  // Position this outline in appropriate position in the hierarchy.\n    position_outline(outline, &outlines);\n  }\n  CheckInverseFlagAndDirection();\n}\n\n// Simpler constructor to build a blob from a single outline that has\n// already been fully initialized.\nC_BLOB::C_BLOB(C_OUTLINE *outline) {\n  C_OUTLINE_IT it(&outlines);\n  it.add_to_end(outline);\n}\n\n// Builds a set of one or more blobs from a list of outlines.\n// Input: one outline on outline_list contains all the others, but the\n// nesting and order are undefined.\n// If good_blob is true, the blob is added to good_blobs_it, unless\n// an illegal (generation-skipping) parent-child relationship is found.\n// If so, the parent blob goes to bad_blobs_it, and the immediate children\n// are promoted to the top level, recursively being sent to good_blobs_it.\n// If good_blob is false, all created blobs will go to the bad_blobs_it.\n// Output: outline_list is empty. One or more blobs are added to\n// good_blobs_it and/or bad_blobs_it.\nvoid C_BLOB::ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list,\n                                        C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it) {\n  // List of top-level outlines with correctly nested children.\n  C_OUTLINE_LIST nested_outlines;\n  for (C_OUTLINE_IT ol_it(outline_list); !ol_it.empty(); ol_it.forward()) {\n    C_OUTLINE *outline = ol_it.extract();\n    // Position this outline in appropriate position in the hierarchy.\n    position_outline(outline, &nested_outlines);\n  }\n  // Check for legal nesting and reassign as required.\n  for (C_OUTLINE_IT ol_it(&nested_outlines); !ol_it.empty(); ol_it.forward()) {\n    C_OUTLINE *outline = ol_it.extract();\n    bool blob_is_good = good_blob;\n    if (!outline->IsLegallyNested()) {\n      // The blob is illegally nested.\n      // Mark it bad, and add all its children to the top-level list.\n      blob_is_good = false;\n      ol_it.add_list_after(outline->child());\n    }\n    auto 
*blob = new C_BLOB(outline);\n    // Set inverse flag and reverse if needed.\n    blob->CheckInverseFlagAndDirection();\n    // Put on appropriate list.\n    if (!blob_is_good && bad_blobs_it != nullptr) {\n      bad_blobs_it->add_after_then_move(blob);\n    } else {\n      good_blobs_it->add_after_then_move(blob);\n    }\n  }\n}\n\n// Sets the COUT_INVERSE flag appropriately on the outlines and their\n// children recursively, reversing the outlines if needed so that\n// everything has an anticlockwise top-level.\nvoid C_BLOB::CheckInverseFlagAndDirection() {\n  C_OUTLINE_IT ol_it(&outlines);\n  for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {\n    C_OUTLINE *outline = ol_it.data();\n    if (outline->turn_direction() < 0) {\n      outline->reverse();\n      reverse_outline_list(outline->child());\n      outline->set_flag(COUT_INVERSE, true);\n    } else {\n      outline->set_flag(COUT_INVERSE, false);\n    }\n  }\n}\n\n// Build and return a fake blob containing a single fake outline with no\n// steps.\nC_BLOB *C_BLOB::FakeBlob(const TBOX &box) {\n  C_OUTLINE_LIST outlines;\n  C_OUTLINE::FakeOutline(box, &outlines);\n  return new C_BLOB(&outlines);\n}\n\n/**********************************************************************\n * C_BLOB::bounding_box\n *\n * Return the bounding box of the blob.\n **********************************************************************/\n\nTBOX C_BLOB::bounding_box() const { // bounding box\n  // This is a read-only iteration of the outlines.\n  C_OUTLINE_IT it = const_cast<C_OUTLINE_LIST *>(&outlines);\n  TBOX box; // bounding box\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    box += outline->bounding_box();\n  }\n  return box;\n}\n\n/**********************************************************************\n * C_BLOB::area\n *\n * Return the area of the blob.\n **********************************************************************/\n\nint32_t C_BLOB::area() { 
      // area\n  C_OUTLINE_IT it = &outlines; // outlines of blob\n  int32_t total = 0;           // total area\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    total += outline->area();\n  }\n  return total;\n}\n\n/**********************************************************************\n * C_BLOB::perimeter\n *\n * Return the perimeter of the top and 2nd level outlines.\n **********************************************************************/\n\nint32_t C_BLOB::perimeter() {\n  C_OUTLINE_IT it = &outlines; // outlines of blob\n  int32_t total = 0;           // total perimeter\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    total += outline->perimeter();\n  }\n  return total;\n}\n\n/**********************************************************************\n * C_BLOB::outer_area\n *\n * Return the area of the blob.\n **********************************************************************/\n\nint32_t C_BLOB::outer_area() { // area\n  C_OUTLINE_IT it = &outlines; // outlines of blob\n  int32_t total = 0;           // total area\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    total += outline->outer_area();\n  }\n  return total;\n}\n\n/**********************************************************************\n * C_BLOB::count_transitions\n *\n * Return the total x and y maxes and mins in the blob.\n * Child outlines are not counted.\n **********************************************************************/\n\nint32_t C_BLOB::count_transitions( // area\n    int32_t threshold              // on size\n) {\n  C_OUTLINE_IT it = &outlines; // outlines of blob\n  int32_t total = 0;           // total area\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    total += outline->count_transitions(threshold);\n  }\n  return 
total;\n}\n\n/**********************************************************************\n * C_BLOB::move\n *\n * Move C_BLOB by vector\n **********************************************************************/\n\nvoid C_BLOB::move(   // reposition blob\n    const ICOORD vec // by vector\n) {\n  C_OUTLINE_IT it(&outlines); // iterator\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->move(vec); // move each outline\n  }\n}\n\n// Static helper for C_BLOB::rotate to allow recursion of child outlines.\nstatic void RotateOutlineList(const FCOORD &rotation, C_OUTLINE_LIST *outlines) {\n  C_OUTLINE_LIST new_outlines;\n  C_OUTLINE_IT src_it(outlines);\n  C_OUTLINE_IT dest_it(&new_outlines);\n  while (!src_it.empty()) {\n    C_OUTLINE *old_outline = src_it.extract();\n    src_it.forward();\n    auto *new_outline = new C_OUTLINE(old_outline, rotation);\n    if (!old_outline->child()->empty()) {\n      RotateOutlineList(rotation, old_outline->child());\n      C_OUTLINE_IT child_it(new_outline->child());\n      child_it.add_list_after(old_outline->child());\n    }\n    delete old_outline;\n    dest_it.add_to_end(new_outline);\n  }\n  src_it.add_list_after(&new_outlines);\n}\n\n/**********************************************************************\n * C_BLOB::rotate\n *\n * Rotate C_BLOB by rotation.\n * Warning! 
has to rebuild all the C_OUTLINEs.\n **********************************************************************/\nvoid C_BLOB::rotate(const FCOORD &rotation) {\n  RotateOutlineList(rotation, &outlines);\n}\n\n// Helper calls ComputeEdgeOffsets or ComputeBinaryOffsets recursively on the\n// outline list and its children.\nstatic void ComputeEdgeOffsetsOutlineList(int threshold, Image pix, C_OUTLINE_LIST *list) {\n  C_OUTLINE_IT it(list);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    if (pix != nullptr && pixGetDepth(pix) == 8) {\n      outline->ComputeEdgeOffsets(threshold, pix);\n    } else {\n      outline->ComputeBinaryOffsets();\n    }\n    if (!outline->child()->empty()) {\n      ComputeEdgeOffsetsOutlineList(threshold, pix, outline->child());\n    }\n  }\n}\n\n// Adds sub-pixel resolution EdgeOffsets for the outlines using greyscale\n// if the supplied pix is 8-bit or the binary edges if nullptr.\nvoid C_BLOB::ComputeEdgeOffsets(int threshold, Image pix) {\n  ComputeEdgeOffsetsOutlineList(threshold, pix, &outlines);\n}\n\n// Estimates and returns the baseline position based on the shape of the\n// outlines.\n// We first find the minimum y-coord (y_mins) at each x-coord within the blob.\n// If there is a run of some y or y+1 in y_mins that is longer than the total\n// number of positions at bottom or bottom+1, subject to the additional\n// condition that at least one side of the y/y+1 run is higher than y+1, so it\n// is not a local minimum, then y, not the bottom, makes a good candidate\n// baseline position for this blob. 
Eg\n//   |                  ---|\n//   |                  |\n//   |-      -----------|        <=  Good candidate baseline position.\n//    |-    -|\n//     |   -|\n//     |---|                     <=  Bottom of blob\nint16_t C_BLOB::EstimateBaselinePosition() {\n  TBOX box = bounding_box();\n  int left = box.left();\n  int width = box.width();\n  int bottom = box.bottom();\n  if (outlines.empty() || perimeter() > width * kMaxPerimeterWidthRatio) {\n    return bottom; // This is only for non-CJK blobs.\n  }\n  // Get the minimum y coordinate at each x-coordinate.\n  std::vector<int> y_mins(width + 1, box.top());\n  C_OUTLINE_IT it(&outlines);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    ICOORD pos = outline->start_pos();\n    for (int s = 0; s < outline->pathlength(); ++s) {\n      if (pos.y() < y_mins[pos.x() - left]) {\n        y_mins[pos.x() - left] = pos.y();\n      }\n      pos += outline->step(s);\n    }\n  }\n  // Find the total extent of the bottom or bottom + 1.\n  int bottom_extent = 0;\n  for (int x = 0; x <= width; ++x) {\n    if (y_mins[x] == bottom || y_mins[x] == bottom + 1) {\n      ++bottom_extent;\n    }\n  }\n  // Find the lowest run longer than the bottom extent that is not the bottom.\n  int best_min = box.top();\n  int prev_run = 0;\n  int prev_y = box.top();\n  int prev_prev_y = box.top();\n  for (int x = 0; x < width; x += prev_run) {\n    // Find the length of the current run.\n    int y_at_x = y_mins[x];\n    int run = 1;\n    while (x + run <= width && y_mins[x + run] == y_at_x) {\n      ++run;\n    }\n    if (y_at_x > bottom + 1) {\n      // Possible contender.\n      int total_run = run;\n      // Find extent of current value or +1 to the right of x.\n      while (x + total_run <= width &&\n             (y_mins[x + total_run] == y_at_x || y_mins[x + total_run] == y_at_x + 1)) {\n        ++total_run;\n      }\n      // At least one end has to be higher so it is not a local max.\n 
     if (prev_prev_y > y_at_x + 1 || x + total_run > width || y_mins[x + total_run] > y_at_x + 1) {\n        // If the prev_run is at y + 1, then we can add that too. There cannot\n        // be a suitable run at y before that or we would have found it already.\n        if (prev_run > 0 && prev_y == y_at_x + 1) {\n          total_run += prev_run;\n        }\n        if (total_run > bottom_extent && y_at_x < best_min) {\n          best_min = y_at_x;\n        }\n      }\n    }\n    prev_run = run;\n    prev_prev_y = prev_y;\n    prev_y = y_at_x;\n  }\n  return best_min == box.top() ? bottom : best_min;\n}\n\nstatic void render_outline_list(C_OUTLINE_LIST *list, int left, int top, Image pix) {\n  C_OUTLINE_IT it(list);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    outline->render(left, top, pix);\n    if (!outline->child()->empty()) {\n      render_outline_list(outline->child(), left, top, pix);\n    }\n  }\n}\n\nstatic void render_outline_list_outline(C_OUTLINE_LIST *list, int left, int top, Image pix) {\n  C_OUTLINE_IT it(list);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    C_OUTLINE *outline = it.data();\n    outline->render_outline(left, top, pix);\n  }\n}\n\n// Returns a Pix rendering of the blob. pixDestroy after use.\nImage C_BLOB::render() {\n  TBOX box = bounding_box();\n  Image pix = pixCreate(box.width(), box.height(), 1);\n  render_outline_list(&outlines, box.left(), box.top(), pix);\n  return pix;\n}\n\n// Returns a Pix rendering of the outline of the blob. 
(no fill).\n// pixDestroy after use.\nImage C_BLOB::render_outline() {\n  TBOX box = bounding_box();\n  Image pix = pixCreate(box.width(), box.height(), 1);\n  render_outline_list_outline(&outlines, box.left(), box.top(), pix);\n  return pix;\n}\n\n/**********************************************************************\n * C_BLOB::plot\n *\n * Draw the C_BLOB in the given colour.\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid C_BLOB::plot(ScrollView *window,               // window to draw in\n                  ScrollView::Color blob_colour,    // main colour\n                  ScrollView::Color child_colour) { // for holes\n  plot_outline_list(&outlines, window, blob_colour, child_colour);\n}\n// Draws the blob in the given colour, and child_colour, normalized\n// using the given denorm, making use of sub-pixel accurate information\n// if available.\nvoid C_BLOB::plot_normed(const DENORM &denorm, ScrollView::Color blob_colour,\n                         ScrollView::Color child_colour, ScrollView *window) {\n  plot_normed_outline_list(denorm, &outlines, blob_colour, child_colour, window);\n}\n#endif\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/stepblob.h",
    "content": "/**********************************************************************\n * File:        stepblob.h  (Formerly cblob.h)\n * Description: Code for C_BLOB class.\n * Author:      Ray Smith\n * Created:     Tue Oct 08 10:41:13 BST 1991\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef STEPBLOB_H\n#define STEPBLOB_H\n\n#include \"coutln.h\"     // for C_OUTLINE_LIST, C_OUTLINE\n#include \"elst.h\"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK\n#include \"points.h\"     // for FCOORD, ICOORD (ptr only)\n#include \"rect.h\"       // for TBOX\n#include \"scrollview.h\" // for ScrollView, ScrollView::Color\n\n#include <cstdint> // for int32_t, int16_t\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass C_BLOB;\nclass DENORM;\n\nELISTIZEH(C_BLOB)\nclass TESS_API C_BLOB : public ELIST<C_BLOB>::LINK {\npublic:\n  C_BLOB() = default;\n  explicit C_BLOB(C_OUTLINE_LIST *outline_list);\n  // Simpler constructor to build a blob from a single outline that has\n  // already been fully initialized.\n  explicit C_BLOB(C_OUTLINE *outline);\n\n  // Builds a set of one or more blobs from a list of outlines.\n  // Input: one outline on outline_list contains all the others, but the\n  // nesting and order are undefined.\n  // If good_blob is true, the blob is added to good_blobs_it, unless\n  // an illegal (generation-skipping) 
parent-child relationship is found.\n  // If so, the parent blob goes to bad_blobs_it, and the immediate children\n  // are promoted to the top level, recursively being sent to good_blobs_it.\n  // If good_blob is false, all created blobs will go to the bad_blobs_it.\n  // Output: outline_list is empty. One or more blobs are added to\n  // good_blobs_it and/or bad_blobs_it.\n  static void ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list,\n                                         C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it);\n\n  // Sets the COUT_INVERSE flag appropriately on the outlines and their\n  // children recursively, reversing the outlines if needed so that\n  // everything has an anticlockwise top-level.\n  void CheckInverseFlagAndDirection();\n\n  // Build and return a fake blob containing a single fake outline with no\n  // steps.\n  static C_BLOB *FakeBlob(const TBOX &box);\n\n  C_OUTLINE_LIST *out_list() { // get outline list\n    return &outlines;\n  }\n\n  TBOX bounding_box() const; // compute bounding box\n  int32_t area();            // compute area\n  int32_t perimeter();       // Total perimeter of outlines and 1st level children.\n  int32_t outer_area();      // compute area\n  int32_t count_transitions( // count maxima\n      int32_t threshold);    // size threshold\n\n  void move(const ICOORD vec);         // reposition blob by vector\n  void rotate(const FCOORD &rotation); // Rotate by given vector.\n\n  // Adds sub-pixel resolution EdgeOffsets for the outlines using greyscale\n  // if the supplied pix is 8-bit or the binary edges if nullptr.\n  void ComputeEdgeOffsets(int threshold, Image pix);\n\n  // Estimates and returns the baseline position based on the shape of the\n  // outlines.\n  int16_t EstimateBaselinePosition();\n\n  // Returns a Pix rendering of the blob. pixDestroy after use.\n  Image render();\n  // Returns a Pix rendering of the outline of the blob. 
(no fill).\n  // pixDestroy after use.\n  Image render_outline();\n\n#ifndef GRAPHICS_DISABLED\n  void plot(                           // draw one\n      ScrollView *window,              // window to draw in\n      ScrollView::Color blob_colour,   // for outer bits\n      ScrollView::Color child_colour); // for holes\n  // Draws the blob in the given colour, and child_colour, normalized\n  // using the given denorm, making use of sub-pixel accurate information\n  // if available.\n  void plot_normed(const DENORM &denorm, ScrollView::Color blob_colour,\n                   ScrollView::Color child_colour, ScrollView *window);\n#endif // !GRAPHICS_DISABLED\n\n  C_BLOB &operator=(const C_BLOB &source) {\n    if (!outlines.empty()) {\n      outlines.clear();\n    }\n    outlines.deep_copy(&source.outlines, &C_OUTLINE::deep_copy);\n    return *this;\n  }\n\n  static C_BLOB *deep_copy(const C_BLOB *src) {\n    auto *blob = new C_BLOB;\n    *blob = *src;\n    return blob;\n  }\n\n  static int SortByXMiddle(const C_BLOB *blob1, const C_BLOB *blob2) {\n    return blob1->bounding_box().x_middle() - blob2->bounding_box().x_middle();\n  }\n\nprivate:\n  C_OUTLINE_LIST outlines; // master elements\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccstruct/werd.cpp",
    "content": "/**********************************************************************\n * File:        werd.cpp  (Formerly word.c)\n * Description: Code for the WERD class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"werd.h\"\n\n#include \"linlsq.h\"\n\n#include \"helpers.h\"\n\nnamespace tesseract {\n\n#define FIRST_COLOUR ScrollView::RED       ///< first rainbow colour\n#define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour\n#define CHILD_COLOUR ScrollView::BROWN     ///< colour of children\n\n/**\n * WERD::WERD\n *\n * Constructor to build a WERD from a list of C_BLOBs.\n *   blob_list     The C_BLOBs (in word order) are not copied;\n *                 we take its elements and put them in our lists.\n *   blank_count   blanks in front of the word\n *   text          correct text, outlives this WERD\n */\nWERD::WERD(C_BLOB_LIST *blob_list, uint8_t blank_count, const char *text)\n    : blanks(blank_count), flags(0), script_id_(0), correct(text ? 
text : \"\") {\n  C_BLOB_IT start_it = &cblobs;\n  C_BLOB_IT rej_cblob_it = &rej_cblobs;\n  C_OUTLINE_IT c_outline_it;\n  int16_t inverted_vote = 0;\n  int16_t non_inverted_vote = 0;\n\n  // Move blob_list's elements into cblobs.\n  start_it.add_list_after(blob_list);\n\n  /*\n  Set white on black flag for the WERD, moving any duff blobs onto the\n  rej_cblobs list.\n  First, walk the cblobs checking the inverse flag for each outline of each\n  cblob. If a cblob has inconsistent flag settings for its different\n  outlines, move the blob to the reject list. Otherwise, increment the\n  appropriate w-on-b or b-on-w vote for the word.\n\n  Now set the inversion flag for the WERD by maximum vote.\n\n  Walk the blobs again, moving any blob whose inversion flag does not agree\n  with the concencus onto the reject list.\n*/\n  start_it.set_to_list(&cblobs);\n  if (start_it.empty()) {\n    return;\n  }\n  for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {\n    bool reject_blob = false;\n    bool blob_inverted;\n\n    c_outline_it.set_to_list(start_it.data()->out_list());\n    blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);\n    for (c_outline_it.mark_cycle_pt(); !c_outline_it.cycled_list() && !reject_blob;\n         c_outline_it.forward()) {\n      reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;\n    }\n    if (reject_blob) {\n      rej_cblob_it.add_after_then_move(start_it.extract());\n    } else {\n      if (blob_inverted) {\n        inverted_vote++;\n      } else {\n        non_inverted_vote++;\n      }\n    }\n  }\n\n  flags.set(W_INVERSE, (inverted_vote > non_inverted_vote));\n\n  start_it.set_to_list(&cblobs);\n  if (start_it.empty()) {\n    return;\n  }\n  for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {\n    c_outline_it.set_to_list(start_it.data()->out_list());\n    if (c_outline_it.data()->flag(COUT_INVERSE) != flags[W_INVERSE]) {\n      
rej_cblob_it.add_after_then_move(start_it.extract());\n    }\n  }\n}\n\n/**\n * WERD::WERD\n *\n * Constructor to build a WERD from a list of C_BLOBs.\n * The C_BLOBs are not copied so the source list is emptied.\n */\n\nWERD::WERD(C_BLOB_LIST *blob_list, ///< In word order\n           WERD *clone)            ///< Source of flags\n    : flags(clone->flags), script_id_(clone->script_id_), correct(clone->correct) {\n  C_BLOB_IT start_it = blob_list; // iterator\n  C_BLOB_IT end_it = blob_list;   // another\n\n  while (!end_it.at_last()) {\n    end_it.forward(); // move to last\n  }\n  cblobs.assign_to_sublist(&start_it, &end_it);\n  // move to our list\n  blanks = clone->blanks;\n  //      fprintf(stderr,\"Wrong constructor!!!!\\n\");\n}\n\n// Construct a WERD from a single_blob and clone the flags from this.\n// W_BOL and W_EOL flags are set according to the given values.\nWERD *WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob) {\n  C_BLOB_LIST temp_blobs;\n  C_BLOB_IT temp_it(&temp_blobs);\n  temp_it.add_after_then_move(blob);\n  WERD *blob_word = new WERD(&temp_blobs, this);\n  blob_word->set_flag(W_BOL, bol);\n  blob_word->set_flag(W_EOL, eol);\n  return blob_word;\n}\n\n/**\n * WERD::bounding_box\n *\n * Return the bounding box of the WERD.\n * This is quite a mess to compute!\n * ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the\n * words on the row were re-sorted. The original words were built with reject\n * blobs included. The FUZZY SPACE flags were set accordingly. 
If ALL the\n * blobs in a word are rejected the BB for the word is nullptr, causing the sort\n * to screw up, leading to the erroneous possibility of the first word in a\n * row being marked as FUZZY space.\n */\n\nTBOX WERD::bounding_box() const {\n  return restricted_bounding_box(true, true);\n}\n\n// Returns the bounding box including the desired combination of upper and\n// lower noise/diacritic elements.\nTBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {\n  TBOX box = true_bounding_box();\n  int bottom = box.bottom();\n  int top = box.top();\n  // This is a read-only iteration of the rejected blobs.\n  C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&rej_cblobs));\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TBOX dot_box = it.data()->bounding_box();\n    if ((upper_dots || dot_box.bottom() <= top) && (lower_dots || dot_box.top() >= bottom)) {\n      box += dot_box;\n    }\n  }\n  return box;\n}\n\n// Returns the bounding box of only the good blobs.\nTBOX WERD::true_bounding_box() const {\n  TBOX box; // box being built\n  // This is a read-only iteration of the good blobs.\n  C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&cblobs));\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    box += it.data()->bounding_box();\n  }\n  return box;\n}\n\n/**\n * WERD::move\n *\n * Reposition WERD by vector\n * NOTE!! REJECT CBLOBS ARE NOT MOVED\n */\n\nvoid WERD::move(const ICOORD vec) {\n  C_BLOB_IT cblob_it(&cblobs); // cblob iterator\n\n  for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {\n    cblob_it.data()->move(vec);\n  }\n}\n\n/**\n * WERD::join_on\n *\n * Join other word onto this one. 
Delete the old word.\n */\n\nvoid WERD::join_on(WERD *other) {\n  C_BLOB_IT blob_it(&cblobs);\n  C_BLOB_IT src_it(&other->cblobs);\n  C_BLOB_IT rej_cblob_it(&rej_cblobs);\n  C_BLOB_IT src_rej_it(&other->rej_cblobs);\n\n  while (!src_it.empty()) {\n    blob_it.add_to_end(src_it.extract());\n    src_it.forward();\n  }\n  while (!src_rej_it.empty()) {\n    rej_cblob_it.add_to_end(src_rej_it.extract());\n    src_rej_it.forward();\n  }\n}\n\n/**\n * WERD::copy_on\n *\n * Copy blobs from other word onto this one.\n */\n\nvoid WERD::copy_on(WERD *other) {\n  bool reversed = other->bounding_box().left() < bounding_box().left();\n  C_BLOB_IT c_blob_it(&cblobs);\n  C_BLOB_LIST c_blobs;\n\n  c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);\n  if (reversed) {\n    c_blob_it.add_list_before(&c_blobs);\n  } else {\n    c_blob_it.move_to_last();\n    c_blob_it.add_list_after(&c_blobs);\n  }\n  if (!other->rej_cblobs.empty()) {\n    C_BLOB_IT rej_c_blob_it(&rej_cblobs);\n    C_BLOB_LIST new_rej_c_blobs;\n\n    new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);\n    if (reversed) {\n      rej_c_blob_it.add_list_before(&new_rej_c_blobs);\n    } else {\n      rej_c_blob_it.move_to_last();\n      rej_c_blob_it.add_list_after(&new_rej_c_blobs);\n    }\n  }\n}\n\n/**\n * WERD::print\n *\n * Display members\n */\n\nvoid WERD::print() const {\n  tprintf(\"Blanks= %d\\n\", blanks);\n  bounding_box().print();\n  tprintf(\"Flags = %lu = 0%lo\\n\", flags.to_ulong(), flags.to_ulong());\n  tprintf(\"   W_SEGMENTED = %s\\n\", flags[W_SEGMENTED] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_ITALIC = %s\\n\", flags[W_ITALIC] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_BOL = %s\\n\", flags[W_BOL] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_EOL = %s\\n\", flags[W_EOL] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_NORMALIZED = %s\\n\", flags[W_NORMALIZED] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_SCRIPT_HAS_XHEIGHT = %s\\n\", flags[W_SCRIPT_HAS_XHEIGHT] ? 
\"TRUE\" : \"FALSE\");\n  tprintf(\"   W_SCRIPT_IS_LATIN = %s\\n\", flags[W_SCRIPT_IS_LATIN] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_DONT_CHOP = %s\\n\", flags[W_DONT_CHOP] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_REP_CHAR = %s\\n\", flags[W_REP_CHAR] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_FUZZY_SP = %s\\n\", flags[W_FUZZY_SP] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"   W_FUZZY_NON = %s\\n\", flags[W_FUZZY_NON] ? \"TRUE\" : \"FALSE\");\n  tprintf(\"Correct= %s\\n\", correct.c_str());\n  tprintf(\"Rejected cblob count = %d\\n\", rej_cblobs.length());\n  tprintf(\"Script = %d\\n\", script_id_);\n}\n\n/**\n * WERD::plot\n *\n * Draw the WERD in the given colour.\n */\n\n#ifndef GRAPHICS_DISABLED\nvoid WERD::plot(ScrollView *window, ScrollView::Color colour) {\n  C_BLOB_IT it = &cblobs;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->plot(window, colour, colour);\n  }\n  plot_rej_blobs(window);\n}\n\n// Get the next color in the (looping) rainbow.\nScrollView::Color WERD::NextColor(ScrollView::Color colour) {\n  auto next = static_cast<ScrollView::Color>(colour + 1);\n  if (next >= LAST_COLOUR || next < FIRST_COLOUR) {\n    next = FIRST_COLOUR;\n  }\n  return next;\n}\n\n/**\n * WERD::plot\n *\n * Draw the WERD in rainbow colours in window.\n */\n\nvoid WERD::plot(ScrollView *window) {\n  ScrollView::Color colour = FIRST_COLOUR;\n  C_BLOB_IT it = &cblobs;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->plot(window, colour, CHILD_COLOUR);\n    colour = NextColor(colour);\n  }\n  plot_rej_blobs(window);\n}\n\n/**\n * WERD::plot_rej_blobs\n *\n * Draw the WERD rejected blobs in window - ALWAYS GREY\n */\n\nvoid WERD::plot_rej_blobs(ScrollView *window) {\n  C_BLOB_IT it = &rej_cblobs;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);\n  }\n}\n#endif // !GRAPHICS_DISABLED\n\n/**\n * WERD::shallow_copy()\n *\n * Make a shallow 
copy of a word\n */\n\nWERD *WERD::shallow_copy() {\n  WERD *new_word = new WERD;\n\n  new_word->blanks = blanks;\n  new_word->flags = flags;\n  new_word->correct = correct;\n  return new_word;\n}\n\n/**\n * WERD::operator=\n *\n * Assign a word, DEEP copying the blob list\n */\n\nWERD &WERD::operator=(const WERD &source) {\n  this->ELIST2<WERD>::LINK::operator=(source);\n  blanks = source.blanks;\n  flags = source.flags;\n  script_id_ = source.script_id_;\n  correct = source.correct;\n  cblobs.clear();\n  cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);\n  rej_cblobs.clear();\n  rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);\n  return *this;\n}\n\n/**\n *  word_comparator()\n *\n *  word comparator used to sort a word list so that words are in increasing\n *  order of left edge.\n */\n\nint word_comparator(const WERD *word1, const WERD *word2) {\n  return word1->bounding_box().left() - word2->bounding_box().left();\n}\n\n/**\n *  WERD::ConstructWerdWithNewBlobs()\n *\n * This method returns a new werd constructed using the blobs in the input\n * all_blobs list, which correspond to the blobs in this werd object. The\n * blobs used to construct the new word are consumed and removed from the\n * input all_blobs list.\n * Returns nullptr if the word couldn't be constructed.\n * Returns original blobs for which no matches were found in the output list\n * orphan_blobs (appends).\n */\n\nWERD *WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs) {\n  C_BLOB_LIST current_blob_list;\n  C_BLOB_IT werd_blobs_it(&current_blob_list);\n  // Add the word's c_blobs.\n  werd_blobs_it.add_list_after(cblob_list());\n\n  // New blob list. 
These contain the blobs which will form the new word.\n  C_BLOB_LIST new_werd_blobs;\n  C_BLOB_IT new_blobs_it(&new_werd_blobs);\n\n  // not_found_blobs contains the list of current word's blobs for which a\n  // corresponding blob wasn't found in the input all_blobs list.\n  C_BLOB_LIST not_found_blobs;\n  C_BLOB_IT not_found_it(&not_found_blobs);\n  not_found_it.move_to_last();\n\n  werd_blobs_it.move_to_first();\n  for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); werd_blobs_it.forward()) {\n    C_BLOB *werd_blob = werd_blobs_it.extract();\n    TBOX werd_blob_box = werd_blob->bounding_box();\n    bool found = false;\n    // Now find the corresponding blob for this blob in the all_blobs\n    // list. For now, follow the inefficient method of pairwise\n    // comparisons. Ideally, one can pre-bucket the blobs by row.\n    C_BLOB_IT all_blobs_it(all_blobs);\n    for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {\n      C_BLOB *a_blob = all_blobs_it.data();\n      // Compute the overlap of the two blobs. If major, a_blob should\n      // be added to the new blobs list.\n      TBOX a_blob_box = a_blob->bounding_box();\n      if (a_blob_box.null_box()) {\n        tprintf(\"Bounding box couldn't be ascertained\\n\");\n      }\n      if (werd_blob_box.contains(a_blob_box) || werd_blob_box.major_overlap(a_blob_box)) {\n        // Old blobs are from minimal splits, therefore are expected to be\n        // bigger. The new small blobs should cover a significant portion.\n        // This is it.\n        all_blobs_it.extract();\n        new_blobs_it.add_after_then_move(a_blob);\n        found = true;\n      }\n    }\n    if (!found) {\n      not_found_it.add_after_then_move(werd_blob);\n    } else {\n      delete werd_blob;\n    }\n  }\n  // Iterate over all not found blobs. 
Some of them may be due to\n  // under-segmentation (which is OK, since the corresponding blob is already\n  // in the list in that case.\n  not_found_it.move_to_first();\n  for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {\n    C_BLOB *not_found = not_found_it.data();\n    TBOX not_found_box = not_found->bounding_box();\n    C_BLOB_IT existing_blobs_it(new_blobs_it);\n    for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();\n         existing_blobs_it.forward()) {\n      C_BLOB *a_blob = existing_blobs_it.data();\n      TBOX a_blob_box = a_blob->bounding_box();\n      if ((not_found_box.major_overlap(a_blob_box) || a_blob_box.major_overlap(not_found_box)) &&\n          not_found_box.y_overlap_fraction(a_blob_box) > 0.8) {\n        // Already taken care of.\n        delete not_found_it.extract();\n        break;\n      }\n    }\n  }\n  if (orphan_blobs) {\n    C_BLOB_IT orphan_blobs_it(orphan_blobs);\n    orphan_blobs_it.move_to_last();\n    orphan_blobs_it.add_list_after(&not_found_blobs);\n  }\n\n  // New blobs are ready. 
Create a new werd object with these.\n  WERD *new_werd = nullptr;\n  if (!new_werd_blobs.empty()) {\n    new_werd = new WERD(&new_werd_blobs, this);\n  } else {\n    // Add the blobs back to this word so that it can be reused.\n    C_BLOB_IT this_list_it(cblob_list());\n    this_list_it.add_list_after(&not_found_blobs);\n  }\n  return new_werd;\n}\n\n// Removes noise from the word by moving small outlines to the rej_cblobs\n// list, based on the size_threshold.\nvoid WERD::CleanNoise(float size_threshold) {\n  C_BLOB_IT blob_it(&cblobs);\n  C_BLOB_IT rej_it(&rej_cblobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    C_BLOB *blob = blob_it.data();\n    C_OUTLINE_IT ol_it(blob->out_list());\n    for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {\n      C_OUTLINE *outline = ol_it.data();\n      TBOX ol_box = outline->bounding_box();\n      int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();\n      if (ol_size < size_threshold) {\n        // This outline is too small. Move it to a separate blob in the\n        // reject blobs list.\n        auto *rej_blob = new C_BLOB(ol_it.extract());\n        rej_it.add_after_then_move(rej_blob);\n      }\n    }\n    if (blob->out_list()->empty()) {\n      delete blob_it.extract();\n    }\n  }\n}\n\n// Extracts all the noise outlines and stuffs the pointers into the given\n// vector of outlines. Afterwards, the outlines vector owns the pointers.\nvoid WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) {\n  C_BLOB_IT rej_it(&rej_cblobs);\n  for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {\n    C_BLOB *blob = rej_it.extract();\n    C_OUTLINE_IT ol_it(blob->out_list());\n    outlines->push_back(ol_it.extract());\n    delete blob;\n  }\n}\n\n// Adds the selected outlines to the indicated real blobs, and puts the rest\n// back in rej_cblobs where they came from. 
Where the target_blobs entry is\n// nullptr, a run of wanted outlines is put into a single new blob.\n// Ownership of the outlines is transferred back to the word. (Hence\n// vector and not PointerVector.)\n// Returns true if any new blob was added to the start of the word, which\n// suggests that it might need joining to the word before it, and likewise\n// sets make_next_word_fuzzy true if any new blob was added to the end.\nbool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,\n                               const std::vector<C_BLOB *> &target_blobs,\n                               const std::vector<C_OUTLINE *> &outlines,\n                               bool *make_next_word_fuzzy) {\n  bool outline_added_to_start = false;\n  if (make_next_word_fuzzy != nullptr) {\n    *make_next_word_fuzzy = false;\n  }\n  C_BLOB_IT rej_it(&rej_cblobs);\n  for (unsigned i = 0; i < outlines.size(); ++i) {\n    C_OUTLINE *outline = outlines[i];\n    if (outline == nullptr) {\n      continue; // Already used it.\n    }\n    if (wanted[i]) {\n      C_BLOB *target_blob = target_blobs[i];\n      TBOX noise_box = outline->bounding_box();\n      if (target_blob == nullptr) {\n        target_blob = new C_BLOB(outline);\n        // Need to find the insertion point.\n        C_BLOB_IT blob_it(&cblobs);\n        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n          C_BLOB *blob = blob_it.data();\n          TBOX blob_box = blob->bounding_box();\n          if (blob_box.left() > noise_box.left()) {\n            if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {\n              // We might want to join this word to its predecessor.\n              outline_added_to_start = true;\n            }\n            blob_it.add_before_stay_put(target_blob);\n            break;\n          }\n        }\n        if (blob_it.cycled_list()) {\n          blob_it.add_to_end(target_blob);\n          if (make_next_word_fuzzy != nullptr) {\n            
*make_next_word_fuzzy = true;\n          }\n        }\n        // Add all consecutive wanted, but null-blob outlines to same blob.\n        C_OUTLINE_IT ol_it(target_blob->out_list());\n        while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) {\n          ++i;\n          ol_it.add_to_end(outlines[i]);\n        }\n      } else {\n        // Insert outline into this blob.\n        C_OUTLINE_IT ol_it(target_blob->out_list());\n        ol_it.add_to_end(outline);\n      }\n    } else {\n      // Put back on noise list.\n      rej_it.add_to_end(new C_BLOB(outline));\n    }\n  }\n  return outline_added_to_start;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccstruct/werd.h",
    "content": "/**********************************************************************\n * File:        werd.h\n * Description: Code for the WERD class.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef WERD_H\n#define WERD_H\n\n#include \"elst2.h\"\n#include \"params.h\"\n#include \"stepblob.h\"\n\n#include <bitset>\n\nnamespace tesseract {\n\nenum WERD_FLAGS {\n  W_SEGMENTED,          ///< correctly segmented\n  W_ITALIC,             ///< italic text\n  W_BOLD,               ///< bold text\n  W_BOL,                ///< start of line\n  W_EOL,                ///< end of line\n  W_NORMALIZED,         ///< flags\n  W_SCRIPT_HAS_XHEIGHT, ///< x-height concept makes sense.\n  W_SCRIPT_IS_LATIN,    ///< Special case latin for y. 
splitting.\n  W_DONT_CHOP,          ///< fixed pitch chopped\n  W_REP_CHAR,           ///< repeated character\n  W_FUZZY_SP,           ///< fuzzy space\n  W_FUZZY_NON,          ///< fuzzy nonspace\n  W_INVERSE             ///< white on black\n};\n\nenum DISPLAY_FLAGS {\n  /* Display flags bit number allocations */\n  DF_BOX,          ///< Bounding box\n  DF_TEXT,         ///< Correct ascii\n  DF_POLYGONAL,    ///< Polyg approx\n  DF_EDGE_STEP,    ///< Edge steps\n  DF_BN_POLYGONAL, ///< BL normalisd polyapx\n  DF_BLAMER        ///< Blamer information\n};\n\nclass ROW; // forward decl\n\nclass TESS_API WERD : public ELIST2<WERD>::LINK {\npublic:\n  WERD() = default;\n  // WERD constructed with:\n  //   blob_list - blobs of the word (we take this list's contents)\n  //   blanks - number of blanks before the word\n  //   text - correct text (outlives WERD)\n  WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text);\n\n  // WERD constructed from:\n  //   blob_list - blobs in the word\n  //   clone - werd to clone flags, etc from.\n  WERD(C_BLOB_LIST *blob_list, WERD *clone);\n\n  // Construct a WERD from a single_blob and clone the flags from this.\n  // W_BOL and W_EOL flags are set according to the given values.\n  WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob);\n\n  ~WERD() = default;\n\n  // assignment\n  WERD &operator=(const WERD &source);\n\n  // This method returns a new werd constructed using the blobs in the input\n  // all_blobs list, which correspond to the blobs in this werd object. 
The\n  // blobs used to construct the new word are consumed and removed from the\n  // input all_blobs list.\n  // Returns nullptr if the word couldn't be constructed.\n  // Returns original blobs for which no matches were found in the output list\n  // orphan_blobs (appends).\n  WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs);\n\n  // Accessors for reject / DUFF blobs in various formats\n  C_BLOB_LIST *rej_cblob_list() { // compact format\n    return &rej_cblobs;\n  }\n\n  // Accessors for good blobs in various formats.\n  C_BLOB_LIST *cblob_list() { // get compact blobs\n    return &cblobs;\n  }\n\n  uint8_t space() const { // access function\n    return blanks;\n  }\n  void set_blanks(uint8_t new_blanks) {\n    blanks = new_blanks;\n  }\n  int script_id() const {\n    return script_id_;\n  }\n  void set_script_id(int id) {\n    script_id_ = id;\n  }\n\n  // Returns the (default) bounding box including all the dots.\n  TBOX bounding_box() const; // compute bounding box\n  // Returns the bounding box including the desired combination of upper and\n  // lower noise/diacritic elements.\n  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;\n  // Returns the bounding box of only the good blobs.\n  TBOX true_bounding_box() const;\n\n  const char *text() const {\n    return correct.c_str();\n  }\n  void set_text(const char *new_text) {\n    correct = new_text;\n  }\n\n  bool flag(WERD_FLAGS mask) const {\n    return flags[mask];\n  }\n  void set_flag(WERD_FLAGS mask, bool value) {\n    flags.set(mask, value);\n  }\n\n  bool display_flag(uint8_t flag) const {\n    return disp_flags[flag];\n  }\n  void set_display_flag(uint8_t flag, bool value) {\n    disp_flags.set(flag, value);\n  }\n\n  WERD *shallow_copy(); // shallow copy word\n\n  // reposition word by vector\n  void move(const ICOORD vec);\n\n  // join other's blobs onto this werd, emptying out other.\n  void join_on(WERD *other);\n\n  // copy other's blobs onto 
this word, leaving other intact.\n  void copy_on(WERD *other);\n\n  // tprintf word metadata (but not blob innards)\n  void print() const;\n\n#ifndef GRAPHICS_DISABLED\n  // plot word on window in a uniform colour\n  void plot(ScrollView *window, ScrollView::Color colour);\n\n  // Get the next color in the (looping) rainbow.\n  static ScrollView::Color NextColor(ScrollView::Color colour);\n\n  // plot word on window in a rainbow of colours\n  void plot(ScrollView *window);\n\n  // plot rejected blobs in a rainbow of colours\n  void plot_rej_blobs(ScrollView *window);\n#endif // !GRAPHICS_DISABLED\n\n  // Removes noise from the word by moving small outlines to the rej_cblobs\n  // list, based on the size_threshold.\n  void CleanNoise(float size_threshold);\n\n  // Extracts all the noise outlines and stuffs the pointers into the given\n  // vector of outlines. Afterwards, the outlines vector owns the pointers.\n  void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);\n  // Adds the selected outlines to the indicated real blobs, and puts the rest\n  // back in rej_cblobs where they came from. Where the target_blobs entry is\n  // nullptr, a run of wanted outlines is put into a single new blob.\n  // Ownership of the outlines is transferred back to the word. 
(Hence\n  // vector and not PointerVector.)\n  // Returns true if any new blob was added to the start of the word, which\n  // suggests that it might need joining to the word before it, and likewise\n  // sets make_next_word_fuzzy true if any new blob was added to the end.\n  bool AddSelectedOutlines(const std::vector<bool> &wanted,\n                           const std::vector<C_BLOB *> &target_blobs,\n                           const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);\n\nprivate:\n  uint8_t blanks = 0;     // no of blanks\n  std::bitset<16> flags;  // flags about word\n  std::bitset<16> disp_flags; // display flags\n  int16_t script_id_ = 0; // From unicharset.\n  std::string correct;    // correct text\n  C_BLOB_LIST cblobs;     // compacted blobs\n  C_BLOB_LIST rej_cblobs; // DUFF blobs\n};\n\nELIST2IZEH(WERD)\n\n} // namespace tesseract\n\n#include \"ocrrow.h\" // placed here due to\n\nnamespace tesseract {\n\n// compare words by increasing order of left edge, suitable for qsort(3)\nint word_comparator(const WERD *word1, const WERD *word2);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccutil/ambigs.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ambigs.cpp\n// Description: Functions for dealing with ambiguities\n//              (training and recognition).\n// Author:      Daria Antonova\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"ambigs.h\"\n\n#include \"helpers.h\"\n#include \"universalambigs.h\"\n\n#include <cstdio>\n\n#if defined(_WIN32) && !defined(__GNUC__)\n#  define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)\n#endif /* _WIN32 && !__GNUC__ */\n\nnamespace tesseract {\n\nstatic const char kAmbigDelimiters[] = \"\\t \";\nstatic const char kIllegalMsg[] = \"Illegal ambiguity specification on line %d\\n\";\nstatic const char kIllegalUnicharMsg[] = \"Illegal unichar %s in ambiguity specification\\n\";\n\n// Maximum line size:\n//   10 for sizes of ambigs, tabs, abmig type and newline\n//   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig\nconst int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);\n\nAmbigSpec::AmbigSpec() : correct_ngram_id(INVALID_UNICHAR_ID), type(NOT_AMBIG), wrong_ngram_size(0) {\n  wrong_ngram[0] = INVALID_UNICHAR_ID;\n  correct_fragments[0] = INVALID_UNICHAR_ID;\n}\n\n// Initializes the ambigs by adding a nullptr pointer to each table.\nvoid UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET &unicharset, bool 
use_ambigs_for_adaption) {\n  for (unsigned i = 0; i < unicharset.size(); ++i) {\n    replace_ambigs_.push_back(nullptr);\n    dang_ambigs_.push_back(nullptr);\n    one_to_one_definite_ambigs_.push_back(nullptr);\n    if (use_ambigs_for_adaption) {\n      ambigs_for_adaption_.push_back(nullptr);\n      reverse_ambigs_for_adaption_.push_back(nullptr);\n    }\n  }\n}\n\n// Loads the universal ambigs that are useful for any language.\nvoid UnicharAmbigs::LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset) {\n  TFile file;\n  if (!file.Open(kUniversalAmbigsFile, ksizeofUniversalAmbigsFile)) {\n    return;\n  }\n  LoadUnicharAmbigs(encoder_set, &file, 0, false, unicharset);\n}\n\nvoid UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambig_file,\n                                      int debug_level, bool use_ambigs_for_adaption,\n                                      UNICHARSET *unicharset) {\n  UnicharIdVector *adaption_ambigs_entry;\n  if (debug_level) {\n    tprintf(\"Reading ambiguities\\n\");\n  }\n\n  int test_ambig_part_size;\n  int replacement_ambig_part_size;\n  // The space for buffer is allocated on the heap to avoid\n  // GCC frame size warning.\n  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;\n  char *buffer = new char[kBufferSize];\n  char replacement_string[kMaxAmbigStringSize];\n  UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];\n  int line_num = 0;\n  int type = NOT_AMBIG;\n\n  // Determine the version of the ambigs file.\n  int version = 0;\n  ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != nullptr && buffer[0] != '\\0');\n  if (*buffer == 'v') {\n    version = static_cast<int>(strtol(buffer + 1, nullptr, 10));\n    ++line_num;\n  } else {\n    ambig_file->Rewind();\n  }\n  while (ambig_file->FGets(buffer, kBufferSize) != nullptr) {\n    chomp_string(buffer);\n    if (debug_level > 2) {\n      tprintf(\"read line %s\\n\", buffer);\n    }\n    ++line_num;\n    if (!ParseAmbiguityLine(line_num, 
version, debug_level, encoder_set, buffer,\n                            &test_ambig_part_size, test_unichar_ids, &replacement_ambig_part_size,\n                            replacement_string, &type)) {\n      continue;\n    }\n    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.\n    auto *ambig_spec = new AmbigSpec();\n    if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,\n                         test_ambig_part_size, test_unichar_ids, replacement_ambig_part_size,\n                         replacement_string, type, ambig_spec, unicharset)) {\n      continue;\n    }\n\n    // Update one_to_one_definite_ambigs_.\n    if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {\n      if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == nullptr) {\n        one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();\n      }\n      one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(ambig_spec->correct_ngram_id);\n    }\n    // Update ambigs_for_adaption_.\n    if (use_ambigs_for_adaption) {\n      std::vector<UNICHAR_ID> encoding;\n      // Silently ignore invalid strings, as before, so it is safe to use a\n      // universal ambigs file.\n      if (unicharset->encode_string(replacement_string, true, &encoding, nullptr, nullptr)) {\n        for (int i = 0; i < test_ambig_part_size; ++i) {\n          if (ambigs_for_adaption_[test_unichar_ids[i]] == nullptr) {\n            ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();\n          }\n          adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];\n          for (int id_to_insert : encoding) {\n            ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);\n            // Add the new unichar id to adaption_ambigs_entry (only if the\n            // vector does not already contain it) keeping it in sorted order.\n            size_t j;\n            for (j = 0;\n                 j < 
adaption_ambigs_entry->size() && (*adaption_ambigs_entry)[j] > id_to_insert;\n                 ++j) {\n            }\n            if (j < adaption_ambigs_entry->size()) {\n              if ((*adaption_ambigs_entry)[j] != id_to_insert) {\n                adaption_ambigs_entry->insert(adaption_ambigs_entry->begin() + j, id_to_insert);\n              }\n            } else {\n              adaption_ambigs_entry->push_back(id_to_insert);\n            }\n          }\n        }\n      }\n    }\n  }\n  delete[] buffer;\n\n  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.\n  if (use_ambigs_for_adaption) {\n    for (size_t i = 0; i < ambigs_for_adaption_.size(); ++i) {\n      adaption_ambigs_entry = ambigs_for_adaption_[i];\n      if (adaption_ambigs_entry == nullptr) {\n        continue;\n      }\n      for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) {\n        UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];\n        if (reverse_ambigs_for_adaption_[ambig_id] == nullptr) {\n          reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();\n        }\n        reverse_ambigs_for_adaption_[ambig_id]->push_back(i);\n      }\n    }\n  }\n\n  // Print what was read from the input file.\n  if (debug_level > 1) {\n    for (int tbl = 0; tbl < 2; ++tbl) {\n      const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_;\n      for (size_t i = 0; i < print_table.size(); ++i) {\n        AmbigSpec_LIST *lst = print_table[i];\n        if (lst == nullptr) {\n          continue;\n        }\n        if (!lst->empty()) {\n          tprintf(\"%s Ambiguities for %s:\\n\", (tbl == 0) ? 
\"Replaceable\" : \"Dangerous\",\n                  unicharset->debug_str(i).c_str());\n        }\n        AmbigSpec_IT lst_it(lst);\n        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {\n          AmbigSpec *ambig_spec = lst_it.data();\n          tprintf(\"wrong_ngram:\");\n          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);\n          tprintf(\"correct_fragments:\");\n          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);\n        }\n      }\n    }\n    if (use_ambigs_for_adaption) {\n      for (int vec_id = 0; vec_id < 2; ++vec_id) {\n        const std::vector<UnicharIdVector *> &vec =\n            (vec_id == 0) ? ambigs_for_adaption_ : reverse_ambigs_for_adaption_;\n        for (size_t i = 0; i < vec.size(); ++i) {\n          adaption_ambigs_entry = vec[i];\n          if (adaption_ambigs_entry != nullptr) {\n            tprintf(\"%sAmbigs for adaption for %s:\\n\", (vec_id == 0) ? \"\" : \"Reverse \",\n                    unicharset->debug_str(i).c_str());\n            for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) {\n              tprintf(\"%s \", unicharset->debug_str((*adaption_ambigs_entry)[j]).c_str());\n            }\n            tprintf(\"\\n\");\n          }\n        }\n      }\n    }\n  }\n}\n\nbool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_level,\n                                       const UNICHARSET &unicharset, char *buffer,\n                                       int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,\n                                       int *replacement_ambig_part_size, char *replacement_string,\n                                       int *type) {\n  if (version > 1) {\n    // Simpler format is just wrong-string correct-string type\\n.\n    std::string input(buffer);\n    std::vector<std::string> fields = split(input, ' ');\n    if (fields.size() != 3) {\n      if (debug_level) {\n        
tprintf(kIllegalMsg, line_num);\n      }\n      return false;\n    }\n    // Encode wrong-string.\n    std::vector<UNICHAR_ID> unichars;\n    if (!unicharset.encode_string(fields[0].c_str(), true, &unichars, nullptr, nullptr)) {\n      return false;\n    }\n    *test_ambig_part_size = unichars.size();\n    if (*test_ambig_part_size > MAX_AMBIG_SIZE) {\n      if (debug_level) {\n        tprintf(\"Too many unichars in ambiguity on line %d\\n\", line_num);\n      }\n      return false;\n    }\n    // Copy encoded string to output.\n    for (size_t i = 0; i < unichars.size(); ++i) {\n      test_unichar_ids[i] = unichars[i];\n    }\n    test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;\n    // Encode replacement-string to check validity.\n    if (!unicharset.encode_string(fields[1].c_str(), true, &unichars, nullptr, nullptr)) {\n      return false;\n    }\n    *replacement_ambig_part_size = unichars.size();\n    if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {\n      if (debug_level) {\n        tprintf(\"Too many unichars in ambiguity on line %d\\n\", line_num);\n      }\n      return false;\n    }\n    if (sscanf(fields[2].c_str(), \"%d\", type) != 1) {\n      if (debug_level) {\n        tprintf(kIllegalMsg, line_num);\n      }\n      return false;\n    }\n    snprintf(replacement_string, kMaxAmbigStringSize, \"%s\", fields[1].c_str());\n    return true;\n  }\n  int i;\n  char *next_token;\n  char *token = strtok_r(buffer, kAmbigDelimiters, &next_token);\n  if (!token || sscanf(token, \"%d\", test_ambig_part_size) != 1 ||\n      *test_ambig_part_size <= 0) {\n    if (debug_level) {\n      tprintf(kIllegalMsg, line_num);\n    }\n    return false;\n  }\n  if (*test_ambig_part_size > MAX_AMBIG_SIZE) {\n    if (debug_level) {\n      tprintf(\"Too many unichars in ambiguity on line %d\\n\", line_num);\n    }\n    return false;\n  }\n  for (i = 0; i < *test_ambig_part_size; ++i) {\n    if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) {\n      
break;\n    }\n    if (!unicharset.contains_unichar(token)) {\n      if (debug_level) {\n        tprintf(kIllegalUnicharMsg, token);\n      }\n      break;\n    }\n    test_unichar_ids[i] = unicharset.unichar_to_id(token);\n  }\n  test_unichar_ids[i] = INVALID_UNICHAR_ID;\n\n  if (i != *test_ambig_part_size || !(token = strtok_r(nullptr, kAmbigDelimiters, &next_token)) ||\n      sscanf(token, \"%d\", replacement_ambig_part_size) != 1 ||\n      *replacement_ambig_part_size <= 0) {\n    if (debug_level) {\n      tprintf(kIllegalMsg, line_num);\n    }\n    return false;\n  }\n  if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {\n    if (debug_level) {\n      tprintf(\"Too many unichars in ambiguity on line %d\\n\", line_num);\n    }\n    return false;\n  }\n  replacement_string[0] = '\\0';\n  for (i = 0; i < *replacement_ambig_part_size; ++i) {\n    if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) {\n      break;\n    }\n    strcat(replacement_string, token);\n    if (!unicharset.contains_unichar(token)) {\n      if (debug_level) {\n        tprintf(kIllegalUnicharMsg, token);\n      }\n      break;\n    }\n  }\n  if (i != *replacement_ambig_part_size) {\n    if (debug_level) {\n      tprintf(kIllegalMsg, line_num);\n    }\n    return false;\n  }\n  if (version > 0) {\n    // The next field being true indicates that the ambiguity should\n    // always be substituted (e.g. '' should always be changed to \").\n    // For such \"certain\" n -> m ambigs tesseract will insert character\n    // fragments for the n pieces in the unicharset. AmbigsFound()\n    // will then replace the incorrect ngram with the character\n    // fragments of the correct character (or ngram if m > 1).\n    // Note that if m > 1, an ngram will be inserted into the\n    // modified word, not the individual unigrams. Tesseract\n    // has limited support for ngram unichar (e.g. 
dawg permuter).\n    token = strtok_r(nullptr, kAmbigDelimiters, &next_token);\n    if (!token || sscanf(token, \"%d\", type) != 1) {\n      if (debug_level) {\n        tprintf(kIllegalMsg, line_num);\n      }\n      return false;\n    }\n  }\n  return true;\n}\n\nbool UnicharAmbigs::InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,\n                                    UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,\n                                    const char *replacement_string, int type, AmbigSpec *ambig_spec,\n                                    UNICHARSET *unicharset) {\n  ambig_spec->type = static_cast<AmbigType>(type);\n  if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 &&\n      unicharset->to_lower(test_unichar_ids[0]) ==\n          unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) {\n    ambig_spec->type = CASE_AMBIG;\n  }\n\n  ambig_spec->wrong_ngram_size =\n      UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram);\n\n  // Since we need to maintain a constant number of unichar positions in\n  // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for\n  // each n->m ambiguity we will have to place n character fragments of the\n  // correct ngram into the corresponding positions in the vector (e.g. given\n  // \"vvvvw\" and vvvv->ww we will place v and |ww|0|4 into position 0, v and\n  // |ww|1|4 into position 1 and so on. 
The correct ngram is reconstructed\n  // from fragments by dawg_permute_and_select().\n\n  // Insert the corresponding correct ngram into the unicharset.\n  // Unicharset code assumes that the \"base\" ngram is inserted into\n  // the unicharset before fragments of this ngram are inserted.\n  unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);\n  ambig_spec->correct_ngram_id = unicharset->unichar_to_id(replacement_string);\n  if (replacement_ambig_part_size > 1) {\n    unicharset->set_isngram(ambig_spec->correct_ngram_id, true);\n  }\n  // Add the corresponding fragments of the wrong ngram to unicharset.\n  int i;\n  for (i = 0; i < test_ambig_part_size; ++i) {\n    UNICHAR_ID unichar_id;\n    if (test_ambig_part_size == 1) {\n      unichar_id = ambig_spec->correct_ngram_id;\n    } else {\n      std::string frag_str =\n          CHAR_FRAGMENT::to_string(replacement_string, i, test_ambig_part_size, false);\n      unicharset->unichar_insert(frag_str.c_str(), OldUncleanUnichars::kTrue);\n      unichar_id = unicharset->unichar_to_id(frag_str.c_str());\n    }\n    ambig_spec->correct_fragments[i] = unichar_id;\n  }\n  ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID;\n\n  // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.\n  // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.\n  if (table[test_unichar_ids[0]] == nullptr) {\n    table[test_unichar_ids[0]] = new AmbigSpec_LIST();\n  }\n  if (table[test_unichar_ids[0]]->add_sorted(AmbigSpec::compare_ambig_specs, true, ambig_spec)) {\n    return true;\n  }\n  delete ambig_spec;\n  return false;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/ambigs.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ambigs.h\n// Description: Constants, flags, functions for dealing with\n//              ambiguities (training and recognition).\n// Author:      Daria Antonova\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_AMBIGS_H_\n#define TESSERACT_CCUTIL_AMBIGS_H_\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n\n#if !defined(DISABLED_LEGACY_ENGINE)\n\n#  include <tesseract/unichar.h>\n#  include \"elst.h\"\n#  include \"tprintf.h\"\n#  include \"unicharset.h\"\n\n#  define MAX_AMBIG_SIZE 10\n\nnamespace tesseract {\n\nusing UnicharIdVector = std::vector<UNICHAR_ID>;\n\nenum AmbigType {\n  NOT_AMBIG,      // the ngram pair is not ambiguous\n  REPLACE_AMBIG,  // ocred ngram should always be substituted with correct\n  DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)\n  SIMILAR_AMBIG,  // use pairwise classifier for ocred/correct pair (1-1)\n  CASE_AMBIG,     // this is a case ambiguity (1-1)\n\n  AMBIG_TYPE_COUNT // number of enum entries\n};\n\n// A collection of utility functions for arrays of UNICHAR_IDs that are\n// terminated by INVALID_UNICHAR_ID.\nclass UnicharIdArrayUtils {\npublic:\n  // Compares two arrays of unichar ids. 
Returns -1 if the length of array1 is\n  // less than length of array2, if any array1[i] is less than array2[i].\n  // Returns 0 if the arrays are equal, 1 otherwise.\n  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.\n  static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {\n    for (;;) {\n      const UNICHAR_ID val1 = *ptr1++;\n      const UNICHAR_ID val2 = *ptr2++;\n      if (val1 != val2) {\n        if (val1 == INVALID_UNICHAR_ID) {\n          return -1;\n        }\n        if (val2 == INVALID_UNICHAR_ID) {\n          return 1;\n        }\n        if (val1 < val2) {\n          return -1;\n        }\n        return 1;\n      }\n      if (val1 == INVALID_UNICHAR_ID) {\n        return 0;\n      }\n    }\n  }\n\n  // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.\n  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID\n  // and that dst has enough space for all the elements from src.\n  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {\n    int i = 0;\n    do {\n      dst[i] = src[i];\n    } while (dst[i++] != INVALID_UNICHAR_ID);\n    return i - 1;\n  }\n\n  // Prints unichars corresponding to the unichar_ids in the given array.\n  // The function assumes that array is terminated by INVALID_UNICHAR_ID.\n  static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) {\n    const UNICHAR_ID *ptr = array;\n    if (*ptr == INVALID_UNICHAR_ID) {\n      tprintf(\"[Empty]\");\n    }\n    while (*ptr != INVALID_UNICHAR_ID) {\n      tprintf(\"%s \", unicharset.id_to_unichar(*ptr++));\n    }\n    tprintf(\"( \");\n    ptr = array;\n    while (*ptr != INVALID_UNICHAR_ID) {\n      tprintf(\"%d \", *ptr++);\n    }\n    tprintf(\")\\n\");\n  }\n};\n\n// AMBIG_SPEC_LIST stores a list of dangerous ambigs that\n// start with the same unichar (e.g. 
r->t rn->m rr1->m).\nclass AmbigSpec : public ELIST<AmbigSpec>::LINK {\npublic:\n  AmbigSpec();\n  ~AmbigSpec() = default;\n\n  // Comparator function for sorting AmbigSpec_LISTs. The lists will\n  // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors\n  // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].\n  static int compare_ambig_specs(const AmbigSpec *s1, const AmbigSpec *s2) {\n    int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);\n    if (result != 0) {\n      return result;\n    }\n    return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments);\n  }\n\n  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];\n  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];\n  UNICHAR_ID correct_ngram_id;\n  AmbigType type;\n  int wrong_ngram_size;\n};\nELISTIZEH(AmbigSpec)\n\n// AMBIG_TABLE[i] stores a set of ambiguities whose\n// wrong ngram starts with unichar id i.\nusing UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>;\n\nclass UnicharAmbigs {\npublic:\n  UnicharAmbigs() = default;\n  ~UnicharAmbigs() {\n    for (auto data : replace_ambigs_) {\n      delete data;\n    }\n    for (auto data : dang_ambigs_) {\n      delete data;\n    }\n    for (auto data : one_to_one_definite_ambigs_) {\n      delete data;\n    }\n  }\n\n  const UnicharAmbigsVector &dang_ambigs() const {\n    return dang_ambigs_;\n  }\n  const UnicharAmbigsVector &replace_ambigs() const {\n    return replace_ambigs_;\n  }\n\n  // Initializes the ambigs by adding a nullptr pointer to each table.\n  void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption);\n\n  // Loads the universal ambigs that are useful for any language.\n  void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset);\n\n  // Fills in two ambiguity tables (replaceable and dangerous) with information\n  // read from the ambigs file. An ambiguity table is an array of lists.\n  // The array is indexed by a class id. 
Each entry in the table provides\n  // a list of potential ambiguities which can start with the corresponding\n  // character. For example the ambiguity \"rn -> m\", would be located in the\n  // table at index of unicharset.unichar_to_id('r').\n  // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in\n  // one_to_one_definite_ambigs_. This vector is also indexed by the class id\n  // of the wrong part of the ambiguity and each entry contains a vector of\n  // unichar ids that are ambiguous to it.\n  // encoder_set is used to encode the ambiguity strings, undisturbed by new\n  // unichar_ids that may be created by adding the ambigs.\n  void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level,\n                         bool use_ambigs_for_adaption, UNICHARSET *unicharset);\n\n  // Returns definite 1-1 ambigs for the given unichar id.\n  inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {\n    if (one_to_one_definite_ambigs_.empty()) {\n      return nullptr;\n    }\n    return one_to_one_definite_ambigs_[unichar_id];\n  }\n\n  // Returns a pointer to the vector with all unichar ids that appear in the\n  // 'correct' part of the ambiguity pair when the given unichar id appears\n  // in the 'wrong' part of the ambiguity. E.g. 
if DangAmbigs file consist of\n  // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of\n  // m will return a pointer to a vector with unichar ids of r,n,i.\n  inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const {\n    if (ambigs_for_adaption_.empty()) {\n      return nullptr;\n    }\n    return ambigs_for_adaption_[unichar_id];\n  }\n\n  // Similar to the above, but return the vector of unichar ids for which\n  // the given unichar_id is an ambiguity (appears in the 'wrong' part of\n  // some ambiguity pair).\n  inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const {\n    if (reverse_ambigs_for_adaption_.empty()) {\n      return nullptr;\n    }\n    return reverse_ambigs_for_adaption_[unichar_id];\n  }\n\nprivate:\n  bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset,\n                          char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,\n                          int *replacement_ambig_part_size, char *replacement_string, int *type);\n  bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,\n                       UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,\n                       const char *replacement_string, int type, AmbigSpec *ambig_spec,\n                       UNICHARSET *unicharset);\n\n  UnicharAmbigsVector dang_ambigs_;\n  UnicharAmbigsVector replace_ambigs_;\n  std::vector<UnicharIdVector *> one_to_one_definite_ambigs_;\n  std::vector<UnicharIdVector *> ambigs_for_adaption_;\n  std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_;\n};\n\n} // namespace tesseract\n\n#endif // !defined(DISABLED_LEGACY_ENGINE)\n\n#endif // TESSERACT_CCUTIL_AMBIGS_H_\n"
  },
  {
    "path": "src/ccutil/bitvector.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        bitvector.cpp\n// Description: Class replacement for BITVECTOR.\n// Author:      Ray Smith\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"bitvector.h\"\n#include <algorithm>\n#include <cstring>\n#include \"helpers.h\"\n#include \"serialis.h\" // for tesseract::Serialize\n\nnamespace tesseract {\n\n// Fast lookup table to get the first least significant set bit in a byte.\n// For zero, the table has 255, but since it is a special case, most code\n// that uses this table will check for zero before looking up lsb_index_.\nconst uint8_t BitVector::lsb_index_[256] = {\n    255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,\n    0,   1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0,\n    1,   0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1,\n    0,   3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0,\n    2,   0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,\n    0,   1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0,\n    1,   0, 4, 0, 1, 0, 
2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1,\n    0,   2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0,\n    3,   0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};\n\n// Fast lookup table to get the residual bits after zeroing the first (lowest)\n// set bit in a byte.\nconst uint8_t BitVector::lsb_eroded_[256] = {\n    0,    0,    0,    0x2,  0,    0x4,  0x4,  0x6,  0,    0x8,  0x8,  0x0a, 0x08, 0x0c, 0x0c, 0x0e,\n    0,    0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,\n    0,    0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,\n    0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,\n    0,    0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,\n    0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,\n    0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,\n    0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,\n    0,    0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,\n    0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,\n    0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,\n    0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,\n    0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,\n    0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,\n    0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,\n    0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 
0xfc, 0xfe};\n\n// Fast lookup table to give the number of set bits in a byte.\nconst int BitVector::hamming_table_[256] = {\n    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,\n    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\n    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\n    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,\n    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\n    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,\n    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,\n    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};\n\nBitVector &BitVector::operator=(const BitVector &src) {\n  array_ = src.array_;\n  bit_size_ = src.bit_size_;\n  return *this;\n}\n\n// Initializes the array to length * false.\nvoid BitVector::Init(int length) {\n  Alloc(length);\n  SetAllFalse();\n}\n\n// Writes to the given file. Returns false in case of error.\nbool BitVector::Serialize(FILE *fp) const {\n  if (!tesseract::Serialize(fp, &bit_size_)) {\n    return false;\n  }\n  int wordlen = WordLength();\n  return tesseract::Serialize(fp, &array_[0], wordlen);\n}\n\n// Reads from the given file. 
Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool BitVector::DeSerialize(bool swap, FILE *fp) {\n  uint32_t new_bit_size;\n  if (!tesseract::DeSerialize(fp, &new_bit_size)) {\n    return false;\n  }\n  if (swap) {\n    ReverseN(&new_bit_size, sizeof(new_bit_size));\n  }\n  Alloc(new_bit_size);\n  int wordlen = WordLength();\n  if (!tesseract::DeSerialize(fp, &array_[0], wordlen)) {\n    return false;\n  }\n  if (swap) {\n    for (int i = 0; i < wordlen; ++i) {\n      ReverseN(&array_[i], sizeof(array_[i]));\n    }\n  }\n  return true;\n}\n\nvoid BitVector::SetAllFalse() {\n  memset(&array_[0], 0, ByteLength());\n}\nvoid BitVector::SetAllTrue() {\n  memset(&array_[0], ~0, ByteLength());\n}\n\n// Returns the index of the next set bit after the given index.\n// Useful for quickly iterating through the set bits in a sparse vector.\nint BitVector::NextSetBit(int prev_bit) const {\n  // Move on to the next bit.\n  int next_bit = prev_bit + 1;\n  if (next_bit >= bit_size_) {\n    return -1;\n  }\n  // Check the remains of the word containing the next_bit first.\n  int next_word = WordIndex(next_bit);\n  int bit_index = next_word * kBitFactor;\n  int word_end = bit_index + kBitFactor;\n  uint32_t word = array_[next_word];\n  uint8_t byte = word & 0xff;\n  while (bit_index < word_end) {\n    if (bit_index + 8 > next_bit && byte != 0) {\n      while (bit_index + lsb_index_[byte] < next_bit && byte != 0) {\n        byte = lsb_eroded_[byte];\n      }\n      if (byte != 0) {\n        return bit_index + lsb_index_[byte];\n      }\n    }\n    word >>= 8;\n    bit_index += 8;\n    byte = word & 0xff;\n  }\n  // next_word didn't contain a 1, so find the next word with set bit.\n  ++next_word;\n  int wordlen = WordLength();\n  while (next_word < wordlen && (word = array_[next_word]) == 0) {\n    ++next_word;\n    bit_index += kBitFactor;\n  }\n  if (bit_index >= bit_size_) {\n    return -1;\n  }\n  // Find the first non-zero byte 
within the word.\n  while ((word & 0xff) == 0) {\n    word >>= 8;\n    bit_index += 8;\n  }\n  return bit_index + lsb_index_[word & 0xff];\n}\n\n// Returns the number of set bits in the vector.\nint BitVector::NumSetBits() const {\n  int wordlen = WordLength();\n  int total_bits = 0;\n  for (int w = 0; w < wordlen; ++w) {\n    uint32_t word = array_[w];\n    for (int i = 0; i < 4; ++i) {\n      total_bits += hamming_table_[word & 0xff];\n      word >>= 8;\n    }\n  }\n  return total_bits;\n}\n\n// Logical in-place operations on whole bit vectors. Tries to do something\n// sensible if they aren't the same size, but they should be really.\nvoid BitVector::operator|=(const BitVector &other) {\n  int length = std::min(WordLength(), other.WordLength());\n  for (int w = 0; w < length; ++w) {\n    array_[w] |= other.array_[w];\n  }\n}\nvoid BitVector::operator&=(const BitVector &other) {\n  int length = std::min(WordLength(), other.WordLength());\n  for (int w = 0; w < length; ++w) {\n    array_[w] &= other.array_[w];\n  }\n  for (int w = WordLength() - 1; w >= length; --w) {\n    array_[w] = 0;\n  }\n}\nvoid BitVector::operator^=(const BitVector &other) {\n  int length = std::min(WordLength(), other.WordLength());\n  for (int w = 0; w < length; ++w) {\n    array_[w] ^= other.array_[w];\n  }\n}\n// Set subtraction *this = v1 - v2.\nvoid BitVector::SetSubtract(const BitVector &v1, const BitVector &v2) {\n  Alloc(v1.size());\n  int length = std::min(v1.WordLength(), v2.WordLength());\n  for (int w = 0; w < length; ++w) {\n    array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]);\n  }\n  for (int w = WordLength() - 1; w >= length; --w) {\n    array_[w] = v1.array_[w];\n  }\n}\n\n// Allocates memory for a vector of the given length.\n// Reallocates if the array is a different size, larger or smaller.\nvoid BitVector::Alloc(int length) {\n  int initial_wordlength = WordLength();\n  bit_size_ = length;\n  int new_wordlength = WordLength();\n  if (new_wordlength != 
initial_wordlength) {\n    array_.resize(new_wordlength);\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccutil/bitvector.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        bitvector.h\n// Description: Class replacement for BITVECTOR.\n// Author:      Ray Smith\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_BITVECTOR_H_\n#define TESSERACT_CCUTIL_BITVECTOR_H_\n\n#include <tesseract/export.h>\n\n#include <cassert>\n#include <cstdint> // for uint8_t\n#include <cstdio>\n#include <vector>  // for std::vector\n\nnamespace tesseract {\n\n// Trivial class to encapsulate a fixed-length array of bits, with\n// Serialize/DeSerialize. 
Replaces the old macros.\nclass TESS_API BitVector {\npublic:\n  // Fast lookup table to get the first least significant set bit in a byte.\n  // For zero, the table has 255, but since it is a special case, most code\n  // that uses this table will check for zero before looking up lsb_index_.\n  static const uint8_t lsb_index_[256];\n  // Fast lookup table to get the residual bits after zeroing the least\n  // significant set bit in a byte.\n  static const uint8_t lsb_eroded_[256];\n  // Fast lookup table to give the number of set bits in a byte.\n  static const int hamming_table_[256];\n\n  BitVector() = default;\n  // Initializes the array to length * false.\n  explicit BitVector(int length) : bit_size_(length), array_(WordLength()) {\n  }\n  BitVector(const BitVector &src) : bit_size_(src.bit_size_), array_(src.array_) {\n  }\n  BitVector &operator=(const BitVector &src);\n  ~BitVector() = default;\n\n  // Initializes the array to length * false.\n  void Init(int length);\n\n  int empty() const {\n    return bit_size_ == 0;\n  }\n\n  // Returns the number of bits that are accessible in the vector.\n  int size() const {\n    return bit_size_;\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp);\n\n  void SetAllFalse();\n  void SetAllTrue();\n\n  // Accessors to set/reset/get bits.\n  // The range of index is [0, size()-1].\n  // There is debug-only bounds checking.\n  void SetBit(int index) {\n    array_[WordIndex(index)] |= BitMask(index);\n  }\n  void ResetBit(int index) {\n    array_[WordIndex(index)] &= ~BitMask(index);\n  }\n  void SetValue(int index, bool value) {\n    if (value) {\n      SetBit(index);\n    } else {\n      ResetBit(index);\n    }\n  }\n  bool At(int index) const {\n    return (array_[WordIndex(index)] & BitMask(index)) != 0;\n  }\n  bool operator[](int index) const {\n    return (array_[WordIndex(index)] & BitMask(index)) != 0;\n  }\n\n  // Returns the index of the next set bit after the given index.\n  // Useful for quickly iterating through the set bits in a sparse vector.\n  int NextSetBit(int prev_bit) const;\n\n  // Returns the number of set bits in the vector.\n  int NumSetBits() const;\n\n  // Logical in-place operations on whole bit vectors. 
Tries to do something\n  // sensible if they aren't the same size, but they should be really.\n  void operator|=(const BitVector &other);\n  void operator&=(const BitVector &other);\n  void operator^=(const BitVector &other);\n  // Set subtraction *this = v1 - v2.\n  void SetSubtract(const BitVector &v1, const BitVector &v2);\n\nprivate:\n  // Allocates memory for a vector of the given length.\n  void Alloc(int length);\n\n  // Computes the index to array_ for the given index, with debug range\n  // checking.\n  int WordIndex(int index) const {\n    assert(0 <= index && index < bit_size_);\n    return index / kBitFactor;\n  }\n  // Returns a mask to select the appropriate bit for the given index.\n  uint32_t BitMask(int index) const {\n    return 1 << (index & (kBitFactor - 1));\n  }\n  // Returns the number of array elements needed to represent the current\n  // bit_size_.\n  int WordLength() const {\n    return (bit_size_ + kBitFactor - 1) / kBitFactor;\n  }\n  // Returns the number of bytes consumed by the array_.\n  int ByteLength() const {\n    return WordLength() * sizeof(array_[0]);\n  }\n\n  // Number of bits in this BitVector.\n  int32_t bit_size_ = 0;\n  // Array of words used to pack the bits.\n  // Bits are stored little-endian by uint32_t word, ie by word first and then\n  // starting with the least significant bit in each word.\n  std::vector<uint32_t> array_;\n  // Number of bits in an array_ element.\n  static const int kBitFactor = sizeof(array_[0]) * 8;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCUTIL_BITVECTOR_H_\n"
  },
  {
    "path": "src/ccutil/ccutil.cpp",
    "content": "// Copyright 2008 Google Inc. All Rights Reserved.\n// Author: scharron@google.com (Samuel Charron)\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"ccutil.h\"\n#include \"tprintf.h\"  // for tprintf\n\n#include <cstdlib>\n#include <cstring>    // for std::strrchrA\n#include <filesystem> // for std::filesystem\n\nnamespace tesseract {\n\nCCUtil::CCUtil()\n    : params_()\n      , INT_INIT_MEMBER(ambigs_debug_level, 0, \"Debug level for unichar ambiguities\", &params_)\n      , BOOL_MEMBER(use_ambigs_for_adaption, false,\n                  \"Use ambigs for deciding\"\n                  \" whether to adapt to a character\",\n                  &params_) {}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nCCUtil::~CCUtil() = default;\n\n/**\n * @brief CCUtil::main_setup - set location of tessdata and name of image\n *\n * @param argv0 - paths to the directory with language files and config files.\n * An actual value of argv0 is used if not nullptr, otherwise TESSDATA_PREFIX is\n * used if not nullptr, next try to use compiled in -DTESSDATA_PREFIX. 
If\n * previous is not successful - use current directory.\n * @param basename - name of image\n */\nvoid CCUtil::main_setup(const std::string &argv0, const std::string &basename) {\n  imagebasename = basename; /**< name of image */\n\n  const char *tessdata_prefix = getenv(\"TESSDATA_PREFIX\");\n\n  // Ignore TESSDATA_PREFIX if there is no matching filesystem entry.\n  if (tessdata_prefix != nullptr && !std::filesystem::exists(tessdata_prefix)) {\n    tprintf(\"Warning: TESSDATA_PREFIX %s does not exist, ignore it\\n\", tessdata_prefix);\n    tessdata_prefix = nullptr;\n  }\n\n  if (!argv0.empty()) {\n    /* Use tessdata prefix from the command line. */\n    datadir = argv0;\n  } else if (tessdata_prefix) {\n    /* Use tessdata prefix from the environment. */\n    datadir = tessdata_prefix;\n#if defined(_WIN32)\n  } else if (datadir.empty() || !std::filesystem::exists(datadir)) {\n    /* Look for tessdata in directory of executable. */\n    char path[_MAX_PATH];\n    DWORD length = GetModuleFileName(nullptr, path, sizeof(path));\n    if (length > 0 && length < sizeof(path)) {\n      char *separator = std::strrchr(path, '\\\\');\n      if (separator != nullptr) {\n        *separator = '\\0';\n        std::string subdir = path;\n        subdir += \"/tessdata\";\n        if (std::filesystem::exists(subdir)) {\n          datadir = subdir;\n        }\n      }\n    }\n#endif /* _WIN32 */\n  }\n\n  // datadir may still be empty:\n  if (datadir.empty()) {\n#if defined(TESSDATA_PREFIX)\n    // Use tessdata prefix which was compiled in.\n    datadir = TESSDATA_PREFIX \"/tessdata/\";\n    // Note that some software (for example conda) patches TESSDATA_PREFIX\n    // in the binary, so it might be shorter. 
Recalculate its length.\n    datadir.resize(std::strlen(datadir.c_str()));\n#else\n    datadir = \"./\";\n#endif /* TESSDATA_PREFIX */\n  }\n\n  // check for missing directory separator\n  const char lastchar = datadir.back();\n  if (lastchar != '/' && lastchar != '\\\\') {\n    datadir += '/';\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/ccutil.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ccutil.h\n// Description: ccutil class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_CCUTIL_H_\n#define TESSERACT_CCUTIL_CCUTIL_H_\n\n#ifndef _WIN32\n#  include <pthread.h>\n#  include <semaphore.h>\n#endif\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"ambigs.h\"\n#endif\n#include \"errcode.h\"\n#ifdef _WIN32\n#  include \"host.h\" // windows.h for HANDLE, ...\n#endif\n#include \"params.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\nclass TESS_API CCUtil {\npublic:\n  CCUtil();\n  virtual ~CCUtil();\n\npublic:\n  // Read the arguments and set up the data path.\n  void main_setup(const std::string &argv0,   // program name\n                  const std::string &basename // name of image\n  );\n  ParamsVectors *params() {\n    return &params_;\n  }\n\n  std::string datadir;       // dir for data files\n  std::string imagebasename; // name of image\n  std::string lang;\n  std::string language_data_path_prefix;\n  UNICHARSET unicharset;\n#ifndef DISABLED_LEGACY_ENGINE\n  UnicharAmbigs unichar_ambigs;\n#endif\n  std::string imagefile; // image file name\n  std::string directory; // main 
directory\n\nprivate:\n  ParamsVectors params_;\n\npublic:\n  // Member parameters.\n  // These have to be declared and initialized after params_ member, since\n  // params_ should be initialized before parameters are added to it.\n  INT_VAR_H(ambigs_debug_level);\n  BOOL_VAR_H(use_ambigs_for_adaption);\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_CCUTIL_H_\n"
  },
  {
    "path": "src/ccutil/clst.h",
    "content": "/**********************************************************************\n * File:        clst.h  (Formerly clist.h)\n * Description: CONS cell list module include file.\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef CLST_H\n#define CLST_H\n\n#include \"lsterr.h\"\n#include \"serialis.h\"\n\n#include <algorithm>\n#include <cstdio>\n\nnamespace tesseract {\n\n/**********************************************************************\n * CLASS - CLIST\n *\n * Generic list class for singly linked CONS cell lists\n **********************************************************************/\n\ntemplate <typename T>\nclass ConsList {\n  friend class Link;\n\npublic:\n  /**********************************************************************\n   *              CLASS - Link\n   *\n   *              Generic link class for singly linked CONS cell lists\n   *\n   *  Note:  No destructor - elements are assumed to be destroyed EITHER after\n   *  they have been extracted from a list OR by the ConsList destructor which\n   *  walks the list.\n   **********************************************************************/\n  struct Link {\n    Link *next{};\n    T *data{};\n\n    Link() = default;\n    Link(const Link &) = delete;\n    void operator=(const Link &) = delete;\n  };\n\n  
/***********************************************************************\n   *              CLASS - Iterator\n   *\n   *              Generic iterator class for singly linked lists with embedded\n   *links\n   **********************************************************************/\n  class Iterator {\n    ConsList *list;                  // List being iterated\n    Link *prev;             // prev element\n    Link *current;          // current element\n    Link *next;             // next element\n    Link *cycle_pt;         // point we are cycling the list to.\n    bool ex_current_was_last;     // current extracted was end of list\n    bool ex_current_was_cycle_pt; // current extracted was cycle point\n    bool started_cycling;         // Have we moved off the start?\n\n    /***********************************************************************\n     *              Iterator::extract_sublist()\n     *\n     *  This is a private member, used only by ConsList::assign_to_sublist.\n     *  Given another iterator for the same list, extract the links from THIS to\n     *  OTHER inclusive, link them into a new circular list, and return a\n     *  pointer to the last element.\n     *  (Can't inline this function because it contains a loop)\n     **********************************************************************/\n    Link *extract_sublist(  // from this current\n      Iterator *other_it) {              // to other current\n      Iterator temp_it = *this;\n\n      constexpr ERRCODE BAD_SUBLIST(\"Can't find sublist end point in original list\");\n#ifndef NDEBUG\n      constexpr ERRCODE BAD_EXTRACTION_PTS(\"Can't extract sublist from points on different lists\");\n      constexpr ERRCODE DONT_EXTRACT_DELETED(\"Can't extract a sublist marked by deleted points\");\n\n      if (list != other_it->list)\n        BAD_EXTRACTION_PTS.error(\"Iterator.extract_sublist\", ABORT);\n      if (list->empty())\n        EMPTY_LIST.error(\"Iterator::extract_sublist\", ABORT);\n\n      if 
(!current || !other_it->current)\n        DONT_EXTRACT_DELETED.error(\"Iterator.extract_sublist\", ABORT);\n#endif\n\n      ex_current_was_last = other_it->ex_current_was_last = false;\n      ex_current_was_cycle_pt = false;\n      other_it->ex_current_was_cycle_pt = false;\n\n      temp_it.mark_cycle_pt();\n      do {                         // walk sublist\n        if (temp_it.cycled_list()) { // can't find end pt\n          BAD_SUBLIST.error(\"Iterator.extract_sublist\", ABORT);\n        }\n\n        if (temp_it.at_last()) {\n          list->last = prev;\n          ex_current_was_last = other_it->ex_current_was_last = true;\n        }\n\n        if (temp_it.current == cycle_pt) {\n          ex_current_was_cycle_pt = true;\n        }\n\n        if (temp_it.current == other_it->cycle_pt) {\n          other_it->ex_current_was_cycle_pt = true;\n        }\n\n        temp_it.forward();\n      } while (temp_it.prev != other_it->current);\n\n      // circularise sublist\n      other_it->current->next = current;\n      auto end_of_new_list = other_it->current;\n\n      // sublist = whole list\n      if (prev == other_it->current) {\n        list->last = nullptr;\n        prev = current = next = nullptr;\n        other_it->prev = other_it->current = other_it->next = nullptr;\n      } else {\n        prev->next = other_it->next;\n        current = other_it->current = nullptr;\n        next = other_it->next;\n        other_it->prev = prev;\n      }\n      return end_of_new_list;\n    }\n\n  public:\n    Iterator() { // constructor\n      list = nullptr;\n    } // unassigned list\n\n  /***********************************************************************\n   *              Iterator::Iterator\n   *\n   *  CONSTRUCTOR - set iterator to specified list;\n   **********************************************************************/\n    Iterator( // constructor\n      ConsList *list_to_iterate) {\n      set_to_list(list_to_iterate);\n    }\n\n    
/***********************************************************************\n     *              Iterator::set_to_list\n     *\n     *  (Re-)initialise the iterator to point to the start of the list_to_iterate\n     *  over.\n     **********************************************************************/\n    void set_to_list( // change list\n      ConsList *list_to_iterate) {\n      list = list_to_iterate;\n      prev = list->last;\n      current = list->First();\n      next = current != nullptr ? current->next : nullptr;\n      cycle_pt = nullptr; // await explicit set\n      started_cycling = false;\n      ex_current_was_last = false;\n      ex_current_was_cycle_pt = false;\n    }\n\n    /***********************************************************************\n     *              Iterator::add_after_then_move\n     *\n     *  Add a new element to the list after the current element and move the\n     *  iterator to the new element.\n     **********************************************************************/\n    void add_after_then_move( // add after current &\n      T *new_data) {\n#ifndef NDEBUG\n      if (!new_data) {\n        BAD_PARAMETER.error(\"Iterator::add_after_then_move\", ABORT, \"new_data is nullptr\");\n      }\n#endif\n\n      auto new_element = new Link;\n      new_element->data = new_data;\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n      } else {\n        new_element->next = next;\n\n        if (current) { // not extracted\n          current->next = new_element;\n          prev = current;\n          if (current == list->last) {\n            list->last = new_element;\n          }\n        } else { // current extracted\n          prev->next = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n          if (ex_current_was_cycle_pt) {\n            cycle_pt = new_element;\n          }\n        }\n      
}\n      current = new_element;\n    }      // move to new\n\n    /***********************************************************************\n     *              Iterator::add_after_stay_put\n     *\n     *  Add a new element to the list after the current element but do not move\n     *  the iterator to the new element.\n     **********************************************************************/\n    void add_after_stay_put( // add after current &\n      T *new_data) {\n#ifndef NDEBUG\n      if (!new_data) {\n        BAD_PARAMETER.error(\"Iterator::add_after_stay_put\", ABORT, \"new_data is nullptr\");\n      }\n#endif\n\n      auto new_element = new Link;\n      new_element->data = new_data;\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n        ex_current_was_last = false;\n        current = nullptr;\n      } else {\n        new_element->next = next;\n\n        if (current) { // not extracted\n          current->next = new_element;\n          if (prev == current) {\n            prev = new_element;\n          }\n          if (current == list->last) {\n            list->last = new_element;\n          }\n        } else { // current extracted\n          prev->next = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n            ex_current_was_last = false;\n          }\n        }\n        next = new_element;\n      }\n    }     // stay at current\n\n    /***********************************************************************\n     *              Iterator::add_before_then_move\n     *\n     *  Add a new element to the list before the current element and move the\n     *  iterator to the new element.\n     **********************************************************************/\n    void add_before_then_move( // add before current &\n      T *new_data) {\n#ifndef NDEBUG\n      if (!new_data) {\n        
BAD_PARAMETER.error(\"Iterator::add_before_then_move\", ABORT, \"new_data is nullptr\");\n      }\n#endif\n\n      auto new_element = new Link;\n      new_element->data = new_data;\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n      } else {\n        prev->next = new_element;\n        if (current) { // not extracted\n          new_element->next = current;\n          next = current;\n        } else { // current extracted\n          new_element->next = next;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n          if (ex_current_was_cycle_pt) {\n            cycle_pt = new_element;\n          }\n        }\n      }\n      current = new_element;\n    }       // move to new\n\n    /***********************************************************************\n     *              Iterator::add_before_stay_put\n     *\n     *  Add a new element to the list before the current element but don't move the\n     *  iterator to the new element.\n     **********************************************************************/\n    void add_before_stay_put( // add before current &\n      T *new_data) {\n#ifndef NDEBUG\n      if (!new_data) {\n        BAD_PARAMETER.error(\"Iterator::add_before_stay_put\", ABORT, \"new_data is nullptr\");\n      }\n#endif\n\n      auto new_element = new Link;\n      new_element->data = new_data;\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n        ex_current_was_last = true;\n        current = nullptr;\n      } else {\n        prev->next = new_element;\n        if (current) { // not extracted\n          new_element->next = current;\n          if (next == current) {\n            next = new_element;\n          }\n        } else { // current extracted\n          new_element->next = next;\n          if (ex_current_was_last) 
{\n            list->last = new_element;\n          }\n        }\n        prev = new_element;\n      }\n    }      // stay at current\n\n    /***********************************************************************\n     *              Iterator::add_list_after\n     *\n     *  Insert another list to this list after the current element but don't move\n     *the\n     *  iterator.\n     **********************************************************************/\n    void add_list_after(     // add a list &\n      ConsList *list_to_add) {\n      if (!list_to_add->empty()) {\n        if (list->empty()) {\n          list->last = list_to_add->last;\n          prev = list->last;\n          next = list->First();\n          ex_current_was_last = true;\n          current = nullptr;\n        } else {\n          if (current) { // not extracted\n            current->next = list_to_add->First();\n            if (current == list->last) {\n              list->last = list_to_add->last;\n            }\n            list_to_add->last->next = next;\n            next = current->next;\n          } else { // current extracted\n            prev->next = list_to_add->First();\n            if (ex_current_was_last) {\n              list->last = list_to_add->last;\n              ex_current_was_last = false;\n            }\n            list_to_add->last->next = next;\n            next = prev->next;\n          }\n        }\n        list_to_add->last = nullptr;\n      }\n    } // stay at current\n\n    /***********************************************************************\n     *              Iterator::add_list_before\n     *\n     *  Insert another list to this list before the current element. 
Move the\n     *  iterator to the start of the inserted elements\n     *  iterator.\n     **********************************************************************/\n    void add_list_before(    // add a list &\n      ConsList *list_to_add) {\n      if (!list_to_add->empty()) {\n        if (list->empty()) {\n          list->last = list_to_add->last;\n          prev = list->last;\n          current = list->First();\n          next = current->next;\n          ex_current_was_last = false;\n        } else {\n          prev->next = list_to_add->First();\n          if (current) { // not extracted\n            list_to_add->last->next = current;\n          } else { // current extracted\n            list_to_add->last->next = next;\n            if (ex_current_was_last) {\n              list->last = list_to_add->last;\n            }\n            if (ex_current_was_cycle_pt) {\n              cycle_pt = prev->next;\n            }\n          }\n          current = prev->next;\n          next = current->next;\n        }\n        list_to_add->last = nullptr;\n      }\n    } // move to it 1st item\n\n    T *data() { // get current data\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"Iterator::data\", ABORT);\n      }\n#endif\n      return current->data;\n    }\n\n    /***********************************************************************\n     *              Iterator::data_relative\n     *\n     *  Return the data pointer to the element \"offset\" elements from current.\n     *  \"offset\" must not be less than -1.\n     *  (This function can't be INLINEd because it contains a loop)\n     **********************************************************************/\n    T *data_relative(  // get data + or - ...\n      int8_t offset) {                 // offset from current\n      Link *ptr;\n\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"Iterator::data_relative\", ABORT);\n      if (list->empty())\n        EMPTY_LIST.error(\"Iterator::data_relative\", ABORT);\n    
  if (offset < -1)\n        BAD_PARAMETER.error(\"Iterator::data_relative\", ABORT, \"offset < -l\");\n#endif\n\n      if (offset == -1) {\n        ptr = prev;\n      } else {\n        for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) {\n        }\n      }\n\n      return ptr->data;\n    }\n\n    /***********************************************************************\n     *              Iterator::forward\n     *\n     *  Move the iterator to the next element of the list.\n     *  REMEMBER: ALL LISTS ARE CIRCULAR.\n     **********************************************************************/\n    T *forward() {\n      if (list->empty()) {\n        return nullptr;\n      }\n\n      if (current) { // not removed so\n        // set previous\n        prev = current;\n        started_cycling = true;\n        // In case next is deleted by another iterator, get next from current.\n        current = current->next;\n      } else {\n        if (ex_current_was_cycle_pt) {\n          cycle_pt = next;\n        }\n        current = next;\n      }\n\n      next = current->next;\n      return current->data;\n    }\n\n    /***********************************************************************\n     *              Iterator::extract\n     *\n     *  Do extraction by removing current from the list, deleting the cons cell\n     *  and returning the data to the caller, but NOT updating the iterator.  (So\n     *  that any calling loop can do this.)  The iterator's current points to\n     *  nullptr.  
If the data is to be deleted, this is the callers responsibility.\n     **********************************************************************/\n    T *extract() {\n#ifndef NDEBUG\n      if (!current) { // list empty or\n        // element extracted\n        NULL_CURRENT.error(\"Iterator::extract\", ABORT);\n      }\n#endif\n\n      if (list->singleton()) {\n        // Special case where we do need to change the iterator.\n        prev = next = list->last = nullptr;\n      } else {\n        prev->next = next; // remove from list\n\n        if (current == list->last) {\n          list->last = prev;\n          ex_current_was_last = true;\n        } else {\n          ex_current_was_last = false;\n        }\n      }\n      // Always set ex_current_was_cycle_pt so an add/forward will work in a loop.\n      ex_current_was_cycle_pt = (current == cycle_pt);\n      auto extracted_data = current->data;\n      delete (current); // destroy CONS cell\n      current = nullptr;\n      return extracted_data;\n    } // remove from list\n\n    /***********************************************************************\n     *              Iterator::move_to_first()\n     *\n     *  Move current so that it is set to the start of the list.\n     *  Return data just in case anyone wants it.\n     **********************************************************************/\n    T *move_to_first() {\n      current = list->First();\n      prev = list->last;\n      next = current != nullptr ? current->next : nullptr;\n      return current != nullptr ? 
current->data : nullptr;\n    } // go to start of list\n\n    /***********************************************************************\n     *              Iterator::move_to_last()\n     *\n     *  Move current so that it is set to the end of the list.\n     *  Return data just in case anyone wants it.\n     *  (This function can't be INLINEd because it contains a loop)\n     **********************************************************************/\n    T *move_to_last() {\n      while (current != list->last) {\n        forward();\n      }\n\n      if (current == nullptr) {\n        return nullptr;\n      } else {\n        return current->data;\n      }\n    }\n\n    /***********************************************************************\n     *              Iterator::mark_cycle_pt()\n     *\n     *  Remember the current location so that we can tell whether we've returned\n     *  to this point later.\n     *\n     *  If the current point is deleted either now, or in the future, the cycle\n     *  point will be set to the next item which is set to current.  
This could be\n     *  by a forward, add_after_then_move or add_after_then_move.\n     **********************************************************************/\n    void mark_cycle_pt() {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"Iterator::mark_cycle_pt\", ABORT);\n      }\n#endif\n\n      if (current) {\n        cycle_pt = current;\n      } else {\n        ex_current_was_cycle_pt = true;\n      }\n      started_cycling = false;\n    } // remember current\n\n    bool empty() const { // is list empty?\n      return list->empty();\n    }\n\n    bool current_extracted() const { // current extracted?\n      return !current;\n    }\n\n    /***********************************************************************\n     *              Iterator::at_first()\n     *\n     *  Are we at the start of the list?\n     *\n     **********************************************************************/\n    bool at_first() const {\n      // we're at a deleted\n      return ((list->empty()) || (current == list->First()) ||\n        ((current == nullptr) && (prev == list->last) && // NON-last pt between\n          !ex_current_was_last));                         // first and last\n    } // Current is first?\n\n    /***********************************************************************\n     *              Iterator::at_last()\n     *\n     *  Are we at the end of the list?\n     *\n     **********************************************************************/\n    bool at_last() const {\n      // we're at a deleted\n      return ((list->empty()) || (current == list->last) ||\n        ((current == nullptr) && (prev == list->last) && // last point between\n          ex_current_was_last));                          // first and last\n    } // Current is last?\n\n    /***********************************************************************\n     *              Iterator::cycled_list()\n     *\n     *  Have we returned to the cycle_pt since it was set?\n     *\n     
**********************************************************************/\n    bool cycled_list() const { // Completed a cycle?\n      return ((list->empty()) || ((current == cycle_pt) && started_cycling));\n    }\n\n    /***********************************************************************\n     *              Iterator::add_to_end\n     *\n     *  Add a new element to the end of the list without moving the iterator.\n     *  This is provided because a single linked list cannot move to the last as\n     *  the iterator couldn't set its prev pointer.  Adding to the end is\n     *  essential for implementing\n                  queues.\n    **********************************************************************/\n    void add_to_end(  // element to add\n      T *new_data) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"Iterator::add_to_end\", ABORT);\n      }\n      if (!new_data) {\n        BAD_PARAMETER.error(\"Iterator::add_to_end\", ABORT, \"new_data is nullptr\");\n      }\n#endif\n\n      if (this->at_last()) {\n        this->add_after_stay_put(new_data);\n      } else {\n        if (this->at_first()) {\n          this->add_before_stay_put(new_data);\n          list->last = prev;\n        } else { // Iteratr is elsewhere\n          auto new_element = new Link;\n          new_element->data = new_data;\n\n          new_element->next = list->last->next;\n          list->last->next = new_element;\n          list->last = new_element;\n        }\n      }\n    }\n\n    /***********************************************************************\n     *              Iterator::exchange()\n     *\n     *  Given another iterator, whose current element is a different element on\n     *  the same list list OR an element of another list, exchange the two current\n     *  elements.  
On return, each iterator points to the element which was the\n     *  other iterators current on entry.\n     *  (This function hasn't been in-lined because its a bit big!)\n     **********************************************************************/\n    void exchange(                 // positions of 2 links\n      Iterator *other_it) { // other iterator\n      constexpr ERRCODE DONT_EXCHANGE_DELETED(\"Can't exchange deleted elements of lists\");\n\n      /* Do nothing if either list is empty or if both iterators reference the same\n    link */\n\n      if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) {\n        return;\n      }\n\n      /* Error if either current element is deleted */\n\n      if (!current || !other_it->current) {\n        DONT_EXCHANGE_DELETED.error(\"Iterator.exchange\", ABORT);\n      }\n\n      /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements\n    (other before this); non-doubleton adjacent elements (this before other);\n    non-adjacent elements. 
*/\n\n    // adjacent links\n      if ((next == other_it->current) || (other_it->next == current)) {\n        // doubleton list\n        if ((next == other_it->current) && (other_it->next == current)) {\n          prev = next = current;\n          other_it->prev = other_it->next = other_it->current;\n        } else { // non-doubleton with\n          // adjacent links\n          // other before this\n          if (other_it->next == current) {\n            other_it->prev->next = current;\n            other_it->current->next = next;\n            current->next = other_it->current;\n            other_it->next = other_it->current;\n            prev = current;\n          } else { // this before other\n            prev->next = other_it->current;\n            current->next = other_it->next;\n            other_it->current->next = current;\n            next = current;\n            other_it->prev = other_it->current;\n          }\n        }\n      } else { // no overlap\n        prev->next = other_it->current;\n        current->next = other_it->next;\n        other_it->prev->next = current;\n        other_it->current->next = next;\n      }\n\n      /* update end of list pointer when necessary (remember that the 2 iterators\n      may iterate over different lists!) 
*/\n\n      if (list->last == current) {\n        list->last = other_it->current;\n      }\n      if (other_it->list->last == other_it->current) {\n        other_it->list->last = current;\n      }\n\n      if (current == cycle_pt) {\n        cycle_pt = other_it->cycle_pt;\n      }\n      if (other_it->current == other_it->cycle_pt) {\n        other_it->cycle_pt = cycle_pt;\n      }\n\n      /* The actual exchange - in all cases*/\n\n      auto old_current = current;\n      current = other_it->current;\n      other_it->current = old_current;\n    }\n\n    /***********************************************************************\n     *              Iterator::length()\n     *\n     *  Return the length of the list\n     *\n     **********************************************************************/\n    int32_t length() const {\n      return list->length();\n    }\n\n    /***********************************************************************\n     *              Iterator::sort()\n     *\n     *  Sort the elements of the list, then reposition at the start.\n     *\n     **********************************************************************/\n    void sort(     // sort elements\n      int comparator(               // comparison routine\n        const T *, const T *)) {\n      list->sort(comparator);\n      move_to_first();\n    }\n  };\n  using ITERATOR = Iterator; // compat\n\nprivate:\n  Link *last = nullptr; // End of list\n\n  //(Points to head)\n  Link *First() { // return first\n    return last != nullptr ? last->next : nullptr;\n  }\n\n  const Link *First() const { // return first\n    return last != nullptr ? 
last->next : nullptr;\n  }\n\npublic:\n  ~ConsList() { // destructor\n    shallow_clear();\n  }\n\n  /***********************************************************************\n   *              ConsList::internal_deep_clear\n   *\n   *  Used by the \"deep_clear\" member function of derived list\n   *  classes to destroy all the elements on the list.\n   *  The calling function passes a \"zapper\" function which can be called to\n   *  delete each data element of the list, regardless of its class.  This\n   *  technique permits a generic clear function to destroy elements of\n   *  different derived types correctly, without requiring virtual functions and\n   *  the consequential memory overhead.\n   **********************************************************************/\n  void internal_deep_clear() {    // ptr to zapper functn\n    if (!empty()) {\n      auto ptr = last->next;     // set to first\n      last->next = nullptr; // break circle\n      last = nullptr;       // set list empty\n      while (ptr) {\n        auto next = ptr->next;\n        delete ptr->data;\n        delete (ptr);\n        ptr = next;\n      }\n    }\n  }\n  void deep_clear() {\n    internal_deep_clear();\n  }\n\n  /***********************************************************************\n   *              ConsList::shallow_clear\n   *\n   *  Used by the destructor and the \"shallow_clear\" member function of derived\n   *  list classes to destroy the list.\n   *  The data elements are NOT destroyed.\n   *\n   **********************************************************************/\n  void shallow_clear() { // destroy all links\n    if (!empty()) {\n      auto ptr = last->next;     // set to first\n      last->next = nullptr; // break circle\n      last = nullptr;       // set list empty\n      while (ptr) {\n        auto next = ptr->next;\n        delete (ptr);\n        ptr = next;\n      }\n    }\n  }\n\n  bool empty() const { // is list empty?\n    return !last;\n  }\n\n  bool singleton() 
const {\n    return last != nullptr ? (last == last->next) : false;\n  }\n\n  void shallow_copy(      // dangerous!!\n    ConsList *from_list) { // beware destructors!!\n    last = from_list->last;\n  }\n\n  /***********************************************************************\n   *              ConsList::assign_to_sublist\n   *\n   *  The list is set to a sublist of another list.  \"This\" list must be empty\n   *  before this function is invoked.  The two iterators passed must refer to\n   *  the same list, different from \"this\" one.  The sublist removed is the\n   *  inclusive list from start_it's current position to end_it's current\n   *  position.  If this range passes over the end of the source list then the\n   *  source list has its end set to the previous element of start_it.  The\n   *  extracted sublist is unaffected by the end point of the source list, its\n   *  end point is always the end_it position.\n   **********************************************************************/\n  void assign_to_sublist(  // to this list\n    Iterator *start_it,  // from list start\n    Iterator *end_it) {  // from list end\n    constexpr ERRCODE LIST_NOT_EMPTY(\"Destination list must be empty before extracting a sublist\");\n\n    if (!empty()) {\n      LIST_NOT_EMPTY.error(\"ConsList.assign_to_sublist\", ABORT);\n    }\n\n    last = start_it->extract_sublist(end_it);\n  }\n\n  int32_t length() const { //# elements in list\n    int32_t count = 0;\n    if (last != nullptr) {\n      count = 1;\n      for (auto it = last->next; it != last; it = it->next) {\n        count++;\n      }\n    }\n    return count;\n  }\n\n  /***********************************************************************\n   *              ConsList::sort\n   *\n   *  Sort elements on list\n   **********************************************************************/\n  void sort(          // sort elements\n    int comparator( // comparison routine\n      const T *, const T *)) {\n    // Allocate an 
array of pointers, one per list element.\n    auto count = length();\n    if (count > 0) {\n      // ptr array to sort\n      std::vector<T *> base;\n      base.reserve(count);\n\n      Iterator it(this);\n\n      // Extract all elements, putting the pointers in the array.\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        base.push_back(it.extract());\n      }\n\n      // Sort the pointer array.\n      std::sort(base.begin(), base.end(),\n        // all current comparators return -1,0,1, so we handle this correctly for std::sort\n        [&](auto &&l, auto &&r) {return comparator(l, r) < 0; });\n\n      // Rebuild the list from the sorted pointers.\n      for (auto current : base) {\n        it.add_to_end(current);\n      }\n    }\n  }\n\n  // Assuming list has been sorted already, insert new_data to\n  // keep the list sorted according to the same comparison function.\n  // Comparison function is the same as used by sort, i.e. uses double\n  // indirection. Time is O(1) to add to beginning or end.\n  // Time is linear to add pre-sorted items to an empty list.\n  // If unique, then don't add duplicate entries.\n  // Returns true if the element was added to the list.\n  bool add_sorted(int comparator(const T *, const T *), bool unique, T *new_data) {\n    // Check for adding at the end.\n    if (last == nullptr || comparator(last->data, new_data) < 0) {\n      auto *new_element = new Link;\n      new_element->data = new_data;\n      if (last == nullptr) {\n        new_element->next = new_element;\n      } else {\n        new_element->next = last->next;\n        last->next = new_element;\n      }\n      last = new_element;\n      return true;\n    } else if (!unique || last->data != new_data) {\n      // Need to use an iterator.\n      Iterator it(this);\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        auto data = it.data();\n        if (data == new_data && unique) {\n          return false;\n        }\n        if 
(comparator(data, new_data) > 0) {\n          break;\n        }\n      }\n      if (it.cycled_list()) {\n        it.add_to_end(new_data);\n      } else {\n        it.add_before_then_move(new_data);\n      }\n      return true;\n    }\n    return false;\n  }\n\n  // Assuming that the minuend and subtrahend are already sorted with\n  // the same comparison function, shallow clears this and then copies\n  // the set difference minuend - subtrahend to this, being the elements\n  // of minuend that do not compare equal to anything in subtrahend.\n  // If unique is true, any duplicates in minuend are also eliminated.\n  void set_subtract(int comparator(const T *, const T *), bool unique, ConsList *minuend,\n    ConsList *subtrahend) {\n    shallow_clear();\n    Iterator m_it(minuend);\n    Iterator s_it(subtrahend);\n    // Since both lists are sorted, finding the subtras that are not\n    // minus is a case of a parallel iteration.\n    for (m_it.mark_cycle_pt(); !m_it.cycled_list(); m_it.forward()) {\n      auto minu = m_it.data();\n      T *subtra = nullptr;\n      if (!s_it.empty()) {\n        subtra = s_it.data();\n        while (!s_it.at_last() && comparator(subtra, minu) < 0) {\n          s_it.forward();\n          subtra = s_it.data();\n        }\n      }\n      if (subtra == nullptr || comparator(subtra, minu) != 0) {\n        add_sorted(comparator, unique, minu);\n      }\n    }\n  }\n};\n\n#define CLISTIZEH(T)                          \\\n  class T##_CLIST : public ConsList<T> {      \\\n    using ConsList<T>::ConsList;              \\\n  };                                          \\\n  using T##_C_IT = ConsList<T>::Iterator;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccutil/elst.h",
    "content": "/**********************************************************************\n * File:        elst.h  (Formerly elist.h)\n * Description: Embedded list module include file.\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef ELST_H\n#define ELST_H\n\n#include \"lsterr.h\"\n#include \"serialis.h\"\n\n#include <algorithm>\n#include <cstdio>\n\nnamespace tesseract {\n\n/**********************************************************************\nThis module implements list classes and iterators.\nThe following list types and iterators are provided:\n\n  List type        List Class      Iterator Class     Element Class\n  ---------         ----------      --------------      -------------\n\n    Embedded list       ELIST\n              ELIST_ITERATOR\n              ELIST_LINK\n    (Single linked)\n\n    Embedded list       ELIST2\n              ELIST2_ITERATOR\n              ELIST2_LINK\n    (Double linked)\n\n    Cons List           CLIST\n              CLIST_ITERATOR\n              CLIST_LINK\n    (Single linked)\n\nAn embedded list is where the list pointers are provided by a generic class.\nData types to be listed inherit from the generic class.  Data is thus linked\nin only ONE list at any one time.\n\nA cons list has a separate structure for a \"cons cell\".  
This contains the\nlist pointer(s) AND a pointer to the data structure held on the list.  A\nstructure can be on many cons lists at the same time, and the structure does\nnot need to inherit from any generic class in order to be on the list.\n\nThe implementation of lists is very careful about space and speed overheads.\nThis is why many embedded lists are provided. The same concerns mean that\nin-line type coercion is done, rather than use virtual functions.  This is\ncumbersome in that each data type to be listed requires its own iterator and\nlist class - though macros can generate these.  It also prevents heterogeneous\nlists.\n**********************************************************************/\n\n/**********************************************************************\n * CLASS - ELIST\n *\n * Generic list class for singly linked lists with embedded links\n **********************************************************************/\n\ntemplate <typename T>\nclass IntrusiveForwardList {\npublic:\n  /**********************************************************************\n   *                          CLASS - ELIST_LINK\n   *\n   *                          Generic link class for singly linked lists with\n   *embedded links\n   *\n   *  Note:  No destructor - elements are assumed to be destroyed EITHER after\n   *  they have been extracted from a list OR by the IntrusiveForwardList destructor which\n   *  walks the list.\n   **********************************************************************/\n\n  class Link {\n    friend class Iterator;\n    friend class IntrusiveForwardList;\n\n    T *next;\n\n  public:\n    Link() {\n      next = nullptr;\n    }\n    // constructor\n\n    // The special copy constructor is used by lots of classes.\n    Link(const Link &) {\n      next = nullptr;\n    }\n\n    // The special assignment operator is used by lots of classes.\n    void operator=(const Link &) {\n      next = nullptr;\n    }\n  };\n  using LINK = Link; // 
compat\n\n  /***********************************************************************\n   *                          CLASS - ELIST_ITERATOR\n   *\n   *                          Generic iterator class for singly linked lists with\n   *embedded links\n   **********************************************************************/\n\n  class Iterator {\n    friend void IntrusiveForwardList::assign_to_sublist(Iterator *, Iterator *);\n\n    IntrusiveForwardList *list;                  // List being iterated\n    T *prev;             // prev element\n    T *current;          // current element\n    T *next;             // next element\n    T *cycle_pt;         // point we are cycling the list to.\n    bool ex_current_was_last;     // current extracted was end of list\n    bool ex_current_was_cycle_pt; // current extracted was cycle point\n    bool started_cycling;         // Have we moved off the start?\n    /***********************************************************************\n   *              Iterator::extract_sublist()\n   *\n   *  This is a private member, used only by IntrusiveForwardList::assign_to_sublist.\n   *  Given another iterator for the same list, extract the links from THIS to\n   *  OTHER inclusive, link them into a new circular list, and return a\n   *  pointer to the last element.\n   *  (Can't inline this function because it contains a loop)\n   **********************************************************************/\n    T *extract_sublist(   // from this current...\n      Iterator *other_it) {              // to other current\n#ifndef NDEBUG\n      constexpr ERRCODE BAD_EXTRACTION_PTS(\"Can't extract sublist from points on different lists\");\n      constexpr ERRCODE DONT_EXTRACT_DELETED(\"Can't extract a sublist marked by deleted points\");\n#endif\n      constexpr ERRCODE BAD_SUBLIST(\"Can't find sublist end point in original list\");\n\n      Iterator temp_it = *this;\n      T *end_of_new_list;\n\n#ifndef NDEBUG\n      if (!other_it)\n        
BAD_PARAMETER.error(\"ELIST_ITERATOR::extract_sublist\", ABORT, \"other_it nullptr\");\n      if (!list)\n        NO_LIST.error(\"ELIST_ITERATOR::extract_sublist\", ABORT);\n      if (list != other_it->list)\n        BAD_EXTRACTION_PTS.error(\"ELIST_ITERATOR.extract_sublist\", ABORT);\n      if (list->empty())\n        EMPTY_LIST.error(\"ELIST_ITERATOR::extract_sublist\", ABORT);\n\n      if (!current || !other_it->current)\n        DONT_EXTRACT_DELETED.error(\"ELIST_ITERATOR.extract_sublist\", ABORT);\n#endif\n\n      ex_current_was_last = other_it->ex_current_was_last = false;\n      ex_current_was_cycle_pt = false;\n      other_it->ex_current_was_cycle_pt = false;\n\n      temp_it.mark_cycle_pt();\n      do {                         // walk sublist\n        if (temp_it.cycled_list()) { // can't find end pt\n          BAD_SUBLIST.error(\"Iterator.extract_sublist\", ABORT);\n        }\n\n        if (temp_it.at_last()) {\n          list->last = prev;\n          ex_current_was_last = other_it->ex_current_was_last = true;\n        }\n\n        if (temp_it.current == cycle_pt) {\n          ex_current_was_cycle_pt = true;\n        }\n\n        if (temp_it.current == other_it->cycle_pt) {\n          other_it->ex_current_was_cycle_pt = true;\n        }\n\n        temp_it.forward();\n      } while (temp_it.prev != other_it->current);\n\n      // circularise sublist\n      other_it->current->next = current;\n      end_of_new_list = other_it->current;\n\n      // sublist = whole list\n      if (prev == other_it->current) {\n        list->last = nullptr;\n        prev = current = next = nullptr;\n        other_it->prev = other_it->current = other_it->next = nullptr;\n      } else {\n        prev->next = other_it->next;\n        current = other_it->current = nullptr;\n        next = other_it->next;\n        other_it->prev = prev;\n      }\n      return end_of_new_list;\n    } // to other current\n\n  public:\n    Iterator() { // constructor\n      list = nullptr;\n    } // 
unassigned list\n    /***********************************************************************\n   *                          ELIST_ITERATOR::ELIST_ITERATOR\n   *\n   *  CONSTRUCTOR - set iterator to specified list;\n   **********************************************************************/\n    Iterator(IntrusiveForwardList *list_to_iterate) {\n      set_to_list(list_to_iterate);\n    }\n    /***********************************************************************\n   *                          ELIST_ITERATOR::set_to_list\n   *\n   *  (Re-)initialise the iterator to point to the start of the list_to_iterate\n   *  over.\n   **********************************************************************/\n    void set_to_list( // change list\n      IntrusiveForwardList *list_to_iterate) {\n#ifndef NDEBUG\n      if (!list_to_iterate) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::set_to_list\", ABORT, \"list_to_iterate is nullptr\");\n      }\n#endif\n\n      list = list_to_iterate;\n      prev = list->last;\n      current = list->First();\n      next = current ? 
current->next : nullptr;\n      cycle_pt = nullptr; // await explicit set\n      started_cycling = false;\n      ex_current_was_last = false;\n      ex_current_was_cycle_pt = false;\n    }\n    /***********************************************************************\n   *                          ELIST_ITERATOR::add_after_then_move\n   *\n   *  Add a new element to the list after the current element and move the\n   *  iterator to the new element.\n   **********************************************************************/\n    void add_after_then_move(  // add after current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::add_after_then_move\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::add_after_then_move\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST_ITERATOR::add_after_then_move\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n      } else {\n        new_element->next = next;\n\n        if (current) { // not extracted\n          current->next = new_element;\n          prev = current;\n          if (current == list->last) {\n            list->last = new_element;\n          }\n        } else { // current extracted\n          prev->next = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n          if (ex_current_was_cycle_pt) {\n            cycle_pt = new_element;\n          }\n        }\n      }\n      current = new_element;\n    } // move to new\n      /***********************************************************************\n     *                          ELIST_ITERATOR::add_after_stay_put\n     *\n     *  Add a new element to the list after the current element but do not move\n     *  the iterator to the new element.\n 
    **********************************************************************/\n    void add_after_stay_put(   // add after current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::add_after_stay_put\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::add_after_stay_put\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST_ITERATOR::add_after_stay_put\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n        ex_current_was_last = false;\n        current = nullptr;\n      } else {\n        new_element->next = next;\n\n        if (current) { // not extracted\n          current->next = new_element;\n          if (prev == current) {\n            prev = new_element;\n          }\n          if (current == list->last) {\n            list->last = new_element;\n          }\n        } else { // current extracted\n          prev->next = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n            ex_current_was_last = false;\n          }\n        }\n        next = new_element;\n      }\n    } // stay at current\n      /***********************************************************************\n     *                          ELIST_ITERATOR::add_before_then_move\n     *\n     *  Add a new element to the list before the current element and move the\n     *  iterator to the new element.\n     **********************************************************************/\n    void add_before_then_move( // add before current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::add_before_then_move\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::add_before_then_move\", ABORT, 
\"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST_ITERATOR::add_before_then_move\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n      } else {\n        prev->next = new_element;\n        if (current) { // not extracted\n          new_element->next = current;\n          next = current;\n        } else { // current extracted\n          new_element->next = next;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n          if (ex_current_was_cycle_pt) {\n            cycle_pt = new_element;\n          }\n        }\n      }\n      current = new_element;\n    } // move to new\n      /***********************************************************************\n     *                          ELIST_ITERATOR::add_before_stay_put\n     *\n     *  Add a new element to the list before the current element but don't move the\n     *  iterator to the new element.\n     **********************************************************************/\n    void add_before_stay_put(  // add before current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::add_before_stay_put\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::add_before_stay_put\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST_ITERATOR::add_before_stay_put\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n        ex_current_was_last = true;\n        current = nullptr;\n      } else {\n        prev->next = new_element;\n        if (current) { // not extracted\n          new_element->next = current;\n          if (next == current) {\n    
        next = new_element;\n          }\n        } else { // current extracted\n          new_element->next = next;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n        }\n        prev = new_element;\n      }\n    } // stay at current\n      /***********************************************************************\n     *                          ELIST_ITERATOR::add_list_after\n     *\n     *  Insert another list to this list after the current element but don't move\n     *the\n     *  iterator.\n     **********************************************************************/\n    void add_list_after(     // add a list &\n      IntrusiveForwardList *list_to_add) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::add_list_after\", ABORT);\n      }\n      if (!list_to_add) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::add_list_after\", ABORT, \"list_to_add is nullptr\");\n      }\n#endif\n\n      if (!list_to_add->empty()) {\n        if (list->empty()) {\n          list->last = list_to_add->last;\n          prev = list->last;\n          next = list->First();\n          ex_current_was_last = true;\n          current = nullptr;\n        } else {\n          if (current) { // not extracted\n            current->next = list_to_add->First();\n            if (current == list->last) {\n              list->last = list_to_add->last;\n            }\n            list_to_add->last->next = next;\n            next = current->next;\n          } else { // current extracted\n            prev->next = list_to_add->First();\n            if (ex_current_was_last) {\n              list->last = list_to_add->last;\n              ex_current_was_last = false;\n            }\n            list_to_add->last->next = next;\n            next = prev->next;\n          }\n        }\n        list_to_add->last = nullptr;\n      }\n    } // stay at current\n      
/***********************************************************************\n     *                          ELIST_ITERATOR::add_list_before\n     *\n     *  Insert another list to this list before the current element. Move the\n     *  iterator to the start of the inserted elements\n     *  iterator.\n     **********************************************************************/\n    void add_list_before(    // add a list &\n      IntrusiveForwardList *list_to_add) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::add_list_before\", ABORT);\n      }\n      if (!list_to_add) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::add_list_before\", ABORT, \"list_to_add is nullptr\");\n      }\n#endif\n\n      if (!list_to_add->empty()) {\n        if (list->empty()) {\n          list->last = list_to_add->last;\n          prev = list->last;\n          current = list->First();\n          next = current->next;\n          ex_current_was_last = false;\n        } else {\n          prev->next = list_to_add->First();\n          if (current) { // not extracted\n            list_to_add->last->next = current;\n          } else { // current extracted\n            list_to_add->last->next = next;\n            if (ex_current_was_last) {\n              list->last = list_to_add->last;\n            }\n            if (ex_current_was_cycle_pt) {\n              cycle_pt = prev->next;\n            }\n          }\n          current = prev->next;\n          next = current->next;\n        }\n        list_to_add->last = nullptr;\n      }\n    } // move to it 1st item\n\n    T *data() { // get current data\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::data\", ABORT);\n      }\n      if (!current) {\n        NULL_DATA.error(\"ELIST_ITERATOR::data\", ABORT);\n      }\n#endif\n      return current;\n    }\n    /***********************************************************************\n   *              ELIST_ITERATOR::data_relative\n   *\n   *  
Return the data pointer to the element \"offset\" elements from current.\n   *  \"offset\" must not be less than -1.\n   *  (This function can't be INLINEd because it contains a loop)\n   **********************************************************************/\n    T *data_relative( // get data + or - ...\n      int8_t offset) {                       // offset from current\n      T *ptr;\n\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST_ITERATOR::data_relative\", ABORT);\n      if (list->empty())\n        EMPTY_LIST.error(\"ELIST_ITERATOR::data_relative\", ABORT);\n      if (offset < -1)\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::data_relative\", ABORT, \"offset < -l\");\n#endif\n\n      if (offset == -1) {\n        ptr = prev;\n      } else {\n        for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) {\n        }\n      }\n\n#ifndef NDEBUG\n      if (!ptr)\n        NULL_DATA.error(\"ELIST_ITERATOR::data_relative\", ABORT);\n#endif\n\n      return ptr;\n    }        // offset from current\n      /***********************************************************************\n     *              ELIST_ITERATOR::forward\n     *\n     *  Move the iterator to the next element of the list.\n     *  REMEMBER: ALL LISTS ARE CIRCULAR.\n     **********************************************************************/\n    T *forward() {\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST_ITERATOR::forward\", ABORT);\n#endif\n      if (list->empty()) {\n        return nullptr;\n      }\n\n      if (current) { // not removed so\n        // set previous\n        prev = current;\n        started_cycling = true;\n        // In case next is deleted by another iterator, get next from current.\n        current = current->next;\n      } else {\n        if (ex_current_was_cycle_pt) {\n          cycle_pt = next;\n        }\n        current = next;\n      }\n#ifndef NDEBUG\n      if (!current)\n        NULL_DATA.error(\"ELIST_ITERATOR::forward\", 
ABORT);\n#endif\n      next = current->next;\n\n#ifndef NDEBUG\n      if (!next) {\n        NULL_NEXT.error(\"ELIST_ITERATOR::forward\", ABORT,\n          \"This is: %p  Current is: %p\",\n          static_cast<void *>(this),\n          static_cast<void *>(current));\n      }\n#endif\n      return current;\n    } // move to next element\n\n      /***********************************************************************\n     *                          ELIST_ITERATOR::extract\n     *\n     *  Do extraction by removing current from the list, returning it to the\n     *  caller, but NOT updating the iterator.  (So that any calling loop can do\n     *  this.)   The iterator's current points to nullptr.  If the extracted element\n     *  is to be deleted, this is the callers responsibility.\n     **********************************************************************/\n    T *extract() {\n      T *extracted_link;\n\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::extract\", ABORT);\n      }\n      if (!current) { // list empty or\n        // element extracted\n        NULL_CURRENT.error(\"ELIST_ITERATOR::extract\", ABORT);\n      }\n#endif\n\n      if (list->singleton()) {\n        // Special case where we do need to change the iterator.\n        prev = next = list->last = nullptr;\n      } else {\n        prev->next = next; // remove from list\n\n        ex_current_was_last = (current == list->last);\n        if (ex_current_was_last) {\n          list->last = prev;\n        }\n      }\n      // Always set ex_current_was_cycle_pt so an add/forward will work in a loop.\n      ex_current_was_cycle_pt = (current == cycle_pt);\n      extracted_link = current;\n      extracted_link->next = nullptr; // for safety\n      current = nullptr;\n      return extracted_link;\n    }  // remove from list\n      /***********************************************************************\n     *                          ELIST_ITERATOR::move_to_first()\n     *\n     
*  Move current so that it is set to the start of the list.\n     *  Return data just in case anyone wants it.\n     **********************************************************************/\n    T *move_to_first() {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::move_to_first\", ABORT);\n      }\n#endif\n\n      current = list->First();\n      prev = list->last;\n      next = current ? current->next : nullptr;\n      return current;\n    } // go to start of list\n      /***********************************************************************\n     *              ELIST_ITERATOR::move_to_last()\n     *\n     *  Move current so that it is set to the end of the list.\n     *  Return data just in case anyone wants it.\n     *  (This function can't be INLINEd because it contains a loop)\n     **********************************************************************/\n    T *move_to_last() {\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST_ITERATOR::move_to_last\", ABORT);\n#endif\n\n      while (current != list->last) {\n        forward();\n      }\n\n      return current;\n    } // go to end of list\n      /***********************************************************************\n     *                          ELIST_ITERATOR::mark_cycle_pt()\n     *\n     *  Remember the current location so that we can tell whether we've returned\n     *  to this point later.\n     *\n     *  If the current point is deleted either now, or in the future, the cycle\n     *  point will be set to the next item which is set to current.  
This could be\n     *  by a forward, add_after_then_move or add_after_then_move.\n     **********************************************************************/\n    void mark_cycle_pt() {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::mark_cycle_pt\", ABORT);\n      }\n#endif\n\n      if (current) {\n        cycle_pt = current;\n      } else {\n        ex_current_was_cycle_pt = true;\n      }\n      started_cycling = false;\n    } // remember current\n\n    bool empty() const { // is list empty?\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::empty\", ABORT);\n      }\n#endif\n      return list->empty();\n    }\n\n    bool current_extracted() const { // current extracted?\n      return !current;\n    }\n    /***********************************************************************\n   *                          ELIST_ITERATOR::at_first()\n   *\n   *  Are we at the start of the list?\n   *\n   **********************************************************************/\n    bool at_first() const {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::at_first\", ABORT);\n      }\n#endif\n\n      // we're at a deleted\n      return ((list->empty()) || (current == list->First()) ||\n        ((current == nullptr) && (prev == list->last) && // NON-last pt between\n          !ex_current_was_last));                         // first and last\n    } // Current is first?\n      /***********************************************************************\n     *                          ELIST_ITERATOR::at_last()\n     *\n     *  Are we at the end of the list?\n     *\n     **********************************************************************/\n    bool at_last() const {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::at_last\", ABORT);\n      }\n#endif\n\n      // we're at a deleted\n      return ((list->empty()) || (current == list->last) ||\n        ((current == nullptr) && 
(prev == list->last) && // last point between\n          ex_current_was_last));                          // first and last\n    } // Current is last?\n      /***********************************************************************\n     *                          ELIST_ITERATOR::cycled_list()\n     *\n     *  Have we returned to the cycle_pt since it was set?\n     *\n     **********************************************************************/\n    bool cycled_list() const {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::cycled_list\", ABORT);\n      }\n#endif\n\n      return ((list->empty()) || ((current == cycle_pt) && started_cycling));\n    } // Completed a cycle?\n      /***********************************************************************\n     *                          ELIST_ITERATOR::add_to_end\n     *\n     *  Add a new element to the end of the list without moving the iterator.\n     *  This is provided because a single linked list cannot move to the last as\n     *  the iterator couldn't set its prev pointer.  
Adding to the end is\n     *  essential for implementing\n                  queues.\n    **********************************************************************/\n    void add_to_end(           // add at end &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::add_to_end\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::add_to_end\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST_ITERATOR::add_to_end\", ABORT);\n      }\n#endif\n\n      if (this->at_last()) {\n        this->add_after_stay_put(new_element);\n      } else {\n        if (this->at_first()) {\n          this->add_before_stay_put(new_element);\n          list->last = new_element;\n        } else { // Iterator is elsewhere\n          new_element->next = list->last->next;\n          list->last->next = new_element;\n          list->last = new_element;\n        }\n      }\n    } // don't move\n        /***********************************************************************\n     *              ELIST_ITERATOR::exchange()\n     *\n     *  Given another iterator, whose current element is a different element on\n     *  the same list OR an element of another list, exchange the two current\n     *  elements.  
On return, each iterator points to the element which was the\n     *  other iterators current on entry.\n     *  (This function hasn't been in-lined because its a bit big!)\n     **********************************************************************/\n    void exchange(                 // positions of 2 links\n      Iterator *other_it) { // other iterator\n      constexpr ERRCODE DONT_EXCHANGE_DELETED(\"Can't exchange deleted elements of lists\");\n\n      T *old_current;\n\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST_ITERATOR::exchange\", ABORT);\n      if (!other_it)\n        BAD_PARAMETER.error(\"ELIST_ITERATOR::exchange\", ABORT, \"other_it nullptr\");\n      if (!(other_it->list))\n        NO_LIST.error(\"ELIST_ITERATOR::exchange\", ABORT, \"other_it\");\n#endif\n\n      /* Do nothing if either list is empty or if both iterators reference the same\n    link */\n\n      if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) {\n        return;\n      }\n\n      /* Error if either current element is deleted */\n\n      if (!current || !other_it->current) {\n        DONT_EXCHANGE_DELETED.error(\"ELIST_ITERATOR.exchange\", ABORT);\n      }\n\n      /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements\n    (other before this); non-doubleton adjacent elements (this before other);\n    non-adjacent elements. 
*/\n\n    // adjacent links\n      if ((next == other_it->current) || (other_it->next == current)) {\n        // doubleton list\n        if ((next == other_it->current) && (other_it->next == current)) {\n          prev = next = current;\n          other_it->prev = other_it->next = other_it->current;\n        } else { // non-doubleton with\n          // adjacent links\n          // other before this\n          if (other_it->next == current) {\n            other_it->prev->next = current;\n            other_it->current->next = next;\n            current->next = other_it->current;\n            other_it->next = other_it->current;\n            prev = current;\n          } else { // this before other\n            prev->next = other_it->current;\n            current->next = other_it->next;\n            other_it->current->next = current;\n            next = current;\n            other_it->prev = other_it->current;\n          }\n        }\n      } else { // no overlap\n        prev->next = other_it->current;\n        current->next = other_it->next;\n        other_it->prev->next = current;\n        other_it->current->next = next;\n      }\n\n      /* update end of list pointer when necessary (remember that the 2 iterators\n      may iterate over different lists!) 
*/\n\n      if (list->last == current) {\n        list->last = other_it->current;\n      }\n      if (other_it->list->last == other_it->current) {\n        other_it->list->last = current;\n      }\n\n      if (current == cycle_pt) {\n        cycle_pt = other_it->cycle_pt;\n      }\n      if (other_it->current == other_it->cycle_pt) {\n        other_it->cycle_pt = cycle_pt;\n      }\n\n      /* The actual exchange - in all cases*/\n\n      old_current = current;\n      current = other_it->current;\n      other_it->current = old_current;\n    } // other iterator\n\n      //# elements in list\n    int32_t length() const {\n      return list->length();\n    }\n    /***********************************************************************\n   *                          ELIST_ITERATOR::sort()\n   *\n   *  Sort the elements of the list, then reposition at the start.\n   *\n   **********************************************************************/\n    void sort(          // sort elements\n      int comparator( // comparison routine\n        const T *, const T *)) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST_ITERATOR::sort\", ABORT);\n      }\n#endif\n\n      list->sort(comparator);\n      move_to_first();\n    }\n  };\n  using ITERATOR = Iterator; // compat\n\nprivate:\n  T *last = nullptr; // End of list\n  //(Points to head)\n  T *First() { // return first\n    return last ? 
last->next : nullptr;\n  }\n\npublic:\n  ~IntrusiveForwardList() {\n    clear();\n  }\n\n  /* delete elements */\n  void clear() {\n    internal_clear();\n  }\n\n  /* Become a deep copy of src_list */\n  template <typename U>\n  void deep_copy(const U *src_list, T *(*copier)(const T *)) {\n    Iterator from_it(const_cast<U *>(src_list));\n    Iterator to_it(this);\n\n    for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward())\n      to_it.add_after_then_move((*copier)(from_it.data()));\n  }\n\n  /***********************************************************************\n   *              IntrusiveForwardList::internal_clear\n   *\n   *  Used by the destructor and the \"clear\" member function of derived list\n   *  classes to destroy all the elements on the list.\n   *  The calling function passes a \"zapper\" function which can be called to\n   *  delete each element of the list, regardless of its derived type.  This\n   *  technique permits a generic clear function to destroy elements of\n   *  different derived types correctly, without requiring virtual functions and\n   *  the consequential memory overhead.\n   **********************************************************************/\n\n   // destroy all links\n  void internal_clear() {\n    T *ptr;\n    T *next;\n\n    if (!empty()) {\n      ptr = last->next;     // set to first\n      last->next = nullptr; // break circle\n      last = nullptr;       // set list empty\n      while (ptr) {\n        next = ptr->next;\n        delete ptr;\n        ptr = next;\n      }\n    }\n  }\n\n  bool empty() const {\n    return !last;\n  }\n\n  bool singleton() const {\n    return last ? 
(last == last->next) : false;\n  }\n\n  void shallow_copy(      // dangerous!!\n    IntrusiveForwardList *from_list) { // beware destructors!!\n    last = from_list->last;\n  }\n\n  /***********************************************************************\n *              IntrusiveForwardList::assign_to_sublist\n *\n *  The list is set to a sublist of another list.  \"This\" list must be empty\n *  before this function is invoked.  The two iterators passed must refer to\n *  the same list, different from \"this\" one.  The sublist removed is the\n *  inclusive list from start_it's current position to end_it's current\n *  position.  If this range passes over the end of the source list then the\n *  source list has its end set to the previous element of start_it.  The\n *  extracted sublist is unaffected by the end point of the source list, its\n *  end point is always the end_it position.\n **********************************************************************/\n  void assign_to_sublist(       // to this list\n    Iterator *start_it, // from list start\n    Iterator *end_it) {  // from list end\n    constexpr ERRCODE LIST_NOT_EMPTY(\"Destination list must be empty before extracting a sublist\");\n\n    if (!empty()) {\n      LIST_NOT_EMPTY.error(\"IntrusiveForwardList.assign_to_sublist\", ABORT);\n    }\n\n    last = start_it->extract_sublist(end_it);\n  }  // from list end\n\n    // # elements in list\n  int32_t length() const {\n    int32_t count = 0;\n    if (last != nullptr) {\n      count = 1;\n      for (auto it = last->next; it != last; it = it->next) {\n        count++;\n      }\n    }\n    return count;\n  }\n\n  /***********************************************************************\n *              IntrusiveForwardList::sort\n *\n *  Sort elements on list\n *  NB If you don't like the const declarations in the comparator, coerce yours:\n *   ( int (*)(const void *, const void *)\n **********************************************************************/\n  
void sort(          // sort elements\n    int comparator( // comparison routine\n      const T *, const T *)) {\n    // Allocate an array of pointers, one per list element.\n    auto count = length();\n\n    if (count > 0) {\n      // ptr array to sort\n      std::vector<T *> base;\n      base.reserve(count);\n\n      Iterator it(this);\n\n      // Extract all elements, putting the pointers in the array.\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        base.push_back(it.extract());\n      }\n\n      // Sort the pointer array.\n      std::sort(base.begin(), base.end(),\n        // all current comparators return -1,0,1, so we handle this correctly for std::sort\n        [&](auto &&l, auto &&r) {return comparator(l, r) < 0; });\n\n      // Rebuild the list from the sorted pointers.\n      for (auto current : base) {\n        it.add_to_end(current);\n      }\n    }\n  }\n\n  // Assuming list has been sorted already, insert new_link to\n  // keep the list sorted according to the same comparison function.\n  // Comparison function is the same as used by sort, i.e. uses double\n  // indirection. 
Time is O(1) to add to beginning or end.\n  // Time is linear to add pre-sorted items to an empty list.\n  // If unique is set to true and comparator() returns 0 (an entry with the\n  // same information as the one contained in new_link is already in the\n  // list) - new_link is not added to the list and the function returns the\n  // pointer to the identical entry that already exists in the list\n  // (otherwise the function returns new_link).\n  T *add_sorted_and_find(int comparator(const T *, const T *), bool unique,\n    T *new_link) {\n    // Check for adding at the end.\n    if (last == nullptr || comparator(last, new_link) < 0) {\n      if (last == nullptr) {\n        new_link->next = new_link;\n      } else {\n        new_link->next = last->next;\n        last->next = new_link;\n      }\n      last = new_link;\n    } else {\n      // Need to use an iterator.\n      Iterator it(this);\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        auto *link = it.data();\n        int compare = comparator(link, new_link);\n        if (compare > 0) {\n          break;\n        } else if (unique && compare == 0) {\n          return link;\n        }\n      }\n      if (it.cycled_list()) {\n        it.add_to_end(new_link);\n      } else {\n        it.add_before_then_move(new_link);\n      }\n    }\n    return new_link;\n  }\n\n  // Same as above, but returns true if the new entry was inserted, false\n  // if the identical entry already existed in the list.\n  bool add_sorted(int comparator(const T *, const T *), bool unique, T *new_link) {\n    return (add_sorted_and_find(comparator, unique, new_link) == new_link);\n  }\n};\n\ntemplate <typename CLASSNAME>\nusing ELIST = IntrusiveForwardList<CLASSNAME>;\n\n// add TESS_API?\n// move templated lists to public include dirs?\n#define ELISTIZEH(T)                                        \\\n  class T##_LIST : public IntrusiveForwardList<T> {         \\\n  public:                                             
      \\\n    using IntrusiveForwardList<T>::IntrusiveForwardList;    \\\n  };                                                        \\\n  class T##_IT : public IntrusiveForwardList<T>::Iterator { \\\n  public:                                                   \\\n    using IntrusiveForwardList<T>::Iterator::Iterator;      \\\n  };\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccutil/elst2.h",
    "content": "/**********************************************************************\n * File:        elst2.h  (Formerly elist2.h)\n * Description: Double linked embedded list module include file.\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef ELST2_H\n#define ELST2_H\n\n#include \"lsterr.h\"\n#include \"serialis.h\"\n\n#include <algorithm>\n#include <cstdio>\n\nnamespace tesseract {\n\n/**********************************************************************\nDESIGN NOTE\n===========\n\nIt would probably be possible to implement the ELIST2 classes as derived\nclasses from ELIST.  
I haven't done this because:\n\na) I think it would be harder to understand the code\n(Though the problem with not inheriting is that changes to ELIST must be\n  reflected in ELIST2 and vice versa)\n\nb) Most of the code is inline so:\ni)  The duplication in source does not affect the run time code size - the\n    code is copied inline anyway!\n\n  ii) The compiler should have a bit less work to do!\n**********************************************************************/\n\n/**********************************************************************\n * CLASS - ELIST2\n *\n * Generic list class for doubly linked lists with embedded links\n **********************************************************************/\n\ntemplate <typename T>\nclass IntrusiveList {\npublic:\n  /**********************************************************************\n   *              CLASS - Link\n   *\n   *              Generic link class for doubly linked lists with embedded links\n   *\n   *  Note:  No destructor - elements are assumed to be destroyed EITHER after\n   *  they have been extracted from a list OR by the ELIST2 destructor which\n   *  walks the list.\n   **********************************************************************/\n\n  class Link {\n    friend class Iterator;\n    friend class IntrusiveList;\n\n    T *prev;\n    T *next;\n\n  public:\n    Link() { // constructor\n      prev = next = nullptr;\n    }\n\n    Link(const Link &) = delete;\n\n    // The assignment operator is required for WERD.\n    void operator=(const Link &) {\n      prev = next = nullptr;\n    }\n  };\n  using LINK = Link; // compat\n\n  /***********************************************************************\n   *              CLASS - ELIST2_ITERATOR\n   *\n   *              Generic iterator class for doubly linked lists with embedded\n   *links\n   **********************************************************************/\n\n  class Iterator {\n    friend void IntrusiveList::assign_to_sublist(Iterator *, 
Iterator *);\n\n    IntrusiveList *list;                 // List being iterated\n    T *prev;            // prev element\n    T *current;         // current element\n    T *next;            // next element\n    T *cycle_pt;        // point we are cycling the list to.\n    bool ex_current_was_last;     // current extracted was end of list\n    bool ex_current_was_cycle_pt; // current extracted was cycle point\n    bool started_cycling;         // Have we moved off the start?\n    /***********************************************************************\n   *              ELIST2_ITERATOR::extract_sublist()\n   *\n   *  This is a private member, used only by IntrusiveList::assign_to_sublist.\n   *  Given another iterator for the same list, extract the links from THIS to\n   *  OTHER inclusive, link them into a new circular list, and return a\n   *  pointer to the last element.\n   *  (Can't inline this function because it contains a loop)\n   **********************************************************************/\n    T *extract_sublist(   // from this current...\n      Iterator *other_it) {               // to other current\n#ifndef NDEBUG\n      constexpr ERRCODE BAD_EXTRACTION_PTS(\"Can't extract sublist from points on different lists\");\n      constexpr ERRCODE DONT_EXTRACT_DELETED(\"Can't extract a sublist marked by deleted points\");\n#endif\n      constexpr ERRCODE BAD_SUBLIST(\"Can't find sublist end point in original list\");\n\n      Iterator temp_it = *this;\n      T *end_of_new_list;\n\n#ifndef NDEBUG\n      if (!other_it)\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::extract_sublist\", ABORT, \"other_it nullptr\");\n      if (!list)\n        NO_LIST.error(\"ELIST2_ITERATOR::extract_sublist\", ABORT);\n      if (list != other_it->list)\n        BAD_EXTRACTION_PTS.error(\"ELIST2_ITERATOR.extract_sublist\", ABORT);\n      if (list->empty())\n        EMPTY_LIST.error(\"ELIST2_ITERATOR::extract_sublist\", ABORT);\n\n      if (!current || 
!other_it->current)\n        DONT_EXTRACT_DELETED.error(\"ELIST2_ITERATOR.extract_sublist\", ABORT);\n#endif\n\n      ex_current_was_last = other_it->ex_current_was_last = false;\n      ex_current_was_cycle_pt = false;\n      other_it->ex_current_was_cycle_pt = false;\n\n      temp_it.mark_cycle_pt();\n      do {                         // walk sublist\n        if (temp_it.cycled_list()) { // can't find end pt\n          BAD_SUBLIST.error(\"ELIST2_ITERATOR.extract_sublist\", ABORT);\n        }\n\n        if (temp_it.at_last()) {\n          list->last = prev;\n          ex_current_was_last = other_it->ex_current_was_last = true;\n        }\n\n        if (temp_it.current == cycle_pt) {\n          ex_current_was_cycle_pt = true;\n        }\n\n        if (temp_it.current == other_it->cycle_pt) {\n          other_it->ex_current_was_cycle_pt = true;\n        }\n\n        temp_it.forward();\n      }\n      // do INCLUSIVE list\n      while (temp_it.prev != other_it->current);\n\n      // circularise sublist\n      other_it->current->next = current;\n      // circularise sublist\n      current->prev = other_it->current;\n      end_of_new_list = other_it->current;\n\n      // sublist = whole list\n      if (prev == other_it->current) {\n        list->last = nullptr;\n        prev = current = next = nullptr;\n        other_it->prev = other_it->current = other_it->next = nullptr;\n      } else {\n        prev->next = other_it->next;\n        other_it->next->prev = prev;\n\n        current = other_it->current = nullptr;\n        next = other_it->next;\n        other_it->prev = prev;\n      }\n      return end_of_new_list;\n    } // to other current\n\n  public:\n    /***********************************************************************\n   *              ELIST2_ITERATOR::ELIST2_ITERATOR\n   *\n   *  CONSTRUCTOR - set iterator to specified list;\n   **********************************************************************/\n    Iterator( // constructor\n      IntrusiveList 
*list_to_iterate) {\n      set_to_list(list_to_iterate);\n    }\n\n    /***********************************************************************\n     *              ELIST2_ITERATOR::set_to_list\n     *\n     *  (Re-)initialise the iterator to point to the start of the list_to_iterate\n     *  over.\n     **********************************************************************/\n\n    void set_to_list( // change list\n      IntrusiveList *list_to_iterate) {\n#ifndef NDEBUG\n      if (!list_to_iterate) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::set_to_list\", ABORT, \"list_to_iterate is nullptr\");\n      }\n#endif\n\n      list = list_to_iterate;\n      prev = list->last;\n      current = list->First();\n      next = current ? current->next : nullptr;\n      cycle_pt = nullptr; // await explicit set\n      started_cycling = false;\n      ex_current_was_last = false;\n      ex_current_was_cycle_pt = false;\n    }\n    /***********************************************************************\n   *              ELIST2_ITERATOR::add_after_then_move\n   *\n   *  Add a new element to the list after the current element and move the\n   *  iterator to the new element.\n   **********************************************************************/\n    void add_after_then_move(   // add after current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::add_after_then_move\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::add_after_then_move\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST2_ITERATOR::add_after_then_move\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        new_element->prev = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n      } else {\n        new_element->next = next;\n        next->prev = 
new_element;\n\n        if (current) { // not extracted\n          new_element->prev = current;\n          current->next = new_element;\n          prev = current;\n          if (current == list->last) {\n            list->last = new_element;\n          }\n        } else { // current extracted\n          new_element->prev = prev;\n          prev->next = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n          if (ex_current_was_cycle_pt) {\n            cycle_pt = new_element;\n          }\n        }\n      }\n      current = new_element;\n    } // move to new\n      /***********************************************************************\n     *              ELIST2_ITERATOR::add_after_stay_put\n     *\n     *  Add a new element to the list after the current element but do not move\n     *  the iterator to the new element.\n     **********************************************************************/\n    void add_after_stay_put(    // add after current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::add_after_stay_put\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::add_after_stay_put\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST2_ITERATOR::add_after_stay_put\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        new_element->prev = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n        ex_current_was_last = false;\n        current = nullptr;\n      } else {\n        new_element->next = next;\n        next->prev = new_element;\n\n        if (current) { // not extracted\n          new_element->prev = current;\n          current->next = new_element;\n          if (prev == current) {\n            prev = new_element;\n          }\n          if (current == 
list->last) {\n            list->last = new_element;\n          }\n        } else { // current extracted\n          new_element->prev = prev;\n          prev->next = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n            ex_current_was_last = false;\n          }\n        }\n        next = new_element;\n      }\n    } // stay at current\n      /***********************************************************************\n     *              ELIST2_ITERATOR::add_before_then_move\n     *\n     *  Add a new element to the list before the current element and move the\n     *  iterator to the new element.\n     **********************************************************************/\n    void add_before_then_move(  // add before current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::add_before_then_move\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::add_before_then_move\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST2_ITERATOR::add_before_then_move\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        new_element->prev = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n      } else {\n        prev->next = new_element;\n        new_element->prev = prev;\n\n        if (current) { // not extracted\n          new_element->next = current;\n          current->prev = new_element;\n          next = current;\n        } else { // current extracted\n          new_element->next = next;\n          next->prev = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n          if (ex_current_was_cycle_pt) {\n            cycle_pt = new_element;\n          }\n        }\n      }\n      current = new_element;\n    } // move to new\n      
/***********************************************************************\n     *              ELIST2_ITERATOR::add_before_stay_put\n     *\n     *  Add a new element to the list before the current element but don't move the\n     *  iterator to the new element.\n     **********************************************************************/\n    void add_before_stay_put(   // add before current &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::add_before_stay_put\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::add_before_stay_put\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST2_ITERATOR::add_before_stay_put\", ABORT);\n      }\n#endif\n\n      if (list->empty()) {\n        new_element->next = new_element;\n        new_element->prev = new_element;\n        list->last = new_element;\n        prev = next = new_element;\n        ex_current_was_last = true;\n        current = nullptr;\n      } else {\n        prev->next = new_element;\n        new_element->prev = prev;\n\n        if (current) { // not extracted\n          new_element->next = current;\n          current->prev = new_element;\n          if (next == current) {\n            next = new_element;\n          }\n        } else { // current extracted\n          new_element->next = next;\n          next->prev = new_element;\n          if (ex_current_was_last) {\n            list->last = new_element;\n          }\n        }\n        prev = new_element;\n      }\n    } // stay at current\n      /***********************************************************************\n     *              ELIST2_ITERATOR::add_list_after\n     *\n     *  Insert another list to this list after the current element but don't move\n     *the\n     *  iterator.\n     **********************************************************************/\n    void add_list_after(      // add a 
list &\n      IntrusiveList *list_to_add) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::add_list_after\", ABORT);\n      }\n      if (!list_to_add) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::add_list_after\", ABORT, \"list_to_add is nullptr\");\n      }\n#endif\n\n      if (!list_to_add->empty()) {\n        if (list->empty()) {\n          list->last = list_to_add->last;\n          prev = list->last;\n          next = list->First();\n          ex_current_was_last = true;\n          current = nullptr;\n        } else {\n          if (current) { // not extracted\n            current->next = list_to_add->First();\n            current->next->prev = current;\n            if (current == list->last) {\n              list->last = list_to_add->last;\n            }\n            list_to_add->last->next = next;\n            next->prev = list_to_add->last;\n            next = current->next;\n          } else { // current extracted\n            prev->next = list_to_add->First();\n            prev->next->prev = prev;\n            if (ex_current_was_last) {\n              list->last = list_to_add->last;\n              ex_current_was_last = false;\n            }\n            list_to_add->last->next = next;\n            next->prev = list_to_add->last;\n            next = prev->next;\n          }\n        }\n        list_to_add->last = nullptr;\n      }\n    } // stay at current\n      /***********************************************************************\n     *              ELIST2_ITERATOR::add_list_before\n     *\n     *  Insert another list to this list before the current element. 
Move the\n     *  iterator to the start of the inserted elements\n     *  iterator.\n     **********************************************************************/\n    void add_list_before(     // add a list &\n      IntrusiveList *list_to_add) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::add_list_before\", ABORT);\n      }\n      if (!list_to_add) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::add_list_before\", ABORT, \"list_to_add is nullptr\");\n      }\n#endif\n\n      if (!list_to_add->empty()) {\n        if (list->empty()) {\n          list->last = list_to_add->last;\n          prev = list->last;\n          current = list->First();\n          next = current->next;\n          ex_current_was_last = false;\n        } else {\n          prev->next = list_to_add->First();\n          prev->next->prev = prev;\n\n          if (current) { // not extracted\n            list_to_add->last->next = current;\n            current->prev = list_to_add->last;\n          } else { // current extracted\n            list_to_add->last->next = next;\n            next->prev = list_to_add->last;\n            if (ex_current_was_last) {\n              list->last = list_to_add->last;\n            }\n            if (ex_current_was_cycle_pt) {\n              cycle_pt = prev->next;\n            }\n          }\n          current = prev->next;\n          next = current->next;\n        }\n        list_to_add->last = nullptr;\n      }\n    } // move to it 1st item\n\n    T *data() { // get current data\n#ifndef NDEBUG\n      if (!current) {\n        NULL_DATA.error(\"ELIST2_ITERATOR::data\", ABORT);\n      }\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::data\", ABORT);\n      }\n#endif\n      return current;\n    }\n    /***********************************************************************\n   *              ELIST2_ITERATOR::data_relative\n   *\n   *  Return the data pointer to the element \"offset\" elements from current.\n   *  (This 
function can't be INLINEd because it contains a loop)\n   **********************************************************************/\n    T *data_relative( // get data + or - ...\n      int8_t offset) {                         // offset from current\n      T *ptr;\n\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST2_ITERATOR::data_relative\", ABORT);\n      if (list->empty())\n        EMPTY_LIST.error(\"ELIST2_ITERATOR::data_relative\", ABORT);\n#endif\n\n      if (offset < 0) {\n        for (ptr = current ? current : next; offset++ < 0; ptr = ptr->prev) {\n        }\n      } else {\n        for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) {\n        }\n      }\n\n#ifndef NDEBUG\n      if (!ptr)\n        NULL_DATA.error(\"ELIST2_ITERATOR::data_relative\", ABORT);\n#endif\n\n      return ptr;\n    }         // offset from current\n      /***********************************************************************\n     *              ELIST2_ITERATOR::forward\n     *\n     *  Move the iterator to the next element of the list.\n     *  REMEMBER: ALL LISTS ARE CIRCULAR.\n     **********************************************************************/\n    T *forward() {\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST2_ITERATOR::forward\", ABORT);\n#endif\n      if (list->empty()) {\n        return nullptr;\n      }\n\n      if (current) { // not removed so\n        // set previous\n        prev = current;\n        started_cycling = true;\n        // In case next is deleted by another iterator, get it from the current.\n        current = current->next;\n      } else {\n        if (ex_current_was_cycle_pt) {\n          cycle_pt = next;\n        }\n        current = next;\n      }\n\n#ifndef NDEBUG\n      if (!current)\n        NULL_DATA.error(\"ELIST2_ITERATOR::forward\", ABORT);\n#endif\n\n      next = current->next;\n\n#ifndef NDEBUG\n      if (!next) {\n        NULL_NEXT.error(\"ELIST2_ITERATOR::forward\", ABORT,\n          
\"This is: %p  Current is: %p\",\n          static_cast<void *>(this),\n          static_cast<void *>(current));\n      }\n#endif\n\n      return current;\n    } // move to next element\n      /***********************************************************************\n     *              ELIST2_ITERATOR::backward\n     *\n     *  Move the iterator to the previous element of the list.\n     *  REMEMBER: ALL LISTS ARE CIRCULAR.\n     **********************************************************************/\n    T *backward() {\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST2_ITERATOR::backward\", ABORT);\n#endif\n      if (list->empty()) {\n        return nullptr;\n      }\n\n      if (current) { // not removed so\n        // set previous\n        next = current;\n        started_cycling = true;\n        // In case prev is deleted by another iterator, get it from current.\n        current = current->prev;\n      } else {\n        if (ex_current_was_cycle_pt) {\n          cycle_pt = prev;\n        }\n        current = prev;\n      }\n\n#ifndef NDEBUG\n      if (!current)\n        NULL_DATA.error(\"ELIST2_ITERATOR::backward\", ABORT);\n      if (!prev) {\n        NULL_PREV.error(\"ELIST2_ITERATOR::backward\", ABORT,\n          \"This is: %p  Current is: %p\",\n          static_cast<void *>(this),\n          static_cast<void *>(current));\n      }\n#endif\n\n      prev = current->prev;\n      return current;\n    } // move to prev element\n      /***********************************************************************\n     *              ELIST2_ITERATOR::extract\n     *\n     *  Do extraction by removing current from the list, returning it to the\n     *  caller, but NOT updating the iterator.  (So that any calling loop can do\n     *  this.)   The iterator's current points to nullptr.  
If the extracted element\n     *  is to be deleted, this is the callers responsibility.\n     **********************************************************************/\n    T *extract() {\n      T *extracted_link;\n\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::extract\", ABORT);\n      }\n      if (!current) { // list empty or\n        // element extracted\n        NULL_CURRENT.error(\"ELIST2_ITERATOR::extract\", ABORT);\n      }\n#endif\n\n      if (list->singleton()) {\n        // Special case where we do need to change the iterator.\n        prev = next = list->last = nullptr;\n      } else {\n        prev->next = next; // remove from list\n        next->prev = prev;\n\n        if (current == list->last) {\n          list->last = prev;\n          ex_current_was_last = true;\n        } else {\n          ex_current_was_last = false;\n        }\n      }\n      // Always set ex_current_was_cycle_pt so an add/forward will work in a loop.\n      ex_current_was_cycle_pt = (current == cycle_pt);\n      extracted_link = current;\n      extracted_link->next = nullptr; // for safety\n      extracted_link->prev = nullptr; // for safety\n      current = nullptr;\n      return extracted_link;\n    } // remove from list\n      /***********************************************************************\n     *              ELIST2_ITERATOR::move_to_first()\n     *\n     *  Move current so that it is set to the start of the list.\n     *  Return data just in case anyone wants it.\n     **********************************************************************/\n     // go to start of list\n    T *move_to_first() {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::move_to_first\", ABORT);\n      }\n#endif\n\n      current = list->First();\n      prev = list->last;\n      next = current ? 
current->next : nullptr;\n      return current;\n    }\n    /***********************************************************************\n   *              ELIST2_ITERATOR::move_to_last()\n   *\n   *  Move current so that it is set to the end of the list.\n   *  Return data just in case anyone wants it.\n   **********************************************************************/\n    T *move_to_last() {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::move_to_last\", ABORT);\n      }\n#endif\n\n      current = list->last;\n      prev = current ? current->prev : nullptr;\n      next = current ? current->next : nullptr;\n      return current;\n    } // go to end of list\n      /***********************************************************************\n     *              ELIST2_ITERATOR::mark_cycle_pt()\n     *\n     *  Remember the current location so that we can tell whether we've returned\n     *  to this point later.\n     *\n     *  If the current point is deleted either now, or in the future, the cycle\n     *  point will be set to the next item which is set to current.  
This could be\n     *  by a forward, add_after_then_move or add_after_then_move.\n     **********************************************************************/\n    void mark_cycle_pt() {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::mark_cycle_pt\", ABORT);\n      }\n#endif\n\n      if (current) {\n        cycle_pt = current;\n      } else {\n        ex_current_was_cycle_pt = true;\n      }\n      started_cycling = false;\n    } // remember current\n\n    bool empty() const { // is list empty?\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::empty\", ABORT);\n      }\n#endif\n      return list->empty();\n    }\n\n    bool current_extracted() const { // current extracted?\n      return !current;\n    }\n    /***********************************************************************\n   *              ELIST2_ITERATOR::at_first()\n   *\n   *  Are we at the start of the list?\n   *\n   **********************************************************************/\n    bool at_first() const {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::at_first\", ABORT);\n      }\n#endif\n\n      // we're at a deleted\n      return ((list->empty()) || (current == list->First()) ||\n        ((current == nullptr) && (prev == list->last) && // NON-last pt between\n          !ex_current_was_last));                         // first and last\n    } // Current is first?\n      /***********************************************************************\n     *              ELIST2_ITERATOR::at_last()\n     *\n     *  Are we at the end of the list?\n     *\n     **********************************************************************/\n    bool at_last() const {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::at_last\", ABORT);\n      }\n#endif\n\n      // we're at a deleted\n      return ((list->empty()) || (current == list->last) ||\n        ((current == nullptr) && (prev == 
list->last) && // last point between\n          ex_current_was_last));                          // first and last\n    } // Current is last?\n      /***********************************************************************\n     *              ELIST2_ITERATOR::cycled_list()\n     *\n     *  Have we returned to the cycle_pt since it was set?\n     *\n     **********************************************************************/\n    bool cycled_list() const {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::cycled_list\", ABORT);\n      }\n#endif\n\n      return ((list->empty()) || ((current == cycle_pt) && started_cycling));\n    } // Completed a cycle?\n      /***********************************************************************\n     *              ELIST2_ITERATOR::add_to_end\n     *\n     *  Add a new element to the end of the list without moving the iterator.\n     *  This is provided because a single linked list cannot move to the last as\n     *  the iterator couldn't set its prev pointer.  
Adding to the end is\n     *  essential for implementing\n                  queues.\n    **********************************************************************/\n    void add_to_end(            // add at end &\n      T *new_element) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::add_to_end\", ABORT);\n      }\n      if (!new_element) {\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::add_to_end\", ABORT, \"new_element is nullptr\");\n      }\n      if (new_element->next) {\n        STILL_LINKED.error(\"ELIST2_ITERATOR::add_to_end\", ABORT);\n      }\n#endif\n\n      if (this->at_last()) {\n        this->add_after_stay_put(new_element);\n      } else {\n        if (this->at_first()) {\n          this->add_before_stay_put(new_element);\n          list->last = new_element;\n        } else { // Iterator is elsewhere\n          new_element->next = list->last->next;\n          new_element->prev = list->last;\n          list->last->next->prev = new_element;\n          list->last->next = new_element;\n          list->last = new_element;\n        }\n      }\n    } // don't move\n      /***********************************************************************\n     *              ELIST2_ITERATOR::exchange()\n     *\n     *  Given another iterator, whose current element is a different element on\n     *  the same list OR an element of another list, exchange the two current\n     *  elements.  
On return, each iterator points to the element which was the\n     *  other iterator's current on entry.\n     *  (This function hasn't been in-lined because it's a bit big!)\n     **********************************************************************/\n    void exchange(                  // positions of 2 links\n      Iterator *other_it) { // other iterator\n      constexpr ERRCODE DONT_EXCHANGE_DELETED(\"Can't exchange deleted elements of lists\");\n\n      T *old_current;\n\n#ifndef NDEBUG\n      if (!list)\n        NO_LIST.error(\"ELIST2_ITERATOR::exchange\", ABORT);\n      if (!other_it)\n        BAD_PARAMETER.error(\"ELIST2_ITERATOR::exchange\", ABORT, \"other_it nullptr\");\n      if (!(other_it->list))\n        NO_LIST.error(\"ELIST2_ITERATOR::exchange\", ABORT, \"other_it\");\n#endif\n\n      /* Do nothing if either list is empty or if both iterators reference the same\n    link */\n\n      if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) {\n        return;\n      }\n\n      /* Error if either current element is deleted */\n\n      if (!current || !other_it->current) {\n        DONT_EXCHANGE_DELETED.error(\"ELIST2_ITERATOR.exchange\", ABORT);\n      }\n\n      /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements\n    (other before this); non-doubleton adjacent elements (this before other);\n    non-adjacent elements. 
*/\n\n    // adjacent links\n      if ((next == other_it->current) || (other_it->next == current)) {\n        // doubleton list\n        if ((next == other_it->current) && (other_it->next == current)) {\n          prev = next = current;\n          other_it->prev = other_it->next = other_it->current;\n        } else { // non-doubleton with\n          // adjacent links\n          // other before this\n          if (other_it->next == current) {\n            other_it->prev->next = current;\n            other_it->current->next = next;\n            other_it->current->prev = current;\n            current->next = other_it->current;\n            current->prev = other_it->prev;\n            next->prev = other_it->current;\n\n            other_it->next = other_it->current;\n            prev = current;\n          } else { // this before other\n            prev->next = other_it->current;\n            current->next = other_it->next;\n            current->prev = other_it->current;\n            other_it->current->next = current;\n            other_it->current->prev = prev;\n            other_it->next->prev = current;\n\n            next = current;\n            other_it->prev = other_it->current;\n          }\n        }\n      } else { // no overlap\n        prev->next = other_it->current;\n        current->next = other_it->next;\n        current->prev = other_it->prev;\n        next->prev = other_it->current;\n        other_it->prev->next = current;\n        other_it->current->next = next;\n        other_it->current->prev = prev;\n        other_it->next->prev = current;\n      }\n\n      /* update end of list pointer when necessary (remember that the 2 iterators\n      may iterate over different lists!) 
*/\n\n      if (list->last == current) {\n        list->last = other_it->current;\n      }\n      if (other_it->list->last == other_it->current) {\n        other_it->list->last = current;\n      }\n\n      if (current == cycle_pt) {\n        cycle_pt = other_it->cycle_pt;\n      }\n      if (other_it->current == other_it->cycle_pt) {\n        other_it->cycle_pt = cycle_pt;\n      }\n\n      /* The actual exchange - in all cases*/\n\n      old_current = current;\n      current = other_it->current;\n      other_it->current = old_current;\n    } // other iterator\n\n      //# elements in list\n    int32_t length() const {\n      return list->length();\n    }\n    /***********************************************************************\n   *              ELIST2_ITERATOR::sort()\n   *\n   *  Sort the elements of the list, then reposition at the start.\n   *\n   **********************************************************************/\n    void sort(          // sort elements\n      int comparator( // comparison routine\n        const T *, const T *)) {\n#ifndef NDEBUG\n      if (!list) {\n        NO_LIST.error(\"ELIST2_ITERATOR::sort\", ABORT);\n      }\n#endif\n\n      list->sort(comparator);\n      move_to_first();\n    }\n\n  private:\n    // Don't use the following constructor.\n    Iterator() = delete;\n  };\n  using ITERATOR = Iterator; // compat\n\nprivate:\n  T *last = nullptr; // End of list\n  //(Points to head)\n  T *First() { // return first\n    return last ? 
last->next : nullptr;\n  }\n\npublic:\n  ~IntrusiveList() {\n    clear();\n  }\n\n  /* delete elements */\n  void clear() {\n    internal_clear();\n  }\n\n  /* Become a deep copy of src_list */\n  template <typename U>\n  void deep_copy(const U *src_list, T *(*copier)(const T *)) {\n    Iterator from_it(const_cast<U *>(src_list));\n    Iterator to_it(this);\n\n    for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward())\n      to_it.add_after_then_move((*copier)(from_it.data()));\n  }\n\n  /***********************************************************************\n   *              IntrusiveList::internal_clear\n   *\n   *  Used by the destructor and the \"clear\" member function of derived list\n   *  classes to destroy all the elements on the list.\n   *  The calling function passes a \"zapper\" function which can be called to\n   *  delete each element of the list, regardless of its derived type.  This\n   *  technique permits a generic clear function to destroy elements of\n   *  different derived types correctly, without requiring virtual functions and\n   *  the consequential memory overhead.\n   **********************************************************************/\n\n   // destroy all links\n  void internal_clear() {\n    // ptr to zapper functn\n    T *ptr;\n    T *next;\n\n    if (!empty()) {\n      ptr = last->next;     // set to first\n      last->next = nullptr; // break circle\n      last = nullptr;       // set list empty\n      while (ptr) {\n        next = ptr->next;\n        delete ptr;\n        ptr = next;\n      }\n    }\n  }\n\n  bool empty() const { // is list empty?\n    return !last;\n  }\n\n  bool singleton() const {\n    return last ? 
(last == last->next) : false;\n  }\n\n  void shallow_copy(       // dangerous!!\n    IntrusiveList *from_list) { // beware destructors!!\n    last = from_list->last;\n  }\n\n  /***********************************************************************\n *              IntrusiveList::assign_to_sublist\n *\n *  The list is set to a sublist of another list.  \"This\" list must be empty\n *  before this function is invoked.  The two iterators passed must refer to\n *  the same list, different from \"this\" one.  The sublist removed is the\n *  inclusive list from start_it's current position to end_it's current\n *  position.  If this range passes over the end of the source list then the\n *  source list has its end set to the previous element of start_it.  The\n *  extracted sublist is unaffected by the end point of the source list, its\n *  end point is always the end_it position.\n **********************************************************************/\n  void assign_to_sublist(        // to this list\n    Iterator *start_it, // from list start\n    Iterator *end_it);  // from list end\n\n  // # elements in list\n  int32_t length() const {\n    int32_t count = 0;\n    if (last != nullptr) {\n      count = 1;\n      for (auto it = last->next; it != last; it = it->next) {\n        count++;\n      }\n    }\n    return count;\n  }\n  /***********************************************************************\n *              IntrusiveList::sort\n *\n *  Sort elements on list\n *  NB If you don't like the const declarations in the comparator, coerce yours:\n *   (int (*)(const void *, const void *)\n **********************************************************************/\n  void sort(          // sort elements\n    int comparator( // comparison routine\n      const T *, const T *)) {\n    // Allocate an array of pointers, one per list element.\n    auto count = length();\n    if (count > 0) {\n      // ptr array to sort\n      std::vector<T *> base;\n      
base.reserve(count);\n\n      Iterator it(this);\n\n      // Extract all elements, putting the pointers in the array.\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        base.push_back(it.extract());\n      }\n\n      // Sort the pointer array.\n      std::sort(base.begin(), base.end(),\n        // all current comparators return -1,0,1, so we handle this correctly for std::sort\n        [&](auto &&l, auto &&r) {return comparator(l, r) < 0; });\n\n      // Rebuild the list from the sorted pointers.\n      for (auto current : base) {\n        it.add_to_end(current);\n      }\n    }\n  }\n\n  // Assuming list has been sorted already, insert new_link to\n  // keep the list sorted according to the same comparison function.\n  // Comparison function is the same as used by sort, i.e. uses double\n  // indirection. Time is O(1) to add to beginning or end.\n  // Time is linear to add pre-sorted items to an empty list.\n  void add_sorted(int comparator(const T *, const T *), T *new_link) {\n    // Check for adding at the end.\n    if (last == nullptr || comparator(last, new_link) < 0) {\n      if (last == nullptr) {\n        new_link->next = new_link;\n        new_link->prev = new_link;\n      } else {\n        new_link->next = last->next;\n        new_link->prev = last;\n        last->next = new_link;\n        new_link->next->prev = new_link;\n      }\n      last = new_link;\n    } else {\n      // Need to use an iterator.\n      Iterator it(this);\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        auto link = it.data();\n        if (comparator(link, new_link) > 0) {\n          break;\n        }\n      }\n      if (it.cycled_list()) {\n        it.add_to_end(new_link);\n      } else {\n        it.add_before_then_move(new_link);\n      }\n    }\n  }\n};\n\ntemplate <typename CLASSNAME>\nusing ELIST2 = IntrusiveList<CLASSNAME>;\n\n// add TESS_API?\n// move templated lists to public include dirs?\n#define ELIST2IZEH(T)            
               \\\n  class T##_LIST : public IntrusiveList<T> {   \\\n  public:                                               \\\n    using IntrusiveList<T>::IntrusiveList;                    \\\n  };                                                    \\\n  class T##_IT : public IntrusiveList<T>::Iterator { \\\n  public:                                               \\\n    using base = IntrusiveList<T>::Iterator;           \\\n    using base::base;                                   \\\n  };\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccutil/errcode.cpp",
    "content": "/**********************************************************************\n * File:        errcode.cpp  (Formerly error.c)\n * Description: Generic error handler function\n * Author:      Ray Smith\n *\n * (C) Copyright 1989, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"errcode.h\"\n\n#include <cstdarg>\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n#include <iostream> // for std::cerr\n#include <sstream>  // for std::stringstream\n\nnamespace tesseract {\n\nconstexpr ERRCODE BADERRACTION(\"Illegal error action\");\n#define MAX_MSG 1024\n\n/**********************************************************************\n * error\n *\n * Print an error message and continue, exit or abort according to action.\n * Makes use of error messages and numbers in a common place.\n *\n **********************************************************************/\nvoid ERRCODE::error(         // handle error\n    const char *caller,      // name of caller\n    TessErrorLogCode action, // action to take\n    const char *format, ...  
// special message\n    ) const {\n  va_list args; // variable args\n  std::stringstream msg;\n\n  if (caller != nullptr) {\n    // name of caller\n    msg << caller << ':';\n  }\n  // actual message\n  msg << \"Error:\" << message;\n  if (format != nullptr) {\n    char str[MAX_MSG];\n    va_start(args, format); // variable list\n    // print remainder\n    std::vsnprintf(str, sizeof(str), format, args);\n    // ensure termination\n    str[sizeof(str) - 1] = '\\0';\n    va_end(args);\n    msg << ':' << str;\n  }\n\n  std::cerr << msg.str() << '\\n';\n\n  switch (action) {\n    case DBG:\n    case TESSLOG:\n      return; // report only\n    case TESSEXIT:\n    case ABORT:\n#if !defined(NDEBUG)\n      // Create a deliberate abnormal exit as the stack trace is more useful\n      // that way. This is done only in debug builds, because the\n      // error message \"segmentation fault\" confuses most normal users.\n#  if defined(__GNUC__)\n      __builtin_trap();\n#  else\n      *reinterpret_cast<int *>(0) = 0;\n#  endif\n#endif\n      abort();\n    default:\n      BADERRACTION.error(\"error\", ABORT);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/errcode.h",
    "content": "/**********************************************************************\n * File:        errcode.h  (Formerly error.h)\n * Description: Header file for generic error handler class\n * Author:      Ray Smith\n *\n * (C) Copyright 1990, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef ERRCODE_H\n#define ERRCODE_H\n\n#include <tesseract/export.h> // for TESS_API\n\nnamespace tesseract {\n\n/*Control parameters for error()*/\nenum TessErrorLogCode {\n  DBG = -1,     /*log without alert */\n  TESSLOG = 0,  /*alert user */\n  TESSEXIT = 1, /*exit after error */\n  ABORT = 2     /*abort after error */\n};\n\n#if !defined(__GNUC__) && !defined(__attribute__)\n# define __attribute__(attr) // compiler without support for __attribute__\n#endif\n\nclass TESS_API ERRCODE { // error handler class\n  const char *message;   // error message\npublic:\n  void error(                  // error print function\n      const char *caller,      // function location\n      TessErrorLogCode action, // action to take\n      const char *format, ...  
// fprintf format\n  ) const __attribute__((format(printf, 4, 5)));\n  void error(const char *caller, TessErrorLogCode action) const {\n    error(caller, action, nullptr);\n  }\n  constexpr ERRCODE(const char *string) : message(string) {} // initialize with string\n};\n\nconstexpr ERRCODE ASSERT_FAILED(\"Assert failed\");\n\n#define DO_NOTHING static_cast<void>(0)\n\n#define ASSERT_HOST(x) \\\n  (x) ? DO_NOTHING : ASSERT_FAILED.error(#x, ABORT, \"in file %s, line %d\", __FILE__, __LINE__)\n\n#define ASSERT_HOST_MSG(x, ...)                                                \\\n  if (!(x)) {                                                                  \\\n    tprintf(__VA_ARGS__);                                                      \\\n    ASSERT_FAILED.error(#x, ABORT, \"in file %s, line %d\", __FILE__, __LINE__); \\\n  }\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccutil/fileerr.h",
    "content": "/**********************************************************************\n * File:        fileerr.h  (Formerly filerr.h)\n * Description: Errors for file utilities.\n * Author:      Ray Smith\n *\n * (C) Copyright 1990, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef FILEERR_H\n#define FILEERR_H\n\n#include \"errcode.h\"\n\nnamespace tesseract {\n\nconstexpr ERRCODE CANTOPENFILE(\"Can't open file\");\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccutil/genericheap.h",
    "content": "// Copyright 2012 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        genericheap.h\n// Description: Template heap class.\n// Author:      Ray Smith, based on Dan Johnson's original code.\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_GENERICHEAP_H_\n#define TESSERACT_CCUTIL_GENERICHEAP_H_\n\n#include \"errcode.h\"\n\n#include <vector>\n\nnamespace tesseract {\n\n// GenericHeap requires 1 template argument:\n// Pair will normally be either KDPairInc<Key, Data> or KDPairDec<Key, Data>\n// for some arbitrary Key and scalar, smart pointer, or non-ownership pointer\n// Data type, according to whether a MIN heap or a MAX heap is desired,\n// respectively. Using KDPtrPairInc<Key, Data> or KDPtrPairDec<Key, Data>,\n// GenericHeap can also handle simple Data pointers and own them.\n// If no additional data is required, Pair can also be a scalar, since\n// GenericHeap doesn't look inside it except for operator<.\n//\n// The heap is stored as a packed binary tree in an array hosted by a\n// vector<Pair>, with the invariant that the children of each node are\n// both NOT Pair::operator< the parent node. 
KDPairInc defines Pair::operator<\n// to use Key::operator< to generate a MIN heap and KDPairDec defines\n// Pair::operator< to use Key::operator> to generate a MAX heap by reversing\n// all the comparisons.\n// See http://en.wikipedia.org/wiki/Heap_(data_structure) for more detail on\n// the basic heap implementation.\n//\n// Insertion and removal are both O(log n) and, unlike the STL heap, an\n// explicit Reshuffle function allows a node to be repositioned in time O(log n)\n// after changing its value.\n//\n// Accessing the element for revaluation is a more complex matter, since the\n// index and pointer can be changed arbitrarily by heap operations.\n// Revaluation can be done by making the Data type in the Pair derived from or\n// contain a DoublePtr as its first data element, making it possible to convert\n// the pointer to a Pair using KDPairInc::RecastDataPointer.\ntemplate <typename Pair>\nclass GenericHeap {\npublic:\n  GenericHeap() = default;\n  // The initial size is only a vector::reserve. It is not enforced as\n  // the size limit of the heap. Caller must implement their own enforcement.\n  explicit GenericHeap(int initial_size) {\n    heap_.reserve(initial_size);\n  }\n\n  // Simple accessors.\n  bool empty() const {\n    return heap_.empty();\n  }\n  int size() const {\n    return heap_.size();\n  }\n  int size_reserved() const {\n    return heap_.size_reserved();\n  }\n  void clear() {\n    // Clear truncates to 0 to keep the number reserved intact.\n    heap_.clear();\n  }\n  // Provides access to the underlying vector.\n  // Caution! 
any changes that modify the keys will invalidate the heap!\n  std::vector<Pair> &heap() {\n    return heap_;\n  }\n  // Provides read-only access to an element of the underlying vector.\n  const Pair &get(int index) const {\n    return heap_[index];\n  }\n\n  // Add entry to the heap, keeping the smallest item at the top, by operator<.\n  // Note that *entry is used as the source of operator=, but it is non-const\n  // to allow for a smart pointer to be contained within.\n  // Time = O(log n).\n  void Push(Pair *entry) {\n    int hole_index = heap_.size();\n    // Make a hole in the end of heap_ and sift it up to be the correct\n    // location for the new *entry. To avoid needing a default constructor\n    // for primitive types, and to allow for use of DoublePtr in the Pair\n    // somewhere, we have to incur a double copy here.\n    heap_.push_back(*entry);\n    *entry = heap_.back();\n    hole_index = SiftUp(hole_index, *entry);\n    heap_[hole_index] = *entry;\n  }\n\n  // Get the value of the top (smallest, defined by operator< ) element.\n  const Pair &PeekTop() const {\n    return heap_[0];\n  }\n  // Get the value of the worst (largest, defined by operator< ) element.\n  const Pair &PeekWorst() const {\n    return heap_[IndexOfWorst()];\n  }\n\n  // Removes the top element of the heap. 
If entry is not nullptr, the element\n  // is copied into *entry, otherwise it is discarded.\n  // Returns false if the heap was already empty.\n  // Time = O(log n).\n  bool Pop(Pair *entry) {\n    int new_size = heap_.size() - 1;\n    if (new_size < 0) {\n      return false; // Already empty.\n    }\n    if (entry != nullptr) {\n      *entry = heap_[0];\n    }\n    if (new_size > 0) {\n      // Sift the hole at the start of the heap_ downwards to match the last\n      // element.\n      Pair hole_pair = heap_[new_size];\n      heap_.resize(new_size);\n      int hole_index = SiftDown(0, hole_pair);\n      heap_[hole_index] = std::move(hole_pair);\n    } else {\n      heap_.resize(new_size);\n    }\n    return true;\n  }\n\n  // Removes the MAXIMUM element of the heap. (MIN from a MAX heap.) If entry is\n  // not nullptr, the element is copied into *entry, otherwise it is discarded.\n  // Time = O(n). Returns false if the heap was already empty.\n  bool PopWorst(Pair *entry) {\n    int worst_index = IndexOfWorst();\n    if (worst_index < 0) {\n      return false; // It cannot be empty!\n    }\n    // Extract the worst element from the heap, leaving a hole at worst_index.\n    if (entry != nullptr) {\n      *entry = heap_[worst_index];\n    }\n    int heap_size = heap_.size() - 1;\n    if (heap_size > 0) {\n      // Sift the hole upwards to match the last element of the heap_\n      Pair hole_pair = heap_[heap_size];\n      int hole_index = SiftUp(worst_index, hole_pair);\n      heap_[hole_index] = hole_pair;\n    }\n    heap_.resize(heap_size);\n    return true;\n  }\n\n  // Returns the index of the worst element. Time = O(n/2).\n  int IndexOfWorst() const {\n    int heap_size = heap_.size();\n    if (heap_size == 0) {\n      return -1; // It cannot be empty!\n    }\n\n    // Find the maximum element. 
Its index is guaranteed to be greater than\n    // the index of the parent of the last element, since by the heap invariant\n    // the parent must be less than or equal to the children.\n    int worst_index = heap_size - 1;\n    int end_parent = ParentNode(worst_index);\n    for (int i = worst_index - 1; i > end_parent; --i) {\n      if (heap_[worst_index] < heap_[i]) {\n        worst_index = i;\n      }\n    }\n    return worst_index;\n  }\n\n  // The pointed-to Pair has changed its key value, so the location of pair\n  // is reshuffled to maintain the heap invariant.\n  // Must be a valid pointer to an element of the heap_!\n  // Caution! Since GenericHeap is based on vector, reallocs may occur\n  // whenever the vector is extended and elements may get shuffled by any\n  // Push or Pop operation. Therefore use this function only if Data in Pair is\n  // of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as\n  // its first element. Reshuffles the heap to maintain the invariant.\n  // Time = O(log n).\n  void Reshuffle(Pair *pair) {\n    int index = pair - &heap_[0];\n    Pair hole_pair = heap_[index];\n    index = SiftDown(index, hole_pair);\n    index = SiftUp(index, hole_pair);\n    heap_[index] = std::move(hole_pair);\n  }\n\nprivate:\n  // A hole in the heap exists at hole_index, and we want to fill it with the\n  // given pair. SiftUp sifts the hole upward to the correct position and\n  // returns the destination index without actually putting pair there.\n  int SiftUp(int hole_index, const Pair &pair) {\n    int parent;\n    while (hole_index > 0 && pair < heap_[parent = ParentNode(hole_index)]) {\n      heap_[hole_index] = heap_[parent];\n      hole_index = parent;\n    }\n    return hole_index;\n  }\n\n  // A hole in the heap exists at hole_index, and we want to fill it with the\n  // given pair. 
SiftDown sifts the hole downward to the correct position and\n  // returns the destination index without actually putting pair there.\n  int SiftDown(int hole_index, const Pair &pair) {\n    int heap_size = heap_.size();\n    int child;\n    while ((child = LeftChild(hole_index)) < heap_size) {\n      if (child + 1 < heap_size && heap_[child + 1] < heap_[child]) {\n        ++child;\n      }\n      if (heap_[child] < pair) {\n        heap_[hole_index] = heap_[child];\n        hole_index = child;\n      } else {\n        break;\n      }\n    }\n    return hole_index;\n  }\n\n  // Functions to navigate the tree. Unlike the original implementation, we\n  // store the root at index 0.\n  int ParentNode(int index) const {\n    return (index + 1) / 2 - 1;\n  }\n  int LeftChild(int index) const {\n    return index * 2 + 1;\n  }\n\nprivate:\n  std::vector<Pair> heap_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_GENERICHEAP_H_\n"
  },
  {
    "path": "src/ccutil/genericvector.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        genericvector.h\n// Description: Generic vector class\n// Author:      Daria Antonova\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_\n#define TESSERACT_CCUTIL_GENERICVECTOR_H_\n\n#include \"helpers.h\"\n#include \"serialis.h\"\n\n#include <algorithm>\n#include <cassert>\n#include <climits> // for LONG_MAX\n#include <cstdint> // for uint32_t\n#include <cstdio>\n#include <cstdlib>\n#include <functional> // for std::function\n\nnamespace tesseract {\n\n// Use PointerVector<T> below in preference to GenericVector<T*>, as that\n// provides automatic deletion of pointers, [De]Serialize that works, and\n// sort that works.\ntemplate <typename T>\nclass GenericVector {\npublic:\n  GenericVector() {\n    init(kDefaultVectorSize);\n  }\n\n  // Copy\n  GenericVector(const GenericVector &other) {\n    this->init(other.size());\n    this->operator+=(other);\n  }\n  GenericVector<T> &operator+=(const GenericVector &other);\n  GenericVector<T> &operator=(const GenericVector &other);\n\n  ~GenericVector();\n\n  // Reserve some memory.\n  void reserve(int size);\n  // Double the size of the internal array.\n  void double_the_size();\n\n  // Resizes to size and sets all values to t.\n  void init_to_size(int size, 
const T &t);\n  void resize(int size, const T &t);\n  // Resizes to size without any initialization.\n  void resize_no_init(int size) {\n    reserve(size);\n    size_used_ = size;\n  }\n\n  // Return the size used.\n  unsigned size() const {\n    return size_used_;\n  }\n  // Workaround to avoid g++ -Wsign-compare warnings.\n  size_t unsigned_size() const {\n    static_assert(sizeof(size_used_) <= sizeof(size_t), \"Wow! sizeof(size_t) < sizeof(int32_t)!!\");\n    assert(0 <= size_used_);\n    return static_cast<size_t>(size_used_);\n  }\n  int size_reserved() const {\n    return size_reserved_;\n  }\n\n  // Return true if empty.\n  bool empty() const {\n    return size_used_ == 0;\n  }\n\n  // Return the object from an index.\n  T &at(int index) const {\n    assert(index >= 0 && index < size_used_);\n    return data_[index];\n  }\n\n  T &back() const;\n  T &operator[](int index) const;\n  // Returns the last object and removes it.\n  T pop_back();\n\n  // Return the index of the T object.\n  int get_index(const T &object) const;\n\n  // Push an element in the end of the array\n  int push_back(T object);\n  void operator+=(const T &t);\n\n  // Set the value at the given index\n  void set(const T &t, int index);\n\n  // Insert t at the given index, push other elements to the right.\n  void insert(const T &t, int index);\n\n  // Removes an element at the given index and\n  // shifts the remaining elements to the left.\n  void remove(int index);\n\n  // Truncates the array to the given size by removing the end.\n  // If the current size is less, the array is not expanded.\n  void truncate(int size) {\n    if (size < size_used_) {\n      size_used_ = size;\n    }\n  }\n\n  // Add a callback to be called to delete the elements when the array took\n  // their ownership.\n  void set_clear_callback(const std::function<void(T)> &cb) {\n    clear_cb_ = cb;\n  }\n\n  // Clear the array, calling the clear callback function if any.\n  // All the owned callbacks are also 
deleted.\n  // If you don't want the callbacks to be deleted, before calling clear, set\n  // the callback to nullptr.\n  void clear();\n\n  // Delete objects pointed to by data_[i]\n  void delete_data_pointers();\n\n  // This method clears the current object, then, does a shallow copy of\n  // its argument, and finally invalidates its argument.\n  // Callbacks are moved to the current object;\n  void move(GenericVector<T> *from);\n\n  // Read/Write the array to a file. This does _NOT_ read/write the callbacks.\n  // The callback given must be permanent since they will be called more than\n  // once. The given callback will be deleted at the end.\n  // If the callbacks are nullptr, then the data is simply read/written using\n  // fread (and swapping)/fwrite.\n  // Returns false on error or if the callback returns false.\n  // DEPRECATED. Use [De]Serialize[Classes] instead.\n  bool write(FILE *f, const std::function<bool(FILE *, const T &)> &cb) const;\n  bool read(TFile *f, const std::function<bool(TFile *, T *)> &cb);\n  // Writes a vector of simple types to the given file. Assumes that bitwise\n  // read/write of T will work. Returns false in case of error.\n  // TODO(rays) Change all callers to use TFile and remove deprecated methods.\n  bool Serialize(FILE *fp) const;\n  bool Serialize(TFile *fp) const;\n  // Reads a vector of simple types from the given file. Assumes that bitwise\n  // read/write will work with ReverseN according to sizeof(T).\n  // Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  // TFile is assumed to know about swapping.\n  bool DeSerialize(bool swap, FILE *fp);\n  bool DeSerialize(TFile *fp);\n  // Writes a vector of classes to the given file. Assumes the existence of\n  // bool T::Serialize(FILE* fp) const that returns false in case of error.\n  // Returns false in case of error.\n  bool SerializeClasses(FILE *fp) const;\n  // Reads a vector of classes from the given file. 
Assumes the existence of\n  // bool T::Deserialize(bool swap, FILE* fp) that returns false in case of\n  // error. Also needs T::T() and T::T(constT&), as init_to_size is used in\n  // this function. Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerializeClasses(TFile *fp);\n\n  // Reverses the elements of the vector.\n  void reverse() {\n    for (int i = 0; i < size_used_ / 2; ++i) {\n      std::swap(data_[i], data_[size_used_ - 1 - i]);\n    }\n  }\n\n  // Sorts the members of this vector using the less than comparator (cmp_lt),\n  // which compares the values. Useful for GenericVectors to primitive types.\n  // Will not work so great for pointers (unless you just want to sort some\n  // pointers). You need to provide a specialization to sort_cmp to use\n  // your type.\n  void sort();\n\n  // Sort the array into the order defined by the qsort function comparator.\n  // The comparator function is as defined by qsort, ie. it receives pointers\n  // to two Ts and returns negative if the first element is to appear earlier\n  // in the result and positive if it is to appear later, with 0 for equal.\n  void sort(int (*comparator)(const void *, const void *)) {\n    qsort(data_, size_used_, sizeof(*data_), comparator);\n  }\n\n  // Swaps the elements with the given indices.\n  void swap(int index1, int index2) {\n    if (index1 != index2) {\n      T tmp = data_[index1];\n      data_[index1] = data_[index2];\n      data_[index2] = tmp;\n    }\n  }\n  // Returns true if all elements of *this are within the given range.\n  // Only uses operator<\n  /*bool WithinBounds(const T& rangemin, const T& rangemax) const {\n  for (int i = 0; i < size_used_; ++i) {\n    if (data_[i] < rangemin || rangemax < data_[i]) {\n      return false;\n    }\n  }\n  return true;\n}*/\n\nprotected:\n  // Init the object, allocating size memory.\n  void init(int size);\n\n  // We are assuming that the object generally placed in the\n  
// vector are small enough that for efficiency it makes sense\n  // to start with a larger initial size.\n  static const int kDefaultVectorSize = 4;\n  int32_t size_used_{};\n  int32_t size_reserved_{};\n  T *data_;\n  std::function<void(T)> clear_cb_;\n};\n\n// The default FileReader loads the whole file into the vector of char,\n// returning false on error.\ninline bool LoadDataFromFile(const char *filename, GenericVector<char> *data) {\n  bool result = false;\n  FILE *fp = fopen(filename, \"rb\");\n  if (fp != nullptr) {\n    fseek(fp, 0, SEEK_END);\n    auto size = std::ftell(fp);\n    fseek(fp, 0, SEEK_SET);\n    // Trying to open a directory on Linux sets size to LONG_MAX. Catch it here.\n    if (size > 0 && size < LONG_MAX) {\n      // reserve an extra byte in case caller wants to append a '\\0' character\n      data->reserve(size + 1);\n      data->resize_no_init(size);\n      result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;\n    }\n    fclose(fp);\n  }\n  return result;\n}\n\n// The default FileWriter writes the vector of char to the filename file,\n// returning false on error.\ninline bool SaveDataToFile(const GenericVector<char> &data, const char *filename) {\n  FILE *fp = fopen(filename, \"wb\");\n  if (fp == nullptr) {\n    return false;\n  }\n  bool result = fwrite(&data[0], 1, data.size(), fp) == data.size();\n  fclose(fp);\n  return result;\n}\n\n// Used by sort()\n// return < 0 if t1 < t2\n// return 0 if t1 == t2\n// return > 0 if t1 > t2\ntemplate <typename T>\nint sort_cmp(const void *t1, const void *t2) {\n  const T *a = static_cast<const T *>(t1);\n  const T *b = static_cast<const T *>(t2);\n  if (*a < *b) {\n    return -1;\n  }\n  if (*b < *a) {\n    return 1;\n  }\n  return 0;\n}\n\n// Used by PointerVector::sort()\n// return < 0 if t1 < t2\n// return 0 if t1 == t2\n// return > 0 if t1 > t2\ntemplate <typename T>\nint sort_ptr_cmp(const void *t1, const void *t2) {\n  const T *a = *static_cast<T *const *>(t1);\n  const T *b 
= *static_cast<T *const *>(t2);\n  if (*a < *b) {\n    return -1;\n  }\n  if (*b < *a) {\n    return 1;\n  }\n  return 0;\n}\n\n// Subclass for a vector of pointers. Use in preference to GenericVector<T*>\n// as it provides automatic deletion and correct serialization, with the\n// corollary that all copy operations are deep copies of the pointed-to objects.\ntemplate <typename T>\nclass PointerVector : public GenericVector<T *> {\npublic:\n  PointerVector() : GenericVector<T *>() {}\n  explicit PointerVector(int size) : GenericVector<T *>(size) {}\n  ~PointerVector() {\n    // Clear must be called here, even though it is called again by the base,\n    // as the base will call the wrong clear.\n    clear();\n  }\n  // Copy must be deep, as the pointers will be automatically deleted on\n  // destruction.\n  PointerVector(const PointerVector &other) : GenericVector<T *>(other) {\n    this->init(other.size());\n    this->operator+=(other);\n  }\n  PointerVector<T> &operator+=(const PointerVector &other) {\n    this->reserve(this->size_used_ + other.size_used_);\n    for (unsigned i = 0; i < other.size(); ++i) {\n      this->push_back(new T(*other.data_[i]));\n    }\n    return *this;\n  }\n\n  PointerVector<T> &operator=(const PointerVector &other) {\n    if (&other != this) {\n      this->truncate(0);\n      this->operator+=(other);\n    }\n    return *this;\n  }\n\n  // Removes an element at the given index and\n  // shifts the remaining elements to the left.\n  void remove(int index) {\n    delete GenericVector<T *>::data_[index];\n    GenericVector<T *>::remove(index);\n  }\n\n  // Truncates the array to the given size by removing the end.\n  // If the current size is less, the array is not expanded.\n  void truncate(int size) {\n    for (int i = size; i < GenericVector<T *>::size_used_; ++i) {\n      delete GenericVector<T *>::data_[i];\n    }\n    GenericVector<T *>::truncate(size);\n  }\n\n  // Clear the array, calling the clear callback function if any.\n  // 
All the owned callbacks are also deleted.\n  // If you don't want the callbacks to be deleted, before calling clear, set\n  // the callback to nullptr.\n  void clear() {\n    GenericVector<T *>::delete_data_pointers();\n    GenericVector<T *>::clear();\n  }\n\n  // Writes a vector of (pointers to) classes to the given file. Assumes the\n  // existence of bool T::Serialize(FILE*) const that returns false in case of\n  // error. There is no Serialize for simple types, as you would have a\n  // normal GenericVector of those.\n  // Returns false in case of error.\n  bool Serialize(FILE *fp) const {\n    int32_t used = GenericVector<T *>::size_used_;\n    if (fwrite(&used, sizeof(used), 1, fp) != 1) {\n      return false;\n    }\n    for (int i = 0; i < used; ++i) {\n      int8_t non_null = GenericVector<T *>::data_[i] != nullptr;\n      if (fwrite(&non_null, sizeof(non_null), 1, fp) != 1) {\n        return false;\n      }\n      if (non_null && !GenericVector<T *>::data_[i]->Serialize(fp)) {\n        return false;\n      }\n    }\n    return true;\n  }\n  bool Serialize(TFile *fp) const {\n    int32_t used = GenericVector<T *>::size_used_;\n    if (fp->FWrite(&used, sizeof(used), 1) != 1) {\n      return false;\n    }\n    for (int i = 0; i < used; ++i) {\n      int8_t non_null = GenericVector<T *>::data_[i] != nullptr;\n      if (fp->FWrite(&non_null, sizeof(non_null), 1) != 1) {\n        return false;\n      }\n      if (non_null && !GenericVector<T *>::data_[i]->Serialize(fp)) {\n        return false;\n      }\n    }\n    return true;\n  }\n  // Reads a vector of (pointers to) classes to the given file. Assumes the\n  // existence of bool T::DeSerialize(bool, Tfile*) const that returns false in\n  // case of error. 
There is no Serialize for simple types, as you would have a\n  // normal GenericVector of those.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  // Also needs T::T(), as new T is used in this function.\n  // Returns false in case of error.\n  bool DeSerialize(bool swap, FILE *fp) {\n    uint32_t reserved;\n    if (fread(&reserved, sizeof(reserved), 1, fp) != 1) {\n      return false;\n    }\n    if (swap) {\n      Reverse32(&reserved);\n    }\n    // Arbitrarily limit the number of elements to protect against bad data.\n    assert(reserved <= UINT16_MAX);\n    if (reserved > UINT16_MAX) {\n      return false;\n    }\n    GenericVector<T *>::reserve(reserved);\n    truncate(0);\n    for (uint32_t i = 0; i < reserved; ++i) {\n      int8_t non_null;\n      if (fread(&non_null, sizeof(non_null), 1, fp) != 1) {\n        return false;\n      }\n      T *item = nullptr;\n      if (non_null != 0) {\n        item = new T;\n        if (!item->DeSerialize(swap, fp)) {\n          delete item;\n          return false;\n        }\n        this->push_back(item);\n      } else {\n        // Null elements should keep their place in the vector.\n        this->push_back(nullptr);\n      }\n    }\n    return true;\n  }\n\n  // Sorts the items pointed to by the members of this vector using\n  // t::operator<().\n  void sort() {\n    this->GenericVector<T *>::sort(&sort_ptr_cmp<T>);\n  }\n};\n\ntemplate <typename T>\nvoid GenericVector<T>::init(int size) {\n  size_used_ = 0;\n  if (size <= 0) {\n    data_ = nullptr;\n    size_reserved_ = 0;\n  } else {\n    if (size < kDefaultVectorSize) {\n      size = kDefaultVectorSize;\n    }\n    data_ = new T[size];\n    size_reserved_ = size;\n  }\n  clear_cb_ = nullptr;\n}\n\ntemplate <typename T>\nGenericVector<T>::~GenericVector() {\n  clear();\n}\n\n// Reserve some memory. 
If the internal array contains elements, they are\n// copied.\ntemplate <typename T>\nvoid GenericVector<T>::reserve(int size) {\n  if (size_reserved_ >= size || size <= 0) {\n    return;\n  }\n  if (size < kDefaultVectorSize) {\n    size = kDefaultVectorSize;\n  }\n  T *new_array = new T[size];\n  for (int i = 0; i < size_used_; ++i) {\n    new_array[i] = data_[i];\n  }\n  delete[] data_;\n  data_ = new_array;\n  size_reserved_ = size;\n}\n\ntemplate <typename T>\nvoid GenericVector<T>::double_the_size() {\n  if (size_reserved_ == 0) {\n    reserve(kDefaultVectorSize);\n  } else {\n    reserve(2 * size_reserved_);\n  }\n}\n\n// Resizes to size and sets all values to t.\ntemplate <typename T>\nvoid GenericVector<T>::init_to_size(int size, const T &t) {\n  reserve(size);\n  size_used_ = size;\n  for (int i = 0; i < size; ++i) {\n    data_[i] = t;\n  }\n}\n\ntemplate <typename T>\nvoid GenericVector<T>::resize(int size, const T &t) {\n  init_to_size(size, t);\n}\n\ntemplate <typename T>\nT &GenericVector<T>::operator[](int index) const {\n  assert(index >= 0 && index < size_used_);\n  return data_[index];\n}\n\ntemplate <typename T>\nT &GenericVector<T>::back() const {\n  assert(size_used_ > 0);\n  return data_[size_used_ - 1];\n}\n// Returns the last object and removes it.\ntemplate <typename T>\nT GenericVector<T>::pop_back() {\n  assert(size_used_ > 0);\n  return data_[--size_used_];\n}\n\n// Return the object from an index.\ntemplate <typename T>\nvoid GenericVector<T>::set(const T &t, int index) {\n  assert(index >= 0 && index < size_used_);\n  data_[index] = t;\n}\n\n// Shifts the rest of the elements to the right to make\n// space for the new elements and inserts the given element\n// at the specified index.\ntemplate <typename T>\nvoid GenericVector<T>::insert(const T &t, int index) {\n  assert(index >= 0 && index <= size_used_);\n  if (size_reserved_ == size_used_) {\n    double_the_size();\n  }\n  for (int i = size_used_; i > index; --i) {\n    data_[i] = 
data_[i - 1];\n  }\n  data_[index] = t;\n  size_used_++;\n}\n\n// Removes an element at the given index and\n// shifts the remaining elements to the left.\ntemplate <typename T>\nvoid GenericVector<T>::remove(int index) {\n  assert(index >= 0 && index < size_used_);\n  for (int i = index; i < size_used_ - 1; ++i) {\n    data_[i] = data_[i + 1];\n  }\n  size_used_--;\n}\n\n// Return the index of the T object.\ntemplate <typename T>\nint GenericVector<T>::get_index(const T &object) const {\n  for (int i = 0; i < size_used_; ++i) {\n    if (object == data_[i]) {\n      return i;\n    }\n  }\n  return -1;\n}\n\n// Add an element in the array\ntemplate <typename T>\nint GenericVector<T>::push_back(T object) {\n  int index = 0;\n  if (size_used_ == size_reserved_) {\n    double_the_size();\n  }\n  index = size_used_++;\n  data_[index] = std::move(object);\n  return index;\n}\n\ntemplate <typename T>\nvoid GenericVector<T>::operator+=(const T &t) {\n  push_back(t);\n}\n\ntemplate <typename T>\nGenericVector<T> &GenericVector<T>::operator+=(const GenericVector &other) {\n  this->reserve(size_used_ + other.size_used_);\n  for (unsigned i = 0; i < other.size(); ++i) {\n    this->operator+=(other.data_[i]);\n  }\n  return *this;\n}\n\ntemplate <typename T>\nGenericVector<T> &GenericVector<T>::operator=(const GenericVector &other) {\n  if (&other != this) {\n    this->truncate(0);\n    this->operator+=(other);\n  }\n  return *this;\n}\n\n// Clear the array, calling the callback function if any.\ntemplate <typename T>\nvoid GenericVector<T>::clear() {\n  if (size_reserved_ > 0 && clear_cb_ != nullptr) {\n    for (int i = 0; i < size_used_; ++i) {\n      clear_cb_(data_[i]);\n    }\n  }\n  delete[] data_;\n  data_ = nullptr;\n  size_used_ = 0;\n  size_reserved_ = 0;\n  clear_cb_ = nullptr;\n}\n\ntemplate <typename T>\nvoid GenericVector<T>::delete_data_pointers() {\n  for (int i = 0; i < size_used_; ++i) {\n    delete data_[i];\n  }\n}\n\ntemplate <typename T>\nbool 
GenericVector<T>::write(FILE *f, const std::function<bool(FILE *, const T &)> &cb) const {\n  if (fwrite(&size_reserved_, sizeof(size_reserved_), 1, f) != 1) {\n    return false;\n  }\n  if (fwrite(&size_used_, sizeof(size_used_), 1, f) != 1) {\n    return false;\n  }\n  if (cb != nullptr) {\n    for (int i = 0; i < size_used_; ++i) {\n      if (!cb(f, data_[i])) {\n        return false;\n      }\n    }\n  } else {\n    if (fwrite(data_, sizeof(T), size_used_, f) != unsigned_size()) {\n      return false;\n    }\n  }\n  return true;\n}\n\ntemplate <typename T>\nbool GenericVector<T>::read(TFile *f, const std::function<bool(TFile *, T *)> &cb) {\n  int32_t reserved;\n  if (f->FReadEndian(&reserved, sizeof(reserved), 1) != 1) {\n    return false;\n  }\n  reserve(reserved);\n  if (f->FReadEndian(&size_used_, sizeof(size_used_), 1) != 1) {\n    return false;\n  }\n  if (cb != nullptr) {\n    for (int i = 0; i < size_used_; ++i) {\n      if (!cb(f, data_ + i)) {\n        return false;\n      }\n    }\n  } else {\n    if (f->FReadEndian(data_, sizeof(T), size_used_) != static_cast<unsigned>(size_used_)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Writes a vector of simple types to the given file. Assumes that bitwise\n// read/write of T will work. Returns false in case of error.\ntemplate <typename T>\nbool GenericVector<T>::Serialize(FILE *fp) const {\n  if (fwrite(&size_used_, sizeof(size_used_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(data_, sizeof(*data_), size_used_, fp) != unsigned_size()) {\n    return false;\n  }\n  return true;\n}\ntemplate <typename T>\nbool GenericVector<T>::Serialize(TFile *fp) const {\n  if (fp->FWrite(&size_used_, sizeof(size_used_), 1) != 1) {\n    return false;\n  }\n  if (fp->FWrite(data_, sizeof(*data_), size_used_) != size_used_) {\n    return false;\n  }\n  return true;\n}\n\n// Reads a vector of simple types from the given file. 
Assumes that bitwise\n// read/write will work with ReverseN according to sizeof(T).\n// Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\ntemplate <typename T>\nbool GenericVector<T>::DeSerialize(bool swap, FILE *fp) {\n  uint32_t reserved;\n  if (fread(&reserved, sizeof(reserved), 1, fp) != 1) {\n    return false;\n  }\n  if (swap) {\n    Reverse32(&reserved);\n  }\n  // Arbitrarily limit the number of elements to protect against bad data.\n  assert(reserved <= UINT16_MAX);\n  if (reserved > UINT16_MAX) {\n    return false;\n  }\n  reserve(reserved);\n  size_used_ = reserved;\n  if (fread(data_, sizeof(T), size_used_, fp) != unsigned_size()) {\n    return false;\n  }\n  if (swap) {\n    for (int i = 0; i < size_used_; ++i) {\n      ReverseN(&data_[i], sizeof(data_[i]));\n    }\n  }\n  return true;\n}\ntemplate <typename T>\nbool GenericVector<T>::DeSerialize(TFile *fp) {\n  uint32_t reserved;\n  if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) {\n    return false;\n  }\n  // Arbitrarily limit the number of elements to protect against bad data.\n  const uint32_t limit = 50000000;\n  assert(reserved <= limit);\n  if (reserved > limit) {\n    return false;\n  }\n  reserve(reserved);\n  size_used_ = reserved;\n  return fp->FReadEndian(data_, sizeof(T), size_used_) == size_used_;\n}\n\n// Writes a vector of classes to the given file. Assumes the existence of\n// bool T::Serialize(FILE* fp) const that returns false in case of error.\n// Returns false in case of error.\ntemplate <typename T>\nbool GenericVector<T>::SerializeClasses(FILE *fp) const {\n  if (fwrite(&size_used_, sizeof(size_used_), 1, fp) != 1) {\n    return false;\n  }\n  for (int i = 0; i < size_used_; ++i) {\n    if (!data_[i].Serialize(fp)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Reads a vector of classes from the given file. 
Assumes the existence of\n// bool T::Deserialize(bool swap, FILE* fp) that returns false in case of\n// error. Also needs T::T() and T::T(constT&), as init_to_size is used in\n// this function. Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\ntemplate <typename T>\nbool GenericVector<T>::DeSerializeClasses(TFile *fp) {\n  int32_t reserved;\n  if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) {\n    return false;\n  }\n  T empty;\n  init_to_size(reserved, empty);\n  for (int i = 0; i < reserved; ++i) {\n    if (!data_[i].DeSerialize(fp)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// This method clear the current object, then, does a shallow copy of\n// its argument, and finally invalidates its argument.\ntemplate <typename T>\nvoid GenericVector<T>::move(GenericVector<T> *from) {\n  this->clear();\n  this->data_ = from->data_;\n  this->size_reserved_ = from->size_reserved_;\n  this->size_used_ = from->size_used_;\n  this->clear_cb_ = from->clear_cb_;\n  from->data_ = nullptr;\n  from->clear_cb_ = nullptr;\n  from->size_used_ = 0;\n  from->size_reserved_ = 0;\n}\n\ntemplate <typename T>\nvoid GenericVector<T>::sort() {\n  sort(&sort_cmp<T>);\n}\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_GENERICVECTOR_H_\n"
  },
  {
    "path": "src/ccutil/helpers.h",
    "content": "/******************************************************************************\n *\n * File:         helpers.h\n * Description:  General utility functions\n * Author:       Daria Antonova\n *\n * (c) Copyright 2009, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef TESSERACT_CCUTIL_HELPERS_H_\n#define TESSERACT_CCUTIL_HELPERS_H_\n\n#include <cassert>\n#include <climits> // for INT_MIN, INT_MAX\n#include <cmath> // std::isfinite\n#include <cstdio>\n#include <algorithm>  // for std::find\n#include <string>\n#include <vector>\n\n#include \"serialis.h\"\n\nnamespace tesseract {\n\n// Copy a std::string to a newly allocated char *.\n// TODO: Remove this function once the related code has been converted\n// to use std::string.\ninline char *copy_string(const std::string &from) {\n  auto length = from.length();\n  char *target_string = new char[length + 1];\n  from.copy(target_string, length);\n  target_string[length] = '\\0';\n  return target_string;\n}\n\ntemplate <class T>\ninline bool contains(const std::vector<T> &data, const T &value) {\n  return std::find(data.begin(), data.end(), value) != data.end();\n}\n\ninline const std::vector<std::string> split(const std::string &s, char c) {\n  std::string buff;\n  std::vector<std::string> v;\n  for (auto n : s) {\n    if (n != c) {\n      buff += n;\n    } else if (n == c && !buff.empty()) {\n 
     v.push_back(buff);\n      buff.clear();\n    }\n  }\n  if (!buff.empty()) {\n    v.push_back(buff);\n  }\n  return v;\n}\n\n// A simple linear congruential random number generator,\n// using Knuth's constants from:\n// http://en.wikipedia.org/wiki/Linear_congruential_generator.\nclass TRand {\npublic:\n  TRand() = default;\n  // Sets the seed to the given value.\n  void set_seed(uint64_t seed) {\n    seed_ = seed;\n  }\n\n  // Returns an integer in the range 0 to INT32_MAX.\n  int32_t IntRand() {\n    Iterate();\n    return seed_ >> 33;\n  }\n  // Returns a floating point value in the range [-range, range].\n  double SignedRand(double range) {\n    return range * 2.0 * IntRand() / INT32_MAX - range;\n  }\n  // Returns a floating point value in the range [0, range].\n  double UnsignedRand(double range) {\n    return range * IntRand() / INT32_MAX;\n  }\n\nprivate:\n  // Steps the generator to the next value.\n  void Iterate() {\n    seed_ *= 6364136223846793005ULL;\n    seed_ += 1442695040888963407ULL;\n  }\n\n  // The current value of the seed.\n  uint64_t seed_{1};\n};\n\n// Remove newline (if any) at the end of the string.\ninline void chomp_string(char *str) {\n  int last_index = static_cast<int>(strlen(str)) - 1;\n  while (last_index >= 0 && (str[last_index] == '\\n' || str[last_index] == '\\r')) {\n    str[last_index--] = '\\0';\n  }\n}\n\n// return the smallest multiple of block_size greater than or equal to n.\ninline int RoundUp(int n, int block_size) {\n  return block_size * ((n + block_size - 1) / block_size);\n}\n\n// Clip a numeric value to the interval [lower_bound, upper_bound].\ntemplate <typename T>\ninline T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound) {\n  if (x < lower_bound) {\n    return lower_bound;\n  }\n  if (x > upper_bound) {\n    return upper_bound;\n  }\n  return x;\n}\n\n// Extend the range [lower_bound, upper_bound] to include x.\ntemplate <typename T1, typename T2>\ninline void UpdateRange(const T1 &x, T2 
*lower_bound, T2 *upper_bound) {\n  if (x < *lower_bound) {\n    *lower_bound = x;\n  }\n  if (x > *upper_bound) {\n    *upper_bound = x;\n  }\n}\n\n// Decrease lower_bound to be <= x_lo AND increase upper_bound to be >= x_hi.\ntemplate <typename T1, typename T2>\ninline void UpdateRange(const T1 &x_lo, const T1 &x_hi, T2 *lower_bound, T2 *upper_bound) {\n  if (x_lo < *lower_bound) {\n    *lower_bound = x_lo;\n  }\n  if (x_hi > *upper_bound) {\n    *upper_bound = x_hi;\n  }\n}\n\n// Intersect the range [*lower2, *upper2] with the range [lower1, upper1],\n// putting the result back in [*lower2, *upper2].\n// If non-intersecting ranges are given, we end up with *lower2 > *upper2.\ntemplate <typename T>\ninline void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2) {\n  if (lower1 > *lower2) {\n    *lower2 = lower1;\n  }\n  if (upper1 < *upper2) {\n    *upper2 = upper1;\n  }\n}\n\n// Proper modulo arithmetic operator. Returns a mod b that works for -ve a.\n// For any integer a and positive b, returns r : 0<=r<b and a=n*b + r for\n// some integer n.\ninline int Modulo(int a, int b) {\n  return (a % b + b) % b;\n}\n\n// Integer division operator with rounding that works for negative input.\n// Returns a divided by b, rounded to the nearest integer, without double\n// counting at 0. With simple rounding 1/3 = 0, 0/3 = 0 -1/3 = 0, -2/3 = 0,\n// -3/3 = 0 and -4/3 = -1.\n// I want 1/3 = 0, 0/3 = 0, -1/3 = 0, -2/3 = -1, -3/3 = -1 and -4/3 = -1.\ninline int DivRounded(int a, int b) {\n  if (b < 0) {\n    return -DivRounded(a, -b);\n  }\n  return a >= 0 ? (a + b / 2) / b : (a - b / 2) / b;\n}\n\n// Return a double cast to int with rounding.\ninline int IntCastRounded(double x) {\n  assert(std::isfinite(x));\n  assert(x < INT_MAX);\n  assert(x > INT_MIN);\n  return x >= 0.0 ? 
static_cast<int>(x + 0.5) : -static_cast<int>(-x + 0.5);\n}\n\n// Return a float cast to int with rounding.\ninline int IntCastRounded(float x) {\n  assert(std::isfinite(x));\n  return x >= 0.0F ? static_cast<int>(x + 0.5F) : -static_cast<int>(-x + 0.5F);\n}\n\n// Reverse the order of bytes in a n byte quantity for big/little-endian switch.\ninline void ReverseN(void *ptr, int num_bytes) {\n  assert(num_bytes == 1 || num_bytes == 2 || num_bytes == 4 || num_bytes == 8);\n  char *cptr = static_cast<char *>(ptr);\n  int halfsize = num_bytes / 2;\n  for (int i = 0; i < halfsize; ++i) {\n    char tmp = cptr[i];\n    cptr[i] = cptr[num_bytes - 1 - i];\n    cptr[num_bytes - 1 - i] = tmp;\n  }\n}\n\n// Reverse the order of bytes in a 32 bit quantity for big/little-endian switch.\ninline void Reverse32(void *ptr) {\n  ReverseN(ptr, 4);\n}\n\n// Reads a vector of simple types from the given file. Assumes that bitwise\n// read/write will work with ReverseN according to sizeof(T).\n// Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\ntemplate <typename T>\nbool DeSerialize(bool swap, FILE *fp, std::vector<T> &data) {\n  uint32_t size;\n  if (fread(&size, sizeof(size), 1, fp) != 1) {\n    return false;\n  }\n  if (swap) {\n    Reverse32(&size);\n  }\n  // Arbitrarily limit the number of elements to protect against bad data.\n  assert(size <= UINT16_MAX);\n  if (size > UINT16_MAX) {\n    return false;\n  }\n  // TODO: optimize.\n  data.resize(size);\n  if (size > 0) {\n    if (fread(&data[0], sizeof(T), size, fp) != size) {\n      return false;\n    }\n    if (swap) {\n      for (uint32_t i = 0; i < size; ++i) {\n        ReverseN(&data[i], sizeof(T));\n      }\n    }\n  }\n  return true;\n}\n\n// Writes a vector of simple types to the given file. Assumes that bitwise\n// read/write of T will work. 
Returns false in case of error.\ntemplate <typename T>\nbool Serialize(FILE *fp, const std::vector<T> &data) {\n  uint32_t size = data.size();\n  if (fwrite(&size, sizeof(size), 1, fp) != 1) {\n    return false;\n  } else if constexpr (std::is_class<T>::value) {\n    // Serialize a tesseract class.\n    for (auto &item : data) {\n      if (!item.Serialize(fp)) {\n        return false;\n      }\n    }\n  } else if constexpr (std::is_pointer<T>::value) {\n    // Serialize pointers.\n    for (auto &item : data) {\n      uint8_t non_null = (item != nullptr);\n      if (!Serialize(fp, &non_null)) {\n        return false;\n      }\n      if (non_null) {\n        if (!item->Serialize(fp)) {\n          return false;\n        }\n      }\n    }\n  } else if (size > 0) {\n    if (fwrite(&data[0], sizeof(T), size, fp) != size) {\n      return false;\n    }\n  }\n  return true;\n}\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_HELPERS_H_\n"
  },
  {
    "path": "src/ccutil/host.h",
    "content": "/******************************************************************************\n **  Filename:       host.h\n **  Purpose:        This is the system independent typedefs and defines\n **  Author:         MN, JG, MD\n **\n **  (c) Copyright Hewlett-Packard Company, 1988-1996.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n */\n\n#ifndef TESSERACT_CCUTIL_HOST_H_\n#define TESSERACT_CCUTIL_HOST_H_\n\n#include <tesseract/export.h>\n\n#include <climits>\n#include <limits>\n\n/* _WIN32 */\n#ifdef _WIN32\n#  ifndef NOMINMAX\n#    define NOMINMAX\n#  endif /* NOMINMAX */\n#  ifndef WIN32_LEAN_AND_MEAN\n#    define WIN32_LEAN_AND_MEAN\n#  endif\n#  include <windows.h>\n#  undef min\n#  undef max\n#endif // _WIN32\n\n#ifndef _WIN32\n#  ifndef PATH_MAX\n#    define MAX_PATH 4096\n#  else\n#    define MAX_PATH PATH_MAX\n#  endif\n#endif\n\nnamespace tesseract {\n\n// Return true if x is within tolerance of y\ntemplate <class T>\nbool NearlyEqual(T x, T y, T tolerance) {\n  T diff = x - y;\n  return diff <= tolerance && -diff <= tolerance;\n}\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_HOST_H_\n"
  },
  {
    "path": "src/ccutil/indexmapbidi.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        indexmapbidi.cpp\n// Description: Bi-directional mapping between a sparse and compact space.\n// Author:      rays@google.com (Ray Smith)\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"helpers.h\"\n#include \"indexmapbidi.h\"\n#include \"serialis.h\"\n\nnamespace tesseract {\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nIndexMap::~IndexMap() = default;\n\n// SparseToCompact takes a sparse index to an index in the compact space.\n// Uses a binary search to find the result. For faster speed use\n// IndexMapBiDi, but that takes more memory.\nint IndexMap::SparseToCompact(int sparse_index) const {\n  auto pos = std::upper_bound(compact_map_.begin(), compact_map_.end(), sparse_index);\n  if (pos > compact_map_.begin()) {\n    --pos;\n  }\n  auto result = pos - compact_map_.begin();\n  return compact_map_[result] == sparse_index ? 
result : -1;\n}\n\n// Copy from the input.\nvoid IndexMap::CopyFrom(const IndexMap &src) {\n  sparse_size_ = src.sparse_size_;\n  compact_map_ = src.compact_map_;\n}\nvoid IndexMap::CopyFrom(const IndexMapBiDi &src) {\n  sparse_size_ = src.SparseSize();\n  compact_map_ = src.compact_map_;\n}\n\n// Writes to the given file. Returns false in case of error.\nbool IndexMap::Serialize(FILE *fp) const {\n  return tesseract::Serialize(fp, &sparse_size_) && tesseract::Serialize(fp, compact_map_);\n}\n\n// Reads from the given file. Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool IndexMap::DeSerialize(bool swap, FILE *fp) {\n  uint32_t sparse_size;\n  if (!tesseract::DeSerialize(fp, &sparse_size)) {\n    return false;\n  }\n  if (swap) {\n    ReverseN(&sparse_size, sizeof(sparse_size));\n  }\n  // Arbitrarily limit the number of elements to protect against bad data.\n  if (sparse_size > UINT16_MAX) {\n    return false;\n  }\n  sparse_size_ = sparse_size;\n  return tesseract::DeSerialize(swap, fp, compact_map_);\n}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nIndexMapBiDi::~IndexMapBiDi() = default;\n\n// Top-level init function in a single call to initialize a map to select\n// a single contiguous subrange [start, end) of the sparse space to be mapped\n// 1 to 1 to the compact space, with all other elements of the sparse space\n// left unmapped.\n// No need to call Setup after this.\nvoid IndexMapBiDi::InitAndSetupRange(int sparse_size, int start, int end) {\n  Init(sparse_size, false);\n  for (int i = start; i < end; ++i) {\n    SetMap(i, true);\n  }\n  Setup();\n}\n\n// Initializes just the sparse_map_ to the given size with either all\n// forward indices mapped (all_mapped = true) or none (all_mapped = false).\n// Call Setup immediately after, or make calls to SetMap first to adjust the\n// mapping and then call Setup 
before using the map.\nvoid IndexMapBiDi::Init(int size, bool all_mapped) {\n  if (!all_mapped) {\n    sparse_map_.clear();\n  }\n  sparse_map_.resize(size, -1);\n  if (all_mapped) {\n    for (int i = 0; i < size; ++i) {\n      sparse_map_[i] = i;\n    }\n  }\n}\n\n// Sets a given index in the sparse_map_ to be mapped or not.\nvoid IndexMapBiDi::SetMap(int sparse_index, bool mapped) {\n  sparse_map_[sparse_index] = mapped ? 0 : -1;\n}\n\n// Sets up the sparse_map_ and compact_map_ properly after Init and\n// some calls to SetMap. Assumes an ordered 1-1 map from set indices\n// in the forward map to the compact space.\nvoid IndexMapBiDi::Setup() {\n  int compact_size = 0;\n  for (int &i : sparse_map_) {\n    if (i >= 0) {\n      i = compact_size++;\n    }\n  }\n  compact_map_.clear();\n  compact_map_.resize(compact_size, -1);\n  for (size_t i = 0; i < sparse_map_.size(); ++i) {\n    if (sparse_map_[i] >= 0) {\n      compact_map_[sparse_map_[i]] = i;\n    }\n  }\n  sparse_size_ = sparse_map_.size();\n}\n\n// Copy from the input.\nvoid IndexMapBiDi::CopyFrom(const IndexMapBiDi &src) {\n  sparse_map_ = src.sparse_map_;\n  compact_map_ = src.compact_map_;\n  sparse_size_ = sparse_map_.size();\n}\n\n// Merges the two compact space indices. 
May be called many times, but\n// the merges must be concluded by a call to CompleteMerges.\n// Returns true if a merge was actually performed.\nbool IndexMapBiDi::Merge(int compact_index1, int compact_index2) {\n  // Find the current master index for index1 and index2.\n  compact_index1 = MasterCompactIndex(compact_index1);\n  compact_index2 = MasterCompactIndex(compact_index2);\n  // Be sure that index1 < index2.\n  if (compact_index1 > compact_index2) {\n    int tmp = compact_index1;\n    compact_index1 = compact_index2;\n    compact_index2 = tmp;\n  } else if (compact_index1 == compact_index2) {\n    return false;\n  }\n  // To save iterating over all sparse_map_ entries, simply make the master\n  // entry for index2 point to index1.\n  // This leaves behind a potential chain of parents that needs to be chased,\n  // as above.\n  sparse_map_[compact_map_[compact_index2]] = compact_index1;\n  if (compact_index1 >= 0) {\n    compact_map_[compact_index2] = compact_map_[compact_index1];\n  }\n  return true;\n}\n\n// Completes one or more Merge operations by further compacting the\n// compact space. 
Unused compact space indices are removed, and the used\n// ones above shuffled down to fill the gaps.\n// Example:\n// Input sparse_map_: (x indicates -1)\n// x x 0 x 2 x x 4 x 0 x 2 x\n// Output sparse_map_:\n// x x 0 x 1 x x 2 x 0 x 1 x\n// Output compact_map_:\n// 2 4 7.\nvoid IndexMapBiDi::CompleteMerges() {\n  // Ensure each sparse_map_entry contains a master compact_map_ index.\n  int compact_size = 0;\n  for (int &i : sparse_map_) {\n    int compact_index = MasterCompactIndex(i);\n    i = compact_index;\n    if (compact_index >= compact_size) {\n      compact_size = compact_index + 1;\n    }\n  }\n  // Re-generate the compact_map leaving holes for unused indices.\n  compact_map_.clear();\n  compact_map_.resize(compact_size, -1);\n  for (size_t i = 0; i < sparse_map_.size(); ++i) {\n    if (sparse_map_[i] >= 0) {\n      if (compact_map_[sparse_map_[i]] == -1) {\n        compact_map_[sparse_map_[i]] = i;\n      }\n    }\n  }\n  // Compact the compact_map, leaving tmp_compact_map saying where each\n  // index went to in the compacted map.\n  std::vector<int32_t> tmp_compact_map(compact_size, -1);\n  compact_size = 0;\n  for (size_t i = 0; i < compact_map_.size(); ++i) {\n    if (compact_map_[i] >= 0) {\n      tmp_compact_map[i] = compact_size;\n      compact_map_[compact_size++] = compact_map_[i];\n    }\n  }\n  compact_map_.resize(compact_size);\n  // Now modify the entries in the sparse map to point to the new locations.\n  for (int &i : sparse_map_) {\n    if (i >= 0) {\n      i = tmp_compact_map[i];\n    }\n  }\n}\n\n// Writes to the given file. Returns false in case of error.\nbool IndexMapBiDi::Serialize(FILE *fp) const {\n  if (!IndexMap::Serialize(fp)) {\n    return false;\n  }\n  // Make a vector containing the rest of the map. 
If the map is many-to-one\n  // then each additional sparse entry needs to be stored.\n  // Normally we store only the compact map to save space.\n  std::vector<int32_t> remaining_pairs;\n  for (unsigned i = 0; i < sparse_map_.size(); ++i) {\n    if (sparse_map_[i] >= 0 && static_cast<unsigned>(compact_map_[sparse_map_[i]]) != i) {\n      remaining_pairs.push_back(i);\n      remaining_pairs.push_back(sparse_map_[i]);\n    }\n  }\n  return tesseract::Serialize(fp, remaining_pairs);\n}\n\n// Reads from the given file. Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool IndexMapBiDi::DeSerialize(bool swap, FILE *fp) {\n  if (!IndexMap::DeSerialize(swap, fp)) {\n    return false;\n  }\n  std::vector<int32_t> remaining_pairs;\n  if (!tesseract::DeSerialize(swap, fp, remaining_pairs)) {\n    return false;\n  }\n  sparse_map_.clear();\n  sparse_map_.resize(sparse_size_, -1);\n  for (unsigned i = 0; i < compact_map_.size(); ++i) {\n    sparse_map_[compact_map_[i]] = i;\n  }\n  for (size_t i = 0; i < remaining_pairs.size(); ++i) {\n    int sparse_index = remaining_pairs[i++];\n    sparse_map_[sparse_index] = remaining_pairs[i];\n  }\n  return true;\n}\n\n// Bulk calls to SparseToCompact.\n// Maps the given array of sparse indices to an array of compact indices.\n// Assumes the input is sorted. 
The output indices are sorted and uniqued.\n// Return value is the number of \"missed\" features, being features that\n// don't map to the compact feature space.\nint IndexMapBiDi::MapFeatures(const std::vector<int> &sparse, std::vector<int> *compact) const {\n  compact->clear();\n  int num_features = sparse.size();\n  int missed_features = 0;\n  int prev_good_feature = -1;\n  for (int f = 0; f < num_features; ++f) {\n    int feature = sparse_map_[sparse[f]];\n    if (feature >= 0) {\n      if (feature != prev_good_feature) {\n        compact->push_back(feature);\n        prev_good_feature = feature;\n      }\n    } else {\n      ++missed_features;\n    }\n  }\n  return missed_features;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccutil/indexmapbidi.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        indexmapbidi.h\n// Description: Bi-directional mapping between a sparse and compact space.\n// Author:      rays@google.com (Ray Smith)\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_INDEXMAPBIDI_H_\n#define TESSERACT_CCUTIL_INDEXMAPBIDI_H_\n\n#include <tesseract/export.h> // for TESS_API\n\n#include <cstdint>  // for int32_t\n#include <cstdio>\n#include <vector>\n\nnamespace tesseract {\n\nclass IndexMapBiDi;\n\n// Bidirectional one-to-one mapping between a sparse and a compact discrete\n// space. Many entries in the sparse space are unmapped, but those that are\n// mapped have a 1-1 mapping to (and from) the compact space, where all\n// values are used. 
This is useful for forming subsets of larger collections,\n// such as subsets of character sets, or subsets of binary feature spaces.\n//\n// This base class provides basic functionality with binary search for the\n// SparseToCompact mapping to save memory.\n// For a faster inverse mapping, or to allow a many-to-one mapping, use\n// IndexMapBiDi below.\n// NOTE: there are currently no methods to setup an IndexMap on its own!\n// It must be initialized by copying from an IndexMapBiDi or by DeSerialize.\nclass TESS_API IndexMap {\npublic:\n  virtual ~IndexMap();\n\n  // SparseToCompact takes a sparse index to an index in the compact space.\n  // Uses a binary search to find the result. For faster speed use\n  // IndexMapBiDi, but that takes more memory.\n  virtual int SparseToCompact(int sparse_index) const;\n\n  // CompactToSparse takes a compact index to the corresponding index in the\n  // sparse space.\n  int CompactToSparse(int compact_index) const {\n    return compact_map_[compact_index];\n  }\n  // The size of the sparse space.\n  virtual int SparseSize() const {\n    return sparse_size_;\n  }\n  // The size of the compact space.\n  int CompactSize() const {\n    return compact_map_.size();\n  }\n\n  // Copy from the input.\n  void CopyFrom(const IndexMap &src);\n  void CopyFrom(const IndexMapBiDi &src);\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp);\n\nprotected:\n  // The sparse space covers integers in the range [0, sparse_size_-1].\n  int32_t sparse_size_;\n  // The compact space covers integers in the range [0, compact_map_.size()-1].\n  // Each element contains the corresponding sparse index.\n  std::vector<int32_t> compact_map_;\n};\n\n// Bidirectional many-to-one mapping between a sparse and a compact discrete\n// space. 
As with IndexMap, many entries may be unmapped, but unlike IndexMap,\n// of those that are, many may be mapped to the same compact index.\n// If the map is many-to-one, it is not possible to directly obtain all the\n// sparse indices that map to a single compact index.\n// This map is time- rather than space-efficient. It stores the entire sparse\n// space.\n// IndexMapBiDi may be initialized in one of 3 ways:\n// 1. Init(size, true);\n//    Setup();\n//    Sets a complete 1:1 mapping with no unmapped elements.\n// 2. Init(size, false);\n//    for ... SetMap(index, true);\n//    Setup();\n//    Specifies precisely which sparse indices are mapped. The mapping is 1:1.\n// 3. Either of the above, followed by:\n//    for ... Merge(index1, index2);\n//    CompleteMerges();\n//    Allows a many-to-one mapping by merging compact space indices.\nclass TESS_API IndexMapBiDi : public IndexMap {\npublic:\n  ~IndexMapBiDi() override;\n\n  // Top-level init function in a single call to initialize a map to select\n  // a single contiguous subrange [start, end) of the sparse space to be mapped\n  // 1 to 1 to the compact space, with all other elements of the sparse space\n  // left unmapped.\n  // No need to call Setup after this.\n  void InitAndSetupRange(int sparse_size, int start, int end);\n\n  // Initializes just the sparse_map_ to the given size with either all\n  // forward indices mapped (all_mapped = true) or none (all_mapped = false).\n  // Call Setup immediately after, or make calls to SetMap first to adjust the\n  // mapping and then call Setup before using the map.\n  void Init(int size, bool all_mapped);\n  // Sets a given index in the sparse_map_ to be mapped or not.\n  void SetMap(int sparse_index, bool mapped);\n  // Sets up the sparse_map_ and compact_map_ properly after Init and\n  // some calls to SetMap. Assumes an ordered 1-1 map from set indices\n  // in the sparse space to the compact space.\n  void Setup();\n\n  // Merges the two compact space indices. 
May be called many times, but\n  // the merges must be concluded by a call to CompleteMerges.\n  // Returns true if a merge was actually performed.\n  bool Merge(int compact_index1, int compact_index2);\n  // Returns true if the given compact index has been deleted.\n  bool IsCompactDeleted(int index) const {\n    return MasterCompactIndex(index) < 0;\n  }\n  // Completes one or more Merge operations by further compacting the\n  // compact space.\n  void CompleteMerges();\n\n  // SparseToCompact takes a sparse index to an index in the compact space.\n  int SparseToCompact(int sparse_index) const override {\n    return sparse_map_[sparse_index];\n  }\n  // The size of the sparse space.\n  int SparseSize() const override {\n    return sparse_map_.size();\n  }\n\n  // Copy from the input.\n  void CopyFrom(const IndexMapBiDi &src);\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp);\n\n  // Bulk calls to SparseToCompact.\n  // Maps the given array of sparse indices to an array of compact indices.\n  // Assumes the input is sorted. 
The output indices are sorted and uniqued.\n  // Return value is the number of \"missed\" features, being features that\n  // don't map to the compact feature space.\n  int MapFeatures(const std::vector<int> &sparse, std::vector<int> *compact) const;\n\nprivate:\n  // Returns the master compact index for a given compact index.\n  // During a multiple merge operation, several compact indices may be\n  // combined, so we need to be able to find the master of all.\n  int MasterCompactIndex(int compact_index) const {\n    while (compact_index >= 0 && sparse_map_[compact_map_[compact_index]] != compact_index) {\n      compact_index = sparse_map_[compact_map_[compact_index]];\n    }\n    return compact_index;\n  }\n\n  // Direct look-up of the compact index for each element in sparse space.\n  std::vector<int32_t> sparse_map_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCUTIL_INDEXMAPBIDI_H_\n"
  },
  {
    "path": "src/ccutil/kdpair.h",
    "content": "// Copyright 2012 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        kdpair.h\n// Description: Template pair class like STL pair but geared towards\n//              the Key+Data design pattern in which some data needs\n//              to be sorted or kept in a heap sorted on some separate key.\n// Author:      Ray Smith.\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_KDPAIR_H_\n#define TESSERACT_CCUTIL_KDPAIR_H_\n\n#include <vector>\n\nnamespace tesseract {\n\n// A useful base struct to facilitate the common operation of sorting a vector\n// of simple or smart-pointer data using a separate key. Similar to STL pair.\ntemplate <typename Key, typename Data>\nstruct KDPair {\n  KDPair() = default;\n  KDPair(Key k, Data d) : data_(d), key_(k) {}\n\n  int operator==(const KDPair<Key, Data> &other) const {\n    return key_ == other.key_;\n  }\n\n  Data &data() {\n    return data_;\n  }\n  const Data &data() const {\n    return data_;\n  }\n  Key &key() {\n    return key_;\n  }\n  const Key &key() const {\n    return key_;\n  }\n\n  // WARNING! Keep data as the first element! 
KDPairInc and KDPairDec depend\n  // on the order of these elements so they can downcast pointers appropriately\n  // for use by GenericHeap::Reshuffle.\n  Data data_;\n  Key key_;\n};\n// Specialization of KDPair to provide operator< for sorting in increasing order\n// and recasting of data pointers for use with DoublePtr.\ntemplate <typename Key, typename Data>\nstruct KDPairInc : public KDPair<Key, Data> {\n  KDPairInc() = default;\n  KDPairInc(Key k, Data d) : KDPair<Key, Data>(k, d) {}\n  // Operator< facilitates sorting in increasing order.\n  int operator<(const KDPairInc<Key, Data> &other) const {\n    return this->key() < other.key();\n  }\n  // Returns the input Data pointer recast to a KDPairInc pointer.\n  // Just casts a pointer to the first element to a pointer to the whole struct.\n  static KDPairInc *RecastDataPointer(Data *data_ptr) {\n    return reinterpret_cast<KDPairInc *>(data_ptr);\n  }\n};\n// Specialization of KDPair to provide operator< for sorting in decreasing order\n// and recasting of data pointers for use with DoublePtr.\ntemplate <typename Key, typename Data>\nstruct KDPairDec : public KDPair<Key, Data> {\n  KDPairDec() = default;\n  KDPairDec(Key k, Data d) : KDPair<Key, Data>(k, d) {}\n  // Operator< facilitates sorting in decreasing order by using operator> on\n  // the key values.\n  int operator<(const KDPairDec<Key, Data> &other) const {\n    return this->key() > other.key();\n  }\n  // Returns the input Data pointer recast to a KDPairDec pointer.\n  // Just casts a pointer to the first element to a pointer to the whole struct.\n  static KDPairDec *RecastDataPointer(Data *data_ptr) {\n    return reinterpret_cast<KDPairDec *>(data_ptr);\n  }\n};\n\n// A useful base class to facilitate the common operation of sorting a vector\n// of owned pointer data using a separate key. 
This class owns its data pointer,\n// deleting it when it has finished with it, and providing copy constructor and\n// operator= that have move semantics so that the data does not get copied and\n// only a single instance of KDPtrPair holds a specific data pointer.\ntemplate <typename Key, typename Data>\nclass KDPtrPair {\npublic:\n  KDPtrPair() : data_(nullptr) {}\n  KDPtrPair(Key k, Data *d) : data_(d), key_(k) {}\n  // Copy constructor steals the pointer from src and nulls it in src, thereby\n  // moving the (single) ownership of the data.\n  KDPtrPair(const KDPtrPair &src) : data_(src.data_), key_(src.key_) {\n    ((KDPtrPair &)src).data_ = nullptr;\n  }\n  // Destructor deletes data, assuming it is the sole owner.\n  ~KDPtrPair() {\n    delete this->data_;\n    this->data_ = nullptr;\n  }\n  // Operator= steals the pointer from src and nulls it in src, thereby\n  // moving the (single) ownership of the data.\n  void operator=(const KDPtrPair &src) {\n    delete this->data_;\n    this->data_ = src.data_;\n    ((KDPtrPair &)src).data_ = nullptr;\n    this->key_ = src.key_;\n  }\n\n  int operator==(const KDPtrPair<Key, Data> &other) const {\n    return key_ == other.key_;\n  }\n\n  // Accessors.\n  const Key &key() const {\n    return key_;\n  }\n  void set_key(const Key &new_key) {\n    key_ = new_key;\n  }\n  const Data *data() const {\n    return data_;\n  }\n  // Sets the data pointer, taking ownership of the data.\n  void set_data(Data *new_data) {\n    delete data_;\n    data_ = new_data;\n  }\n  // Relinquishes ownership of the data pointer (setting it to nullptr).\n  Data *extract_data() {\n    Data *result = data_;\n    data_ = nullptr;\n    return result;\n  }\n\nprivate:\n  // Data members are private to keep deletion of data_ encapsulated.\n  Data *data_;\n  Key key_;\n};\n// Specialization of KDPtrPair to provide operator< for sorting in increasing\n// order.\ntemplate <typename Key, typename Data>\nstruct KDPtrPairInc : public KDPtrPair<Key, Data> 
{\n  // Since we are doing non-standard stuff we have to duplicate *all* the\n  // constructors and operator=.\n  KDPtrPairInc() : KDPtrPair<Key, Data>() {}\n  KDPtrPairInc(Key k, Data *d) : KDPtrPair<Key, Data>(k, d) {}\n  KDPtrPairInc(const KDPtrPairInc &src) : KDPtrPair<Key, Data>(src) {}\n  void operator=(const KDPtrPairInc &src) {\n    KDPtrPair<Key, Data>::operator=(src);\n  }\n  // Operator< facilitates sorting in increasing order.\n  int operator<(const KDPtrPairInc<Key, Data> &other) const {\n    return this->key() < other.key();\n  }\n};\n// Specialization of KDPtrPair to provide operator< for sorting in decreasing\n// order.\ntemplate <typename Key, typename Data>\nstruct KDPtrPairDec : public KDPtrPair<Key, Data> {\n  // Since we are doing non-standard stuff we have to duplicate *all* the\n  // constructors and operator=.\n  KDPtrPairDec() : KDPtrPair<Key, Data>() {}\n  KDPtrPairDec(Key k, Data *d) : KDPtrPair<Key, Data>(k, d) {}\n  KDPtrPairDec(const KDPtrPairDec &src) : KDPtrPair<Key, Data>(src) {}\n  void operator=(const KDPtrPairDec &src) {\n    KDPtrPair<Key, Data>::operator=(src);\n  }\n  // Operator< facilitates sorting in decreasing order by using operator> on\n  // the key values.\n  int operator<(const KDPtrPairDec<Key, Data> &other) const {\n    return this->key() > other.key();\n  }\n};\n\n// Specialization for a pair of ints in increasing order.\nusing IntKDPair = KDPairInc<int, int>;\n\n// Vector of IntKDPair.\nclass KDVector : public std::vector<IntKDPair> {\n  // TODO(rays) Add some code to manipulate a KDVector. For now there\n  // is nothing and this class is effectively a specialization typedef.\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_KDPAIR_H_\n"
  },
  {
    "path": "src/ccutil/lsterr.h",
    "content": "/**********************************************************************\n * File:        lsterr.h  (Formerly listerr.h)\n * Description: Errors shared by list modules\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1990, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCUTIL_LSTERR_H_\n#define TESSERACT_CCUTIL_LSTERR_H_\n\n#include \"errcode.h\" //must be last include\n\nnamespace tesseract {\n\n#ifndef NDEBUG\n\nconstexpr ERRCODE NO_LIST(\"Iterator not set to a list\");\nconstexpr ERRCODE NULL_DATA(\"List would have returned a nullptr data pointer\");\nconstexpr ERRCODE NULL_CURRENT(\"List current position is nullptr\");\nconstexpr ERRCODE NULL_NEXT(\"Next element on the list is nullptr\");\nconstexpr ERRCODE NULL_PREV(\"Previous element on the list is nullptr\");\nconstexpr ERRCODE EMPTY_LIST(\"List is empty\");\nconstexpr ERRCODE BAD_PARAMETER(\"List parameter error\");\nconstexpr ERRCODE STILL_LINKED(\"Attempting to add an element with non nullptr links, to a list\");\n\n#endif // !NDEBUG\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_LSTERR_H_\n"
  },
  {
    "path": "src/ccutil/object_cache.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        object_cache.h\n// Description: A string indexed object cache.\n// Author:      David Eger\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_OBJECT_CACHE_H_\n#define TESSERACT_CCUTIL_OBJECT_CACHE_H_\n\n#include <functional> // for std::function\n#include <mutex>      // for std::mutex\n#include <string>\n#include <vector>     // for std::vector\n#include \"ccutil.h\"\n#include \"errcode.h\"\n\nnamespace tesseract {\n\n// A simple object cache which maps a string to an object of type T.\n// Usually, these are expensive objects that are loaded from disk.\n// Reference counting is performed, so every Get() needs to be followed later\n// by a Free().  Actual deletion is accomplished by DeleteUnusedObjects().\ntemplate <typename T>\nclass ObjectCache {\npublic:\n  ObjectCache() = default;\n  ~ObjectCache() {\n    std::lock_guard<std::mutex> guard(mu_);\n    for (auto &it : cache_) {\n      if (it.count > 0) {\n        tprintf(\n            \"ObjectCache(%p)::~ObjectCache(): WARNING! LEAK! 
object %p \"\n            \"still has count %d (id %s)\\n\",\n            static_cast<void *>(this), static_cast<void *>(it.object),\n            it.count, it.id.c_str());\n      } else {\n        delete it.object;\n        it.object = nullptr;\n      }\n    }\n  }\n\n  // Return a pointer to the object identified by id.\n  // If we haven't yet loaded the object, use loader to load it.\n  // If loader fails to load it, record a nullptr entry in the cache\n  // and return nullptr -- further attempts to load will fail (even\n  // with a different loader) until DeleteUnusedObjects() is called.\n  // We delete the given loader.\n  T *Get(const std::string &id, std::function<T *()> loader) {\n    T *retval = nullptr;\n    std::lock_guard<std::mutex> guard(mu_);\n    for (auto &it : cache_) {\n      if (id == it.id) {\n        retval = it.object;\n        if (it.object != nullptr) {\n          it.count++;\n        }\n        return retval;\n      }\n    }\n    cache_.push_back(ReferenceCount());\n    ReferenceCount &rc = cache_.back();\n    rc.id = id;\n    retval = rc.object = loader();\n    rc.count = (retval != nullptr) ? 
1 : 0;\n    return retval;\n  }\n\n  // Decrement the count for t.\n  // Return whether we knew about the given pointer.\n  bool Free(T *t) {\n    if (t == nullptr) {\n      return false;\n    }\n    std::lock_guard<std::mutex> guard(mu_);\n    for (auto &it : cache_) {\n      if (it.object == t) {\n        --it.count;\n        return true;\n      }\n    }\n    return false;\n  }\n\n  void DeleteUnusedObjects() {\n    std::lock_guard<std::mutex> guard(mu_);\n    cache_.erase(std::remove_if(cache_.begin(), cache_.end(),\n                                [](const ReferenceCount &it) {\n                                  if (it.count <= 0) {\n                                    delete it.object;\n                                    return true;\n                                  } else {\n                                    return false;\n                                  }\n                                }),\n                 cache_.end());\n  }\n\nprivate:\n  struct ReferenceCount {\n    std::string id; // A unique ID to identify the object (think path on disk)\n    T *object;      // A copy of the object in memory.  Can be delete'd.\n    int count;      // A count of the number of active users of this object.\n  };\n\n  std::mutex mu_;\n  std::vector<ReferenceCount> cache_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_OBJECT_CACHE_H_\n"
  },
  {
    "path": "src/ccutil/params.cpp",
    "content": "/**********************************************************************\n * File:        params.cpp\n * Description: Initialization and setting of Tesseract parameters.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"params.h\"\n\n#include \"helpers.h\"  // for chomp_string\n#include \"host.h\"     // tesseract/export.h, windows.h for MAX_PATH\n#include \"serialis.h\" // for TFile\n#include \"tprintf.h\"\n\n#include <climits> // for INT_MIN, INT_MAX\n#include <cmath>   // for NAN, std::isnan\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n#include <locale>  // for std::locale::classic\n#include <sstream> // for std::stringstream\n\nnamespace tesseract {\n\ntesseract::ParamsVectors *GlobalParams() {\n  static tesseract::ParamsVectors global_params = tesseract::ParamsVectors();\n  return &global_params;\n}\n\nbool ParamUtils::ReadParamsFile(const char *file, SetParamConstraint constraint,\n                                ParamsVectors *member_params) {\n  TFile fp;\n  if (!fp.Open(file, nullptr)) {\n    tprintf(\"read_params_file: Can't open %s\\n\", file);\n    return true;\n  }\n  return ReadParamsFromFp(constraint, &fp, member_params);\n}\n\nbool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,\n                                  
ParamsVectors *member_params) {\n  char line[MAX_PATH]; // input line\n  bool anyerr = false; // true if any error\n  bool foundit;        // found parameter\n  char *valptr;        // value field\n\n  while (fp->FGets(line, MAX_PATH) != nullptr) {\n    if (line[0] != '\\r' && line[0] != '\\n' && line[0] != '#') {\n      chomp_string(line); // remove newline\n      for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\\t'; valptr++) {\n        ;\n      }\n      if (*valptr) {    // found blank\n        *valptr = '\\0'; // make name a string\n        do {\n          valptr++; // find end of blanks\n        } while (*valptr == ' ' || *valptr == '\\t');\n      }\n      foundit = SetParam(line, valptr, constraint, member_params);\n\n      if (!foundit) {\n        anyerr = true; // had an error\n        tprintf(\"Warning: Parameter not found: %s\\n\", line);\n      }\n    }\n  }\n  return anyerr;\n}\n\nbool ParamUtils::SetParam(const char *name, const char *value, SetParamConstraint constraint,\n                          ParamsVectors *member_params) {\n  // Look for the parameter among string parameters.\n  auto *sp =\n      FindParam<StringParam>(name, GlobalParams()->string_params, member_params->string_params);\n  if (sp != nullptr && sp->constraint_ok(constraint)) {\n    sp->set_value(value);\n  }\n  if (*value == '\\0') {\n    return (sp != nullptr);\n  }\n\n  // Look for the parameter among int parameters.\n  auto *ip = FindParam<IntParam>(name, GlobalParams()->int_params, member_params->int_params);\n  if (ip && ip->constraint_ok(constraint)) {\n    int intval = INT_MIN;\n    std::stringstream stream(value);\n    stream.imbue(std::locale::classic());\n    stream >> intval;\n    if (intval != INT_MIN) {\n      ip->set_value(intval);\n    }\n  }\n\n  // Look for the parameter among bool parameters.\n  auto *bp = FindParam<BoolParam>(name, GlobalParams()->bool_params, member_params->bool_params);\n  if (bp != nullptr && bp->constraint_ok(constraint)) {\n    
if (*value == 'T' || *value == 't' || *value == 'Y' || *value == 'y' || *value == '1') {\n      bp->set_value(true);\n    } else if (*value == 'F' || *value == 'f' || *value == 'N' || *value == 'n' || *value == '0') {\n      bp->set_value(false);\n    }\n  }\n\n  // Look for the parameter among double parameters.\n  auto *dp =\n      FindParam<DoubleParam>(name, GlobalParams()->double_params, member_params->double_params);\n  if (dp != nullptr && dp->constraint_ok(constraint)) {\n    double doubleval = NAN;\n    std::stringstream stream(value);\n    stream.imbue(std::locale::classic());\n    stream >> doubleval;\n    if (!std::isnan(doubleval)) {\n      dp->set_value(doubleval);\n    }\n  }\n  return (sp || ip || bp || dp);\n}\n\nbool ParamUtils::GetParamAsString(const char *name, const ParamsVectors *member_params,\n                                  std::string *value) {\n  // Look for the parameter among string parameters.\n  auto *sp =\n      FindParam<StringParam>(name, GlobalParams()->string_params, member_params->string_params);\n  if (sp) {\n    *value = sp->c_str();\n    return true;\n  }\n  // Look for the parameter among int parameters.\n  auto *ip = FindParam<IntParam>(name, GlobalParams()->int_params, member_params->int_params);\n  if (ip) {\n    *value = std::to_string(int32_t(*ip));\n    return true;\n  }\n  // Look for the parameter among bool parameters.\n  auto *bp = FindParam<BoolParam>(name, GlobalParams()->bool_params, member_params->bool_params);\n  if (bp != nullptr) {\n    *value = bool(*bp) ? 
\"1\" : \"0\";\n    return true;\n  }\n  // Look for the parameter among double parameters.\n  auto *dp =\n      FindParam<DoubleParam>(name, GlobalParams()->double_params, member_params->double_params);\n  if (dp != nullptr) {\n    std::ostringstream stream;\n    stream.imbue(std::locale::classic());\n    stream << double(*dp);\n    *value = stream.str();\n    return true;\n  }\n  return false;\n}\n\nvoid ParamUtils::PrintParams(FILE *fp, const ParamsVectors *member_params) {\n  int num_iterations = (member_params == nullptr) ? 1 : 2;\n  std::ostringstream stream;\n  stream.imbue(std::locale::classic());\n  for (int v = 0; v < num_iterations; ++v) {\n    const ParamsVectors *vec = (v == 0) ? GlobalParams() : member_params;\n    for (auto int_param : vec->int_params) {\n      stream << int_param->name_str() << '\\t' << (int32_t)(*int_param) << '\\t'\n             << int_param->info_str() << '\\n';\n    }\n    for (auto bool_param : vec->bool_params) {\n      stream << bool_param->name_str() << '\\t' << bool(*bool_param) << '\\t'\n             << bool_param->info_str() << '\\n';\n    }\n    for (auto string_param : vec->string_params) {\n      stream << string_param->name_str() << '\\t' << string_param->c_str() << '\\t'\n             << string_param->info_str() << '\\n';\n    }\n    for (auto double_param : vec->double_params) {\n      stream << double_param->name_str() << '\\t' << (double)(*double_param) << '\\t'\n             << double_param->info_str() << '\\n';\n    }\n  }\n  fprintf(fp, \"%s\", stream.str().c_str());\n}\n\n// Resets all parameters back to default values;\nvoid ParamUtils::ResetToDefaults(ParamsVectors *member_params) {\n  int num_iterations = (member_params == nullptr) ? 1 : 2;\n  for (int v = 0; v < num_iterations; ++v) {\n    ParamsVectors *vec = (v == 0) ? 
GlobalParams() : member_params;\n    for (auto &param : vec->int_params) {\n      param->ResetToDefault();\n    }\n    for (auto &param : vec->bool_params) {\n      param->ResetToDefault();\n    }\n    for (auto &param : vec->string_params) {\n      param->ResetToDefault();\n    }\n    for (auto &param : vec->double_params) {\n      param->ResetToDefault();\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/params.h",
    "content": "/**********************************************************************\n * File:        params.h\n * Description: Class definitions of the *_VAR classes for tunable constants.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef PARAMS_H\n#define PARAMS_H\n\n#include <tesseract/export.h> // for TESS_API\n\n#include <cstdint>\n#include <cstdio>\n#include <cstring>\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\nclass IntParam;\nclass BoolParam;\nclass StringParam;\nclass DoubleParam;\nclass TFile;\n\n// Enum for constraints on what kind of params should be set by SetParam().\nenum SetParamConstraint {\n  SET_PARAM_CONSTRAINT_NONE,\n  SET_PARAM_CONSTRAINT_DEBUG_ONLY,\n  SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY,\n  SET_PARAM_CONSTRAINT_NON_INIT_ONLY,\n};\n\nstruct ParamsVectors {\n  std::vector<IntParam *> int_params;\n  std::vector<BoolParam *> bool_params;\n  std::vector<StringParam *> string_params;\n  std::vector<DoubleParam *> double_params;\n};\n\n// Utility functions for working with Tesseract parameters.\nclass TESS_API ParamUtils {\npublic:\n  // Reads a file of parameter definitions and set/modify the values therein.\n  // If the filename begins with a + or -, the BoolVariables will be\n  // ORed or ANDed with any current values.\n  // Blank lines and lines 
beginning # are ignored.\n  // Values may have any whitespace after the name and are the rest of line.\n  static bool ReadParamsFile(const char *file, // filename to read\n                             SetParamConstraint constraint, ParamsVectors *member_params);\n\n  // Read parameters from the given file pointer.\n  static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,\n                               ParamsVectors *member_params);\n\n  // Set a parameters to have the given value.\n  static bool SetParam(const char *name, const char *value, SetParamConstraint constraint,\n                       ParamsVectors *member_params);\n\n  // Returns the pointer to the parameter with the given name (of the\n  // appropriate type) if it was found in the vector obtained from\n  // GlobalParams() or in the given member_params.\n  template <class T>\n  static T *FindParam(const char *name, const std::vector<T *> &global_vec,\n                      const std::vector<T *> &member_vec) {\n    for (auto *param : global_vec) {\n      if (strcmp(param->name_str(), name) == 0) {\n        return param;\n      }\n    }\n    for (auto *param : member_vec) {\n      if (strcmp(param->name_str(), name) == 0) {\n        return param;\n      }\n    }\n    return nullptr;\n  }\n  // Removes the given pointer to the param from the given vector.\n  template <class T>\n  static void RemoveParam(T *param_ptr, std::vector<T *> *vec) {\n    for (auto it = vec->begin(); it != vec->end(); ++it) {\n      if (*it == param_ptr) {\n        vec->erase(it);\n        break;\n      }\n    }\n  }\n  // Fetches the value of the named param as a string. 
Returns false if not\n  // found.\n  static bool GetParamAsString(const char *name, const ParamsVectors *member_params,\n                               std::string *value);\n\n  // Print parameters to the given file.\n  static void PrintParams(FILE *fp, const ParamsVectors *member_params);\n\n  // Resets all parameters back to default values;\n  static void ResetToDefaults(ParamsVectors *member_params);\n};\n\n// Definition of various parameter types.\nclass Param {\npublic:\n  ~Param() = default;\n\n  const char *name_str() const {\n    return name_;\n  }\n  const char *info_str() const {\n    return info_;\n  }\n  bool is_init() const {\n    return init_;\n  }\n  bool is_debug() const {\n    return debug_;\n  }\n  bool constraint_ok(SetParamConstraint constraint) const {\n    return (constraint == SET_PARAM_CONSTRAINT_NONE ||\n            (constraint == SET_PARAM_CONSTRAINT_DEBUG_ONLY && this->is_debug()) ||\n            (constraint == SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY && !this->is_debug()) ||\n            (constraint == SET_PARAM_CONSTRAINT_NON_INIT_ONLY && !this->is_init()));\n  }\n\nprotected:\n  Param(const char *name, const char *comment, bool init)\n      : name_(name), info_(comment), init_(init) {\n    debug_ = (strstr(name, \"debug\") != nullptr) || (strstr(name, \"display\"));\n  }\n\n  const char *name_; // name of this parameter\n  const char *info_; // for menus\n  bool init_;        // needs to be set before init\n  bool debug_;\n};\n\nclass IntParam : public Param {\npublic:\n  IntParam(int32_t value, const char *name, const char *comment, bool init, ParamsVectors *vec)\n      : Param(name, comment, init) {\n    value_ = value;\n    default_ = value;\n    params_vec_ = &(vec->int_params);\n    vec->int_params.push_back(this);\n  }\n  ~IntParam() {\n    ParamUtils::RemoveParam<IntParam>(this, params_vec_);\n  }\n  operator int32_t() const {\n    return value_;\n  }\n  void operator=(int32_t value) {\n    value_ = value;\n  }\n  void 
set_value(int32_t value) {\n    value_ = value;\n  }\n  void ResetToDefault() {\n    value_ = default_;\n  }\n  void ResetFrom(const ParamsVectors *vec) {\n    for (auto *param : vec->int_params) {\n      if (strcmp(param->name_str(), name_) == 0) {\n        // printf(\"overriding param %s=%d by =%d\\n\", name_, value_,\n        // param);\n        value_ = *param;\n        break;\n      }\n    }\n  }\n\nprivate:\n  int32_t value_;\n  int32_t default_;\n  // Pointer to the vector that contains this param (not owned by this class).\n  std::vector<IntParam *> *params_vec_;\n};\n\nclass BoolParam : public Param {\npublic:\n  BoolParam(bool value, const char *name, const char *comment, bool init, ParamsVectors *vec)\n      : Param(name, comment, init) {\n    value_ = value;\n    default_ = value;\n    params_vec_ = &(vec->bool_params);\n    vec->bool_params.push_back(this);\n  }\n  ~BoolParam() {\n    ParamUtils::RemoveParam<BoolParam>(this, params_vec_);\n  }\n  operator bool() const {\n    return value_;\n  }\n  void operator=(bool value) {\n    value_ = value;\n  }\n  void set_value(bool value) {\n    value_ = value;\n  }\n  void ResetToDefault() {\n    value_ = default_;\n  }\n  void ResetFrom(const ParamsVectors *vec) {\n    for (auto *param : vec->bool_params) {\n      if (strcmp(param->name_str(), name_) == 0) {\n        // printf(\"overriding param %s=%s by =%s\\n\", name_, value_ ? \"true\" :\n        // \"false\", *param ? 
\"true\" : \"false\");\n        value_ = *param;\n        break;\n      }\n    }\n  }\n\nprivate:\n  bool value_;\n  bool default_;\n  // Pointer to the vector that contains this param (not owned by this class).\n  std::vector<BoolParam *> *params_vec_;\n};\n\nclass StringParam : public Param {\npublic:\n  StringParam(const char *value, const char *name, const char *comment, bool init,\n              ParamsVectors *vec)\n      : Param(name, comment, init) {\n    value_ = value;\n    default_ = value;\n    params_vec_ = &(vec->string_params);\n    vec->string_params.push_back(this);\n  }\n  ~StringParam() {\n    ParamUtils::RemoveParam<StringParam>(this, params_vec_);\n  }\n  operator std::string &() {\n    return value_;\n  }\n  const char *c_str() const {\n    return value_.c_str();\n  }\n  bool contains(char c) const {\n    return value_.find(c) != std::string::npos;\n  }\n  bool empty() const {\n    return value_.empty();\n  }\n  bool operator==(const std::string &other) const {\n    return value_ == other;\n  }\n  void operator=(const std::string &value) {\n    value_ = value;\n  }\n  void set_value(const std::string &value) {\n    value_ = value;\n  }\n  void ResetToDefault() {\n    value_ = default_;\n  }\n  void ResetFrom(const ParamsVectors *vec) {\n    for (auto *param : vec->string_params) {\n      if (strcmp(param->name_str(), name_) == 0) {\n        // printf(\"overriding param %s=%s by =%s\\n\", name_, value_,\n        // param->c_str());\n        value_ = *param;\n        break;\n      }\n    }\n  }\n\nprivate:\n  std::string value_;\n  std::string default_;\n  // Pointer to the vector that contains this param (not owned by this class).\n  std::vector<StringParam *> *params_vec_;\n};\n\nclass DoubleParam : public Param {\npublic:\n  DoubleParam(double value, const char *name, const char *comment, bool init, ParamsVectors *vec)\n      : Param(name, comment, init) {\n    value_ = value;\n    default_ = value;\n    params_vec_ = &(vec->double_params);\n  
  vec->double_params.push_back(this);\n  }\n  ~DoubleParam() {\n    ParamUtils::RemoveParam<DoubleParam>(this, params_vec_);\n  }\n  operator double() const {\n    return value_;\n  }\n  void operator=(double value) {\n    value_ = value;\n  }\n  void set_value(double value) {\n    value_ = value;\n  }\n  void ResetToDefault() {\n    value_ = default_;\n  }\n  void ResetFrom(const ParamsVectors *vec) {\n    for (auto *param : vec->double_params) {\n      if (strcmp(param->name_str(), name_) == 0) {\n        // printf(\"overriding param %s=%f by =%f\\n\", name_, value_,\n        // *param);\n        value_ = *param;\n        break;\n      }\n    }\n  }\n\nprivate:\n  double value_;\n  double default_;\n  // Pointer to the vector that contains this param (not owned by this class).\n  std::vector<DoubleParam *> *params_vec_;\n};\n\n// Global parameter lists.\n//\n// To avoid the problem of undetermined order of static initialization\n// global_params are accessed through the GlobalParams function that\n// initializes the static pointer to global_params only on the first time\n// GlobalParams() is called.\n//\n// TODO(daria): remove GlobalParams() when all global Tesseract\n// parameters are converted to members.\nTESS_API\nParamsVectors *GlobalParams();\n\n/*************************************************************************\n * Note on defining parameters.\n *\n * The values of the parameters defined with *_INIT_* macros are guaranteed\n * to be loaded from config files before Tesseract initialization is done\n * (there is no such guarantee for parameters defined with the other macros).\n *************************************************************************/\n\n#define INT_VAR_H(name) ::tesseract::IntParam name\n\n#define BOOL_VAR_H(name) ::tesseract::BoolParam name\n\n#define STRING_VAR_H(name) ::tesseract::StringParam name\n\n#define double_VAR_H(name) ::tesseract::DoubleParam name\n\n#define INT_VAR(name, val, comment) \\\n  ::tesseract::IntParam 
name(val, #name, comment, false, ::tesseract::GlobalParams())\n\n#define BOOL_VAR(name, val, comment) \\\n  ::tesseract::BoolParam name(val, #name, comment, false, ::tesseract::GlobalParams())\n\n#define STRING_VAR(name, val, comment) \\\n  ::tesseract::StringParam name(val, #name, comment, false, ::tesseract::GlobalParams())\n\n#define double_VAR(name, val, comment) \\\n  ::tesseract::DoubleParam name(val, #name, comment, false, ::tesseract::GlobalParams())\n\n#define INT_MEMBER(name, val, comment, vec) name(val, #name, comment, false, vec)\n\n#define BOOL_MEMBER(name, val, comment, vec) name(val, #name, comment, false, vec)\n\n#define STRING_MEMBER(name, val, comment, vec) name(val, #name, comment, false, vec)\n\n#define double_MEMBER(name, val, comment, vec) name(val, #name, comment, false, vec)\n\n#define INT_INIT_MEMBER(name, val, comment, vec) name(val, #name, comment, true, vec)\n\n#define BOOL_INIT_MEMBER(name, val, comment, vec) name(val, #name, comment, true, vec)\n\n#define STRING_INIT_MEMBER(name, val, comment, vec) name(val, #name, comment, true, vec)\n\n#define double_INIT_MEMBER(name, val, comment, vec) name(val, #name, comment, true, vec)\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/ccutil/qrsequence.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        qrsequence.h\n// Description: Quasi-random sequence generator class.\n// Author:      Ranjith Unnikrishnan\n//\n// Class to generate a (deterministic) quasi-random Van der Corput sequence that\n// covers the interval [0,N) without repetition.\n//\n// The sequence is generated by reversing the base-2 representation of the\n// sequence of natural numbers {0, 1,... M-1}, where M is 2^{num_bits_} and\n// num_bits is the minimum number of bits required to represent N. If a reversed\n// numbers is >= N it is rejected and the next natural number is considered\n// until a valid output number is found.\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n// use this file except in compliance with the License.  You may obtain a copy\n// of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required\n// by applicable law or agreed to in writing, software distributed under the\n// License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n// OF ANY KIND, either express or implied.  
See the License for the specific\n// language governing permissions and limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_QRSEQUENCE_H_\n#define TESSERACT_CCUTIL_QRSEQUENCE_H_\n\n#include <cmath>\n\nclass QRSequenceGenerator {\npublic:\n  // Object is initialized with the size of the output range.\n  explicit QRSequenceGenerator(int N) : N_(N), next_num_(0) {\n    num_bits_ = static_cast<int>(ceil(log(static_cast<double>(N)) / log(2.0)));\n  }\n\n  // Main worker method that retrieves the next number in the sequence.\n  // Returns kInvalidVal if called more than N times after object initialization\n  int GetVal() {\n    const int kInvalidVal = -1;\n    const int kMaxNaturalNumberValue = 1 << num_bits_;\n    if (next_num_ >= kMaxNaturalNumberValue) {\n      return kInvalidVal;\n    }\n    int n = next_num_;\n\n    while (next_num_ < kMaxNaturalNumberValue) {\n      n = GetBinaryReversedInteger(next_num_++);\n      if (n < N_) {\n        break;\n      }\n    }\n    return (next_num_ > kMaxNaturalNumberValue) ? kInvalidVal : n;\n  }\n\nprotected:\n  // Outputs the integer formed by reversing the bits of the input integer. Only\n  // the lowest num_bits_ bits of the input integer are reversed.\n  int GetBinaryReversedInteger(int in_val) const {\n    int bit_pos = num_bits_;\n    int out_val = 0;\n    while (bit_pos--) {\n      // Set the value of the last bit.\n      out_val |= (in_val & 0x1);\n      if (bit_pos > 0) {\n        // Left-shift output value to prepare for storing the next bit.\n        out_val <<= 1;\n      }\n      // Right-shift input value to prepare for retrieving the next bit.\n      in_val >>= 1;\n    }\n    return out_val;\n  }\n  int N_;\n  // Next number to be considered for reversal and output.\n  int next_num_;\n  // number of bits required to represent the numbers of the sequence\n  int num_bits_;\n};\n\n#endif // TESSERACT_CCUTIL_QRSEQUENCE_H_\n"
  },
  {
    "path": "src/ccutil/scanutils.cpp",
    "content": "// Copyright 2006 Google Inc.\n// All Rights Reserved.\n// Author: renn\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <cctype>\n#include <climits> // for CHAR_BIT\n#include <cmath>\n#include <cstdarg>\n#include <cstddef>\n#include <cstdint>\n#include <cstdio>\n#include <cstring>\n#include <limits> // for std::numeric_limits\n\n#include \"scanutils.h\"\n\nenum Flags {\n  FL_SPLAT = 0x01, // Drop the value, do not assign\n  FL_INV = 0x02,   // Character-set with inverse\n  FL_WIDTH = 0x04, // Field width specified\n  FL_MINUS = 0x08, // Negative number\n};\n\nenum Ranks {\n  RANK_CHAR = -2,\n  RANK_SHORT = -1,\n  RANK_INT = 0,\n  RANK_LONG = 1,\n  RANK_LONGLONG = 2,\n  RANK_PTR = std::numeric_limits<int>::max() // Special value used for pointers\n};\n\nconst enum Ranks kMinRank = RANK_CHAR;\nconst enum Ranks kMaxRank = RANK_LONGLONG;\n\nconst enum Ranks kIntMaxRank = RANK_LONGLONG;\nconst enum Ranks kSizeTRank = RANK_LONG;\nconst enum Ranks kPtrDiffRank = RANK_LONG;\n\nenum Bail {\n  BAIL_NONE = 0, // No error condition\n  BAIL_EOF,      // Hit EOF\n  BAIL_ERR       // Conversion mismatch\n};\n\n// Helper functions ------------------------------------------------------------\ninline size_t LongBit() {\n  return CHAR_BIT * sizeof(long);\n}\n\nstatic inline int SkipSpace(FILE *s) {\n  int p;\n  while (isascii(p = fgetc(s)) && isspace(p)) {\n    ;\n  }\n  ungetc(p, s); // 
Make sure next char is available for reading\n  return p;\n}\n\nstatic inline void SetBit(unsigned long *bitmap, unsigned int bit) {\n  bitmap[bit / LongBit()] |= 1UL << (bit % LongBit());\n}\n\nstatic inline int TestBit(unsigned long *bitmap, unsigned int bit) {\n  return static_cast<int>(bitmap[bit / LongBit()] >> (bit % LongBit())) & 1;\n}\n\nstatic inline int DigitValue(int ch, int base) {\n  if (ch >= '0' && ch <= '9') {\n    if (base >= 10 || ch <= '7') {\n      return ch - '0';\n    }\n  } else if (ch >= 'A' && ch <= 'Z' && base == 16) {\n    return ch - 'A' + 10;\n  } else if (ch >= 'a' && ch <= 'z' && base == 16) {\n    return ch - 'a' + 10;\n  }\n  return -1;\n}\n\n// IO (re-)implementations -----------------------------------------------------\nstatic uintmax_t streamtoumax(FILE *s, int base) {\n  int minus = 0;\n  uintmax_t v = 0;\n  int d, c = 0;\n\n  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {\n    ;\n  }\n\n  // Single optional + or -\n  if (c == '-' || c == '+') {\n    minus = (c == '-');\n    c = fgetc(s);\n  }\n\n  // Assign correct base\n  if (base == 0) {\n    if (c == '0') {\n      c = fgetc(s);\n      if (c == 'x' || c == 'X') {\n        base = 16;\n        c = fgetc(s);\n      } else {\n        base = 8;\n      }\n    }\n  } else if (base == 16) {\n    if (c == '0') {\n      c = fgetc(s);\n      if (c == 'x' || c == 'X') {\n        c = fgetc(s);\n      }\n    }\n  }\n\n  // Actual number parsing\n  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s)) {\n    v = v * base + d;\n  }\n\n  ungetc(c, s);\n  return minus ? 
-v : v;\n}\n\nstatic double streamtofloat(FILE *s) {\n  bool minus = false;\n  uint64_t v = 0;\n  int d, c;\n  uint64_t k = 1;\n  uint64_t w = 0;\n\n  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {\n    ;\n  }\n\n  // Single optional + or -\n  if (c == '-' || c == '+') {\n    minus = (c == '-');\n    c = fgetc(s);\n  }\n\n  // Actual number parsing\n  for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {\n    v = v * 10 + d;\n  }\n  if (c == '.') {\n    for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {\n      w = w * 10 + d;\n      k *= 10;\n    }\n  }\n  double f = v + static_cast<double>(w) / k;\n  if (c == 'e' || c == 'E') {\n    c = fgetc(s);\n    int expsign = 1;\n    if (c == '-' || c == '+') {\n      expsign = (c == '-') ? -1 : 1;\n      c = fgetc(s);\n    }\n    int exponent = 0;\n    for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {\n      exponent = exponent * 10 + d;\n    }\n    exponent *= expsign;\n    f *= pow(10.0, static_cast<double>(exponent));\n  }\n  ungetc(c, s);\n\n  return minus ? -f : f;\n}\n\nstatic int tvfscanf(FILE *stream, const char *format, va_list ap);\n\nint tfscanf(FILE *stream, const char *format, ...) 
{\n  va_list ap;\n  int rv;\n\n  va_start(ap, format);\n  rv = tvfscanf(stream, format, ap);\n  va_end(ap);\n\n  return rv;\n}\n\nstatic int tvfscanf(FILE *stream, const char *format, va_list ap) {\n  const char *p = format;\n  char ch;\n  int q = 0;\n  uintmax_t val = 0;\n  int rank = RANK_INT; // Default rank\n  unsigned int width = UINT_MAX;\n  int base;\n  int flags = 0;\n  enum {\n    ST_NORMAL,      // Ground state\n    ST_FLAGS,       // Special flags\n    ST_WIDTH,       // Field width\n    ST_MODIFIERS,   // Length or conversion modifiers\n    ST_MATCH_INIT,  // Initial state of %[ sequence\n    ST_MATCH,       // Main state of %[ sequence\n    ST_MATCH_RANGE, // After - in a %[ sequence\n  } state = ST_NORMAL;\n  char *sarg = nullptr; // %s %c or %[ string argument\n  enum Bail bail = BAIL_NONE;\n  int converted = 0; // Successful conversions\n  unsigned long\n      matchmap[((1 << CHAR_BIT) + (CHAR_BIT * sizeof(long) - 1)) / (CHAR_BIT * sizeof(long))];\n  int matchinv = 0; // Is match map inverted?\n  unsigned char range_start = 0;\n  auto start_off = std::ftell(stream);\n\n  // Skip leading spaces\n  SkipSpace(stream);\n\n  while ((ch = *p++) && !bail) {\n    switch (state) {\n      case ST_NORMAL:\n        if (ch == '%') {\n          state = ST_FLAGS;\n          flags = 0;\n          rank = RANK_INT;\n          width = UINT_MAX;\n        } else if (isascii(ch) && isspace(ch)) {\n          SkipSpace(stream);\n        } else {\n          if (fgetc(stream) != ch) {\n            bail = BAIL_ERR; // Match failure\n          }\n        }\n        break;\n\n      case ST_FLAGS:\n        if (ch == '*') {\n          flags |= FL_SPLAT;\n        } else if ('0' <= ch && ch <= '9') {\n          width = (ch - '0');\n          state = ST_WIDTH;\n          flags |= FL_WIDTH;\n        } else {\n          state = ST_MODIFIERS;\n          p--; // Process this character again\n        }\n        break;\n\n      case ST_WIDTH:\n        if (ch >= '0' && ch <= '9') {\n       
   width = width * 10 + (ch - '0');\n        } else {\n          state = ST_MODIFIERS;\n          p--; // Process this character again\n        }\n        break;\n\n      case ST_MODIFIERS:\n        switch (ch) {\n          // Length modifiers - nonterminal sequences\n          case 'h':\n            rank--; // Shorter rank\n            break;\n          case 'l':\n            rank++; // Longer rank\n            break;\n          case 'j':\n            rank = kIntMaxRank;\n            break;\n          case 'z':\n            rank = kSizeTRank;\n            break;\n          case 't':\n            rank = kPtrDiffRank;\n            break;\n          case 'L':\n          case 'q':\n            rank = RANK_LONGLONG; // long double/long long\n            break;\n\n          default:\n            // Output modifiers - terminal sequences\n            state = ST_NORMAL;   // Next state will be normal\n            if (rank < kMinRank) { // Canonicalize rank\n              rank = kMinRank;\n            } else if (rank > kMaxRank) {\n              rank = kMaxRank;\n            }\n\n            switch (ch) {\n              case 'P': // Upper case pointer\n              case 'p': // Pointer\n                rank = RANK_PTR;\n                base = 0;\n                goto scan_int;\n\n              case 'i': // Base-independent integer\n                base = 0;\n                goto scan_int;\n\n              case 'd': // Decimal integer\n                base = 10;\n                goto scan_int;\n\n              case 'o': // Octal integer\n                base = 8;\n                goto scan_int;\n\n              case 'u': // Unsigned decimal integer\n                base = 10;\n                goto scan_int;\n\n              case 'x': // Hexadecimal integer\n              case 'X':\n                base = 16;\n                goto scan_int;\n\n              case 'n': // Number of characters consumed\n                val = std::ftell(stream) - start_off;\n                goto 
set_integer;\n\n              scan_int:\n                q = SkipSpace(stream);\n                if (q <= 0) {\n                  bail = BAIL_EOF;\n                  break;\n                }\n                val = streamtoumax(stream, base);\n                // fall through\n\n              set_integer:\n                if (!(flags & FL_SPLAT)) {\n                  converted++;\n                  switch (rank) {\n                    case RANK_CHAR:\n                      *va_arg(ap, unsigned char *) = static_cast<unsigned char>(val);\n                      break;\n                    case RANK_SHORT:\n                      *va_arg(ap, unsigned short *) = static_cast<unsigned short>(val);\n                      break;\n                    case RANK_INT:\n                      *va_arg(ap, unsigned int *) = static_cast<unsigned int>(val);\n                      break;\n                    case RANK_LONG:\n                      *va_arg(ap, unsigned long *) = static_cast<unsigned long>(val);\n                      break;\n                    case RANK_LONGLONG:\n                      *va_arg(ap, unsigned long long *) = static_cast<unsigned long long>(val);\n                      break;\n                    case RANK_PTR:\n                      *va_arg(ap, void **) = reinterpret_cast<void *>(static_cast<uintptr_t>(val));\n                      break;\n                  }\n                }\n                break;\n\n              case 'f': // Preliminary float value parsing\n              case 'g':\n              case 'G':\n              case 'e':\n              case 'E':\n                q = SkipSpace(stream);\n                if (q <= 0) {\n                  bail = BAIL_EOF;\n                  break;\n                }\n\n                {\n                  double fval = streamtofloat(stream);\n                  if (!(flags & FL_SPLAT)) {\n                    if (rank == RANK_INT) {\n                      *va_arg(ap, float *) = static_cast<float>(fval);\n             
       } else if (rank == RANK_LONG) {\n                      *va_arg(ap, double *) = static_cast<double>(fval);\n                    }\n                    converted++;\n                  }\n                }\n                break;\n\n              case 'c':                                 // Character\n                width = (flags & FL_WIDTH) ? width : 1; // Default width == 1\n                sarg = va_arg(ap, char *);\n                while (width--) {\n                  if ((q = fgetc(stream)) <= 0) {\n                    bail = BAIL_EOF;\n                    break;\n                  }\n                  if (!(flags & FL_SPLAT)) {\n                    *sarg++ = q;\n                    converted++;\n                  }\n                }\n                break;\n\n              case 's': // String\n              {\n                if (!(flags & FL_SPLAT)) {\n                  sarg = va_arg(ap, char *);\n                }\n                unsigned length = 0;\n                while (width--) {\n                  q = fgetc(stream);\n                  if ((isascii(q) && isspace(q)) || (q <= 0)) {\n                    ungetc(q, stream);\n                    break;\n                  }\n                  if (!(flags & FL_SPLAT)) {\n                    sarg[length] = q;\n                  }\n                  length++;\n                }\n                if (length == 0) {\n                  bail = BAIL_EOF;\n                } else if (!(flags & FL_SPLAT)) {\n                  sarg[length] = '\\0'; // Terminate output\n                  converted++;\n                }\n              } break;\n\n              case '[': // Character range\n                sarg = va_arg(ap, char *);\n                state = ST_MATCH_INIT;\n                matchinv = 0;\n                memset(matchmap, 0, sizeof matchmap);\n                break;\n\n              case '%': // %% sequence\n                if (fgetc(stream) != '%') {\n                  bail = BAIL_ERR;\n               
 }\n                break;\n\n              default:           // Anything else\n                bail = BAIL_ERR; // Unknown sequence\n                break;\n            }\n        }\n        break;\n\n      case ST_MATCH_INIT: // Initial state for %[ match\n        if (ch == '^' && !(flags & FL_INV)) {\n          matchinv = 1;\n        } else {\n          SetBit(matchmap, static_cast<unsigned char>(ch));\n          state = ST_MATCH;\n        }\n        break;\n\n      case ST_MATCH: // Main state for %[ match\n        if (ch == ']') {\n          goto match_run;\n        } else if (ch == '-') {\n          range_start = static_cast<unsigned char>(ch);\n          state = ST_MATCH_RANGE;\n        } else {\n          SetBit(matchmap, static_cast<unsigned char>(ch));\n        }\n        break;\n\n      case ST_MATCH_RANGE: // %[ match after -\n        if (ch == ']') {\n          SetBit(matchmap, static_cast<unsigned char>('-'));\n          goto match_run;\n        } else {\n          int i;\n          for (i = range_start; i < (static_cast<unsigned char>(ch)); i++) {\n            SetBit(matchmap, i);\n          }\n          state = ST_MATCH;\n        }\n        break;\n\n      match_run: // Match expression finished\n        char *oarg = sarg;\n        while (width) {\n          q = fgetc(stream);\n          auto qc = static_cast<unsigned char>(q);\n          if (q <= 0 || !(TestBit(matchmap, qc) ^ matchinv)) {\n            ungetc(q, stream);\n            break;\n          }\n          if (!(flags & FL_SPLAT)) {\n            *sarg = q;\n          }\n          sarg++;\n        }\n        if (oarg == sarg) {\n          bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;\n        } else if (!(flags & FL_SPLAT)) {\n          *sarg = '\\0';\n          converted++;\n        }\n        break;\n    }\n  }\n\n  if (bail == BAIL_EOF && !converted) {\n    converted = -1; // Return EOF (-1)\n  }\n\n  return converted;\n}\n"
  },
  {
    "path": "src/ccutil/scanutils.h",
    "content": "// Copyright 2006 Google Inc.\n// All Rights Reserved.\n// Author: renn\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCUTIL_SCANUTILS_H_\n#define TESSERACT_CCUTIL_SCANUTILS_H_\n\n#include <tesseract/export.h>\n\n#include <cstdio> // for FILE\n\n/**\n * fscanf variant to ensure correct reading regardless of locale.\n *\n * tfscanf parse a file stream according to the given format. See the fscanf\n * manpage for more information, as this function attempts to mimic its\n * behavior.\n *\n * @note Note that scientific floating-point notation is not supported.\n *\n */\nTESS_API\nint tfscanf(FILE *stream, const char *format, ...);\n\n#endif // TESSERACT_CCUTIL_SCANUTILS_H_\n"
  },
  {
    "path": "src/ccutil/serialis.cpp",
    "content": "/**********************************************************************\n * File:        serialis.cpp  (Formerly serialmac.h)\n * Description: Inline routines and macros for serialisation functions\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1990, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"serialis.h\"\n\n#include \"errcode.h\"\n\n#include \"helpers.h\" // for ReverseN\n\n#include <climits> // for INT_MAX\n#include <cstdio>\n\nnamespace tesseract {\n\n// The default FileReader loads the whole file into the vector of char,\n// returning false on error.\nbool LoadDataFromFile(const char *filename, std::vector<char> *data) {\n  bool result = false;\n  FILE *fp = fopen(filename, \"rb\");\n  if (fp != nullptr) {\n    fseek(fp, 0, SEEK_END);\n    auto size = std::ftell(fp);\n    fseek(fp, 0, SEEK_SET);\n    // Trying to open a directory on Linux sets size to LONG_MAX. 
Catch it here.\n    if (size > 0 && size < LONG_MAX) {\n      // reserve an extra byte in case caller wants to append a '\\0' character\n      data->reserve(size + 1);\n      data->resize(size); // TODO: optimize no init\n      result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;\n    }\n    fclose(fp);\n  }\n  return result;\n}\n\n// The default FileWriter writes the vector of char to the filename file,\n// returning false on error.\nbool SaveDataToFile(const std::vector<char> &data, const char *filename) {\n  FILE *fp = fopen(filename, \"wb\");\n  if (fp == nullptr) {\n    return false;\n  }\n  bool result = fwrite(&data[0], 1, data.size(), fp) == data.size();\n  fclose(fp);\n  return result;\n}\n\nTFile::TFile() {\n}\n\nTFile::~TFile() {\n  if (data_is_owned_) {\n    delete data_;\n  }\n}\n\nbool TFile::DeSerializeSize(int32_t *pSize) {\n  uint32_t size;\n  if (FReadEndian(&size, sizeof(size), 1) != 1) {\n    return false;\n  }\n  if (size > data_->size() / 4) {\n    // Reverse endianness.\n    swap_ = !swap_;\n    ReverseN(&size, 4);\n  }\n  *pSize = size;\n  return true;\n}\n\nbool TFile::DeSerializeSkip(size_t size) {\n  uint32_t len;\n  if (!DeSerialize(&len)) {\n    return false;\n  }\n  return Skip(len * size);\n}\n\nbool TFile::DeSerialize(std::string &data) {\n  uint32_t size;\n  if (!DeSerialize(&size)) {\n    return false;\n  } else if (size > 0) {\n    // TODO: optimize.\n    data.resize(size);\n    return DeSerialize(&data[0], size);\n  }\n  data.clear();\n  return true;\n}\n\nbool TFile::Serialize(const std::string &data) {\n  uint32_t size = data.size();\n  return Serialize(&size) && Serialize(data.c_str(), size);\n}\n\nbool TFile::DeSerialize(std::vector<char> &data) {\n  uint32_t size;\n  if (!DeSerialize(&size)) {\n    return false;\n  } else if (size > 0) {\n    // TODO: optimize.\n    data.resize(size);\n    return DeSerialize(&data[0], data.size());\n  }\n  data.clear();\n  return true;\n}\n\nbool TFile::Serialize(const 
std::vector<char> &data) {\n  uint32_t size = data.size();\n  if (!Serialize(&size)) {\n    return false;\n  } else if (size > 0) {\n    return Serialize(&data[0], size);\n  }\n  return true;\n}\n\nbool TFile::Skip(size_t count) {\n  offset_ += count;\n  return true;\n}\n\nbool TFile::Open(const char *filename, FileReader reader) {\n  if (!data_is_owned_) {\n    data_ = new std::vector<char>;\n    data_is_owned_ = true;\n  }\n  offset_ = 0;\n  is_writing_ = false;\n  swap_ = false;\n  if (reader == nullptr) {\n    return LoadDataFromFile(filename, data_);\n  } else {\n    return (*reader)(filename, data_);\n  }\n}\n\nbool TFile::Open(const char *data, size_t size) {\n  offset_ = 0;\n  if (!data_is_owned_) {\n    data_ = new std::vector<char>;\n    data_is_owned_ = true;\n  }\n  is_writing_ = false;\n  swap_ = false;\n  data_->resize(size); // TODO: optimize no init\n  memcpy(&(*data_)[0], data, size);\n  return true;\n}\n\nbool TFile::Open(FILE *fp, int64_t end_offset) {\n  offset_ = 0;\n  auto current_pos = std::ftell(fp);\n  if (current_pos < 0) {\n    // ftell failed.\n    return false;\n  }\n  if (end_offset < 0) {\n    if (fseek(fp, 0, SEEK_END)) {\n      return false;\n    }\n    end_offset = ftell(fp);\n    if (fseek(fp, current_pos, SEEK_SET)) {\n      return false;\n    }\n  }\n  size_t size = end_offset - current_pos;\n  is_writing_ = false;\n  swap_ = false;\n  if (!data_is_owned_) {\n    data_ = new std::vector<char>;\n    data_is_owned_ = true;\n  }\n  data_->resize(size); // TODO: optimize no init\n  return fread(&(*data_)[0], 1, size, fp) == size;\n}\n\nchar *TFile::FGets(char *buffer, int buffer_size) {\n  ASSERT_HOST(!is_writing_);\n  int size = 0;\n  while (size + 1 < buffer_size && offset_ < data_->size()) {\n    buffer[size++] = (*data_)[offset_++];\n    if ((*data_)[offset_ - 1] == '\\n') {\n      break;\n    }\n  }\n  if (size < buffer_size) {\n    buffer[size] = '\\0';\n  }\n  return size > 0 ? 
buffer : nullptr;\n}\n\nsize_t TFile::FReadEndian(void *buffer, size_t size, size_t count) {\n  auto num_read = FRead(buffer, size, count);\n  if (swap_ && size != 1) {\n    char *char_buffer = static_cast<char *>(buffer);\n    for (size_t i = 0; i < num_read; ++i, char_buffer += size) {\n      ReverseN(char_buffer, size);\n    }\n  }\n  return num_read;\n}\n\nsize_t TFile::FRead(void *buffer, size_t size, size_t count) {\n  ASSERT_HOST(!is_writing_);\n  ASSERT_HOST(size > 0);\n  size_t required_size;\n  if (SIZE_MAX / size <= count) {\n    // Avoid integer overflow.\n    required_size = data_->size() - offset_;\n  } else {\n    required_size = size * count;\n    if (data_->size() - offset_ < required_size) {\n      required_size = data_->size() - offset_;\n    }\n  }\n  if (required_size > 0 && buffer != nullptr) {\n    memcpy(buffer, &(*data_)[offset_], required_size);\n  }\n  offset_ += required_size;\n  return required_size / size;\n}\n\nvoid TFile::Rewind() {\n  ASSERT_HOST(!is_writing_);\n  offset_ = 0;\n}\n\nvoid TFile::OpenWrite(std::vector<char> *data) {\n  offset_ = 0;\n  if (data != nullptr) {\n    if (data_is_owned_) {\n      delete data_;\n    }\n    data_ = data;\n    data_is_owned_ = false;\n  } else if (!data_is_owned_) {\n    data_ = new std::vector<char>;\n    data_is_owned_ = true;\n  }\n  is_writing_ = true;\n  swap_ = false;\n  data_->clear();\n}\n\nbool TFile::CloseWrite(const char *filename, FileWriter writer) {\n  ASSERT_HOST(is_writing_);\n  if (writer == nullptr) {\n    return SaveDataToFile(*data_, filename);\n  } else {\n    return (*writer)(*data_, filename);\n  }\n}\n\nsize_t TFile::FWrite(const void *buffer, size_t size, size_t count) {\n  ASSERT_HOST(is_writing_);\n  ASSERT_HOST(size > 0);\n  ASSERT_HOST(SIZE_MAX / size > count);\n  size_t total = size * count;\n  const char *buf = static_cast<const char *>(buffer);\n  // This isn't very efficient, but memory is so fast compared to disk\n  // that it is relatively unimportant, and 
very simple.\n  for (size_t i = 0; i < total; ++i) {\n    data_->push_back(buf[i]);\n  }\n  return count;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccutil/serialis.h",
    "content": "/**********************************************************************\n * File:        serialis.h  (Formerly serialmac.h)\n * Description: Inline routines and macros for serialisation functions\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1990, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef SERIALIS_H\n#define SERIALIS_H\n\n#include <tesseract/baseapi.h> // FileReader\n#include <cstdint>             // uint8_t\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n#include <type_traits>\n#include <vector> // std::vector\n\nnamespace tesseract {\n\n// Return number of elements of an array.\ntemplate <typename T, size_t N>\nconstexpr size_t countof(T const (&)[N]) noexcept {\n  return N;\n}\n\n// Function to write a std::vector<char> to a whole file.\n// Returns false on failure.\nusing FileWriter = bool (*)(const std::vector<char> &data, const char *filename);\n\nTESS_API\nbool LoadDataFromFile(const char *filename, std::vector<char> *data);\nTESS_API\nbool SaveDataToFile(const std::vector<char> &data, const char *filename);\n\n// Deserialize data from file.\ntemplate <typename T>\nbool DeSerialize(FILE *fp, T *data, size_t n = 1) {\n  return fread(data, sizeof(T), n, fp) == n;\n}\n\n// Serialize data to file.\ntemplate <typename T>\nbool Serialize(FILE *fp, const T *data, size_t n = 1) {\n  return fwrite(data, 
sizeof(T), n, fp) == n;\n}\n\n// Simple file class.\n// Allows for portable file input from memory and from foreign file systems.\nclass TESS_API TFile {\npublic:\n  TFile();\n  ~TFile();\n\n  // All the Open methods load the whole file into memory for reading.\n  // Opens a file with a supplied reader, or nullptr to use the default.\n  // Note that mixed read/write is not supported.\n  bool Open(const char *filename, FileReader reader);\n  // From an existing memory buffer.\n  bool Open(const char *data, size_t size);\n  // From an open file and an end offset.\n  bool Open(FILE *fp, int64_t end_offset);\n  // Sets the value of the swap flag, so that FReadEndian does the right thing.\n  void set_swap(bool value) {\n    swap_ = value;\n  }\n\n  // Deserialize data.\n  bool DeSerializeSize(int32_t *data);\n  bool DeSerializeSkip(size_t size = 1);\n  bool DeSerialize(std::string &data);\n  bool DeSerialize(std::vector<char> &data);\n  //bool DeSerialize(std::vector<std::string> &data);\n  template <typename T>\n  bool DeSerialize(T *data, size_t count = 1) {\n    return FReadEndian(data, sizeof(T), count) == count;\n  }\n  template <typename T>\n  bool DeSerialize(std::vector<T> &data) {\n    uint32_t size;\n    if (!DeSerialize(&size)) {\n      return false;\n    } else if (size == 0) {\n      data.clear();\n    } else if (size > 50000000) {\n      // Arbitrarily limit the number of elements to protect against bad data.\n      return false;\n    } else if constexpr (std::is_same<T, std::string>::value) {\n      // Deserialize a string.\n      // TODO: optimize.\n      data.resize(size);\n      for (auto &item : data) {\n        if (!DeSerialize(item)) {\n          return false;\n        }\n      }\n    } else if constexpr (std::is_class<T>::value) {\n      // Deserialize a tesseract class.\n      // TODO: optimize.\n      data.resize(size);\n      for (auto &item : data) {\n        if (!item.DeSerialize(this)) {\n          return false;\n        }\n      }\n    } 
else if constexpr (std::is_pointer<T>::value) {\n      // Deserialize pointers.\n      // TODO: optimize.\n      data.resize(size);\n      for (uint32_t i = 0; i < size; i++) {\n        uint8_t non_null;\n\tif (!DeSerialize(&non_null)) {\n          return false;\n\t}\n        if (non_null) {\n          typedef typename std::remove_pointer<T>::type ST;\n          auto item = new ST;\n          if (!item->DeSerialize(this)) {\n            delete item;\n            return false;\n          }\n          data[i] = item;\n        }\n      }\n    } else {\n      // Deserialize a non-class.\n      // TODO: optimize.\n      data.resize(size);\n      return DeSerialize(&data[0], size);\n    }\n    return true;\n  }\n\n  // Serialize data.\n  bool Serialize(const std::string &data);\n  bool Serialize(const std::vector<char> &data);\n  template <typename T>\n  bool Serialize(const T *data, size_t count = 1) {\n    return FWrite(data, sizeof(T), count) == count;\n  }\n  template <typename T>\n  bool Serialize(const std::vector<T> &data) {\n    // Serialize number of elements first.\n    uint32_t size = data.size();\n    if (!Serialize(&size)) {\n      return false;\n    } else if constexpr (std::is_same<T, std::string>::value) {\n      // Serialize strings.\n      for (auto &&string : data) {\n        if (!Serialize(string)) {\n          return false;\n        }\n      }\n    } else if constexpr (std::is_class<T>::value) {\n      // Serialize a tesseract class.\n      for (auto &item : data) {\n        if (!item.Serialize(this)) {\n          return false;\n        }\n      }\n    } else if constexpr (std::is_pointer<T>::value) {\n      // Serialize pointers.\n      for (auto &item : data) {\n        uint8_t non_null = (item != nullptr);\n\tif (!Serialize(&non_null)) {\n          return false;\n\t}\n        if (non_null) {\n          if (!item->Serialize(this)) {\n            return false;\n\t  }\n\t}\n      }\n    } else if (size > 0) {\n      // Serialize a non-class.\n      
return Serialize(&data[0], size);\n    }\n    return true;\n  }\n\n  // Skip data.\n  bool Skip(size_t count);\n\n  // Reads a line like fgets. Returns nullptr on EOF, otherwise buffer.\n  // Reads at most buffer_size bytes, including '\\0' terminator, even if\n  // the line is longer. Does nothing if buffer_size <= 0.\n  char *FGets(char *buffer, int buffer_size);\n  // Replicates fread, followed by a swap of the bytes if needed, returning the\n  // number of items read. If swap_ is true then the count items will each have\n  // size bytes reversed.\n  size_t FReadEndian(void *buffer, size_t size, size_t count);\n  // Replicates fread, returning the number of items read.\n  size_t FRead(void *buffer, size_t size, size_t count);\n  // Resets the TFile as if it has been Opened, but nothing read.\n  // Only allowed while reading!\n  void Rewind();\n\n  // Open for writing. Either supply a non-nullptr data with OpenWrite before\n  // calling FWrite, (no close required), or supply a nullptr data to OpenWrite\n  // and call CloseWrite to write to a file after the FWrites.\n  void OpenWrite(std::vector<char> *data);\n  bool CloseWrite(const char *filename, FileWriter writer);\n\n  // Replicates fwrite, returning the number of items written.\n  // To use fprintf, use snprintf and FWrite.\n  size_t FWrite(const void *buffer, size_t size, size_t count);\n\nprivate:\n  // The buffered data from the file.\n  std::vector<char> *data_ = nullptr;\n  // The number of bytes used so far.\n  unsigned offset_ = 0;\n  // True if the data_ pointer is owned by *this.\n  bool data_is_owned_ = false;\n  // True if the TFile is open for writing.\n  bool is_writing_ = false;\n  // True if bytes need to be swapped in FReadEndian.\n  bool swap_ = false;\n};\n\n} // namespace tesseract.\n\n#endif\n"
  },
  {
    "path": "src/ccutil/sorthelper.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        sorthelper.h\n// Description: Generic sort and maxfinding class.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_SORTHELPER_H_\n#define TESSERACT_CCUTIL_SORTHELPER_H_\n\n#include <cstdlib>\n#include <vector>\n\nnamespace tesseract {\n\n// Generic class to provide functions based on a <value,count> pair.\n// T is the value type.\n// The class keeps a count of each value and can return the most frequent\n// value or a sorted array of the values with counts.\n// Note that this class uses linear search for adding. It is better\n// to use the STATS class to get the mode of a large number of values\n// in a small space. 
SortHelper is better to get the mode of a small number\n// of values from a large space.\n// T must have a copy constructor.\ntemplate <typename T>\nclass SortHelper {\npublic:\n  // Simple pair class to hold the values and counts.\n  template <typename PairT>\n  struct SortPair {\n    PairT value;\n    int count;\n  };\n  // qsort function to sort by decreasing count.\n  static int SortPairsByCount(const void *v1, const void *v2) {\n    const auto *p1 = static_cast<const SortPair<T> *>(v1);\n    const auto *p2 = static_cast<const SortPair<T> *>(v2);\n    return p2->count - p1->count;\n  }\n  // qsort function to sort by decreasing value.\n  static int SortPairsByValue(const void *v1, const void *v2) {\n    const auto *p1 = static_cast<const SortPair<T> *>(v1);\n    const auto *p2 = static_cast<const SortPair<T> *>(v2);\n    if (p2->value - p1->value < 0) {\n      return -1;\n    }\n    if (p2->value - p1->value > 0) {\n      return 1;\n    }\n    return 0;\n  }\n\n  // Constructor takes a hint of the array size, but it need not be accurate.\n  explicit SortHelper(int sizehint) {\n    counts_.reserve(sizehint);\n  }\n\n  // Add a value that may be a duplicate of an existing value.\n  // Uses a linear search.\n  void Add(T value, int count) {\n    // Linear search for value.\n    for (auto &it : counts_) {\n      if (it.value == value) {\n        it.count += count;\n        return;\n      }\n    }\n    SortPair<T> new_pair = {value, count};\n    counts_.push_back(SortPair<T>(new_pair));\n  }\n\n  // Returns the frequency of the most frequent value.\n  // If max_value is not nullptr, returns the most frequent value.\n  // If the array is empty, returns -INT32_MAX and max_value is unchanged.\n  int MaxCount(T *max_value) const {\n    int best_count = -INT32_MAX;\n    for (auto &it : counts_) {\n      if (it.count > best_count) {\n        best_count = it.count;\n        if (max_value != nullptr) {\n          *max_value = it.value;\n        }\n      }\n    }\n    return 
best_count;\n  }\n\n  // Returns the data array sorted by decreasing frequency.\n  const std::vector<SortPair<T>> &SortByCount() {\n    counts_.sort(&SortPairsByCount);\n    return counts_;\n  }\n  // Returns the data array sorted by decreasing value.\n  const std::vector<SortPair<T>> &SortByValue() {\n    counts_.sort(&SortPairsByValue);\n    return counts_;\n  }\n\nprivate:\n  std::vector<SortPair<T>> counts_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_SORTHELPER_H_.\n"
  },
  {
    "path": "src/ccutil/tessdatamanager.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tessdatamanager.cpp\n// Description: Functions to handle loading/combining tesseract data files.\n// Author:      Daria Antonova\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"tessdatamanager.h\"\n\n#include <cstdio>\n#include <string>\n\n#if defined(HAVE_LIBARCHIVE)\n#  include <archive.h>\n#  include <archive_entry.h>\n#endif\n\n#include <tesseract/version.h>\n#include \"errcode.h\"\n#include \"helpers.h\"\n#include \"params.h\"\n#include \"serialis.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\nTessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {\n  SetVersionString(TESSERACT_VERSION_STR);\n}\n\nTessdataManager::TessdataManager(FileReader reader)\n    : reader_(reader), is_loaded_(false), swap_(false) {\n  SetVersionString(TESSERACT_VERSION_STR);\n}\n\n// Lazily loads from the given filename. 
Won't actually read the file\n// until it needs it.\nvoid TessdataManager::LoadFileLater(const char *data_file_name) {\n  Clear();\n  data_file_name_ = data_file_name;\n}\n\n#if defined(HAVE_LIBARCHIVE)\nbool TessdataManager::LoadArchiveFile(const char *filename) {\n  bool result = false;\n  archive *a = archive_read_new();\n  if (a != nullptr) {\n    archive_read_support_filter_all(a);\n    archive_read_support_format_all(a);\n    if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {\n      archive_entry *ae;\n      while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {\n        const char *component = archive_entry_pathname(ae);\n        if (component != nullptr) {\n          TessdataType type;\n          if (TessdataTypeFromFileName(component, &type)) {\n            int64_t size = archive_entry_size(ae);\n            if (size > 0) {\n              entries_[type].resize(size);\n              if (archive_read_data(a, &entries_[type][0], size) == size) {\n                is_loaded_ = true;\n              }\n            }\n          }\n        }\n      }\n      result = is_loaded_;\n    }\n    archive_read_free(a);\n  }\n  return result;\n}\n#endif\n\nbool TessdataManager::Init(const char *data_file_name) {\n  std::vector<char> data;\n  if (reader_ == nullptr) {\n#if defined(HAVE_LIBARCHIVE)\n    if (LoadArchiveFile(data_file_name)) {\n      return true;\n    }\n#endif\n    if (!LoadDataFromFile(data_file_name, &data)) {\n      return false;\n    }\n  } else {\n    if (!(*reader_)(data_file_name, &data)) {\n      return false;\n    }\n  }\n  return LoadMemBuffer(data_file_name, &data[0], data.size());\n}\n\n// Loads from the given memory buffer as if a file.\nbool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) {\n  // TODO: This method supports only the proprietary file format.\n  Clear();\n  data_file_name_ = name;\n  TFile fp;\n  fp.Open(data, size);\n  uint32_t num_entries;\n  if (!fp.DeSerialize(&num_entries)) {\n    
return false;\n  }\n  swap_ = num_entries > kMaxNumTessdataEntries;\n  fp.set_swap(swap_);\n  if (swap_) {\n    ReverseN(&num_entries, sizeof(num_entries));\n  }\n  if (num_entries > kMaxNumTessdataEntries) {\n    return false;\n  }\n  // TODO: optimize (no init required).\n  std::vector<int64_t> offset_table(num_entries);\n  if (!fp.DeSerialize(&offset_table[0], num_entries)) {\n    return false;\n  }\n  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {\n    if (offset_table[i] >= 0) {\n      int64_t entry_size = size - offset_table[i];\n      unsigned j = i + 1;\n      while (j < num_entries && offset_table[j] == -1) {\n        ++j;\n      }\n      if (j < num_entries) {\n        entry_size = offset_table[j] - offset_table[i];\n      }\n      entries_[i].resize(entry_size);\n      if (!fp.DeSerialize(&entries_[i][0], entry_size)) {\n        return false;\n      }\n    }\n  }\n  if (entries_[TESSDATA_VERSION].empty()) {\n    SetVersionString(\"Pre-4.0.0\");\n  }\n  is_loaded_ = true;\n  return true;\n}\n\n// Overwrites a single entry of the given type.\nvoid TessdataManager::OverwriteEntry(TessdataType type, const char *data, int size) {\n  is_loaded_ = true;\n  entries_[type].resize(size);\n  memcpy(&entries_[type][0], data, size);\n}\n\n// Saves to the given filename.\nbool TessdataManager::SaveFile(const char *filename, FileWriter writer) const {\n  // TODO: This method supports only the proprietary file format.\n  ASSERT_HOST(is_loaded_);\n  std::vector<char> data;\n  Serialize(&data);\n  if (writer == nullptr) {\n    return SaveDataToFile(data, filename);\n  } else {\n    return (*writer)(data, filename);\n  }\n}\n\n// Serializes to the given vector.\nvoid TessdataManager::Serialize(std::vector<char> *data) const {\n  // TODO: This method supports only the proprietary file format.\n  ASSERT_HOST(is_loaded_);\n  // Compute the offset_table and total size.\n  int64_t offset_table[TESSDATA_NUM_ENTRIES];\n  int64_t offset = sizeof(int32_t) 
+ sizeof(offset_table);\n  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {\n    if (entries_[i].empty()) {\n      offset_table[i] = -1;\n    } else {\n      offset_table[i] = offset;\n      offset += entries_[i].size();\n    }\n  }\n  data->resize(offset, 0);\n  int32_t num_entries = TESSDATA_NUM_ENTRIES;\n  TFile fp;\n  fp.OpenWrite(data);\n  fp.Serialize(&num_entries);\n  fp.Serialize(&offset_table[0], countof(offset_table));\n  for (const auto &entry : entries_) {\n    if (!entry.empty()) {\n      fp.Serialize(&entry[0], entry.size());\n    }\n  }\n}\n\n// Resets to the initial state, keeping the reader.\nvoid TessdataManager::Clear() {\n  for (auto &entry : entries_) {\n    entry.clear();\n  }\n  is_loaded_ = false;\n}\n\n// Prints a directory of contents.\nvoid TessdataManager::Directory() const {\n  printf(\"Version:%s\\n\", VersionString().c_str());\n  auto offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);\n  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {\n    if (!entries_[i].empty()) {\n      printf(\"%u:%s:size=%zu, offset=%zu\\n\", i, kTessdataFileSuffixes[i], entries_[i].size(),\n              offset);\n      offset += entries_[i].size();\n    }\n  }\n}\n\n// Opens the given TFile pointer to the given component type.\n// Returns false in case of failure.\nbool TessdataManager::GetComponent(TessdataType type, TFile *fp) {\n  if (!is_loaded_ && !Init(data_file_name_.c_str())) {\n    return false;\n  }\n  const TessdataManager *const_this = this;\n  return const_this->GetComponent(type, fp);\n}\n\n// As non-const version except it can't load the component if not already\n// loaded.\nbool TessdataManager::GetComponent(TessdataType type, TFile *fp) const {\n  ASSERT_HOST(is_loaded_);\n  if (entries_[type].empty()) {\n    return false;\n  }\n  fp->Open(&entries_[type][0], entries_[type].size());\n  fp->set_swap(swap_);\n  return true;\n}\n\n// Returns the current version string.\nstd::string TessdataManager::VersionString() const {\n  return 
std::string(&entries_[TESSDATA_VERSION][0], entries_[TESSDATA_VERSION].size());\n}\n\n// Sets the version string to the given v_str.\nvoid TessdataManager::SetVersionString(const std::string &v_str) {\n  entries_[TESSDATA_VERSION].resize(v_str.size());\n  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());\n}\n\nbool TessdataManager::CombineDataFiles(const char *language_data_path_prefix,\n                                       const char *output_filename) {\n  // Load individual tessdata components from files.\n  for (auto filesuffix : kTessdataFileSuffixes) {\n    TessdataType type;\n    ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));\n    std::string filename = language_data_path_prefix;\n    filename += filesuffix;\n    FILE *fp = fopen(filename.c_str(), \"rb\");\n    if (fp != nullptr) {\n      fclose(fp);\n      if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {\n        tprintf(\"Load of file %s failed!\\n\", filename.c_str());\n        return false;\n      }\n    }\n  }\n  is_loaded_ = true;\n\n  // Make sure that the required components are present.\n  if (!IsBaseAvailable() && !IsLSTMAvailable()) {\n    tprintf(\n        \"Error: traineddata file must contain at least (a unicharset file\"\n        \" and inttemp) OR an lstm file.\\n\");\n    return false;\n  }\n  // Write updated data to the output traineddata file.\n  return SaveFile(output_filename, nullptr);\n}\n\nbool TessdataManager::OverwriteComponents(const char *new_traineddata_filename,\n                                          char **component_filenames, int num_new_components) {\n  // Open the files with the new components.\n  // TODO: This method supports only the proprietary file format.\n  for (int i = 0; i < num_new_components; ++i) {\n    TessdataType type;\n    if (TessdataTypeFromFileName(component_filenames[i], &type)) {\n      if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {\n        tprintf(\"Failed to read component file:%s\\n\", 
component_filenames[i]);\n        return false;\n      }\n    }\n  }\n\n  // Write updated data to the output traineddata file.\n  return SaveFile(new_traineddata_filename, nullptr);\n}\n\nbool TessdataManager::ExtractToFile(const char *filename) {\n  TessdataType type = TESSDATA_NUM_ENTRIES;\n  ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));\n  if (entries_[type].empty()) {\n    return false;\n  }\n  return SaveDataToFile(entries_[type], filename);\n}\n\nbool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) {\n  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {\n    if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {\n      *type = static_cast<TessdataType>(i);\n      return true;\n    }\n  }\n#if !defined(NDEBUG)\n  tprintf(\n      \"TessdataManager can't determine which tessdata\"\n      \" component is represented by %s\\n\",\n      suffix);\n#endif\n  return false;\n}\n\nbool TessdataManager::TessdataTypeFromFileName(const char *filename, TessdataType *type) {\n  // Get the file suffix (extension)\n  const char *suffix = strrchr(filename, '.');\n  if (suffix == nullptr || *(++suffix) == '\\0') {\n    return false;\n  }\n  return TessdataTypeFromFileSuffix(suffix, type);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/tessdatamanager.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tessdatamanager.h\n// Description: Functions to handle loading/combining tesseract data files.\n// Author:      Daria Antonova\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_\n#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_\n\n#include <tesseract/baseapi.h> // FileReader\n#include <string>              // std::string\n#include <vector>              // std::vector\n#include \"serialis.h\"          // FileWriter\n\nstatic const char kTrainedDataSuffix[] = \"traineddata\";\n\n// When adding new tessdata types and file suffixes, please make sure to\n// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.\nstatic const char kLangConfigFileSuffix[] = \"config\";\nstatic const char kUnicharsetFileSuffix[] = \"unicharset\";\nstatic const char kAmbigsFileSuffix[] = \"unicharambigs\";\nstatic const char kBuiltInTemplatesFileSuffix[] = \"inttemp\";\nstatic const char kBuiltInCutoffsFileSuffix[] = \"pffmtable\";\nstatic const char kNormProtoFileSuffix[] = \"normproto\";\nstatic const char kPuncDawgFileSuffix[] = \"punc-dawg\";\nstatic const char kSystemDawgFileSuffix[] = \"word-dawg\";\nstatic const char kNumberDawgFileSuffix[] = \"number-dawg\";\nstatic const char kFreqDawgFileSuffix[] = 
\"freq-dawg\";\nstatic const char kFixedLengthDawgsFileSuffix[] = \"fixed-length-dawgs\";\nstatic const char kCubeUnicharsetFileSuffix[] = \"cube-unicharset\";\nstatic const char kCubeSystemDawgFileSuffix[] = \"cube-word-dawg\";\nstatic const char kShapeTableFileSuffix[] = \"shapetable\";\nstatic const char kBigramDawgFileSuffix[] = \"bigram-dawg\";\nstatic const char kUnambigDawgFileSuffix[] = \"unambig-dawg\";\nstatic const char kParamsModelFileSuffix[] = \"params-model\";\nstatic const char kLSTMModelFileSuffix[] = \"lstm\";\nstatic const char kLSTMPuncDawgFileSuffix[] = \"lstm-punc-dawg\";\nstatic const char kLSTMSystemDawgFileSuffix[] = \"lstm-word-dawg\";\nstatic const char kLSTMNumberDawgFileSuffix[] = \"lstm-number-dawg\";\nstatic const char kLSTMUnicharsetFileSuffix[] = \"lstm-unicharset\";\nstatic const char kLSTMRecoderFileSuffix[] = \"lstm-recoder\";\nstatic const char kVersionFileSuffix[] = \"version\";\n\nnamespace tesseract {\n\nenum TessdataType {\n  TESSDATA_LANG_CONFIG,        // 0\n  TESSDATA_UNICHARSET,         // 1\n  TESSDATA_AMBIGS,             // 2\n  TESSDATA_INTTEMP,            // 3\n  TESSDATA_PFFMTABLE,          // 4\n  TESSDATA_NORMPROTO,          // 5\n  TESSDATA_PUNC_DAWG,          // 6\n  TESSDATA_SYSTEM_DAWG,        // 7\n  TESSDATA_NUMBER_DAWG,        // 8\n  TESSDATA_FREQ_DAWG,          // 9\n  TESSDATA_FIXED_LENGTH_DAWGS, // 10  // deprecated\n  TESSDATA_CUBE_UNICHARSET,    // 11  // deprecated\n  TESSDATA_CUBE_SYSTEM_DAWG,   // 12  // deprecated\n  TESSDATA_SHAPE_TABLE,        // 13\n  TESSDATA_BIGRAM_DAWG,        // 14\n  TESSDATA_UNAMBIG_DAWG,       // 15\n  TESSDATA_PARAMS_MODEL,       // 16\n  TESSDATA_LSTM,               // 17\n  TESSDATA_LSTM_PUNC_DAWG,     // 18\n  TESSDATA_LSTM_SYSTEM_DAWG,   // 19\n  TESSDATA_LSTM_NUMBER_DAWG,   // 20\n  TESSDATA_LSTM_UNICHARSET,    // 21\n  TESSDATA_LSTM_RECODER,       // 22\n  TESSDATA_VERSION,            // 23\n\n  TESSDATA_NUM_ENTRIES\n};\n\n/**\n * kTessdataFileSuffixes[i] 
indicates the file suffix for\n * tessdata of type i (from TessdataType enum).\n */\nstatic const char *const kTessdataFileSuffixes[] = {\n    kLangConfigFileSuffix,       // 0\n    kUnicharsetFileSuffix,       // 1\n    kAmbigsFileSuffix,           // 2\n    kBuiltInTemplatesFileSuffix, // 3\n    kBuiltInCutoffsFileSuffix,   // 4\n    kNormProtoFileSuffix,        // 5\n    kPuncDawgFileSuffix,         // 6\n    kSystemDawgFileSuffix,       // 7\n    kNumberDawgFileSuffix,       // 8\n    kFreqDawgFileSuffix,         // 9\n    kFixedLengthDawgsFileSuffix, // 10  // deprecated\n    kCubeUnicharsetFileSuffix,   // 11  // deprecated\n    kCubeSystemDawgFileSuffix,   // 12  // deprecated\n    kShapeTableFileSuffix,       // 13\n    kBigramDawgFileSuffix,       // 14\n    kUnambigDawgFileSuffix,      // 15\n    kParamsModelFileSuffix,      // 16\n    kLSTMModelFileSuffix,        // 17\n    kLSTMPuncDawgFileSuffix,     // 18\n    kLSTMSystemDawgFileSuffix,   // 19\n    kLSTMNumberDawgFileSuffix,   // 20\n    kLSTMUnicharsetFileSuffix,   // 21\n    kLSTMRecoderFileSuffix,      // 22\n    kVersionFileSuffix,          // 23\n};\n\n/**\n * TessdataType could be updated to contain more entries, however\n * we do not expect that number to be astronomically high.\n * In order to automatically detect endianness TessdataManager will\n * flip the bits if actual_tessdata_num_entries_ is larger than\n * kMaxNumTessdataEntries.\n */\nstatic const int kMaxNumTessdataEntries = 1000;\n\nclass TESS_API TessdataManager {\npublic:\n  TessdataManager();\n  explicit TessdataManager(FileReader reader);\n\n  ~TessdataManager() = default;\n\n  bool swap() const {\n    return swap_;\n  }\n  bool is_loaded() const {\n    return is_loaded_;\n  }\n\n  // Lazily loads from the given filename. 
Won't actually read the file\n  // until it needs it.\n  void LoadFileLater(const char *data_file_name);\n  /**\n   * Opens and reads the given data file right now.\n   * @return true on success.\n   */\n  bool Init(const char *data_file_name);\n  // Loads from the given memory buffer as if a file, remembering name as some\n  // arbitrary source id for caching.\n  bool LoadMemBuffer(const char *name, const char *data, int size);\n  // Overwrites a single entry of the given type.\n  void OverwriteEntry(TessdataType type, const char *data, int size);\n\n  // Saves to the given filename.\n  bool SaveFile(const char *filename, FileWriter writer) const;\n  // Serializes to the given vector.\n  void Serialize(std::vector<char> *data) const;\n  // Resets to the initial state, keeping the reader.\n  void Clear();\n\n  // Prints a directory of contents.\n  void Directory() const;\n\n  // Returns true if the component requested is present.\n  bool IsComponentAvailable(TessdataType type) const {\n    return !entries_[type].empty();\n  }\n  // Opens the given TFile pointer to the given component type.\n  // Returns false in case of failure.\n  bool GetComponent(TessdataType type, TFile *fp);\n  // As non-const version except it can't load the component if not already\n  // loaded.\n  bool GetComponent(TessdataType type, TFile *fp) const;\n\n  // Returns the current version string.\n  std::string VersionString() const;\n  // Sets the version string to the given v_str.\n  void SetVersionString(const std::string &v_str);\n\n  // Returns true if the base Tesseract components are present.\n  bool IsBaseAvailable() const {\n    return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();\n  }\n\n  // Returns true if the LSTM components are present.\n  bool IsLSTMAvailable() const {\n    return !entries_[TESSDATA_LSTM].empty();\n  }\n\n  // Return the name of the underlying data file.\n  const std::string &GetDataFileName() const {\n    return 
data_file_name_;\n  }\n\n  /**\n   * Reads all the standard tesseract config and data files for a language\n   * at the given path and bundles them up into one binary data file.\n   * Returns true if the combined traineddata file was successfully written.\n   */\n  bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);\n\n  /**\n   * Gets the individual components from the data_file_ with which the class was\n   * initialized. Overwrites the components specified by component_filenames.\n   * Writes the updated traineddata file to new_traineddata_filename.\n   */\n  bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,\n                           int num_new_components);\n\n  /**\n   * Extracts tessdata component implied by the name of the input file from\n   * the combined traineddata loaded into TessdataManager.\n   * Writes the extracted component to the file indicated by the file name.\n   * E.g. if the filename given is somepath/somelang.unicharset, unicharset\n   * will be extracted from the data loaded into the TessdataManager and will\n   * be written to somepath/somelang.unicharset.\n   * @return true if the component was successfully extracted, false if the\n   * component was not present in the traineddata loaded into TessdataManager.\n   */\n  bool ExtractToFile(const char *filename);\n\nprivate:\n  // Use libarchive.\n  bool LoadArchiveFile(const char *filename);\n\n  /**\n   * Fills type with TessdataType of the tessdata component represented by the\n   * given file name. E.g. 
tessdata/eng.unicharset -> TESSDATA_UNICHARSET.\n   * @return true if the tessdata component type could be determined\n   * from the given file name.\n   */\n  static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);\n\n  /**\n   * Tries to determine tessdata component file suffix from filename,\n   * returns true on success.\n   */\n  static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);\n\n  // Name of file it came from.\n  std::string data_file_name_;\n  // Function to load the file when we need it.\n  FileReader reader_;\n  // True if the file has been loaded.\n  bool is_loaded_;\n  // True if the bytes need swapping.\n  bool swap_;\n  // Contents of each element of the traineddata file.\n  std::vector<char> entries_[TESSDATA_NUM_ENTRIES];\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_\n"
  },
  {
    "path": "src/ccutil/tesserrstream.h",
    "content": "// File:        tesserrstream.h\n// Description: C++ stream which enhances tprintf\n// Author:      Stefan Weil\n//\n// (C) Copyright 2024\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_CCUTIL_TESSERRSTREAM_H\n#define TESSERACT_CCUTIL_TESSERRSTREAM_H\n\n#include \"tprintf.h\"\n#include <tesseract/export.h> // for TESS_API\n\n#include <ostream> // for std::ostream\n\nnamespace tesseract {\n\nclass TessStreamBuf : public std::streambuf {\npublic:\n  TessStreamBuf() = default;\n\nprotected:\n  virtual int_type overflow(int_type c) override {\n    if (c != EOF) {\n      if (debugfp == nullptr) {\n        debugfp = get_debugfp();\n      }\n      if (fputc(c, debugfp) == EOF) {\n        return EOF;\n      }\n    }\n    return c;\n  }\n\n  virtual std::streamsize xsputn(const char* s, std::streamsize n) override {\n    if (debugfp == nullptr) {\n      debugfp = get_debugfp();\n    }\n    return fwrite(s, 1, n, debugfp);\n  }\n\nprivate:\n  FILE *debugfp = nullptr;\n};\n\nclass TessErrStream : public std::ostream {\nprivate:\n  TessStreamBuf buf;\n\npublic:\n  TessErrStream() : std::ostream(nullptr), buf() {\n    rdbuf(&buf);\n  }\n};\n\nextern TESS_API TessErrStream tesserr;\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_TESSERRSTREAM_H\n"
  },
  {
    "path": "src/ccutil/tesstypes.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tesstypes.h\n// Description: Simple data types used by Tesseract code.\n// Author:      Stefan Weil\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TESSTYPES_H\n#define TESSERACT_TESSTYPES_H\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // FAST_FLOAT\n#endif\n\n#include <cstdint> // for int16_t, int32_t\n\nnamespace tesseract {\n\n// Image dimensions (width and height, coordinates).\n#if defined(LARGE_IMAGES)\nusing TDimension = int32_t;\n#else\nusing TDimension = int16_t;\n#endif\n\n// Floating point data type used for LSTM calculations.\n#if defined(FAST_FLOAT)\nusing TFloat = float;\n#else\nusing TFloat = double;\n#endif\n\n}\n\n#endif // TESSERACT_TESSTYPES_H\n"
  },
  {
    "path": "src/ccutil/tprintf.cpp",
    "content": "/**********************************************************************\n * File:        tprintf.cpp\n * Description: Trace version of printf - portable between UX and NT\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1995, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"tesserrstream.h\"\n#include \"tprintf.h\"\n\n#include \"params.h\"\n\n#include <climits> // for INT_MAX\n#include <cstdarg>\n#include <cstdio>\n\nnamespace tesseract {\n\nINT_VAR(log_level, INT_MAX, \"Logging level\");\n\nstatic STRING_VAR(debug_file, \"\", \"File to send tprintf output to\");\n\n// File for debug output.\nFILE *debugfp;\n\n// Set output for log messages.\n// The output is written to stderr if debug_file is empty.\n// Otherwise it is written to debug_file.\n// It is possible to switch between stderr and debug_file output:\n// tprintf(\"write to configured output\\n\");\n// debug_file = \"\";\n// tprintf(\"write to stderr\\n\");\n// debug_file = \"/tmp/log\";\n// tprintf(\"write to /tmp/log\\n\");\n// debug_file = \"\";\n// tprintf(\"write to stderr\\n\");\nFILE *get_debugfp() {\n  if (debug_file.empty()) {\n    // Write to stderr.\n    if (debugfp != stderr && debugfp != nullptr) {\n      
fclose(debugfp);\n    }\n    debugfp = stderr;\n  } else if (debugfp == stderr || debugfp == nullptr) {\n    // Write to file.\n#ifdef _WIN32\n    if (debug_file == \"/dev/null\") {\n      // Replace /dev/null by nul for Windows.\n      debug_file = \"nul\";\n    }\n#endif\n    debugfp = fopen(debug_file.c_str(), \"wb\");\n  }\n  return debugfp;\n}\n\nTessErrStream tesserr;\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/tprintf.h",
    "content": "/**********************************************************************\n * File:        tprintf.h\n * Description: Trace version of printf - portable between UX and NT\n * Author:      Phil Cheatle\n *\n * (C) Copyright 1995, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCUTIL_TPRINTF_H\n#define TESSERACT_CCUTIL_TPRINTF_H\n\n#include \"params.h\"           // for INT_VAR_H\n#include <tesseract/export.h> // for TESS_API\n#include <cstdarg>\n#include <utility>            // for std::forward\n\nnamespace tesseract {\n\n// Disable some log messages by setting log_level > 0.\nextern TESS_API INT_VAR_H(log_level);\n\n// Get file for debug output.\nTESS_API FILE *get_debugfp();\n\n// Main logging function. Trace printf.\ninline void tprintf(const char *format, ...) {\n  va_list args;\n  va_start(args, format);\n  vfprintf(get_debugfp(), format, args);\n  va_end(args);\n}\n\n} // namespace tesseract\n\n#endif // define TESSERACT_CCUTIL_TPRINTF_H\n"
  },
  {
    "path": "src/ccutil/unichar.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unichar.cpp\n// Description: Unicode character/ligature class.\n// Author:      Ray Smith\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <tesseract/unichar.h>\n#include \"errcode.h\"\n#include \"tprintf.h\"\n\n#define UNI_MAX_LEGAL_UTF32 0x0010FFFF\n\nnamespace tesseract {\n\n// Construct from a utf8 string. If len<0 then the string is null terminated.\n// If the string is too long to fit in the UNICHAR then it takes only what\n// will fit. 
Checks for illegal input and stops at an illegal sequence.\n// The resulting UNICHAR may be empty.\nUNICHAR::UNICHAR(const char *utf8_str, int len) {\n  int total_len = 0;\n  int step = 0;\n  if (len < 0) {\n    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {\n      ;\n    }\n  }\n  for (total_len = 0; total_len < len; total_len += step) {\n    step = utf8_step(utf8_str + total_len);\n    if (total_len + step > UNICHAR_LEN) {\n      break; // Too long.\n    }\n    if (step == 0) {\n      break; // Illegal first byte.\n    }\n    int i;\n    for (i = 1; i < step; ++i) {\n      if ((utf8_str[total_len + i] & 0xc0) != 0x80) {\n        break;\n      }\n    }\n    if (i < step) {\n      break; // Illegal surrogate\n    }\n  }\n  memcpy(chars, utf8_str, total_len);\n  if (total_len < UNICHAR_LEN) {\n    chars[UNICHAR_LEN - 1] = total_len;\n    while (total_len < UNICHAR_LEN - 1) {\n      chars[total_len++] = 0;\n    }\n  }\n}\n\n// Construct from a single UCS4 character. Illegal values are ignored,\n// resulting in an empty UNICHAR.\nUNICHAR::UNICHAR(int unicode) {\n  const int bytemask = 0xBF;\n  const int bytemark = 0x80;\n\n  if (unicode < 0x80) {\n    chars[UNICHAR_LEN - 1] = 1;\n    chars[2] = 0;\n    chars[1] = 0;\n    chars[0] = static_cast<char>(unicode);\n  } else if (unicode < 0x800) {\n    chars[UNICHAR_LEN - 1] = 2;\n    chars[2] = 0;\n    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);\n    unicode >>= 6;\n    chars[0] = static_cast<char>(unicode | 0xc0);\n  } else if (unicode < 0x10000) {\n    chars[UNICHAR_LEN - 1] = 3;\n    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);\n    unicode >>= 6;\n    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);\n    unicode >>= 6;\n    chars[0] = static_cast<char>(unicode | 0xe0);\n  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {\n    chars[UNICHAR_LEN - 1] = 4;\n    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);\n    unicode >>= 6;\n    chars[2] = 
static_cast<char>((unicode | bytemark) & bytemask);\n    unicode >>= 6;\n    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);\n    unicode >>= 6;\n    chars[0] = static_cast<char>(unicode | 0xf0);\n  } else {\n    memset(chars, 0, UNICHAR_LEN);\n  }\n}\n\n// Get the first character as UCS-4.\nint UNICHAR::first_uni() const {\n  static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};\n  int uni = 0;\n  int len = utf8_step(chars);\n  const char *src = chars;\n\n  switch (len) {\n    default:\n      break;\n    case 4:\n      uni += static_cast<unsigned char>(*src++);\n      uni <<= 6;\n      // Fall through.\n    case 3:\n      uni += static_cast<unsigned char>(*src++);\n      uni <<= 6;\n      // Fall through.\n    case 2:\n      uni += static_cast<unsigned char>(*src++);\n      uni <<= 6;\n      // Fall through.\n    case 1:\n      uni += static_cast<unsigned char>(*src++);\n  }\n  uni -= utf8_offsets[len];\n  return uni;\n}\n\n// Get a terminated UTF8 string: Must delete[] it after use.\nchar *UNICHAR::utf8_str() const {\n  int len = utf8_len();\n  char *str = new char[len + 1];\n  memcpy(str, chars, len);\n  str[len] = 0;\n  return str;\n}\n\n// Get the number of bytes in the first character of the given utf8 string.\nint UNICHAR::utf8_step(const char *utf8_str) {\n  static const char utf8_bytes[256] = {\n      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2,\n      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,\n      3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};\n\n  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];\n}\n\nUNICHAR::const_iterator &UNICHAR::const_iterator::operator++() {\n  ASSERT_HOST(it_ != nullptr);\n  int step = utf8_step(it_);\n  if (step == 0) {\n    tprintf(\"ERROR: Illegal UTF8 encountered.\\n\");\n    for (int i = 0; i < 5 && it_[i] != '\\0'; ++i) {\n      tprintf(\"Index %d char = 0x%x\\n\", i, it_[i]);\n    }\n    step = 1;\n  }\n  it_ += step;\n  return *this;\n}\n\nint UNICHAR::const_iterator::operator*() const {\n  ASSERT_HOST(it_ != nullptr);\n  const int len = utf8_step(it_);\n  if (len == 0) {\n    tprintf(\"WARNING: Illegal UTF8 encountered\\n\");\n    return ' ';\n  }\n  UNICHAR uch(it_, len);\n  return uch.first_uni();\n}\n\nint UNICHAR::const_iterator::get_utf8(char *utf8_output) const {\n  ASSERT_HOST(it_ != nullptr);\n  const int len = utf8_step(it_);\n  if (len == 0) {\n    tprintf(\"WARNING: Illegal UTF8 encountered\\n\");\n    utf8_output[0] = ' ';\n    return 1;\n  }\n  strncpy(utf8_output, it_, len);\n  return len;\n}\n\nint UNICHAR::const_iterator::utf8_len() const {\n  ASSERT_HOST(it_ != nullptr);\n  const int len = utf8_step(it_);\n  if (len == 0) {\n    tprintf(\"WARNING: Illegal UTF8 encountered\\n\");\n    return 1;\n  }\n  return len;\n}\n\nbool UNICHAR::const_iterator::is_legal() const {\n  return utf8_step(it_) > 0;\n}\n\nUNICHAR::const_iterator UNICHAR::begin(const char *utf8_str, int len) {\n  return UNICHAR::const_iterator(utf8_str);\n}\n\nUNICHAR::const_iterator UNICHAR::end(const char *utf8_str, int len) {\n  return UNICHAR::const_iterator(utf8_str + len);\n}\n\n// Converts a utf-8 string to a vector of unicodes.\n// Returns an empty vector if the input contains invalid UTF-8.\n/* static */\nstd::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {\n  const int 
utf8_length = strlen(utf8_str);\n  std::vector<char32> unicodes;\n  unicodes.reserve(utf8_length);\n  const_iterator end_it(end(utf8_str, utf8_length));\n  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {\n    if (it.is_legal()) {\n      unicodes.push_back(*it);\n    } else {\n      unicodes.clear();\n      return unicodes;\n    }\n  }\n  return unicodes;\n}\n\n// Returns an empty string if the input contains an invalid unicode.\nstd::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {\n  std::string utf8_str;\n  for (char32 ch : str32) {\n    UNICHAR uni_ch(ch);\n    int step;\n    if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {\n      utf8_str.append(uni_ch.utf8(), step);\n    } else {\n      return \"\";\n    }\n  }\n  return utf8_str;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/unicharcompress.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharcompress.cpp\n// Description: Unicode re-encoding using a sequence of smaller numbers in\n//              place of a single large code for CJK, similarly for Indic,\n//              and dissection of ligatures for other scripts.\n// Author:      Ray Smith\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"unicharcompress.h\"\n#include <algorithm>\n#include <memory>\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// String used to represent the null_id in direct_set.\nstatic const char *kNullChar = \"<nul>\";\n// Radix to make unique values from the stored radical codes.\nconst int kRadicalRadix = 29;\n\n// \"Hash\" function for const std::vector<int> computes the sum of elements.\n// Build a unique number for each code sequence that we can use as the index in\n// a hash map of ints instead of trying to hash the vectors.\nstatic int RadicalPreHash(const std::vector<int> &rs) {\n  size_t result = 0;\n  for (int radical : rs) {\n    result *= kRadicalRadix;\n    result += radical;\n  }\n  return result;\n}\n\n// A hash map to convert unicodes to radical encoding.\nusing RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;\n// A hash map to count occurrences of each radical encoding.\nusing RSCounts = 
std::unordered_map<int, int>;\n\nstatic bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) {\n  if (radical_data_line.empty() || (radical_data_line)[0] == '#') {\n    return true;\n  }\n  std::vector<std::string> entries = split(radical_data_line, ' ');\n  if (entries.size() < 2) {\n    return false;\n  }\n  char *end = nullptr;\n  int unicode = strtol(&entries[0][0], &end, 10);\n  if (*end != '\\0') {\n    return false;\n  }\n  std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);\n  for (size_t i = 1; i < entries.size(); ++i) {\n    int radical = strtol(&entries[i][0], &end, 10);\n    if (*end != '\\0') {\n      return false;\n    }\n    radicals->push_back(radical);\n  }\n  (*radical_map)[unicode] = std::move(radicals);\n  return true;\n}\n\n// Helper function builds the RSMap from the radical-stroke file, which has\n// already been read into a string. Returns false on error.\n// The radical_stroke_table is non-const because it gets split and the caller\n// is unlikely to want to use it again.\nstatic bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {\n  std::vector<std::string> lines = split(radical_data, '\\n');\n  for (unsigned i = 0; i < lines.size(); ++i) {\n    if (!DecodeRadicalLine(lines[i], radical_map)) {\n      tprintf(\"Invalid format in radical table at line %d: %s\\n\", i, lines[i].c_str());\n      return false;\n    }\n  }\n  return true;\n}\n\nUnicharCompress::UnicharCompress() : code_range_(0) {}\nUnicharCompress::UnicharCompress(const UnicharCompress &src) {\n  *this = src;\n}\nUnicharCompress::~UnicharCompress() {\n  Cleanup();\n}\nUnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) {\n  Cleanup();\n  encoder_ = src.encoder_;\n  code_range_ = src.code_range_;\n  SetupDecoder();\n  return *this;\n}\n\n// Computes the encoding for the given unicharset. 
It is a requirement that\n// the file training/langdata/radical-stroke.txt have been read into the\n// input string radical_stroke_table.\n// Returns false if the encoding cannot be constructed.\nbool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,\n                                      std::string *radical_stroke_table) {\n  RSMap radical_map;\n  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {\n    return false;\n  }\n  encoder_.clear();\n  UNICHARSET direct_set;\n  // To avoid unused codes, clear the special codes from the direct_set.\n  direct_set.clear();\n  // Always keep space as 0;\n  direct_set.unichar_insert(\" \", OldUncleanUnichars::kTrue);\n  // Null char is next if we have one.\n  if (null_id >= 0) {\n    direct_set.unichar_insert(kNullChar);\n  }\n  RSCounts radical_counts;\n  // In the initial map, codes [0, unicharset.size()) are\n  // reserved for non-han/hangul sequences of 1 or more unicodes.\n  int hangul_offset = unicharset.size();\n  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).\n  const int kTotalJamos = kLCount + kVCount + kTCount;\n  // Han takes the codes beyond hangul_offset + kTotalJamos. 
Since it is hard\n  // to measure the number of radicals and strokes, initially we use the same\n  // code range for all 3 Han code positions, and fix them after.\n  int han_offset = hangul_offset + kTotalJamos;\n  for (unsigned u = 0; u <= unicharset.size(); ++u) {\n    // We special-case allow null_id to be equal to unicharset.size() in case\n    // there is no space in unicharset for it.\n    if (u == unicharset.size() && static_cast<int>(u) != null_id) {\n      break; // Finished\n    }\n    RecodedCharID code;\n    // Convert to unicodes.\n    std::vector<char32> unicodes;\n    std::string cleaned;\n    if (u < unicharset.size()) {\n      cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));\n    }\n    if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {\n      // Check single unicodes for Hangul/Han and encode if so.\n      int unicode = unicodes[0];\n      int leading, vowel, trailing;\n      auto it = radical_map.find(unicode);\n      if (it != radical_map.end()) {\n        // This is Han. Use the radical codes directly.\n        int num_radicals = it->second->size();\n        for (int c = 0; c < num_radicals; ++c) {\n          code.Set(c, han_offset + (*it->second)[c]);\n        }\n        int pre_hash = RadicalPreHash(*it->second);\n        int num_samples = radical_counts[pre_hash]++;\n        if (num_samples > 0) {\n          code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);\n        }\n      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {\n        // This is Hangul. 
Since we know the exact size of each part at compile\n        // time, it gets the bottom set of codes.\n        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,\n                  trailing + kLCount + kVCount + hangul_offset);\n      }\n    }\n    // If the code is still empty, it wasn't Han or Hangul.\n    if (code.empty()) {\n      // Special cases.\n      if (u == UNICHAR_SPACE) {\n        code.Set(0, 0); // Space.\n      } else if (static_cast<int>(u) == null_id ||\n                 (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {\n        code.Set(0, direct_set.unichar_to_id(kNullChar));\n      } else {\n        // Add the direct_set unichar-ids of the unicodes in sequence to the\n        // code.\n        for (int uni : unicodes) {\n          int position = code.length();\n          if (position >= RecodedCharID::kMaxCodeLen) {\n            tprintf(\"Unichar %d=%s is too long to encode!!\\n\", u, unicharset.id_to_unichar(u));\n            return false;\n          }\n          UNICHAR unichar(uni);\n          char *utf8 = unichar.utf8_str();\n          if (!direct_set.contains_unichar(utf8)) {\n            direct_set.unichar_insert(utf8);\n          }\n          code.Set(position, direct_set.unichar_to_id(utf8));\n          delete[] utf8;\n          if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {\n            // Code space got bigger!\n            tprintf(\"Code space expanded from original unicharset!!\\n\");\n            return false;\n          }\n        }\n      }\n    }\n    encoder_.push_back(code);\n  }\n  // Now renumber Han to make all codes unique. We already added han_offset to\n  // all Han. 
Now separate out the radical, stroke, and count codes for Han.\n  int code_offset = 0;\n  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {\n    int max_offset = 0;\n    for (unsigned u = 0; u < unicharset.size(); ++u) {\n      RecodedCharID *code = &encoder_[u];\n      if (code->length() <= i) {\n        continue;\n      }\n      max_offset = std::max(max_offset, (*code)(i)-han_offset);\n      code->Set(i, (*code)(i) + code_offset);\n    }\n    if (max_offset == 0) {\n      break;\n    }\n    code_offset += max_offset + 1;\n  }\n  DefragmentCodeValues(null_id >= 0 ? 1 : -1);\n  SetupDecoder();\n  return true;\n}\n\n// Sets up an encoder that doesn't change the unichars at all, so it just\n// passes them through unchanged.\nvoid UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) {\n  std::vector<RecodedCharID> codes;\n  for (unsigned u = 0; u < unicharset.size(); ++u) {\n    RecodedCharID code;\n    code.Set(0, u);\n    codes.push_back(code);\n  }\n  if (!unicharset.has_special_codes()) {\n    RecodedCharID code;\n    code.Set(0, unicharset.size());\n    codes.push_back(code);\n  }\n  SetupDirect(codes);\n}\n\n// Sets up an encoder directly using the given encoding vector, which maps\n// unichar_ids to the given codes.\nvoid UnicharCompress::SetupDirect(const std::vector<RecodedCharID> &codes) {\n  encoder_ = codes;\n  ComputeCodeRange();\n  SetupDecoder();\n}\n\n// Renumbers codes to eliminate unused values.\nvoid UnicharCompress::DefragmentCodeValues(int encoded_null) {\n  // There may not be any Hangul, but even if there is, it is possible that not\n  // all codes are used. 
Likewise with the Han encoding, it is possible that not\n  // all numbers of strokes are used.\n  ComputeCodeRange();\n  std::vector<int> offsets(code_range_);\n  // Find which codes are used\n  for (auto &code : encoder_) {\n    for (int i = 0; i < code.length(); ++i) {\n      offsets[code(i)] = 1;\n    }\n  }\n  // Compute offsets based on code use.\n  int offset = 0;\n  for (unsigned i = 0; i < offsets.size(); ++i) {\n    // If not used, decrement everything above here.\n    // We are moving encoded_null to the end, so it is not \"used\".\n    if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) {\n      --offset;\n    } else {\n      offsets[i] = offset;\n    }\n  }\n  if (encoded_null >= 0) {\n    // The encoded_null is moving to the end, for the benefit of TensorFlow,\n    // which is offsets.size() + offsets.back().\n    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;\n  }\n  // Now apply the offsets.\n  for (auto &c : encoder_) {\n    RecodedCharID *code = &c;\n    for (int i = 0; i < code->length(); ++i) {\n      int value = (*code)(i);\n      code->Set(i, value + offsets[value]);\n    }\n  }\n  ComputeCodeRange();\n}\n\n// Encodes a single unichar_id. Returns the length of the code, or zero if\n// invalid input, and the encoding itself\nint UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const {\n  if (unichar_id >= encoder_.size()) {\n    return 0;\n  }\n  *code = encoder_[unichar_id];\n  return code->length();\n}\n\n// Decodes code, returning the original unichar-id, or\n// INVALID_UNICHAR_ID if the input is invalid.\nint UnicharCompress::DecodeUnichar(const RecodedCharID &code) const {\n  int len = code.length();\n  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) {\n    return INVALID_UNICHAR_ID;\n  }\n  auto it = decoder_.find(code);\n  if (it == decoder_.end()) {\n    return INVALID_UNICHAR_ID;\n  }\n  return it->second;\n}\n\n// Writes to the given file. 
Returns false in case of error.\nbool UnicharCompress::Serialize(TFile *fp) const {\n  return fp->Serialize(encoder_);\n}\n\n// Reads from the given file. Returns false in case of error.\nbool UnicharCompress::DeSerialize(TFile *fp) {\n  if (!fp->DeSerialize(encoder_)) {\n    return false;\n  }\n  ComputeCodeRange();\n  SetupDecoder();\n  return true;\n}\n\n// Returns a string containing a text file that describes the encoding thus:\n// <index>[,<index>]*<tab><UTF8-str><newline>\n// In words, a comma-separated list of one or more indices, followed by a tab\n// and the UTF-8 string that the code represents per line. Most simple scripts\n// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean\n// and the Indic scripts will contain a many-to-many mapping.\n// See the class comment above for details.\nstd::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const {\n  std::string encoding;\n  for (unsigned c = 0; c < encoder_.size(); ++c) {\n    const RecodedCharID &code = encoder_[c];\n    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {\n      // Don't show the duplicate entry.\n      continue;\n    }\n    encoding += std::to_string(code(0));\n    for (int i = 1; i < code.length(); ++i) {\n      encoding += \",\" + std::to_string(code(i));\n    }\n    encoding += \"\\t\";\n    if (c >= unicharset.size() ||\n        (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) {\n      encoding += kNullChar;\n    } else {\n      encoding += unicharset.id_to_unichar(c);\n    }\n    encoding += \"\\n\";\n  }\n  return encoding;\n}\n\n// Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.\n// Note that the returned values are 0-based indices, NOT unicode Jamo.\n// Returns false if the input is not in the Hangul unicode range.\n/* static */\nbool UnicharCompress::DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing) {\n  if (unicode < kFirstHangul) 
{\n    return false;\n  }\n  int offset = unicode - kFirstHangul;\n  if (offset >= kNumHangul) {\n    return false;\n  }\n  const int kNCount = kVCount * kTCount;\n  *leading = offset / kNCount;\n  *vowel = (offset % kNCount) / kTCount;\n  *trailing = offset % kTCount;\n  return true;\n}\n\n// Computes the value of code_range_ from the encoder_.\nvoid UnicharCompress::ComputeCodeRange() {\n  code_range_ = -1;\n  for (auto &code : encoder_) {\n    for (int i = 0; i < code.length(); ++i) {\n      if (code(i) > code_range_) {\n        code_range_ = code(i);\n      }\n    }\n  }\n  ++code_range_;\n}\n\n// Initializes the decoding hash_map from the encoding array.\nvoid UnicharCompress::SetupDecoder() {\n  Cleanup();\n  is_valid_start_.clear();\n  is_valid_start_.resize(code_range_);\n  for (unsigned c = 0; c < encoder_.size(); ++c) {\n    const RecodedCharID &code = encoder_[c];\n    decoder_[code] = c;\n    is_valid_start_[code(0)] = true;\n    RecodedCharID prefix = code;\n    int len = code.length() - 1;\n    prefix.Truncate(len);\n    auto final_it = final_codes_.find(prefix);\n    if (final_it == final_codes_.end()) {\n      auto *code_list = new std::vector<int>;\n      code_list->push_back(code(len));\n      final_codes_[prefix] = code_list;\n      while (--len >= 0) {\n        prefix.Truncate(len);\n        auto next_it = next_codes_.find(prefix);\n        if (next_it == next_codes_.end()) {\n          auto *code_list = new std::vector<int>;\n          code_list->push_back(code(len));\n          next_codes_[prefix] = code_list;\n        } else {\n          // We still have to search the list as we may get here via multiple\n          // lengths of code.\n          if (!contains(*next_it->second, code(len))) {\n            next_it->second->push_back(code(len));\n          }\n          break; // This prefix has been processed.\n        }\n      }\n    } else {\n      if (!contains(*final_it->second, code(len))) {\n        final_it->second->push_back(code(len));\n 
     }\n    }\n  }\n}\n\n// Frees allocated memory.\nvoid UnicharCompress::Cleanup() {\n  decoder_.clear();\n  is_valid_start_.clear();\n  for (auto &next_code : next_codes_) {\n    delete next_code.second;\n  }\n  for (auto &final_code : final_codes_) {\n    delete final_code.second;\n  }\n  next_codes_.clear();\n  final_codes_.clear();\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/ccutil/unicharcompress.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharcompress.h\n// Description: Unicode re-encoding using a sequence of smaller numbers in\n//              place of a single large code for CJK, similarly for Indic,\n//              and dissection of ligatures for other scripts.\n// Author:      Ray Smith\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_\n#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_\n\n#include <unordered_map>\n#include <vector>\n#include \"serialis.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\n// Trivial class to hold the code for a recoded unichar-id.\nclass RecodedCharID {\npublic:\n  // The maximum length of a code.\n  static const int kMaxCodeLen = 9;\n\n  RecodedCharID() : self_normalized_(1), length_(0) {\n    memset(code_, 0, sizeof(code_));\n  }\n  void Truncate(int length) {\n    length_ = length;\n  }\n  // Sets the code value at the given index in the code.\n  void Set(int index, int value) {\n    code_[index] = value;\n    if (length_ <= index) {\n      length_ = index + 1;\n    }\n  }\n  // Shorthand for setting codes of length 3, as all Hangul and Han codes are\n  // length 3.\n  void Set3(int code0, int code1, int code2) {\n    length_ = 3;\n    code_[0] = code0;\n    code_[1] = code1;\n    code_[2] = 
code2;\n  }\n  bool empty() const {\n    return length_ == 0;\n  }\n  // Accessors\n  int length() const {\n    return length_;\n  }\n  int operator()(int index) const {\n    return code_[index];\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const {\n    return fp->Serialize(&self_normalized_) && fp->Serialize(&length_) &&\n           fp->Serialize(&code_[0], length_);\n  }\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp) {\n    return fp->DeSerialize(&self_normalized_) && fp->DeSerialize(&length_) &&\n           fp->DeSerialize(&code_[0], length_);\n  }\n  bool operator==(const RecodedCharID &other) const {\n    if (length_ != other.length_) {\n      return false;\n    }\n    for (int i = 0; i < length_; ++i) {\n      if (code_[i] != other.code_[i]) {\n        return false;\n      }\n    }\n    return true;\n  }\n  // Hash functor for RecodedCharID.\n  struct RecodedCharIDHash {\n    uint64_t operator()(const RecodedCharID &code) const {\n      uint64_t result = 0;\n      for (int i = 0; i < code.length_; ++i) {\n        result ^= static_cast<uint64_t>(code(i)) << (7 * i);\n      }\n      return result;\n    }\n  };\n\nprivate:\n  // True if this code is self-normalizing, ie is the master entry for indices\n  // that map to the same code. 
Has boolean value, but int8_t for serialization.\n  int8_t self_normalized_;\n  // The number of elements in use in code_;\n  int32_t length_;\n  // The re-encoded form of the unichar-id to which this RecodedCharID relates.\n  int32_t code_[kMaxCodeLen];\n};\n\n// Class holds a \"compression\" of a unicharset to simplify the learning problem\n// for a neural-network-based classifier.\n// Objectives:\n// 1 (CJK): Ids of a unicharset with a large number of classes are expressed as\n//          a sequence of 3 codes with much fewer values.\n//          This is achieved using the Jamo coding for Hangul and the Unicode\n//          Radical-Stroke-index for Han.\n// 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code\n//            as the unicode sequence (but coded in a more compact space).\n// 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing\n//               and not significantly distinct shapes (quotes) together, ie\n//               represent the fi ligature as the f-i pair, and fold u+2019 and\n//               friends all onto ascii single '\n// 4 The null character and mapping to target activations:\n//    To save horizontal coding space, the compressed codes are generally mapped\n//    to target network activations without intervening null characters, BUT\n//    in the case of ligatures, such as ff, null characters have to be included\n//    so existence of repeated codes is detected at codebook-building time, and\n//    null characters are embedded directly into the codes, so the rest of the\n//    system doesn't need to worry about the problem (much). There is still an\n//    effect on the range of ways in which the target activations can be\n//    generated.\n//\n// The computed code values are compact (no unused values), and, for CJK,\n// unique (each code position uses a disjoint set of values from each other code\n// position). 
For non-CJK, the same code value CAN be used in multiple\n// positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>\n// is the same code as is used for the single f.\nclass TESS_API UnicharCompress {\npublic:\n  UnicharCompress();\n  UnicharCompress(const UnicharCompress &src);\n  ~UnicharCompress();\n  UnicharCompress &operator=(const UnicharCompress &src);\n\n  // The 1st Hangul unicode.\n  static const int kFirstHangul = 0xac00;\n  // The number of Hangul unicodes.\n  static const int kNumHangul = 11172;\n  // The number of Jamos for each of the 3 parts of a Hangul character, being\n  // the Leading consonant, Vowel and Trailing consonant.\n  static const int kLCount = 19;\n  static const int kVCount = 21;\n  static const int kTCount = 28;\n\n  // Computes the encoding for the given unicharset. It is a requirement that\n  // the file training/langdata/radical-stroke.txt have been read into the\n  // input string radical_stroke_table.\n  // Returns false if the encoding cannot be constructed.\n  bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table);\n  // Sets up an encoder that doesn't change the unichars at all, so it just\n  // passes them through unchanged.\n  void SetupPassThrough(const UNICHARSET &unicharset);\n  // Sets up an encoder directly using the given encoding vector, which maps\n  // unichar_ids to the given codes.\n  void SetupDirect(const std::vector<RecodedCharID> &codes);\n\n  // Returns the number of different values that can be used in a code, ie\n  // 1 + the maximum value that will ever be used by an RecodedCharID code in\n  // any position in its array.\n  int code_range() const {\n    return code_range_;\n  }\n\n  // Encodes a single unichar_id. 
Returns the length of the code, (or zero if\n  // invalid input), and the encoding itself in code.\n  int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const;\n  // Decodes code, returning the original unichar-id, or\n  // INVALID_UNICHAR_ID if the input is invalid.\n  int DecodeUnichar(const RecodedCharID &code) const;\n  // Returns true if the given code is a valid start or single code.\n  bool IsValidFirstCode(int code) const {\n    return is_valid_start_[code];\n  }\n  // Returns a list of valid non-final next codes for a given prefix code,\n  // which may be empty.\n  const std::vector<int> *GetNextCodes(const RecodedCharID &code) const {\n    auto it = next_codes_.find(code);\n    return it == next_codes_.end() ? nullptr : it->second;\n  }\n  // Returns a list of valid final codes for a given prefix code, which may\n  // be empty.\n  const std::vector<int> *GetFinalCodes(const RecodedCharID &code) const {\n    auto it = final_codes_.find(code);\n    return it == final_codes_.end() ? nullptr : it->second;\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const;\n  // Reads from the given file. Returns false in case of error.\n\n  bool DeSerialize(TFile *fp);\n\n  // Returns a string containing a text file that describes the encoding thus:\n  // <index>[,<index>]*<tab><UTF8-str><newline>\n  // In words, a comma-separated list of one or more indices, followed by a tab\n  // and the UTF-8 string that the code represents per line. 
Most simple scripts\n  // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean\n  // and the Indic scripts will contain a many-to-many mapping.\n  // See the class comment above for details.\n  std::string GetEncodingAsString(const UNICHARSET &unicharset) const;\n\n  // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.\n  // Note that the returned values are 0-based indices, NOT unicode Jamo.\n  // Returns false if the input is not in the Hangul unicode range.\n  static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing);\n\nprivate:\n  // Renumbers codes to eliminate unused values.\n  void DefragmentCodeValues(int encoded_null);\n  // Computes the value of code_range_ from the encoder_.\n  void ComputeCodeRange();\n  // Initializes the decoding hash_map from the encoder_ array.\n  void SetupDecoder();\n  // Frees allocated memory.\n  void Cleanup();\n\n  // The encoder that maps a unichar-id to a sequence of small codes.\n  // encoder_ is the only part that is serialized. The rest is computed on load.\n  std::vector<RecodedCharID> encoder_;\n  // Decoder converts the output of encoder back to a unichar-id.\n  std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash> decoder_;\n  // True if the index is a valid single or start code.\n  std::vector<bool> is_valid_start_;\n  // Maps a prefix code to a list of valid next codes.\n  // The map owns the vectors.\n  std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>\n      next_codes_;\n  // Maps a prefix code to a list of valid final codes.\n  // The map owns the vectors.\n  std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>\n      final_codes_;\n  // Max of any value in encoder_ + 1.\n  int code_range_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_\n"
  },
  {
    "path": "src/ccutil/unicharmap.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharmap.cpp\n// Description: Unicode character/ligature to integer id class.\n// Author:      Thomas Kielbus\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"unicharmap.h\"\n\n#include <tesseract/unichar.h>\n\n#include <cassert>\n\nnamespace tesseract {\n\nUNICHARMAP::UNICHARMAP() : nodes(nullptr) {}\n\nUNICHARMAP::~UNICHARMAP() {\n  delete[] nodes;\n}\n\n// Search the given unichar representation in the tree, using length characters\n// from it maximum. 
Each character in the string is interpreted as an index in\n// an array of nodes.\nUNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const {\n  UNICHARMAP_NODE *current_nodes = nodes;\n\n  assert(*unichar_repr != '\\0');\n  assert(length > 0 && length <= UNICHAR_LEN);\n\n  int index = 0;\n  if (length <= 0 || unichar_repr[index] == '\\0') {\n    return INVALID_UNICHAR_ID;\n  }\n  do {\n    if (index + 1 >= length || unichar_repr[index + 1] == '\\0') {\n      return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;\n    }\n    current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;\n    ++index;\n  } while (true);\n}\n\n// Search the given unichar representation in the tree, creating the possibly\n// missing nodes. Once the right place has been found, insert the given id and\n// update the inserted flag to keep track of the insert. Each character in the\n// string is interpreted as an index in an array of nodes.\nvoid UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) {\n  const char *current_char = unichar_repr;\n  if (*current_char == '\\0') {\n    return;\n  }\n  UNICHARMAP_NODE **current_nodes_pointer = &nodes;\n  do {\n    if (*current_nodes_pointer == nullptr) {\n      *current_nodes_pointer = new UNICHARMAP_NODE[256];\n    }\n    if (current_char[1] == '\\0') {\n      (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id;\n      return;\n    }\n    current_nodes_pointer =\n        &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children);\n    ++current_char;\n  } while (true);\n}\n\n// Search the given unichar representation in the tree, using length characters\n// from it maximum. Each character in the string is interpreted as an index in\n// an array of nodes. 
Stop once the tree does not have anymore nodes or once we\n// found the right unichar_repr.\nbool UNICHARMAP::contains(const char *const unichar_repr, int length) const {\n  if (unichar_repr == nullptr || *unichar_repr == '\\0') {\n    return false;\n  }\n  if (length <= 0 || length > UNICHAR_LEN) {\n    return false;\n  }\n  int index = 0;\n  if (unichar_repr[index] == '\\0') {\n    return false;\n  }\n  UNICHARMAP_NODE *current_nodes = nodes;\n\n  while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\\0') {\n    current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;\n    ++index;\n  }\n  return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\\0') &&\n         current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;\n}\n\n// Return the minimum number of characters that must be used from this string\n// to obtain a match in the UNICHARMAP.\nint UNICHARMAP::minmatch(const char *const unichar_repr) const {\n  const char *current_char = unichar_repr;\n  if (*current_char == '\\0') {\n    return 0;\n  }\n  UNICHARMAP_NODE *current_nodes = nodes;\n\n  while (current_nodes != nullptr && *current_char != '\\0') {\n    if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) {\n      return current_char + 1 - unichar_repr;\n    }\n    current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children;\n    ++current_char;\n  }\n  return 0;\n}\n\nvoid UNICHARMAP::clear() {\n  delete[] nodes;\n  nodes = nullptr;\n}\n\nUNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {}\n\n// Recursively delete the children\nUNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {\n  delete[] children;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/unicharmap.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharmap.h\n// Description: Unicode character/ligature to integer id class.\n// Author:      Thomas Kielbus\n// Created:     Wed Jun 28 17:05:01 PDT 2006\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_UNICHARMAP_H_\n#define TESSERACT_CCUTIL_UNICHARMAP_H_\n\n#include <tesseract/unichar.h>\n\nnamespace tesseract {\n\n// A UNICHARMAP stores unique unichars. Each of them is associated with one\n// UNICHAR_ID.\nclass TESS_API UNICHARMAP {\npublic:\n  // Create an empty UNICHARMAP\n  UNICHARMAP();\n\n  ~UNICHARMAP();\n\n  // Insert the given unichar representation in the UNICHARMAP and associate it\n  // with the given id. The length of the representation MUST be non-zero.\n  void insert(const char *const unichar_repr, UNICHAR_ID id);\n\n  // Return the id associated with the given unichar representation,\n  // this representation MUST exist within the UNICHARMAP. The first\n  // length characters (maximum) from unichar_repr are used. The length\n  // MUST be non-zero.\n  UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;\n\n  // Return true if the given unichar representation is already present in the\n  // UNICHARMAP. The first length characters (maximum) from unichar_repr are\n  // used. 
The length MUST be non-zero.\n  bool contains(const char *const unichar_repr, int length) const;\n\n  // Return the minimum number of characters that must be used from this string\n  // to obtain a match in the UNICHARMAP.\n  int minmatch(const char *const unichar_repr) const;\n\n  // Clear the UNICHARMAP. All previous data is lost.\n  void clear();\n\nprivate:\n  // The UNICHARMAP is represented as a tree whose nodes are of type\n  // UNICHARMAP_NODE.\n  struct UNICHARMAP_NODE {\n    UNICHARMAP_NODE();\n    ~UNICHARMAP_NODE();\n\n    UNICHARMAP_NODE *children;\n    UNICHAR_ID id;\n  };\n\n  UNICHARMAP_NODE *nodes;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_UNICHARMAP_H_\n"
  },
  {
    "path": "src/ccutil/unicharset.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharset.cpp\n// Description: Unicode character/ligature set class.\n// Author:      Thomas Kielbus\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"unicharset.h\"\n\n#include \"params.h\"\n\n#include <tesseract/unichar.h>\n#include \"serialis.h\"\n\n#include <algorithm>\n#include <cassert>\n#include <cstdio>\n#include <cstring>\n#include <iomanip> // for std::setw\n#include <locale>  // for std::locale::classic\n#include <sstream> // for std::istringstream, std::ostringstream\n\nnamespace tesseract {\n\n// Special character used in representing character fragments.\nstatic const char kSeparator = '|';\n// Special character used in representing 'natural' character fragments.\nstatic const char kNaturalFlag = 'n';\n\nstatic const int ISALPHA_MASK = 0x1;\nstatic const int ISLOWER_MASK = 0x2;\nstatic const int ISUPPER_MASK = 0x4;\nstatic const int ISDIGIT_MASK = 0x8;\nstatic const int ISPUNCTUATION_MASK = 0x10;\n\n// Y coordinate threshold for determining cap-height vs x-height.\n// TODO(rays) Bring the global definition down to the ccutil library level,\n// so this constant is relative to some other constants.\nstatic const int kMeanlineThreshold = 220;\n// Let C be the number of alpha chars for which all tops exceed\n// 
kMeanlineThreshold, and X the number of alpha chars for which all\n// tops are below kMeanlineThreshold, then if X > C *\n// kMinXHeightFraction and C > X * kMinCapHeightFraction or more than\n// half the alpha characters have upper or lower case, then the\n// unicharset \"has x-height\".\nconst double kMinXHeightFraction = 0.25;\nconst double kMinCapHeightFraction = 0.05;\n\n/*static */\nconst char *UNICHARSET::kCustomLigatures[][2] = {\n    {\"ct\", \"\\uE003\"}, // c + t -> U+E003\n    {\"ſh\", \"\\uE006\"}, // long-s + h -> U+E006\n    {\"ſi\", \"\\uE007\"}, // long-s + i -> U+E007\n    {\"ſl\", \"\\uE008\"}, // long-s + l -> U+E008\n    {\"ſſ\", \"\\uE009\"}, // long-s + long-s -> U+E009\n    {nullptr, nullptr}};\n\n// List of mappings to make when ingesting strings from the outside.\n// The substitutions clean up text that should exist for rendering of\n// synthetic data, but not in the recognition set.\nconst char *UNICHARSET::kCleanupMaps[][2] = {\n    {\"\\u0640\", \"\"},   // TATWEEL is deleted.\n    {\"\\ufb01\", \"fi\"}, // fi ligature->fi pair.\n    {\"\\ufb02\", \"fl\"}, // fl ligature->fl pair.\n    {nullptr, nullptr}};\n\n// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.\nconst char *UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {\n    \" \", \"Joined\", \"|Broken|0|1\"};\n\nconst char *UNICHARSET::null_script = \"NULL\";\n\nUNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {\n  Init();\n}\n\n// Initialize all properties to sensible default values.\nvoid UNICHARSET::UNICHAR_PROPERTIES::Init() {\n  isalpha = false;\n  islower = false;\n  isupper = false;\n  isdigit = false;\n  ispunctuation = false;\n  isngram = false;\n  enabled = false;\n  SetRangesOpen();\n  script_id = 0;\n  other_case = 0;\n  mirror = 0;\n  normed = \"\";\n  direction = UNICHARSET::U_LEFT_TO_RIGHT;\n  fragment = nullptr;\n}\n\n// Sets all ranges wide open. 
Initialization default in case there are\n// no useful values available.\nvoid UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {\n  min_bottom = 0;\n  max_bottom = UINT8_MAX;\n  min_top = 0;\n  max_top = UINT8_MAX;\n  width = 0.0f;\n  width_sd = 0.0f;\n  bearing = 0.0f;\n  bearing_sd = 0.0f;\n  advance = 0.0f;\n  advance_sd = 0.0f;\n}\n\n// Sets all ranges to empty. Used before expanding with font-based data.\nvoid UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {\n  min_bottom = UINT8_MAX;\n  max_bottom = 0;\n  min_top = UINT8_MAX;\n  max_top = 0;\n  width = 0.0f;\n  width_sd = 0.0f;\n  bearing = 0.0f;\n  bearing_sd = 0.0f;\n  advance = 0.0f;\n  advance_sd = 0.0f;\n}\n\n// Returns true if any of the top/bottom/width/bearing/advance ranges/stats\n// is empty.\nbool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {\n  return width == 0.0f || advance == 0.0f;\n}\n\n// Expands the ranges with the ranges from the src properties.\nvoid UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(\n    const UNICHAR_PROPERTIES &src) {\n  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);\n  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);\n  UpdateRange(src.min_top, &min_top, &max_top);\n  UpdateRange(src.max_top, &min_top, &max_top);\n  if (src.width_sd > width_sd) {\n    width = src.width;\n    width_sd = src.width_sd;\n  }\n  if (src.bearing_sd > bearing_sd) {\n    bearing = src.bearing;\n    bearing_sd = src.bearing_sd;\n  }\n  if (src.advance_sd > advance_sd) {\n    advance = src.advance;\n    advance_sd = src.advance_sd;\n  }\n}\n\n// Copies the properties from src into this.\nvoid UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {\n  // Apart from the fragment, everything else can be done with a default copy.\n  CHAR_FRAGMENT *saved_fragment = fragment;\n  *this = src; // Bitwise copy.\n  fragment = saved_fragment;\n}\n\nUNICHARSET::UNICHARSET()\n    : ids(), script_table(nullptr), script_table_size_used(0) {\n  clear();\n  for (int i = 
0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {\n    unichar_insert(kSpecialUnicharCodes[i]);\n    if (i == UNICHAR_JOINED) {\n      set_isngram(i, true);\n    }\n  }\n}\n\nUNICHARSET::~UNICHARSET() {\n  clear();\n}\n\nUNICHAR_ID\nUNICHARSET::unichar_to_id(const char *const unichar_repr) const {\n  std::string cleaned =\n      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);\n  return ids.contains(cleaned.data(), cleaned.size())\n             ? ids.unichar_to_id(cleaned.data(), cleaned.size())\n             : INVALID_UNICHAR_ID;\n}\n\nUNICHAR_ID UNICHARSET::unichar_to_id(const char *const unichar_repr,\n                                     int length) const {\n  assert(length > 0 && length <= UNICHAR_LEN);\n  std::string cleaned(unichar_repr, length);\n  if (!old_style_included_) {\n    cleaned = CleanupString(unichar_repr, length);\n  }\n  return ids.contains(cleaned.data(), cleaned.size())\n             ? ids.unichar_to_id(cleaned.data(), cleaned.size())\n             : INVALID_UNICHAR_ID;\n}\n\n// Return the minimum number of bytes that matches a legal UNICHAR_ID,\n// while leaving the rest of the string encodable. 
Returns 0 if the\n// beginning of the string is not encodable.\n// WARNING: this function now encodes the whole string for precision.\n// Use encode_string in preference to repeatedly calling step.\nint UNICHARSET::step(const char *str) const {\n  std::vector<UNICHAR_ID> encoding;\n  std::vector<char> lengths;\n  encode_string(str, true, &encoding, &lengths, nullptr);\n  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) {\n    return 0;\n  }\n  return lengths[0];\n}\n\n// Return whether the given UTF-8 string is encodable with this UNICHARSET.\n// If not encodable, write the first byte offset which cannot be converted\n// into the second (return) argument.\nbool UNICHARSET::encodable_string(const char *str,\n                                  unsigned *first_bad_position) const {\n  std::vector<UNICHAR_ID> encoding;\n  return encode_string(str, true, &encoding, nullptr, first_bad_position);\n}\n\n// Encodes the given UTF-8 string with this UNICHARSET.\n// Returns true if the encoding succeeds completely, false if there is at\n// least one INVALID_UNICHAR_ID in the returned encoding, but in this case\n// the rest of the string is still encoded.\n// If lengths is not nullptr, then it is filled with the corresponding\n// byte length of each encoded UNICHAR_ID.\n// WARNING: Caller must guarantee that str has already been cleaned of codes\n// that do not belong in the unicharset, or encoding may fail.\n// Use CleanupString to perform the cleaning.\nbool UNICHARSET::encode_string(const char *str, bool give_up_on_failure,\n                               std::vector<UNICHAR_ID> *encoding,\n                               std::vector<char> *lengths,\n                               unsigned *encoded_length) const {\n  std::vector<UNICHAR_ID> working_encoding;\n  std::vector<char> working_lengths;\n  std::vector<char> best_lengths;\n  encoding->clear(); // Just in case str is empty.\n  auto str_length = strlen(str);\n  unsigned str_pos = 0;\n  bool perfect = true;\n  
while (str_pos < str_length) {\n    encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,\n                  &str_pos, encoding, &best_lengths);\n    if (str_pos < str_length) {\n      // This is a non-match. Skip one utf-8 character.\n      perfect = false;\n      if (give_up_on_failure) {\n        break;\n      }\n      int step = UNICHAR::utf8_step(str + str_pos);\n      if (step == 0) {\n        step = 1;\n      }\n      encoding->push_back(INVALID_UNICHAR_ID);\n      best_lengths.push_back(step);\n      str_pos += step;\n      working_encoding = *encoding;\n      working_lengths = best_lengths;\n    }\n  }\n  if (lengths != nullptr) {\n    *lengths = std::move(best_lengths);\n  }\n  if (encoded_length != nullptr) {\n    *encoded_length = str_pos;\n  }\n  return perfect;\n}\n\nconst char *UNICHARSET::id_to_unichar(UNICHAR_ID id) const {\n  if (id == INVALID_UNICHAR_ID) {\n    return INVALID_UNICHAR;\n  }\n  ASSERT_HOST(static_cast<unsigned>(id) < this->size());\n  return unichars[id].representation;\n}\n\nconst char *UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {\n  if (id == INVALID_UNICHAR_ID) {\n    return INVALID_UNICHAR;\n  }\n  ASSERT_HOST(static_cast<unsigned>(id) < this->size());\n  // Resolve from the kCustomLigatures table if this is a private encoding.\n  if (get_isprivate(id)) {\n    const char *ch = id_to_unichar(id);\n    for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {\n      if (!strcmp(ch, kCustomLigatures[i][1])) {\n        return kCustomLigatures[i][0];\n      }\n    }\n  }\n  // Otherwise return the stored representation.\n  return unichars[id].representation;\n}\n\n// Return a string that reformats the utf8 str into the str followed\n// by its hex unicodes.\nstd::string UNICHARSET::debug_utf8_str(const char *str) {\n  std::string result = str;\n  result += \" [\";\n  int step = 1;\n  // Chop into unicodes and code each as hex.\n  for (int i = 0; str[i] != '\\0'; i += step) {\n    char hex[sizeof(int) * 
2 + 1];\n    step = UNICHAR::utf8_step(str + i);\n    if (step == 0) {\n      step = 1;\n      snprintf(hex, sizeof(hex), \"%x\", str[i]);\n    } else {\n      UNICHAR ch(str + i, step);\n      snprintf(hex, sizeof(hex), \"%x\", ch.first_uni());\n    }\n    result += hex;\n    result += \" \";\n  }\n  result += \"]\";\n  return result;\n}\n\n// Return a string containing debug information on the unichar, including\n// the id_to_unichar, its hex unicodes and the properties.\nstd::string UNICHARSET::debug_str(UNICHAR_ID id) const {\n  if (id == INVALID_UNICHAR_ID) {\n    return std::string(id_to_unichar(id));\n  }\n  const CHAR_FRAGMENT *fragment = this->get_fragment(id);\n  if (fragment) {\n    return fragment->to_string();\n  }\n  const char *str = id_to_unichar(id);\n  std::string result = debug_utf8_str(str);\n  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.\n  if (get_isalpha(id)) {\n    if (get_islower(id)) {\n      result += \"a\";\n    } else if (get_isupper(id)) {\n      result += \"A\";\n    } else {\n      result += \"x\";\n    }\n  }\n  // Append 0 if a digit.\n  if (get_isdigit(id)) {\n    result += \"0\";\n  }\n  // Append p is a punctuation symbol.\n  if (get_ispunctuation(id)) {\n    result += \"p\";\n  }\n  return result;\n}\n\n// Sets the normed_ids vector from the normed string. 
normed_ids is not\n// stored in the file, and needs to be set when the UNICHARSET is loaded.\nvoid UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {\n  unichars[unichar_id].properties.normed_ids.clear();\n  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {\n    unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);\n  } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),\n                            true, &unichars[unichar_id].properties.normed_ids,\n                            nullptr, nullptr)) {\n    unichars[unichar_id].properties.normed_ids.clear();\n    unichars[unichar_id].properties.normed_ids.push_back(unichar_id);\n  }\n}\n\n// Returns whether the unichar id represents a unicode value in the private use\n// area. We use this range only internally to represent uncommon ligatures\n// (eg. 'ct') that do not have regular unicode values.\nbool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {\n  UNICHAR uc(id_to_unichar(unichar_id), -1);\n  int uni = uc.first_uni();\n  return (uni >= 0xE000 && uni <= 0xF8FF);\n}\n\n// Sets all ranges to empty, so they can be expanded to set the values.\nvoid UNICHARSET::set_ranges_empty() {\n  for (auto &uc : unichars) {\n    uc.properties.SetRangesEmpty();\n  }\n}\n\n// Sets all the properties for this unicharset given a src unicharset with\n// everything set. 
The unicharsets don't have to be the same, and graphemes\n// are correctly accounted for.\nvoid UNICHARSET::PartialSetPropertiesFromOther(int start_index,\n                                               const UNICHARSET &src) {\n  for (unsigned ch = start_index; ch < unichars.size(); ++ch) {\n    const char *utf8 = id_to_unichar(ch);\n    UNICHAR_PROPERTIES properties;\n    if (src.GetStrProperties(utf8, &properties)) {\n      // Setup the script_id, other_case, and mirror properly.\n      const char *script = src.get_script_from_script_id(properties.script_id);\n      properties.script_id = add_script(script);\n      const char *other_case = src.id_to_unichar(properties.other_case);\n      if (contains_unichar(other_case)) {\n        properties.other_case = unichar_to_id(other_case);\n      } else {\n        properties.other_case = ch;\n      }\n      const char *mirror_str = src.id_to_unichar(properties.mirror);\n      if (contains_unichar(mirror_str)) {\n        properties.mirror = unichar_to_id(mirror_str);\n      } else {\n        properties.mirror = ch;\n      }\n      unichars[ch].properties.CopyFrom(properties);\n      set_normed_ids(ch);\n    }\n  }\n}\n\n// Expands the tops and bottoms and widths for this unicharset given a\n// src unicharset with ranges in it. The unicharsets don't have to be the\n// same, and graphemes are correctly accounted for.\nvoid UNICHARSET::ExpandRangesFromOther(const UNICHARSET &src) {\n  for (unsigned ch = 0; ch < unichars.size(); ++ch) {\n    const char *utf8 = id_to_unichar(ch);\n    UNICHAR_PROPERTIES properties;\n    if (src.GetStrProperties(utf8, &properties)) {\n      // Expand just the ranges from properties.\n      unichars[ch].properties.ExpandRangesFrom(properties);\n    }\n  }\n}\n\n// Makes this a copy of src. Clears this completely first, so the automatic\n// ids will not be present in this if not in src. 
Does NOT reorder the set!\nvoid UNICHARSET::CopyFrom(const UNICHARSET &src) {\n  clear();\n  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {\n    const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;\n    const char *utf8 = src.id_to_unichar(ch);\n    unichar_insert_backwards_compatible(utf8);\n    unichars[ch].properties.ExpandRangesFrom(src_props);\n  }\n  // Set properties, including mirror and other_case, WITHOUT reordering\n  // the unicharset.\n  PartialSetPropertiesFromOther(0, src);\n}\n\n// For each id in src, if it does not occur in this, add it, as in\n// SetPropertiesFromOther, otherwise expand the ranges, as in\n// ExpandRangesFromOther.\nvoid UNICHARSET::AppendOtherUnicharset(const UNICHARSET &src) {\n  int initial_used = unichars.size();\n  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {\n    const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;\n    const char *utf8 = src.id_to_unichar(ch);\n    int id = unichars.size();\n    if (contains_unichar(utf8)) {\n      id = unichar_to_id(utf8);\n      // Just expand current ranges.\n      unichars[id].properties.ExpandRangesFrom(src_props);\n    } else {\n      unichar_insert_backwards_compatible(utf8);\n      unichars[id].properties.SetRangesEmpty();\n    }\n  }\n  // Set properties, including mirror and other_case, WITHOUT reordering\n  // the unicharset.\n  PartialSetPropertiesFromOther(initial_used, src);\n}\n\n// Returns true if the acceptable ranges of the tops of the characters do\n// not overlap, making their x-height calculations distinct.\nbool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {\n  int overlap = std::min(unichars[id1].properties.max_top,\n                         unichars[id2].properties.max_top) -\n                std::max(unichars[id1].properties.min_top,\n                         unichars[id2].properties.min_top);\n  return overlap <= 0;\n}\n\n// Internal recursive version of encode_string above.\n// Seeks to encode the 
given string as a sequence of UNICHAR_IDs such that\n// each UNICHAR_ID uses the least possible part of the utf8 str.\n// It does this by depth-first tail recursion on increasing length matches\n// to the UNICHARSET, saving the first encountered result that encodes the\n// maximum total length of str. It stops on a failure to encode to make\n// the overall process of encoding a partially failed string more efficient.\n// See unicharset.h for definition of the args.\nvoid UNICHARSET::encode_string(const char *str, int str_index, int str_length,\n                               std::vector<UNICHAR_ID> *encoding,\n                               std::vector<char> *lengths,\n                               unsigned *best_total_length,\n                               std::vector<UNICHAR_ID> *best_encoding,\n                               std::vector<char> *best_lengths) const {\n  if (str_index > static_cast<int>(*best_total_length)) {\n    // This is the best result so far.\n    *best_total_length = str_index;\n    *best_encoding = *encoding;\n    if (best_lengths != nullptr) {\n      *best_lengths = *lengths;\n    }\n  }\n  if (str_index == str_length) {\n    return;\n  }\n  int encoding_index = encoding->size();\n  // Find the length of the first matching unicharset member.\n  int length = ids.minmatch(str + str_index);\n  if (length == 0 || str_index + length > str_length) {\n    return;\n  }\n  do {\n    if (ids.contains(str + str_index, length)) {\n      // Successful encoding so far.\n      UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);\n      encoding->push_back(id);\n      lengths->push_back(length);\n      encode_string(str, str_index + length, str_length, encoding, lengths,\n                    best_total_length, best_encoding, best_lengths);\n      if (static_cast<int>(*best_total_length) == str_length) {\n        return; // Tail recursion success!\n      }\n      // Failed with that length, truncate back and try again.\n      
encoding->resize(encoding_index);\n      lengths->resize(encoding_index);\n    }\n    int step = UNICHAR::utf8_step(str + str_index + length);\n    if (step == 0) {\n      step = 1;\n    }\n    length += step;\n  } while (length <= UNICHAR_LEN && str_index + length <= str_length);\n}\n\n// Gets the properties for a grapheme string, combining properties for\n// multiple characters in a meaningful way where possible.\n// Returns false if no valid match was found in the unicharset.\n// NOTE that script_id, mirror, and other_case refer to this unicharset on\n// return and will need translation if the target unicharset is different.\nbool UNICHARSET::GetStrProperties(const char *utf8_str,\n                                  UNICHAR_PROPERTIES *props) const {\n  props->Init();\n  props->SetRangesEmpty();\n  int total_unicodes = 0;\n  std::vector<UNICHAR_ID> encoding;\n  if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr)) {\n    return false; // Some part was invalid.\n  }\n  for (auto it : encoding) {\n    int id = it;\n    const UNICHAR_PROPERTIES &src_props = unichars[id].properties;\n    // Logical OR all the bools.\n    if (src_props.isalpha) {\n      props->isalpha = true;\n    }\n    if (src_props.islower) {\n      props->islower = true;\n    }\n    if (src_props.isupper) {\n      props->isupper = true;\n    }\n    if (src_props.isdigit) {\n      props->isdigit = true;\n    }\n    if (src_props.ispunctuation) {\n      props->ispunctuation = true;\n    }\n    if (src_props.isngram) {\n      props->isngram = true;\n    }\n    if (src_props.enabled) {\n      props->enabled = true;\n    }\n    // Min/max the tops/bottoms.\n    UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);\n    UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);\n    UpdateRange(src_props.min_top, &props->min_top, &props->max_top);\n    UpdateRange(src_props.max_top, &props->min_top, &props->max_top);\n    float bearing = props->advance + 
src_props.bearing;\n    if (total_unicodes == 0 || bearing < props->bearing) {\n      props->bearing = bearing;\n      props->bearing_sd = props->advance_sd + src_props.bearing_sd;\n    }\n    props->advance += src_props.advance;\n    props->advance_sd += src_props.advance_sd;\n    // With a single width, just use the widths stored in the unicharset.\n    props->width = src_props.width;\n    props->width_sd = src_props.width_sd;\n    // Use the first script id, other_case, mirror, direction.\n    // Note that these will need translation, except direction.\n    if (total_unicodes == 0) {\n      props->script_id = src_props.script_id;\n      props->other_case = src_props.other_case;\n      props->mirror = src_props.mirror;\n      props->direction = src_props.direction;\n    }\n    // The normed string for the compound character is the concatenation of\n    // the normed versions of the individual characters.\n    props->normed += src_props.normed;\n    ++total_unicodes;\n  }\n  if (total_unicodes > 1) {\n    // Estimate the total widths from the advance - bearing.\n    props->width = props->advance - props->bearing;\n    props->width_sd = props->advance_sd + props->bearing_sd;\n  }\n  return total_unicodes > 0;\n}\n\n// TODO(rays) clean-up the order of functions to match unicharset.h.\n\nunsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {\n  unsigned int properties = 0;\n  if (this->get_isalpha(id)) {\n    properties |= ISALPHA_MASK;\n  }\n  if (this->get_islower(id)) {\n    properties |= ISLOWER_MASK;\n  }\n  if (this->get_isupper(id)) {\n    properties |= ISUPPER_MASK;\n  }\n  if (this->get_isdigit(id)) {\n    properties |= ISDIGIT_MASK;\n  }\n  if (this->get_ispunctuation(id)) {\n    properties |= ISPUNCTUATION_MASK;\n  }\n  return properties;\n}\n\nchar UNICHARSET::get_chartype(UNICHAR_ID id) const {\n  if (this->get_isupper(id)) {\n    return 'A';\n  }\n  if (this->get_islower(id)) {\n    return 'a';\n  }\n  if (this->get_isalpha(id)) {\n    return 
'x';\n  }\n  if (this->get_isdigit(id)) {\n    return '0';\n  }\n  if (this->get_ispunctuation(id)) {\n    return 'p';\n  }\n  return 0;\n}\n\nvoid UNICHARSET::unichar_insert(const char *const unichar_repr,\n                                OldUncleanUnichars old_style) {\n  if (old_style == OldUncleanUnichars::kTrue) {\n    old_style_included_ = true;\n  }\n  std::string cleaned =\n      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);\n  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {\n    const char *str = cleaned.c_str();\n    std::vector<int> encoding;\n    if (!old_style_included_ &&\n        encode_string(str, true, &encoding, nullptr, nullptr)) {\n      return;\n    }\n    unichars.emplace_back();\n    auto &u = unichars.back();\n    int index = 0;\n    do {\n      if (index >= UNICHAR_LEN) {\n        fprintf(stderr, \"Utf8 buffer too big, size>%d for %s\\n\", UNICHAR_LEN,\n                unichar_repr);\n        return;\n      }\n      u.representation[index++] = *str++;\n    } while (*str != '\\0');\n    u.representation[index] = '\\0';\n    this->set_script(unichars.size() - 1, null_script);\n    // If the given unichar_repr represents a fragmented character, set\n    // fragment property to a pointer to CHAR_FRAGMENT class instance with\n    // information parsed from the unichar representation. Use the script\n    // of the base unichar for the fragmented character if possible.\n    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);\n    u.properties.fragment = frag;\n    if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {\n      u.properties.script_id = this->get_script(frag->get_unichar());\n    }\n    u.properties.enabled = true;\n    ids.insert(u.representation, unichars.size() - 1);\n  }\n}\n\nbool UNICHARSET::contains_unichar(const char *const unichar_repr) const {\n  std::string cleaned =\n      old_style_included_ ? 
unichar_repr : CleanupString(unichar_repr);\n  return ids.contains(cleaned.data(), cleaned.size());\n}\n\nbool UNICHARSET::contains_unichar(const char *const unichar_repr,\n                                  int length) const {\n  if (length == 0) {\n    return false;\n  }\n  std::string cleaned(unichar_repr, length);\n  if (!old_style_included_) {\n    cleaned = CleanupString(unichar_repr, length);\n  }\n  return ids.contains(cleaned.data(), cleaned.size());\n}\n\nbool UNICHARSET::eq(UNICHAR_ID unichar_id,\n                    const char *const unichar_repr) const {\n  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;\n}\n\nbool UNICHARSET::save_to_string(std::string &str) const {\n  const int kFileBufSize = 1024;\n  char buffer[kFileBufSize + 1];\n  snprintf(buffer, kFileBufSize, \"%zu\\n\", this->size());\n  str = buffer;\n  for (unsigned id = 0; id < this->size(); ++id) {\n    int min_bottom, max_bottom, min_top, max_top;\n    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);\n    float width, width_sd;\n    get_width_stats(id, &width, &width_sd);\n    float bearing, bearing_sd;\n    get_bearing_stats(id, &bearing, &bearing_sd);\n    float advance, advance_sd;\n    get_advance_stats(id, &advance, &advance_sd);\n    unsigned int properties = this->get_properties(id);\n    if (strcmp(this->id_to_unichar(id), \" \") == 0) {\n      snprintf(buffer, kFileBufSize, \"%s %x %s %d\\n\", \"NULL\", properties,\n               this->get_script_from_script_id(this->get_script(id)),\n               this->get_other_case(id));\n      str += buffer;\n    } else {\n      std::ostringstream stream;\n      stream.imbue(std::locale::classic());\n      stream << this->id_to_unichar(id) << ' ' << properties << ' '\n             << min_bottom << ',' << max_bottom << ',' << min_top << ','\n             << max_top << ',' << width << ',' << width_sd << ',' << bearing\n             << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' '\n         
    << this->get_script_from_script_id(this->get_script(id)) << ' '\n             << this->get_other_case(id) << ' ' << this->get_direction(id)\n             << ' ' << this->get_mirror(id) << ' '\n             << this->get_normed_unichar(id) << \"\\t# \"\n             << this->debug_str(id).c_str() << '\\n';\n      str += stream.str().c_str();\n    }\n  }\n  return true;\n}\n\nclass LocalFilePointer {\npublic:\n  LocalFilePointer(FILE *stream) : fp_(stream) {}\n  char *fgets(char *dst, int size) {\n    return ::fgets(dst, size, fp_);\n  }\n\nprivate:\n  FILE *fp_;\n};\n\nbool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {\n  LocalFilePointer lfp(file);\n  using namespace std::placeholders; // for _1, _2\n  std::function<char *(char *, int)> fgets_cb =\n      std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);\n  bool success = load_via_fgets(fgets_cb, skip_fragments);\n  return success;\n}\n\nbool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {\n  using namespace std::placeholders; // for _1, _2\n  std::function<char *(char *, int)> fgets_cb =\n      std::bind(&tesseract::TFile::FGets, file, _1, _2);\n  bool success = load_via_fgets(fgets_cb, skip_fragments);\n  return success;\n}\n\nbool UNICHARSET::load_via_fgets(\n    const std::function<char *(char *, int)> &fgets_cb, bool skip_fragments) {\n  int unicharset_size;\n  char buffer[256];\n\n  this->clear();\n  if (fgets_cb(buffer, sizeof(buffer)) == nullptr ||\n      sscanf(buffer, \"%d\", &unicharset_size) != 1) {\n    return false;\n  }\n  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {\n    char unichar[256];\n    unsigned int properties;\n    char script[64];\n\n    strncpy(script, null_script, sizeof(script) - 1);\n    int min_bottom = 0;\n    int max_bottom = UINT8_MAX;\n    int min_top = 0;\n    int max_top = UINT8_MAX;\n    float width = 0.0f;\n    float width_sd = 0.0f;\n    float bearing = 0.0f;\n    float bearing_sd = 0.0f;\n    float advance = 0.0f;\n    
float advance_sd = 0.0f;\n    // TODO(eger): check that this default it ok\n    // after enabling BiDi iterator for Arabic.\n    int direction = UNICHARSET::U_LEFT_TO_RIGHT;\n    UNICHAR_ID other_case = unicharset_size;\n    UNICHAR_ID mirror = unicharset_size;\n    if (fgets_cb(buffer, sizeof(buffer)) == nullptr) {\n      return false;\n    }\n    char normed[64];\n    normed[0] = '\\0';\n    std::istringstream stream(buffer);\n    stream.imbue(std::locale::classic());\n    // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标  # 标 [6807 ]x\n    // stream.flags(std::ios::hex);\n    stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;\n    // stream.flags(std::ios::dec);\n    if (stream.fail()) {\n      fprintf(stderr, \"%s:%d failed\\n\", __FILE__, __LINE__);\n      return false;\n    }\n    auto position = stream.tellg();\n    stream.seekg(position);\n    char c1, c2, c3, c4, c5, c6, c7, c8, c9;\n    stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>\n        max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>\n        bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>\n        script >> other_case >> direction >> mirror >> std::setw(63) >> normed;\n    if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||\n        c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {\n      stream.clear();\n      stream.seekg(position);\n      stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>\n          max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>\n          bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>\n          script >> other_case >> direction >> mirror;\n      if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||\n          c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {\n        stream.clear();\n        stream.seekg(position);\n        stream >> min_bottom >> c1 >> max_bottom >> c2 >> 
min_top >> c3 >>\n            max_top >> std::setw(63) >> script >> other_case >> direction >>\n            mirror;\n        if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {\n          stream.clear();\n          stream.seekg(position);\n          stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>\n              max_top >> std::setw(63) >> script >> other_case;\n          if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {\n            stream.clear();\n            stream.seekg(position);\n            stream >> std::setw(63) >> script >> other_case;\n            if (stream.fail()) {\n              stream.clear();\n              stream.seekg(position);\n              stream >> std::setw(63) >> script;\n            }\n          }\n        }\n      }\n    }\n\n    // Skip fragments if needed.\n    CHAR_FRAGMENT *frag = nullptr;\n    if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {\n      int num_pieces = frag->get_total();\n      delete frag;\n      // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.\n      if (num_pieces > 1) {\n        continue;\n      }\n    }\n    // Insert unichar into unicharset and set its properties.\n    if (strcmp(unichar, \"NULL\") == 0) {\n      this->unichar_insert(\" \");\n    } else {\n      this->unichar_insert_backwards_compatible(unichar);\n    }\n\n    this->set_isalpha(id, properties & ISALPHA_MASK);\n    this->set_islower(id, properties & ISLOWER_MASK);\n    this->set_isupper(id, properties & ISUPPER_MASK);\n    this->set_isdigit(id, properties & ISDIGIT_MASK);\n    this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);\n    this->set_isngram(id, false);\n    this->set_script(id, script);\n    this->unichars[id].properties.enabled = true;\n    this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);\n    this->set_width_stats(id, width, width_sd);\n    this->set_bearing_stats(id, bearing, bearing_sd);\n    
this->set_advance_stats(id, advance, advance_sd);\n    this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));\n    this->set_other_case(id, (other_case < unicharset_size) ? other_case : id);\n    this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);\n    this->set_normed(id, normed[0] != '\\0' ? normed : unichar);\n  }\n  post_load_setup();\n  return true;\n}\n\n// Sets up internal data after loading the file, based on the char\n// properties. Called from load_from_file, but also needs to be run\n// during set_unicharset_properties.\nvoid UNICHARSET::post_load_setup() {\n  // Number of alpha chars with the case property minus those without,\n  // in order to determine that half the alpha chars have case.\n  int net_case_alphas = 0;\n  int x_height_alphas = 0;\n  int cap_height_alphas = 0;\n  top_bottom_set_ = false;\n  for (unsigned id = 0; id < unichars.size(); ++id) {\n    int min_bottom = 0;\n    int max_bottom = UINT8_MAX;\n    int min_top = 0;\n    int max_top = UINT8_MAX;\n    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);\n    if (min_top > 0) {\n      top_bottom_set_ = true;\n    }\n    if (get_isalpha(id)) {\n      if (get_islower(id) || get_isupper(id)) {\n        ++net_case_alphas;\n      } else {\n        --net_case_alphas;\n      }\n      if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {\n        ++x_height_alphas;\n      } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {\n        ++cap_height_alphas;\n      }\n    }\n    set_normed_ids(id);\n  }\n\n  script_has_upper_lower_ = net_case_alphas > 0;\n  script_has_xheight_ =\n      script_has_upper_lower_ ||\n      (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&\n       cap_height_alphas > x_height_alphas * kMinCapHeightFraction);\n\n  null_sid_ = get_script_id_from_name(null_script);\n  ASSERT_HOST(null_sid_ == 0);\n  common_sid_ = get_script_id_from_name(\"Common\");\n  latin_sid_ = 
get_script_id_from_name(\"Latin\");\n  cyrillic_sid_ = get_script_id_from_name(\"Cyrillic\");\n  greek_sid_ = get_script_id_from_name(\"Greek\");\n  han_sid_ = get_script_id_from_name(\"Han\");\n  hiragana_sid_ = get_script_id_from_name(\"Hiragana\");\n  katakana_sid_ = get_script_id_from_name(\"Katakana\");\n  thai_sid_ = get_script_id_from_name(\"Thai\");\n  hangul_sid_ = get_script_id_from_name(\"Hangul\");\n\n  // Compute default script. Use the highest-counting alpha script, that is\n  // not the common script, as that still contains some \"alphas\".\n  int *script_counts = new int[script_table_size_used];\n  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);\n  for (unsigned id = 0; id < unichars.size(); ++id) {\n    if (get_isalpha(id)) {\n      ++script_counts[get_script(id)];\n    }\n  }\n  default_sid_ = 0;\n  for (int s = 1; s < script_table_size_used; ++s) {\n    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {\n      default_sid_ = s;\n    }\n  }\n  delete[] script_counts;\n}\n\n// Returns true if right_to_left scripts are significant in the unicharset,\n// but without being so sensitive that \"universal\" unicharsets containing\n// characters from many scripts, like orientation and script detection,\n// look like they are right_to_left.\nbool UNICHARSET::major_right_to_left() const {\n  int ltr_count = 0;\n  int rtl_count = 0;\n  for (unsigned id = 0; id < unichars.size(); ++id) {\n    int dir = get_direction(id);\n    if (dir == UNICHARSET::U_LEFT_TO_RIGHT) {\n      ltr_count++;\n    }\n    if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||\n        dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||\n        dir == UNICHARSET::U_ARABIC_NUMBER) {\n      rtl_count++;\n    }\n  }\n  return rtl_count > ltr_count;\n}\n\n// Set a whitelist and/or blacklist of characters to recognize.\n// An empty or nullptr whitelist enables everything (minus any blacklist).\n// An empty or nullptr blacklist disables nothing.\n// An empty 
or nullptr unblacklist has no effect.\nvoid UNICHARSET::set_black_and_whitelist(const char *blacklist,\n                                         const char *whitelist,\n                                         const char *unblacklist) {\n  bool def_enabled = whitelist == nullptr || whitelist[0] == '\\0';\n  // Set everything to default\n  for (auto &uc : unichars) {\n    uc.properties.enabled = def_enabled;\n  }\n  if (!def_enabled) {\n    // Enable the whitelist.\n    std::vector<UNICHAR_ID> encoding;\n    encode_string(whitelist, false, &encoding, nullptr, nullptr);\n    for (auto it : encoding) {\n      if (it != INVALID_UNICHAR_ID) {\n        unichars[it].properties.enabled = true;\n      }\n    }\n  }\n  if (blacklist != nullptr && blacklist[0] != '\\0') {\n    // Disable the blacklist.\n    std::vector<UNICHAR_ID> encoding;\n    encode_string(blacklist, false, &encoding, nullptr, nullptr);\n    for (auto it : encoding) {\n      if (it != INVALID_UNICHAR_ID) {\n        unichars[it].properties.enabled = false;\n      }\n    }\n  }\n  if (unblacklist != nullptr && unblacklist[0] != '\\0') {\n    // Re-enable the unblacklist.\n    std::vector<UNICHAR_ID> encoding;\n    encode_string(unblacklist, false, &encoding, nullptr, nullptr);\n    for (auto it : encoding) {\n      if (it != INVALID_UNICHAR_ID) {\n        unichars[it].properties.enabled = true;\n      }\n    }\n  }\n}\n\n// Returns true if there are any repeated unicodes in the normalized\n// text of any unichar-id in the unicharset.\nbool UNICHARSET::AnyRepeatedUnicodes() const {\n  int start_id = 0;\n  if (has_special_codes()) {\n    start_id = SPECIAL_UNICHAR_CODES_COUNT;\n  }\n  for (unsigned id = start_id; id < unichars.size(); ++id) {\n    // Convert to unicodes.\n    std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));\n    for (size_t u = 1; u < unicodes.size(); ++u) {\n      if (unicodes[u - 1] == unicodes[u]) {\n        return true;\n      }\n    }\n  }\n  return 
false;\n}\n\nint UNICHARSET::add_script(const char *script) {\n  for (int i = 0; i < script_table_size_used; ++i) {\n    if (strcmp(script, script_table[i]) == 0) {\n      return i;\n    }\n  }\n  if (script_table_size_reserved == 0) {\n    script_table_size_reserved = 8;\n    script_table = new char *[script_table_size_reserved];\n  } else if (script_table_size_used >= script_table_size_reserved) {\n    assert(script_table_size_used == script_table_size_reserved);\n    script_table_size_reserved += script_table_size_reserved;\n    char **new_script_table = new char *[script_table_size_reserved];\n    memcpy(new_script_table, script_table,\n           script_table_size_used * sizeof(char *));\n    delete[] script_table;\n    script_table = new_script_table;\n  }\n  script_table[script_table_size_used] = new char[strlen(script) + 1];\n  strcpy(script_table[script_table_size_used], script);\n  return script_table_size_used++;\n}\n\n// Returns the string that represents a fragment\n// with the given unichar, pos and total.\nstd::string CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,\n                                     bool natural) {\n  if (total == 1) {\n    return std::string(unichar);\n  }\n  std::string result;\n  result += kSeparator;\n  result += unichar;\n  char buffer[kMaxLen];\n  snprintf(buffer, kMaxLen, \"%c%d%c%d\", kSeparator, pos,\n           natural ? 
kNaturalFlag : kSeparator, total);\n  result += buffer;\n  return result;\n}\n\nCHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {\n  const char *ptr = string;\n  int len = strlen(string);\n  if (len < kMinLen || *ptr != kSeparator) {\n    return nullptr; // this string cannot represent a fragment\n  }\n  ptr++; // move to the next character\n  int step = 0;\n  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {\n    step += UNICHAR::utf8_step(ptr + step);\n  }\n  if (step == 0 || step > UNICHAR_LEN) {\n    return nullptr; // no character for unichar or the character is too long\n  }\n  char unichar[UNICHAR_LEN + 1];\n  strncpy(unichar, ptr, step);\n  unichar[step] = '\\0'; // null terminate unichar\n  ptr += step;          // move to the next fragment separator\n  int pos = 0;\n  int total = 0;\n  bool natural = false;\n  char *end_ptr = nullptr;\n  for (int i = 0; i < 2; i++) {\n    if (ptr > string + len || *ptr != kSeparator) {\n      if (i == 1 && *ptr == kNaturalFlag) {\n        natural = true;\n      } else {\n        return nullptr; // Failed to parse fragment representation.\n      }\n    }\n    ptr++; // move to the next character\n    i == 0 ? 
pos = static_cast<int>(strtol(ptr, &end_ptr, 10))\n           : total = static_cast<int>(strtol(ptr, &end_ptr, 10));\n    ptr = end_ptr;\n  }\n  if (ptr != string + len) {\n    return nullptr; // malformed fragment representation\n  }\n  auto *fragment = new CHAR_FRAGMENT();\n  fragment->set_all(unichar, pos, total, natural);\n  return fragment;\n}\n\nint UNICHARSET::get_script_id_from_name(const char *script_name) const {\n  for (int i = 0; i < script_table_size_used; ++i) {\n    if (strcmp(script_name, script_table[i]) == 0) {\n      return i;\n    }\n  }\n  return 0; // 0 is always the null_script\n}\n\n// Removes/replaces content that belongs in rendered text, but not in the\n// unicharset.\n/* static */\nstd::string UNICHARSET::CleanupString(const char *utf8_str, size_t length) {\n  std::string result;\n  result.reserve(length);\n  char ch;\n  while ((ch = *utf8_str) != '\\0' && length-- > 0) {\n    int key_index = 0;\n    const char *key;\n    while ((key = kCleanupMaps[key_index][0]) != nullptr) {\n      int match = 0;\n      while (key[match] != '\\0' && key[match] == utf8_str[match]) {\n        ++match;\n      }\n      if (key[match] == '\\0') {\n        utf8_str += match;\n        break;\n      }\n      ++key_index;\n    }\n    if (key == nullptr) {\n      result.push_back(ch);\n      ++utf8_str;\n    } else {\n      result.append(kCleanupMaps[key_index][1]);\n    }\n  }\n  return result;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/ccutil/unicharset.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharset.h\n// Description: Unicode character/ligature set class.\n// Author:      Thomas Kielbus\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_UNICHARSET_H_\n#define TESSERACT_CCUTIL_UNICHARSET_H_\n\n#include \"errcode.h\"\n#include \"unicharmap.h\"\n\n#include <tesseract/unichar.h>\n#include \"helpers.h\"\n#include \"serialis.h\"\n\n#include <functional> // for std::function\n\nnamespace tesseract {\n\n// Enum holding special values of unichar_id. Every unicharset has these.\n// Warning! Keep in sync with kSpecialUnicharCodes.\nenum SpecialUnicharCodes {\n  UNICHAR_SPACE,\n  UNICHAR_JOINED,\n  UNICHAR_BROKEN,\n\n  SPECIAL_UNICHAR_CODES_COUNT\n};\n\n// Boolean flag for unichar_insert. 
It's a bit of a double negative to allow\n// the default value to be false.\nenum class OldUncleanUnichars {\n  kFalse,\n  kTrue,\n};\n\nclass TESS_API CHAR_FRAGMENT {\npublic:\n  // Minimum number of characters used for fragment representation.\n  static const int kMinLen = 6;\n  // Maximum number of characters used for fragment representation.\n  static const int kMaxLen = 3 + UNICHAR_LEN + 2;\n  // Maximum number of fragments per character.\n  static const int kMaxChunks = 5;\n\n  // Setters and Getters.\n  inline void set_all(const char *unichar, int pos, int total, bool natural) {\n    set_unichar(unichar);\n    set_pos(pos);\n    set_total(total);\n    set_natural(natural);\n  }\n  inline void set_unichar(const char *uch) {\n    strncpy(this->unichar, uch, sizeof(this->unichar));\n    this->unichar[UNICHAR_LEN] = '\\0';\n  }\n  inline void set_pos(int p) {\n    this->pos = p;\n  }\n  inline void set_total(int t) {\n    this->total = t;\n  }\n  inline const char *get_unichar() const {\n    return this->unichar;\n  }\n  inline int get_pos() const {\n    return this->pos;\n  }\n  inline int get_total() const {\n    return this->total;\n  }\n\n  // Returns the string that represents a fragment\n  // with the given unichar, pos and total.\n  static std::string to_string(const char *unichar, int pos, int total,\n                               bool natural);\n  // Returns the string that represents this fragment.\n  std::string to_string() const {\n    return to_string(unichar, pos, total, natural);\n  }\n\n  // Checks whether a fragment has the same unichar,\n  // position and total as the given inputs.\n  inline bool equals(const char *other_unichar, int other_pos,\n                     int other_total) const {\n    return (strcmp(this->unichar, other_unichar) == 0 &&\n            this->pos == other_pos && this->total == other_total);\n  }\n  inline bool equals(const CHAR_FRAGMENT *other) const {\n    return this->equals(other->get_unichar(), other->get_pos(),\n   
                     other->get_total());\n  }\n\n  // Checks whether a given fragment is a continuation of this fragment.\n  // Assumes that the given fragment pointer is not nullptr.\n  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {\n    return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&\n            this->total == fragment->get_total() &&\n            this->pos == fragment->get_pos() + 1);\n  }\n\n  // Returns true if this fragment is a beginning fragment.\n  inline bool is_beginning() const {\n    return this->pos == 0;\n  }\n\n  // Returns true if this fragment is an ending fragment.\n  inline bool is_ending() const {\n    return this->pos == this->total - 1;\n  }\n\n  // Returns true if the fragment was a separate component to begin with,\n  // ie did not need chopping to be isolated, but may have been separated\n  // out from a multi-outline blob.\n  inline bool is_natural() const {\n    return natural;\n  }\n  void set_natural(bool value) {\n    natural = value;\n  }\n\n  // Parses the string to see whether it represents a character fragment\n  // (rather than a regular character). If so, allocates memory for a new\n  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment\n  // information. 
Fragments are of the form:\n  // |m|1|2, meaning chunk 1 of 2 of character m, or\n  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed\n  // to divide the parts, as they were already separate connected components.\n  //\n  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT\n  // instance, otherwise (if the string does not represent a fragment or it\n  // looks like it does, but parsing it as a fragment fails) returns nullptr.\n  //\n  // Note: The caller is responsible for deallocating memory\n  // associated with the returned pointer.\n  static CHAR_FRAGMENT *parse_from_string(const char *str);\n\nprivate:\n  char unichar[UNICHAR_LEN + 1];\n  // True if the fragment was a separate component to begin with,\n  // ie did not need chopping to be isolated, but may have been separated\n  // out from a multi-outline blob.\n  bool natural;\n  int16_t pos;   // fragment position in the character\n  int16_t total; // total number of fragments in the character\n};\n\n// The UNICHARSET class is an utility class for Tesseract that holds the\n// set of characters that are used by the engine. Each character is identified\n// by a unique number, from 0 to (size - 1).\nclass TESS_API UNICHARSET {\npublic:\n  // Custom list of characters and their ligature forms (UTF8)\n  // These map to unicode values in the private use area (PUC) and are supported\n  // by only few font families (eg. Wyld, Adobe Caslon Pro).\n  static const char *kCustomLigatures[][2];\n\n  // List of strings for the SpecialUnicharCodes. 
Keep in sync with the enum.\n  static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];\n\n  // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)\n  enum Direction {\n    U_LEFT_TO_RIGHT = 0,\n    U_RIGHT_TO_LEFT = 1,\n    U_EUROPEAN_NUMBER = 2,\n    U_EUROPEAN_NUMBER_SEPARATOR = 3,\n    U_EUROPEAN_NUMBER_TERMINATOR = 4,\n    U_ARABIC_NUMBER = 5,\n    U_COMMON_NUMBER_SEPARATOR = 6,\n    U_BLOCK_SEPARATOR = 7,\n    U_SEGMENT_SEPARATOR = 8,\n    U_WHITE_SPACE_NEUTRAL = 9,\n    U_OTHER_NEUTRAL = 10,\n    U_LEFT_TO_RIGHT_EMBEDDING = 11,\n    U_LEFT_TO_RIGHT_OVERRIDE = 12,\n    U_RIGHT_TO_LEFT_ARABIC = 13,\n    U_RIGHT_TO_LEFT_EMBEDDING = 14,\n    U_RIGHT_TO_LEFT_OVERRIDE = 15,\n    U_POP_DIRECTIONAL_FORMAT = 16,\n    U_DIR_NON_SPACING_MARK = 17,\n    U_BOUNDARY_NEUTRAL = 18,\n    U_FIRST_STRONG_ISOLATE = 19,\n    U_LEFT_TO_RIGHT_ISOLATE = 20,\n    U_RIGHT_TO_LEFT_ISOLATE = 21,\n    U_POP_DIRECTIONAL_ISOLATE = 22,\n#ifndef U_HIDE_DEPRECATED_API\n    U_CHAR_DIRECTION_COUNT\n#endif // U_HIDE_DEPRECATED_API\n  };\n\n  // Create an empty UNICHARSET\n  UNICHARSET();\n\n  ~UNICHARSET();\n\n  // Return the UNICHAR_ID of a given unichar representation within the\n  // UNICHARSET.\n  UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;\n\n  // Return the UNICHAR_ID of a given unichar representation within the\n  // UNICHARSET. Only the first length characters from unichar_repr are used.\n  UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;\n\n  // Return the minimum number of bytes that matches a legal UNICHAR_ID,\n  // while leaving the rest of the string encodable. 
Returns 0 if the\n  // beginning of the string is not encodable.\n  // WARNING: this function now encodes the whole string for precision.\n  // Use encode_string in preference to repeatedly calling step.\n  int step(const char *str) const;\n\n  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.\n  // If not encodable, write the first byte offset which cannot be converted\n  // into the second (return) argument.\n  bool encodable_string(const char *str, unsigned *first_bad_position) const;\n\n  // Encodes the given UTF-8 string with this UNICHARSET.\n  // Any part of the string that cannot be encoded (because the utf8 can't\n  // be broken up into pieces that are in the unicharset) then:\n  // if give_up_on_failure, stops and returns a partial encoding,\n  // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.\n  // Returns true if the encoding succeeds completely, false if there is at\n  // least one failure.\n  // If lengths is not nullptr, then it is filled with the corresponding\n  // byte length of each encoded UNICHAR_ID.\n  // If encoded_length is not nullptr then on return it contains the length of\n  // str that was encoded. 
(if give_up_on_failure the location of the first\n  // failure, otherwise strlen(str).)\n  // WARNING: Caller must guarantee that str has already been cleaned of codes\n  // that do not belong in the unicharset, or encoding may fail.\n  // Use CleanupString to perform the cleaning.\n  bool encode_string(const char *str, bool give_up_on_failure,\n                     std::vector<UNICHAR_ID> *encoding,\n                     std::vector<char> *lengths,\n                     unsigned *encoded_length) const;\n\n  // Return the unichar representation corresponding to the given UNICHAR_ID\n  // within the UNICHARSET.\n  const char *id_to_unichar(UNICHAR_ID id) const;\n\n  // Return the UTF8 representation corresponding to the given UNICHAR_ID after\n  // resolving any private encodings internal to Tesseract. This method is\n  // preferable to id_to_unichar for outputting text that will be visible to\n  // external applications.\n  const char *id_to_unichar_ext(UNICHAR_ID id) const;\n\n  // Return a string that reformats the utf8 str into the str followed\n  // by its hex unicodes.\n  static std::string debug_utf8_str(const char *str);\n\n  // Removes/replaces content that belongs in rendered text, but not in the\n  // unicharset.\n  static std::string CleanupString(const char *utf8_str) {\n    return CleanupString(utf8_str, strlen(utf8_str));\n  }\n  static std::string CleanupString(const char *utf8_str, size_t length);\n\n  // Return a string containing debug information on the unichar, including\n  // the id_to_unichar, its hex unicodes and the properties.\n  std::string debug_str(UNICHAR_ID id) const;\n  std::string debug_str(const char *unichar_repr) const {\n    return debug_str(unichar_to_id(unichar_repr));\n  }\n\n  // Adds a unichar representation to the set. If old_style is true, then\n  // TATWEEL characters are kept and n-grams are allowed. 
Otherwise TATWEEL\n  // characters are ignored/skipped as if they don't exist and n-grams that\n  // can already be encoded are not added.\n  void unichar_insert(const char *const unichar_repr,\n                      OldUncleanUnichars old_style);\n  void unichar_insert(const char *const unichar_repr) {\n    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);\n  }\n  // Adds a unichar representation to the set. Avoids setting old_style to true,\n  // unless it is necessary to make the new unichar get added.\n  void unichar_insert_backwards_compatible(const char *const unichar_repr) {\n    std::string cleaned = CleanupString(unichar_repr);\n    if (cleaned != unichar_repr) {\n      unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);\n    } else {\n      auto old_size = size();\n      unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);\n      if (size() == old_size) {\n        unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);\n      }\n    }\n  }\n\n  // Return true if the given unichar id exists within the set.\n  // Relies on the fact that unichar ids are contiguous in the unicharset.\n  bool contains_unichar_id(UNICHAR_ID unichar_id) const {\n    return static_cast<size_t>(unichar_id) < unichars.size();\n  }\n\n  // Return true if the given unichar representation exists within the set.\n  bool contains_unichar(const char *const unichar_repr) const;\n  bool contains_unichar(const char *const unichar_repr, int length) const;\n\n  // Return true if the given unichar representation corresponds to the given\n  // UNICHAR_ID within the set.\n  bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;\n\n  // Delete CHAR_FRAGMENTs stored in properties of unichars array.\n  void delete_pointers_in_unichars() {\n    for (auto &unichar : unichars) {\n      delete unichar.properties.fragment;\n      unichar.properties.fragment = nullptr;\n    }\n  }\n\n  // Clear the UNICHARSET (all the previous data is lost).\n  void clear() {\n    if 
(script_table != nullptr) {\n      for (int i = 0; i < script_table_size_used; ++i) {\n        delete[] script_table[i];\n      }\n      delete[] script_table;\n      script_table = nullptr;\n      script_table_size_used = 0;\n    }\n    script_table_size_reserved = 0;\n    delete_pointers_in_unichars();\n    unichars.clear();\n    ids.clear();\n    top_bottom_set_ = false;\n    script_has_upper_lower_ = false;\n    script_has_xheight_ = false;\n    old_style_included_ = false;\n    null_sid_ = 0;\n    common_sid_ = 0;\n    latin_sid_ = 0;\n    cyrillic_sid_ = 0;\n    greek_sid_ = 0;\n    han_sid_ = 0;\n    hiragana_sid_ = 0;\n    katakana_sid_ = 0;\n    thai_sid_ = 0;\n    hangul_sid_ = 0;\n    default_sid_ = 0;\n  }\n\n  // Return the size of the set (the number of different UNICHAR it holds).\n  size_t size() const {\n    return unichars.size();\n  }\n\n  // Opens the file indicated by filename and saves unicharset to that file.\n  // Returns true if the operation is successful.\n  bool save_to_file(const char *const filename) const {\n    FILE *file = fopen(filename, \"w+b\");\n    if (file == nullptr) {\n      return false;\n    }\n    bool result = save_to_file(file);\n    fclose(file);\n    return result;\n  }\n\n  // Saves the content of the UNICHARSET to the given file.\n  // Returns true if the operation is successful.\n  bool save_to_file(FILE *file) const {\n    std::string str;\n    return save_to_string(str) &&\n           tesseract::Serialize(file, &str[0], str.length());\n  }\n\n  bool save_to_file(tesseract::TFile *file) const {\n    std::string str;\n    return save_to_string(str) && file->Serialize(&str[0], str.length());\n  }\n\n  // Saves the content of the UNICHARSET to the given string.\n  // Returns true if the operation is successful.\n  bool save_to_string(std::string &str) const;\n\n  // Opens the file indicated by filename and loads the UNICHARSET\n  // from the given file. 
The previous data is lost.\n  // Returns true if the operation is successful.\n  bool load_from_file(const char *const filename, bool skip_fragments) {\n    FILE *file = fopen(filename, \"rb\");\n    if (file == nullptr) {\n      return false;\n    }\n    bool result = load_from_file(file, skip_fragments);\n    fclose(file);\n    return result;\n  }\n  // returns true if the operation is successful.\n  bool load_from_file(const char *const filename) {\n    return load_from_file(filename, false);\n  }\n\n  // Loads the UNICHARSET from the given file. The previous data is lost.\n  // Returns true if the operation is successful.\n  bool load_from_file(FILE *file, bool skip_fragments);\n  bool load_from_file(FILE *file) {\n    return load_from_file(file, false);\n  }\n  bool load_from_file(tesseract::TFile *file, bool skip_fragments);\n\n  // Sets up internal data after loading the file, based on the char\n  // properties. Called from load_from_file, but also needs to be run\n  // during set_unicharset_properties.\n  void post_load_setup();\n\n  // Returns true if right_to_left scripts are significant in the unicharset,\n  // but without being so sensitive that \"universal\" unicharsets containing\n  // characters from many scripts, like orientation and script detection,\n  // look like they are right_to_left.\n  bool major_right_to_left() const;\n\n  // Set a whitelist and/or blacklist of characters to recognize.\n  // An empty or nullptr whitelist enables everything (minus any blacklist).\n  // An empty or nullptr blacklist disables nothing.\n  // An empty or nullptr unblacklist has no effect.\n  // The blacklist overrides the whitelist.\n  // The unblacklist overrides the blacklist.\n  // Each list is a string of utf8 character strings. 
Boundaries between\n  // unicharset units are worked out automatically, and characters not in\n  // the unicharset are silently ignored.\n  void set_black_and_whitelist(const char *blacklist, const char *whitelist,\n                               const char *unblacklist);\n\n  // Set the isalpha property of the given unichar to the given value.\n  void set_isalpha(UNICHAR_ID unichar_id, bool value) {\n    unichars[unichar_id].properties.isalpha = value;\n  }\n\n  // Set the islower property of the given unichar to the given value.\n  void set_islower(UNICHAR_ID unichar_id, bool value) {\n    unichars[unichar_id].properties.islower = value;\n  }\n\n  // Set the isupper property of the given unichar to the given value.\n  void set_isupper(UNICHAR_ID unichar_id, bool value) {\n    unichars[unichar_id].properties.isupper = value;\n  }\n\n  // Set the isdigit property of the given unichar to the given value.\n  void set_isdigit(UNICHAR_ID unichar_id, bool value) {\n    unichars[unichar_id].properties.isdigit = value;\n  }\n\n  // Set the ispunctuation property of the given unichar to the given value.\n  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {\n    unichars[unichar_id].properties.ispunctuation = value;\n  }\n\n  // Set the isngram property of the given unichar to the given value.\n  void set_isngram(UNICHAR_ID unichar_id, bool value) {\n    unichars[unichar_id].properties.isngram = value;\n  }\n\n  // Set the script name of the given unichar to the given value.\n  // Value is copied and thus can be a temporary;\n  void set_script(UNICHAR_ID unichar_id, const char *value) {\n    unichars[unichar_id].properties.script_id = add_script(value);\n  }\n\n  // Set other_case unichar id in the properties for the given unichar id.\n  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {\n    unichars[unichar_id].properties.other_case = other_case;\n  }\n\n  // Set the direction property of the given unichar to the given value.\n  void 
set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {\n    unichars[unichar_id].properties.direction = value;\n  }\n\n  // Set mirror unichar id in the properties for the given unichar id.\n  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {\n    unichars[unichar_id].properties.mirror = mirror;\n  }\n\n  // Record normalized version of unichar with the given unichar_id.\n  void set_normed(UNICHAR_ID unichar_id, const char *normed) {\n    unichars[unichar_id].properties.normed = normed;\n    unichars[unichar_id].properties.normed_ids.clear();\n  }\n  // Sets the normed_ids vector from the normed string. normed_ids is not\n  // stored in the file, and needs to be set when the UNICHARSET is loaded.\n  void set_normed_ids(UNICHAR_ID unichar_id);\n\n  // Return the isalpha property of the given unichar.\n  bool get_isalpha(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return false;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.isalpha;\n  }\n\n  // Return the islower property of the given unichar.\n  bool get_islower(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return false;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.islower;\n  }\n\n  // Return the isupper property of the given unichar.\n  bool get_isupper(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return false;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.isupper;\n  }\n\n  // Return the isdigit property of the given unichar.\n  bool get_isdigit(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return false;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.isdigit;\n  }\n\n  // Return the ispunctuation property of the given 
unichar.\n  bool get_ispunctuation(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return false;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.ispunctuation;\n  }\n\n  // Return the isngram property of the given unichar.\n  bool get_isngram(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return false;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.isngram;\n  }\n\n  // Returns whether the unichar id represents a unicode value in the private\n  // use area.\n  bool get_isprivate(UNICHAR_ID unichar_id) const;\n\n  // Returns true if the ids have useful min/max top/bottom values.\n  bool top_bottom_useful() const {\n    return top_bottom_set_;\n  }\n  // Sets all ranges to empty, so they can be expanded to set the values.\n  void set_ranges_empty();\n  // Sets all the properties for this unicharset given a src_unicharset with\n  // everything set. The unicharsets don't have to be the same, and graphemes\n  // are correctly accounted for.\n  void SetPropertiesFromOther(const UNICHARSET &src) {\n    PartialSetPropertiesFromOther(0, src);\n  }\n  // Sets properties from Other, starting only at the given index.\n  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);\n  // Expands the tops and bottoms and widths for this unicharset given a\n  // src_unicharset with ranges in it. The unicharsets don't have to be the\n  // same, and graphemes are correctly accounted for.\n  void ExpandRangesFromOther(const UNICHARSET &src);\n  // Makes this a copy of src. 
Clears this completely first, so the automatic\n  // ids will not be present in this if not in src.\n  void CopyFrom(const UNICHARSET &src);\n  // For each id in src, if it does not occur in this, add it, as in\n  // SetPropertiesFromOther, otherwise expand the ranges, as in\n  // ExpandRangesFromOther.\n  void AppendOtherUnicharset(const UNICHARSET &src);\n  // Returns true if the acceptable ranges of the tops of the characters do\n  // not overlap, making their x-height calculations distinct.\n  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;\n  // Returns the min and max bottom and top of the given unichar in\n  // baseline-normalized coordinates, ie, where the baseline is\n  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight\n  // (See normalis.h for the definitions).\n  void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,\n                      int *min_top, int *max_top) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      *min_bottom = *min_top = 0;\n      *max_bottom = *max_top = 256; // kBlnCellHeight\n      return;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    *min_bottom = unichars[unichar_id].properties.min_bottom;\n    *max_bottom = unichars[unichar_id].properties.max_bottom;\n    *min_top = unichars[unichar_id].properties.min_top;\n    *max_top = unichars[unichar_id].properties.max_top;\n  }\n  void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,\n                      int min_top, int max_top) {\n    unichars[unichar_id].properties.min_bottom =\n        ClipToRange<int>(min_bottom, 0, UINT8_MAX);\n    unichars[unichar_id].properties.max_bottom =\n        ClipToRange<int>(max_bottom, 0, UINT8_MAX);\n    unichars[unichar_id].properties.min_top =\n        ClipToRange<int>(min_top, 0, UINT8_MAX);\n    unichars[unichar_id].properties.max_top =\n        ClipToRange<int>(max_top, 0, UINT8_MAX);\n  }\n  // Returns the width stats (as mean, sd) of the given 
unichar relative to the\n  // median advance of all characters in the character set.\n  void get_width_stats(UNICHAR_ID unichar_id, float *width,\n                       float *width_sd) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      *width = 0.0f;\n      *width_sd = 0.0f;\n      return;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    *width = unichars[unichar_id].properties.width;\n    *width_sd = unichars[unichar_id].properties.width_sd;\n  }\n  void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {\n    unichars[unichar_id].properties.width = width;\n    unichars[unichar_id].properties.width_sd = width_sd;\n  }\n  // Returns the stats of the x-bearing (as mean, sd) of the given unichar\n  // relative to the median advance of all characters in the character set.\n  void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,\n                         float *bearing_sd) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      *bearing = *bearing_sd = 0.0f;\n      return;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    *bearing = unichars[unichar_id].properties.bearing;\n    *bearing_sd = unichars[unichar_id].properties.bearing_sd;\n  }\n  void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,\n                         float bearing_sd) {\n    unichars[unichar_id].properties.bearing = bearing;\n    unichars[unichar_id].properties.bearing_sd = bearing_sd;\n  }\n  // Returns the stats of the x-advance of the given unichar (as mean, sd)\n  // relative to the median advance of all characters in the character set.\n  void get_advance_stats(UNICHAR_ID unichar_id, float *advance,\n                         float *advance_sd) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      *advance = *advance_sd = 0;\n      return;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    *advance = unichars[unichar_id].properties.advance;\n    *advance_sd = 
unichars[unichar_id].properties.advance_sd;\n  }\n  void set_advance_stats(UNICHAR_ID unichar_id, float advance,\n                         float advance_sd) {\n    unichars[unichar_id].properties.advance = advance;\n    unichars[unichar_id].properties.advance_sd = advance_sd;\n  }\n  // Returns true if the font metrics properties are empty.\n  bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {\n    return unichars[unichar_id].properties.AnyRangeEmpty();\n  }\n\n  // Returns true if the script of the given id is space delimited.\n  // Returns false for Han and Thai scripts.\n  bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return true;\n    }\n    int script_id = get_script(unichar_id);\n    return script_id != han_sid_ && script_id != thai_sid_ &&\n           script_id != hangul_sid_ && script_id != hiragana_sid_ &&\n           script_id != katakana_sid_;\n  }\n\n  // Return the script name of the given unichar.\n  // The returned pointer will always be the same for the same script, it's\n  // managed by unicharset and thus MUST NOT be deleted\n  int get_script(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return null_sid_;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.script_id;\n  }\n\n  // Return the character properties, eg. alpha/upper/lower/digit/punct,\n  // as a bit field of unsigned int.\n  unsigned int get_properties(UNICHAR_ID unichar_id) const;\n\n  // Return the character property as a single char.  
If a character has\n  // multiple attributes, the main property is defined by the following order:\n  //   upper_case : 'A'\n  //   lower_case : 'a'\n  //   alpha      : 'x'\n  //   digit      : '0'\n  //   punctuation: 'p'\n  char get_chartype(UNICHAR_ID unichar_id) const;\n\n  // Get other_case unichar id in the properties for the given unichar id.\n  UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return INVALID_UNICHAR_ID;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.other_case;\n  }\n\n  // Returns the direction property of the given unichar.\n  Direction get_direction(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return UNICHARSET::U_OTHER_NEUTRAL;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.direction;\n  }\n\n  // Get mirror unichar id in the properties for the given unichar id.\n  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return INVALID_UNICHAR_ID;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.mirror;\n  }\n\n  // Returns UNICHAR_ID of the corresponding lower-case unichar.\n  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return INVALID_UNICHAR_ID;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    if (unichars[unichar_id].properties.islower) {\n      return unichar_id;\n    }\n    return unichars[unichar_id].properties.other_case;\n  }\n\n  // Returns UNICHAR_ID of the corresponding upper-case unichar.\n  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return INVALID_UNICHAR_ID;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    if (unichars[unichar_id].properties.isupper) {\n      return 
unichar_id;\n    }\n    return unichars[unichar_id].properties.other_case;\n  }\n\n  // Returns true if this UNICHARSET has the special codes in\n  // SpecialUnicharCodes available. If false then there are normal unichars\n  // at these codes and they should not be used.\n  bool has_special_codes() const {\n    return get_fragment(UNICHAR_BROKEN) != nullptr &&\n           strcmp(id_to_unichar(UNICHAR_BROKEN),\n                  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;\n  }\n\n  // Returns true if there are any repeated unicodes in the normalized\n  // text of any unichar-id in the unicharset.\n  bool AnyRepeatedUnicodes() const;\n\n  // Return a pointer to the CHAR_FRAGMENT class if the given\n  // unichar id represents a character fragment.\n  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {\n    if (INVALID_UNICHAR_ID == unichar_id) {\n      return nullptr;\n    }\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.fragment;\n  }\n\n  // Return the isalpha property of the given unichar representation.\n  bool get_isalpha(const char *const unichar_repr) const {\n    return get_isalpha(unichar_to_id(unichar_repr));\n  }\n\n  // Return the islower property of the given unichar representation.\n  bool get_islower(const char *const unichar_repr) const {\n    return get_islower(unichar_to_id(unichar_repr));\n  }\n\n  // Return the isupper property of the given unichar representation.\n  bool get_isupper(const char *const unichar_repr) const {\n    return get_isupper(unichar_to_id(unichar_repr));\n  }\n\n  // Return the isdigit property of the given unichar representation.\n  bool get_isdigit(const char *const unichar_repr) const {\n    return get_isdigit(unichar_to_id(unichar_repr));\n  }\n\n  // Return the ispunctuation property of the given unichar representation.\n  bool get_ispunctuation(const char *const unichar_repr) const {\n    return get_ispunctuation(unichar_to_id(unichar_repr));\n  }\n\n  // 
Return the character properties, eg. alpha/upper/lower/digit/punct,\n  // of the given unichar representation\n  unsigned int get_properties(const char *const unichar_repr) const {\n    return get_properties(unichar_to_id(unichar_repr));\n  }\n\n  char get_chartype(const char *const unichar_repr) const {\n    return get_chartype(unichar_to_id(unichar_repr));\n  }\n\n  // Return the script name of the given unichar representation.\n  // The returned pointer will always be the same for the same script, it's\n  // managed by unicharset and thus MUST NOT be deleted\n  int get_script(const char *const unichar_repr) const {\n    return get_script(unichar_to_id(unichar_repr));\n  }\n\n  // Return a pointer to the CHAR_FRAGMENT class struct if the given\n  // unichar representation represents a character fragment.\n  const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const {\n    if (unichar_repr == nullptr || unichar_repr[0] == '\\0' ||\n        !ids.contains(unichar_repr, false)) {\n      return nullptr;\n    }\n    return get_fragment(unichar_to_id(unichar_repr));\n  }\n\n  // Return the isalpha property of the given unichar representation.\n  // Only the first length characters from unichar_repr are used.\n  bool get_isalpha(const char *const unichar_repr, int length) const {\n    return get_isalpha(unichar_to_id(unichar_repr, length));\n  }\n\n  // Return the islower property of the given unichar representation.\n  // Only the first length characters from unichar_repr are used.\n  bool get_islower(const char *const unichar_repr, int length) const {\n    return get_islower(unichar_to_id(unichar_repr, length));\n  }\n\n  // Return the isupper property of the given unichar representation.\n  // Only the first length characters from unichar_repr are used.\n  bool get_isupper(const char *const unichar_repr, int length) const {\n    return get_isupper(unichar_to_id(unichar_repr, length));\n  }\n\n  // Return the isdigit property of the given unichar 
representation.\n  // Only the first length characters from unichar_repr are used.\n  bool get_isdigit(const char *const unichar_repr, int length) const {\n    return get_isdigit(unichar_to_id(unichar_repr, length));\n  }\n\n  // Return the ispunctuation property of the given unichar representation.\n  // Only the first length characters from unichar_repr are used.\n  bool get_ispunctuation(const char *const unichar_repr, int length) const {\n    return get_ispunctuation(unichar_to_id(unichar_repr, length));\n  }\n\n  // Returns normalized version of unichar with the given unichar_id.\n  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {\n    if (unichar_id == UNICHAR_SPACE) {\n      return \" \";\n    }\n    return unichars[unichar_id].properties.normed.c_str();\n  }\n  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized\n  // version of the given id. There may be more than one UNICHAR_ID in the\n  // vector if unichar_id represents a ligature.\n  const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const {\n    return unichars[unichar_id].properties.normed_ids;\n  }\n\n  // Return the script name of the given unichar representation.\n  // Only the first length characters from unichar_repr are used.\n  // The returned pointer will always be the same for the same script, it's\n  // managed by unicharset and thus MUST NOT be deleted\n  int get_script(const char *const unichar_repr, int length) const {\n    return get_script(unichar_to_id(unichar_repr, length));\n  }\n\n  // Return the (current) number of scripts in the script table\n  int get_script_table_size() const {\n    return script_table_size_used;\n  }\n\n  // Return the script string from its id\n  const char *get_script_from_script_id(int id) const {\n    if (id >= script_table_size_used || id < 0) {\n      return null_script;\n    }\n    return script_table[id];\n  }\n\n  // Returns the id from the name of the script, or 0 if script is not found.\n  // Note 
that this is an expensive operation since it involves iteratively\n  // comparing strings in the script table.  To avoid dependency on STL, we\n  // won't use a hash.  Instead, the calling function can use this to lookup\n  // and save the ID for relevant scripts for fast comparisons later.\n  int get_script_id_from_name(const char *script_name) const;\n\n  // Return true if the given script is the null script\n  bool is_null_script(const char *script) const {\n    return script == null_script;\n  }\n\n  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,\n  // then the returned pointer will be the same.\n  // The script parameter is copied and thus can be a temporary.\n  int add_script(const char *script);\n\n  // Return the enabled property of the given unichar.\n  bool get_enabled(UNICHAR_ID unichar_id) const {\n    ASSERT_HOST(contains_unichar_id(unichar_id));\n    return unichars[unichar_id].properties.enabled;\n  }\n\n  int null_sid() const {\n    return null_sid_;\n  }\n  int common_sid() const {\n    return common_sid_;\n  }\n  int latin_sid() const {\n    return latin_sid_;\n  }\n  int cyrillic_sid() const {\n    return cyrillic_sid_;\n  }\n  int greek_sid() const {\n    return greek_sid_;\n  }\n  int han_sid() const {\n    return han_sid_;\n  }\n  int hiragana_sid() const {\n    return hiragana_sid_;\n  }\n  int katakana_sid() const {\n    return katakana_sid_;\n  }\n  int thai_sid() const {\n    return thai_sid_;\n  }\n  int hangul_sid() const {\n    return hangul_sid_;\n  }\n  int default_sid() const {\n    return default_sid_;\n  }\n\n  // Returns true if the unicharset has the concept of upper/lower case.\n  bool script_has_upper_lower() const {\n    return script_has_upper_lower_;\n  }\n  // Returns true if the unicharset has the concept of x-height.\n  // script_has_xheight can be true even if script_has_upper_lower is not,\n  // when the script has a sufficiently predominant top line with ascenders,\n  // such as Devanagari 
and Thai.\n  bool script_has_xheight() const {\n    return script_has_xheight_;\n  }\n\nprivate:\n  struct TESS_API UNICHAR_PROPERTIES {\n    UNICHAR_PROPERTIES();\n    // Initializes all properties to sensible default values.\n    void Init();\n    // Sets all ranges wide open. Initialization default in case there are\n    // no useful values available.\n    void SetRangesOpen();\n    // Sets all ranges to empty. Used before expanding with font-based data.\n    void SetRangesEmpty();\n    // Returns true if any of the top/bottom/width/bearing/advance ranges/stats\n    // is empty.\n    bool AnyRangeEmpty() const;\n    // Expands the ranges with the ranges from the src properties.\n    void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);\n    // Copies the properties from src into this.\n    void CopyFrom(const UNICHAR_PROPERTIES &src);\n\n    bool isalpha;\n    bool islower;\n    bool isupper;\n    bool isdigit;\n    bool ispunctuation;\n    bool isngram;\n    bool enabled;\n    // Possible limits of the top and bottom of the bounding box in\n    // baseline-normalized coordinates, ie, where the baseline is\n    // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight\n    // (See normalis.h for the definitions).\n    uint8_t min_bottom;\n    uint8_t max_bottom;\n    uint8_t min_top;\n    uint8_t max_top;\n    // Statistics of the widths of bounding box, relative to the median advance.\n    float width;\n    float width_sd;\n    // Stats of the x-bearing and advance, also relative to the median advance.\n    float bearing;\n    float bearing_sd;\n    float advance;\n    float advance_sd;\n    int script_id;\n    UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar\n    Direction direction;   // direction of this unichar\n    // Mirror property is useful for reverse DAWG lookup for words in\n    // right-to-left languages (e.g. 
\"(word)\" would be in\n    // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.\n    // However, what we want in our DAWG is\n    // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not\n    // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.\n    UNICHAR_ID mirror;\n    // A string of unichar_ids that represent the corresponding normed string.\n    // For awkward characters like em-dash, this gives hyphen.\n    // For ligatures, this gives the string of normal unichars.\n    std::vector<UNICHAR_ID> normed_ids;\n    std::string normed; // normalized version of this unichar\n    // Contains meta information about the fragment if a unichar represents\n    // a fragment of a character, otherwise should be set to nullptr.\n    // It is assumed that character fragments are added to the unicharset\n    // after the corresponding 'base' characters.\n    CHAR_FRAGMENT *fragment;\n  };\n\n  struct UNICHAR_SLOT {\n    char representation[UNICHAR_LEN + 1];\n    UNICHAR_PROPERTIES properties;\n  };\n\n  // Internal recursive version of encode_string above.\n  // str is the start of the whole string.\n  // str_index is the current position in str.\n  // str_length is the length of str.\n  // encoding is a working encoding of str.\n  // lengths is a working set of lengths of each element of encoding.\n  // best_total_length is the longest length of str that has been successfully\n  // encoded so far.\n  // On return:\n  // best_encoding contains the encoding that used the longest part of str.\n  // best_lengths (may be null) contains the lengths of best_encoding.\n  void encode_string(const char *str, int str_index, int str_length,\n                     std::vector<UNICHAR_ID> *encoding,\n                     std::vector<char> *lengths, unsigned *best_total_length,\n                     std::vector<UNICHAR_ID> *best_encoding,\n                     std::vector<char> *best_lengths) const;\n\n  // Gets the properties for a grapheme string, combining 
properties for\n  // multiple characters in a meaningful way where possible.\n  // Returns false if no valid match was found in the unicharset.\n  // NOTE that script_id, mirror, and other_case refer to this unicharset on\n  // return and will need redirecting if the target unicharset is different.\n  bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;\n\n  // Load ourselves from a \"file\" where our only interface to the file is\n  // an implementation of fgets().  This is the parsing primitive accessed by\n  // the public routines load_from_file().\n  bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,\n                      bool skip_fragments);\n\n  // List of mappings to make when ingesting strings from the outside.\n  // The substitutions clean up text that should exists for rendering of\n  // synthetic data, but not in the recognition set.\n  static const char *kCleanupMaps[][2];\n  static const char *null_script;\n\n  std::vector<UNICHAR_SLOT> unichars;\n  UNICHARMAP ids;\n  char **script_table;\n  int script_table_size_used;\n  int script_table_size_reserved;\n  // True if the unichars have their tops/bottoms set.\n  bool top_bottom_set_;\n  // True if the unicharset has significant upper/lower case chars.\n  bool script_has_upper_lower_;\n  // True if the unicharset has a significant mean-line with significant\n  // ascenders above that.\n  bool script_has_xheight_;\n  // True if the set contains chars that would be changed by the cleanup.\n  bool old_style_included_;\n\n  // A few convenient script name-to-id mapping without using hash.\n  // These are initialized when unicharset file is loaded.  
Anything\n  // missing from this list can be looked up using get_script_id_from_name.\n  int null_sid_;\n  int common_sid_;\n  int latin_sid_;\n  int cyrillic_sid_;\n  int greek_sid_;\n  int han_sid_;\n  int hiragana_sid_;\n  int katakana_sid_;\n  int thai_sid_;\n  int hangul_sid_;\n  // The most frequently occurring script in the charset.\n  int default_sid_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_UNICHARSET_H_\n"
  },
  {
    "path": "src/ccutil/unicity_table.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicity_table.h\n// Description: a class to uniquify objects, manipulating them using integers\n//              ids.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_UNICITY_TABLE_H_\n#define TESSERACT_CCUTIL_UNICITY_TABLE_H_\n\n#include \"errcode.h\"\n\n#include \"genericvector.h\"\n\n#include <functional> // for std::function\n\nnamespace tesseract {\n\n// A class to uniquify objects, manipulating them using integers ids.\n// T requirements:\n//   operator= to add an element\n//   default-constructible: allocating the internal table will call the default\n//     constructor.\ntemplate <typename T>\nclass UnicityTable {\npublic:\n  /// Clear the structures and deallocate internal structures.\n  ~UnicityTable() {\n    clear();\n  }\n\n  /// Reserve some memory. 
If there is size or more elements, the table will\n  /// then allocate size * 2 elements.\n  void reserve(int size) {\n    table_.reserve(size);\n  }\n\n  /// Return the size used.\n  int size() const  {\n    return table_.size();\n  }\n\n  /// Return the object from an id.\n  const T &at(int id) const {\n    return table_.at(id);\n  }\n\n  // Return the pointer to an object with the given id.\n  T &at(int id) {\n    return table_.at(id);\n  }\n\n  T &operator[](size_t id) {\n    return table_[id];\n  }\n  const T &operator[](size_t id) const {\n    return table_[id];\n  }\n\n  /// Return the id of the T object.\n  /// This method NEEDS a compare_callback to be passed to\n  /// set_compare_callback.\n  int get_index(T object) const {\n    return table_.get_index(object);\n  }\n\n  /// Add an element in the table\n  int push_back(T object)  {\n    auto idx = get_index(object);\n    if (idx == -1) {\n      idx = table_.push_back(std::move(object));\n    }\n    return idx;\n  }\n\n  /// Add a callback to be called to delete the elements when the table took\n  /// their ownership.\n  void set_clear_callback(const std::function<void(T)> &cb) {\n    table_.set_clear_callback(cb);\n  }\n\n  /// Clear the table, calling the callback function if any.\n  /// All the owned Callbacks are also deleted.\n  /// If you don't want the Callbacks to be deleted, before calling clear, set\n  /// the callback to nullptr.\n  void clear()  {\n    table_.clear();\n  }\n\n  /// This method clear the current object, then, does a shallow copy of\n  /// its argument, and finally invalidate its argument.\n  void move(UnicityTable<T> *from) {\n    table_.move(&from->table_);\n  }\n\n  /// Read/Write the table to a file. This does _NOT_ read/write the callbacks.\n  /// The Callback given must be permanent since they will be called more than\n  /// once. 
The given callback will be deleted at the end.\n  /// Returns false on read/write error.\n  bool write(FILE *f, const std::function<bool(FILE *, const T &)> &cb) const {\n    return table_.write(f, cb);\n  }\n  bool read(tesseract::TFile *f, const std::function<bool(tesseract::TFile *, T *)> &cb) {\n    return table_.read(f, cb);\n  }\n\nprivate:\n  GenericVector<T> table_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_UNICITY_TABLE_H_\n"
  },
  {
    "path": "src/ccutil/universalambigs.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        universalambigs.h\n// Description: Data for a universal ambigs file that is useful for\n//              any language.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_\n#define TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_\n\nnamespace tesseract {\n\n#ifndef _MSC_VER\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Woverlength-strings\"\n#endif\ninline const char kUniversalAmbigsFile[] = {\n    \"v2\\n\"\n    \"'' \\\" 1\\n\"\n    \"`' \\\" 1\\n\"\n    \"'` \\\" 1\\n\"\n    \"‘' \\\" 1\\n\"\n    \"'‘ \\\" 1\\n\"\n    \"’' \\\" 1\\n\"\n    \"'’ \\\" 1\\n\"\n    \"`` \\\" 1\\n\"\n    \"`‘ \\\" 1\\n\"\n    \"‘` \\\" 1\\n\"\n    \"`’ \\\" 1\\n\"\n    \"’` \\\" 1\\n\"\n    \"‘‘ “ 1\\n\"\n    \"‘’ \\\" 1\\n\"\n    \"’‘ \\\" 1\\n\"\n    \"’’ ” 1\\n\"\n    \",, „ 1\\n\"\n    \"m rn 0\\n\"\n    \"rn m 0\\n\"\n    \"m in 0\\n\"\n    \"in m 0\\n\"\n    \"d cl 0\\n\"\n    \"cl d 0\\n\"\n    \"nn rm 0\\n\"\n    \"rm nn 0\\n\"\n    \"n ri 0\\n\"\n    \"ri n 0\\n\"\n    \"li h 0\\n\"\n    \"lr h 0\\n\"\n    \"ii u 0\\n\"\n    \"ii n 0\\n\"\n    \"ni m 0\\n\"\n    \"iii m 0\\n\"\n    \"ll H 0\\n\"\n    \"I-I H 0\\n\"\n    \"vv w 0\\n\"\n    \"VV W 0\\n\"\n    \"t f 0\\n\"\n    \"f t 
0\\n\"\n    \"a o 0\\n\"\n    \"o a 0\\n\"\n    \"e c 0\\n\"\n    \"c e 0\\n\"\n    \"rr n 0\\n\"\n    \"E fi 0\\n\"\n    \"l< k 0\\n\"\n    \"ld ki 0\\n\"\n    \"lx h 0\\n\"\n    \"xn m 0\\n\"\n    \"ux in 0\\n\"\n    \"r t 0\\n\"\n    \"d tl 0\\n\"\n    \"di th 0\\n\"\n    \"ur in 0\\n\"\n    \"un im 0\\n\"\n    \"u a 0\\n\"\n    \"o ó 0\\n\"\n    \"ó o 0\\n\"\n    \"i í 0\\n\"\n    \"í i 0\\n\"\n    \"a á 0\\n\"\n    \"á a 0\\n\"\n    \"e é 0\\n\"\n    \"é e 0\\n\"\n    \"u ú 0\\n\"\n    \"ú u 0\\n\"\n    \"n ñ 0\\n\"\n    \"ñ n 0\\n\"\n    \"0 o 0\\n\"\n    \"d tr 0\\n\"\n    \"n tr 0\\n\"\n    \"ñ fi 0\\n\"\n    \"u ti 0\\n\"\n    \"ñ ti 0\\n\"\n    \"d ti 0\\n\"\n    \"d tí 0\\n\"\n    \"d rí 0\\n\"\n    \"a à 0\\n\"\n    \"e è 0\\n\"\n    \"n ij 0\\n\"\n    \"g ij 0\\n\"\n    \"o ò 0\\n\"\n    \"E É 0\\n\"\n    \"E È 0\\n\"\n    \"u ü 0\\n\"\n    \"xnE an 1\\n\"\n    \"mYx me 1\\n\"\n    \"qtE nt 1\\n\"\n    \"Tlb le 1\\n\"\n    \"vxN va 1\\n\"\n    \"gjQ ng 1\\n\"\n    \"jpF ij 1\\n\"\n    \"Yrl le 1\\n\"\n    \"aqY an 1\\n\"\n    \"zvJ va 1\\n\"\n    \"fbL be 1\\n\"\n    \"Nvk va 1\\n\"\n    \"fJp pr 1\\n\"\n    \"wxC wa 1\\n\"\n    \"cuJ qu 1\\n\"\n    \"Qzt ta 1\\n\"\n    \"qKw wa 1\\n\"\n    \"scJ st 1\\n\"\n    \"pXp po 1\\n\"\n    \"Vqi ti 1\\n\"\n    \"Uxk ka 1\\n\"\n    \"kJv ka 1\\n\"\n    \"Ykd ka 1\\n\"\n    \"vpX va 1\\n\"\n    \"iBv ti 1\\n\"\n    \"zRb sz 1\\n\"\n    \"yTm mi 1\\n\"\n    \"mKp pr 1\\n\"\n    \"Vzq qu 1\\n\"\n    \"Xtp ti 1\\n\"\n    \"mvD va 1\\n\"\n    \"mDq me 1\\n\"\n    \"jxP ij 1\\n\"\n    \"Bxv va 1\\n\"\n    \"oIu qu 1\\n\"\n    \"Rvc va 1\\n\"\n    \"uCj qu 1\\n\"\n    \"oAo vo 1\\n\"\n    \"quB tu 1\\n\"\n    \"btV ti 1\\n\"\n    \"Lmc me 1\\n\"\n    \"tVw ti 1\\n\"\n    \"Yxv va 1\\n\"\n    \"Hxm me 1\\n\"\n    \"dVh th 1\\n\"\n    \"xYc ch 1\\n\"\n    \"uPj tu 1\\n\"\n    \"fTf fo 1\\n\"\n    \"Rjw ij 1\\n\"\n    \"xdA di 1\\n\"\n    \"jzN ij 1\\n\"\n    \"mxL me 1\\n\"\n    \"ygJ ng 1\\n\"\n    \"Vvg va 1\\n\"\n    
\"rjK ij 1\\n\"\n    \"yuV tu 1\\n\"\n    \"sWk ku 1\\n\"\n    \"Pgz sz 1\\n\"\n    \"jHm me 1\\n\"\n    \"zkU ku 1\\n\"\n    \"gvG va 1\\n\"\n    \"hdP th 1\\n\"\n    \"mVb me 1\\n\"\n    \"Qgd di 1\\n\"\n    \"zcZ ch 1\\n\"\n    \"zqj ij 1\\n\"\n    \"zsJ sz 1\\n\"\n    \"dfN di 1\\n\"\n    \"dgW di 1\\n\"\n    \"wNr ri 1\\n\"\n    \"zvC va 1\\n\"\n    \"qYw qu 1\\n\"\n    \"uHy tu 1\\n\"\n    \"tNq th 1\\n\"\n    \"lxJ li 1\\n\"\n    \"Hbk ku 1\\n\"\n    \"xsG st 1\\n\"\n    \"vSb va 1\\n\"\n    \"xFb bu 1\\n\"\n    \"Ntg th 1\\n\"\n    \"oBj ij 1\\n\"\n    \"qkv qu 1\\n\"\n    \"bVj ij 1\\n\"\n    \"zjT ij 1\\n\"\n    \"bvX va 1\\n\"\n    \"oZf to 1\\n\"\n    \"kcU ko 1\\n\"\n    \"fFm me 1\\n\"\n    \"Xbj ij 1\\n\"\n    \"Kqv va 1\\n\"\n    \"Rwj ij 1\\n\"\n    \"dvJ va 1\\n\"\n    \"znJ sz 1\\n\"\n    \"qqV qu 1\\n\"\n    \"pxM po 1\\n\"\n    \"eBj ij 1\\n\"\n    \"mJx me 1\\n\"\n    \"xnM ng 1\\n\"\n    \"aCq va 1\\n\"\n    \"pHj ij 1\\n\"\n    \"tfQ th 1\\n\"\n    \"wqn qu 1\\n\"\n    \"mSs is 1\\n\"\n    \"sBw st 1\\n\"\n    \"Fhn th 1\\n\"\n    \"zNb sz 1\\n\"\n    \"Mvb va 1\\n\"\n    \"bVt th 1\\n\"\n    \"qHt th 1\\n\"\n    \"qLv qu 1\\n\"\n    \"kgF ng 1\\n\"\n    \"vxW va 1\\n\"\n    \"cdY ch 1\\n\"\n    \"Xrz sz 1\\n\"\n    \"Efh th 1\\n\"\n    \"lqI qu 1\\n\"\n    \"Lzq qu 1\\n\"\n    \"zhX th 1\\n\"\n    \"ghZ th 1\\n\"\n    \"lFg ng 1\\n\"\n    \"vVc va 1\\n\"\n    \"lMr er 1\\n\"\n    \"Tqj qu 1\\n\"\n    \"jAx ij 1\\n\"\n    \"iMt th 1\\n\"\n    \"Nlv va 1\\n\"\n    \"zbP sz 1\\n\"\n    \"kVx ka 1\\n\"\n    \"eQl te 1\\n\"\n    \"sWb st 1\\n\"\n    \"Bqy qu 1\\n\"\n    \"dXk ka 1\\n\"\n    \"vUc va 1\\n\"\n    \"vOb va 1\\n\"\n    \"uHf qu 1\\n\"\n    \"qNr qu 1\\n\"\n    \"uFz qu 1\\n\"\n    \"Mlr er 1\\n\"\n    \"kmZ ka 1\\n\"\n    \"sRt th 1\\n\"\n    \"Wqv qu 1\\n\"\n    \"hfK th 1\\n\"\n    \"vxQ va 1\\n\"\n    \"lCq qu 1\\n\"\n    \"fYw wa 1\\n\"\n    \"tfS th 1\\n\"\n    \"qdO qu 1\\n\"\n    \"dQd de 1\\n\"\n    \"xdX de 1\\n\"\n    
\"mNx me 1\\n\"\n    \"kFz sz 1\\n\"\n    \"wjS ij 1\\n\"\n    \"yPp pr 1\\n\"\n    \"wcW ch 1\\n\"\n    \"Njz sz 1\\n\"\n    \"dVp de 1\\n\"\n    \"dqD qu 1\\n\"\n    \"rJs sz 1\\n\"\n    \"xpH po 1\\n\"\n    \"xqR qu 1\\n\"\n    \"gVr er 1\\n\"\n    \"Btq th 1\\n\"\n    \"nmB nt 1\\n\"\n    \"zcM sz 1\\n\"\n    \"cfG ch 1\\n\"\n    \"mfO me 1\\n\"\n    \"Yhc th 1\\n\"\n    \"bZm me 1\\n\"\n    \"mzB sz 1\\n\"\n    \"vRw va 1\\n\"\n    \"yDh th 1\\n\"\n    \"Zgf ng 1\\n\"\n    \"kqT qu 1\\n\"\n    \"Iuz qu 1\\n\"\n    \"rbW er 1\\n\"\n    \"Jmq qu 1\\n\"\n    \"Kvj va 1\\n\"\n    \"zcD ch 1\\n\"\n    \"xgC ng 1\\n\"\n    \"jCx ij 1\\n\"\n    \"bWg ng 1\\n\"\n    \"ywW wa 1\\n\"\n    \"Jkc ch 1\\n\"\n    \"xGs sz 1\\n\"\n    \"vbH va 1\\n\"\n    \"lTz sz 1\\n\"\n    \"eCb er 1\\n\"\n    \"jVv va 1\\n\"\n    \"jDq qu 1\\n\"\n    \"joQ po 1\\n\"\n    \"qtM th 1\\n\"\n    \"Rqk qu 1\\n\"\n    \"Hvg va 1\\n\"\n    \"uAz qu 1\\n\"\n    \"mfW me 1\\n\"\n    \"tgS th 1\\n\"\n    \"cqD qu 1\\n\"\n    \"sfY sz 1\\n\"\n    \"Yhv th 1\\n\"\n    \"uqM qu 1\\n\"\n    \"xpK pr 1\\n\"\n    \"Jzh th 1\\n\"\n    \"cQk ch 1\\n\"\n    \"tjO th 1\\n\"\n    \"qxZ qu 1\\n\"\n    \"zPv sz 1\\n\"\n    \"qNk qu 1\\n\"\n    \"lvQ va 1\\n\"\n    \"kGw ka 1\\n\"\n    \"xuD qu 1\\n\"\n    \"Jvy va 1\\n\"\n    \"jYe te 1\\n\"\n    \"fZu qu 1\\n\"\n    \"qYo qu 1\\n\"\n    \"vhI th 1\\n\"\n    \"fxY fo 1\\n\"\n    \"yPf fo 1\\n\"\n    \"fGj ij 1\\n\"\n    \"dmT me 1\\n\"\n    \"vfX va 1\\n\"\n    \"xQt th 1\\n\"\n    \"cxS ch 1\\n\"\n    \"vzA va 1\\n\"\n    \"qaA qu 1\\n\"\n    \"Jbx be 1\\n\"\n    \"kVd ka 1\\n\"\n    \"Xjv va 1\\n\"\n    \"hkI th 1\\n\"\n    \"vQu qu 1\\n\"\n    \"vhK th 1\\n\"\n    \"Dvj va 1\\n\"\n    \"Vbm me 1\\n\"\n    \"fpN pr 1\\n\"\n    \"pkG ka 1\\n\"\n    \"bLc ch 1\\n\"\n    \"tJc th 1\\n\"\n    \"wwJ wa 1\\n\"\n    \"Zrw er 1\\n\"\n    \"wdW de 1\\n\"\n    \"Wgf ng 1\\n\"\n    \"Pqz qu 1\\n\"\n    \"wgN ng 1\\n\"\n    \"zHt th 1\\n\"\n    \"xTl le 1\\n\"\n    
\"Dvt th 1\\n\"\n    \"wmU me 1\\n\"\n    \"xhm th 1\\n\"\n    \"hCx th 1\\n\"\n    \"vwV va 1\\n\"\n    \"zvL va 1\\n\"\n    \"nGf nt 1\\n\"\n    \"jjC ij 1\\n\"\n    \"Ucg ch 1\\n\"\n    \"pWf pr 1\\n\"\n    \"jxG ij 1\\n\"\n    \"Mqn qu 1\\n\"\n    \"yvW va 1\\n\"\n    \"lWk ka 1\\n\"\n    \"mdO me 1\\n\"\n    \"qNm qu 1\\n\"\n    \"Rwg ng 1\\n\"\n    \"xfv va 1\\n\"\n    \"uOw qu 1\\n\"\n    \"xhZ th 1\\n\"\n    \"jLr er 1\\n\"\n    \"fBy fo 1\\n\"\n    \"nUj nt 1\\n\"\n    \"lTg ng 1\\n\"\n    \"jlP ij 1\\n\"\n    \"wrR er 1\\n\"\n    \"rXw er 1\\n\"\n    \"eVw ve 1\\n\"\n    \"zWn ng 1\\n\"\n    \"mJs sz 1\\n\"\n    \"Mgy ng 1\\n\"\n    \"uZq qu 1\\n\"\n    \"Tdg ng 1\\n\"\n    \"mqI qu 1\\n\"\n    \"Dhp th 1\\n\"\n    \"pmK me 1\\n\"\n    \"Ssf sz 1\\n\"\n    \"sWl sz 1\\n\"\n    \"iqK qu 1\\n\"\n    \"gjG ng 1\\n\"\n    \"djB ij 1\\n\"\n    \"wKv va 1\\n\"\n    \"wvI va 1\\n\"\n    \"tcU th 1\\n\"\n    \"tkG th 1\\n\"\n    \"zUe te 1\\n\"\n    \"lUh th 1\\n\"\n    \"nBg nt 1\\n\"\n    \"dHx de 1\\n\"\n    \"Wbz sz 1\\n\"\n    \"vuQ qu 1\\n\"\n    \"Hpl le 1\\n\"\n    \"oVj ij 1\\n\"\n    \"vBb va 1\\n\"\n    \"Tdz sz 1\\n\"\n    \"pfV pr 1\\n\"\n    \"qgN qu 1\\n\"\n    \"pcU ch 1\\n\"\n    \"gcN ch 1\\n\"\n    \"vkA va 1\\n\"\n    \"cQf ch 1\\n\"\n    \"Yzx sz 1\\n\"\n    \"ypF pr 1\\n\"\n    \"vBw va 1\\n\"\n    \"pPd de 1\\n\"\n    \"qmU qu 1\\n\"\n    \"eWf ve 1\\n\"\n    \"jZr er 1\\n\"\n    \"Hwl le 1\\n\"\n    \"yyI ny 1\\n\"\n    \"Zfh th 1\\n\"\n    \"Lgw ng 1\\n\"\n    \"uqp qu 1\\n\"\n    \"xOj ij 1\\n\"\n    \"dkJ ko 1\\n\"\n    \"dqM qu 1\\n\"\n    \"sbW is 1\\n\"\n    \"zMp sz 1\\n\"\n    \"nJz ng 1\\n\"\n    \"kMc ko 1\\n\"\n    \"zqW qu 1\\n\"\n    \"vQk va 1\\n\"\n    \"eqD qu 1\\n\"\n    \"hFn th 1\\n\"\n    \"vcZ ch 1\\n\"\n    \"xGk ka 1\\n\"\n    \"kzf sz 1\\n\"\n    \"xZx xe 1\\n\"\n    \"qvN qu 1\\n\"\n    \"ykY ka 1\\n\"\n    \"brH er 1\\n\"\n    \"Wrh th 1\\n\"\n    \"wjE ij 1\\n\"\n    \"kjQ ka 1\\n\"\n    \"fLj ij 1\\n\"\n    
\"mgE ng 1\\n\"\n    \"xwI wa 1\\n\"\n    \"iDw ti 1\\n\"\n    \"Btx th 1\\n\"\n    \"vPz va 1\\n\"\n    \"yqH qu 1\\n\"\n    \"wFe er 1\\n\"\n    \"lQy le 1\\n\"\n    \"gBp ng 1\\n\"\n    \"jdY de 1\\n\"\n    \"tvQ th 1\\n\"\n    \"ljO le 1\\n\"\n    \"Nsq qu 1\\n\"\n    \"xdO de 1\\n\"\n    \"gzW ng 1\\n\"\n    \"wtM th 1\\n\"\n    \"qfR qu 1\\n\"\n    \"jZh th 1\\n\"\n    \"Wcb ch 1\\n\"\n    \"dvQ va 1\\n\"\n    \"jHb ij 1\\n\"\n    \"xbM be 1\\n\"\n    \"nWg nt 1\\n\"\n    \"Ywj ij 1\\n\"\n    \"Xwj ij 1\\n\"\n    \"pxK pr 1\\n\"\n    \"ybQ be 1\\n\"\n    \"Wvm va 1\\n\"\n    \"Lgz ng 1\\n\"\n    \"btS th 1\\n\"\n    \"jRl le 1\\n\"\n    \"qqJ qu 1\\n\"\n    \"Cnq qu 1\\n\"\n    \"Fmw me 1\\n\"\n    \"dvP va 1\\n\"\n    \"vqB qu 1\\n\"\n    \"djI de 1\\n\"\n    \"jVq qu 1\\n\"\n    \"fvZ va 1\\n\"\n    \"Cwt th 1\\n\"\n    \"Uyb be 1\\n\"\n    \"Ffc ch 1\\n\"\n    \"soX sz 1\\n\"\n    \"qhR th 1\\n\"\n    \"fWz sz 1\\n\"\n    \"vrX va 1\\n\"\n    \"eOq qu 1\\n\"\n    \"bwZ be 1\\n\"\n    \"dnV ng 1\\n\"\n    \"Gbw be 1\\n\"\n    \"xGd de 1\\n\"\n    \"mnZ ng 1\\n\"\n    \"bpN pr 1\\n\"\n    \"dzX de 1\\n\"\n    \"Bxq qu 1\\n\"\n    \"zpx sz 1\\n\"\n    \"dqZ qu 1\\n\"\n    \"xTf fo 1\\n\"\n    \"wPv va 1\\n\"\n    \"cxq qu 1\\n\"\n    \"hdT th 1\\n\"\n    \"ywX wa 1\\n\"\n    \"Uvv va 1\\n\"\n    \"rKp er 1\\n\"\n    \"sdF de 1\\n\"\n    \"Jcg ch 1\\n\"\n    \"xzO sz 1\\n\"\n    \"xTt th 1\\n\"\n    \"djP de 1\\n\"\n    \"gTn ng 1\\n\"\n    \"Gtp th 1\\n\"\n    \"xgA ng 1\\n\"\n    \"bdL de 1\\n\"\n    \"wzO sz 1\\n\"\n    \"fhI th 1\\n\"\n    \"Wmp me 1\\n\"\n    \"Qdt th 1\\n\"\n    \"uYq qu 1\\n\"\n    \"pbJ pr 1\\n\"\n    \"jRd de 1\\n\"\n    \"Xsx sz 1\\n\"\n    \"zgI ng 1\\n\"\n    \"qhY th 1\\n\"\n    \"Ggj ng 1\\n\"\n    \"Fjq qu 1\\n\"\n    \"Qwk ka 1\\n\"\n    \"zxW sz 1\\n\"\n    \"vCc ch 1\\n\"\n    \"ccL ch 1\\n\"\n    \"Kxs sz 1\\n\"\n    \"mYr er 1\\n\"\n    \"rQt er 1\\n\"\n    \"Zxs sz 1\\n\"\n    \"hdQ th 1\\n\"\n    \"dwH de 1\\n\"\n    
\"Yml le 1\\n\"\n    \"qVz qu 1\\n\"\n    \"Rvl va 1\\n\"\n    \"yHk ka 1\\n\"\n    \"Wjt th 1\\n\"\n    \"hMw th 1\\n\"\n    \"pzU sz 1\\n\"\n    \"gcL ch 1\\n\"\n    \"qOa qu 1\\n\"\n    \"eqI qu 1\\n\"\n    \"iYp ti 1\\n\"\n    \"vCq qu 1\\n\"\n    \"uoV ro 1\\n\"\n    \"fZx fo 1\\n\"\n    \"qQd qu 1\\n\"\n    \"qdE qu 1\\n\"\n    \"qWx qu 1\\n\"\n    \"Ykj ij 1\\n\"\n    \"Fpj ij 1\\n\"\n    \"zGv va 1\\n\"\n    \"rwO er 1\\n\"\n    \"Qzq qu 1\\n\"\n    \"Kqb qu 1\\n\"\n    \"zgT ng 1\\n\"\n    \"jsZ sz 1\\n\"\n    \"aHq qu 1\\n\"\n    \"yjL ij 1\\n\"\n    \"Ycw ch 1\\n\"\n    \"bnP an 1\\n\"\n    \"vWn an 1\\n\"\n    \"zyY sz 1\\n\"\n    \"zRs st 1\\n\"\n    \"wuP qu 1\\n\"\n    \"vjB va 1\\n\"\n    \"jrT er 1\\n\"\n    \"vwJ va 1\\n\"\n    \"dVj de 1\\n\"\n    \"zvW va 1\\n\"\n    \"dZk de 1\\n\"\n    \"nrG an 1\\n\"\n    \"qsU qu 1\\n\"\n    \"Pvs va 1\\n\"\n    \"lLh th 1\\n\"\n    \"qCz qu 1\\n\"\n    \"dvV de 1\\n\"\n    \"Pjw ij 1\\n\"\n    \"Kmj ij 1\\n\"\n    \"Jfh th 1\\n\"\n    \"nwY an 1\\n\"\n    \"gwC ng 1\\n\"\n    \"vGb va 1\\n\"\n    \"qWr qu 1\\n\"\n    \"qpW qu 1\\n\"\n    \"dKk de 1\\n\"\n    \"yWb be 1\\n\"\n    \"jmN ij 1\\n\"\n    \"gpV ng 1\\n\"\n    \"qzS qu 1\\n\"\n    \"oZh th 1\\n\"\n    \"Qmt th 1\\n\"\n    \"mNk me 1\\n\"\n    \"ypM pr 1\\n\"\n    \"lwH le 1\\n\"\n    \"zHs sz 1\\n\"\n    \"jzC jo 1\\n\"\n    \"oJh th 1\\n\"\n    \"Lqh th 1\\n\"\n    \"hXg th 1\\n\"\n    \"xEf fo 1\\n\"\n    \"uWx qu 1\\n\"\n    \"kvT va 1\\n\"\n    \"zsG sz 1\\n\"\n    \"lSx le 1\\n\"\n    \"qKb qu 1\\n\"\n    \"Qye de 1\\n\"\n    \"xHk ka 1\\n\"\n    \"Cwp pr 1\\n\"\n    \"zmJ sz 1\\n\"\n    \"xuL qu 1\\n\"\n    \"bdH de 1\\n\"\n    \"Pbw wa 1\\n\"\n    \"qdX qu 1\\n\"\n    \"lVc ch 1\\n\"\n    \"bqL qu 1\\n\"\n    \"wNs sz 1\\n\"\n    \"vzN va 1\\n\"\n    \"qjA qu 1\\n\"\n    \"Zhf th 1\\n\"\n    \"ypJ pr 1\\n\"\n    \"xMq qu 1\\n\"\n    \"bTk ka 1\\n\"\n    \"tLf th 1\\n\"\n    \"xgR ng 1\\n\"\n    \"kQz sz 1\\n\"\n    \"Rjp ij 1\\n\"\n    
\"xhG th 1\\n\"\n    \"bCc ch 1\\n\"\n    \"hbF th 1\\n\"\n    \"rxQ er 1\\n\"\n    \"qVp qu 1\\n\"\n    \"bkY ka 1\\n\"\n    \"qPl qu 1\\n\"\n    \"jQk ij 1\\n\"\n    \"Ovq qu 1\\n\"\n    \"sVv va 1\\n\"\n    \"pmU me 1\\n\"\n    \"uFv qu 1\\n\"\n    \"xaZ va 1\\n\"\n    \"gGn an 1\\n\"\n    \"pgI ng 1\\n\"\n    \"zTj sz 1\\n\"\n    \"lvC va 1\\n\"\n    \"wGv va 1\\n\"\n    \"rNv va 1\\n\"\n    \"Qtq th 1\\n\"\n    \"vNh th 1\\n\"\n    \"lPv va 1\\n\"\n    \"Jdq qu 1\\n\"\n    \"Xdj de 1\\n\"\n    \"yqk qu 1\\n\"\n    \"iwY ti 1\\n\"\n    \"Nmq qu 1\\n\"\n    \"fTp pr 1\\n\"\n    \"qzQ qu 1\\n\"\n    \"pjA ij 1\\n\"\n    \"pvH va 1\\n\"\n    \"xLj ij 1\\n\"\n    \"qWh th 1\\n\"\n    \"vVq qu 1\\n\"\n    \"gQd de 1\\n\"\n    \"svY va 1\\n\"\n    \"fLf fo 1\\n\"\n    \"qzB qu 1\\n\"\n    \"Dxg ng 1\\n\"\n    \"uzY qu 1\\n\"\n    \"gVz sz 1\\n\"\n    \"hZb th 1\\n\"\n    \"Gpx pr 1\\n\"\n    \"xqh th 1\\n\"\n    \"gcX ch 1\\n\"\n    \"Hxd de 1\\n\"\n    \"tUq th 1\\n\"\n    \"bKp pr 1\\n\"\n    \"iGx ti 1\\n\"\n    \"xvQ va 1\\n\"\n    \"lxA le 1\\n\"\n    \"sjH st 1\\n\"\n    \"Gqo qu 1\\n\"\n    \"dgQ de 1\\n\"\n    \"yDk ka 1\\n\"\n    \"Znv va 1\\n\"\n    \"vfU va 1\\n\"\n    \"vuD qu 1\\n\"\n    \"oQj ij 1\\n\"\n    \"bhD th 1\\n\"\n    \"qLj qu 1\\n\"\n    \"mdY de 1\\n\"\n    \"rZb er 1\\n\"\n    \"kDv va 1\\n\"\n    \"fsK sz 1\\n\"\n    \"Kqf qu 1\\n\"\n    \"yWl le 1\\n\"\n    \"mVw me 1\\n\"\n    \"mcV ch 1\\n\"\n    \"tDf th 1\\n\"\n    \"lAo le 1\\n\"\n    \"fzR sz 1\\n\"\n    \"Xrq qu 1\\n\"\n    \"jrZ er 1\\n\"\n    \"qmN qu 1\\n\"\n    \"Jnp an 1\\n\"\n    \"jhC th 1\\n\"\n    \"kqR qu 1\\n\"\n    \"dWn de 1\\n\"\n    \"Wmw me 1\\n\"\n    \"Rgy ng 1\\n\"\n    \"uvN qu 1\\n\"\n    \"jiY ti 1\\n\"\n    \"xWc ch 1\\n\"\n    \"yJr er 1\\n\"\n    \"oHq qu 1\\n\"\n    \"yvw va 1\\n\"\n    \"Ydn de 1\\n\"\n    \"Nvq qu 1\\n\"\n    \"Gmv va 1\\n\"\n    \"xxZ xe 1\\n\"\n    \"Xdf de 1\\n\"\n    \"xYh th 1\\n\"\n    \"Vnv an 1\\n\"\n    \"jNz sz 1\\n\"\n    
\"Wnq qu 1\\n\"\n    \"Xwk ka 1\\n\"\n    \"qWz qu 1\\n\"\n    \"mQs sz 1\\n\"\n    \"Vxb be 1\\n\"\n    \"xwG wa 1\\n\"\n    \"wvp va 1\\n\"\n    \"gmV ng 1\\n\"\n    \"Rzq qu 1\\n\"\n    \"Cpw pr 1\\n\"\n    \"Gyy ny 1\\n\"\n    \"xzA sz 1\\n\"\n    \"wGx wa 1\\n\"\n    \"bqS qu 1\\n\"\n    \"whR th 1\\n\"\n    \"jPc ch 1\\n\"\n    \"iqG qu 1\\n\"\n    \"djK de 1\\n\"\n    \"cVk ch 1\\n\"\n    \"rwT er 1\\n\"\n    \"Vhn th 1\\n\"\n    \"Hfw wa 1\\n\"\n    \"bnJ an 1\\n\"\n    \"Cpd de 1\\n\"\n    \"Nmd de 1\\n\"\n    \"dnO an 1\\n\"\n    \"qWc qu 1\\n\"\n    \"aVq qu 1\\n\"\n    \"qOn qu 1\\n\"\n    \"Qlr er 1\\n\"\n    \"qnN qu 1\\n\"\n    \"rLq qu 1\\n\"\n    \"wtE th 1\\n\"\n    \"jgR ng 1\\n\"\n    \"Yqp qu 1\\n\"\n    \"Hwg ng 1\\n\"\n    \"nWk an 1\\n\"\n    \"wqB qu 1\\n\"\n    \"fAp pr 1\\n\"\n    \"hZv th 1\\n\"\n    \"Kzp sz 1\\n\"\n    \"fNk ka 1\\n\"\n    \"Tkd de 1\\n\"\n    \"uYm qu 1\\n\"\n    \"kcR ch 1\\n\"\n    \"xNl le 1\\n\"\n    \"kHk ka 1\\n\"\n    \"bJk ka 1\\n\"\n    \"jjD ij 1\\n\"\n    \"Nlq qu 1\\n\"\n    \"dhB th 1\\n\"\n    \"jXl le 1\\n\"\n    \"nwB an 1\\n\"\n    \"Hzb sz 1\\n\"\n    \"qQz qu 1\\n\"\n    \"fKc ch 1\\n\"\n    \"jVw ij 1\\n\"\n    \"ylU le 1\\n\"\n    \"Lzj sz 1\\n\"\n    \"sXu qu 1\\n\"\n    \"wBw wa 1\\n\"\n    \"Iqg qu 1\\n\"\n    \"wjV ij 1\\n\"\n    \"wxt th 1\\n\"\n    \"jzK sz 1\\n\"\n    \"rDd de 1\\n\"\n    \"uQy qu 1\\n\"\n    \"qGw qu 1\\n\"\n    \"tbU th 1\\n\"\n    \"kUo ka 1\\n\"\n    \"dVm de 1\\n\"\n    \"Ddn an 1\\n\"\n    \"vqC vo 1\\n\"\n    \"jkZ ij 1\\n\"\n    \"Lvz va 1\\n\"\n    \"tPy th 1\\n\"\n    \"Vfj ij 1\\n\"\n    \"Qhb th 1\\n\"\n    \"whB th 1\\n\"\n    \"Fqf qu 1\\n\"\n    \"hCv th 1\\n\"\n    \"Fjf ij 1\\n\"\n    \"Qfr er 1\\n\"\n    \"zwF sz 1\\n\"\n    \"Fwf wa 1\\n\"\n    \"pvU va 1\\n\"\n    \"whC th 1\\n\"\n    \"hTk th 1\\n\"\n    \"dlQ de 1\\n\"\n    \"wzL sz 1\\n\"\n    \"zqS qu 1\\n\"\n    \"qtP th 1\\n\"\n    \"yhC th 1\\n\"\n    \"yjB ij 1\\n\"\n    \"iTd de 1\\n\"\n    
\"kLx ka 1\\n\"\n    \"Rqi qu 1\\n\"\n    \"qjS qu 1\\n\"\n    \"vjI va 1\\n\"\n    \"pGz sz 1\\n\"\n    \"wnV an 1\\n\"\n    \"lQx le 1\\n\"\n    \"uvS qu 1\\n\"\n    \"Zge de 1\\n\"\n    \"gJv ng 1\\n\"\n    \"Ydb de 1\\n\"\n    \"wDh th 1\\n\"\n    \"zwV sz 1\\n\"\n    \"hNm th 1\\n\"\n    \"zwQ sz 1\\n\"\n    \"fRr er 1\\n\"\n    \"wVr er 1\\n\"\n    \"nKg an 1\\n\"\n    \"Tgg ng 1\\n\"\n    \"bYp pr 1\\n\"\n    \"lBn an 1\\n\"\n    \"zjp sz 1\\n\"\n    \"qAf qu 1\\n\"\n    \"zmK me 1\\n\"\n    \"wqK qu 1\\n\"\n    \"vjT va 1\\n\"\n    \"Lql qu 1\\n\"\n    \"snC an 1\\n\"\n    \"fzY sz 1\\n\"\n    \"vqU qu 1\\n\"\n    \"mGb me 1\\n\"\n    \"fkP ka 1\\n\"\n    \"wQg ng 1\\n\"\n    \"Fqt th 1\\n\"\n    \"bVm me 1\\n\"\n    \"Wcx ch 1\\n\"\n    \"wpY wa 1\\n\"\n    \"lFv va 1\\n\"\n    \"gwD ng 1\\n\"\n    \"gWp ng 1\\n\"\n    \"fjT ij 1\\n\"\n    \"pFt th 1\\n\"\n    \"iIp in 1\\n\"\n    \"tbD th 1\\n\"\n    \"Xqc qu 1\\n\"\n    \"Qkc ch 1\\n\"\n    \"qeZ qu 1\\n\"\n    \"qPb qu 1\\n\"\n    \"gwL ng 1\\n\"\n    \"fHi in 1\\n\"\n    \"xwP wa 1\\n\"\n    \"xvB va 1\\n\"\n    \"jSw ij 1\\n\"\n    \"pzF sz 1\\n\"\n    \"wYp wa 1\\n\"\n    \"dDx de 1\\n\"\n    \"nBx an 1\\n\"\n    \"cNv ch 1\\n\"\n    \"Ubm me 1\\n\"\n    \"xXu qu 1\\n\"\n    \"dRl de 1\\n\"\n    \"dBz de 1\\n\"\n    \"Xvh th 1\\n\"\n    \"Xld de 1\\n\"\n    \"mwY me 1\\n\"\n    \"whQ th 1\\n\"\n    \"Mzl le 1\\n\"\n    \"Aqj qu 1\\n\"\n    \"uDp qu 1\\n\"\n    \"cjZ ch 1\\n\"\n    \"Vkf ka 1\\n\"\n    \"uGq qu 1\\n\"\n    \"hBs th 1\\n\"\n    \"qLh th 1\\n\"\n    \"tfW th 1\\n\"\n    \"cPn an 1\\n\"\n    \"xoN on 1\\n\"\n    \"Ydx de 1\\n\"\n    \"Lxk ka 1\\n\"\n    \"ccZ ch 1\\n\"\n    \"uJh th 1\\n\"\n    \"sVp sz 1\\n\"\n    \"wrE er 1\\n\"\n    \"xgP ng 1\\n\"\n    \"hPp th 1\\n\"\n    \"euU qu 1\\n\"\n    \"sZh th 1\\n\"\n    \"qnK qu 1\\n\"\n    \"Bgh th 1\\n\"\n    \"slQ le 1\\n\"\n    \"gxA ng 1\\n\"\n    \"jLd de 1\\n\"\n    \"znD an 1\\n\"\n    \"kXk ka 1\\n\"\n    \"tfV th 1\\n\"\n    
\"Vwl le 1\\n\"\n    \"xWd do 1\\n\"\n    \"xnH an 1\\n\"\n    \"cOq ch 1\\n\"\n    \"Lkk ka 1\\n\"\n    \"Nvy va 1\\n\"\n    \"xIh th 1\\n\"\n    \"xkK ka 1\\n\"\n    \"rMr er 1\\n\"\n    \"rmQ er 1\\n\"\n    \"bPn an 1\\n\"\n    \"fAa an 1\\n\"\n    \"vQv va 1\\n\"\n    \"fHr er 1\\n\"\n    \"Pmv va 1\\n\"\n    \"vzJ sz 1\\n\"\n    \"wTg ng 1\\n\"\n    \"bWc ch 1\\n\"\n    \"Zwg ng 1\\n\"\n    \"gKx ng 1\\n\"\n    \"Gbq qu 1\\n\"\n    \"wMk ka 1\\n\"\n    \"Nfx fo 1\\n\"\n    \"fAo on 1\\n\"\n    \"dHb de 1\\n\"\n    \"lxH le 1\\n\"\n    \"dqO qu 1\\n\"\n    \"Tlq qu 1\\n\"\n    \"Yjj ij 1\\n\"\n    \"Iyh th 1\\n\"\n    \"uoY qu 1\\n\"\n    \"mhH th 1\\n\"\n    \"lMj le 1\\n\"\n    \"fzF sz 1\\n\"\n    \"frR er 1\\n\"\n    \"yNl le 1\\n\"\n    \"aPv an 1\\n\"\n    \"ywG wa 1\\n\"\n    \"Cmw me 1\\n\"\n    \"svK va 1\\n\"\n    \"srO er 1\\n\"\n    \"Uhz th 1\\n\"\n    \"vPn an 1\\n\"\n    \"zTq qu 1\\n\"\n    \"kzH sz 1\\n\"\n    \"Iox on 1\\n\"\n    \"fQa an 1\\n\"\n    \"wZr er 1\\n\"\n    \"nqU an 1\\n\"\n    \"wPb wa 1\\n\"\n    \"Tzg ng 1\\n\"\n    \"pnR an 1\\n\"\n    \"vfJ va 1\\n\"\n    \"vyX va 1\\n\"\n    \"fLz sz 1\\n\"\n    \"zjP sz 1\\n\"\n    \"pmR me 1\\n\"\n    \"ePq qu 1\\n\"\n    \"jyT ij 1\\n\"\n    \"mjP ij 1\\n\"\n    \"fsH sz 1\\n\"\n    \"vwB va 1\\n\"\n    \"Ynr an 1\\n\"\n    \"Tqh th 1\\n\"\n    \"Lvv va 1\\n\"\n    \"tCf th 1\\n\"\n    \"wpB wa 1\\n\"\n    \"wXh th 1\\n\"\n    \"mhX th 1\\n\"\n    \"kYd de 1\\n\"\n    \"Dpg ng 1\\n\"\n    \"ygR ng 1\\n\"\n    \"Rfp pr 1\\n\"\n    \"Jyq qu 1\\n\"\n    \"yxq qu 1\\n\"\n    \"pPc ch 1\\n\"\n    \"aOj an 1\\n\"\n    \"Zww wa 1\\n\"\n    \"fFx fo 1\\n\"\n    \"bDh th 1\\n\"\n    \"qKx qu 1\\n\"\n    \"wHx wa 1\\n\"\n    \"hrX th 1\\n\"\n    \"rFh th 1\\n\"\n    \"lLx le 1\\n\"\n    \"aYj an 1\\n\"\n    \"kCs sz 1\\n\"\n    \"lWt th 1\\n\"\n    \"pdY de 1\\n\"\n    \"swI sz 1\\n\"\n    \"bLw wa 1\\n\"\n    \"Mzx sz 1\\n\"\n    \"cKk ch 1\\n\"\n    \"hMz th 1\\n\"\n    \"Jcu qu 1\\n\"\n    
\"wjB ij 1\\n\"\n    \"Mqe qu 1\\n\"\n    \"rxW er 1\\n\"\n    \"gZv ng 1\\n\"\n    \"Rfn an 1\\n\"\n    \"pwD wa 1\\n\"\n    \"lhX th 1\\n\"\n    \"fVg ng 1\\n\"\n    \"vfW va 1\\n\"\n    \"lxP le 1\\n\"\n    \"Yyj ij 1\\n\"\n    \"hPg th 1\\n\"\n    \"Uxq qu 1\\n\"\n    \"bdO de 1\\n\"\n    \"bRz sz 1\\n\"\n    \"dXq qu 1\\n\"\n    \"Rjq qu 1\\n\"\n    \"fgV ng 1\\n\"\n    \"xAf fo 1\\n\"\n    \"wXn an 1\\n\"\n    \"Kvv va 1\\n\"\n    \"svL va 1\\n\"\n    \"fWv va 1\\n\"\n    \"drQ er 1\\n\"\n    \"Lpv va 1\\n\"\n    \"qKp qu 1\\n\"\n    \"eCv er 1\\n\"\n    \"xwH wa 1\\n\"\n    \"cvC ch 1\\n\"\n    \"kUf ka 1\\n\"\n    \"oPx on 1\\n\"\n    \"tjJ th 1\\n\"\n    \"bBk ka 1\\n\"\n    \"vpI va 1\\n\"\n    \"gzY ng 1\\n\"\n    \"oZs on 1\\n\"\n    \"pKc ch 1\\n\"\n    \"xKs sz 1\\n\"\n    \"qcH qu 1\\n\"\n    \"Vfm me 1\\n\"\n    \"svM va 1\\n\"\n    \"Vjx ij 1\\n\"\n    \"lVw le 1\\n\"\n    \"wWf wa 1\\n\"\n    \"Xpx pr 1\\n\"\n    \"lcA ch 1\\n\"\n    \"tLc th 1\\n\"\n    \"lDg ng 1\\n\"\n    \"Xjh th 1\\n\"\n    \"Xdh th 1\\n\"\n    \"rKm er 1\\n\"\n    \"fnW an 1\\n\"\n    \"Tcb ch 1\\n\"\n    \"qgX qu 1\\n\"\n    \"qZo qu 1\\n\"\n    \"eJv er 1\\n\"\n    \"Yxy ny 1\\n\"\n    \"kfM ka 1\\n\"\n    \"qKe qu 1\\n\"\n    \"vMf va 1\\n\"\n    \"dgY de 1\\n\"\n    \"gGd ng 1\\n\"\n    \"Vcj ch 1\\n\"\n    \"Sfw wa 1\\n\"\n    \"xDk ka 1\\n\"\n    \"fTc ch 1\\n\"\n    \"qRw qu 1\\n\"\n    \"tOa th 1\\n\"\n    \"guQ qu 1\\n\"\n    \"mgJ ng 1\\n\"\n    \"bRd de 1\\n\"\n    \"kYq qu 1\\n\"\n    \"xwD wa 1\\n\"\n    \"vXs va 1\\n\"\n    \"zlC le 1\\n\"\n    \"kmH ka 1\\n\"\n    \"jhZ th 1\\n\"\n    \"Wxo on 1\\n\"\n    \"vtX th 1\\n\"\n    \"iWm in 1\\n\"\n    \"qVx qu 1\\n\"\n    \"Hjv va 1\\n\"\n    \"Pxs sz 1\\n\"\n    \"bYi in 1\\n\"\n    \"wgG ng 1\\n\"\n    \"Jvs va 1\\n\"\n    \"gHh th 1\\n\"\n    \"Kzy sz 1\\n\"\n    \"xjI ij 1\\n\"\n    \"uVb qu 1\\n\"\n    \"Pzq qu 1\\n\"\n    \"hxC th 1\\n\"\n    \"wPy wa 1\\n\"\n    \"bXh th 1\\n\"\n    \"jzY sz 1\\n\"\n    
\"fqJ qu 1\\n\"\n    \"qxX qu 1\\n\"\n    \"vfB va 1\\n\"\n    \"pPm me 1\\n\"\n    \"bpC pr 1\\n\"\n    \"hFv th 1\\n\"\n    \"Cql qu 1\\n\"\n    \"dwI de 1\\n\"\n    \"Tcq ch 1\\n\"\n    \"Zjx ij 1\\n\"\n    \"wOz sz 1\\n\"\n    \"Jfj ij 1\\n\"\n    \"iZr in 1\\n\"\n    \"Vxf fo 1\\n\"\n    \"Lpx pr 1\\n\"\n    \"fHt th 1\\n\"\n    \"hFy th 1\\n\"\n    \"lcD ch 1\\n\"\n    \"vMc ch 1\\n\"\n    \"xyU ny 1\\n\"\n    \"mGq qu 1\\n\"\n    \"wJv va 1\\n\"\n    \"zKs sz 1\\n\"\n    \"lMm le 1\\n\"\n    \"mqU qu 1\\n\"\n    \"vHg ng 1\\n\"\n    \"lGc ch 1\\n\"\n    \"eIj te 1\\n\"\n    \"Vdh th 1\\n\"\n    \"rCk er 1\\n\"\n    \"wQh th 1\\n\"\n    \"Ywf wa 1\\n\"\n    \"zUf sz 1\\n\"\n    \"qZs qu 1\\n\"\n    \"vNt th 1\\n\"\n    \"Dxj ij 1\\n\"\n    \"cYr ch 1\\n\"\n    \"dKt th 1\\n\"\n    \"vDp va 1\\n\"\n    \"qnF an 1\\n\"\n    \"Lsj sz 1\\n\"\n    \"xHv va 1\\n\"\n    \"jCt th 1\\n\"\n    \"bnX an 1\\n\"\n    \"fBx fo 1\\n\"\n    \"jVt th 1\\n\"\n    \"qOy qu 1\\n\"\n    \"uqD qu 1\\n\"\n    \"Rfw wa 1\\n\"\n    \"cjS ch 1\\n\"\n    \"ufX qu 1\\n\"\n    \"fvI va 1\\n\"\n    \"Owx wa 1\\n\"\n    \"gXw ng 1\\n\"\n    \"oCv va 1\\n\"\n    \"Mrx er 1\\n\"\n    \"cIb ch 1\\n\"\n    \"fJj ij 1\\n\"\n    \"kqM qu 1\\n\"\n    \"zqL qu 1\\n\"\n    \"rPz er 1\\n\"\n    \"iwW in 1\\n\"\n    \"cMp ch 1\\n\"\n    \"lVt th 1\\n\"\n    \"vTb va 1\\n\"\n    \"Iwf wa 1\\n\"\n    \"xlZ le 1\\n\"\n    \"vjQ va 1\\n\"\n    \"iPb in 1\\n\"\n    \"Whk th 1\\n\"\n    \"Wvh th 1\\n\"\n    \"mzD sz 1\\n\"\n    \"Hqk qu 1\\n\"\n    \"jqB qu 1\\n\"\n    \"qhM th 1\\n\"\n    \"prR er 1\\n\"\n    \"nlV an 1\\n\"\n    \"qYk qu 1\\n\"\n    \"zVp sz 1\\n\"\n    \"vpO va 1\\n\"\n    \"Rvr er 1\\n\"\n    \"scY ch 1\\n\"\n    \"qdA qu 1\\n\"\n    \"vLk va 1\\n\"\n    \"svI va 1\\n\"\n    \"mdE de 1\\n\"\n    \"hBx th 1\\n\"\n    \"Zrv er 1\\n\"\n    \"jWt th 1\\n\"\n    \"fTx fo 1\\n\"\n    \"Ypc ch 1\\n\"\n    \"mMk ka 1\\n\"\n    \"fdq qu 1\\n\"\n    \"hcK th 1\\n\"\n    \"xCy ny 1\\n\"\n    
\"fVr er 1\\n\"\n    \"aPx an 1\\n\"\n    \"fpU pr 1\\n\"\n    \"Vkb ka 1\\n\"\n    \"tbM th 1\\n\"\n    \"zQt th 1\\n\"\n    \"gxV ng 1\\n\"\n    \"Sfg ng 1\\n\"\n    \"pYl le 1\\n\"\n    \"gWt th 1\\n\"\n    \"xEb be 1\\n\"\n    \"mXy me 1\\n\"\n    \"lnQ an 1\\n\"\n    \"qmL qu 1\\n\"\n    \"Vky ka 1\\n\"\n    \"wwX wa 1\\n\"\n    \"Uwx wa 1\\n\"\n    \"cfB ch 1\\n\"\n    \"Gxp pr 1\\n\"\n    \"fpL pr 1\\n\"\n    \"jTx ij 1\\n\"\n    \"cZv ch 1\\n\"\n    \"zlK le 1\\n\"\n    \"hBc th 1\\n\"\n    \"Wqi qu 1\\n\"\n    \"lGs le 1\\n\"\n    \"Dqz qu 1\\n\"\n    \"Jgw ng 1\\n\"\n    \"gCx ng 1\\n\"\n    \"cNj ch 1\\n\"\n    \"cqJ ch 1\\n\"\n    \"blD le 1\\n\"\n    \"qXr qu 1\\n\"\n    \"kXr er 1\\n\"\n    \"khK th 1\\n\"\n    \"xZh th 1\\n\"\n    \"jSs sz 1\\n\"\n    \"yjx ij 1\\n\"\n    \"Hwf wa 1\\n\"\n    \"fXs sz 1\\n\"\n    \"qgz qu 1\\n\"\n    \"Xdw de 1\\n\"\n    \"hcN th 1\\n\"\n    \"jJd de 1\\n\"\n    \"cmQ ch 1\\n\"\n    \"mvV va 1\\n\"\n    \"Nqe qu 1\\n\"\n    \"zxS sz 1\\n\"\n    \"kGt th 1\\n\"\n    \"tFg th 1\\n\"\n    \"fzM sz 1\\n\"\n    \"Xrr er 1\\n\"\n    \"dcJ ch 1\\n\"\n    \"dQa an 1\\n\"\n    \"qNy qu 1\\n\"\n    \"hxT th 1\\n\"\n    \"twB th 1\\n\"\n    \"Bqj qu 1\\n\"\n    \"prK er 1\\n\"\n    \"zdC de 1\\n\"\n    \"yAo on 1\\n\"\n    \"dLt st 1\\n\"\n    \"pgF ng 1\\n\"\n    \"vgW ng 1\\n\"\n    \"vpN va 1\\n\"\n    \"Ivx va 1\\n\"\n    \"vYl le 1\\n\"\n    \"xRg ng 1\\n\"\n    \"jPu qu 1\\n\"\n    \"Oqr qu 1\\n\"\n    \"vjg ng 1\\n\"\n    \"dpH de 1\\n\"\n    \"yDp pr 1\\n\"\n    \"xfJ fo 1\\n\"\n    \"fqV qu 1\\n\"\n    \"eBf er 1\\n\"\n    \"Zkw ka 1\\n\"\n    \"qHp qu 1\\n\"\n    \"Aqz qu 1\\n\"\n    \"bNw wa 1\\n\"\n    \"fjX ij 1\\n\"\n    \"fqS qu 1\\n\"\n    \"ljK le 1\\n\"\n    \"Gkf ka 1\\n\"\n    \"bSf be 1\\n\"\n    \"Mxg ng 1\\n\"\n    \"Dqm qu 1\\n\"\n    \"hKp th 1\\n\"\n    \"wFq qu 1\\n\"\n    \"wmJ me 1\\n\"\n    \"vzT va 1\\n\"\n    \"rhJ th 1\\n\"\n    \"nHf an 1\\n\"\n    \"jJo on 1\\n\"\n    \"qWy qu 1\\n\"\n    
\"Wvk va 1\\n\"\n    \"gkB ng 1\\n\"\n    \"mEw me 1\\n\"\n    \"Ugx ng 1\\n\"\n    \"Qmy me 1\\n\"\n    \"Ljq qu 1\\n\"\n    \"bGp pr 1\\n\"\n    \"lHg ng 1\\n\"\n    \"cGg ch 1\\n\"\n    \"gFk ng 1\\n\"\n    \"xnV an 1\\n\"\n    \"eFy er 1\\n\"\n    \"Nfm me 1\\n\"\n    \"hSf th 1\\n\"\n    \"gXj ng 1\\n\"\n    \"xHf fo 1\\n\"\n    \"uqj qu 1\\n\"\n    \"wXa an 1\\n\"\n    \"vcT ch 1\\n\"\n    \"uJw qu 1\\n\"\n    \"pWx pr 1\\n\"\n    \"qpQ qu 1\\n\"\n    \"hqE th 1\\n\"\n    \"Yfn an 1\\n\"\n    \"jrI er 1\\n\"\n    \"cgK ch 1\\n\"\n    \"yyP ny 1\\n\"\n    \"Zmg ng 1\\n\"\n    \"Lkc ch 1\\n\"\n    \"eUq qu 1\\n\"\n    \"jrY er 1\\n\"\n    \"kFs sz 1\\n\"\n    \"sUq qu 1\\n\"\n    \"jlZ le 1\\n\"\n    \"cnV ch 1\\n\"\n    \"aPj an 1\\n\"\n    \"mjE ij 1\\n\"\n    \"pZl le 1\\n\"\n    \"uFs qu 1\\n\"\n    \"Knf an 1\\n\"\n    \"Fpc ch 1\\n\"\n    \"hfR th 1\\n\"\n    \"qnC an 1\\n\"\n    \"Dlq qu 1\\n\"\n    \"frM er 1\\n\"\n    \"sfB sz 1\\n\"\n    \"Gxk ka 1\\n\"\n    \"Fkj ij 1\\n\"\n    \"vGk va 1\\n\"\n    \"gRm ng 1\\n\"\n    \"rWf er 1\\n\"\n    \"rYv er 1\\n\"\n    \"qEd qu 1\\n\"\n    \"qHr qu 1\\n\"\n    \"Smv va 1\\n\"\n    \"lFp le 1\\n\"\n    \"kDs sz 1\\n\"\n    \"dSd de 1\\n\"\n    \"rLw er 1\\n\"\n    \"cnZ an 1\\n\"\n    \"Wjp ij 1\\n\"\n    \"pTq qu 1\\n\"\n    \"Kcx ch 1\\n\"\n    \"vKs va 1\\n\"\n    \"bcK ch 1\\n\"\n    \"vwy va 1\\n\"\n    \"Ujx ij 1\\n\"\n    \"Qvr er 1\\n\"\n    \"dcV ch 1\\n\"\n    \"xVf fo 1\\n\"\n    \"uIk qu 1\\n\"\n    \"jlN le 1\\n\"\n    \"vwL va 1\\n\"\n    \"fWp pr 1\\n\"\n    \"Pxr er 1\\n\"\n    \"rRb er 1\\n\"\n    \"bfD be 1\\n\"\n    \"yCx ny 1\\n\"\n    \"nJs an 1\\n\"\n    \"dCm de 1\\n\"\n    \"cbG ch 1\\n\"\n    \"gCf ng 1\\n\"\n    \"tmV th 1\\n\"\n    \"qeC qu 1\\n\"\n    \"knS an 1\\n\"\n    \"gwY ng 1\\n\"\n    \"Wjl le 1\\n\"\n    \"mIw me 1\\n\"\n    \"qjW qu 1\\n\"\n    \"gwv ng 1\\n\"\n    \"qJw wa 1\\n\"\n    \"cnA an 1\\n\"\n    \"bBm me 1\\n\"\n    \"gFw ng 1\\n\"\n    \"wDn an 1\\n\"\n    
\"qgL qu 1\\n\"\n    \"lUa an 1\\n\"\n    \"hDn th 1\\n\"\n    \"kHx ka 1\\n\"\n    \"wXm me 1\\n\"\n    \"qyY qu 1\\n\"\n    \"pkD ka 1\\n\"\n    \"sLz st 1\\n\"\n    \"zxF sz 1\\n\"\n    \"vMx va 1\\n\"\n    \"plR le 1\\n\"\n    \"pwZ pr 1\\n\"\n    \"pYd de 1\\n\"\n    \"zfL sz 1\\n\"\n    \"ztK th 1\\n\"\n    \"mTm me 1\\n\"\n    \"dCp de 1\\n\"\n    \"bwx wa 1\\n\"\n    \"xCs sz 1\\n\"\n    \"tfF th 1\\n\"\n    \"Lnq an 1\\n\"\n    \"dYi in 1\\n\"\n    \"pWq qu 1\\n\"\n    \"oIx on 1\\n\"\n    \"ywE wa 1\\n\"\n    \"wNk ka 1\\n\"\n    \"jwO ij 1\\n\"\n    \"xZz sz 1\\n\"\n    \"wGm me 1\\n\"\n    \"cVw ch 1\\n\"\n    \"bjK ij 1\\n\"\n    \"Gzg ng 1\\n\"\n    \"kwz sz 1\\n\"\n    \"pBn an 1\\n\"\n    \"cTx ch 1\\n\"\n    \"rHq qu 1\\n\"\n    \"Wsg ng 1\\n\"\n    \"xEh th 1\\n\"\n    \"yrK er 1\\n\"\n    \"mMb me 1\\n\"\n    \"pHw pr 1\\n\"\n    \"cjN ch 1\\n\"\n    \"nXn an 1\\n\"\n    \"bwO wa 1\\n\"\n    \"flB le 1\\n\"\n    \"Qqj qu 1\\n\"\n    \"mKv va 1\\n\"\n    \"fFn an 1\\n\"\n    \"wfG wa 1\\n\"\n    \"wfB wa 1\\n\"\n    \"Jqk qu 1\\n\"\n    \"bwK wa 1\\n\"\n    \"hhI th 1\\n\"\n    \"lUe er 1\\n\"\n    \"wFd de 1\\n\"\n    \"vkT va 1\\n\"\n    \"xLg ng 1\\n\"\n    \"fhB th 1\\n\"\n    \"wmV me 1\\n\"\n    \"tmF th 1\\n\"\n    \"Rtc th 1\\n\"\n    \"dyY de 1\\n\"\n    \"jyw ij 1\\n\"\n    \"kRf ka 1\\n\"\n    \"fXz sz 1\\n\"\n    \"Znz an 1\\n\"\n    \"wqX qu 1\\n\"\n    \"uMx qu 1\\n\"\n    \"gwV ng 1\\n\"\n    \"Pbh th 1\\n\"\n    \"dcM ch 1\\n\"\n    \"nPz an 1\\n\"\n    \"cwU ch 1\\n\"\n    \"vJt th 1\\n\"\n    \"gyQ ng 1\\n\"\n    \"fXi in 1\\n\"\n    \"bsZ sz 1\\n\"\n    \"Bqi qu 1\\n\"\n    \"vGn an 1\\n\"\n    \"knN an 1\\n\"\n    \"wYq qu 1\\n\"\n    \"tTb th 1\\n\"\n    \"bmP me 1\\n\"\n    \"jpZ ij 1\\n\"\n    \"Mqw qu 1\\n\"\n    \"vjM va 1\\n\"\n    \"qVh th 1\\n\"\n    \"juY qu 1\\n\"\n    \"rBk er 1\\n\"\n    \"juI qu 1\\n\"\n    \"zEq qu 1\\n\"\n    \"zWg ng 1\\n\"\n    \"fzH sz 1\\n\"\n    \"tLx th 1\\n\"\n    \"Ncf ch 1\\n\"\n    
\"kfN ka 1\\n\"\n    \"uUo qu 1\\n\"\n    \"fCs sz 1\\n\"\n    \"tCv th 1\\n\"\n    \"sUy sz 1\\n\"\n    \"pBf pr 1\\n\"\n    \"jBz sz 1\\n\"\n    \"vDc ch 1\\n\"\n    \"qmx qu 1\\n\"\n    \"qtK th 1\\n\"\n    \"qcS ch 1\\n\"\n    \"vPt th 1\\n\"\n    \"gQm ng 1\\n\"\n    \"hzR th 1\\n\"\n    \"dcL ch 1\\n\"\n    \"xrI er 1\\n\"\n    \"dvN va 1\\n\"\n    \"Cwv va 1\\n\"\n    \"xhQ th 1\\n\"\n    \"Gzu qu 1\\n\"\n    \"pdO de 1\\n\"\n    \"Bqr qu 1\\n\"\n    \"vLn an 1\\n\"\n    \"lxf le 1\\n\"\n    \"vYk va 1\\n\"\n    \"wSq qu 1\\n\"\n    \"pkS ka 1\\n\"\n    \"zKg ng 1\\n\"\n    \"tPm th 1\\n\"\n    \"Pmj ij 1\\n\"\n    \"lWu qu 1\\n\"\n    \"Xuu qu 1\\n\"\n    \"jcX ch 1\\n\"\n    \"xzQ sz 1\\n\"\n    \"Gzw sz 1\\n\"\n    \"ePm er 1\\n\"\n    \"fwW wa 1\\n\"\n    \"qwA qu 1\\n\"\n    \"vQt th 1\\n\"\n    \"bxP be 1\\n\"\n    \"dmD de 1\\n\"\n    \"awQ an 1\\n\"\n    \"fVf fo 1\\n\"\n    \"bwY wa 1\\n\"\n    \"Zxt th 1\\n\"\n    \"Xhk th 1\\n\"\n    \"gYk ng 1\\n\"\n    \"zCf sz 1\\n\"\n    \"yfQ ny 1\\n\"\n    \"zGw sz 1\\n\"\n    \"gvE ng 1\\n\"\n    \"gCv ng 1\\n\"\n    \"oPf on 1\\n\"\n    \"zXi in 1\\n\"\n    \"hvI th 1\\n\"\n    \"hzS th 1\\n\"\n    \"mfX me 1\\n\"\n    \"dPd de 1\\n\"\n    \"Lrf er 1\\n\"\n    \"lrG er 1\\n\"\n    \"mYf me 1\\n\"\n    \"hNj th 1\\n\"\n    \"qAj qu 1\\n\"\n    \"sxQ st 1\\n\"\n    \"kTl le 1\\n\"\n    \"qOf qu 1\\n\"\n    \"Jdx de 1\\n\"\n    \"swK sz 1\\n\"\n    \"jQb ij 1\\n\"\n    \"Dqp qu 1\\n\"\n    \"cWv ch 1\\n\"\n    \"dxE de 1\\n\"\n    \"sXj sz 1\\n\"\n    \"nvB an 1\\n\"\n    \"wXf wa 1\\n\"\n    \"Cqi qu 1\\n\"\n    \"bzW sz 1\\n\"\n    \"rRf er 1\\n\"\n    \"mZj ij 1\\n\"\n    \"bnF an 1\\n\"\n    \"qaG an 1\\n\"\n    \"Bqs qu 1\\n\"\n    \"lMn an 1\\n\"\n    \"wHp pr 1\\n\"\n    \"Ljc ch 1\\n\"\n    \"Mwf wa 1\\n\"\n    \"pzK sz 1\\n\"\n    \"mPb me 1\\n\"\n    \"qjE qu 1\\n\"\n    \"wRr er 1\\n\"\n    \"xZf fo 1\\n\"\n    \"nqG an 1\\n\"\n    \"vVb va 1\\n\"\n    \"pjC ij 1\\n\"\n    \"uHl qu 1\\n\"\n    
\"jDn an 1\\n\"\n    \"pqX qu 1\\n\"\n    \"pqk qu 1\\n\"\n    \"xgU ng 1\\n\"\n    \"wJx wa 1\\n\"\n    \"znK an 1\\n\"\n    \"rhB th 1\\n\"\n    \"vDq qu 1\\n\"\n    \"sJc ch 1\\n\"\n    \"Xkh th 1\\n\"\n    \"lnJ an 1\\n\"\n    \"bRq qu 1\\n\"\n    \"fzA sz 1\\n\"\n    \"bQe er 1\\n\"\n    \"Txw wa 1\\n\"\n    \"bkG ka 1\\n\"\n    \"ywZ wa 1\\n\"\n    \"zWc ch 1\\n\"\n    \"lhL th 1\\n\"\n    \"gmF ng 1\\n\"\n    \"sfQ sz 1\\n\"\n    \"zmG sz 1\\n\"\n    \"Ogz ng 1\\n\"\n    \"xuA qu 1\\n\"\n    \"qAq qu 1\\n\"\n    \"zDw sz 1\\n\"\n    \"lVu qu 1\\n\"\n    \"xRw wa 1\\n\"\n    \"xmM me 1\\n\"\n    \"pxB pr 1\\n\"\n    \"ztT th 1\\n\"\n    \"kzJ sz 1\\n\"\n    \"nFz an 1\\n\"\n    \"uVz qu 1\\n\"\n    \"pnQ an 1\\n\"\n    \"pGt th 1\\n\"\n    \"Xdn an 1\\n\"\n    \"fVz sz 1\\n\"\n    \"Mhg th 1\\n\"\n    \"Xqo qu 1\\n\"\n    \"sHq qu 1\\n\"\n    \"jwC ij 1\\n\"\n    \"vkG va 1\\n\"\n    \"Xkx ka 1\\n\"\n    \"tRg th 1\\n\"\n    \"nvV an 1\\n\"\n    \"qwG qu 1\\n\"\n    \"Vhh th 1\\n\"\n    \"zwO sz 1\\n\"\n    \"qQb qu 1\\n\"\n    \"crR ch 1\\n\"\n    \"Mrq qu 1\\n\"\n    \"oQe er 1\\n\"\n    \"mBt th 1\\n\"\n    \"vUy va 1\\n\"\n    \"twW th 1\\n\"\n    \"Qgn an 1\\n\"\n    \"Nxu qu 1\\n\"\n    \"qhF th 1\\n\"\n    \"xpX pr 1\\n\"\n    \"fvD va 1\\n\"\n    \"Cvy va 1\\n\"\n    \"oHj on 1\\n\"\n    \"Qqo qu 1\\n\"\n    \"vYd de 1\\n\"\n    \"xhV th 1\\n\"\n    \"fZf fo 1\\n\"\n    \"yKm me 1\\n\"\n    \"xYq qu 1\\n\"\n    \"fcU ch 1\\n\"\n    \"qEp qu 1\\n\"\n    \"jXd de 1\\n\"\n    \"mlQ le 1\\n\"\n    \"Ggz ng 1\\n\"\n    \"cLp ch 1\\n\"\n    \"yxU ny 1\\n\"\n    \"gvJ ng 1\\n\"\n    \"wqD qu 1\\n\"\n    \"vsN sz 1\\n\"\n    \"Ijf ij 1\\n\"\n    \"jbJ ij 1\\n\"\n    \"bMx be 1\\n\"\n    \"kXs sz 1\\n\"\n    \"grT ng 1\\n\"\n    \"wOd de 1\\n\"\n    \"pGw pr 1\\n\"\n    \"Gkd de 1\\n\"\n    \"qCj qu 1\\n\"\n    \"hqY th 1\\n\"\n    \"rDp er 1\\n\"\n    \"nQt th 1\\n\"\n    \"kdV de 1\\n\"\n    \"bgS ng 1\\n\"\n    \"Tqo qu 1\\n\"\n    \"fEj ij 1\\n\"\n    
\"hZs th 1\\n\"\n    \"jYn an 1\\n\"\n    \"bPx be 1\\n\"\n    \"hgY th 1\\n\"\n    \"Pvy va 1\\n\"\n    \"fxK fo 1\\n\"\n    \"Hww wa 1\\n\"\n    \"xRk ka 1\\n\"\n    \"dmP de 1\\n\"\n    \"mcY ch 1\\n\"\n    \"bxR be 1\\n\"\n    \"Lsl le 1\\n\"\n    \"hRl th 1\\n\"\n    \"iwQ in 1\\n\"\n    \"Wqx qu 1\\n\"\n    \"kfV ka 1\\n\"\n    \"qwN qu 1\\n\"\n    \"Qpv va 1\\n\"\n    \"mrO er 1\\n\"\n    \"iFc ti 1\\n\"\n    \"wzD sz 1\\n\"\n    \"qbF qu 1\\n\"\n    \"xfS fo 1\\n\"\n    \"Pqh th 1\\n\"\n    \"xYb be 1\\n\"\n    \"lDh th 1\\n\"\n    \"vtG th 1\\n\"\n    \"Xzu qu 1\\n\"\n    \"xjK ij 1\\n\"\n    \"jDx ij 1\\n\"\n    \"nCj an 1\\n\"\n    \"mCk ka 1\\n\"\n    \"qxP qu 1\\n\"\n    \"oMv on 1\\n\"\n    \"cgY ch 1\\n\"\n    \"Wqt th 1\\n\"\n    \"kkQ ka 1\\n\"\n    \"tqO th 1\\n\"\n    \"jnC an 1\\n\"\n    \"fGq qu 1\\n\"\n    \"Bfv va 1\\n\"\n    \"vYi in 1\\n\"\n    \"pcL ch 1\\n\"\n    \"Fgp ng 1\\n\"\n    \"jtR th 1\\n\"\n    \"vhF th 1\\n\"\n    \"wUi in 1\\n\"\n    \"nNj an 1\\n\"\n    \"jTw ij 1\\n\"\n    \"qsM qu 1\\n\"\n    \"aJg an 1\\n\"\n    \"jQe er 1\\n\"\n    \"Gnj an 1\\n\"\n    \"fmM me 1\\n\"\n    \"zqM qu 1\\n\"\n    \"gjZ ng 1\\n\"\n    \"nxH an 1\\n\"\n    \"cdO ch 1\\n\"\n    \"aAx an 1\\n\"\n    \"tUv th 1\\n\"\n    \"hXk th 1\\n\"\n    \"qBx qu 1\\n\"\n    \"tgK th 1\\n\"\n    \"fZy ny 1\\n\"\n    \"Jkx ka 1\\n\"\n    \"pvD va 1\\n\"\n    \"bmT me 1\\n\"\n    \"oYx on 1\\n\"\n    \"hwV th 1\\n\"\n    \"mjB ij 1\\n\"\n    \"bYn an 1\\n\"\n    \"iHx in 1\\n\"\n    \"lYh th 1\\n\"\n    \"qCi in 1\\n\"\n    \"fhR th 1\\n\"\n    \"nDf an 1\\n\"\n    \"hCd th 1\\n\"\n    \"lxB le 1\\n\"\n    \"eXj er 1\\n\"\n    \"fvW va 1\\n\"\n    \"ccW ch 1\\n\"\n    \"dTc ch 1\\n\"\n    \"sqA qu 1\\n\"\n    \"fNt th 1\\n\"\n    \"zkM sz 1\\n\"\n    \"lRv le 1\\n\"\n    \"qnI an 1\\n\"\n    \"xwC wa 1\\n\"\n    \"zqY qu 1\\n\"\n    \"yQb be 1\\n\"\n    \"xrC er 1\\n\"\n    \"xFm me 1\\n\"\n    \"oeQ er 1\\n\"\n    \"mLl le 1\\n\"\n    \"jwT ij 1\\n\"\n    
\"fwD wa 1\\n\"\n    \"vpE va 1\\n\"\n    \"flY le 1\\n\"\n    \"sRg ng 1\\n\"\n    \"vSd de 1\\n\"\n    \"wuR qu 1\\n\"\n    \"wrI er 1\\n\"\n    \"Ysn st 1\\n\"\n    \"Vhj th 1\\n\"\n    \"Cqh th 1\\n\"\n    \"Ygb ng 1\\n\"\n    \"hPq th 1\\n\"\n    \"mkB ka 1\\n\"\n    \"tRq th 1\\n\"\n    \"ajQ an 1\\n\"\n    \"hcR th 1\\n\"\n    \"vDw va 1\\n\"\n    \"pQn an 1\\n\"\n    \"xeU er 1\\n\"\n    \"vcM ch 1\\n\"\n    \"zVc ch 1\\n\"\n    \"bRh th 1\\n\"\n    \"uFx qu 1\\n\"\n    \"fbW be 1\\n\"\n    \"uUv qu 1\\n\"\n    \"Nhv th 1\\n\"\n    \"Ykx ka 1\\n\"\n    \"Wtp th 1\\n\"\n    \"Mzj sz 1\\n\"\n    \"npT in 1\\n\"\n    \"Xqk qu 1\\n\"\n    \"xwN wa 1\\n\"\n    \"hXw th 1\\n\"\n    \"zLb sz 1\\n\"\n    \"Gxy ny 1\\n\"\n    \"dDq qu 1\\n\"\n    \"Bfy ny 1\\n\"\n    \"fkx ka 1\\n\"\n    \"jOq qu 1\\n\"\n    \"Ddk de 1\\n\"\n    \"Njp ij 1\\n\"\n    \"xjJ ij 1\\n\"\n    \"qhS th 1\\n\"\n    \"Qwm me 1\\n\"\n    \"yWj ij 1\\n\"\n    \"nFv an 1\\n\"\n    \"pLb pr 1\\n\"\n    \"qbB qu 1\\n\"\n    \"smX sz 1\\n\"\n    \"tnZ th 1\\n\"\n    \"zQh th 1\\n\"\n    \"Fzb sz 1\\n\"\n    \"cNb ch 1\\n\"\n    \"hpV th 1\\n\"\n    \"Bxz sz 1\\n\"\n    \"xgG ng 1\\n\"\n    \"Rlj le 1\\n\"\n    \"iHq in 1\\n\"\n    \"swN sz 1\\n\"\n    \"Njv va 1\\n\"\n    \"wPk ka 1\\n\"\n    \"oRv on 1\\n\"\n    \"pJs sz 1\\n\"\n    \"kZw ka 1\\n\"\n    \"vVs st 1\\n\"\n    \"Vbw wa 1\\n\"\n    \"Ffh th 1\\n\"\n    \"mzQ sz 1\\n\"\n    \"Gvl le 1\\n\"\n    \"Pgq qu 1\\n\"\n    \"lPp le 1\\n\"\n    \"vCv va 1\\n\"\n    \"kNf ka 1\\n\"\n    \"bmD me 1\\n\"\n    \"mWt th 1\\n\"\n    \"slF le 1\\n\"\n    \"qiX in 1\\n\"\n    \"yRt th 1\\n\"\n    \"lqx qu 1\\n\"\n    \"qlj qu 1\\n\"\n    \"sfZ sz 1\\n\"\n    \"Wfy ny 1\\n\"\n    \"vrO er 1\\n\"\n    \"gxT ng 1\\n\"\n    \"lwE le 1\\n\"\n    \"qdJ qu 1\\n\"\n    \"Ypk ka 1\\n\"\n    \"Qpf pr 1\\n\"\n    \"Znw an 1\\n\"\n    \"bfJ be 1\\n\"\n    \"qQy qu 1\\n\"\n    \"qAy qu 1\\n\"\n    \"aqW an 1\\n\"\n    \"qqI qu 1\\n\"\n    \"Lwg ng 1\\n\"\n    
\"Nnw an 1\\n\"\n    \"cLv ch 1\\n\"\n    \"Wtx th 1\\n\"\n    \"qcq ch 1\\n\"\n    \"sjR sz 1\\n\"\n    \"lWn an 1\\n\"\n    \"Zmx me 1\\n\"\n    \"qZg qu 1\\n\"\n    \"tYz th 1\\n\"\n    \"gVx ng 1\\n\"\n    \"mXt th 1\\n\"\n    \"nwJ an 1\\n\"\n    \"jwZ ij 1\\n\"\n    \"lwL le 1\\n\"\n    \"eGx er 1\\n\"\n    \"Sqk qu 1\\n\"\n    \"gBg ng 1\\n\"\n    \"zsS sz 1\\n\"\n    \"knQ an 1\\n\"\n    \"Nnf an 1\\n\"\n    \"qmT qu 1\\n\"\n    \"Sqp qu 1\\n\"\n    \"ffQ fo 1\\n\"\n    \"Vcv ch 1\\n\"\n    \"fmD me 1\\n\"\n    \"zYg ng 1\\n\"\n    \"bAx be 1\\n\"\n    \"nbW an 1\\n\"\n    \"gJm ng 1\\n\"\n    \"Jwn an 1\\n\"\n    \"mxJ me 1\\n\"\n    \"xbC be 1\\n\"\n    \"Rbq qu 1\\n\"\n    \"xZc ch 1\\n\"\n    \"bJy be 1\\n\"\n    \"Xyk ka 1\\n\"\n    \"zkV sz 1\\n\"\n    \"uoF qu 1\\n\"\n    \"bcU ch 1\\n\"\n    \"cZq ch 1\\n\"\n    \"rPm er 1\\n\"\n    \"rGn an 1\\n\"\n    \"lcL ch 1\\n\"\n    \"rVt th 1\\n\"\n    \"Cgw ng 1\\n\"\n    \"Ctq th 1\\n\"\n    \"eGv er 1\\n\"\n    \"Rzs st 1\\n\"\n    \"Qhz th 1\\n\"\n    \"sLv va 1\\n\"\n    \"Vqm qu 1\\n\"\n    \"ydJ de 1\\n\"\n    \"xVr er 1\\n\"\n    \"tLk th 1\\n\"\n    \"qfy qu 1\\n\"\n    \"wxV wa 1\\n\"\n    \"yRq qu 1\\n\"\n    \"Vxq qu 1\\n\"\n    \"qYz qu 1\\n\"\n    \"zhM th 1\\n\"\n    \"mLn an 1\\n\"\n    \"Zvt th 1\\n\"\n    \"Fvm va 1\\n\"\n    \"hcM th 1\\n\"\n    \"Mwp wa 1\\n\"\n    \"cTg ch 1\\n\"\n    \"lXr er 1\\n\"\n    \"fQe er 1\\n\"\n    \"Jbw wa 1\\n\"\n    \"yfG ny 1\\n\"\n    \"phK th 1\\n\"\n    \"gjH ng 1\\n\"\n    \"Wdg de 1\\n\"\n    \"pPn an 1\\n\"\n    \"Bwg ng 1\\n\"\n    \"znB an 1\\n\"\n    \"fwJ wa 1\\n\"\n    \"utQ th 1\\n\"\n    \"cjC ch 1\\n\"\n    \"fVd de 1\\n\"\n    \"cTm ch 1\\n\"\n    \"wMv va 1\\n\"\n    \"Kgk ng 1\\n\"\n    \"nRd an 1\\n\"\n    \"mMt th 1\\n\"\n    \"xjQ ij 1\\n\"\n    \"qYt th 1\\n\"\n    \"sYj st 1\\n\"\n    \"jNc ch 1\\n\"\n    \"qXt th 1\\n\"\n    \"wzB sz 1\\n\"\n    \"Sjq qu 1\\n\"\n    \"qtF th 1\\n\"\n    \"wYi in 1\\n\"\n    \"glT ng 1\\n\"\n    
\"Uug ng 1\\n\"\n    \"uOp qu 1\\n\"\n    \"iBx in 1\\n\"\n    \"Rqt th 1\\n\"\n    \"zWj sz 1\\n\"\n    \"Hcx ch 1\\n\"\n    \"jNd de 1\\n\"\n    \"zQr er 1\\n\"\n    \"iHd in 1\\n\"\n    \"Wpx pr 1\\n\"\n    \"nfY an 1\\n\"\n    \"Rkz sz 1\\n\"\n    \"Kqg qu 1\\n\"\n    \"Gfv va 1\\n\"\n    \"krC er 1\\n\"\n    \"Whc th 1\\n\"\n    \"ljM le 1\\n\"\n    \"yxG ny 1\\n\"\n    \"fpW pr 1\\n\"\n    \"bcF ch 1\\n\"\n    \"krx er 1\\n\"\n    \"uDt th 1\\n\"\n    \"Fzo on 1\\n\"\n    \"wPn an 1\\n\"\n    \"Lfj ij 1\\n\"\n    \"Bkp ka 1\\n\"\n    \"Xkq qu 1\\n\"\n    \"jxH ij 1\\n\"\n    \"vIj va 1\\n\"\n    \"gTc ch 1\\n\"\n    \"hEj th 1\\n\"\n    \"fqB qu 1\\n\"\n    \"jlD le 1\\n\"\n    \"tFf th 1\\n\"\n    \"Nfw wa 1\\n\"\n    \"Fqe qu 1\\n\"\n    \"Tzp sz 1\\n\"\n    \"sJr er 1\\n\"\n    \"qIt th 1\\n\"\n    \"dFb de 1\\n\"\n    \"qzE qu 1\\n\"\n    \"mVv va 1\\n\"\n    \"Vqa an 1\\n\"\n    \"bqM qu 1\\n\"\n    \"mdJ de 1\\n\"\n    \"dIp de 1\\n\"\n    \"Znx an 1\\n\"\n    \"jkK ij 1\\n\"\n    \"rfQ er 1\\n\"\n    \"xkI ku 1\\n\"\n    \"fIo ro 1\\n\"\n    \"lqV qu 1\\n\"\n    \"Qpd de 1\\n\"\n    \"pAx pr 1\\n\"\n    \"rrQ er 1\\n\"\n    \"bIu qu 1\\n\"\n    \"xDw wa 1\\n\"\n    \"oHx on 1\\n\"\n    \"wJw wa 1\\n\"\n    \"Cqv qu 1\\n\"\n    \"yvB va 1\\n\"\n    \"yqU qu 1\\n\"\n    \"rLx er 1\\n\"\n    \"Fzx sz 1\\n\"\n    \"dZf de 1\\n\"\n    \"Nqh th 1\\n\"\n    \"Rnz an 1\\n\"\n    \"hTc th 1\\n\"\n    \"bVb be 1\\n\"\n    \"Fdm de 1\\n\"\n    \"vfv va 1\\n\"\n    \"hwS th 1\\n\"\n    \"zPt th 1\\n\"\n    \"Gxv va 1\\n\"\n    \"Fvt th 1\\n\"\n    \"mZr er 1\\n\"\n    \"zVr er 1\\n\"\n    \"mBc ch 1\\n\"\n    \"fXq qu 1\\n\"\n    \"Plw le 1\\n\"\n    \"Nlx le 1\\n\"\n    \"jCd de 1\\n\"\n    \"Kwv va 1\\n\"\n    \"Jqa an 1\\n\"\n    \"zGs st 1\\n\"\n    \"fuV qu 1\\n\"\n    \"pzL sz 1\\n\"\n    \"iFx in 1\\n\"\n    \"fTm me 1\\n\"\n    \"yWd de 1\\n\"\n    \"cHv ch 1\\n\"\n    \"fFk ka 1\\n\"\n    \"mqd qu 1\\n\"\n    \"aQk an 1\\n\"\n    \"uDf qu 1\\n\"\n    
\"Vbf be 1\\n\"\n    \"pgJ ng 1\\n\"\n    \"fkN ka 1\\n\"\n    \"pBm me 1\\n\"\n    \"Bdv de 1\\n\"\n    \"jmW ij 1\\n\"\n    \"Jvv va 1\\n\"\n    \"Xpk ka 1\\n\"\n    \"qQc ch 1\\n\"\n    \"kdG de 1\\n\"\n    \"qkP qu 1\\n\"\n    \"cSd ch 1\\n\"\n    \"Fdc ch 1\\n\"\n    \"qgK qu 1\\n\"\n    \"qdH qu 1\\n\"\n    \"uNv qu 1\\n\"\n    \"eVt th 1\\n\"\n    \"dfA de 1\\n\"\n    \"Hzy sz 1\\n\"\n    \"lWc ch 1\\n\"\n    \"vxH va 1\\n\"\n    \"hxW th 1\\n\"\n    \"Khp th 1\\n\"\n    \"xQb be 1\\n\"\n    \"pwT pr 1\\n\"\n    \"Lwf wa 1\\n\"\n    \"zDq qu 1\\n\"\n    \"kxK ka 1\\n\"\n    \"mtY th 1\\n\"\n    \"bhT th 1\\n\"\n    \"ywR wa 1\\n\"\n    \"jIa an 1\\n\"\n    \"Wze er 1\\n\"\n    \"hqK th 1\\n\"\n    \"flZ le 1\\n\"\n    \"qMi in 1\\n\"\n    \"wpR wa 1\\n\"\n    \"qHh th 1\\n\"\n    \"aOw an 1\\n\"\n    \"dkU de 1\\n\"\n    \"vRr er 1\\n\"\n    \"vjX va 1\\n\"\n    \"cuQ ch 1\\n\"\n    \"qmJ qu 1\\n\"\n    \"uuJ ou 1\\n\"\n    \"yWx ny 1\\n\"\n    \"hUf th 1\\n\"\n    \"vzP va 1\\n\"\n    \"rSx er 1\\n\"\n    \"qgy qu 1\\n\"\n    \"Rzf sz 1\\n\"\n    \"zjB sz 1\\n\"\n    \"Sjx ij 1\\n\"\n    \"xfA fo 1\\n\"\n    \"fHj ij 1\\n\"\n    \"qkB qu 1\\n\"\n    \"cdF ch 1\\n\"\n    \"fWj ij 1\\n\"\n    \"jbA ij 1\\n\"\n    \"Bmb me 1\\n\"\n    \"yjg ng 1\\n\"\n    \"rxZ er 1\\n\"\n    \"Vmr er 1\\n\"\n    \"iIq in 1\\n\"\n    \"Wgl ng 1\\n\"\n    \"mRp me 1\\n\"\n    \"wvS va 1\\n\"\n    \"Uvy va 1\\n\"\n    \"ypQ pr 1\\n\"\n    \"vFw vo 1\\n\"\n    \"fqE qu 1\\n\"\n    \"swJ st 1\\n\"\n    \"Jrx er 1\\n\"\n    \"cxE ch 1\\n\"\n    \"lZk le 1\\n\"\n    \"fVn an 1\\n\"\n    \"bhZ th 1\\n\"\n    \"jhR th 1\\n\"\n    \"vSq qu 1\\n\"\n    \"yQz sz 1\\n\"\n    \"fHv va 1\\n\"\n    \"vuN qu 1\\n\"\n    \"jpG ij 1\\n\"\n    \"Pkz sz 1\\n\"\n    \"gQb ng 1\\n\"\n    \"pFs st 1\\n\"\n    \"Gjq qu 1\\n\"\n    \"hsK th 1\\n\"\n    \"twx th 1\\n\"\n    \"yyQ ny 1\\n\"\n    \"dqF qu 1\\n\"\n    \"bHh th 1\\n\"\n    \"qMq qu 1\\n\"\n    \"qKv qu 1\\n\"\n    \"zLg ng 1\\n\"\n    
\"jmO ij 1\\n\"\n    \"wBk ka 1\\n\"\n    \"pjQ ij 1\\n\"\n    \"xZv va 1\\n\"\n    \"qIu un 1\\n\"\n    \"ycY ch 1\\n\"\n    \"mDf me 1\\n\"\n    \"yJs st 1\\n\"\n    \"Isx st 1\\n\"\n    \"Qqr qu 1\\n\"\n    \"Fkw ka 1\\n\"\n    \"Cpj ij 1\\n\"\n    \"Yvq qu 1\\n\"\n    \"zjG sz 1\\n\"\n    \"gGc ch 1\\n\"\n    \"Xdm de 1\\n\"\n    \"hBv th 1\\n\"\n    \"Wxj ij 1\\n\"\n    \"Ywb ow 1\\n\"\n    \"Vtq th 1\\n\"\n    \"tjY th 1\\n\"\n    \"jDj ij 1\\n\"\n    \"uGd qu 1\\n\"\n    \"wvF va 1\\n\"\n    \"uqg qu 1\\n\"\n    \"Rwp pr 1\\n\"\n    \"Bgb ng 1\\n\"\n    \"mnU an 1\\n\"\n    \"dpI de 1\\n\"\n    \"wKd de 1\\n\"\n    \"yXz sz 1\\n\"\n    \"kLd de 1\\n\"\n    \"gYx ng 1\\n\"\n    \"qxk qu 1\\n\"\n    \"Hhy th 1\\n\"\n    \"fpJ pr 1\\n\"\n    \"cVc ch 1\\n\"\n    \"kVv va 1\\n\"\n    \"Jzs st 1\\n\"\n    \"nDw an 1\\n\"\n    \"tjF th 1\\n\"\n    \"bZj ij 1\\n\"\n    \"mqL qu 1\\n\"\n    \"hFt th 1\\n\"\n    \"nNw an 1\\n\"\n    \"wFv va 1\\n\"\n    \"gHc ch 1\\n\"\n    \"qRx qu 1\\n\"\n    \"Jxh th 1\\n\"\n    \"Vpv va 1\\n\"\n    \"nMk an 1\\n\"\n    \"tjN th 1\\n\"\n    \"fhQ th 1\\n\"\n    \"bpD pr 1\\n\"\n    \"Dfg ng 1\\n\"\n    \"jyO ij 1\\n\"\n    \"jhV th 1\\n\"\n    \"kVk ka 1\\n\"\n    \"nKc an 1\\n\"\n    \"jkJ ij 1\\n\"\n    \"cwS ch 1\\n\"\n    \"oDf on 1\\n\"\n    \"mkY ka 1\\n\"\n    \"gdV ng 1\\n\"\n    \"Xhb th 1\\n\"\n    \"jUq qu 1\\n\"\n    \"aJf an 1\\n\"\n    \"Qxg ng 1\\n\"\n    \"xzS sz 1\\n\"\n    \"vUw va 1\\n\"\n    \"hTj th 1\\n\"\n    \"oVt th 1\\n\"\n    \"zdq qu 1\\n\"\n    \"fHs st 1\\n\"\n    \"xKk ka 1\\n\"\n    \"bFc ch 1\\n\"\n    \"gWq qu 1\\n\"\n    \"Yqa an 1\\n\"\n    \"dmH de 1\\n\"\n    \"Ttq th 1\\n\"\n    \"iQc ch 1\\n\"\n    \"jFh ij 1\\n\"\n    \"fcY ch 1\\n\"\n    \"fsR st 1\\n\"\n    \"iWg in 1\\n\"\n    \"Xyj ij 1\\n\"\n    \"Xjs st 1\\n\"\n    \"xpb pr 1\\n\"\n    \"lzY le 1\\n\"\n    \"pzg ng 1\\n\"\n    \"dVw de 1\\n\"\n    \"Ijc ch 1\\n\"\n    \"fvq qu 1\\n\"\n    \"Vnb an 1\\n\"\n    \"zdH de 1\\n\"\n    
\"cDd ch 1\\n\"\n    \"wqI qu 1\\n\"\n    \"yfU ny 1\\n\"\n    \"qoH qu 1\\n\"\n    \"xkw ka 1\\n\"\n    \"Kck ch 1\\n\"\n    \"mUq qu 1\\n\"\n    \"zWm sz 1\\n\"\n    \"Bfj ij 1\\n\"\n    \"rQj er 1\\n\"\n    \"qeW qu 1\\n\"\n    \"qpC qu 1\\n\"\n    \"oqM qu 1\\n\"\n    \"pzO sz 1\\n\"\n    \"cjQ ch 1\\n\"\n    \"zTx sz 1\\n\"\n    \"gRw ng 1\\n\"\n    \"kdQ de 1\\n\"\n    \"wbQ wa 1\\n\"\n    \"Qpj ij 1\\n\"\n    \"zIc ch 1\\n\"\n    \"yxN ny 1\\n\"\n    \"nCk an 1\\n\"\n    \"Jqz qu 1\\n\"\n    \"dEq qu 1\\n\"\n    \"gdE ng 1\\n\"\n    \"wCg ng 1\\n\"\n    \"pQt th 1\\n\"\n    \"vKe er 1\\n\"\n    \"Tjm ij 1\\n\"\n    \"Zcy ch 1\\n\"\n    \"kmR ka 1\\n\"\n    \"cTp ch 1\\n\"\n    \"bqE qu 1\\n\"\n    \"vvZ va 1\\n\"\n    \"cLw ch 1\\n\"\n    \"oIw on 1\\n\"\n    \"xjG ij 1\\n\"\n    \"vtU th 1\\n\"\n    \"hcH th 1\\n\"\n    \"xgT ng 1\\n\"\n    \"vqR qu 1\\n\"\n    \"wuM qu 1\\n\"\n    \"xsY st 1\\n\"\n    \"jCu qu 1\\n\"\n    \"Fbn an 1\\n\"\n    \"cqH ch 1\\n\"\n    \"Xjz ij 1\\n\"\n    \"fgR ng 1\\n\"\n    \"yiX in 1\\n\"\n    \"qnO an 1\\n\"\n    \"wmN me 1\\n\"\n    \"wgH ng 1\\n\"\n    \"tbZ th 1\\n\"\n    \"Xks st 1\\n\"\n    \"pzC po 1\\n\"\n    \"lfX le 1\\n\"\n    \"qBu un 1\\n\"\n    \"mLw me 1\\n\"\n    \"pmY me 1\\n\"\n    \"xqE qu 1\\n\"\n    \"rjY er 1\\n\"\n    \"vrH er 1\\n\"\n    \"Iuf qu 1\\n\"\n    \"yfD ny 1\\n\"\n    \"clG ch 1\\n\"\n    \"cdZ ch 1\\n\"\n    \"eTd er 1\\n\"\n    \"lXv le 1\\n\"\n    \"kpV ka 1\\n\"\n    \"sZq qu 1\\n\"\n    \"Wxc ch 1\\n\"\n    \"vmJ va 1\\n\"\n    \"hkE th 1\\n\"\n    \"pUw pr 1\\n\"\n    \"Cqd qu 1\\n\"\n    \"wCn an 1\\n\"\n    \"pxQ pr 1\\n\"\n    \"Ywp pr 1\\n\"\n    \"xwb wa 1\\n\"\n    \"Wjm ij 1\\n\"\n    \"zqQ qu 1\\n\"\n    \"gTp ng 1\\n\"\n    \"uZv qu 1\\n\"\n    \"mdH de 1\\n\"\n    \"juQ qu 1\\n\"\n    \"gVm ng 1\\n\"\n    \"zjY ij 1\\n\"\n    \"fhN th 1\\n\"\n    \"wfD wa 1\\n\"\n    \"Zjc ch 1\\n\"\n    \"iPv in 1\\n\"\n    \"mzW sz 1\\n\"\n    \"vXm va 1\\n\"\n    \"fEq qu 1\\n\"\n    
\"Ozq qu 1\\n\"\n    \"gEp ng 1\\n\"\n    \"kDj ij 1\\n\"\n    \"Zlw le 1\\n\"\n    \"zbR sz 1\\n\"\n    \"zCt th 1\\n\"\n    \"woY on 1\\n\"\n    \"pkT ka 1\\n\"\n    \"kbI ka 1\\n\"\n    \"hdW de 1\\n\"\n    \"Hsx st 1\\n\"\n    \"zpX sz 1\\n\"\n    \"zfV sz 1\\n\"\n    \"Dhk th 1\\n\"\n    \"wMp pr 1\\n\"\n    \"hzJ th 1\\n\"\n    \"Lwp pr 1\\n\"\n    \"zmN sz 1\\n\"\n    \"xfq qu 1\\n\"\n    \"sjQ sz 1\\n\"\n    \"zkK sz 1\\n\"\n    \"bBv va 1\\n\"\n    \"bdE de 1\\n\"\n    \"Qxn an 1\\n\"\n    \"jqt th 1\\n\"\n    \"jhG th 1\\n\"\n    \"fYv va 1\\n\"\n    \"xhE th 1\\n\"\n    \"cbF ch 1\\n\"\n    \"Jnb an 1\\n\"\n    \"jxN ij 1\\n\"\n    \"fYx fo 1\\n\"\n    \"hJp th 1\\n\"\n    \"cRt th 1\\n\"\n    \"qnS an 1\\n\"\n    \"vLp va 1\\n\"\n    \"cBd ch 1\\n\"\n    \"qqU qu 1\\n\"\n    \"Sdd de 1\\n\"\n    \"xeZ er 1\\n\"\n    \"Jwo on 1\\n\"\n    \"dPf de 1\\n\"\n    \"fNl le 1\\n\"\n    \"kIb ka 1\\n\"\n    \"cbL ch 1\\n\"\n    \"Qdr er 1\\n\"\n    \"Mfb be 1\\n\"\n    \"jJl le 1\\n\"\n    \"mxY me 1\\n\"\n    \"lFd le 1\\n\"\n    \"twT th 1\\n\"\n    \"kFk ka 1\\n\"\n    \"crB ch 1\\n\"\n    \"jRr er 1\\n\"\n    \"Htz th 1\\n\"\n    \"pYf pr 1\\n\"\n    \"rVc er 1\\n\"\n    \"vRf va 1\\n\"\n    \"wVq qu 1\\n\"\n    \"zpA sz 1\\n\"\n    \"glY le 1\\n\"\n    \"sNj ij 1\\n\"\n    \"vKx va 1\\n\"\n    \"tvB th 1\\n\"\n    \"Yjf ij 1\\n\"\n    \"mwP me 1\\n\"\n    \"Jyb be 1\\n\"\n    \"tBc th 1\\n\"\n    \"gSb ng 1\\n\"\n    \"cMl ch 1\\n\"\n    \"gjJ ng 1\\n\"\n    \"dYz de 1\\n\"\n    \"zPg ng 1\\n\"\n    \"kqB qu 1\\n\"\n    \"sFv st 1\\n\"\n    \"xkH ka 1\\n\"\n    \"fZt th 1\\n\"\n    \"yhR th 1\\n\"\n    \"bwN wa 1\\n\"\n    \"qjG qu 1\\n\"\n    \"nQm an 1\\n\"\n    \"qMr qu 1\\n\"\n    \"jcW ch 1\\n\"\n    \"qJv qu 1\\n\"\n    \"gTm ng 1\\n\"\n    \"kmQ ka 1\\n\"\n    \"Wlc ch 1\\n\"\n    \"kYf ka 1\\n\"\n    \"eJp er 1\\n\"\n    \"Tkb ka 1\\n\"\n    \"hfM th 1\\n\"\n    \"nxY an 1\\n\"\n    \"pDl le 1\\n\"\n    \"wcN ch 1\\n\"\n    \"pQa an 1\\n\"\n    
\"ohZ th 1\\n\"\n    \"xRz sz 1\\n\"\n    \"lbV le 1\\n\"\n    \"lKc ch 1\\n\"\n    \"wxB wa 1\\n\"\n    \"Lww wa 1\\n\"\n    \"fqQ qu 1\\n\"\n    \"kkZ ka 1\\n\"\n    \"iwO in 1\\n\"\n    \"dgU ng 1\\n\"\n    \"dvO de 1\\n\"\n    \"pDt th 1\\n\"\n    \"kvK ka 1\\n\"\n    \"jlV le 1\\n\"\n    \"xXd de 1\\n\"\n    \"ykF ku 1\\n\"\n    \"iyT in 1\\n\"\n    \"Ufx fo 1\\n\"\n    \"nzU an 1\\n\"\n    \"xbH bu 1\\n\"\n    \"lSb le 1\\n\"\n    \"Xpf pr 1\\n\"\n    \"Uvf va 1\\n\"\n    \"yyF ny 1\\n\"\n    \"fxP fo 1\\n\"\n    \"jYu qu 1\\n\"\n    \"qjb qu 1\\n\"\n    \"gxL ng 1\\n\"\n    \"pwI pr 1\\n\"\n    \"jUe er 1\\n\"\n    \"rFc ch 1\\n\"\n    \"fsF st 1\\n\"\n    \"cdW ch 1\\n\"\n    \"Xwp pr 1\\n\"\n    \"xdH de 1\\n\"\n    \"jYs ij 1\\n\"\n    \"bFd de 1\\n\"\n    \"qIh th 1\\n\"\n    \"yIg ng 1\\n\"\n    \"vTd de 1\\n\"\n    \"wfE wa 1\\n\"\n    \"qRb qu 1\\n\"\n    \"yhK th 1\\n\"\n    \"kMn an 1\\n\"\n    \"cpB ch 1\\n\"\n    \"txN th 1\\n\"\n    \"kPd de 1\\n\"\n    \"nbB an 1\\n\"\n    \"skQ st 1\\n\"\n    \"uKw qu 1\\n\"\n    \"wQf wa 1\\n\"\n    \"kWf ka 1\\n\"\n    \"wqA qu 1\\n\"\n    \"cwA ch 1\\n\"\n    \"vJk ka 1\\n\"\n    \"hcD th 1\\n\"\n    \"nfK an 1\\n\"\n    \"uXf qu 1\\n\"\n    \"cgA ch 1\\n\"\n    \"Pjd de 1\\n\"\n    \"Lqs qu 1\\n\"\n    \"zwC sz 1\\n\"\n    \"ljN le 1\\n\"\n    \"vkP ka 1\\n\"\n    \"Rqp qu 1\\n\"\n    \"zGx sz 1\\n\"\n    \"jPg ng 1\\n\"\n    \"kbT ka 1\\n\"\n    \"kpQ ka 1\\n\"\n    \"Mzq qu 1\\n\"\n    \"Gjs st 1\\n\"\n    \"kDl le 1\\n\"\n    \"jwR ij 1\\n\"\n    \"Wyq qu 1\\n\"\n    \"qxS qu 1\\n\"\n    \"qGt th 1\\n\"\n    \"Wvr er 1\\n\"\n    \"zNx sz 1\\n\"\n    \"vCm va 1\\n\"\n    \"hlD th 1\\n\"\n    \"vBp va 1\\n\"\n    \"mJc ch 1\\n\"\n    \"hFb th 1\\n\"\n    \"vDm va 1\\n\"\n    \"pfC pr 1\\n\"\n    \"Lpy pr 1\\n\"\n    \"Fhd th 1\\n\"\n    \"dxS de 1\\n\"\n    \"wWg ng 1\\n\"\n    \"Fgn an 1\\n\"\n    \"nFf an 1\\n\"\n    \"cxF ch 1\\n\"\n    \"aVh th 1\\n\"\n    \"Sqx qu 1\\n\"\n    \"Vjz ij 1\\n\"\n    
\"znC an 1\\n\"\n    \"qqv qu 1\\n\"\n    \"zrZ er 1\\n\"\n    \"bNl le 1\\n\"\n    \"nvW an 1\\n\"\n    \"Qyb be 1\\n\"\n    \"Fht th 1\\n\"\n    \"jGv ij 1\\n\"\n    \"gLp ng 1\\n\"\n    \"gLb ng 1\\n\"\n    \"qKj qu 1\\n\"\n    \"hJd th 1\\n\"\n    \"Zjg ng 1\\n\"\n    \"nQq an 1\\n\"\n    \"npX an 1\\n\"\n    \"qiO in 1\\n\"\n    \"vvG va 1\\n\"\n    \"jOx ij 1\\n\"\n    \"hhE th 1\\n\"\n    \"vdN de 1\\n\"\n    \"Czz sz 1\\n\"\n    \"gjU ng 1\\n\"\n    \"hVb th 1\\n\"\n    \"Kcg ch 1\\n\"\n    \"dvH de 1\\n\"\n    \"wtD th 1\\n\"\n    \"jIo on 1\\n\"\n    \"jQa an 1\\n\"\n    \"Fyj ij 1\\n\"\n    \"cpU ch 1\\n\"\n    \"hxY th 1\\n\"\n    \"qbD qu 1\\n\"\n    \"svJ st 1\\n\"\n    \"vjW ij 1\\n\"\n    \"gpY ng 1\\n\"\n    \"qnR an 1\\n\"\n    \"gQn an 1\\n\"\n    \"Cvh th 1\\n\"\n    \"ykB ka 1\\n\"\n    \"xgB ng 1\\n\"\n    \"zfD sz 1\\n\"\n    \"yHw wa 1\\n\"\n    \"qdG qu 1\\n\"\n    \"qTn an 1\\n\"\n    \"lTm le 1\\n\"\n    \"jgB ng 1\\n\"\n    \"gxS ng 1\\n\"\n    \"qPe qu 1\\n\"\n    \"ppQ pr 1\\n\"\n    \"yxW ny 1\\n\"\n    \"Hjk ij 1\\n\"\n    \"kNk ka 1\\n\"\n    \"cnJ an 1\\n\"\n    \"uHd qu 1\\n\"\n    \"jvH ij 1\\n\"\n    \"Ggn ng 1\\n\"\n    \"lbS le 1\\n\"\n    \"Qcx ch 1\\n\"\n    \"cqR ch 1\\n\"\n    \"Jyc ch 1\\n\"\n    \"wRp pr 1\\n\"\n    \"nfA an 1\\n\"\n    \"lXw le 1\\n\"\n    \"cmJ ch 1\\n\"\n    \"Ysw st 1\\n\"\n    \"qQs qu 1\\n\"\n    \"gsX ng 1\\n\"\n    \"cIq ch 1\\n\"\n    \"jjZ ij 1\\n\"\n    \"Llb le 1\\n\"\n    \"mMv va 1\\n\"\n    \"lVh th 1\\n\"\n    \"Fph th 1\\n\"\n    \"Zmm me 1\\n\"\n    \"xMd de 1\\n\"\n    \"Gwb wa 1\\n\"\n    \"Qjv ij 1\\n\"\n    \"lqZ qu 1\\n\"\n    \"zJh th 1\\n\"\n    \"Wky ka 1\\n\"\n    \"hDk th 1\\n\"\n    \"yLg ng 1\\n\"\n    \"dYw de 1\\n\"\n    \"dCq qu 1\\n\"\n    \"Gmj ij 1\\n\"\n    \"xTq qu 1\\n\"\n    \"wkF ka 1\\n\"\n    \"hFp th 1\\n\"\n    \"qnB an 1\\n\"\n    \"xyJ ny 1\\n\"\n    \"nIj an 1\\n\"\n    \"xYd de 1\\n\"\n    \"Wqr qu 1\\n\"\n    \"xqV qu 1\\n\"\n    \"wYk ka 1\\n\"\n    
\"Qdz de 1\\n\"\n    \"fbN be 1\\n\"\n    \"qwY qu 1\\n\"\n    \"Ubx be 1\\n\"\n    \"wtL th 1\\n\"\n    \"nQw an 1\\n\"\n    \"jJk ij 1\\n\"\n    \"Nzs st 1\\n\"\n    \"dCn an 1\\n\"\n    \"Nfv va 1\\n\"\n    \"Hgh th 1\\n\"\n    \"Hcq ch 1\\n\"\n    \"Xvb va 1\\n\"\n    \"sxJ st 1\\n\"\n    \"wMx wa 1\\n\"\n    \"qFn an 1\\n\"\n    \"Gzf sz 1\\n\"\n    \"qfJ qu 1\\n\"\n    \"zdQ de 1\\n\"\n    \"Xgz ng 1\\n\"\n    \"fkI ka 1\\n\"\n    \"pvK va 1\\n\"\n    \"Cqr qu 1\\n\"\n    \"zFd de 1\\n\"\n    \"oHm on 1\\n\"\n    \"aJj an 1\\n\"\n    \"Fzd de 1\\n\"\n    \"dWk de 1\\n\"\n    \"wmE me 1\\n\"\n    \"sMl le 1\\n\"\n    \"tBp th 1\\n\"\n    \"vNw va 1\\n\"\n    \"Qdh th 1\\n\"\n    \"whG th 1\\n\"\n    \"qAp qu 1\\n\"\n    \"jrM er 1\\n\"\n    \"rHw er 1\\n\"\n    \"Lvc ch 1\\n\"\n    \"gRn an 1\\n\"\n    \"yjV ij 1\\n\"\n    \"hRk th 1\\n\"\n    \"bkV ka 1\\n\"\n    \"jWm ij 1\\n\"\n    \"yYz sz 1\\n\"\n    \"vTy va 1\\n\"\n    \"dxV de 1\\n\"\n    \"mKy me 1\\n\"\n    \"Qlq qu 1\\n\"\n    \"Upx pr 1\\n\"\n    \"Qpq qu 1\\n\"\n    \"Lwm me 1\\n\"\n    \"yXr er 1\\n\"\n    \"gTk ng 1\\n\"\n    \"qnT an 1\\n\"\n    \"Vlq qu 1\\n\"\n    \"Qqd qu 1\\n\"\n    \"Zdd de 1\\n\"\n    \"Xqt th 1\\n\"\n    \"Dfb be 1\\n\"\n    \"oeO on 1\\n\"\n    \"nCx an 1\\n\"\n    \"lXd le 1\\n\"\n    \"vHc ch 1\\n\"\n    \"vAb va 1\\n\"\n    \"Ybw wa 1\\n\"\n    \"zDn an 1\\n\"\n    \"dGk de 1\\n\"\n    \"plH le 1\\n\"\n    \"lxG le 1\\n\"\n    \"Hgp ng 1\\n\"\n    \"jRz ij 1\\n\"\n    \"dTs de 1\\n\"\n    \"mCj ij 1\\n\"\n    \"lHf le 1\\n\"\n    \"lLj le 1\\n\"\n    \"tNb th 1\\n\"\n    \"mKk ka 1\\n\"\n    \"gGj ng 1\\n\"\n    \"jlQ le 1\\n\"\n    \"Yyg ng 1\\n\"\n    \"fDv va 1\\n\"\n    \"zXg ng 1\\n\"\n    \"qzZ qu 1\\n\"\n    \"fEg ng 1\\n\"\n    \"lhS th 1\\n\"\n    \"mzM sz 1\\n\"\n    \"xqT qu 1\\n\"\n    \"Ycj ch 1\\n\"\n    \"fbF be 1\\n\"\n    \"Xsj ij 1\\n\"\n    \"Lnc an 1\\n\"\n    \"Gqp qu 1\\n\"\n    \"fjO ij 1\\n\"\n    \"zhI th 1\\n\"\n    \"zgH ng 1\\n\"\n    
\"gWc ch 1\\n\"\n    \"yKf ny 1\\n\"\n    \"uQd qu 1\\n\"\n    \"Kwl le 1\\n\"\n    \"dxG de 1\\n\"\n    \"Yqw qu 1\\n\"\n    \"tKc th 1\\n\"\n    \"cWn an 1\\n\"\n    \"hcI th 1\\n\"\n    \"wfY wa 1\\n\"\n    \"rBp er 1\\n\"\n    \"cJd ch 1\\n\"\n    \"sYf sz 1\\n\"\n    \"Sqj qu 1\\n\"\n    \"kQv ka 1\\n\"\n    \"xpF pr 1\\n\"\n    \"fcX ch 1\\n\"\n    \"yfK ny 1\\n\"\n    \"jQo on 1\\n\"\n    \"gTg ng 1\\n\"\n    \"Qwn an 1\\n\"\n    \"Pnx an 1\\n\"\n    \"yZt th 1\\n\"\n    \"wPz sz 1\\n\"\n    \"juX qu 1\\n\"\n    \"Lxv va 1\\n\"\n    \"iXr in 1\\n\"\n    \"pcE ch 1\\n\"\n    \"Nqy qu 1\\n\"\n    \"hjI th 1\\n\"\n    \"hzV th 1\\n\"\n    \"nmF an 1\\n\"\n    \"pvW va 1\\n\"\n    \"eJw er 1\\n\"\n    \"Iqd qu 1\\n\"\n    \"gXy ng 1\\n\"\n    \"wfW wa 1\\n\"\n    \"Vdw de 1\\n\"\n    \"qJx qu 1\\n\"\n    \"Pdq qu 1\\n\"\n    \"Bjb ij 1\\n\"\n    \"qLl qu 1\\n\"\n    \"zdW de 1\\n\"\n    \"fQr er 1\\n\"\n    \"xzW sz 1\\n\"\n    \"vwQ va 1\\n\"\n    \"rwU er 1\\n\"\n    \"qPn an 1\\n\"\n    \"bFw wa 1\\n\"\n    \"vHl le 1\\n\"\n    \"hWl th 1\\n\"\n    \"wgO ng 1\\n\"\n    \"hLk th 1\\n\"\n    \"Jkb ka 1\\n\"\n    \"zBh th 1\\n\"\n    \"Dhx th 1\\n\"\n    \"Fgv ng 1\\n\"\n    \"bpA pr 1\\n\"\n    \"zxC sz 1\\n\"\n    \"gfS ng 1\\n\"\n    \"Mvx va 1\\n\"\n    \"uPk qu 1\\n\"\n    \"Vqn an 1\\n\"\n    \"yqC qu 1\\n\"\n    \"vMk ka 1\\n\"\n    \"wqL qu 1\\n\"\n    \"wrJ er 1\\n\"\n    \"cdN ch 1\\n\"\n    \"pwR pr 1\\n\"\n    \"hMf th 1\\n\"\n    \"jPf ij 1\\n\"\n    \"Vbv va 1\\n\"\n    \"qzF qu 1\\n\"\n    \"qNc ch 1\\n\"\n    \"Jbq qu 1\\n\"\n    \"fTk ka 1\\n\"\n    \"Zff fo 1\\n\"\n    \"Fzt th 1\\n\"\n    \"Kcw ch 1\\n\"\n    \"eKf er 1\\n\"\n    \"pqZ qu 1\\n\"\n    \"Wpb pr 1\\n\"\n    \"jkF ij 1\\n\"\n    \"Vxp pr 1\\n\"\n    \"hGq th 1\\n\"\n    \"qBc ch 1\\n\"\n    \"fcT ch 1\\n\"\n    \"jMq qu 1\\n\"\n    \"kZv ka 1\\n\"\n    \"qkG qu 1\\n\"\n    \"Ifp pr 1\\n\"\n    \"dRw de 1\\n\"\n    \"Zlj le 1\\n\"\n    \"Kwj ij 1\\n\"\n    \"fNb be 1\\n\"\n    
\"dYy de 1\\n\"\n    \"hZl th 1\\n\"\n    \"wtP th 1\\n\"\n    \"hPz th 1\\n\"\n    \"Ykc ch 1\\n\"\n    \"Jlw le 1\\n\"\n    \"jNt th 1\\n\"\n    \"yrW er 1\\n\"\n    \"gWd ng 1\\n\"\n    \"yXd de 1\\n\"\n    \"fQl le 1\\n\"\n    \"jfF ij 1\\n\"\n    \"Ejx ij 1\\n\"\n    \"fGk ka 1\\n\"\n    \"Zjz ij 1\\n\"\n    \"wdM de 1\\n\"\n    \"jlF le 1\\n\"\n    \"cxZ ch 1\\n\"\n    \"Zgk ng 1\\n\"\n    \"mcJ ch 1\\n\"\n    \"slE le 1\\n\"\n    \"nYq an 1\\n\"\n    \"Wfg ng 1\\n\"\n    \"zJk ka 1\\n\"\n    \"bvF va 1\\n\"\n    \"Hnz an 1\\n\"\n    \"Wkv ka 1\\n\"\n    \"Mvq qu 1\\n\"\n    \"Dxh th 1\\n\"\n    \"Bvt th 1\\n\"\n    \"sMj ij 1\\n\"\n    \"wRf wa 1\\n\"\n    \"vLb va 1\\n\"\n    \"zGq qu 1\\n\"\n    \"mFp me 1\\n\"\n    \"gNb ng 1\\n\"\n    \"pCg ng 1\\n\"\n    \"xFs sz 1\\n\"\n    \"jKf ij 1\\n\"\n    \"qJb qu 1\\n\"\n    \"pzI sz 1\\n\"\n    \"jgG ng 1\\n\"\n    \"pKs sz 1\\n\"\n    \"fqD qu 1\\n\"\n    \"gxQ ng 1\\n\"\n    \"fvG va 1\\n\"\n    \"wgF ng 1\\n\"\n    \"Xxz sz 1\\n\"\n    \"Lwu qu 1\\n\"\n    \"dlX le 1\\n\"\n    \"lPz le 1\\n\"\n    \"Wqk qu 1\\n\"\n    \"Xzj ij 1\\n\"\n    \"uHj qu 1\\n\"\n    \"uFj qu 1\\n\"\n    \"jvV ij 1\\n\"\n    \"jXe le 1\\n\"\n    \"Zfm me 1\\n\"\n    \"qIm qu 1\\n\"\n    \"zbB sz 1\\n\"\n    \"yZf ny 1\\n\"\n    \"sKk sz 1\\n\"\n    \"zpL sz 1\\n\"\n    \"qKg qu 1\\n\"\n    \"Ibj ij 1\\n\"\n    \"iQb in 1\\n\"\n    \"Fxu qu 1\\n\"\n    \"Fpb pr 1\\n\"\n    \"Wva an 1\\n\"\n    \"fzD sz 1\\n\"\n    \"bkT ka 1\\n\"\n    \"Ykt th 1\\n\"\n    \"njG an 1\\n\"\n    \"Uvh th 1\\n\"\n    \"gfT ng 1\\n\"\n    \"zcI ch 1\\n\"\n    \"bDq qu 1\\n\"\n    \"Jdh th 1\\n\"\n    \"xMg ng 1\\n\"\n    \"Jby be 1\\n\"\n    \"lwJ le 1\\n\"\n    \"sWw sz 1\\n\"\n    \"Svw va 1\\n\"\n    \"nrX an 1\\n\"\n    \"uvV qu 1\\n\"\n    \"jVr er 1\\n\"\n    \"tqB th 1\\n\"\n    \"bVr er 1\\n\"\n    \"kQl le 1\\n\"\n    \"fbG be 1\\n\"\n    \"rqM qu 1\\n\"\n    \"zHj ij 1\\n\"\n    \"fhY th 1\\n\"\n    \"Yzr er 1\\n\"\n    \"vFf va 1\\n\"\n    
\"Qpg ng 1\\n\"\n    \"uAq qu 1\\n\"\n    \"zxP sz 1\\n\"\n    \"jCn an 1\\n\"\n    \"qaM an 1\\n\"\n    \"xlY le 1\\n\"\n    \"cTf ch 1\\n\"\n    \"kBf ka 1\\n\"\n    \"cQc ch 1\\n\"\n    \"Rbj ij 1\\n\"\n    \"kVs sz 1\\n\"\n    \"bGv va 1\\n\"\n    \"wdN de 1\\n\"\n    \"gfN ng 1\\n\"\n    \"bPj ij 1\\n\"\n    \"gcI ch 1\\n\"\n    \"gxj ng 1\\n\"\n    \"rHb er 1\\n\"\n    \"pVr er 1\\n\"\n    \"rVj er 1\\n\"\n    \"vgS ng 1\\n\"\n    \"Fqz qu 1\\n\"\n    \"xMk ka 1\\n\"\n    \"qQm qu 1\\n\"\n    \"jZc ch 1\\n\"\n    \"jBc ch 1\\n\"\n    \"uwY qu 1\\n\"\n    \"rHf er 1\\n\"\n    \"czX ch 1\\n\"\n    \"zcT ch 1\\n\"\n    \"bFj ij 1\\n\"\n    \"qcB ch 1\\n\"\n    \"hfT th 1\\n\"\n    \"xqO qu 1\\n\"\n    \"qfp qu 1\\n\"\n    \"xjU ij 1\\n\"\n    \"bhR th 1\\n\"\n    \"tWv th 1\\n\"\n    \"iqE in 1\\n\"\n    \"gpU ng 1\\n\"\n    \"iWb in 1\\n\"\n    \"tlP th 1\\n\"\n    \"tYq th 1\\n\"\n    \"bCv va 1\\n\"\n    \"oKc ch 1\\n\"\n    \"Sgj ng 1\\n\"\n    \"hvq th 1\\n\"\n    \"kfY ka 1\\n\"\n    \"zbM sz 1\\n\"\n    \"zvA sz 1\\n\"\n    \"cHp ch 1\\n\"\n    \"vvK va 1\\n\"\n    \"fpZ pr 1\\n\"\n    \"dfX de 1\\n\"\n    \"wrK er 1\\n\"\n    \"xeE er 1\\n\"\n    \"fkY ka 1\\n\"\n    \"sbX sz 1\\n\"\n    \"fcS ch 1\\n\"\n    \"vKh th 1\\n\"\n    \"Qlx le 1\\n\"\n    \"Zqh th 1\\n\"\n    \"qWg qu 1\\n\"\n    \"cdL ch 1\\n\"\n    \"jvG ij 1\\n\"\n    \"Mgx ng 1\\n\"\n    \"gwF ng 1\\n\"\n    \"kdP de 1\\n\"\n    \"uMr qu 1\\n\"\n    \"tcD th 1\\n\"\n    \"qrL qu 1\\n\"\n    \"Mtm th 1\\n\"\n    \"bQz sz 1\\n\"\n    \"Hpx pr 1\\n\"\n    \"zpI sz 1\\n\"\n    \"jkR ij 1\\n\"\n    \"khH th 1\\n\"\n    \"mSq qu 1\\n\"\n    \"pFz sz 1\\n\"\n    \"juO qu 1\\n\"\n    \"Xyq qu 1\\n\"\n    \"jGd de 1\\n\"\n    \"Yzd de 1\\n\"\n    \"wbC wa 1\\n\"\n    \"wSb wa 1\\n\"\n    \"sZd de 1\\n\"\n    \"Rzx sz 1\\n\"\n    \"Flx le 1\\n\"\n    \"bqC qu 1\\n\"\n    \"lcH ch 1\\n\"\n    \"wmG me 1\\n\"\n    \"zCj ij 1\\n\"\n    \"xaD an 1\\n\"\n    \"iwH in 1\\n\"\n    \"qDp qu 1\\n\"\n    
\"sGx sz 1\\n\"\n    \"Xhy th 1\\n\"\n    \"eVc ch 1\\n\"\n    \"wkJ wa 1\\n\"\n    \"Lcf ch 1\\n\"\n    \"lgQ ng 1\\n\"\n    \"Dhh th 1\\n\"\n    \"zfO sz 1\\n\"\n    \"kVc ch 1\\n\"\n    \"hmL th 1\\n\"\n    \"Owf wa 1\\n\"\n    \"wZc ch 1\\n\"\n    \"dnN an 1\\n\"\n    \"Mzp sz 1\\n\"\n    \"mYw me 1\\n\"\n    \"yLh th 1\\n\"\n    \"Xxr er 1\\n\"\n    \"qwI qu 1\\n\"\n    \"Txs sz 1\\n\"\n    \"yKp pr 1\\n\"\n    \"bjX ij 1\\n\"\n    \"pbS pr 1\\n\"\n    \"zrP er 1\\n\"\n    \"hJm th 1\\n\"\n    \"qgA qu 1\\n\"\n    \"zwY sz 1\\n\"\n    \"rXk er 1\\n\"\n    \"nDx an 1\\n\"\n    \"vGz sz 1\\n\"\n    \"mQq qu 1\\n\"\n    \"upY qu 1\\n\"\n    \"rLn an 1\\n\"\n    \"Vfk ka 1\\n\"\n    \"wCv va 1\\n\"\n    \"cgx ch 1\\n\"\n    \"kZq qu 1\\n\"\n    \"Wjw ij 1\\n\"\n    \"Qax an 1\\n\"\n    \"grG ng 1\\n\"\n    \"bJd de 1\\n\"\n    \"dJx de 1\\n\"\n    \"cMd ch 1\\n\"\n    \"Qcs ch 1\\n\"\n    \"mkK ka 1\\n\"\n    \"jNx ij 1\\n\"\n    \"mrY er 1\\n\"\n    \"Xwx wa 1\\n\"\n    \"rZl er 1\\n\"\n    \"gxU ng 1\\n\"\n    \"Lnv an 1\\n\"\n    \"ygC ng 1\\n\"\n    \"Dqh th 1\\n\"\n    \"lLn an 1\\n\"\n    \"mnQ an 1\\n\"\n    \"kjU ij 1\\n\"\n    \"bvO va 1\\n\"\n    \"oVm on 1\\n\"\n    \"vWt th 1\\n\"\n    \"rGq qu 1\\n\"\n    \"tbJ th 1\\n\"\n    \"fSv va 1\\n\"\n    \"wJn an 1\\n\"\n    \"fJv va 1\\n\"\n    \"oQv on 1\\n\"\n    \"Vws sz 1\\n\"\n    \"pnU an 1\\n\"\n    \"Nmh th 1\\n\"\n    \"cTq ch 1\\n\"\n    \"Edx de 1\\n\"\n    \"uqw qu 1\\n\"\n    \"Yrh th 1\\n\"\n    \"Qnx an 1\\n\"\n    \"mJf me 1\\n\"\n    \"kDq qu 1\\n\"\n    \"Xhd th 1\\n\"\n    \"nLx an 1\\n\"\n    \"xkU ka 1\\n\"\n    \"fqT qu 1\\n\"\n    \"qYh th 1\\n\"\n    \"bFv va 1\\n\"\n    \"xbQ be 1\\n\"\n    \"vcS ch 1\\n\"\n    \"qqT qu 1\\n\"\n    \"gkF ng 1\\n\"\n    \"zFh th 1\\n\"\n    \"kpE ka 1\\n\"\n    \"Gxb be 1\\n\"\n    \"Ztw th 1\\n\"\n    \"qIl qu 1\\n\"\n    \"Qkd de 1\\n\"\n    \"wdV de 1\\n\"\n    \"rwP er 1\\n\"\n    \"aCg an 1\\n\"\n    \"Zrs er 1\\n\"\n    \"zmW sz 1\\n\"\n    
\"vfO va 1\\n\"\n    \"hBj th 1\\n\"\n    \"tbH th 1\\n\"\n    \"Dxv va 1\\n\"\n    \"zdD de 1\\n\"\n    \"nBw an 1\\n\"\n    \"lrV er 1\\n\"\n    \"gQq ng 1\\n\"\n    \"tlK th 1\\n\"\n    \"ztP th 1\\n\"\n    \"yqV qu 1\\n\"\n    \"nRm an 1\\n\"\n    \"jVz sz 1\\n\"\n    \"Crq er 1\\n\"\n    \"fFg ng 1\\n\"\n    \"Xjg ng 1\\n\"\n    \"Cml le 1\\n\"\n    \"qWj qu 1\\n\"\n    \"jzO ij 1\\n\"\n    \"Mdq qu 1\\n\"\n    \"mtQ th 1\\n\"\n    \"rGv er 1\\n\"\n    \"kGn an 1\\n\"\n    \"mLg ng 1\\n\"\n    \"uWj qu 1\\n\"\n    \"Rcq ch 1\\n\"\n    \"cVp ch 1\\n\"\n    \"bWk ka 1\\n\"\n    \"Xzx sz 1\\n\"\n    \"Wkb ka 1\\n\"\n    \"xzH sz 1\\n\"\n    \"quP un 1\\n\"\n    \"dHv de 1\\n\"\n    \"Dmq qu 1\\n\"\n    \"Dgv ng 1\\n\"\n    \"tgY th 1\\n\"\n    \"jtM th 1\\n\"\n    \"tMz th 1\\n\"\n    \"bHm me 1\\n\"\n    \"Zfk ka 1\\n\"\n    \"xZp pr 1\\n\"\n    \"jkH ij 1\\n\"\n    \"rNp er 1\\n\"\n    \"xMv va 1\\n\"\n    \"wpF pr 1\\n\"\n    \"djD de 1\\n\"\n    \"bxV be 1\\n\"\n    \"hgS th 1\\n\"\n    \"Pkh th 1\\n\"\n    \"Dxq qu 1\\n\"\n    \"mMx me 1\\n\"\n    \"dGj de 1\\n\"\n    \"kbH ka 1\\n\"\n    \"Lhg th 1\\n\"\n    \"Dvq qu 1\\n\"\n    \"qrT qu 1\\n\"\n    \"Ijw ij 1\\n\"\n    \"wuI qu 1\\n\"\n    \"Zwn an 1\\n\"\n    \"dhJ th 1\\n\"\n    \"qcR ch 1\\n\"\n    \"whM th 1\\n\"\n    \"pgP ng 1\\n\"\n    \"qkR qu 1\\n\"\n    \"sqR qu 1\\n\"\n    \"lxY le 1\\n\"\n    \"vVw va 1\\n\"\n    \"lKd le 1\\n\"\n    \"Nly le 1\\n\"\n    \"yKz sz 1\\n\"\n    \"qBb qu 1\\n\"\n    \"wQx wa 1\\n\"\n    \"kYw ka 1\\n\"\n    \"fQd de 1\\n\"\n    \"svW sz 1\\n\"\n    \"yGp pr 1\\n\"\n    \"ytB th 1\\n\"\n    \"jvU ij 1\\n\"\n    \"kjz ka 1\\n\"\n    \"jVc ch 1\\n\"\n    \"Qbz sz 1\\n\"\n    \"pqM qu 1\\n\"\n    \"vwu ku 1\\n\"\n    \"Qww wa 1\\n\"\n    \"dcZ ch 1\\n\"\n    \"lhG th 1\\n\"\n    \"gmS ng 1\\n\"\n    \"Iqz qu 1\\n\"\n    \"zZf sz 1\\n\"\n    \"hLn th 1\\n\"\n    \"eMf er 1\\n\"\n    \"xNq qu 1\\n\"\n    \"mPm um 1\\n\"\n    \"pMg ng 1\\n\"\n    \"wzW sz 1\\n\"\n    
\"kRl le 1\\n\"\n    \"hzK th 1\\n\"\n    \"fbO be 1\\n\"\n    \"Xxt th 1\\n\"\n    \"Fnx an 1\\n\"\n    \"Bvn an 1\\n\"\n    \"bjZ ij 1\\n\"\n    \"tcY th 1\\n\"\n    \"dmB de 1\\n\"\n    \"qFe qu 1\\n\"\n    \"kxB ka 1\\n\"\n    \"qBz qu 1\\n\"\n    \"pVp pr 1\\n\"\n    \"boQ on 1\\n\"\n    \"xoH on 1\\n\"\n    \"dWg de 1\\n\"\n    \"Tdq qu 1\\n\"\n    \"zNq qu 1\\n\"\n    \"vYp va 1\\n\"\n    \"pDf pr 1\\n\"\n    \"lwG le 1\\n\"\n    \"hDq th 1\\n\"\n    \"Jdy de 1\\n\"\n    \"snZ an 1\\n\"\n    \"mzU sz 1\\n\"\n    \"zKx sz 1\\n\"\n    \"rvC er 1\\n\"\n    \"wuS qu 1\\n\"\n    \"dnQ an 1\\n\"\n    \"vCy va 1\\n\"\n    \"Udw wa 1\\n\"\n    \"bTl le 1\\n\"\n    \"qbC qu 1\\n\"\n    \"tbT th 1\\n\"\n    \"iDk ka 1\\n\"\n    \"Whb th 1\\n\"\n    \"tbX th 1\\n\"\n    \"tfO th 1\\n\"\n    \"Tfq qu 1\\n\"\n    \"dbW de 1\\n\"\n    \"Bdy de 1\\n\"\n    \"vjR ij 1\\n\"\n    \"cbC ch 1\\n\"\n    \"wuW qu 1\\n\"\n    \"wCw wa 1\\n\"\n    \"Wdq qu 1\\n\"\n    \"vRb va 1\\n\"\n    \"bWm me 1\\n\"\n    \"vZw va 1\\n\"\n    \"dJj de 1\\n\"\n    \"qZy qu 1\\n\"\n    \"Jgq ng 1\\n\"\n    \"zbH sz 1\\n\"\n    \"hJl th 1\\n\"\n    \"Xhg th 1\\n\"\n    \"nVp an 1\\n\"\n    \"dVc ch 1\\n\"\n    \"qCc ch 1\\n\"\n    \"oYg ng 1\\n\"\n    \"kwH ka 1\\n\"\n    \"vwN va 1\\n\"\n    \"zfw sz 1\\n\"\n    \"vlO le 1\\n\"\n    \"ztX ti 1\\n\"\n    \"dKx de 1\\n\"\n    \"xQs sz 1\\n\"\n    \"cDl ch 1\\n\"\n    \"yVv va 1\\n\"\n    \"zpN sz 1\\n\"\n    \"xkG ka 1\\n\"\n    \"eqW qu 1\\n\"\n    \"jdD di 1\\n\"\n    \"fQm me 1\\n\"\n    \"Yhl th 1\\n\"\n    \"tBf th 1\\n\"\n    \"qEf qu 1\\n\"\n    \"whX th 1\\n\"\n    \"Vgv ng 1\\n\"\n    \"Lsq qu 1\\n\"\n    \"dfJ de 1\\n\"\n    \"Zdp de 1\\n\"\n    \"rZc ch 1\\n\"\n    \"tZh ch 1\\n\"\n    \"mtC th 1\\n\"\n    \"zxQ sz 1\\n\"\n    \"Vnj an 1\\n\"\n    \"sHg ng 1\\n\"\n    \"wYl le 1\\n\"\n    \"Bqb qu 1\\n\"\n    \"yrV er 1\\n\"\n    \"Ycs ch 1\\n\"\n    \"jRw ij 1\\n\"\n    \"iWt th 1\\n\"\n    \"hVw th 1\\n\"\n    \"wZs sz 1\\n\"\n    
\"Cqo qu 1\\n\"\n    \"Gfn an 1\\n\"\n    \"rBv er 1\\n\"\n    \"Ojz sz 1\\n\"\n    \"zGf sz 1\\n\"\n    \"bZc ch 1\\n\"\n    \"Fvd de 1\\n\"\n    \"Zgs ng 1\\n\"\n    \"Rfg ng 1\\n\"\n    \"Rww wa 1\\n\"\n    \"Yrp er 1\\n\"\n    \"iFp in 1\\n\"\n    \"bVx be 1\\n\"\n    \"zfM sz 1\\n\"\n    \"qdV qu 1\\n\"\n    \"bGm me 1\\n\"\n    \"tnJ th 1\\n\"\n    \"pdR de 1\\n\"\n    \"gBc ch 1\\n\"\n    \"gzC ng 1\\n\"\n    \"Pwc ch 1\\n\"\n    \"uAw qu 1\\n\"\n    \"znX an 1\\n\"\n    \"vgT ng 1\\n\"\n    \"oAw ko 1\\n\"\n    \"xBm me 1\\n\"\n    \"dNf de 1\\n\"\n    \"Pqs qu 1\\n\"\n    \"Npd di 1\\n\"\n    \"oUy ko 1\\n\"\n    \"fpD pr 1\\n\"\n    \"Rfx fo 1\\n\"\n    \"lXm le 1\\n\"\n    \"qWs qu 1\\n\"\n    \"gWv vi 1\\n\"\n    \"Fwv va 1\\n\"\n    \"Lqj qu 1\\n\"\n    \"fvQ va 1\\n\"\n    \"zgB ng 1\\n\"\n    \"kJl le 1\\n\"\n    \"vWo on 1\\n\"\n    \"Xvc ch 1\\n\"\n    \"yDq qu 1\\n\"\n    \"bdP de 1\\n\"\n    \"jVf ij 1\\n\"\n    \"wPw wa 1\\n\"\n    \"dwA de 1\\n\"\n    \"Oqp qu 1\\n\"\n    \"qiZ in 1\\n\"\n    \"xdV de 1\\n\"\n    \"qFg ng 1\\n\"\n    \"qzI qu 1\\n\"\n    \"ywL wa 1\\n\"\n    \"sWv sz 1\\n\"\n    \"Tpy pr 1\\n\"\n    \"wbf wa 1\\n\"\n    \"uPg ng 1\\n\"\n    \"Knw an 1\\n\"\n    \"iuO in 1\\n\"\n    \"Qdn an 1\\n\"\n    \"Yfv va 1\\n\"\n    \"wuK qu 1\\n\"\n    \"xLn an 1\\n\"\n    \"yJg ng 1\\n\"\n    \"Nfk ka 1\\n\"\n    \"Yql qu 1\\n\"\n    \"qsH qu 1\\n\"\n    \"Rzv sz 1\\n\"\n    \"bIp pr 1\\n\"\n    \"sQt th 1\\n\"\n    \"tgC th 1\\n\"\n    \"qSa an 1\\n\"\n    \"fxQ fo 1\\n\"\n    \"hcZ th 1\\n\"\n    \"wbJ wa 1\\n\"\n    \"qRl qu 1\\n\"\n    \"Gcy ch 1\\n\"\n    \"vZm va 1\\n\"\n    \"Xzl le 1\\n\"\n    \"wgR ng 1\\n\"\n    \"dlO le 1\\n\"\n    \"tCb th 1\\n\"\n    \"qmY qu 1\\n\"\n    \"qZx qu 1\\n\"\n    \"Lbp pr 1\\n\"\n    \"Dgq ng 1\\n\"\n    \"Vkj ij 1\\n\"\n    \"wqU qu 1\\n\"\n    \"Mqk qu 1\\n\"\n    \"wUv va 1\\n\"\n    \"qgC ng 1\\n\"\n    \"sbD sz 1\\n\"\n    \"Sqy qu 1\\n\"\n    \"bMq qu 1\\n\"\n    \"Bzt th 1\\n\"\n    
\"sIq qu 1\\n\"\n    \"cVj ch 1\\n\"\n    \"wJt th 1\\n\"\n    \"Xjm ij 1\\n\"\n    \"Hmg ng 1\\n\"\n    \"aQd an 1\\n\"\n    \"iHt th 1\\n\"\n    \"fMm me 1\\n\"\n    \"wWc ch 1\\n\"\n    \"fuE qu 1\\n\"\n    \"mCf me 1\\n\"\n    \"qnP an 1\\n\"\n    \"zLn an 1\\n\"\n    \"kRt th 1\\n\"\n    \"Mvl le 1\\n\"\n    \"mRd de 1\\n\"\n    \"yfJ ny 1\\n\"\n    \"xCb be 1\\n\"\n    \"sQb sz 1\\n\"\n    \"quC un 1\\n\"\n    \"Ctc th 1\\n\"\n    \"pPv va 1\\n\"\n    \"zjI sz 1\\n\"\n    \"xmC me 1\\n\"\n    \"xdJ de 1\\n\"\n    \"nXv an 1\\n\"\n    \"vsO sz 1\\n\"\n    \"pRd de 1\\n\"\n    \"vbF va 1\\n\"\n    \"wNl le 1\\n\"\n    \"kHq qu 1\\n\"\n    \"rwM er 1\\n\"\n    \"gxD ng 1\\n\"\n    \"Qhi th 1\\n\"\n    \"mqB qu 1\\n\"\n    \"pnL an 1\\n\"\n    \"bKb be 1\\n\"\n    \"iqN in 1\\n\"\n    \"dkX de 1\\n\"\n    \"bQd de 1\\n\"\n    \"bNj ij 1\\n\"\n    \"Tlk le 1\\n\"\n    \"Nlg ng 1\\n\"\n    \"Cxh th 1\\n\"\n    \"Mqf qu 1\\n\"\n    \"Pvj ij 1\\n\"\n    \"zwZ sz 1\\n\"\n    \"pGb pr 1\\n\"\n    \"nrF an 1\\n\"\n    \"bkS ka 1\\n\"\n    \"dRv de 1\\n\"\n    \"jJm ij 1\\n\"\n    \"iqF in 1\\n\"\n    \"fGc ch 1\\n\"\n    \"nxW an 1\\n\"\n    \"xsW sz 1\\n\"\n    \"mfQ me 1\\n\"\n    \"fgP ng 1\\n\"\n    \"jlH le 1\\n\"\n    \"nrI an 1\\n\"\n    \"kXv ka 1\\n\"\n    \"Vpq qu 1\\n\"\n    \"zMk sz 1\\n\"\n    \"pHf pr 1\\n\"\n    \"jdM de 1\\n\"\n    \"bqJ qu 1\\n\"\n    \"Ckt th 1\\n\"\n    \"zKv sz 1\\n\"\n    \"jzG sz 1\\n\"\n    \"uIx qu 1\\n\"\n    \"yNm me 1\\n\"\n    \"jYt th 1\\n\"\n    \"fwL wa 1\\n\"\n    \"dZx de 1\\n\"\n    \"vgF ng 1\\n\"\n    \"wXi in 1\\n\"\n    \"vZt th 1\\n\"\n    \"Ctf th 1\\n\"\n    \"xqC qu 1\\n\"\n    \"qOc ch 1\\n\"\n    \"ygX ng 1\\n\"\n    \"kWk ka 1\\n\"\n    \"grF ng 1\\n\"\n    \"qnX an 1\\n\"\n    \"xUi in 1\\n\"\n    \"pmC me 1\\n\"\n    \"uzE qu 1\\n\"\n    \"Ivw va 1\\n\"\n    \"gvI ng 1\\n\"\n    \"knZ an 1\\n\"\n    \"lxZ le 1\\n\"\n    \"Xwf wa 1\\n\"\n    \"Dqb qu 1\\n\"\n    \"yKg ng 1\\n\"\n    \"Vwg ng 1\\n\"\n    
\"xSb be 1\\n\"\n    \"Hwp pr 1\\n\"\n    \"yNx ny 1\\n\"\n    \"yoQ on 1\\n\"\n    \"cSx ch 1\\n\"\n    \"Evq qu 1\\n\"\n    \"tIw th 1\\n\"\n    \"dfZ de 1\\n\"\n    \"hzP th 1\\n\"\n    \"xBk ka 1\\n\"\n    \"kqr qu 1\\n\"\n    \"yBm me 1\\n\"\n    \"lJj le 1\\n\"\n    \"cjq ch 1\\n\"\n    \"drW er 1\\n\"\n    \"qaD an 1\\n\"\n    \"wDf wa 1\\n\"\n    \"Lxz sz 1\\n\"\n    \"zQf fo 1\\n\"\n    \"Jtq th 1\\n\"\n    \"qRv qu 1\\n\"\n    \"Gfc ch 1\\n\"\n    \"Xbt th 1\\n\"\n    \"wZb wa 1\\n\"\n    \"srQ er 1\\n\"\n    \"gJq ng 1\\n\"\n    \"jFt th 1\\n\"\n    \"gNc ch 1\\n\"\n    \"Rkr er 1\\n\"\n    \"pzJ sz 1\\n\"\n    \"lbA le 1\\n\"\n    \"cBq ch 1\\n\"\n    \"Kyq qu 1\\n\"\n    \"xcO ch 1\\n\"\n    \"zXr er 1\\n\"\n    \"cVs ch 1\\n\"\n    \"rYm er 1\\n\"\n    \"kVm ka 1\\n\"\n    \"fcZ ch 1\\n\"\n    \"fzC sz 1\\n\"\n    \"tKp th 1\\n\"\n    \"gPz ng 1\\n\"\n    \"qcL ch 1\\n\"\n    \"Yjr er 1\\n\"\n    \"zxU sz 1\\n\"\n    \"xbT be 1\\n\"\n    \"nvX an 1\\n\"\n    \"qmR qu 1\\n\"\n    \"bxL be 1\\n\"\n    \"Xww wa 1\\n\"\n    \"jSf ij 1\\n\"\n    \"lNf le 1\\n\"\n    \"zTs sz 1\\n\"\n    \"kFq qu 1\\n\"\n    \"qLz qu 1\\n\"\n    \"rrX er 1\\n\"\n    \"wXg ng 1\\n\"\n    \"zvE sz 1\\n\"\n    \"Hwx wa 1\\n\"\n    \"qFm qu 1\\n\"\n    \"cgR ch 1\\n\"\n    \"pDp pr 1\\n\"\n    \"Oqb qu 1\\n\"\n    \"sVc ch 1\\n\"\n    \"Xtx th 1\\n\"\n    \"Qwt th 1\\n\"\n    \"Wfe er 1\\n\"\n    \"Pcx ch 1\\n\"\n    \"bpO pr 1\\n\"\n    \"Cwg ng 1\\n\"\n    \"wxO wa 1\\n\"\n    \"bVs sz 1\\n\"\n    \"jFw ij 1\\n\"\n    \"fnF an 1\\n\"\n    \"kxH ka 1\\n\"\n    \"Yws sz 1\\n\"\n    \"gdD ng 1\\n\"\n    \"jWx ij 1\\n\"\n    \"cTl ch 1\\n\"\n    \"kmW ka 1\\n\"\n    \"mhW th 1\\n\"\n    \"bzT sz 1\\n\"\n    \"rvJ er 1\\n\"\n    \"xcJ ch 1\\n\"\n    \"vkS ka 1\\n\"\n    \"sXr er 1\\n\"\n    \"sCv sz 1\\n\"\n    \"Ntp th 1\\n\"\n    \"oHh lo 1\\n\"\n    \"Yvs sz 1\\n\"\n    \"pVf pr 1\\n\"\n    \"kEq qu 1\\n\"\n    \"qfE qu 1\\n\"\n    \"oWm on 1\\n\"\n    \"tMw th 1\\n\"\n    
\"zYp sz 1\\n\"\n    \"nFw an 1\\n\"\n    \"yQc ch 1\\n\"\n    \"zQj sz 1\\n\"\n    \"wKq qu 1\\n\"\n    \"mKf me 1\\n\"\n    \"uLr qu 1\\n\"\n    \"wIb wa 1\\n\"\n    \"wrH er 1\\n\"\n    \"pgL ng 1\\n\"\n    \"Lbt th 1\\n\"\n    \"zjF sz 1\\n\"\n    \"qFp qu 1\\n\"\n    \"zdX de 1\\n\"\n    \"wTc ch 1\\n\"\n    \"Jwl le 1\\n\"\n    \"lxU le 1\\n\"\n    \"hjA th 1\\n\"\n    \"iPg in 1\\n\"\n    \"Xns an 1\\n\"\n    \"wkW ka 1\\n\"\n    \"pfP pr 1\\n\"\n    \"Dyq qu 1\\n\"\n    \"jWu qu 1\\n\"\n    \"qzR qu 1\\n\"\n    \"Yjz sz 1\\n\"\n    \"twX th 1\\n\"\n    \"Nwj ij 1\\n\"\n    \"jbB ij 1\\n\"\n    \"qwR qu 1\\n\"\n    \"Ytf th 1\\n\"\n    \"blX le 1\\n\"\n    \"xZk ka 1\\n\"\n    \"Ymw me 1\\n\"\n    \"wfX wa 1\\n\"\n    \"Vqy qu 1\\n\"\n    \"Xqn an 1\\n\"\n    \"yUw wa 1\\n\"\n    \"jzT jo 1\\n\"\n    \"kNt th 1\\n\"\n    \"pmQ me 1\\n\"\n    \"dXr er 1\\n\"\n    \"ylq qu 1\\n\"\n    \"tWz th 1\\n\"\n    \"Kvr er 1\\n\"\n    \"bhQ th 1\\n\"\n    \"uJn an 1\\n\"\n    \"pbT pr 1\\n\"\n    \"aBf an 1\\n\"\n    \"Rhj th 1\\n\"\n    \"uAx qu 1\\n\"\n    \"Bgx ng 1\\n\"\n    \"jqN qu 1\\n\"\n    \"jdC ij 1\\n\"\n    \"fBs st 1\\n\"\n    \"cXk ch 1\\n\"\n    \"nmM an 1\\n\"\n    \"xRr er 1\\n\"\n    \"Hkz sz 1\\n\"\n    \"dhZ th 1\\n\"\n    \"Fyp pr 1\\n\"\n    \"kGm ka 1\\n\"\n    \"sGq qu 1\\n\"\n    \"jKh th 1\\n\"\n    \"vDz sz 1\\n\"\n    \"vLq qu 1\\n\"\n    \"lJs le 1\\n\"\n    \"zNn an 1\\n\"\n    \"Wgj ng 1\\n\"\n    \"jmL ij 1\\n\"\n    \"gVt th 1\\n\"\n    \"wFz sz 1\\n\"\n    \"zbD sz 1\\n\"\n    \"kTd de 1\\n\"\n    \"dwX de 1\\n\"\n    \"xRl le 1\\n\"\n    \"Azv sz 1\\n\"\n    \"bQh th 1\\n\"\n    \"qQf qu 1\\n\"\n    \"yoZ on 1\\n\"\n    \"jPs sz 1\\n\"\n    \"jyG ij 1\\n\"\n    \"kXj ka 1\\n\"\n    \"yBv va 1\\n\"\n    \"nwP an 1\\n\"\n    \"xnA an 1\\n\"\n    \"bKf be 1\\n\"\n    \"qbP qu 1\\n\"\n    \"vGs sz 1\\n\"\n    \"jjG ij 1\\n\"\n    \"Kqc ch 1\\n\"\n    \"zVt th 1\\n\"\n    \"wSg ng 1\\n\"\n    \"sWm sz 1\\n\"\n    \"fDg ng 1\\n\"\n    
\"pHz sz 1\\n\"\n    \"fYp pr 1\\n\"\n    \"zrW er 1\\n\"\n    \"lDx le 1\\n\"\n    \"hQh th 1\\n\"\n    \"Bdp de 1\\n\"\n    \"fqZ qu 1\\n\"\n    \"oQm on 1\\n\"\n    \"Qsq qu 1\\n\"\n    \"xjq qu 1\\n\"\n    \"Mfv va 1\\n\"\n    \"zbQ sz 1\\n\"\n    \"quR un 1\\n\"\n    \"cMb ch 1\\n\"\n    \"zqD qu 1\\n\"\n    \"dXf de 1\\n\"\n    \"rHh th 1\\n\"\n    \"jhF th 1\\n\"\n    \"nNf an 1\\n\"\n    \"wHb wa 1\\n\"\n    \"Tpq qu 1\\n\"\n    \"bjY ij 1\\n\"\n    \"cJq ch 1\\n\"\n    \"lCk le 1\\n\"\n    \"Pfp pr 1\\n\"\n    \"Oqn an 1\\n\"\n    \"fmR me 1\\n\"\n    \"Qpu qu 1\\n\"\n    \"Ncv ch 1\\n\"\n    \"qYr qu 1\\n\"\n    \"sfA sz 1\\n\"\n    \"frS er 1\\n\"\n    \"Gpf pr 1\\n\"\n    \"jmD ij 1\\n\"\n    \"hwI th 1\\n\"\n    \"Rbz sz 1\\n\"\n    \"jhB th 1\\n\"\n    \"xXj ij 1\\n\"\n    \"qYd qu 1\\n\"\n    \"sVf sz 1\\n\"\n    \"cCz ch 1\\n\"\n    \"qMl qu 1\\n\"\n    \"fpK pr 1\\n\"\n    \"hVy th 1\\n\"\n    \"lcJ ch 1\\n\"\n    \"Okj ij 1\\n\"\n    \"qJg ng 1\\n\"\n    \"jLp ij 1\\n\"\n    \"nYf an 1\\n\"\n    \"npF on 1\\n\"\n    \"rWk er 1\\n\"\n    \"mcP ch 1\\n\"\n    \"nZm an 1\\n\"\n    \"fYb fo 1\\n\"\n    \"zbC sz 1\\n\"\n    \"nBq an 1\\n\"\n    \"fjy ij 1\\n\"\n    \"bIx be 1\\n\"\n    \"twN th 1\\n\"\n    \"Ggk ng 1\\n\"\n    \"Czm sz 1\\n\"\n    \"jtO th 1\\n\"\n    \"nRl an 1\\n\"\n    \"jyC ij 1\\n\"\n    \"yEh th 1\\n\"\n    \"vmH va 1\\n\"\n    \"wtQ th 1\\n\"\n    \"wIf wa 1\\n\"\n    \"jIf ij 1\\n\"\n    \"qbM qu 1\\n\"\n    \"Rwq qu 1\\n\"\n    \"fqF qu 1\\n\"\n    \"Wfj ij 1\\n\"\n    \"jfW ij 1\\n\"\n    \"wWm me 1\\n\"\n    \"Wpp pr 1\\n\"\n    \"Mgj ng 1\\n\"\n    \"dSf de 1\\n\"\n    \"wYv va 1\\n\"\n    \"ccI ch 1\\n\"\n    \"ylT le 1\\n\"\n    \"Gqh th 1\\n\"\n    \"Cmz sz 1\\n\"\n    \"Hfk ka 1\\n\"\n    \"qBt th 1\\n\"\n    \"yCf ny 1\\n\"\n    \"qzO qu 1\\n\"\n    \"ydF de 1\\n\"\n    \"Vdt th 1\\n\"\n    \"pJd de 1\\n\"\n    \"sfR sz 1\\n\"\n    \"dlV le 1\\n\"\n    \"jOd de 1\\n\"\n    \"nfF an 1\\n\"\n    \"wTt th 1\\n\"\n    
\"rGk er 1\\n\"\n    \"xAw wa 1\\n\"\n    \"vfF va 1\\n\"\n    \"Dzg ng 1\\n\"\n    \"kFp ka 1\\n\"\n    \"jTm ij 1\\n\"\n    \"nNq an 1\\n\"\n    \"qcN ch 1\\n\"\n    \"Jjx ij 1\\n\"\n    \"tKf th 1\\n\"\n    \"Zrq qu 1\\n\"\n    \"hmK th 1\\n\"\n    \"Mqz qu 1\\n\"\n    \"xfR fo 1\\n\"\n    \"wQq qu 1\\n\"\n    \"mqG qu 1\\n\"\n    \"xUr er 1\\n\"\n    \"oiU in 1\\n\"\n    \"qsS qu 1\\n\"\n    \"qGg ng 1\\n\"\n    \"qtO th 1\\n\"\n    \"tPb th 1\\n\"\n    \"Rqm qu 1\\n\"\n    \"vkX ka 1\\n\"\n    \"Wsb st 1\\n\"\n    \"cxR ch 1\\n\"\n    \"fZr er 1\\n\"\n    \"yQg ng 1\\n\"\n    \"ziU in 1\\n\"\n    \"xvW va 1\\n\"\n    \"aDx an 1\\n\"\n    \"bQj ij 1\\n\"\n    \"jxC ij 1\\n\"\n    \"Twk ka 1\\n\"\n    \"sQh th 1\\n\"\n    \"Bfx fo 1\\n\"\n    \"aGj an 1\\n\"\n    \"Pgc ch 1\\n\"\n    \"Hzh th 1\\n\"\n    \"qgW ng 1\\n\"\n    \"kdF de 1\\n\"\n    \"kbY ka 1\\n\"\n    \"Qjx ij 1\\n\"\n    \"Hxj ij 1\\n\"\n    \"tVx th 1\\n\"\n    \"nxZ an 1\\n\"\n    \"oVd on 1\\n\"\n    \"Hlq qu 1\\n\"\n    \"jKz sz 1\\n\"\n    \"qAi in 1\\n\"\n    \"dNl le 1\\n\"\n    \"pqA qu 1\\n\"\n    \"eIv er 1\\n\"\n    \"xmW me 1\\n\"\n    \"ycK ch 1\\n\"\n    \"mQd de 1\\n\"\n    \"hmU th 1\\n\"\n    \"nlF an 1\\n\"\n    \"Gkl le 1\\n\"\n    \"qBq qu 1\\n\"\n    \"rhQ th 1\\n\"\n    \"Znk an 1\\n\"\n    \"Vfp pr 1\\n\"\n    \"nBn an 1\\n\"\n    \"qvL qu 1\\n\"\n    \"aqN an 1\\n\"\n    \"kLf ka 1\\n\"\n    \"zJr er 1\\n\"\n    \"tQw th 1\\n\"\n    \"sWq qu 1\\n\"\n    \"bwW wa 1\\n\"\n    \"vzB sz 1\\n\"\n    \"yyR ny 1\\n\"\n    \"qqN qu 1\\n\"\n    \"wyI ny 1\\n\"\n    \"jzJ sz 1\\n\"\n    \"qgI qu 1\\n\"\n    \"bgQ ng 1\\n\"\n    \"yLt th 1\\n\"\n    \"Vqq qu 1\\n\"\n    \"Xnr an 1\\n\"\n    \"wHg ng 1\\n\"\n    \"aQg an 1\\n\"\n    \"cFh th 1\\n\"\n    \"zjQ sz 1\\n\"\n    \"gpD ng 1\\n\"\n    \"xzN sz 1\\n\"\n    \"iIw in 1\\n\"\n    \"dQg ng 1\\n\"\n    \"pQy pr 1\\n\"\n    \"Xyx ny 1\\n\"\n    \"sWc ch 1\\n\"\n    \"jFd de 1\\n\"\n    \"bpF pr 1\\n\"\n    \"Vsv st 1\\n\"\n    
\"Qql qu 1\\n\"\n    \"wzT sz 1\\n\"\n    \"sqQ qu 1\\n\"\n    \"Kzm sz 1\\n\"\n    \"oFq qu 1\\n\"\n    \"gkJ ng 1\\n\"\n    \"hkH th 1\\n\"\n    \"qLg ng 1\\n\"\n    \"bmU me 1\\n\"\n    \"crJ ch 1\\n\"\n    \"slX le 1\\n\"\n    \"Tzx sz 1\\n\"\n    \"qbx qu 1\\n\"\n    \"kpI ka 1\\n\"\n    \"xCf fo 1\\n\"\n    \"Fml le 1\\n\"\n    \"Qhj th 1\\n\"\n    \"tQs th 1\\n\"\n    \"vRd de 1\\n\"\n    \"Ycb ch 1\\n\"\n    \"cjP ch 1\\n\"\n    \"yuE qu 1\\n\"\n    \"gIi in 1\\n\"\n    \"kWg ng 1\\n\"\n    \"Jwh th 1\\n\"\n    \"fVy ny 1\\n\"\n    \"jqy qu 1\\n\"\n    \"Wzp sz 1\\n\"\n    \"Cwc ch 1\\n\"\n    \"qEy qu 1\\n\"\n    \"jrX er 1\\n\"\n    \"Kqi in 1\\n\"\n    \"lYv le 1\\n\"\n    \"dGv de 1\\n\"\n    \"Cwj ij 1\\n\"\n    \"nDv an 1\\n\"\n    \"Ojm ij 1\\n\"\n    \"Dnx an 1\\n\"\n    \"vrF er 1\\n\"\n    \"Jmr er 1\\n\"\n    \"zfI sz 1\\n\"\n    \"bqT qu 1\\n\"\n    \"Xvj ij 1\\n\"\n    \"nPp an 1\\n\"\n    \"aVw an 1\\n\"\n    \"wBv va 1\\n\"\n    \"kVb ka 1\\n\"\n    \"gcH ch 1\\n\"\n    \"Xbs sz 1\\n\"\n    \"tRd th 1\\n\"\n    \"mQz sz 1\\n\"\n    \"Hxe er 1\\n\"\n    \"Dnw an 1\\n\"\n    \"xWg ng 1\\n\"\n    \"pGc ch 1\\n\"\n    \"hgI th 1\\n\"\n    \"ywP wa 1\\n\"\n    \"nrW an 1\\n\"\n    \"iVq di 1\\n\"\n    \"xzE sz 1\\n\"\n    \"Vxd de 1\\n\"\n    \"Lzc ch 1\\n\"\n    \"Jwp pr 1\\n\"\n    \"gCq ng 1\\n\"\n    \"Otq th 1\\n\"\n    \"wvP va 1\\n\"\n    \"cNr ch 1\\n\"\n    \"iXq in 1\\n\"\n    \"Qnl in 1\\n\"\n    \"tPz th 1\\n\"\n    \"hIb th 1\\n\"\n    \"aPg an 1\\n\"\n    \"zvw sz 1\\n\"\n    \"nqO an 1\\n\"\n    \"sqO qu 1\\n\"\n    \"bjQ ij 1\\n\"\n    \"lwQ le 1\\n\"\n    \"pEq qu 1\\n\"\n    \"bWj ij 1\\n\"\n    \"swT sz 1\\n\"\n    \"gmY ng 1\\n\"\n    \"gRk ng 1\\n\"\n    \"dZr er 1\\n\"\n    \"fMr er 1\\n\"\n    \"lxO le 1\\n\"\n    \"kbQ ka 1\\n\"\n    \"yfN ny 1\\n\"\n    \"ymq qu 1\\n\"\n    \"jpK ij 1\\n\"\n    \"Wjn an 1\\n\"\n    \"fmW me 1\\n\"\n    \"rKx er 1\\n\"\n    \"dlH le 1\\n\"\n    \"kcK ch 1\\n\"\n    \"vbV va 1\\n\"\n    
\"qNl qu 1\\n\"\n    \"pHt th 1\\n\"\n    \"hlT th 1\\n\"\n    \"lBv le 1\\n\"\n    \"oaF an 1\\n\"\n    \"xfM fo 1\\n\"\n    \"rZd er 1\\n\"\n    \"jgW ng 1\\n\"\n    \"Hvh th 1\\n\"\n    \"Fkf ka 1\\n\"\n    \"cDc ch 1\\n\"\n    \"hLh th 1\\n\"\n    \"qQp qu 1\\n\"\n    \"zhJ th 1\\n\"\n    \"ivQ in 1\\n\"\n    \"Ukq qu 1\\n\"\n    \"bpV pr 1\\n\"\n    \"bJq qu 1\\n\"\n    \"aPw an 1\\n\"\n    \"sdK de 1\\n\"\n    \"cGf ch 1\\n\"\n    \"Ljw ij 1\\n\"\n    \"qhP th 1\\n\"\n    \"mFw me 1\\n\"\n    \"fIu qu 1\\n\"\n    \"zhB th 1\\n\"\n    \"fuH qu 1\\n\"\n    \"bFq qu 1\\n\"\n    \"Wgk ng 1\\n\"\n    \"Fqh th 1\\n\"\n    \"zmf sz 1\\n\"\n    \"Zpf pr 1\\n\"\n    \"nFh th 1\\n\"\n    \"yBw wa 1\\n\"\n    \"gIj ng 1\\n\"\n    \"qBf fo 1\\n\"\n    \"Uwl le 1\\n\"\n    \"zrM er 1\\n\"\n    \"yBd de 1\\n\"\n    \"Rlf le 1\\n\"\n    \"Pzh ch 1\\n\"\n    \"rZx er 1\\n\"\n    \"qVs qu 1\\n\"\n    \"dxJ de 1\\n\"\n    \"Lcz ch 1\\n\"\n    \"gFn an 1\\n\"\n    \"vIm va 1\\n\"\n    \"qtG th 1\\n\"\n    \"qbG qu 1\\n\"\n    \"bHg ng 1\\n\"\n    \"xrY er 1\\n\"\n    \"tBd th 1\\n\"\n    \"nKq an 1\\n\"\n    \"Nkt th 1\\n\"\n    \"jCq qu 1\\n\"\n    \"byX be 1\\n\"\n    \"oBp on 1\\n\"\n    \"Wjz sz 1\\n\"\n    \"zfP sz 1\\n\"\n    \"aQz an 1\\n\"\n    \"sjx ij 1\\n\"\n    \"nfW an 1\\n\"\n    \"nXw an 1\\n\"\n    \"bJw wa 1\\n\"\n    \"aSf an 1\\n\"\n    \"iRf in 1\\n\"\n    \"yMd de 1\\n\"\n    \"fBc ch 1\\n\"\n    \"vxR va 1\\n\"\n    \"Llx le 1\\n\"\n    \"yGs sz 1\\n\"\n    \"Jsy sz 1\\n\"\n    \"Lvx va 1\\n\"\n    \"eFh th 1\\n\"\n    \"wbM wa 1\\n\"\n    \"uOq qu 1\\n\"\n    \"wWl le 1\\n\"\n    \"bvU va 1\\n\"\n    \"fnO an 1\\n\"\n    \"mzI sz 1\\n\"\n    \"Vcf ch 1\\n\"\n    \"mhE th 1\\n\"\n    \"vgQ ng 1\\n\"\n    \"jgP ng 1\\n\"\n    \"qbj qu 1\\n\"\n    \"bZf be 1\\n\"\n    \"Xtj th 1\\n\"\n    \"yYq qu 1\\n\"\n    \"jdK de 1\\n\"\n    \"jzB sz 1\\n\"\n    \"Yys sz 1\\n\"\n    \"wUg ng 1\\n\"\n    \"yBb be 1\\n\"\n    \"qjM qu 1\\n\"\n    \"sXw sz 1\\n\"\n    
\"Xqw qu 1\\n\"\n    \"cTb ch 1\\n\"\n    \"jrE er 1\\n\"\n    \"sNp sz 1\\n\"\n    \"Zhm th 1\\n\"\n    \"xVs sz 1\\n\"\n    \"jGz sz 1\\n\"\n    \"Jqh th 1\\n\"\n    \"zTm sz 1\\n\"\n    \"vhE th 1\\n\"\n    \"dQi in 1\\n\"\n    \"Tmv va 1\\n\"\n    \"qxD qu 1\\n\"\n    \"fzE sz 1\\n\"\n    \"vMr er 1\\n\"\n    \"Cqx qu 1\\n\"\n    \"twY th 1\\n\"\n    \"nVz an 1\\n\"\n    \"lRk le 1\\n\"\n    \"Owq qu 1\\n\"\n    \"qYj qu 1\\n\"\n    \"yQk ka 1\\n\"\n    \"Nlf le 1\\n\"\n    \"qDn an 1\\n\"\n    \"bHw wa 1\\n\"\n    \"cjA ch 1\\n\"\n    \"sgU ng 1\\n\"\n    \"kQi in 1\\n\"\n    \"yNf ny 1\\n\"\n    \"lwZ le 1\\n\"\n    \"vGd de 1\\n\"\n    \"Vmn an 1\\n\"\n    \"tpB th 1\\n\"\n    \"cFd ch 1\\n\"\n    \"xHm me 1\\n\"\n    \"bSg ng 1\\n\"\n    \"hEq th 1\\n\"\n    \"ewQ er 1\\n\"\n    \"eWd er 1\\n\"\n    \"jfR ij 1\\n\"\n    \"zpY sz 1\\n\"\n    \"cvQ ch 1\\n\"\n    \"hXr th 1\\n\"\n    \"cJw ch 1\\n\"\n    \"wEp pr 1\\n\"\n    \"Nxl le 1\\n\"\n    \"qMf qu 1\\n\"\n    \"vGc ch 1\\n\"\n    \"pyQ pr 1\\n\"\n    \"jpU ij 1\\n\"\n    \"xoA on 1\\n\"\n    \"gXn an 1\\n\"\n    \"qqG qu 1\\n\"\n    \"pXn an 1\\n\"\n    \"vlP le 1\\n\"\n    \"Lzv sz 1\\n\"\n    \"jxB ij 1\\n\"\n    \"cJc ch 1\\n\"\n    \"jcT ch 1\\n\"\n    \"Wtm th 1\\n\"\n    \"cLg ch 1\\n\"\n    \"kUx ka 1\\n\"\n    \"nFp an 1\\n\"\n    \"Jsw sz 1\\n\"\n    \"sBg ng 1\\n\"\n    \"jFn an 1\\n\"\n    \"gvC ng 1\\n\"\n    \"fFy ny 1\\n\"\n    \"qnA an 1\\n\"\n    \"Zbb be 1\\n\"\n    \"Pzx sz 1\\n\"\n    \"psJ sz 1\\n\"\n    \"lZq qu 1\\n\"\n    \"yfP ny 1\\n\"\n    \"gYv ng 1\\n\"\n    \"bfC be 1\\n\"\n    \"dMx de 1\\n\"\n    \"hlN th 1\\n\"\n    \"wRl le 1\\n\"\n    \"qjH qu 1\\n\"\n    \"Wjc ch 1\\n\"\n    \"uQp qu 1\\n\"\n    \"zTb sz 1\\n\"\n    \"qUr qu 1\\n\"\n    \"zqp qu 1\\n\"\n    \"vlR le 1\\n\"\n    \"jqX qu 1\\n\"\n    \"swR sz 1\\n\"\n    \"qMy ny 1\\n\"\n    \"zkT sz 1\\n\"\n    \"yqX qu 1\\n\"\n    \"nlR an 1\\n\"\n    \"Hqn an 1\\n\"\n    \"aaJ an 1\\n\"\n    \"lKw le 1\\n\"\n    
\"bzB sz 1\\n\"\n    \"Vgk ng 1\\n\"\n    \"aVm an 1\\n\"\n    \"dnR an 1\\n\"\n    \"txQ th 1\\n\"\n    \"Qzi in 1\\n\"\n    \"zxV sz 1\\n\"\n    \"xgQ ng 1\\n\"\n    \"tvZ th 1\\n\"\n    \"jwN ij 1\\n\"\n    \"Eqj qu 1\\n\"\n    \"Bxj ij 1\\n\"\n    \"hzH th 1\\n\"\n    \"Qfy ny 1\\n\"\n    \"Ppj ij 1\\n\"\n    \"Aqp qu 1\\n\"\n    \"zJn an 1\\n\"\n    \"szF st 1\\n\"\n    \"qfX qu 1\\n\"\n    \"pzV sz 1\\n\"\n    \"tgN th 1\\n\"\n    \"xsS sz 1\\n\"\n    \"nQz an 1\\n\"\n    \"tkF th 1\\n\"\n    \"Qhq th 1\\n\"\n    \"gJc ch 1\\n\"\n    \"uOa an 1\\n\"\n    \"rqW qu 1\\n\"\n    \"fYz sz 1\\n\"\n    \"uFc ch 1\\n\"\n    \"Ncx ch 1\\n\"\n    \"lMw le 1\\n\"\n    \"cjI ch 1\\n\"\n    \"Jcw ch 1\\n\"\n    \"vEo on 1\\n\"\n    \"eQy er 1\\n\"\n    \"Sxc ch 1\\n\"\n    \"bUx mb 1\\n\"\n    \"zdJ sz 1\\n\"\n    \"lpN le 1\\n\"\n    \"Rkq qu 1\\n\"\n    \"vvI va 1\\n\"\n    \"Qmq qu 1\\n\"\n    \"tgJ th 1\\n\"\n    \"gfE ng 1\\n\"\n    \"qcX ch 1\\n\"\n    \"klT le 1\\n\"\n    \"bbV be 1\\n\"\n    \"pmZ me 1\\n\"\n    \"uqA qu 1\\n\"\n    \"cYy ch 1\\n\"\n    \"wmY me 1\\n\"\n    \"zlB le 1\\n\"\n    \"zNd sz 1\\n\"\n    \"cvZ ch 1\\n\"\n    \"dvL de 1\\n\"\n    \"wLz sz 1\\n\"\n    \"qcG ch 1\\n\"\n    \"Qjl le 1\\n\"\n    \"nqf an 1\\n\"\n    \"gxY ng 1\\n\"\n    \"aqI an 1\\n\"\n    \"Kqa an 1\\n\"\n    \"Xqp qu 1\\n\"\n    \"Yvg ng 1\\n\"\n    \"qqF qu 1\\n\"\n    \"yHh th 1\\n\"\n    \"nHc an 1\\n\"\n    \"Uqq qu 1\\n\"\n    \"zfN sz 1\\n\"\n    \"mXq qu 1\\n\"\n    \"Fgj ng 1\\n\"\n    \"Dsx sz 1\\n\"\n    \"xRv va 1\\n\"\n    \"wbZ wa 1\\n\"\n    \"Hnp an 1\\n\"\n    \"fUx fo 1\\n\"\n    \"cYd ch 1\\n\"\n    \"qTg ng 1\\n\"\n    \"Bgq ng 1\\n\"\n    \"pCn an 1\\n\"\n    \"Xmh th 1\\n\"\n    \"vjJ ij 1\\n\"\n    \"tdG th 1\\n\"\n    \"Zhk th 1\\n\"\n    \"xFn an 1\\n\"\n    \"dkQ de 1\\n\"\n    \"Lcg ch 1\\n\"\n    \"mIu qu 1\\n\"\n    \"Iwd de 1\\n\"\n    \"wjw ij 1\\n\"\n    \"zbX sz 1\\n\"\n    \"Yhp th 1\\n\"\n    \"cvH ch 1\\n\"\n    \"Lcx ch 1\\n\"\n    
\"Wfn an 1\\n\"\n    \"Nfq qu 1\\n\"\n    \"qMv qu 1\\n\"\n    \"Uvw va 1\\n\"\n    \"Qnh th 1\\n\"\n    \"nbG an 1\\n\"\n    \"sFg ng 1\\n\"\n    \"xlJ le 1\\n\"\n    \"bPb be 1\\n\"\n    \"xpI pr 1\\n\"\n    \"mrV er 1\\n\"\n    \"Fwu qu 1\\n\"\n    \"wOy wa 1\\n\"\n    \"Pmh th 1\\n\"\n    \"Jhq th 1\\n\"\n    \"Zbx be 1\\n\"\n    \"pgY ng 1\\n\"\n    \"Rbw wa 1\\n\"\n    \"Awx wa 1\\n\"\n    \"mcB ch 1\\n\"\n    \"gkG ng 1\\n\"\n    \"xkW ka 1\\n\"\n    \"Pnw in 1\\n\"\n    \"bNs sz 1\\n\"\n    \"nXr an 1\\n\"\n    \"Vmt th 1\\n\"\n    \"eUv er 1\\n\"\n    \"yQv va 1\\n\"\n    \"kxr er 1\\n\"\n    \"Ksw sz 1\\n\"\n    \"bpW pr 1\\n\"\n    \"qeD qu 1\\n\"\n    \"Qvh th 1\\n\"\n    \"bRm me 1\\n\"\n    \"qJm qu 1\\n\"\n    \"csY ch 1\\n\"\n    \"qwH qu 1\\n\"\n    \"Cqc ch 1\\n\"\n    \"lYq qu 1\\n\"\n    \"dPp de 1\\n\"\n    \"oAe er 1\\n\"\n    \"dcS ch 1\\n\"\n    \"uwU qu 1\\n\"\n    \"zjL sz 1\\n\"\n    \"oZx on 1\\n\"\n    \"kjR ij 1\\n\"\n    \"cDy ch 1\\n\"\n    \"fSs sz 1\\n\"\n    \"eQf le 1\\n\"\n    \"qBm qu 1\\n\"\n    \"mLb me 1\\n\"\n    \"Zrj er 1\\n\"\n    \"Gkx ka 1\\n\"\n    \"pkX ka 1\\n\"\n    \"vTk ka 1\\n\"\n    \"Zgp ng 1\\n\"\n    \"dhP th 1\\n\"\n    \"nPv an 1\\n\"\n    \"xnQ an 1\\n\"\n    \"bHp pr 1\\n\"\n    \"Xgf ng 1\\n\"\n    \"Cwf wa 1\\n\"\n    \"lbN le 1\\n\"\n    \"jNm ij 1\\n\"\n    \"xNt th 1\\n\"\n    \"rJp er 1\\n\"\n    \"oJd on 1\\n\"\n    \"Ryq qu 1\\n\"\n    \"lvL le 1\\n\"\n    \"qvY qu 1\\n\"\n    \"vwC va 1\\n\"\n    \"kFj ij 1\\n\"\n    \"qHd qu 1\\n\"\n    \"wcB ch 1\\n\"\n    \"xTs sz 1\\n\"\n    \"fQz sz 1\\n\"\n    \"Dlf le 1\\n\"\n    \"wLt th 1\\n\"\n    \"Fbh th 1\\n\"\n    \"rqJ qu 1\\n\"\n    \"hhO th 1\\n\"\n    \"xOi in 1\\n\"\n    \"mqz qu 1\\n\"\n    \"qmQ me 1\\n\"\n    \"qQj qu 1\\n\"\n    \"ovQ on 1\\n\"\n    \"gfR ng 1\\n\"\n    \"Pmq qu 1\\n\"\n    \"Tcj ch 1\\n\"\n    \"mqQ qu 1\\n\"\n    \"mwV me 1\\n\"\n    \"bXw wa 1\\n\"\n    \"jlA le 1\\n\"\n    \"fjG ij 1\\n\"\n    \"jxY ij 1\\n\"\n    
\"qwM qu 1\\n\"\n    \"kvU ka 1\\n\"\n    \"Bkq qu 1\\n\"\n    \"gfA ng 1\\n\"\n    \"Awc ch 1\\n\"\n    \"Vmv va 1\\n\"\n    \"Qhl th 1\\n\"\n    \"Wmj ij 1\\n\"\n    \"cMq ch 1\\n\"\n    \"tHp th 1\\n\"\n    \"lPb le 1\\n\"\n    \"vlK le 1\\n\"\n    \"Ygk ng 1\\n\"\n    \"gJs ng 1\\n\"\n    \"tWl th 1\\n\"\n    \"xVw wa 1\\n\"\n    \"srN er 1\\n\"\n    \"Uhb th 1\\n\"\n    \"vfR va 1\\n\"\n    \"kFf ka 1\\n\"\n    \"Jlz le 1\\n\"\n    \"fKq qu 1\\n\"\n    \"mRq qu 1\\n\"\n    \"kWw ka 1\\n\"\n    \"zvO sz 1\\n\"\n    \"Xqz qu 1\\n\"\n    \"dIj de 1\\n\"\n    \"wJm me 1\\n\"\n    \"Fqv qu 1\\n\"\n    \"wNt th 1\\n\"\n    \"lxL le 1\\n\"\n    \"xLm me 1\\n\"\n    \"dqN qu 1\\n\"\n    \"wRj ij 1\\n\"\n    \"Ljt th 1\\n\"\n    \"wRw wa 1\\n\"\n    \"cxB ch 1\\n\"\n    \"cjH ch 1\\n\"\n    \"Vqj qu 1\\n\"\n    \"qJs qu 1\\n\"\n    \"cFk ch 1\\n\"\n    \"xqd qu 1\\n\"\n    \"Eqh th 1\\n\"\n    \"qRd qu 1\\n\"\n    \"vfT va 1\\n\"\n    \"Zqb qu 1\\n\"\n    \"mGc ch 1\\n\"\n    \"Sbd de 1\\n\"\n    \"iwV in 1\\n\"\n    \"jfI ij 1\\n\"\n    \"nWz an 1\\n\"\n    \"Ljg ng 1\\n\"\n    \"rjG er 1\\n\"\n    \"cFb ch 1\\n\"\n    \"uqZ qu 1\\n\"\n    \"mVm me 1\\n\"\n    \"jgK ng 1\\n\"\n    \"dZh th 1\\n\"\n    \"Bqx qu 1\\n\"\n    \"quG un 1\\n\"\n    \"lCv le 1\\n\"\n    \"lxW le 1\\n\"\n    \"gGb ng 1\\n\"\n    \"gvY ng 1\\n\"\n    \"mjF ij 1\\n\"\n    \"ptX th 1\\n\"\n    \"pYy pr 1\\n\"\n    \"Yrf er 1\\n\"\n    \"mVd de 1\\n\"\n    \"zpR sz 1\\n\"\n    \"xKw wa 1\\n\"\n    \"wpM pr 1\\n\"\n    \"cLk ch 1\\n\"\n    \"Sqz qu 1\\n\"\n    \"gWn an 1\\n\"\n    \"sWz st 1\\n\"\n    \"srS er 1\\n\"\n    \"cVx ch 1\\n\"\n    \"xNb be 1\\n\"\n    \"hPb th 1\\n\"\n    \"bGq qu 1\\n\"\n    \"tdH th 1\\n\"\n    \"yJl le 1\\n\"\n    \"vUk ka 1\\n\"\n    \"dJz sz 1\\n\"\n    \"qhI th 1\\n\"\n    \"mtP th 1\\n\"\n    \"lGb le 1\\n\"\n    \"hDx th 1\\n\"\n    \"zfW sz 1\\n\"\n    \"Nml le 1\\n\"\n    \"Hsw st 1\\n\"\n    \"pfG pr 1\\n\"\n    \"dMj de 1\\n\"\n    \"kKq qu 1\\n\"\n    
\"rjS er 1\\n\"\n    \"Qlg ng 1\\n\"\n    \"Nfy ny 1\\n\"\n    \"cqM ch 1\\n\"\n    \"hWm th 1\\n\"\n    \"fuO qu 1\\n\"\n    \"zfF sz 1\\n\"\n    \"qgH ng 1\\n\"\n    \"bpZ pr 1\\n\"\n    \"btY th 1\\n\"\n    \"uqB qu 1\\n\"\n    \"qyA qu 1\\n\"\n    \"Xrp er 1\\n\"\n    \"ytX th 1\\n\"\n    \"dHm de 1\\n\"\n    \"vBg ng 1\\n\"\n    \"yyN ny 1\\n\"\n    \"Qrj er 1\\n\"\n    \"gKd ng 1\\n\"\n    \"bfU be 1\\n\"\n    \"Qft th 1\\n\"\n    \"bqP qu 1\\n\"\n    \"qOz qu 1\\n\"\n    \"Xhc th 1\\n\"\n    \"dqY qu 1\\n\"\n    \"hjQ th 1\\n\"\n    \"Yfu qu 1\\n\"\n    \"aXk an 1\\n\"\n    \"pbV pr 1\\n\"\n    \"vjP ij 1\\n\"\n    \"Ybp pr 1\\n\"\n    \"Jmb me 1\\n\"\n    \"qFq qu 1\\n\"\n    \"yPq qu 1\\n\"\n    \"yWw wa 1\\n\"\n    \"vhX th 1\\n\"\n    \"iwT in 1\\n\"\n    \"qZf qu 1\\n\"\n    \"uqU qu 1\\n\"\n    \"uFk qu 1\\n\"\n    \"cpW ch 1\\n\"\n    \"Lpq qu 1\\n\"\n    \"kfL ka 1\\n\"\n    \"pQe er 1\\n\"\n    \"gwz ng 1\\n\"\n    \"jpM ij 1\\n\"\n    \"Qkm ka 1\\n\"\n    \"jgH ng 1\\n\"\n    \"xjP ij 1\\n\"\n    \"xgL ng 1\\n\"\n    \"jLm ij 1\\n\"\n    \"dxN de 1\\n\"\n    \"vWs st 1\\n\"\n    \"Jjh th 1\\n\"\n    \"hhG th 1\\n\"\n    \"Yvc ch 1\\n\"\n    \"xrE er 1\\n\"\n    \"bZw wa 1\\n\"\n    \"Lvw va 1\\n\"\n    \"eNw er 1\\n\"\n    \"fjB ij 1\\n\"\n    \"dcQ ch 1\\n\"\n    \"lZt th 1\\n\"\n    \"Jwq qu 1\\n\"\n    \"qPg ng 1\\n\"\n    \"xMb be 1\\n\"\n    \"hfD th 1\\n\"\n    \"jzQ sz 1\\n\"\n    \"Uuf qu 1\\n\"\n    \"zGk sz 1\\n\"\n    \"zCc ch 1\\n\"\n    \"npC an 1\\n\"\n    \"tWd th 1\\n\"\n    \"hjF th 1\\n\"\n    \"Pzs st 1\\n\"\n    \"wuA qu 1\\n\"\n    \"Qhg th 1\\n\"\n    \"Mqm qu 1\\n\"\n    \"fsI st 1\\n\"\n    \"fdU de 1\\n\"\n    \"Xrm er 1\\n\"\n    \"qQg ng 1\\n\"\n    \"bkW ka 1\\n\"\n    \"dHg ng 1\\n\"\n    \"rcB ch 1\\n\"\n    \"hWu th 1\\n\"\n    \"nIq an 1\\n\"\n    \"rYq qu 1\\n\"\n    \"xXv va 1\\n\"\n    \"wqP qu 1\\n\"\n    \"xmN me 1\\n\"\n    \"sJf st 1\\n\"\n    \"yMf ny 1\\n\"\n    \"Sfk ka 1\\n\"\n    \"qzW qu 1\\n\"\n    
\"cvT ch 1\\n\"\n    \"kmX ka 1\\n\"\n    \"xqU qu 1\\n\"\n    \"cnG an 1\\n\"\n    \"Jpi in 1\\n\"\n    \"frX er 1\\n\"\n    \"yLf ny 1\\n\"\n    \"uyU qu 1\\n\"\n    \"Ddw de 1\\n\"\n    \"Tgj ng 1\\n\"\n    \"qeH qu 1\\n\"\n    \"fEz sz 1\\n\"\n    \"pCk ka 1\\n\"\n    \"qmf qu 1\\n\"\n    \"rjH er 1\\n\"\n    \"xMp pr 1\\n\"\n    \"Ywo on 1\\n\"\n    \"zgD ng 1\\n\"\n    \"Pqx qu 1\\n\"\n    \"nqM on 1\\n\"\n    \"wdX de 1\\n\"\n    \"Bpz sz 1\\n\"\n    \"lhM th 1\\n\"\n    \"Epb pr 1\\n\"\n    \"bhJ th 1\\n\"\n    \"kvQ ka 1\\n\"\n    \"Rsq qu 1\\n\"\n    \"xbP be 1\\n\"\n    \"nMm an 1\\n\"\n    \"xuC qu 1\\n\"\n    \"wjs sz 1\\n\"\n    \"fxX fo 1\\n\"\n    \"hvT th 1\\n\"\n    \"uPx qu 1\\n\"\n    \"Jmy me 1\\n\"\n    \"Qzd de 1\\n\"\n    \"Nsz st 1\\n\"\n    \"vWd de 1\\n\"\n    \"hfX th 1\\n\"\n    \"jCg ng 1\\n\"\n    \"yQx ny 1\\n\"\n    \"whJ th 1\\n\"\n    \"wrq qu 1\\n\"\n    \"xgW ng 1\\n\"\n    \"Jhj th 1\\n\"\n    \"lhC th 1\\n\"\n    \"Pwf ow 1\\n\"\n    \"ljC le 1\\n\"\n    \"vvB va 1\\n\"\n    \"mcN ch 1\\n\"\n    \"yHx ny 1\\n\"\n    \"bBj ij 1\\n\"\n    \"qRz qu 1\\n\"\n    \"glH ng 1\\n\"\n    \"cZp ch 1\\n\"\n    \"qJh th 1\\n\"\n    \"tSg th 1\\n\"\n    \"xVm me 1\\n\"\n    \"uWs qu 1\\n\"\n    \"Vxo on 1\\n\"\n    \"fjM ij 1\\n\"\n    \"zhK th 1\\n\"\n    \"Cjh th 1\\n\"\n    \"vZr er 1\\n\"\n    \"bCs sz 1\\n\"\n    \"rwY er 1\\n\"\n    \"xEi in 1\\n\"\n    \"dUv de 1\\n\"\n    \"fRg ng 1\\n\"\n    \"Gcu ch 1\\n\"\n    \"jDf ij 1\\n\"\n    \"djH de 1\\n\"\n    \"vlU le 1\\n\"\n    \"qyG qu 1\\n\"\n    \"kfq qu 1\\n\"\n    \"lXg ng 1\\n\"\n    \"lbC le 1\\n\"\n    \"Pwg ng 1\\n\"\n    \"Oae an 1\\n\"\n    \"pbC pr 1\\n\"\n    \"dWt th 1\\n\"\n    \"lzU le 1\\n\"\n    \"wJz sz 1\\n\"\n    \"dYj de 1\\n\"\n    \"cBj ch 1\\n\"\n    \"fRv va 1\\n\"\n    \"djG de 1\\n\"\n    \"mYg ng 1\\n\"\n    \"Qbc ch 1\\n\"\n    \"gnX an 1\\n\"\n    \"wPm me 1\\n\"\n    \"wvN va 1\\n\"\n    \"qGm qu 1\\n\"\n    \"qNh th 1\\n\"\n    \"mRg ng 1\\n\"\n    
\"Uqv qu 1\\n\"\n    \"Qxm me 1\\n\"\n    \"fzX sz 1\\n\"\n    \"zjM sz 1\\n\"\n    \"xqA qu 1\\n\"\n    \"bMs sz 1\\n\"\n    \"vmL me 1\\n\"\n    \"Eyx ny 1\\n\"\n    \"hHj th 1\\n\"\n    \"jGp ij 1\\n\"\n    \"mfD me 1\\n\"\n    \"Jfw wa 1\\n\"\n    \"Wjh th 1\\n\"\n    \"bZs sz 1\\n\"\n    \"Iyk ka 1\\n\"\n    \"zRn an 1\\n\"\n    \"cdU ch 1\\n\"\n    \"mJh th 1\\n\"\n    \"Qjy ij 1\\n\"\n    \"Qao an 1\\n\"\n    \"bXv va 1\\n\"\n    \"hSg th 1\\n\"\n    \"rAo er 1\\n\"\n    \"hLs th 1\\n\"\n    \"lCs le 1\\n\"\n    \"qkJ qu 1\\n\"\n    \"Rxu qu 1\\n\"\n    \"xdN de 1\\n\"\n    \"yYx ny 1\\n\"\n    \"dkN de 1\\n\"\n    \"Rgw ng 1\\n\"\n    \"zgL sz 1\\n\"\n    \"Rcj ch 1\\n\"\n    \"iWz in 1\\n\"\n    \"dLk de 1\\n\"\n    \"mpX me 1\\n\"\n    \"Gbd de 1\\n\"\n    \"bnH an 1\\n\"\n    \"kdM de 1\\n\"\n    \"wqG qu 1\\n\"\n    \"vMz sz 1\\n\"\n    \"zwH sz 1\\n\"\n    \"wgx ng 1\\n\"\n    \"Ljk ij 1\\n\"\n    \"tlG th 1\\n\"\n    \"tgE th 1\\n\"\n    \"Wcw ch 1\\n\"\n    \"Vby be 1\\n\"\n    \"mVz sz 1\\n\"\n    \"Hgc ch 1\\n\"\n    \"gqP ng 1\\n\"\n    \"hhB th 1\\n\"\n    \"nFx an 1\\n\"\n    \"yBf ny 1\\n\"\n    \"Wmx me 1\\n\"\n    \"vNb va 1\\n\"\n    \"Mnv an 1\\n\"\n    \"Zmc ch 1\\n\"\n    \"bzS sz 1\\n\"\n    \"yfC ny 1\\n\"\n    \"Epx pr 1\\n\"\n    \"ljG le 1\\n\"\n    \"wUa an 1\\n\"\n    \"Qgo ng 1\\n\"\n    \"pqb qu 1\\n\"\n    \"Jkm ka 1\\n\"\n    \"Wvy va 1\\n\"\n    \"Bjp ij 1\\n\"\n    \"vfZ va 1\\n\"\n    \"wxT wa 1\\n\"\n    \"Vxw wa 1\\n\"\n    \"dRt th 1\\n\"\n    \"nVq an 1\\n\"\n    \"iWf in 1\\n\"\n    \"Smq qu 1\\n\"\n    \"jwG ij 1\\n\"\n    \"vcW ch 1\\n\"\n    \"Qgz ng 1\\n\"\n    \"Wkq qu 1\\n\"\n    \"xrL er 1\\n\"\n    \"tVh ch 1\\n\"\n    \"Zlr er 1\\n\"\n    \"zDt th 1\\n\"\n    \"yxP ny 1\\n\"\n    \"Yyw wa 1\\n\"\n    \"zPk sz 1\\n\"\n    \"Bgg ng 1\\n\"\n    \"xOk ka 1\\n\"\n    \"oXq qu 1\\n\"\n    \"tQf th 1\\n\"\n    \"fxF fo 1\\n\"\n    \"dOq qu 1\\n\"\n    \"Vtp th 1\\n\"\n    \"jhP th 1\\n\"\n    \"vhZ th 1\\n\"\n    
\"Gqq qu 1\\n\"\n    \"dFg ng 1\\n\"\n    \"eCg ng 1\\n\"\n    \"kjH ij 1\\n\"\n    \"vqQ qu 1\\n\"\n    \"jpL ij 1\\n\"\n    \"hgZ th 1\\n\"\n    \"xFd de 1\\n\"\n    \"Qjd de 1\\n\"\n    \"xKm me 1\\n\"\n    \"zQc ch 1\\n\"\n    \"Nhw th 1\\n\"\n    \"Kqo qu 1\\n\"\n    \"hwO th 1\\n\"\n    \"oYn an 1\\n\"\n    \"Wnf an 1\\n\"\n    \"vSc ch 1\\n\"\n    \"Afq qu 1\\n\"\n    \"jqJ qu 1\\n\"\n    \"jEg ng 1\\n\"\n    \"dKp de 1\\n\"\n    \"nmK an 1\\n\"\n    \"wXw wa 1\\n\"\n    \"vjC ij 1\\n\"\n    \"dXb de 1\\n\"\n    \"tQn th 1\\n\"\n    \"qoR qu 1\\n\"\n    \"bRf be 1\\n\"\n    \"yyL ny 1\\n\"\n    \"kSj ij 1\\n\"\n    \"Xyu qu 1\\n\"\n    \"vmA va 1\\n\"\n    \"Zgm ng 1\\n\"\n    \"Lbx be 1\\n\"\n    \"bIv va 1\\n\"\n    \"Zdq qu 1\\n\"\n    \"gHn an 1\\n\"\n    \"bYq qu 1\\n\"\n    \"Mqd qu 1\\n\"\n    \"qMk qu 1\\n\"\n    \"Qsv st 1\\n\"\n    \"zXx sz 1\\n\"\n    \"hQf th 1\\n\"\n    \"wcV ch 1\\n\"\n    \"Xfz sz 1\\n\"\n    \"Mhc th 1\\n\"\n    \"kBz sz 1\\n\"\n    \"bWp pr 1\\n\"\n    \"Wzu qu 1\\n\"\n    \"hWw th 1\\n\"\n    \"yNp pr 1\\n\"\n    \"xbZ be 1\\n\"\n    \"mTb me 1\\n\"\n    \"Kdf de 1\\n\"\n    \"pfQ pr 1\\n\"\n    \"vCd de 1\\n\"\n    \"Pqf qu 1\\n\"\n    \"ofZ on 1\\n\"\n    \"wYd de 1\\n\"\n    \"Tfc ch 1\\n\"\n    \"Gnb an 1\\n\"\n    \"Zdx de 1\\n\"\n    \"zVj sz 1\\n\"\n    \"Tqw qu 1\\n\"\n    \"fzV sz 1\\n\"\n    \"Igq ng 1\\n\"\n    \"Qvv vi 1\\n\"\n    \"Pmf me 1\\n\"\n    \"qHe qu 1\\n\"\n    \"ybR be 1\\n\"\n    \"cFg ch 1\\n\"\n    \"Kvf va 1\\n\"\n    \"Zxm me 1\\n\"\n    \"oVc ch 1\\n\"\n    \"Yhb th 1\\n\"\n    \"bwP wa 1\\n\"\n    \"Vvz sz 1\\n\"\n    \"sdW de 1\\n\"\n    \"gFz ng 1\\n\"\n    \"mRl le 1\\n\"\n    \"bqN qu 1\\n\"\n    \"bhU th 1\\n\"\n    \"tBw th 1\\n\"\n    \"Hbb be 1\\n\"\n    \"Jzp sz 1\\n\"\n    \"zrS er 1\\n\"\n    \"mkZ me 1\\n\"\n    \"bKw wa 1\\n\"\n    \"jPx ij 1\\n\"\n    \"Xqa an 1\\n\"\n    \"fGz sz 1\\n\"\n    \"xLk ka 1\\n\"\n    \"nrV an 1\\n\"\n    \"Tmx me 1\\n\"\n    \"zvZ sz 1\\n\"\n    
\"gWl ng 1\\n\"\n    \"Yxb be 1\\n\"\n    \"yWt th 1\\n\"\n    \"lqN qu 1\\n\"\n    \"tWu th 1\\n\"\n    \"xZt th 1\\n\"\n    \"iqI in 1\\n\"\n    \"cpQ ch 1\\n\"\n    \"zPf sz 1\\n\"\n    \"bqG qu 1\\n\"\n    \"gmI ng 1\\n\"\n    \"Wkc ch 1\\n\"\n    \"Zvs sz 1\\n\"\n    \"qdN qu 1\\n\"\n    \"hYf th 1\\n\"\n    \"sBn an 1\\n\"\n    \"Dwb ow 1\\n\"\n    \"Wzq qu 1\\n\"\n    \"Qdw de 1\\n\"\n    \"svR sz 1\\n\"\n    \"Nvv va 1\\n\"\n    \"jRc ch 1\\n\"\n    \"qDv qu 1\\n\"\n    \"qGe qu 1\\n\"\n    \"cwT ch 1\\n\"\n    \"fTy ny 1\\n\"\n    \"Cvv va 1\\n\"\n    \"flQ le 1\\n\"\n    \"mWg ng 1\\n\"\n    \"twS th 1\\n\"\n    \"npM an 1\\n\"\n    \"Ufq qu 1\\n\"\n    \"fuG qu 1\\n\"\n    \"oCj on 1\\n\"\n    \"txF th 1\\n\"\n    \"Yft th 1\\n\"\n    \"qwy qu 1\\n\"\n    \"Vdz de 1\\n\"\n    \"Vgq ng 1\\n\"\n    \"Rkg ng 1\\n\"\n    \"Pxz sz 1\\n\"\n    \"mCn an 1\\n\"\n    \"whZ th 1\\n\"\n    \"fgB ng 1\\n\"\n    \"jvW ij 1\\n\"\n    \"kdL de 1\\n\"\n    \"Lxi in 1\\n\"\n    \"svB sz 1\\n\"\n    \"xuH qu 1\\n\"\n    \"gFy ng 1\\n\"\n    \"oVv on 1\\n\"\n    \"Zhq th 1\\n\"\n    \"oqG qu 1\\n\"\n    \"oJp on 1\\n\"\n    \"gIf ng 1\\n\"\n    \"bwF wa 1\\n\"\n    \"vLh th 1\\n\"\n    \"jgX ng 1\\n\"\n    \"qKi in 1\\n\"\n    \"xRh th 1\\n\"\n    \"qwV qu 1\\n\"\n    \"mNl le 1\\n\"\n    \"Gvv va 1\\n\"\n    \"pQf pr 1\\n\"\n    \"xbV be 1\\n\"\n    \"dpZ de 1\\n\"\n    \"fHq qu 1\\n\"\n    \"bBd de 1\\n\"\n    \"vUh th 1\\n\"\n    \"hzA th 1\\n\"\n    \"Mnz an 1\\n\"\n    \"pBt th 1\\n\"\n    \"oaE an 1\\n\"\n    \"slK le 1\\n\"\n    \"Wlg ng 1\\n\"\n    \"jhK th 1\\n\"\n    \"xvX va 1\\n\"\n    \"Ffx fo 1\\n\"\n    \"gXh th 1\\n\"\n    \"cWf ch 1\\n\"\n    \"Gpy pr 1\\n\"\n    \"xmS me 1\\n\"\n    \"gZn an 1\\n\"\n    \"djX de 1\\n\"\n    \"bkX ka 1\\n\"\n    \"xlP le 1\\n\"\n    \"hCt th 1\\n\"\n    \"Yhj th 1\\n\"\n    \"gwQ ng 1\\n\"\n    \"klD le 1\\n\"\n    \"Rhq th 1\\n\"\n    \"aEj an 1\\n\"\n    \"jpY ij 1\\n\"\n    \"pVn an 1\\n\"\n    \"nJx an 1\\n\"\n    
\"zdV de 1\\n\"\n    \"Rvf va 1\\n\"\n    \"Oqy qu 1\\n\"\n    \"zpT sz 1\\n\"\n    \"Pzc ch 1\\n\"\n    \"qTm qu 1\\n\"\n    \"jfq ij 1\\n\"\n    \"ztY th 1\\n\"\n    \"Zqv qu 1\\n\"\n    \"nZb an 1\\n\"\n    \"pHl le 1\\n\"\n    \"Qcr ch 1\\n\"\n    \"zVm sz 1\\n\"\n    \"pNm me 1\\n\"\n    \"Xhj th 1\\n\"\n    \"oYy on 1\\n\"\n    \"Flq qu 1\\n\"\n    \"lwj le 1\\n\"\n    \"rwH er 1\\n\"\n    \"oWq qu 1\\n\"\n    \"Bwm me 1\\n\"\n    \"jXs sz 1\\n\"\n    \"Lkt th 1\\n\"\n    \"lVn an 1\\n\"\n    \"jXa an 1\\n\"\n    \"hkB th 1\\n\"\n    \"qrQ qu 1\\n\"\n    \"dqK qu 1\\n\"\n    \"Zxn an 1\\n\"\n    \"ygZ ng 1\\n\"\n    \"Fgt th 1\\n\"\n    \"nwM an 1\\n\"\n    \"Wzx sz 1\\n\"\n    \"qgb ng 1\\n\"\n    \"Ygv ng 1\\n\"\n    \"Xdd de 1\\n\"\n    \"xjM ij 1\\n\"\n    \"qHb qu 1\\n\"\n    \"zKz sz 1\\n\"\n    \"dvM de 1\\n\"\n    \"Zpx pr 1\\n\"\n    \"wPt th 1\\n\"\n    \"qiA in 1\\n\"\n    \"jyV ij 1\\n\"\n    \"jyR ij 1\\n\"\n    \"Uox on 1\\n\"\n    \"Qkz ka 1\\n\"\n    \"Lxq qu 1\\n\"\n    \"fpq qu 1\\n\"\n    \"Xmf me 1\\n\"\n    \"kRx ka 1\\n\"\n    \"jFk ij 1\\n\"\n    \"nZc an 1\\n\"\n    \"hCp th 1\\n\"\n    \"Hbw wa 1\\n\"\n    \"zlF le 1\\n\"\n    \"kqI qu 1\\n\"\n    \"wWj ij 1\\n\"\n    \"qKk qu 1\\n\"\n    \"Jpf pr 1\\n\"\n    \"lbR le 1\\n\"\n    \"rbJ er 1\\n\"\n    \"zfK sz 1\\n\"\n    \"gVk ng 1\\n\"\n    \"bZx be 1\\n\"\n    \"znQ an 1\\n\"\n    \"gZb ga 1\\n\"\n    \"wtI th 1\\n\"\n    \"bvW va 1\\n\"\n    \"qhG th 1\\n\"\n    \"xrV er 1\\n\"\n    \"pYc ch 1\\n\"\n    \"bQq qu 1\\n\"\n    \"qpV qu 1\\n\"\n    \"pFm me 1\\n\"\n    \"zdO de 1\\n\"\n    \"Jvj ij 1\\n\"\n    \"mQl le 1\\n\"\n    \"xWm me 1\\n\"\n    \"Dtz th 1\\n\"\n    \"lKz le 1\\n\"\n    \"dkI de 1\\n\"\n    \"fSx fo 1\\n\"\n    \"yCp pr 1\\n\"\n    \"whF th 1\\n\"\n    \"lVm le 1\\n\"\n    \"yHv va 1\\n\"\n    \"Plm le 1\\n\"\n    \"Jpm me 1\\n\"\n    \"hEw ha 1\\n\"\n    \"zHz sz 1\\n\"\n    \"uIj qu 1\\n\"\n    \"gzB ng 1\\n\"\n    \"qsV qu 1\\n\"\n    \"pbX pr 1\\n\"\n    
\"jyY ij 1\\n\"\n    \"mjq qu 1\\n\"\n    \"zDd de 1\\n\"\n    \"Tqc ch 1\\n\"\n    \"fTg ng 1\\n\"\n    \"qbh th 1\\n\"\n    \"Cjq qu 1\\n\"\n    \"pcW ch 1\\n\"\n    \"Xhp th 1\\n\"\n    \"fwR wa 1\\n\"\n    \"dQm de 1\\n\"\n    \"xCk ka 1\\n\"\n    \"yhM th 1\\n\"\n    \"glQ ng 1\\n\"\n    \"gVb ng 1\\n\"\n    \"Pdy de 1\\n\"\n    \"yOj ij 1\\n\"\n    \"jZg ng 1\\n\"\n    \"oqZ qu 1\\n\"\n    \"bqI qu 1\\n\"\n    \"jkX ij 1\\n\"\n    \"Kfh th 1\\n\"\n    \"xpQ pr 1\\n\"\n    \"rhX th 1\\n\"\n    \"wjI ij 1\\n\"\n    \"Bqf qu 1\\n\"\n    \"aCp an 1\\n\"\n    \"ccX ch 1\\n\"\n    \"vGm ma 1\\n\"\n    \"paU an 1\\n\"\n    \"xUh th 1\\n\"\n    \"gLd ng 1\\n\"\n    \"tfJ th 1\\n\"\n    \"fwH wa 1\\n\"\n    \"Pnq an 1\\n\"\n    \"kxV ka 1\\n\"\n    \"Nbk ka 1\\n\"\n    \"sqE qu 1\\n\"\n    \"Cjp ij 1\\n\"\n    \"kcZ ka 1\\n\"\n    \"Wqj ij 1\\n\"\n    \"tzY th 1\\n\"\n    \"nqX an 1\\n\"\n    \"Yyc ch 1\\n\"\n    \"Lzd de 1\\n\"\n    \"xZy ny 1\\n\"\n    \"sdY de 1\\n\"\n    \"jXn an 1\\n\"\n    \"Nbm me 1\\n\"\n    \"wLr er 1\\n\"\n    \"Nqr qu 1\\n\"\n    \"Zwx wa 1\\n\"\n    \"yvH va 1\\n\"\n    \"ylC le 1\\n\"\n    \"qyh th 1\\n\"\n    \"Jnz an 1\\n\"\n    \"hHv th 1\\n\"\n    \"zUq qu 1\\n\"\n    \"xgI ng 1\\n\"\n    \"Ztp th 1\\n\"\n    \"Vvb va 1\\n\"\n    \"tGn th 1\\n\"\n    \"Ujq qu 1\\n\"\n    \"jHs sz 1\\n\"\n    \"bWq qu 1\\n\"\n    \"bXr er 1\\n\"\n    \"hFg th 1\\n\"\n    \"gdT ng 1\\n\"\n    \"qHc ch 1\\n\"\n    \"lCj le 1\\n\"\n    \"mVg ng 1\\n\"\n    \"pQq qu 1\\n\"\n    \"vWl le 1\\n\"\n    \"yFq qu 1\\n\"\n    \"djY de 1\\n\"\n    \"btQ th 1\\n\"\n    \"vlM le 1\\n\"\n    \"Iwt th 1\\n\"\n    \"Pdb de 1\\n\"\n    \"jtQ th 1\\n\"\n    \"xjR ij 1\\n\"\n    \"dhW th 1\\n\"\n    \"zXs sz 1\\n\"\n    \"fbE be 1\\n\"\n    \"Hqr qu 1\\n\"\n    \"vLt th 1\\n\"\n    \"kbD ka 1\\n\"\n    \"vUd de 1\\n\"\n    \"yZc ch 1\\n\"\n    \"Qke le 1\\n\"\n    \"fhG th 1\\n\"\n    \"eHt th 1\\n\"\n    \"vHj ij 1\\n\"\n    \"Tfg ng 1\\n\"\n    \"uoA qu 1\\n\"\n    
\"zCx sz 1\\n\"\n    \"zLk sz 1\\n\"\n    \"jdW de 1\\n\"\n    \"Cgn an 1\\n\"\n    \"Lrq qu 1\\n\"\n    \"yOi in 1\\n\"\n    \"qOw qu 1\\n\"\n    \"fqs qu 1\\n\"\n    \"ltQ th 1\\n\"\n    \"nwU an 1\\n\"\n    \"zYq qu 1\\n\"\n    \"Gzs st 1\\n\"\n    \"nWv an 1\\n\"\n    \"lNx le 1\\n\"\n    \"Wql qu 1\\n\"\n    \"dcD ch 1\\n\"\n    \"vfD va 1\\n\"\n    \"qVd qu 1\\n\"\n    \"Wzz sz 1\\n\"\n    \"jfH ij 1\\n\"\n    \"Rrt th 1\\n\"\n    \"qDr qu 1\\n\"\n    \"lOh th 1\\n\"\n    \"wwZ wa 1\\n\"\n    \"mQw me 1\\n\"\n    \"nqK an 1\\n\"\n    \"Uvl le 1\\n\"\n    \"kRq qu 1\\n\"\n    \"Vhg th 1\\n\"\n    \"xsD st 1\\n\"\n    \"Ldd de 1\\n\"\n    \"sQv st 1\\n\"\n    \"qMj qu 1\\n\"\n    \"hbQ th 1\\n\"\n    \"cjX ch 1\\n\"\n    \"nbT an 1\\n\"\n    \"xNf fo 1\\n\"\n    \"wCt th 1\\n\"\n    \"jnX an 1\\n\"\n    \"tZf th 1\\n\"\n    \"qCk qu 1\\n\"\n    \"dHk de 1\\n\"\n    \"Ccq ch 1\\n\"\n    \"uMf qu 1\\n\"\n    \"bvG va 1\\n\"\n    \"zPz sz 1\\n\"\n    \"yIy ny 1\\n\"\n    \"lHx le 1\\n\"\n    \"fnB an 1\\n\"\n    \"Ebx be 1\\n\"\n    \"rGc ch 1\\n\"\n    \"mgD ng 1\\n\"\n    \"hJg th 1\\n\"\n    \"jcG ch 1\\n\"\n    \"Ybd de 1\\n\"\n    \"oDq qu 1\\n\"\n    \"jRx ij 1\\n\"\n    \"kJf ka 1\\n\"\n    \"tFv th 1\\n\"\n    \"Gdv de 1\\n\"\n    \"fHn an 1\\n\"\n    \"Uqp qu 1\\n\"\n    \"cYh th 1\\n\"\n    \"kHp ka 1\\n\"\n    \"qhZ th 1\\n\"\n    \"wZh th 1\\n\"\n    \"kQt th 1\\n\"\n    \"hwH th 1\\n\"\n    \"xzU sz 1\\n\"\n    \"tQg th 1\\n\"\n    \"Qbj ij 1\\n\"\n    \"zVl le 1\\n\"\n    \"qJd qu 1\\n\"\n    \"Xrf er 1\\n\"\n    \"fMv va 1\\n\"\n    \"qJc ch 1\\n\"\n    \"Dqy qu 1\\n\"\n    \"qMs qu 1\\n\"\n    \"fzl le 1\\n\"\n    \"Wdx de 1\\n\"\n    \"Tdw wa 1\\n\"\n    \"mcT ch 1\\n\"\n    \"fOd de 1\\n\"\n    \"Kgj ng 1\\n\"\n    \"yrT er 1\\n\"\n    \"bqA qu 1\\n\"\n    \"snq an 1\\n\"\n    \"Lzt th 1\\n\"\n    \"gLw ng 1\\n\"\n    \"dLq qu 1\\n\"\n    \"Qzr er 1\\n\"\n    \"Qrn an 1\\n\"\n    \"eFn an 1\\n\"\n    \"Nmw wa 1\\n\"\n    \"pxE pr 1\\n\"\n    
\"Cqk qu 1\\n\"\n    \"Wcd ch 1\\n\"\n    \"fXw wa 1\\n\"\n    \"fbU be 1\\n\"\n    \"aeO an 1\\n\"\n    \"svV st 1\\n\"\n    \"yVt th 1\\n\"\n    \"sRp st 1\\n\"\n    \"rxU er 1\\n\"\n    \"qhK th 1\\n\"\n    \"uQw qu 1\\n\"\n    \"oXw on 1\\n\"\n    \"Jvw va 1\\n\"\n    \"kvH ka 1\\n\"\n    \"zVy sz 1\\n\"\n    \"rOq qu 1\\n\"\n    \"cWx ch 1\\n\"\n    \"iXv in 1\\n\"\n    \"cBk ch 1\\n\"\n    \"xkM ka 1\\n\"\n    \"vHb va 1\\n\"\n    \"jbW ij 1\\n\"\n    \"mYq qu 1\\n\"\n    \"fnH an 1\\n\"\n    \"zRj sz 1\\n\"\n    \"hvN th 1\\n\"\n    \"oMh th 1\\n\"\n    \"yqO qu 1\\n\"\n    \"fBf fo 1\\n\"\n    \"oPj on 1\\n\"\n    \"fFc ch 1\\n\"\n    \"lVq qu 1\\n\"\n    \"ptJ th 1\\n\"\n    \"Ntj th 1\\n\"\n    \"rwL er 1\\n\"\n    \"cFz ch 1\\n\"\n    \"jVd de 1\\n\"\n    \"Gbv va 1\\n\"\n    \"oJn an 1\\n\"\n    \"wkL ka 1\\n\"\n    \"qoT qu 1\\n\"\n    \"Qxk ka 1\\n\"\n    \"rZj ij 1\\n\"\n    \"Cgd ng 1\\n\"\n    \"gvW ng 1\\n\"\n    \"kYv ka 1\\n\"\n    \"qjR qu 1\\n\"\n    \"Vnq an 1\\n\"\n    \"yJt th 1\\n\"\n    \"xWy ny 1\\n\"\n    \"bXl le 1\\n\"\n    \"xVk ka 1\\n\"\n    \"xuG qu 1\\n\"\n    \"Hzs st 1\\n\"\n    \"uDq qu 1\\n\"\n    \"Ywk ka 1\\n\"\n    \"Jkh th 1\\n\"\n    \"Gdm de 1\\n\"\n    \"qcO ch 1\\n\"\n    \"hlH th 1\\n\"\n    \"Jfv va 1\\n\"\n    \"cLn an 1\\n\"\n    \"wzG sz 1\\n\"\n    \"yhF th 1\\n\"\n    \"kfD ka 1\\n\"\n    \"kbJ ka 1\\n\"\n    \"Nqp qu 1\\n\"\n    \"gYq ng 1\\n\"\n    \"ztM th 1\\n\"\n    \"jcD ch 1\\n\"\n    \"wgY ng 1\\n\"\n    \"qdT da 1\\n\"\n    \"vTw va 1\\n\"\n    \"cNz ch 1\\n\"\n    \"Jbc ch 1\\n\"\n    \"Xcj ch 1\\n\"\n    \"rUw er 1\\n\"\n    \"gXv ng 1\\n\"\n    \"dRf de 1\\n\"\n    \"bJz sz 1\\n\"\n    \"aqA an 1\\n\"\n    \"uOz qu 1\\n\"\n    \"wPj ij 1\\n\"\n    \"uDw qu 1\\n\"\n    \"mqF qu 1\\n\"\n    \"cXr ch 1\\n\"\n    \"yrL er 1\\n\"\n    \"nJk an 1\\n\"\n    \"hsY th 1\\n\"\n    \"Zqs qu 1\\n\"\n    \"qeS qu 1\\n\"\n    \"bLv va 1\\n\"\n    \"jEo on 1\\n\"\n    \"pmE me 1\\n\"\n    \"jIt th 1\\n\"\n    
\"vzZ sz 1\\n\"\n    \"Qhd th 1\\n\"\n    \"cnN an 1\\n\"\n    \"bPq qu 1\\n\"\n    \"pZw pr 1\\n\"\n    \"iwR in 1\\n\"\n    \"oJv ko 1\\n\"\n    \"ufI qu 1\\n\"\n    \"wKm me 1\\n\"\n    \"uWv qu 1\\n\"\n    \"fCf fo 1\\n\"\n    \"wBn an 1\\n\"\n    \"Uyf ny 1\\n\"\n    \"uVx qu 1\\n\"\n    \"kKf ka 1\\n\"\n    \"mrZ er 1\\n\"\n    \"lXb le 1\\n\"\n    \"zJm sz 1\\n\"\n    \"wYr er 1\\n\"\n    \"Hkw ka 1\\n\"\n    \"Ewz sz 1\\n\"\n    \"xJy ny 1\\n\"\n    \"Emx me 1\\n\"\n    \"cqL ch 1\\n\"\n    \"zVk sz 1\\n\"\n    \"yPb be 1\\n\"\n    \"zcC ch 1\\n\"\n    \"Ndq qu 1\\n\"\n    \"uWf qu 1\\n\"\n    \"kcM ch 1\\n\"\n    \"tkB th 1\\n\"\n    \"yhq th 1\\n\"\n    \"qaP an 1\\n\"\n    \"rVs er 1\\n\"\n    \"dLd de 1\\n\"\n    \"Sgm ng 1\\n\"\n    \"Xhx th 1\\n\"\n    \"xqH qu 1\\n\"\n    \"Kqy qu 1\\n\"\n    \"yRw wa 1\\n\"\n    \"Wdw de 1\\n\"\n    \"qcQ ch 1\\n\"\n    \"zbp sz 1\\n\"\n    \"dtY th 1\\n\"\n    \"cwB ch 1\\n\"\n    \"nfV an 1\\n\"\n    \"cgP ch 1\\n\"\n    \"pwW pr 1\\n\"\n    \"pqf qu 1\\n\"\n    \"Xkp ka 1\\n\"\n    \"izJ in 1\\n\"\n    \"cYw ch 1\\n\"\n    \"iQl in 1\\n\"\n    \"Qvy va 1\\n\"\n    \"ylR le 1\\n\"\n    \"sFp st 1\\n\"\n    \"Lqg ng 1\\n\"\n    \"xnP an 1\\n\"\n    \"gYl ng 1\\n\"\n    \"wIr er 1\\n\"\n    \"fqR qu 1\\n\"\n    \"Qpk ka 1\\n\"\n    \"qXz qu 1\\n\"\n    \"Lrr er 1\\n\"\n    \"sjI st 1\\n\"\n    \"iyX in 1\\n\"\n    \"Zfq qu 1\\n\"\n    \"vtH th 1\\n\"\n    \"cZf ch 1\\n\"\n    \"hXp th 1\\n\"\n    \"rJw er 1\\n\"\n    \"gbP ng 1\\n\"\n    \"Qug ng 1\\n\"\n    \"jRt th 1\\n\"\n    \"lXh th 1\\n\"\n    \"pVc ch 1\\n\"\n    \"kGc ch 1\\n\"\n    \"Nxr er 1\\n\"\n    \"yKk ka 1\\n\"\n    \"xAo on 1\\n\"\n    \"oUx on 1\\n\"\n    \"nWx an 1\\n\"\n    \"fwU wa 1\\n\"\n    \"mKg ng 1\\n\"\n    \"qhO th 1\\n\"\n    \"sGg ng 1\\n\"\n    \"Wwu qu 1\\n\"\n    \"cnE an 1\\n\"\n    \"tjS th 1\\n\"\n    \"Qyd de 1\\n\"\n    \"yWm me 1\\n\"\n    \"Qdj de 1\\n\"\n    \"jSd de 1\\n\"\n    \"Ioy on 1\\n\"\n    \"Xpp pr 1\\n\"\n    
\"xJb be 1\\n\"\n    \"xvT va 1\\n\"\n    \"cdT ch 1\\n\"\n    \"khX th 1\\n\"\n    \"hVp th 1\\n\"\n    \"cjT ch 1\\n\"\n    \"Hqf qu 1\\n\"\n    \"nbP an 1\\n\"\n    \"Uwb wa 1\\n\"\n    \"Kcb ch 1\\n\"\n    \"qsQ qu 1\\n\"\n    \"tkZ th 1\\n\"\n    \"zrX er 1\\n\"\n    \"zbN sz 1\\n\"\n    \"mYi in 1\\n\"\n    \"gLx ng 1\\n\"\n    \"sGc ch 1\\n\"\n    \"Pbv va 1\\n\"\n    \"gcV ch 1\\n\"\n    \"Qjf ij 1\\n\"\n    \"wvB va 1\\n\"\n    \"gKp ng 1\\n\"\n    \"jZy ij 1\\n\"\n    \"qhW th 1\\n\"\n    \"vCg ng 1\\n\"\n    \"Lrk er 1\\n\"\n    \"fRw wa 1\\n\"\n    \"cMj ch 1\\n\"\n    \"ohK th 1\\n\"\n    \"frK er 1\\n\"\n    \"dQq qu 1\\n\"\n    \"Hdj de 1\\n\"\n    \"Bkx ka 1\\n\"\n    \"yXv va 1\\n\"\n    \"fdO de 1\\n\"\n    \"sWg ng 1\\n\"\n    \"Xtf th 1\\n\"\n    \"rUx ar 1\\n\"\n    \"qHm qu 1\\n\"\n    \"kQh th 1\\n\"\n    \"wzU sz 1\\n\"\n    \"vTt th 1\\n\"\n    \"zkN sz 1\\n\"\n    \"Fqp qu 1\\n\"\n    \"xJc ch 1\\n\"\n    \"wkQ ka 1\\n\"\n    \"wxF wa 1\\n\"\n    \"vRj ij 1\\n\"\n    \"jzD sz 1\\n\"\n    \"Zqu un 1\\n\"\n    \"zWw sz 1\\n\"\n    \"zgU ng 1\\n\"\n    \"ugX ng 1\\n\"\n    \"pmB me 1\\n\"\n    \"gzA ng 1\\n\"\n    \"Zjj ij 1\\n\"\n    \"xIj ij 1\\n\"\n    \"xoK on 1\\n\"\n    \"Gqx qu 1\\n\"\n    \"uLq qu 1\\n\"\n    \"lGw le 1\\n\"\n    \"tZq th 1\\n\"\n    \"zcN ch 1\\n\"\n    \"yPz sz 1\\n\"\n    \"rqN qu 1\\n\"\n    \"pwG pr 1\\n\"\n    \"vfP va 1\\n\"\n    \"vIy va 1\\n\"\n    \"vEj ij 1\\n\"\n    \"jqD qu 1\\n\"\n    \"Hxu qu 1\\n\"\n    \"qLs qu 1\\n\"\n    \"Jpy pr 1\\n\"\n    \"pRw pr 1\\n\"\n    \"fZs st 1\\n\"\n    \"Vvx va 1\\n\"\n    \"zkB sz 1\\n\"\n    \"yGk ka 1\\n\"\n    \"kvZ ka 1\\n\"\n    \"cqW ch 1\\n\"\n    \"wLg ng 1\\n\"\n    \"Ypg ng 1\\n\"\n    \"jrR er 1\\n\"\n    \"vwZ va 1\\n\"\n    \"gVd ng 1\\n\"\n    \"iCw ij 1\\n\"\n    \"Fxw wa 1\\n\"\n    \"qyZ qu 1\\n\"\n    \"qgT qu 1\\n\"\n    \"xLs st 1\\n\"\n    \"pXg ng 1\\n\"\n    \"gNv ng 1\\n\"\n    \"Hgz ng 1\\n\"\n    \"zJv sz 1\\n\"\n    \"Hvm va 1\\n\"\n    
\"uXb qu 1\\n\"\n    \"lLz le 1\\n\"\n    \"dwP de 1\\n\"\n    \"gvN ng 1\\n\"\n    \"cpF ch 1\\n\"\n    \"vZj ij 1\\n\"\n    \"Pfv va 1\\n\"\n    \"xcI ch 1\\n\"\n    \"yVp pr 1\\n\"\n    \"fdC de 1\\n\"\n    \"pbE pr 1\\n\"\n    \"jQm ij 1\\n\"\n    \"Tqt th 1\\n\"\n    \"wMh th 1\\n\"\n    \"Gkq qu 1\\n\"\n    \"tdV th 1\\n\"\n    \"xIk ka 1\\n\"\n    \"hHp th 1\\n\"\n    \"Lsb st 1\\n\"\n    \"Wvs st 1\\n\"\n    \"Qcw ch 1\\n\"\n    \"gfQ ng 1\\n\"\n    \"Fjt th 1\\n\"\n    \"xBz sz 1\\n\"\n    \"fLx fo 1\\n\"\n    \"zkR sz 1\\n\"\n    \"kjA ij 1\\n\"\n    \"Fcw ch 1\\n\"\n    \"fhT th 1\\n\"\n    \"qiK qu 1\\n\"\n    \"wQv va 1\\n\"\n    \"pXl le 1\\n\"\n    \"hLg th 1\\n\"\n    \"jJw ij 1\\n\"\n    \"sOj st 1\\n\"\n    \"vWb va 1\\n\"\n    \"Ajq qu 1\\n\"\n    \"vKc ch 1\\n\"\n    \"iIy in 1\\n\"\n    \"pJy pr 1\\n\"\n    \"Lqc ch 1\\n\"\n    \"wBd de 1\\n\"\n    \"kRb ka 1\\n\"\n    \"Lcp ch 1\\n\"\n    \"gfB ng 1\\n\"\n    \"zVn an 1\\n\"\n    \"qWf qu 1\\n\"\n    \"Qyf ny 1\\n\"\n    \"puF qu 1\\n\"\n    \"fIe er 1\\n\"\n    \"wGb wa 1\\n\"\n    \"jjL ij 1\\n\"\n    \"hcE th 1\\n\"\n    \"qhp th 1\\n\"\n    \"gxN ng 1\\n\"\n    \"tMd th 1\\n\"\n    \"Rzt th 1\\n\"\n    \"cgO ch 1\\n\"\n    \"vmT va 1\\n\"\n    \"Dcq ch 1\\n\"\n    \"qoI qu 1\\n\"\n    \"Nqz qu 1\\n\"\n    \"vhM th 1\\n\"\n    \"gBq ng 1\\n\"\n    \"jWv ij 1\\n\"\n    \"xmE me 1\\n\"\n    \"qcd ch 1\\n\"\n    \"lYj le 1\\n\"\n    \"dDc ch 1\\n\"\n    \"xUa an 1\\n\"\n    \"kVl le 1\\n\"\n    \"wqN qu 1\\n\"\n    \"uuI qu 1\\n\"\n    \"Wzf sz 1\\n\"\n    \"yvX va 1\\n\"\n    \"Pyq qu 1\\n\"\n    \"wuU qu 1\\n\"\n    \"hLp th 1\\n\"\n    \"qqL qu 1\\n\"\n    \"cVh th 1\\n\"\n    \"Fgs ng 1\\n\"\n    \"xjF ij 1\\n\"\n    \"wkG ka 1\\n\"\n    \"qJr qu 1\\n\"\n    \"Gzq qu 1\\n\"\n    \"Ixv va 1\\n\"\n    \"hMv th 1\\n\"\n    \"dfQ de 1\\n\"\n    \"eOx er 1\\n\"\n    \"mHq qu 1\\n\"\n    \"Zkn an 1\\n\"\n    \"nqW an 1\\n\"\n    \"nJd an 1\\n\"\n    \"pEh th 1\\n\"\n    \"gVg ng 1\\n\"\n    
\"Zyf ny 1\\n\"\n    \"nmT an 1\\n\"\n    \"csQ ch 1\\n\"\n    \"Pkq qu 1\\n\"\n    \"tdP th 1\\n\"\n    \"fkz sz 1\\n\"\n    \"Qnc an 1\\n\"\n    \"pBj ij 1\\n\"\n    \"Mjv ij 1\\n\"\n    \"ymJ me 1\\n\"\n    \"Mxs st 1\\n\"\n    \"hbL th 1\\n\"\n    \"vQh th 1\\n\"\n    \"xDy ny 1\\n\"\n    \"djC de 1\\n\"\n    \"cdQ ch 1\\n\"\n    \"bnL an 1\\n\"\n    \"Yjl le 1\\n\"\n    \"qUc ch 1\\n\"\n    \"mjW ij 1\\n\"\n    \"zWs st 1\\n\"\n    \"xvF va 1\\n\"\n    \"Gqi qu 1\\n\"\n    \"fGm me 1\\n\"\n    \"Xuw qu 1\\n\"\n    \"qCs qu 1\\n\"\n    \"Kxm me 1\\n\"\n    \"lNn an 1\\n\"\n    \"sdL de 1\\n\"\n    \"Vtn th 1\\n\"\n    \"sJj st 1\\n\"\n    \"kQj ij 1\\n\"\n    \"xfX fo 1\\n\"\n    \"Nqk qu 1\\n\"\n    \"cBs ch 1\\n\"\n    \"yzP sz 1\\n\"\n    \"xUv va 1\\n\"\n    \"lbT le 1\\n\"\n    \"wyV wa 1\\n\"\n    \"Xkm ka 1\\n\"\n    \"Wdv de 1\\n\"\n    \"qQn an 1\\n\"\n    \"sqZ qu 1\\n\"\n    \"sfW st 1\\n\"\n    \"gfM ng 1\\n\"\n    \"Vlp le 1\\n\"\n    \"Xjx ij 1\\n\"\n    \"hIj th 1\\n\"\n    \"Jws st 1\\n\"\n    \"xZr er 1\\n\"\n    \"iKw in 1\\n\"\n    \"Tbd de 1\\n\"\n    \"zQv sz 1\\n\"\n    \"nmZ an 1\\n\"\n    \"bpE pr 1\\n\"\n    \"zSv sz 1\\n\"\n    \"Fgi ng 1\\n\"\n    \"uIw qu 1\\n\"\n    \"Zvx va 1\\n\"\n    \"rqR qu 1\\n\"\n    \"vjZ ij 1\\n\"\n    \"Njr er 1\\n\"\n    \"kwF ka 1\\n\"\n    \"Ovw va 1\\n\"\n    \"hwZ th 1\\n\"\n    \"Mvk ka 1\\n\"\n    \"Dvf va 1\\n\"\n    \"xsP st 1\\n\"\n    \"gZq ng 1\\n\"\n    \"vXv va 1\\n\"\n    \"wGt th 1\\n\"\n    \"qlO qu 1\\n\"\n    \"fNz sz 1\\n\"\n    \"Nvw va 1\\n\"\n    \"zdZ de 1\\n\"\n    \"vxV va 1\\n\"\n    \"Nhz th 1\\n\"\n    \"tZm th 1\\n\"\n    \"iyS in 1\\n\"\n    \"qZa an 1\\n\"\n    \"xrZ er 1\\n\"\n    \"qly qu 1\\n\"\n    \"cjM ch 1\\n\"\n    \"kYj ij 1\\n\"\n    \"iyF in 1\\n\"\n    \"Cdq qu 1\\n\"\n    \"xwE wa 1\\n\"\n    \"xfV fo 1\\n\"\n    \"wbF wa 1\\n\"\n    \"wuO qu 1\\n\"\n    \"Rlh th 1\\n\"\n    \"fCj ij 1\\n\"\n    \"bcZ ch 1\\n\"\n    \"Gjv ij 1\\n\"\n    \"gLl ng 1\\n\"\n    
\"wLc ch 1\\n\"\n    \"zmP sz 1\\n\"\n    \"cYo ch 1\\n\"\n    \"Rhk th 1\\n\"\n    \"grM ng 1\\n\"\n    \"fDh th 1\\n\"\n    \"Yyb be 1\\n\"\n    \"uyW un 1\\n\"\n    \"kGb ka 1\\n\"\n    \"iwK in 1\\n\"\n    \"qkN qu 1\\n\"\n    \"qXd qu 1\\n\"\n    \"zCb sz 1\\n\"\n    \"rQf er 1\\n\"\n    \"xrO er 1\\n\"\n    \"Fzh th 1\\n\"\n    \"wSj ij 1\\n\"\n    \"yPw wa 1\\n\"\n    \"Bqw qu 1\\n\"\n    \"kWc ch 1\\n\"\n    \"qhX th 1\\n\"\n    \"kBw ka 1\\n\"\n    \"yvL va 1\\n\"\n    \"xcT ch 1\\n\"\n    \"Fbz sz 1\\n\"\n    \"cEb ch 1\\n\"\n    \"vEk ka 1\\n\"\n    \"uQh th 1\\n\"\n    \"sHw us 1\\n\"\n    \"Fvf va 1\\n\"\n    \"wkO ka 1\\n\"\n    \"wiY in 1\\n\"\n    \"sPm st 1\\n\"\n    \"dFn an 1\\n\"\n    \"qQx qu 1\\n\"\n    \"Rsg ng 1\\n\"\n    \"fUj ij 1\\n\"\n    \"tLw th 1\\n\"\n    \"sRk st 1\\n\"\n    \"zkP sz 1\\n\"\n    \"mvF va 1\\n\"\n    \"jYb ij 1\\n\"\n    \"swY is 1\\n\"\n    \"rRc ch 1\\n\"\n    \"rHd er 1\\n\"\n    \"bDk ka 1\\n\"\n    \"lWv le 1\\n\"\n    \"vqv qu 1\\n\"\n    \"qoN qu 1\\n\"\n    \"zMl le 1\\n\"\n    \"pfJ pr 1\\n\"\n    \"Dmz sz 1\\n\"\n    \"obQ on 1\\n\"\n    \"Vfz sz 1\\n\"\n    \"bVd de 1\\n\"\n    \"Cjv ij 1\\n\"\n    \"mKz sz 1\\n\"\n    \"jjE ij 1\\n\"\n    \"Aqc ch 1\\n\"\n    \"Cxn an 1\\n\"\n    \"vpH va 1\\n\"\n    \"Lxa an 1\\n\"\n    \"zpH sz 1\\n\"\n    \"qoF qu 1\\n\"\n    \"hRz th 1\\n\"\n    \"yYw wa 1\\n\"\n    \"dUx de 1\\n\"\n    \"Kxl le 1\\n\"\n    \"xUo on 1\\n\"\n    \"hDp th 1\\n\"\n    \"zDf sz 1\\n\"\n    \"Wsq qu 1\\n\"\n    \"jzZ sz 1\\n\"\n    \"mGf me 1\\n\"\n    \"jjV ij 1\\n\"\n    \"pfR pr 1\\n\"\n    \"bPd de 1\\n\"\n    \"wjq qu 1\\n\"\n    \"Rjx ij 1\\n\"\n    \"Lwq qu 1\\n\"\n    \"fqH qu 1\\n\"\n    \"jRs sz 1\\n\"\n    \"sfT sz 1\\n\"\n    \"Grw er 1\\n\"\n    \"zGn an 1\\n\"\n    \"ycW ch 1\\n\"\n    \"lUq qu 1\\n\"\n    \"pRq qu 1\\n\"\n    \"nZq an 1\\n\"\n    \"Svx va 1\\n\"\n    \"Phf th 1\\n\"\n    \"Fvj ij 1\\n\"\n    \"Qlm le 1\\n\"\n    \"jgS ng 1\\n\"\n    \"Mmv va 1\\n\"\n    
\"xPd de 1\\n\"\n    \"qqw qu 1\\n\"\n    \"rWp er 1\\n\"\n    \"qIr qu 1\\n\"\n    \"Cxf fo 1\\n\"\n    \"wtG th 1\\n\"\n    \"cKb ch 1\\n\"\n    \"btL th 1\\n\"\n    \"pRx pr 1\\n\"\n    \"zsB sz 1\\n\"\n    \"nbD an 1\\n\"\n    \"jKg ng 1\\n\"\n    \"bhL th 1\\n\"\n    \"Yhw th 1\\n\"\n    \"yYr er 1\\n\"\n    \"jCm ij 1\\n\"\n    \"xzK sz 1\\n\"\n    \"pJl le 1\\n\"\n    \"Qrr er 1\\n\"\n    \"uvG qu 1\\n\"\n    \"cfJ ch 1\\n\"\n    \"iqX in 1\\n\"\n    \"vNd de 1\\n\"\n    \"qcM ch 1\\n\"\n    \"Wvj ij 1\\n\"\n    \"vmS va 1\\n\"\n    \"vWp va 1\\n\"\n    \"aIj an 1\\n\"\n    \"jmS ij 1\\n\"\n    \"Fmk ka 1\\n\"\n    \"iyN in 1\\n\"\n    \"bZu qu 1\\n\"\n    \"Kzj sz 1\\n\"\n    \"Vwd de 1\\n\"\n    \"Ulx le 1\\n\"\n    \"rCv er 1\\n\"\n    \"wvq qu 1\\n\"\n    \"Qkr ri 1\\n\"\n    \"fjC ij 1\\n\"\n    \"tRr th 1\\n\"\n    \"pCy pr 1\\n\"\n    \"fbC be 1\\n\"\n    \"fQc ch 1\\n\"\n    \"Xkf ka 1\\n\"\n    \"Dqr qu 1\\n\"\n    \"fgE ng 1\\n\"\n    \"vMm va 1\\n\"\n    \"dPb de 1\\n\"\n    \"vjL ij 1\\n\"\n    \"wKc ch 1\\n\"\n    \"Pyw wa 1\\n\"\n    \"eXv er 1\\n\"\n    \"nVw an 1\\n\"\n    \"Jww wa 1\\n\"\n    \"Dfq qu 1\\n\"\n    \"tCc th 1\\n\"\n    \"qtH th 1\\n\"\n    \"Xqm qu 1\\n\"\n    \"Bhc th 1\\n\"\n    \"tcX th 1\\n\"\n    \"xKp pr 1\\n\"\n    \"tfN th 1\\n\"\n    \"ibZ in 1\\n\"\n    \"Nzb sz 1\\n\"\n    \"Wnj an 1\\n\"\n    \"vXy va 1\\n\"\n    \"iVf in 1\\n\"\n    \"dxT de 1\\n\"\n    \"jxQ ij 1\\n\"\n    \"Ddv de 1\\n\"\n    \"mXd de 1\\n\"\n    \"fUq qu 1\\n\"\n    \"wgQ ng 1\\n\"\n    \"Lgj ng 1\\n\"\n    \"mgY ng 1\\n\"\n    \"qMw qu 1\\n\"\n    \"gpJ ng 1\\n\"\n    \"sZx st 1\\n\"\n    \"nXz an 1\\n\"\n    \"Wve er 1\\n\"\n    \"lVk le 1\\n\"\n    \"wCb wa 1\\n\"\n    \"xvI va 1\\n\"\n    \"mfJ me 1\\n\"\n    \"tQq th 1\\n\"\n    \"dTt th 1\\n\"\n    \"fqk qu 1\\n\"\n    \"nVt th 1\\n\"\n    \"wIh th 1\\n\"\n    \"Qvp va 1\\n\"\n    \"vfN va 1\\n\"\n    \"gQs ng 1\\n\"\n    \"iVp in 1\\n\"\n    \"jGl le 1\\n\"\n    \"xMf fo 1\\n\"\n    
\"xvw wi 1\\n\"\n    \"zIl le 1\\n\"\n    \"zfR sz 1\\n\"\n    \"zWv sz 1\\n\"\n    \"ehV th 1\\n\"\n    \"dZq qu 1\\n\"\n    \"tmK th 1\\n\"\n    \"cLt th 1\\n\"\n    \"pZb pr 1\\n\"\n    \"vnJ an 1\\n\"\n    \"fvk ka 1\\n\"\n    \"Xhv th 1\\n\"\n    \"Vjn an 1\\n\"\n    \"tgI th 1\\n\"\n    \"xaJ an 1\\n\"\n    \"mSf me 1\\n\"\n    \"Xzm sz 1\\n\"\n    \"dTz de 1\\n\"\n    \"xXm me 1\\n\"\n    \"pQz sz 1\\n\"\n    \"Cqg ng 1\\n\"\n    \"bSs st 1\\n\"\n    \"prW er 1\\n\"\n    \"hDb th 1\\n\"\n    \"sXt th 1\\n\"\n    \"kcD ch 1\\n\"\n    \"kgZ ng 1\\n\"\n    \"Tzt th 1\\n\"\n    \"zcR ch 1\\n\"\n    \"Xwu qu 1\\n\"\n    \"kXg ng 1\\n\"\n    \"Ywv wi 1\\n\"\n    \"rpK er 1\\n\"\n    \"wPs is 1\\n\"\n    \"Kjz sz 1\\n\"\n    \"fDb be 1\\n\"\n    \"jrF er 1\\n\"\n    \"bbQ be 1\\n\"\n    \"Qdb de 1\\n\"\n    \"rKt th 1\\n\"\n    \"vYf va 1\\n\"\n    \"vxA va 1\\n\"\n    \"fhM th 1\\n\"\n    \"jsU st 1\\n\"\n    \"zXk sz 1\\n\"\n    \"uwO qu 1\\n\"\n    \"jsR st 1\\n\"\n    \"kHn an 1\\n\"\n    \"xWv va 1\\n\"\n    \"vfS va 1\\n\"\n    \"pIv va 1\\n\"\n    \"bcW ch 1\\n\"\n    \"zdM sz 1\\n\"\n    \"gCz ng 1\\n\"\n    \"hzN th 1\\n\"\n    \"bQw wa 1\\n\"\n    \"ojX on 1\\n\"\n    \"Vqv qu 1\\n\"\n    \"qWb qu 1\\n\"\n    \"Ykb ka 1\\n\"\n    \"xnJ an 1\\n\"\n    \"sJz st 1\\n\"\n    \"hRr th 1\\n\"\n    \"tXs th 1\\n\"\n    \"Qeb er 1\\n\"\n    \"Uwd de 1\\n\"\n    \"nYg an 1\\n\"\n    \"Yfx fo 1\\n\"\n    \"xrG er 1\\n\"\n    \"eZr le 1\\n\"\n    \"ufV us 1\\n\"\n    \"rXm er 1\\n\"\n    \"qZv qu 1\\n\"\n    \"vQz sz 1\\n\"\n    \"Tnq an 1\\n\"\n    \"Rmj ij 1\\n\"\n    \"jlM le 1\\n\"\n    \"cqO ch 1\\n\"\n    \"xWf fo 1\\n\"\n    \"jcZ ch 1\\n\"\n    \"jfV ij 1\\n\"\n    \"Zmj ij 1\\n\"\n    \"bxM be 1\\n\"\n    \"fFd de 1\\n\"\n    \"gjP ng 1\\n\"\n    \"hMs th 1\\n\"\n    \"Ysq qu 1\\n\"\n    \"qkV qu 1\\n\"\n    \"Kmc ch 1\\n\"\n    \"xYy ny 1\\n\"\n    \"dvX de 1\\n\"\n    \"rwC er 1\\n\"\n    \"gwW wa 1\\n\"\n    \"Qpy pr 1\\n\"\n    \"jXy ij 1\\n\"\n    
\"qOj qu 1\\n\"\n    \"Qmz sz 1\\n\"\n    \"Eqq qu 1\\n\"\n    \"zJs st 1\\n\"\n    \"fHy ny 1\\n\"\n    \"hDt th 1\\n\"\n    \"sDh th 1\\n\"\n    \"Vkq qu 1\\n\"\n    \"yLc ch 1\\n\"\n    \"vHm va 1\\n\"\n    \"vnX an 1\\n\"\n    \"jxS ij 1\\n\"\n    \"Jtj th 1\\n\"\n    \"qgE ng 1\\n\"\n    \"bpH pr 1\\n\"\n    \"Iqy qu 1\\n\"\n    \"qMn an 1\\n\"\n    \"dmE de 1\\n\"\n    \"Hfq qu 1\\n\"\n    \"pSb pr 1\\n\"\n    \"xhI th 1\\n\"\n    \"Qjt th 1\\n\"\n    \"yfX ny 1\\n\"\n    \"vuF qu 1\\n\"\n    \"wFw wa 1\\n\"\n    \"znS an 1\\n\"\n    \"zlV le 1\\n\"\n    \"lkK le 1\\n\"\n    \"Fvz sz 1\\n\"\n    \"qjT qu 1\\n\"\n    \"zoQ on 1\\n\"\n    \"Wvx va 1\\n\"\n    \"hMn th 1\\n\"\n    \"dMw de 1\\n\"\n    \"gcF ch 1\\n\"\n    \"dbB de 1\\n\"\n    \"Cqj qu 1\\n\"\n    \"mCv va 1\\n\"\n    \"pJx pr 1\\n\"\n    \"Dfv va 1\\n\"\n    \"sjL st 1\\n\"\n    \"qiG in 1\\n\"\n    \"Zls le 1\\n\"\n    \"Vsf st 1\\n\"\n    \"Fgd ng 1\\n\"\n    \"wmD me 1\\n\"\n    \"Dxo on 1\\n\"\n    \"qrk qu 1\\n\"\n    \"pJr er 1\\n\"\n    \"cLx ch 1\\n\"\n    \"jdB de 1\\n\"\n    \"ybM be 1\\n\"\n    \"mvM va 1\\n\"\n    \"jtX th 1\\n\"\n    \"cnB an 1\\n\"\n    \"wtW th 1\\n\"\n    \"Ksd st 1\\n\"\n    \"wql wa 1\\n\"\n    \"mhU th 1\\n\"\n    \"oJy on 1\\n\"\n    \"Ghp th 1\\n\"\n    \"qoX qu 1\\n\"\n    \"xsI st 1\\n\"\n    \"vFs st 1\\n\"\n    \"fYe er 1\\n\"\n    \"lnV an 1\\n\"\n    \"uXn an 1\\n\"\n    \"Eoh th 1\\n\"\n    \"wcM wa 1\\n\"\n    \"jwK ij 1\\n\"\n    \"Gke er 1\\n\"\n    \"uFq qu 1\\n\"\n    \"Ycg ch 1\\n\"\n    \"xqy qu 1\\n\"\n    \"btM th 1\\n\"\n    \"jHw ij 1\\n\"\n    \"qeU qu 1\\n\"\n    \"Qjz sz 1\\n\"\n    \"nuQ an 1\\n\"\n    \"Fcx ch 1\\n\"\n    \"Kqt th 1\\n\"\n    \"Lqv qu 1\\n\"\n    \"mwU me 1\\n\"\n    \"fQs st 1\\n\"\n    \"kSd de 1\\n\"\n    \"nYv an 1\\n\"\n    \"wGj ij 1\\n\"\n    \"gvZ ng 1\\n\"\n    \"mqN qu 1\\n\"\n    \"Fhp th 1\\n\"\n    \"pMq qu 1\\n\"\n    \"dBh ch 1\\n\"\n    \"bXk ka 1\\n\"\n    \"fqK qu 1\\n\"\n    \"Yyq qu 1\\n\"\n    
\"Krq qu 1\\n\"\n    \"Rnv an 1\\n\"\n    \"uuE qu 1\\n\"\n    \"Xsz st 1\\n\"\n    \"fKb be 1\\n\"\n    \"yIh th 1\\n\"\n    \"Ncd ch 1\\n\"\n    \"mLr er 1\\n\"\n    \"cSs ch 1\\n\"\n    \"lbE le 1\\n\"\n    \"xaW an 1\\n\"\n    \"Rtd th 1\\n\"\n    \"rbF er 1\\n\"\n    \"vgR ng 1\\n\"\n    \"scZ ch 1\\n\"\n    \"rHp er 1\\n\"\n    \"eYw er 1\\n\"\n    \"Lxj ij 1\\n\"\n    \"qRg ng 1\\n\"\n    \"jpN ij 1\\n\"\n    \"rjW er 1\\n\"\n    \"lgK ng 1\\n\"\n    \"mCc ch 1\\n\"\n    \"fGu qu 1\\n\"\n    \"xzT sz 1\\n\"\n    \"wQw wa 1\\n\"\n    \"klJ li 1\\n\"\n    \"cqk ch 1\\n\"\n    \"lMh th 1\\n\"\n    \"pYs st 1\\n\"\n    \"hQk th 1\\n\"\n    \"Hxz sz 1\\n\"\n    \"feY er 1\\n\"\n    \"fhF th 1\\n\"\n    \"fBm me 1\\n\"\n    \"fVt th 1\\n\"\n    \"zfh th 1\\n\"\n    \"sbT st 1\\n\"\n    \"dQy de 1\\n\"\n    \"Fmc ch 1\\n\"\n    \"vhL th 1\\n\"\n    \"Jtb th 1\\n\"\n    \"Vrx er 1\\n\"\n    \"yqZ qu 1\\n\"\n    \"jDm ij 1\\n\"\n    \"mfV me 1\\n\"\n    \"oSx on 1\\n\"\n    \"Jxg ng 1\\n\"\n    \"wOq qu 1\\n\"\n    \"dJq qu 1\\n\"\n    \"Vvc ch 1\\n\"\n    \"Eqe qu 1\\n\"\n    \"jqO qu 1\\n\"\n    \"zxI sz 1\\n\"\n    \"qKf qu 1\\n\"\n    \"fdW de 1\\n\"\n    \"ccM ch 1\\n\"\n    \"gcW ch 1\\n\"\n    \"lFn an 1\\n\"\n    \"Rvq qu 1\\n\"\n    \"znN an 1\\n\"\n    \"zbU sz 1\\n\"\n    \"tNw th 1\\n\"\n    \"wjK ij 1\\n\"\n    \"Jbd de 1\\n\"\n    \"Bfc ch 1\\n\"\n    \"qeX le 1\\n\"\n    \"tXk th 1\\n\"\n    \"slJ le 1\\n\"\n    \"cKd ch 1\\n\"\n    \"nCf an 1\\n\"\n    \"qgV ng 1\\n\"\n    \"Mhx th 1\\n\"\n    \"sKf st 1\\n\"\n    \"hqZ th 1\\n\"\n    \"Fdt th 1\\n\"\n    \"qzJ qu 1\\n\"\n    \"sNn an 1\\n\"\n    \"tjW th 1\\n\"\n    \"xcN ch 1\\n\"\n    \"fcJ ch 1\\n\"\n    \"djU de 1\\n\"\n    \"Ygh th 1\\n\"\n    \"woI on 1\\n\"\n    \"Yyz sz 1\\n\"\n    \"kQc ch 1\\n\"\n    \"hfQ th 1\\n\"\n    \"nrL an 1\\n\"\n    \"lQs le 1\\n\"\n    \"mtF th 1\\n\"\n    \"wbX wa 1\\n\"\n    \"gmR ng 1\\n\"\n    \"Zsq qu 1\\n\"\n    \"ytQ th 1\\n\"\n    \"mbF me 1\\n\"\n    
\"fgT ng 1\\n\"\n    \"cWu ch 1\\n\"\n    \"gxG ng 1\\n\"\n    \"hNv th 1\\n\"\n    \"dfW de 1\\n\"\n    \"zrC er 1\\n\"\n    \"woX on 1\\n\"\n    \"wjT ij 1\\n\"\n    \"Pqw qu 1\\n\"\n    \"vkf ka 1\\n\"\n    \"nLz an 1\\n\"\n    \"cjV ch 1\\n\"\n    \"fcP ch 1\\n\"\n    \"vlQ le 1\\n\"\n    \"Fgq ng 1\\n\"\n    \"hgP th 1\\n\"\n    \"Gqy qu 1\\n\"\n    \"tKs th 1\\n\"\n    \"Xfv va 1\\n\"\n    \"yZq qu 1\\n\"\n    \"yiZ in 1\\n\"\n    \"rXv er 1\\n\"\n    \"Ycy ch 1\\n\"\n    \"fvA va 1\\n\"\n    \"Tqs qu 1\\n\"\n    \"hZy th 1\\n\"\n    \"xwc ch 1\\n\"\n    \"qVf qu 1\\n\"\n    \"Mhq th 1\\n\"\n    \"zSj sz 1\\n\"\n    \"vhQ th 1\\n\"\n    \"tzX th 1\\n\"\n    \"Gvm va 1\\n\"\n    \"cqU ch 1\\n\"\n    \"Hhp th 1\\n\"\n    \"gQk ng 1\\n\"\n    \"pwL pr 1\\n\"\n    \"sNw st 1\\n\"\n    \"qEt th 1\\n\"\n    \"Nzq qu 1\\n\"\n    \"zsD st 1\\n\"\n    \"mDg ng 1\\n\"\n    \"Rtq th 1\\n\"\n    \"jLf ij 1\\n\"\n    \"wTp pr 1\\n\"\n    \"xJh th 1\\n\"\n    \"Vqo qu 1\\n\"\n    \"Zqk qu 1\\n\"\n    \"qqQ qu 1\\n\"\n    \"hrY th 1\\n\"\n    \"Wqo qu 1\\n\"\n    \"mIy me 1\\n\"\n    \"Ipk ka 1\\n\"\n    \"xjC ij 1\\n\"\n    \"lLp le 1\\n\"\n    \"hqF th 1\\n\"\n    \"cWg ch 1\\n\"\n    \"qYc qu 1\\n\"\n    \"cjU ch 1\\n\"\n    \"qXk qu 1\\n\"\n    \"hqL th 1\\n\"\n    \"zxT sz 1\\n\"\n    \"dnX an 1\\n\"\n    \"zBt th 1\\n\"\n    \"Qls le 1\\n\"\n    \"khC th 1\\n\"\n    \"uqX qu 1\\n\"\n    \"Zbf be 1\\n\"\n    \"iDx li 1\\n\"\n    \"Znp an 1\\n\"\n    \"Jxq qu 1\\n\"\n    \"jqY qu 1\\n\"\n    \"vbU va 1\\n\"\n    \"qRr qu 1\\n\"\n    \"qpj qu 1\\n\"\n    \"wlG le 1\\n\"\n    \"Wgx ng 1\\n\"\n    \"Vxj ij 1\\n\"\n    \"zSw sz 1\\n\"\n    \"ihW th 1\\n\"\n    \"kzT sz 1\\n\"\n    \"aeZ an 1\\n\"\n    \"hKj th 1\\n\"\n    \"tWs th 1\\n\"\n    \"gLc ch 1\\n\"\n    \"gpK ng 1\\n\"\n    \"yJz sz 1\\n\"\n    \"Gvt th 1\\n\"\n    \"fEo on 1\\n\"\n    \"sKd st 1\\n\"\n    \"xhN th 1\\n\"\n    \"aMq an 1\\n\"\n    \"ehX th 1\\n\"\n    \"kfZ ku 1\\n\"\n    \"Wwc ch 1\\n\"\n    
\"Ymz sz 1\\n\"\n    \"Vkd de 1\\n\"\n    \"bzD sz 1\\n\"\n    \"Xkg ng 1\\n\"\n    \"Vzz sz 1\\n\"\n    \"xvV va 1\\n\"\n    \"pHh th 1\\n\"\n    \"rKq qu 1\\n\"\n    \"vmM va 1\\n\"\n    \"Qxj ij 1\\n\"\n    \"zNr er 1\\n\"\n    \"bqB qu 1\\n\"\n    \"Jqw qu 1\\n\"\n    \"zqB qu 1\\n\"\n    \"Xvm va 1\\n\"\n    \"lBf le 1\\n\"\n    \"qqB qu 1\\n\"\n    \"gCs ng 1\\n\"\n    \"rRg ng 1\\n\"\n    \"Rnm an 1\\n\"\n    \"Lzw sz 1\\n\"\n    \"iwN in 1\\n\"\n    \"pfN pr 1\\n\"\n    \"hCw wa 1\\n\"\n    \"uHz qu 1\\n\"\n    \"cLc ch 1\\n\"\n    \"lwD le 1\\n\"\n    \"qjB qu 1\\n\"\n    \"Ojy ij 1\\n\"\n    \"dmV di 1\\n\"\n    \"cCw ch 1\\n\"\n    \"lXs le 1\\n\"\n    \"smR st 1\\n\"\n    \"mxO me 1\\n\"\n    \"Jrt th 1\\n\"\n    \"zjN sz 1\\n\"\n    \"bBn an 1\\n\"\n    \"cxQ ch 1\\n\"\n    \"Kdp de 1\\n\"\n    \"Dlb le 1\\n\"\n    \"pqD qu 1\\n\"\n    \"qqC qu 1\\n\"\n    \"Spz sz 1\\n\"\n    \"tCd th 1\\n\"\n    \"gfP ng 1\\n\"\n    \"uGj qu 1\\n\"\n    \"xbE be 1\\n\"\n    \"Xpv va 1\\n\"\n    \"Xzt th 1\\n\"\n    \"gqG qu 1\\n\"\n    \"kqq qu 1\\n\"\n    \"Kvq qu 1\\n\"\n    \"qWi qu 1\\n\"\n    \"mxZ me 1\\n\"\n    \"qoY qu 1\\n\"\n    \"Sgf ng 1\\n\"\n    \"cRv ch 1\\n\"\n    \"Wgi ng 1\\n\"\n    \"eDx er 1\\n\"\n    \"cWw ch 1\\n\"\n    \"vFq qu 1\\n\"\n    \"Kxv va 1\\n\"\n    \"iWp in 1\\n\"\n    \"fRx fo 1\\n\"\n    \"wtB th 1\\n\"\n    \"swW st 1\\n\"\n    \"grK ng 1\\n\"\n    \"Hfe er 1\\n\"\n    \"gfZ ng 1\\n\"\n    \"xqX qu 1\\n\"\n    \"oKj on 1\\n\"\n    \"vfq qu 1\\n\"\n    \"pWw pr 1\\n\"\n    \"uWc ch 1\\n\"\n    \"lCg ng 1\\n\"\n    \"qkg qu 1\\n\"\n    \"cDh th 1\\n\"\n    \"Sfz sz 1\\n\"\n    \"uYx qu 1\\n\"\n    \"xvR va 1\\n\"\n    \"eAo er 1\\n\"\n    \"pYg ng 1\\n\"\n    \"dRx de 1\\n\"\n    \"iWd in 1\\n\"\n    \"gGx ng 1\\n\"\n    \"bXz sz 1\\n\"\n    \"kcP ch 1\\n\"\n    \"hcJ th 1\\n\"\n    \"lCf le 1\\n\"\n    \"gmW ng 1\\n\"\n    \"Hkf ka 1\\n\"\n    \"rhL th 1\\n\"\n    \"jqP qu 1\\n\"\n    \"rQp er 1\\n\"\n    \"vCn an 1\\n\"\n    
\"dWj de 1\\n\"\n    \"Hrx er 1\\n\"\n    \"sTz st 1\\n\"\n    \"aVt th 1\\n\"\n    \"qwK qu 1\\n\"\n    \"vvE va 1\\n\"\n    \"wKp pr 1\\n\"\n    \"xcY ch 1\\n\"\n    \"vpM va 1\\n\"\n    \"jlC le 1\\n\"\n    \"dlG le 1\\n\"\n    \"oTq qu 1\\n\"\n    \"iLp in 1\\n\"\n    \"xsL st 1\\n\"\n    \"lFz le 1\\n\"\n    \"vhC th 1\\n\"\n    \"ylX le 1\\n\"\n    \"pmO me 1\\n\"\n    \"Ycc ch 1\\n\"\n    \"Ynp an 1\\n\"\n    \"Ybm me 1\\n\"\n    \"Qln an 1\\n\"\n    \"bxA be 1\\n\"\n    \"tFs th 1\\n\"\n    \"Lqw qu 1\\n\"\n    \"zcU ch 1\\n\"\n    \"vfK va 1\\n\"\n    \"vpQ va 1\\n\"\n    \"Dtf th 1\\n\"\n    \"bTj ij 1\\n\"\n    \"Vvw va 1\\n\"\n    \"Qbx be 1\\n\"\n    \"zWk sz 1\\n\"\n    \"bSx be 1\\n\"\n    \"zpK sz 1\\n\"\n    \"wTb wa 1\\n\"\n    \"mkC ka 1\\n\"\n    \"cRh th 1\\n\"\n    \"nBk an 1\\n\"\n    \"xGv va 1\\n\"\n    \"hnQ th 1\\n\"\n    \"aqQ an 1\\n\"\n    \"zhZ th 1\\n\"\n    \"zwP sz 1\\n\"\n    \"vqL qu 1\\n\"\n    \"scU ch 1\\n\"\n    \"glS ng 1\\n\"\n    \"pjE ij 1\\n\"\n    \"qqD qu 1\\n\"\n    \"lRx le 1\\n\"\n    \"qVr qu 1\\n\"\n    \"Xuh th 1\\n\"\n    \"brB er 1\\n\"\n    \"Qyc ch 1\\n\"\n    \"Sgx ng 1\\n\"\n    \"dqk qu 1\\n\"\n    \"bYj ij 1\\n\"\n    \"mPx me 1\\n\"\n    \"Fdv de 1\\n\"\n    \"Xmd de 1\\n\"\n    \"cPj ch 1\\n\"\n    \"Pqg qu 1\\n\"\n    \"vYh th 1\\n\"\n    \"bJx be 1\\n\"\n    \"dQt th 1\\n\"\n    \"fxj ij 1\\n\"\n    \"Hwq qu 1\\n\"\n    \"vgC ng 1\\n\"\n    \"kjK ij 1\\n\"\n    \"nrC an 1\\n\"\n    \"vqX qu 1\\n\"\n    \"Bgk ng 1\\n\"\n    \"Cbv va 1\\n\"\n    \"Uww wa 1\\n\"\n    \"wcJ ch 1\\n\"\n    \"gBf ng 1\\n\"\n    \"zTv va 1\\n\"\n    \"zwX sz 1\\n\"\n    \"lWg le 1\\n\"\n    \"qOs qu 1\\n\"\n    \"fbB be 1\\n\"\n    \"xqG qu 1\\n\"\n    \"jQj ij 1\\n\"\n    \"voQ on 1\\n\"\n    \"yjW ij 1\\n\"\n    \"qvO qu 1\\n\"\n    \"xbF be 1\\n\"\n    \"nWu an 1\\n\"\n    \"yjQ ij 1\\n\"\n    \"cjK ch 1\\n\"\n    \"Sxn an 1\\n\"\n    \"ybX be 1\\n\"\n    \"eYg ng 1\\n\"\n    \"Bmn an 1\\n\"\n    \"fDt th 1\\n\"\n    
\"jXm ij 1\\n\"\n    \"nMt th 1\\n\"\n    \"Sxb be 1\\n\"\n    \"lHm le 1\\n\"\n    \"gfY ng 1\\n\"\n    \"nwG an 1\\n\"\n    \"gHl ng 1\\n\"\n    \"Wpm me 1\\n\"\n    \"wFj ij 1\\n\"\n    \"hGm th 1\\n\"\n    \"wwC wa 1\\n\"\n    \"Mlf le 1\\n\"\n    \"cJb ch 1\\n\"\n    \"bnC an 1\\n\"\n    \"Fvp va 1\\n\"\n    \"tGc th 1\\n\"\n    \"fhZ th 1\\n\"\n    \"Vkh th 1\\n\"\n    \"jwg ng 1\\n\"\n    \"xbK be 1\\n\"\n    \"zVq qu 1\\n\"\n    \"qTz qu 1\\n\"\n    \"vrD er 1\\n\"\n    \"fRt th 1\\n\"\n    \"fFs st 1\\n\"\n    \"hWg th 1\\n\"\n    \"lzE le 1\\n\"\n    \"lwX le 1\\n\"\n    \"jHy ij 1\\n\"\n    \"Qqt th 1\\n\"\n    \"Dqi in 1\\n\"\n    \"Tvj ij 1\\n\"\n    \"gPb ng 1\\n\"\n    \"dPz sz 1\\n\"\n    \"zdT sz 1\\n\"\n    \"mvA va 1\\n\"\n    \"Zvh th 1\\n\"\n    \"qaU an 1\\n\"\n    \"fwQ wa 1\\n\"\n    \"Rsw st 1\\n\"\n    \"klB le 1\\n\"\n    \"vlN le 1\\n\"\n    \"Gvx va 1\\n\"\n    \"pdJ de 1\\n\"\n    \"lcB ch 1\\n\"\n    \"vTq qu 1\\n\"\n    \"yhV th 1\\n\"\n    \"jLv ij 1\\n\"\n    \"pzR sz 1\\n\"\n    \"Xyw wa 1\\n\"\n    \"Xlq qu 1\\n\"\n    \"Rqw wa 1\\n\"\n    \"zhP th 1\\n\"\n    \"sgT ng 1\\n\"\n    \"gpG ng 1\\n\"\n    \"tkY th 1\\n\"\n    \"dqE qu 1\\n\"\n    \"Qcg ch 1\\n\"\n    \"bfB be 1\\n\"\n    \"Wpv va 1\\n\"\n    \"Wxl le 1\\n\"\n    \"Xbq qu 1\\n\"\n    \"yFh th 1\\n\"\n    \"Rfq qu 1\\n\"\n    \"hhL th 1\\n\"\n    \"jxz sz 1\\n\"\n    \"bKh th 1\\n\"\n    \"ptU th 1\\n\"\n    \"cXe ch 1\\n\"\n    \"zXm sz 1\\n\"\n    \"Ghw th 1\\n\"\n    \"dzY sz 1\\n\"\n    \"dXn an 1\\n\"\n    \"kxW ka 1\\n\"\n    \"vVr er 1\\n\"\n    \"Jxu un 1\\n\"\n    \"bbX be 1\\n\"\n    \"rPb er 1\\n\"\n    \"qCm qu 1\\n\"\n    \"qiJ qu 1\\n\"\n    \"Xgw ng 1\\n\"\n    \"Nhq th 1\\n\"\n    \"cGp po 1\\n\"\n    \"hPw th 1\\n\"\n    \"bTz sz 1\\n\"\n    \"qIg ng 1\\n\"\n    \"pJh th 1\\n\"\n    \"wcE ch 1\\n\"\n    \"mCb me 1\\n\"\n    \"bJc ch 1\\n\"\n    \"nzQ an 1\\n\"\n    \"yqR qu 1\\n\"\n    \"xHw wa 1\\n\"\n    \"bwH wa 1\\n\"\n    \"qCr qu 1\\n\"\n    
\"Uqe qu 1\\n\"\n    \"qxM qu 1\\n\"\n    \"fpO pr 1\\n\"\n    \"kcN ch 1\\n\"\n    \"ykV ka 1\\n\"\n    \"mQb me 1\\n\"\n    \"Yqs qu 1\\n\"\n    \"yVk ka 1\\n\"\n    \"vbX va 1\\n\"\n    \"mTd de 1\\n\"\n    \"jXo on 1\\n\"\n    \"wqJ qu 1\\n\"\n    \"kKt th 1\\n\"\n    \"fkS ka 1\\n\"\n    \"Wvz sz 1\\n\"\n    \"Iyv va 1\\n\"\n    \"hGk th 1\\n\"\n    \"Fze er 1\\n\"\n    \"bhM th 1\\n\"\n    \"qvI qu 1\\n\"\n    \"nXq an 1\\n\"\n    \"nXc an 1\\n\"\n    \"kJt th 1\\n\"\n    \"Nqc ch 1\\n\"\n    \"Yjc ch 1\\n\"\n    \"Fhb th 1\\n\"\n    \"jyK ij 1\\n\"\n    \"Jzj sz 1\\n\"\n    \"yqc ch 1\\n\"\n    \"wmZ me 1\\n\"\n    \"zbF sz 1\\n\"\n    \"spq qu 1\\n\"\n    \"gPn an 1\\n\"\n    \"jSg ng 1\\n\"\n    \"gMh th 1\\n\"\n    \"fXt th 1\\n\"\n    \"Fyw wa 1\\n\"\n    \"Fwg ng 1\\n\"\n    \"hmN th 1\\n\"\n    \"hNl th 1\\n\"\n    \"tqY th 1\\n\"\n    \"pGm me 1\\n\"\n    \"mXz sz 1\\n\"\n    \"qYy qu 1\\n\"\n    \"Rmq qu 1\\n\"\n    \"Dqa an 1\\n\"\n    \"Wkx ka 1\\n\"\n    \"dpT de 1\\n\"\n    \"jyJ ij 1\\n\"\n    \"Jqj qu 1\\n\"\n    \"wjZ ij 1\\n\"\n    \"xNr er 1\\n\"\n    \"qAm qu 1\\n\"\n    \"hBn th 1\\n\"\n    \"qpJ qu 1\\n\"\n    \"ygW ng 1\\n\"\n    \"jXf ij 1\\n\"\n    \"rMl er 1\\n\"\n    \"zgV ng 1\\n\"\n    \"nLp an 1\\n\"\n    \"pFx pr 1\\n\"\n    \"tvG th 1\\n\"\n    \"zQl le 1\\n\"\n    \"fdF de 1\\n\"\n    \"bxK be 1\\n\"\n    \"Bcx ch 1\\n\"\n    \"rpY er 1\\n\"\n    \"sJb st 1\\n\"\n    \"Kvh th 1\\n\"\n    \"kNq qu 1\\n\"\n    \"zHd sz 1\\n\"\n    \"dzF sz 1\\n\"\n    \"tJq th 1\\n\"\n    \"Hfv va 1\\n\"\n    \"vQd de 1\\n\"\n    \"pKj ij 1\\n\"\n    \"fhV th 1\\n\"\n    \"qZi qu 1\\n\"\n    \"ohY th 1\\n\"\n    \"vqq qu 1\\n\"\n    \"tnQ th 1\\n\"\n    \"Vqk qu 1\\n\"\n    \"zJf sz 1\\n\"\n    \"Jkz sz 1\\n\"\n    \"Rwf wa 1\\n\"\n    \"zvM va 1\\n\"\n    \"bxY be 1\\n\"\n    \"pXh th 1\\n\"\n    \"fUy ny 1\\n\"\n    \"pvE va 1\\n\"\n    \"Lpk ka 1\\n\"\n    \"dzV sz 1\\n\"\n    \"xIf fo 1\\n\"\n    \"wZw wa 1\\n\"\n    \"npQ an 1\\n\"\n    
\"pWk ka 1\\n\"\n    \"jgQ ng 1\\n\"\n    \"Jqr qu 1\\n\"\n    \"gmX ng 1\\n\"\n    \"jfM ij 1\\n\"\n    \"lWj le 1\\n\"\n    \"pbN pr 1\\n\"\n    \"fvF va 1\\n\"\n    \"sDd st 1\\n\"\n    \"qdB qu 1\\n\"\n    \"frL er 1\\n\"\n    \"uHn an 1\\n\"\n    \"gwN ng 1\\n\"\n    \"yBh th 1\\n\"\n    \"Zzq qu 1\\n\"\n    \"vDg ng 1\\n\"\n    \"Qcz ch 1\\n\"\n    \"qzf qu 1\\n\"\n    \"wEc ch 1\\n\"\n    \"pxH pr 1\\n\"\n    \"fqO qu 1\\n\"\n    \"Vqe qu 1\\n\"\n    \"gkD ng 1\\n\"\n    \"Xfq qu 1\\n\"\n    \"uXg qu 1\\n\"\n    \"jCw ij 1\\n\"\n    \"Pzu qu 1\\n\"\n    \"gRh th 1\\n\"\n    \"vqH qu 1\\n\"\n    \"vvW va 1\\n\"\n    \"Rfb be 1\\n\"\n    \"gqJ qu 1\\n\"\n    \"tgO th 1\\n\"\n    \"wUy wa 1\\n\"\n    \"Jkw ka 1\\n\"\n    \"hSs th 1\\n\"\n    \"gkW ng 1\\n\"\n    \"Qgy ng 1\\n\"\n    \"dJb de 1\\n\"\n    \"prF er 1\\n\"\n    \"buX qu 1\\n\"\n    \"cVg ch 1\\n\"\n    \"jtU th 1\\n\"\n    \"fDc ch 1\\n\"\n    \"Ygc ch 1\\n\"\n    \"Kqr qu 1\\n\"\n    \"Uyp pr 1\\n\"\n    \"lJk le 1\\n\"\n    \"sxY st 1\\n\"\n    \"xfY fo 1\\n\"\n    \"Xkz sz 1\\n\"\n    \"cgZ ch 1\\n\"\n    \"cyX ch 1\\n\"\n    \"gbF ng 1\\n\"\n    \"zTk sz 1\\n\"\n    \"hsU th 1\\n\"\n    \"tlW th 1\\n\"\n    \"Zzv sz 1\\n\"\n    \"kqE qu 1\\n\"\n    \"lpQ po 1\\n\"\n    \"qJu un 1\\n\"\n    \"hYi th 1\\n\"\n    \"zlM le 1\\n\"\n    \"vDt th 1\\n\"\n    \"Hvn an 1\\n\"\n    \"Nsf st 1\\n\"\n    \"bJg ng 1\\n\"\n    \"fNg ng 1\\n\"\n    \"kQo on 1\\n\"\n    \"Kqp qu 1\\n\"\n    \"bKs st 1\\n\"\n    \"mHp me 1\\n\"\n    \"Uyj ij 1\\n\"\n    \"cxY ch 1\\n\"\n    \"yIe er 1\\n\"\n    \"qTj qu 1\\n\"\n    \"wfP wa 1\\n\"\n    \"fxI fo 1\\n\"\n    \"vQa an 1\\n\"\n    \"fvN va 1\\n\"\n    \"pwN pr 1\\n\"\n    \"vaQ an 1\\n\"\n    \"mxQ me 1\\n\"\n    \"bdV de 1\\n\"\n    \"Cgj ng 1\\n\"\n    \"xjz sz 1\\n\"\n    \"Wqw qu 1\\n\"\n    \"wpO pr 1\\n\"\n    \"woQ on 1\\n\"\n    \"xYj ij 1\\n\"\n    \"fpT pr 1\\n\"\n    \"lNp le 1\\n\"\n    \"pvX va 1\\n\"\n    \"pLp pr 1\\n\"\n    \"Ksg ng 1\\n\"\n    
\"rWg ng 1\\n\"\n    \"iUy in 1\\n\"\n    \"bfX be 1\\n\"\n    \"xsV st 1\\n\"\n    \"Xnj an 1\\n\"\n    \"dmW de 1\\n\"\n    \"oQw on 1\\n\"\n    \"Zxy ny 1\\n\"\n    \"Oay an 1\\n\"\n    \"pjG ij 1\\n\"\n    \"Zbt th 1\\n\"\n    \"Hql qu 1\\n\"\n    \"Zxq qu 1\\n\"\n    \"jWd de 1\\n\"\n    \"qUp qu 1\\n\"\n    \"qxN qu 1\\n\"\n    \"qCo qu 1\\n\"\n    \"Yfd de 1\\n\"\n    \"vvU va 1\\n\"\n    \"vIk ka 1\\n\"\n    \"Dfj ij 1\\n\"\n    \"Zmh th 1\\n\"\n    \"Cqt th 1\\n\"\n    \"vQf va 1\\n\"\n    \"Nbn an 1\\n\"\n    \"tJs th 1\\n\"\n    \"Fhx th 1\\n\"\n    \"dzQ sz 1\\n\"\n    \"zYj ij 1\\n\"\n    \"qBw qu 1\\n\"\n    \"vcV ch 1\\n\"\n    \"gGt th 1\\n\"\n    \"iVw in 1\\n\"\n    \"Fzp sz 1\\n\"\n    \"bjH ij 1\\n\"\n    \"cuY ch 1\\n\"\n    \"jwS ij 1\\n\"\n    \"Cqp qu 1\\n\"\n    \"yJv va 1\\n\"\n    \"kdJ de 1\\n\"\n    \"kdT de 1\\n\"\n    \"nqB an 1\\n\"\n    \"hWs th 1\\n\"\n    \"qsj qu 1\\n\"\n    \"hLw th 1\\n\"\n    \"hdX th 1\\n\"\n    \"cgV ch 1\\n\"\n    \"tYc th 1\\n\"\n    \"eZx er 1\\n\"\n    \"hfN th 1\\n\"\n    \"gvw ng 1\\n\"\n    \"aVp an 1\\n\"\n    \"gMs ng 1\\n\"\n    \"Pbf be 1\\n\"\n    \"mQf me 1\\n\"\n    \"yUi in 1\\n\"\n    \"vGf va 1\\n\"\n    \"xgF ng 1\\n\"\n    \"zvY sz 1\\n\"\n    \"wrA er 1\\n\"\n    \"yrM er 1\\n\"\n    \"vMj ij 1\\n\"\n    \"Uyv va 1\\n\"\n    \"dLp de 1\\n\"\n    \"Gjj ij 1\\n\"\n    \"zEi in 1\\n\"\n    \"Xdg ng 1\\n\"\n    \"jHf ij 1\\n\"\n    \"oPz on 1\\n\"\n    \"xIz sz 1\\n\"\n    \"bCb be 1\\n\"\n    \"Dzq qu 1\\n\"\n    \"Yjn an 1\\n\"\n    \"gGz ng 1\\n\"\n    \"mjU ij 1\\n\"\n    \"Cjx ij 1\\n\"\n    \"xKc ch 1\\n\"\n    \"mvO va 1\\n\"\n    \"Pzb sz 1\\n\"\n    \"crK ch 1\\n\"\n    \"xhO th 1\\n\"\n    \"ylB le 1\\n\"\n    \"lDk le 1\\n\"\n    \"zlO le 1\\n\"\n    \"pgH ng 1\\n\"\n    \"vQb va 1\\n\"\n    \"sdZ st 1\\n\"\n    \"kQm ka 1\\n\"\n    \"lRh th 1\\n\"\n    \"oQy on 1\\n\"\n    \"twC th 1\\n\"\n    \"Bdj ij 1\\n\"\n    \"Qjg ng 1\\n\"\n    \"dnP an 1\\n\"\n    \"Nnp an 1\\n\"\n    
\"qiP qu 1\\n\"\n    \"Ccj ch 1\\n\"\n    \"uHt th 1\\n\"\n    \"qLx qu 1\\n\"\n    \"Qsf st 1\\n\"\n    \"fKx fo 1\\n\"\n    \"fkE ka 1\\n\"\n    \"jlX le 1\\n\"\n    \"jZb ij 1\\n\"\n    \"Vwj ij 1\\n\"\n    \"zbA sz 1\\n\"\n    \"Hhd th 1\\n\"\n    \"cbY ch 1\\n\"\n    \"Ikf ka 1\\n\"\n    \"Grx er 1\\n\"\n    \"jpP ij 1\\n\"\n    \"Qfh th 1\\n\"\n    \"xhW th 1\\n\"\n    \"wmX me 1\\n\"\n    \"aJb an 1\\n\"\n    \"sfO st 1\\n\"\n    \"qXq qu 1\\n\"\n    \"mXg ng 1\\n\"\n    \"bnV an 1\\n\"\n    \"Ypw pr 1\\n\"\n    \"zCy sz 1\\n\"\n    \"lhN th 1\\n\"\n    \"rXn an 1\\n\"\n    \"fGh th 1\\n\"\n    \"Wxq qu 1\\n\"\n    \"cxT ch 1\\n\"\n    \"Zsg ng 1\\n\"\n    \"uGv qu 1\\n\"\n    \"bzM sz 1\\n\"\n    \"zjS sz 1\\n\"\n    \"dfS de 1\\n\"\n    \"gpH ng 1\\n\"\n    \"qgO ng 1\\n\"\n    \"kqF qu 1\\n\"\n    \"qfU qu 1\\n\"\n    \"qTp qu 1\\n\"\n    \"vZb va 1\\n\"\n    \"Ejw ij 1\\n\"\n    \"zQn an 1\\n\"\n    \"gYz ng 1\\n\"\n    \"kjV ij 1\\n\"\n    \"fWl le 1\\n\"\n    \"fRk ka 1\\n\"\n    \"uSj qu 1\\n\"\n    \"Cxg ng 1\\n\"\n    \"Lcv ch 1\\n\"\n    \"bzK sz 1\\n\"\n    \"wqF qu 1\\n\"\n    \"qJp qu 1\\n\"\n    \"rCj er 1\\n\"\n    \"qvs qu 1\\n\"\n    \"lwN le 1\\n\"\n    \"xmR me 1\\n\"\n    \"btC th 1\\n\"\n    \"kTx ka 1\\n\"\n    \"qkU qu 1\\n\"\n    \"Lhj th 1\\n\"\n    \"dIx de 1\\n\"\n    \"vsQ st 1\\n\"\n    \"gSd ng 1\\n\"\n    \"wDl le 1\\n\"\n    \"Vjm ij 1\\n\"\n    \"pmI me 1\\n\"\n    \"vWh th 1\\n\"\n    \"fKv va 1\\n\"\n    \"xPt th 1\\n\"\n    \"uoQ qu 1\\n\"\n    \"Kgh th 1\\n\"\n    \"gwX ng 1\\n\"\n    \"sgJ ng 1\\n\"\n    \"pWj ij 1\\n\"\n    \"Qff fo 1\\n\"\n    \"hkJ th 1\\n\"\n    \"Hqo qu 1\\n\"\n    \"jwW ij 1\\n\"\n    \"sQz st 1\\n\"\n    \"wUw wa 1\\n\"\n    \"mKx me 1\\n\"\n    \"oQf on 1\\n\"\n    \"jVk ij 1\\n\"\n    \"xwT wa 1\\n\"\n    \"sTq qu 1\\n\"\n    \"uqV qu 1\\n\"\n    \"Qlp le 1\\n\"\n    \"pMb pr 1\\n\"\n    \"xKj ij 1\\n\"\n    \"bpX pr 1\\n\"\n    \"vQe er 1\\n\"\n    \"Jjq qu 1\\n\"\n    \"qKh th 1\\n\"\n    
\"fkJ ka 1\\n\"\n    \"jbQ ij 1\\n\"\n    \"mZw me 1\\n\"\n    \"Xgc ch 1\\n\"\n    \"vzU sz 1\\n\"\n    \"pTm me 1\\n\"\n    \"pNq qu 1\\n\"\n    \"rwD er 1\\n\"\n    \"Qdg ng 1\\n\"\n    \"wqC qu 1\\n\"\n    \"Yrn an 1\\n\"\n    \"qww qu 1\\n\"\n    \"qwU qu 1\\n\"\n    \"xzF sz 1\\n\"\n    \"flW le 1\\n\"\n    \"jzP sz 1\\n\"\n    \"Wxp pr 1\\n\"\n    \"rDq qu 1\\n\"\n    \"dGp de 1\\n\"\n    \"Ztj th 1\\n\"\n    \"Uvp va 1\\n\"\n    \"eGc ch 1\\n\"\n    \"zZb sz 1\\n\"\n    \"gQh th 1\\n\"\n    \"tFd th 1\\n\"\n    \"Mqg ng 1\\n\"\n    \"dnD an 1\\n\"\n    \"hvY th 1\\n\"\n    \"Iyb be 1\\n\"\n    \"fDz sz 1\\n\"\n    \"Kbj ij 1\\n\"\n    \"vYm va 1\\n\"\n    \"Wxr er 1\\n\"\n    \"Kwz sz 1\\n\"\n    \"hrQ th 1\\n\"\n    \"yCt th 1\\n\"\n    \"Hxw wa 1\\n\"\n    \"hEf th 1\\n\"\n    \"bdU de 1\\n\"\n    \"sGj st 1\\n\"\n    \"Gwt th 1\\n\"\n    \"bYh th 1\\n\"\n    \"zmU sz 1\\n\"\n    \"pDm po 1\\n\"\n    \"qmC qu 1\\n\"\n    \"dTd de 1\\n\"\n    \"Qxq qu 1\\n\"\n    \"uVf qu 1\\n\"\n    \"qAl qu 1\\n\"\n    \"jEa an 1\\n\"\n    \"Kpy pr 1\\n\"\n    \"Hqv qu 1\\n\"\n    \"fCk ka 1\\n\"\n    \"aqZ an 1\\n\"\n    \"lUo on 1\\n\"\n    \"Pvo on 1\\n\"\n    \"Dqf qu 1\\n\"\n    \"gdM ng 1\\n\"\n    \"fzL sz 1\\n\"\n    \"Bhh th 1\\n\"\n    \"dGd de 1\\n\"\n    \"wtY th 1\\n\"\n    \"qTy qu 1\\n\"\n    \"Uxr er 1\\n\"\n    \"Vvm va 1\\n\"\n    \"vHh th 1\\n\"\n    \"qZc ch 1\\n\"\n    \"fhC th 1\\n\"\n    \"xdZ de 1\\n\"\n    \"hZp th 1\\n\"\n    \"Pmz sz 1\\n\"\n    \"cfT ch 1\\n\"\n    \"pjI ij 1\\n\"\n    \"mdZ de 1\\n\"\n    \"jkQ ij 1\\n\"\n    \"Sdj de 1\\n\"\n    \"hDf th 1\\n\"\n    \"eJj er 1\\n\"\n    \"wjY ij 1\\n\"\n    \"zLm sz 1\\n\"\n    \"eFs er 1\\n\"\n    \"wgj ng 1\\n\"\n    \"Zmk ka 1\\n\"\n    \"lvJ le 1\\n\"\n    \"xYm me 1\\n\"\n    \"Nzf sz 1\\n\"\n    \"wJi in 1\\n\"\n    \"yQs st 1\\n\"\n    \"pfM pr 1\\n\"\n    \"dhR th 1\\n\"\n    \"cmK ch 1\\n\"\n    \"dhM th 1\\n\"\n    \"qGb qu 1\\n\"\n    \"wvQ va 1\\n\"\n    \"Cgq ng 1\\n\"\n    
\"Jfc ch 1\\n\"\n    \"bkD ka 1\\n\"\n    \"fdS de 1\\n\"\n    \"Ivp va 1\\n\"\n    \"Gkj ij 1\\n\"\n    \"zIv sz 1\\n\"\n    \"Bzl le 1\\n\"\n    \"gBb ng 1\\n\"\n    \"Tpj ij 1\\n\"\n    \"vyY va 1\\n\"\n    \"Uxs st 1\\n\"\n    \"kwW ka 1\\n\"\n    \"gPf ng 1\\n\"\n    \"pqC qu 1\\n\"\n    \"cTj ch 1\\n\"\n    \"yzI sz 1\\n\"\n    \"Yph th 1\\n\"\n    \"bvD va 1\\n\"\n    \"xCc ch 1\\n\"\n    \"pcQ ch 1\\n\"\n    \"fZw wa 1\\n\"\n    \"Zxf fo 1\\n\"\n    \"wbA wa 1\\n\"\n    \"bTf be 1\\n\"\n    \"rxR er 1\\n\"\n    \"qqE qu 1\\n\"\n    \"yFp pr 1\\n\"\n    \"pNf pr 1\\n\"\n    \"kMv ka 1\\n\"\n    \"vUq qu 1\\n\"\n    \"wOh th 1\\n\"\n    \"hxH th 1\\n\"\n    \"Xqh th 1\\n\"\n    \"uIu qu 1\\n\"\n    \"Fzq qu 1\\n\"\n    \"Ysd st 1\\n\"\n    \"ojY on 1\\n\"\n    \"cEo ch 1\\n\"\n    \"lwR le 1\\n\"\n    \"qjF qu 1\\n\"\n    \"jTp ij 1\\n\"\n    \"yzT sz 1\\n\"\n    \"jfO ij 1\\n\"\n    \"qSg ng 1\\n\"\n    \"Nck ch 1\\n\"\n    \"hwF th 1\\n\"\n    \"Gmq qu 1\\n\"\n    \"Iiq qu 1\\n\"\n    \"zwE sz 1\\n\"\n    \"qQv qu 1\\n\"\n    \"xVd de 1\\n\"\n    \"Ywq qu 1\\n\"\n    \"sFx st 1\\n\"\n    \"fvB va 1\\n\"\n    \"qYe le 1\\n\"\n    \"gwT ng 1\\n\"\n    \"Wjx ij 1\\n\"\n    \"bHn an 1\\n\"\n    \"fMn an 1\\n\"\n    \"gJg ng 1\\n\"\n    \"Vkg ng 1\\n\"\n    \"Fxv va 1\\n\"\n    \"lHv le 1\\n\"\n    \"Wpk ka 1\\n\"\n    \"xAq qu 1\\n\"\n    \"rxB pr 1\\n\"\n    \"xuQ qu 1\\n\"\n    \"pIb pr 1\\n\"\n    \"bfE be 1\\n\"\n    \"gRx ng 1\\n\"\n    \"Bpb pr 1\\n\"\n    \"bxN be 1\\n\"\n    \"kgU ng 1\\n\"\n    \"Pxc ch 1\\n\"\n    \"cCq ch 1\\n\"\n    \"Npb pr 1\\n\"\n    \"lxE le 1\\n\"\n    \"lCy le 1\\n\"\n    \"dgX ng 1\\n\"\n    \"xLf fo 1\\n\"\n    \"bQt th 1\\n\"\n    \"qgF ng 1\\n\"\n    \"pxZ pr 1\\n\"\n    \"pPx pr 1\\n\"\n    \"iYz in 1\\n\"\n    \"vJl le 1\\n\"\n    \"kTf ka 1\\n\"\n    \"qVm qu 1\\n\"\n    \"gwS ng 1\\n\"\n    \"zTd sz 1\\n\"\n    \"pQk ka 1\\n\"\n    \"xEg ng 1\\n\"\n    \"fpP pr 1\\n\"\n    \"qjw qu 1\\n\"\n    \"Oyw wa 1\\n\"\n    
\"mcO ch 1\\n\"\n    \"Vjd de 1\\n\"\n    \"qdg ng 1\\n\"\n    \"Lfp pr 1\\n\"\n    \"vZc ch 1\\n\"\n    \"nOq an 1\\n\"\n    \"qjn an 1\\n\"\n    \"sKc ch 1\\n\"\n    \"wgU ng 1\\n\"\n    \"hgX th 1\\n\"\n    \"dMv de 1\\n\"\n    \"Xcp ch 1\\n\"\n    \"Fwz sz 1\\n\"\n    \"pwA pr 1\\n\"\n    \"Lpj ij 1\\n\"\n    \"bkP ka 1\\n\"\n    \"vHn an 1\\n\"\n    \"Jjy ij 1\\n\"\n    \"mCq qu 1\\n\"\n    \"wvM va 1\\n\"\n    \"Icb ch 1\\n\"\n    \"kfJ ka 1\\n\"\n    \"hsQ th 1\\n\"\n    \"dWd de 1\\n\"\n    \"fUs st 1\\n\"\n    \"fLn an 1\\n\"\n    \"pjN ij 1\\n\"\n    \"zgQ ng 1\\n\"\n    \"jLj ij 1\\n\"\n    \"zqE qu 1\\n\"\n    \"Qmv va 1\\n\"\n    \"Zjr er 1\\n\"\n    \"Zkp ka 1\\n\"\n    \"iyH in 1\\n\"\n    \"wuY qu 1\\n\"\n    \"mzT sz 1\\n\"\n    \"cwK ch 1\\n\"\n    \"bCm me 1\\n\"\n    \"ydG de 1\\n\"\n    \"xdU de 1\\n\"\n    \"wTf wa 1\\n\"\n    \"lHh th 1\\n\"\n    \"qyD qu 1\\n\"\n    \"xlV le 1\\n\"\n    \"qyT qu 1\\n\"\n    \"tWn th 1\\n\"\n    \"rMz er 1\\n\"\n    \"pXv va 1\\n\"\n    \"Xbz sz 1\\n\"\n    \"kHm ka 1\\n\"\n    \"cVd ch 1\\n\"\n    \"qzH qu 1\\n\"\n    \"ydN de 1\\n\"\n    \"qMb qu 1\\n\"\n    \"yjS ij 1\\n\"\n    \"gmC ng 1\\n\"\n    \"zIi in 1\\n\"\n    \"fpM pr 1\\n\"\n    \"lcZ ch 1\\n\"\n    \"qHn an 1\\n\"\n    \"Jjd de 1\\n\"\n    \"jlG le 1\\n\"\n    \"qcK ch 1\\n\"\n    \"xQm me 1\\n\"\n    \"vIi in 1\\n\"\n    \"wBp pr 1\\n\"\n    \"wcI ch 1\\n\"\n    \"dJd de 1\\n\"\n    \"Qbn an 1\\n\"\n    \"Bjf ij 1\\n\"\n    \"dpY de 1\\n\"\n    \"dcF ch 1\\n\"\n    \"xSj ij 1\\n\"\n    \"iXj in 1\\n\"\n    \"Qgb ng 1\\n\"\n    \"gDt th 1\\n\"\n    \"xxq qu 1\\n\"\n    \"xcQ ch 1\\n\"\n    \"Sqs qu 1\\n\"\n    \"Qmg ng 1\\n\"\n    \"gcU ch 1\\n\"\n    \"Bvv va 1\\n\"\n    \"pzE sz 1\\n\"\n    \"wtT th 1\\n\"\n    \"vbL va 1\\n\"\n    \"bCt th 1\\n\"\n    \"Qpo on 1\\n\"\n    \"mXs me 1\\n\"\n    \"Zqr qu 1\\n\"\n    \"Gky ka 1\\n\"\n    \"Xmr er 1\\n\"\n    \"Lnz an 1\\n\"\n    \"vYq qu 1\\n\"\n    \"yRl le 1\\n\"\n    \"gmK ng 1\\n\"\n    
\"vwP va 1\\n\"\n    \"eFg ng 1\\n\"\n    \"Njd de 1\\n\"\n    \"klG le 1\\n\"\n    \"hbE th 1\\n\"\n    \"kWz sz 1\\n\"\n    \"qpM qu 1\\n\"\n    \"oZc ch 1\\n\"\n    \"jRm ij 1\\n\"\n    \"wXl le 1\\n\"\n#ifndef _MSC_VER // TODO: Hack to avoid unsupported long string for MS VC.\n    \"iyD in 1\\n\"\n    \"fvL va 1\\n\"\n    \"rPw er 1\\n\"\n    \"fdR de 1\\n\"\n    \"iSg ng 1\\n\"\n    \"dbQ de 1\\n\"\n    \"xxQ xe 1\\n\"\n    \"Djc ch 1\\n\"\n    \"ygK ng 1\\n\"\n    \"Rhb th 1\\n\"\n    \"zgG ng 1\\n\"\n    \"Yky ka 1\\n\"\n    \"Cxj ij 1\\n\"\n    \"wWk ka 1\\n\"\n    \"lmY le 1\\n\"\n    \"qrB qu 1\\n\"\n    \"ywK wa 1\\n\"\n    \"xqI qu 1\\n\"\n    \"Twj ij 1\\n\"\n    \"Xgq ng 1\\n\"\n    \"dwZ de 1\\n\"\n    \"nQl an 1\\n\"\n    \"Ghc th 1\\n\"\n    \"pnH an 1\\n\"\n    \"vmU va 1\\n\"\n    \"qqK qu 1\\n\"\n    \"cjB ch 1\\n\"\n    \"gzS ng 1\\n\"\n    \"Rwz sz 1\\n\"\n    \"gYr ng 1\\n\"\n    \"Fgx ng 1\\n\"\n    \"wdK de 1\\n\"\n    \"hxZ th 1\\n\"\n    \"xUx xe 1\\n\"\n    \"wmT me 1\\n\"\n    \"yYk ka 1\\n\"\n    \"fcD ch 1\\n\"\n    \"hVv th 1\\n\"\n    \"Sgv ng 1\\n\"\n    \"zPn an 1\\n\"\n    \"vYb va 1\\n\"\n    \"bzE sz 1\\n\"\n    \"whV th 1\\n\"\n    \"qNz qu 1\\n\"\n    \"wtS th 1\\n\"\n    \"vhY th 1\\n\"\n    \"nLf an 1\\n\"\n    \"Lfw wa 1\\n\"\n    \"gVc ch 1\\n\"\n    \"gkS ng 1\\n\"\n    \"Jqb qu 1\\n\"\n    \"hWx th 1\\n\"\n    \"zgO ng 1\\n\"\n    \"tgX th 1\\n\"\n    \"jPb ij 1\\n\"\n    \"Wxb be 1\\n\"\n    \"gqw ng 1\\n\"\n    \"Cfw wa 1\\n\"\n    \"woU on 1\\n\"\n    \"ycJ ch 1\\n\"\n    \"kwD ka 1\\n\"\n    \"Sbp pr 1\\n\"\n    \"qcw ch 1\\n\"\n    \"Hwr er 1\\n\"\n    \"bmL me 1\\n\"\n    \"gwZ ng 1\\n\"\n    \"yKj ij 1\\n\"\n    \"fXv va 1\\n\"\n    \"iKx in 1\\n\"\n    \"lRz le 1\\n\"\n    \"cHj ch 1\\n\"\n    \"fFt th 1\\n\"\n    \"sJv sz 1\\n\"\n    \"xmI me 1\\n\"\n    \"cCd ch 1\\n\"\n    \"iYd in 1\\n\"\n    \"yfY ny 1\\n\"\n    \"xbY be 1\\n\"\n    \"bmE me 1\\n\"\n    \"fBv va 1\\n\"\n    \"dHw de 1\\n\"\n    \"ycR ch 
1\\n\"\n    \"wvL va 1\\n\"\n    \"rjL er 1\\n\"\n    \"sYv sz 1\\n\"\n    \"Wpn an 1\\n\"\n    \"zxB sz 1\\n\"\n    \"yBq qu 1\\n\"\n    \"gdJ ng 1\\n\"\n    \"Yjo on 1\\n\"\n    \"fpQ pr 1\\n\"\n    \"qOq qu 1\\n\"\n    \"Wjf ij 1\\n\"\n    \"qcT ch 1\\n\"\n    \"Lfh th 1\\n\"\n    \"cFj ch 1\\n\"\n    \"lMq qu 1\\n\"\n    \"wSf wa 1\\n\"\n    \"wQc ch 1\\n\"\n    \"zDy sz 1\\n\"\n    \"qrl qu 1\\n\"\n    \"pYw pr 1\\n\"\n    \"Vnf an 1\\n\"\n    \"Hcj ch 1\\n\"\n    \"zdU sz 1\\n\"\n    \"bvP va 1\\n\"\n    \"Yfj ij 1\\n\"\n    \"Qkn an 1\\n\"\n    \"wHm me 1\\n\"\n    \"qVv qu 1\\n\"\n    \"gkV ng 1\\n\"\n    \"vpq qu 1\\n\"\n    \"hFk th 1\\n\"\n    \"fWf fo 1\\n\"\n    \"pYq qu 1\\n\"\n    \"dNv de 1\\n\"\n    \"Wwj ij 1\\n\"\n    \"Fmx me 1\\n\"\n    \"mDl le 1\\n\"\n    \"jMg ng 1\\n\"\n    \"fZk ka 1\\n\"\n    \"jNp ij 1\\n\"\n    \"qhf th 1\\n\"\n    \"Vbg ng 1\\n\"\n    \"lKx le 1\\n\"\n    \"iZx in 1\\n\"\n    \"sjT sz 1\\n\"\n    \"ijY in 1\\n\"\n    \"qtV th 1\\n\"\n    \"yTk ka 1\\n\"\n    \"Hpz sz 1\\n\"\n    \"iGq qu 1\\n\"\n    \"yqW qu 1\\n\"\n    \"hgF th 1\\n\"\n    \"mFk ka 1\\n\"\n    \"Oqw qu 1\\n\"\n    \"dXa an 1\\n\"\n    \"Zbq qu 1\\n\"\n    \"lKm le 1\\n\"\n    \"Svz sz 1\\n\"\n    \"zKc ch 1\\n\"\n    \"Vmz sz 1\\n\"\n    \"mIx me 1\\n\"\n    \"gKj ng 1\\n\"\n    \"gTt th 1\\n\"\n    \"vfC fo 1\\n\"\n    \"hKg th 1\\n\"\n    \"hSx th 1\\n\"\n    \"oKg ng 1\\n\"\n    \"nQs an 1\\n\"\n    \"yiG in 1\\n\"\n    \"qgM ng 1\\n\"\n    \"kQg ng 1\\n\"\n    \"Cjd de 1\\n\"\n    \"jPy ij 1\\n\"\n    \"Xqe qu 1\\n\"\n    \"Pzy sz 1\\n\"\n    \"Ftq th 1\\n\"\n    \"fcE ch 1\\n\"\n    \"mkL ka 1\\n\"\n    \"Hzj sz 1\\n\"\n    \"bTn an 1\\n\"\n    \"qXy qu 1\\n\"\n    \"dmM de 1\\n\"\n    \"dVx de 1\\n\"\n    \"Tqn an 1\\n\"\n    \"xWj ij 1\\n\"\n    \"qxQ qu 1\\n\"\n    \"fQx fo 1\\n\"\n    \"vLl le 1\\n\"\n    \"Pgk ng 1\\n\"\n    \"gHk ng 1\\n\"\n    \"hxV th 1\\n\"\n    \"tJz th 1\\n\"\n    \"fMz sz 1\\n\"\n    \"Ixb be 1\\n\"\n    \"Cyy ny 
1\\n\"\n    \"pXf pr 1\\n\"\n    \"pLl le 1\\n\"\n    \"Twq qu 1\\n\"\n    \"Dtw th 1\\n\"\n    \"wRn an 1\\n\"\n    \"uXl qu 1\\n\"\n    \"zhq th 1\\n\"\n    \"wIv va 1\\n\"\n    \"cjL ch 1\\n\"\n    \"qxH qu 1\\n\"\n    \"lDm le 1\\n\"\n    \"tXv th 1\\n\"\n    \"gjC ng 1\\n\"\n    \"Zzd sz 1\\n\"\n    \"tgT th 1\\n\"\n    \"hnP th 1\\n\"\n    \"Kjc ch 1\\n\"\n    \"gVw ng 1\\n\"\n    \"xbI be 1\\n\"\n    \"Zpc ch 1\\n\"\n    \"bfO be 1\\n\"\n    \"mSx me 1\\n\"\n    \"qaF an 1\\n\"\n    \"aQh th 1\\n\"\n    \"Hjd de 1\\n\"\n    \"qXj qu 1\\n\"\n    \"fqA qu 1\\n\"\n    \"bvR va 1\\n\"\n    \"qSn an 1\\n\"\n    \"cdV ch 1\\n\"\n    \"pTf pr 1\\n\"\n    \"Kzc ch 1\\n\"\n    \"qtI th 1\\n\"\n    \"egY ng 1\\n\"\n    \"Rxt th 1\\n\"\n    \"bhY th 1\\n\"\n    \"pGh th 1\\n\"\n    \"jDg ng 1\\n\"\n    \"foY on 1\\n\"\n    \"dKs sz 1\\n\"\n    \"qJt th 1\\n\"\n    \"Xwz sz 1\\n\"\n    \"Ixg ng 1\\n\"\n    \"rMt th 1\\n\"\n    \"zXu qu 1\\n\"\n    \"sQy sz 1\\n\"\n    \"Npz sz 1\\n\"\n    \"Qfz sz 1\\n\"\n    \"rLm er 1\\n\"\n    \"zGm sz 1\\n\"\n    \"wHz sz 1\\n\"\n    \"vcY ch 1\\n\"\n    \"kqZ qu 1\\n\"\n    \"jDh th 1\\n\"\n    \"qgG ng 1\\n\"\n    \"Dqq qu 1\\n\"\n    \"fmO me 1\\n\"\n    \"qdW qu 1\\n\"\n    \"dNw de 1\\n\"\n    \"rXj er 1\\n\"\n    \"Jwc ch 1\\n\"\n    \"mDb me 1\\n\"\n    \"wMw wa 1\\n\"\n    \"Yjg ng 1\\n\"\n    \"fjY ij 1\\n\"\n    \"iJb in 1\\n\"\n    \"cdC ch 1\\n\"\n    \"Yxq qu 1\\n\"\n    \"Vbk ka 1\\n\"\n    \"Fpx pr 1\\n\"\n    \"zhD th 1\\n\"\n    \"hCs th 1\\n\"\n    \"dXw de 1\\n\"\n    \"kDd de 1\\n\"\n    \"uqT un 1\\n\"\n    \"Bxw wa 1\\n\"\n    \"Bjq qu 1\\n\"\n    \"jGx ij 1\\n\"\n    \"fXb be 1\\n\"\n    \"ybF be 1\\n\"\n    \"dtA th 1\\n\"\n    \"cVv ch 1\\n\"\n    \"Cbd de 1\\n\"\n    \"wtH th 1\\n\"\n    \"Kdj de 1\\n\"\n    \"kPs sz 1\\n\"\n    \"Zvk ka 1\\n\"\n    \"xPv va 1\\n\"\n    \"woH on 1\\n\"\n    \"Xpz sz 1\\n\"\n    \"qXe qu 1\\n\"\n    \"pTj ij 1\\n\"\n    \"kwQ ka 1\\n\"\n    \"kZf ka 1\\n\"\n    \"Uqj qu 
1\\n\"\n    \"yJh th 1\\n\"\n    \"hCq th 1\\n\"\n    \"jMj ij 1\\n\"\n    \"phY th 1\\n\"\n    \"kbB ka 1\\n\"\n    \"Gpz sz 1\\n\"\n    \"sGz st 1\\n\"\n    \"fwE wa 1\\n\"\n    \"Ttf th 1\\n\"\n    \"Gqm qu 1\\n\"\n    \"bzN sz 1\\n\"\n    \"fkO ka 1\\n\"\n    \"uzW qu 1\\n\"\n    \"oxQ on 1\\n\"\n    \"Vgm ng 1\\n\"\n    \"qmD qu 1\\n\"\n    \"xqn an 1\\n\"\n    \"vRl le 1\\n\"\n    \"Tnr an 1\\n\"\n    \"zjW sz 1\\n\"\n    \"vwq qu 1\\n\"\n    \"jtW th 1\\n\"\n    \"qnL an 1\\n\"\n    \"yDx ny 1\\n\"\n    \"xfQ fo 1\\n\"\n    \"wxJ wa 1\\n\"\n    \"nxE an 1\\n\"\n    \"vQn in 1\\n\"\n    \"Wkh th 1\\n\"\n    \"ywD wa 1\\n\"\n    \"pFf pr 1\\n\"\n    \"lbK le 1\\n\"\n    \"vHy va 1\\n\"\n    \"gVj ng 1\\n\"\n    \"Oqh th 1\\n\"\n    \"bcN ch 1\\n\"\n    \"tWm th 1\\n\"\n    \"wMc ch 1\\n\"\n    \"nwQ an 1\\n\"\n    \"qnM an 1\\n\"\n    \"Ztx th 1\\n\"\n    \"nQj an 1\\n\"\n    \"Vxt th 1\\n\"\n    \"Uxc ch 1\\n\"\n    \"pWv va 1\\n\"\n    \"yRx ny 1\\n\"\n    \"qKu un 1\\n\"\n    \"jXg ng 1\\n\"\n    \"jpX ij 1\\n\"\n    \"dkG de 1\\n\"\n    \"Bnf an 1\\n\"\n    \"Ykf ka 1\\n\"\n    \"gbW ng 1\\n\"\n    \"klX le 1\\n\"\n    \"vkH ka 1\\n\"\n    \"dKd de 1\\n\"\n    \"Kpq qu 1\\n\"\n    \"gqM ng 1\\n\"\n    \"yBz sz 1\\n\"\n    \"rPj er 1\\n\"\n    \"Hzv sz 1\\n\"\n    \"wYz sz 1\\n\"\n    \"qGa an 1\\n\"\n    \"jIs sz 1\\n\"\n    \"bUj ij 1\\n\"\n    \"rTt th 1\\n\"\n    \"nqI an 1\\n\"\n    \"jfP ij 1\\n\"\n    \"hRt th 1\\n\"\n    \"yRr er 1\\n\"\n    \"jjK ij 1\\n\"\n    \"tfE th 1\\n\"\n    \"Qsw st 1\\n\"\n    \"Fcm ch 1\\n\"\n    \"bJm me 1\\n\"\n    \"tXq th 1\\n\"\n    \"fRl le 1\\n\"\n    \"gqE ng 1\\n\"\n    \"wGg ng 1\\n\"\n    \"gKc ch 1\\n\"\n    \"yXc ch 1\\n\"\n    \"zBy sz 1\\n\"\n    \"lTd le 1\\n\"\n    \"Wqc ch 1\\n\"\n    \"Ftf th 1\\n\"\n    \"wdB de 1\\n\"\n    \"xnX an 1\\n\"\n    \"Bqc ch 1\\n\"\n    \"zqO qu 1\\n\"\n    \"Qdl le 1\\n\"\n    \"ojJ on 1\\n\"\n    \"qZn an 1\\n\"\n    \"hzW th 1\\n\"\n    \"ylQ le 1\\n\"\n    \"Zbw wa 
1\\n\"\n    \"mvL va 1\\n\"\n    \"Ljb ij 1\\n\"\n    \"Gqe qu 1\\n\"\n    \"mfE me 1\\n\"\n    \"xQq qu 1\\n\"\n    \"fLv va 1\\n\"\n    \"xLt th 1\\n\"\n    \"wBj ij 1\\n\"\n    \"jUm ij 1\\n\"\n    \"pdL de 1\\n\"\n    \"mJv va 1\\n\"\n    \"dxU de 1\\n\"\n    \"xqN qu 1\\n\"\n    \"fpG pr 1\\n\"\n    \"tlO th 1\\n\"\n    \"whL th 1\\n\"\n    \"kDx ka 1\\n\"\n    \"Rqb qu 1\\n\"\n    \"uvX qu 1\\n\"\n    \"vjY ij 1\\n\"\n    \"crQ ch 1\\n\"\n    \"xyY ny 1\\n\"\n    \"yhQ th 1\\n\"\n    \"yYc ch 1\\n\"\n    \"Lmg ng 1\\n\"\n    \"Jsq qu 1\\n\"\n    \"Gbj ij 1\\n\"\n    \"aPb an 1\\n\"\n    \"dwJ de 1\\n\"\n    \"Xyv va 1\\n\"\n    \"ucJ ch 1\\n\"\n    \"dTf de 1\\n\"\n    \"lBb le 1\\n\"\n    \"hKz th 1\\n\"\n    \"jcR ch 1\\n\"\n    \"eQc ch 1\\n\"\n    \"qYi in 1\\n\"\n    \"Vtb th 1\\n\"\n    \"Ccg ch 1\\n\"\n    \"zAe er 1\\n\"\n    \"gxJ ng 1\\n\"\n    \"uvC qu 1\\n\"\n    \"Bhm ma 1\\n\"\n    \"Zgx ng 1\\n\"\n    \"yzJ sz 1\\n\"\n    \"cvJ ch 1\\n\"\n    \"xTk ka 1\\n\"\n    \"qdK qu 1\\n\"\n    \"vwG va 1\\n\"\n    \"Ymx me 1\\n\"\n    \"oYw on 1\\n\"\n    \"jXx ij 1\\n\"\n    \"ywf wa 1\\n\"\n    \"vVx vi 1\\n\"\n    \"Rwm me 1\\n\"\n    \"Dvk ka 1\\n\"\n    \"xKt th 1\\n\"\n    \"qLp qu 1\\n\"\n    \"Yyv vi 1\\n\"\n    \"Cqa an 1\\n\"\n    \"xRf fo 1\\n\"\n    \"Qqk qu 1\\n\"\n    \"Jqe qu 1\\n\"\n    \"yZg ng 1\\n\"\n    \"vqG qu 1\\n\"\n    \"hbO th 1\\n\"\n    \"uVq qu 1\\n\"\n    \"Rlm le 1\\n\"\n    \"uZc ch 1\\n\"\n    \"Ppv va 1\\n\"\n    \"pVd de 1\\n\"\n    \"yVd de 1\\n\"\n    \"zJl le 1\\n\"\n    \"Yzg ng 1\\n\"\n    \"Cvq qu 1\\n\"\n    \"pwS pr 1\\n\"\n    \"Kkw ka 1\\n\"\n    \"Wvv va 1\\n\"\n    \"Fdy de 1\\n\"\n    \"ppX pr 1\\n\"\n    \"hvC th 1\\n\"\n    \"iwG in 1\\n\"\n    \"rBg ng 1\\n\"\n    \"hBq th 1\\n\"\n    \"nYs an 1\\n\"\n    \"kcO ch 1\\n\"\n    \"qEe qu 1\\n\"\n    \"Ybv va 1\\n\"\n    \"Qsn an 1\\n\"\n    \"svC st 1\\n\"\n    \"qkD qu 1\\n\"\n    \"Qiw in 1\\n\"\n    \"Gtj th 1\\n\"\n    \"qAh th 1\\n\"\n    \"wVy wa 
1\\n\"\n    \"bxT be 1\\n\"\n    \"Qhs th 1\\n\"\n    \"tlX th 1\\n\"\n    \"hbA th 1\\n\"\n    \"Qfb be 1\\n\"\n    \"xWl le 1\\n\"\n    \"xeV er 1\\n\"\n    \"rqG qu 1\\n\"\n    \"vqZ qu 1\\n\"\n    \"jKv ij 1\\n\"\n    \"iTf in 1\\n\"\n    \"kwU ka 1\\n\"\n    \"iFq in 1\\n\"\n    \"mjZ ij 1\\n\"\n    \"xgJ ng 1\\n\"\n    \"zLp sz 1\\n\"\n    \"qsR qu 1\\n\"\n    \"zDj sz 1\\n\"\n    \"pdF de 1\\n\"\n    \"wxN wa 1\\n\"\n    \"wGk ka 1\\n\"\n    \"dUq qu 1\\n\"\n    \"dJw de 1\\n\"\n    \"fCb be 1\\n\"\n    \"Dhz th 1\\n\"\n    \"yIq qu 1\\n\"\n    \"aQm an 1\\n\"\n    \"Yzs st 1\\n\"\n    \"vHf va 1\\n\"\n    \"bjV ij 1\\n\"\n    \"zSq qu 1\\n\"\n    \"Wqs qu 1\\n\"\n    \"jrW er 1\\n\"\n    \"Hzq qu 1\\n\"\n    \"wWs st 1\\n\"\n    \"Mkg ng 1\\n\"\n    \"zgF ng 1\\n\"\n    \"Cnk an 1\\n\"\n    \"rDg ng 1\\n\"\n    \"fzB sz 1\\n\"\n    \"fOm me 1\\n\"\n    \"uVt th 1\\n\"\n    \"Qfi in 1\\n\"\n    \"Mhj th 1\\n\"\n    \"uYj qu 1\\n\"\n    \"Rqx qu 1\\n\"\n    \"hkY th 1\\n\"\n    \"wYb wa 1\\n\"\n    \"tqP th 1\\n\"\n    \"Jpb pr 1\\n\"\n    \"bGw wa 1\\n\"\n    \"xFh th 1\\n\"\n    \"Xwb wa 1\\n\"\n    \"Kgt th 1\\n\"\n    \"Iqc ch 1\\n\"\n    \"pJm me 1\\n\"\n    \"Qkq qu 1\\n\"\n    \"bVh th 1\\n\"\n    \"yTq qu 1\\n\"\n    \"zZg ng 1\\n\"\n    \"cDz ch 1\\n\"\n    \"qfm qu 1\\n\"\n    \"afQ an 1\\n\"\n    \"Qwc ch 1\\n\"\n    \"bdJ de 1\\n\"\n    \"qTu un 1\\n\"\n    \"Ucx ch 1\\n\"\n    \"Hnx an 1\\n\"\n    \"Hbh th 1\\n\"\n    \"gyH ng 1\\n\"\n    \"tTz th 1\\n\"\n    \"txV th 1\\n\"\n    \"bdS de 1\\n\"\n    \"Wgg ng 1\\n\"\n    \"oqP qu 1\\n\"\n    \"Rrf er 1\\n\"\n    \"gYy ng 1\\n\"\n    \"fMs st 1\\n\"\n    \"fKd de 1\\n\"\n    \"Hyx ny 1\\n\"\n    \"Mxz sz 1\\n\"\n    \"qHk qu 1\\n\"\n    \"tfM th 1\\n\"\n    \"hgQ th 1\\n\"\n    \"zmO sz 1\\n\"\n    \"wzS sz 1\\n\"\n    \"jwQ ij 1\\n\"\n    \"Fhc ic 1\\n\"\n    \"xIy ny 1\\n\"\n    \"fHg ng 1\\n\"\n    \"wqY qu 1\\n\"\n    \"bFp pr 1\\n\"\n    \"Qdq qu 1\\n\"\n    \"bhV th 1\\n\"\n    \"bCg ng 
1\\n\"\n    \"Hgr ng 1\\n\"\n    \"xqL qu 1\\n\"\n    \"qgS ng 1\\n\"\n    \"Nqg ng 1\\n\"\n    \"fQv va 1\\n\"\n    \"Qzw sz 1\\n\"\n    \"Ixd de 1\\n\"\n    \"Cxm me 1\\n\"\n    \"mxN me 1\\n\"\n    \"vQi in 1\\n\"\n    \"cAq ch 1\\n\"\n    \"eCx er 1\\n\"\n    \"mqX qu 1\\n\"\n    \"rqY qu 1\\n\"\n    \"fVp pr 1\\n\"\n    \"qoP qu 1\\n\"\n    \"Gxc ch 1\\n\"\n    \"vzX sz 1\\n\"\n    \"fXf fo 1\\n\"\n    \"Qtc th 1\\n\"\n    \"ohQ th 1\\n\"\n    \"Ygy ng 1\\n\"\n    \"Xnb an 1\\n\"\n    \"cWm ch 1\\n\"\n    \"jXw ij 1\\n\"\n    \"gWj ng 1\\n\"\n    \"Kmg ng 1\\n\"\n    \"vvH va 1\\n\"\n    \"Uew er 1\\n\"\n    \"qJk qu 1\\n\"\n    \"Hkd de 1\\n\"\n    \"xmP me 1\\n\"\n    \"slR is 1\\n\"\n    \"Uaq an 1\\n\"\n    \"zbG sz 1\\n\"\n    \"vNv va 1\\n\"\n    \"cVb ch 1\\n\"\n    \"bGg ng 1\\n\"\n    \"iwU in 1\\n\"\n    \"Cnw an 1\\n\"\n    \"rXd er 1\\n\"\n    \"vWz sz 1\\n\"\n    \"tGf th 1\\n\"\n    \"fbY be 1\\n\"\n    \"hzp th 1\\n\"\n    \"uWz qu 1\\n\"\n    \"bMb be 1\\n\"\n    \"jzW sz 1\\n\"\n    \"gLh th 1\\n\"\n    \"kZc ch 1\\n\"\n    \"kHg ng 1\\n\"\n    \"Vwf wa 1\\n\"\n    \"vtY th 1\\n\"\n    \"qeA qu 1\\n\"\n    \"cxG ch 1\\n\"\n    \"uQz qu 1\\n\"\n    \"jGc ch 1\\n\"\n    \"cvA ch 1\\n\"\n    \"oTm on 1\\n\"\n    \"pjY ij 1\\n\"\n    \"bUo on 1\\n\"\n    \"jwU ij 1\\n\"\n    \"Jgm ng 1\\n\"\n    \"tfZ th 1\\n\"\n    \"xeO er 1\\n\"\n    \"qBp qu 1\\n\"\n    \"pBz sz 1\\n\"\n    \"qSb qu 1\\n\"\n    \"jyP ij 1\\n\"\n    \"Fkq qu 1\\n\"\n    \"njS an 1\\n\"\n    \"jtA th 1\\n\"\n    \"Zmf me 1\\n\"\n    \"Ytm th 1\\n\"\n    \"Pqc ch 1\\n\"\n    \"bwJ wa 1\\n\"\n    \"oWf on 1\\n\"\n    \"kxJ ka 1\\n\"\n    \"jHx ij 1\\n\"\n    \"gcP ch 1\\n\"\n    \"gBs ng 1\\n\"\n    \"bkK ka 1\\n\"\n    \"vdQ de 1\\n\"\n    \"pjZ ij 1\\n\"\n    \"Vgf ng 1\\n\"\n    \"svG st 1\\n\"\n    \"kGj ij 1\\n\"\n    \"Wjg ng 1\\n\"\n    \"Qmk ka 1\\n\"\n    \"Glv le 1\\n\"\n    \"tmY th 1\\n\"\n    \"klY le 1\\n\"\n    \"Pcj ch 1\\n\"\n    \"fQw wi 1\\n\"\n    \"xaO an 
1\\n\"\n    \"jfN ij 1\\n\"\n    \"qGx qu 1\\n\"\n    \"qvB qu 1\\n\"\n    \"hwA th 1\\n\"\n    \"Xmq qu 1\\n\"\n    \"Xvt th 1\\n\"\n    \"Bpq qu 1\\n\"\n    \"oJq qu 1\\n\"\n    \"vmZ va 1\\n\"\n    \"nJp an 1\\n\"\n    \"zqJ qu 1\\n\"\n    \"qHf qu 1\\n\"\n    \"mQg ng 1\\n\"\n    \"yGz sz 1\\n\"\n    \"hQm th 1\\n\"\n    \"mBp me 1\\n\"\n    \"tpJ th 1\\n\"\n    \"Qkj ij 1\\n\"\n    \"uUg ng 1\\n\"\n    \"tdJ th 1\\n\"\n    \"Jfn an 1\\n\"\n    \"Lvj ij 1\\n\"\n    \"iXc ch 1\\n\"\n    \"pOq qu 1\\n\"\n    \"bhK th 1\\n\"\n    \"bMk ka 1\\n\"\n    \"Fsw st 1\\n\"\n    \"qAt th 1\\n\"\n    \"xwJ wa 1\\n\"\n    \"fPm me 1\\n\"\n    \"Dfy ny 1\\n\"\n    \"Zbp pr 1\\n\"\n    \"Bgw ng 1\\n\"\n    \"pQp pr 1\\n\"\n    \"kQp ka 1\\n\"\n    \"qoV qu 1\\n\"\n    \"Uqd qu 1\\n\"\n    \"jYo on 1\\n\"\n    \"sDf st 1\\n\"\n    \"xuJ qu 1\\n\"\n    \"vRk ka 1\\n\"\n    \"Qsg ng 1\\n\"\n    \"yTd de 1\\n\"\n    \"Qxr er 1\\n\"\n    \"Hvc ch 1\\n\"\n    \"hZt th 1\\n\"\n    \"qDu un 1\\n\"\n    \"fxA fo 1\\n\"\n    \"xPf fo 1\\n\"\n    \"wXc ch 1\\n\"\n    \"jJb ij 1\\n\"\n    \"pdK de 1\\n\"\n    \"gpW ng 1\\n\"\n    \"Qgx ng 1\\n\"\n    \"kxG ka 1\\n\"\n    \"dLx de 1\\n\"\n    \"Bwz sz 1\\n\"\n    \"Vdx de 1\\n\"\n    \"yQh th 1\\n\"\n    \"Wsx st 1\\n\"\n    \"fSb be 1\\n\"\n    \"Ukg ng 1\\n\"\n    \"Pjz sz 1\\n\"\n    \"rFg ng 1\\n\"\n    \"fjP ij 1\\n\"\n    \"kWv ka 1\\n\"\n    \"Khf th 1\\n\"\n    \"yGv va 1\\n\"\n    \"pnD an 1\\n\"\n    \"jYf ij 1\\n\"\n    \"mgR ng 1\\n\"\n    \"rjC er 1\\n\"\n    \"Xjl le 1\\n\"\n    \"kzE sz 1\\n\"\n    \"Qgq ng 1\\n\"\n    \"zgb ng 1\\n\"\n    \"mhD th 1\\n\"\n    \"vkO ka 1\\n\"\n    \"uwV qu 1\\n\"\n    \"rPp er 1\\n\"\n    \"wXd de 1\\n\"\n    \"gAo ng 1\\n\"\n    \"kvG ka 1\\n\"\n    \"vcX ch 1\\n\"\n    \"xOz sz 1\\n\"\n    \"Xzq qu 1\\n\"\n    \"Fmu qu 1\\n\"\n    \"xGg ng 1\\n\"\n    \"jjR ij 1\\n\"\n    \"qkI ku 1\\n\"\n    \"pqH qu 1\\n\"\n    \"cnH an 1\\n\"\n    \"dhT th 1\\n\"\n    \"mdR de 1\\n\"\n    \"dDf de 
1\\n\"\n    \"qIq qu 1\\n\"\n    \"xCj ij 1\\n\"\n    \"qRk qu 1\\n\"\n    \"kKc ch 1\\n\"\n    \"Iuu qu 1\\n\"\n    \"jqR qu 1\\n\"\n    \"qEk qu 1\\n\"\n    \"hfO th 1\\n\"\n    \"quJ un 1\\n\"\n    \"nRp an 1\\n\"\n    \"txI th 1\\n\"\n    \"yfZ ny 1\\n\"\n    \"oqT ho 1\\n\"\n    \"cgX ch 1\\n\"\n    \"pbL pr 1\\n\"\n    \"Xmx me 1\\n\"\n    \"Vjr er 1\\n\"\n    \"ylY le 1\\n\"\n    \"dfK de 1\\n\"\n    \"xgD ng 1\\n\"\n    \"uwL qu 1\\n\"\n    \"bPm me 1\\n\"\n    \"qCy qu 1\\n\"\n    \"Rpq qu 1\\n\"\n    \"yqh th 1\\n\"\n    \"xJt th 1\\n\"\n    \"lzQ le 1\\n\"\n    \"fgM ng 1\\n\"\n    \"Ylc ch 1\\n\"\n    \"fTz sz 1\\n\"\n    \"Rjf ij 1\\n\"\n    \"Rgj jo 1\\n\"\n    \"Gkt th 1\\n\"\n    \"fxG fo 1\\n\"\n    \"mtG th 1\\n\"\n    \"lgJ ng 1\\n\"\n    \"tdR th 1\\n\"\n    \"iHk in 1\\n\"\n    \"Gqv qu 1\\n\"\n    \"lDj le 1\\n\"\n    \"wzZ sz 1\\n\"\n    \"dFp de 1\\n\"\n    \"qTt th 1\\n\"\n    \"Wtg th 1\\n\"\n    \"cbT ch 1\\n\"\n    \"dvK de 1\\n\"\n    \"Ctw th 1\\n\"\n    \"mdG de 1\\n\"\n    \"vKj ij 1\\n\"\n    \"Clf le 1\\n\"\n    \"wrU er 1\\n\"\n    \"gmT ng 1\\n\"\n    \"bXx be 1\\n\"\n    \"zOx sz 1\\n\"\n    \"Xnf an 1\\n\"\n    \"rzQ er 1\\n\"\n    \"vQj ij 1\\n\"\n    \"kpT ka 1\\n\"\n    \"fYh th 1\\n\"\n    \"zLr er 1\\n\"\n    \"Xgd ng 1\\n\"\n    \"cZl ch 1\\n\"\n    \"lFy le 1\\n\"\n    \"Zng an 1\\n\"\n    \"aXg an 1\\n\"\n    \"qbE qu 1\\n\"\n    \"zcY ch 1\\n\"\n    \"sqK qu 1\\n\"\n    \"Blx le 1\\n\"\n    \"oqJ qu 1\\n\"\n    \"jPv ij 1\\n\"\n    \"qZd qu 1\\n\"\n    \"fdZ de 1\\n\"\n    \"Bqm qu 1\\n\"\n    \"cpG ch 1\\n\"\n    \"xdP de 1\\n\"\n    \"fuF qu 1\\n\"\n    \"vbq qu 1\\n\"\n    \"dhH th 1\\n\"\n    \"Jwm me 1\\n\"\n    \"qkO ko 1\\n\"\n    \"gsY ng 1\\n\"\n    \"qGh th 1\\n\"\n    \"Jkv ka 1\\n\"\n    \"zpg ng 1\\n\"\n    \"rwK er 1\\n\"\n    \"Lhq th 1\\n\"\n    \"zuV qu 1\\n\"\n    \"bqV qu 1\\n\"\n    \"Qcv ch 1\\n\"\n    \"mWd de 1\\n\"\n    \"cnF an 1\\n\"\n    \"lWw le 1\\n\"\n    \"txS th 1\\n\"\n    \"znE an 
1\\n\"\n    \"fTj ij 1\\n\"\n    \"lFq qu 1\\n\"\n    \"wdJ de 1\\n\"\n    \"eVk er 1\\n\"\n    \"zjZ sz 1\\n\"\n    \"fPq qu 1\\n\"\n    \"cqQ ch 1\\n\"\n    \"Pcg ch 1\\n\"\n    \"Ydk de 1\\n\"\n    \"svE st 1\\n\"\n    \"Wqb qu 1\\n\"\n    \"bcV ch 1\\n\"\n    \"nHx on 1\\n\"\n    \"wAx wa 1\\n\"\n    \"hfB th 1\\n\"\n    \"aMv an 1\\n\"\n    \"pwO pr 1\\n\"\n    \"Ywx wa 1\\n\"\n    \"cbH ch 1\\n\"\n    \"ojZ on 1\\n\"\n    \"suU qu 1\\n\"\n    \"jcU ch 1\\n\"\n    \"sqY qu 1\\n\"\n    \"jMr er 1\\n\"\n    \"pxG pr 1\\n\"\n    \"rBq qu 1\\n\"\n    \"vlY le 1\\n\"\n    \"hyY th 1\\n\"\n    \"Cvw va 1\\n\"\n    \"Tqe qu 1\\n\"\n    \"fSj ij 1\\n\"\n    \"fVs st 1\\n\"\n    \"Eqc ch 1\\n\"\n    \"xnD an 1\\n\"\n    \"Owp pr 1\\n\"\n    \"xTb be 1\\n\"\n    \"wjL ij 1\\n\"\n    \"Rxv va 1\\n\"\n    \"nWf an 1\\n\"\n    \"vHp va 1\\n\"\n    \"vBk ka 1\\n\"\n    \"Nqv qu 1\\n\"\n    \"Lzf sz 1\\n\"\n    \"bwS wa 1\\n\"\n    \"Cby be 1\\n\"\n    \"zRr er 1\\n\"\n    \"qwJ qu 1\\n\"\n    \"xnB an 1\\n\"\n    \"qIc ch 1\\n\"\n    \"cGk ch 1\\n\"\n    \"Yji in 1\\n\"\n    \"gVh th 1\\n\"\n    \"lDc ch 1\\n\"\n    \"Qyr er 1\\n\"\n    \"fcH ch 1\\n\"\n    \"nxB an 1\\n\"\n    \"dvw de 1\\n\"\n    \"gQc ch 1\\n\"\n    \"mrR er 1\\n\"\n    \"fnK an 1\\n\"\n    \"Hlr le 1\\n\"\n    \"Dnq an 1\\n\"\n    \"bnU an 1\\n\"\n    \"qCe qu 1\\n\"\n    \"Tjv ij 1\\n\"\n    \"Epq qu 1\\n\"\n    \"wLf wa 1\\n\"\n    \"pZj ij 1\\n\"\n    \"gvR ng 1\\n\"\n    \"kqK qu 1\\n\"\n    \"vlG le 1\\n\"\n    \"vvN va 1\\n\"\n    \"gbM ng 1\\n\"\n    \"bNk ka 1\\n\"\n    \"jzL sz 1\\n\"\n    \"Wlq qu 1\\n\"\n    \"aYq an 1\\n\"\n    \"zdY de 1\\n\"\n    \"sfG st 1\\n\"\n    \"qfW qu 1\\n\"\n    \"kBv ka 1\\n\"\n    \"btG th 1\\n\"\n    \"Mqb qu 1\\n\"\n    \"lrC er 1\\n\"\n    \"vuE qu 1\\n\"\n    \"fyJ ny 1\\n\"\n    \"qmZ qu 1\\n\"\n    \"Jkq qu 1\\n\"\n    \"Cmj ij 1\\n\"\n    \"bXy be 1\\n\"\n    \"Ymy me 1\\n\"\n    \"qxY qu 1\\n\"\n    \"cNl ch 1\\n\"\n    \"fzU fo 1\\n\"\n    \"Rvt th 
1\\n\"\n    \"ylI le 1\\n\"\n    \"xMs st 1\\n\"\n    \"Qhm th 1\\n\"\n    \"dHq qu 1\\n\"\n    \"dwL de 1\\n\"\n    \"vYr er 1\\n\"\n    \"Qxu qu 1\\n\"\n    \"dNh th 1\\n\"\n    \"zNc ch 1\\n\"\n    \"jmP ij 1\\n\"\n    \"Pbq qu 1\\n\"\n    \"fqj qu 1\\n\"\n    \"fUw wa 1\\n\"\n    \"Hyq qu 1\\n\"\n    \"Qdx de 1\\n\"\n    \"zSl le 1\\n\"\n    \"cWt th 1\\n\"\n    \"Fke er 1\\n\"\n    \"Ztz th 1\\n\"\n    \"uUq qu 1\\n\"\n    \"nBm an 1\\n\"\n    \"zJy sz 1\\n\"\n    \"pdI de 1\\n\"\n    \"nTd an 1\\n\"\n    \"Yjb ij 1\\n\"\n    \"Qjn an 1\\n\"\n    \"yXj ij 1\\n\"\n    \"xwB ow 1\\n\"\n    \"klq qu 1\\n\"\n    \"hfY th 1\\n\"\n    \"pDg ng 1\\n\"\n    \"zZd de 1\\n\"\n    \"mqO qu 1\\n\"\n    \"hZr th 1\\n\"\n    \"cmY ch 1\\n\"\n    \"gLk ng 1\\n\"\n    \"Qcj ch 1\\n\"\n    \"uKj qu 1\\n\"\n    \"nqD an 1\\n\"\n    \"yKw wa 1\\n\"\n    \"bfR be 1\\n\"\n    \"Rqz qu 1\\n\"\n    \"jhQ th 1\\n\"\n    \"vNj ij 1\\n\"\n    \"Tcf ch 1\\n\"\n    \"Hbn an 1\\n\"\n    \"Lwv va 1\\n\"\n    \"wcZ ch 1\\n\"\n    \"cdK ch 1\\n\"\n    \"bpR pr 1\\n\"\n    \"lWm le 1\\n\"\n    \"wNq qu 1\\n\"\n    \"pAj ij 1\\n\"\n    \"grV ng 1\\n\"\n    \"qmk qu 1\\n\"\n    \"cLf ch 1\\n\"\n    \"iwB in 1\\n\"\n    \"eqV qu 1\\n\"\n    \"Wqz qu 1\\n\"\n    \"Qnj an 1\\n\"\n    \"uoJ qu 1\\n\"\n    \"fVj ij 1\\n\"\n    \"cbU ch 1\\n\"\n    \"qpT qu 1\\n\"\n    \"pdZ de 1\\n\"\n    \"dzW de 1\\n\"\n    \"Wfw wa 1\\n\"\n    \"Zqm qu 1\\n\"\n    \"kJd de 1\\n\"\n    \"zWf sz 1\\n\"\n    \"bYg ng 1\\n\"\n    \"rjQ er 1\\n\"\n    \"dwB de 1\\n\"\n    \"Vlx le 1\\n\"\n    \"zKd de 1\\n\"\n    \"Lxw wa 1\\n\"\n    \"Hpw pr 1\\n\"\n    \"mvR va 1\\n\"\n    \"qMt th 1\\n\"\n    \"pWb pr 1\\n\"\n    \"dcW ch 1\\n\"\n    \"zEh th 1\\n\"\n    \"Xrs er 1\\n\"\n    \"Ftz th 1\\n\"\n    \"qyL qu 1\\n\"\n    \"jSn an 1\\n\"\n    \"Wzh th 1\\n\"\n    \"Pzf sz 1\\n\"\n    \"zkW sz 1\\n\"\n    \"ywY wa 1\\n\"\n    \"oGb on 1\\n\"\n    \"jBw ij 1\\n\"\n    \"Qpz sz 1\\n\"\n    \"rWm er 1\\n\"\n    \"smQ st 
1\\n\"\n    \"uGk qu 1\\n\"\n    \"xkV ka 1\\n\"\n    \"wJf wa 1\\n\"\n    \"cjW ch 1\\n\"\n    \"wNx wa 1\\n\"\n    \"wjR ij 1\\n\"\n    \"wDd wa 1\\n\"\n    \"lrB er 1\\n\"\n    \"qhJ th 1\\n\"\n    \"jKp ij 1\\n\"\n    \"kNn an 1\\n\"\n    \"tqU th 1\\n\"\n    \"Jmj ij 1\\n\"\n    \"bJv va 1\\n\"\n    \"frN er 1\\n\"\n    \"uBj qu 1\\n\"\n    \"Uuv qu 1\\n\"\n    \"Mzv sz 1\\n\"\n    \"Djq qu 1\\n\"\n    \"Qgl le 1\\n\"\n    \"hdC th 1\\n\"\n    \"mFh th 1\\n\"\n    \"vjU ij 1\\n\"\n    \"prX er 1\\n\"\n    \"Kvc ch 1\\n\"\n    \"ryY er 1\\n\"\n    \"vzQ sz 1\\n\"\n    \"Ojh th 1\\n\"\n    \"Qfn an 1\\n\"\n    \"Vqg ng 1\\n\"\n    \"aQv an 1\\n\"\n    \"hHx th 1\\n\"\n    \"uIg ng 1\\n\"\n    \"Kpv va 1\\n\"\n    \"dQk ko 1\\n\"\n    \"Ghq th 1\\n\"\n    \"cZs ch 1\\n\"\n    \"nvH an 1\\n\"\n    \"jwJ ij 1\\n\"\n    \"dMm de 1\\n\"\n    \"gjI ng 1\\n\"\n    \"lPg ng 1\\n\"\n    \"qBs qu 1\\n\"\n    \"Vhq th 1\\n\"\n    \"qLt th 1\\n\"\n    \"hBd th 1\\n\"\n    \"Vcu ch 1\\n\"\n    \"cQd ch 1\\n\"\n    \"ypX pr 1\\n\"\n    \"mQv va 1\\n\"\n    \"vmR va 1\\n\"\n    \"xfH fo 1\\n\"\n    \"pqY qu 1\\n\"\n    \"Xtb th 1\\n\"\n    \"Vcx ch 1\\n\"\n    \"tWb th 1\\n\"\n    \"Pxa an 1\\n\"\n    \"Qmr er 1\\n\"\n    \"mdX de 1\\n\"\n    \"Bxt th 1\\n\"\n    \"jZv ij 1\\n\"\n    \"hNp th 1\\n\"\n    \"ybN be 1\\n\"\n    \"bkZ ka 1\\n\"\n    \"nVf an 1\\n\"\n    \"lKq qu 1\\n\"\n    \"oJj on 1\\n\"\n    \"pBv va 1\\n\"\n    \"hgA th 1\\n\"\n    \"qxE qu 1\\n\"\n    \"nvJ an 1\\n\"\n    \"Xcf ch 1\\n\"\n    \"Fdb de 1\\n\"\n    \"zAo on 1\\n\"\n    \"wQk ka 1\\n\"\n    \"tmX th 1\\n\"\n    \"pvZ va 1\\n\"\n    \"fNw wa 1\\n\"\n    \"zKk sz 1\\n\"\n    \"hRx th 1\\n\"\n    \"Tlj le 1\\n\"\n    \"iQj in 1\\n\"\n    \"jmU ij 1\\n\"\n    \"tbW th 1\\n\"\n    \"wVh th 1\\n\"\n    \"Tvh th 1\\n\"\n    \"nVg an 1\\n\"\n    \"Lxp pr 1\\n\"\n    \"vgO ng 1\\n\"\n    \"dfE de 1\\n\"\n    \"nVm an 1\\n\"\n    \"qKy qu 1\\n\"\n    \"eqZ qu 1\\n\"\n    \"Tcc ch 1\\n\"\n    \"cTk ch 
1\\n\"\n    \"fKz sz 1\\n\"\n    \"Wkz sz 1\\n\"\n    \"lvZ le 1\\n\"\n    \"rGp er 1\\n\"\n    \"kKz sz 1\\n\"\n    \"Cbf be 1\\n\"\n    \"jQd de 1\\n\"\n    \"Zfc ch 1\\n\"\n    \"hvX th 1\\n\"\n    \"xgN ng 1\\n\"\n    \"Kpe er 1\\n\"\n    \"hzM th 1\\n\"\n    \"jxZ ij 1\\n\"\n    \"yqL qu 1\\n\"\n    \"pgC ng 1\\n\"\n    \"Fqd qu 1\\n\"\n    \"tMb th 1\\n\"\n    \"njQ an 1\\n\"\n    \"tfB th 1\\n\"\n    \"gjN ng 1\\n\"\n    \"wNc ch 1\\n\"\n    \"Pzj sz 1\\n\"\n    \"mhO th 1\\n\"\n    \"qUm qu 1\\n\"\n    \"Fhh th 1\\n\"\n    \"Sjd de 1\\n\"\n    \"hWj th 1\\n\"\n    \"yhL th 1\\n\"\n    \"lGp le 1\\n\"\n    \"dtX th 1\\n\"\n    \"hwX th 1\\n\"\n    \"srK er 1\\n\"\n    \"vqE qu 1\\n\"\n    \"bcO ch 1\\n\"\n    \"xQl le 1\\n\"\n    \"Qqf qu 1\\n\"\n    \"kJg ng 1\\n\"\n    \"pXz sz 1\\n\"\n    \"yuJ qu 1\\n\"\n    \"Gnp an 1\\n\"\n    \"Dlc ch 1\\n\"\n    \"Mxf fo 1\\n\"\n    \"yNr er 1\\n\"\n    \"bmV me 1\\n\"\n    \"fXo on 1\\n\"\n    \"mwW me 1\\n\"\n    \"lIj le 1\\n\"\n    \"Fvq qu 1\\n\"\n    \"Utq th 1\\n\"\n    \"jGk ij 1\\n\"\n    \"wYw wa 1\\n\"\n    \"wVm me 1\\n\"\n    \"bTq qu 1\\n\"\n    \"Ijp ij 1\\n\"\n    \"znM an 1\\n\"\n    \"xmO me 1\\n\"\n    \"gQx ng 1\\n\"\n    \"dKw de 1\\n\"\n    \"dUf de 1\\n\"\n    \"cSb ch 1\\n\"\n    \"zVb sz 1\\n\"\n    \"ccY ch 1\\n\"\n    \"xjE ij 1\\n\"\n    \"pYt th 1\\n\"\n    \"Vrq qu 1\\n\"\n    \"kzK sz 1\\n\"\n    \"zfC sz 1\\n\"\n    \"Ybh th 1\\n\"\n    \"dgS ng 1\\n\"\n    \"xcV ch 1\\n\"\n    \"xNm me 1\\n\"\n    \"Xkw ka 1\\n\"\n    \"Tpw pr 1\\n\"\n    \"Bwd de 1\\n\"\n    \"hwT th 1\\n\"\n    \"gQl ng 1\\n\"\n    \"cDs ch 1\\n\"\n    \"zYr er 1\\n\"\n    \"xTp pr 1\\n\"\n    \"qWm qu 1\\n\"\n    \"xjT ij 1\\n\"\n    \"hjK th 1\\n\"\n    \"uDc ch 1\\n\"\n    \"xhS th 1\\n\"\n    \"bWd de 1\\n\"\n    \"vCw va 1\\n\"\n    \"jyB ij 1\\n\"\n    \"uWd qu 1\\n\"\n    \"Nnq qu 1\\n\"\n    \"Qvb va 1\\n\"\n    \"jzV sz 1\\n\"\n    \"zBx sz 1\\n\"\n    \"wIj ij 1\\n\"\n    \"qRt th 1\\n\"\n    \"qrJ qu 
1\\n\"\n    \"zZj sz 1\\n\"\n    \"kRr er 1\\n\"\n    \"Nzv sz 1\\n\"\n    \"Qfw wa 1\\n\"\n    \"Njt th 1\\n\"\n    \"bFy be 1\\n\"\n    \"lhY th 1\\n\"\n    \"eWj er 1\\n\"\n    \"jbM ij 1\\n\"\n    \"Xsg ng 1\\n\"\n    \"Rsd de 1\\n\"\n    \"flF le 1\\n\"\n    \"Phz th 1\\n\"\n    \"xWs st 1\\n\"\n    \"bCw wa 1\\n\"\n    \"gfJ ng 1\\n\"\n    \"qVo qu 1\\n\"\n    \"eQh th 1\\n\"\n    \"vcP ch 1\\n\"\n    \"mDj ij 1\\n\"\n    \"qTs qu 1\\n\"\n    \"Xgs ng 1\\n\"\n    \"Vuq qu 1\\n\"\n    \"ufN qu 1\\n\"\n    \"xBs st 1\\n\"\n    \"pTk ka 1\\n\"\n    \"fSq qu 1\\n\"\n    \"mbD me 1\\n\"\n    \"Vwz sz 1\\n\"\n    \"hhQ th 1\\n\"\n    \"kfP ka 1\\n\"\n    \"Pwq qu 1\\n\"\n    \"dhG th 1\\n\"\n    \"qZj qu 1\\n\"\n    \"yRj ij 1\\n\"\n    \"yCs st 1\\n\"\n    \"fjN ij 1\\n\"\n    \"Rqg ng 1\\n\"\n    \"jJh th 1\\n\"\n    \"dlR le 1\\n\"\n    \"Xmb me 1\\n\"\n    \"Jjt th 1\\n\"\n    \"gqI ng 1\\n\"\n    \"fqM qu 1\\n\"\n    \"iVg ng 1\\n\"\n    \"Hgu ng 1\\n\"\n    \"iHw in 1\\n\"\n    \"eQv er 1\\n\"\n    \"mzE sz 1\\n\"\n    \"fjZ ij 1\\n\"\n    \"qNn an 1\\n\"\n    \"wlE le 1\\n\"\n    \"kGp ka 1\\n\"\n    \"Iqv qu 1\\n\"\n    \"kBn an 1\\n\"\n    \"xZd de 1\\n\"\n    \"Dkc ch 1\\n\"\n    \"zlH le 1\\n\"\n    \"txB th 1\\n\"\n    \"tQr th 1\\n\"\n    \"uOx qu 1\\n\"\n    \"pJi in 1\\n\"\n    \"zbL sz 1\\n\"\n    \"xkD ka 1\\n\"\n    \"scV ch 1\\n\"\n    \"qXh th 1\\n\"\n    \"kIq qu 1\\n\"\n    \"xNn an 1\\n\"\n    \"gJf ng 1\\n\"\n    \"tmB th 1\\n\"\n    \"tcK th 1\\n\"\n    \"kwZ ka 1\\n\"\n    \"uZj qu 1\\n\"\n    \"snQ an 1\\n\"\n    \"uKq qu 1\\n\"\n    \"crX ch 1\\n\"\n    \"hXy th 1\\n\"\n    \"Zcc ch 1\\n\"\n    \"Pfz sz 1\\n\"\n    \"dwM de 1\\n\"\n    \"qIy qu 1\\n\"\n    \"xuP qu 1\\n\"\n    \"wDw wa 1\\n\"\n    \"Hjr er 1\\n\"\n    \"dQf de 1\\n\"\n    \"wvJ wa 1\\n\"\n    \"tHm th 1\\n\"\n    \"Ydw de 1\\n\"\n    \"wxI wa 1\\n\"\n    \"pOv va 1\\n\"\n    \"Wmq qu 1\\n\"\n    \"dhD th 1\\n\"\n    \"qpw qu 1\\n\"\n    \"bmC me 1\\n\"\n    \"wcX ch 
1\\n\"\n    \"wjH ij 1\\n\"\n    \"bWf be 1\\n\"\n    \"Gdp de 1\\n\"\n    \"Ldw de 1\\n\"\n    \"Sbq qu 1\\n\"\n    \"vZv va 1\\n\"\n    \"Kwb wa 1\\n\"\n    \"qhT th 1\\n\"\n    \"yRf ny 1\\n\"\n    \"hwC th 1\\n\"\n    \"npJ an 1\\n\"\n    \"jmV ij 1\\n\"\n    \"vGg ng 1\\n\"\n    \"xqF qu 1\\n\"\n    \"Phm th 1\\n\"\n    \"pWc ch 1\\n\"\n    \"Vxk ka 1\\n\"\n    \"sHz st 1\\n\"\n    \"Wbx be 1\\n\"\n    \"bfK be 1\\n\"\n    \"Jgl ng 1\\n\"\n    \"kTb ka 1\\n\"\n    \"Kbf be 1\\n\"\n    \"kzC sz 1\\n\"\n    \"pKq qu 1\\n\"\n    \"zwB sz 1\\n\"\n    \"uZg ng 1\\n\"\n    \"btI th 1\\n\"\n    \"zXj sz 1\\n\"\n    \"uzS qu 1\\n\"\n    \"vWk ka 1\\n\"\n    \"xrH er 1\\n\"\n    \"oQc ch 1\\n\"\n    \"zlT le 1\\n\"\n    \"dfI de 1\\n\"\n    \"Qmf me 1\\n\"\n    \"sgE ng 1\\n\"\n    \"Ysx st 1\\n\"\n    \"Rzd de 1\\n\"\n    \"xLd de 1\\n\"\n    \"qsX qu 1\\n\"\n    \"kqJ qu 1\\n\"\n    \"kCm ka 1\\n\"\n    \"bFm me 1\\n\"\n    \"igQ ng 1\\n\"\n    \"sRq qu 1\\n\"\n    \"jGm ij 1\\n\"\n    \"Szs st 1\\n\"\n    \"Yvz sz 1\\n\"\n    \"kXz sz 1\\n\"\n    \"Gnz an 1\\n\"\n    \"mWc ch 1\\n\"\n    \"tDq th 1\\n\"\n    \"gqz ng 1\\n\"\n    \"nHb ng 1\\n\"\n    \"tdM th 1\\n\"\n    \"Ovx va 1\\n\"\n    \"Znl an 1\\n\"\n    \"wuE qu 1\\n\"\n    \"zLt th 1\\n\"\n    \"ofQ on 1\\n\"\n    \"vYj ij 1\\n\"\n    \"jyH ij 1\\n\"\n    \"zqA qu 1\\n\"\n    \"cJy ch 1\\n\"\n    \"Wbf be 1\\n\"\n    \"lTt th 1\\n\"\n    \"klW le 1\\n\"\n    \"Xxa an 1\\n\"\n    \"fCz sz 1\\n\"\n    \"lKf le 1\\n\"\n    \"qwT qu 1\\n\"\n    \"rHk er 1\\n\"\n    \"dbN de 1\\n\"\n    \"uUy qu 1\\n\"\n    \"zgN ng 1\\n\"\n    \"Pxg ng 1\\n\"\n    \"pNc ch 1\\n\"\n    \"cyJ ch 1\\n\"\n    \"jpH ij 1\\n\"\n    \"Vtf th 1\\n\"\n    \"sjJ st 1\\n\"\n    \"Qlh th 1\\n\"\n    \"twV th 1\\n\"\n    \"yGq qu 1\\n\"\n    \"tVp th 1\\n\"\n    \"ksQ st 1\\n\"\n    \"xnT an 1\\n\"\n    \"rpJ er 1\\n\"\n    \"wzI sz 1\\n\"\n    \"Zhp th 1\\n\"\n    \"aDf an 1\\n\"\n    \"Uxj ij 1\\n\"\n    \"cPg ch 1\\n\"\n    \"qSq qu 
1\\n\"\n    \"mKq qu 1\\n\"\n    \"vBz sz 1\\n\"\n    \"yPj ij 1\\n\"\n    \"Vkz sz 1\\n\"\n    \"qiB qu 1\\n\"\n    \"tkJ th 1\\n\"\n    \"Ouq qu 1\\n\"\n    \"zoH on 1\\n\"\n    \"qVt th 1\\n\"\n    \"Gxs st 1\\n\"\n    \"jzF sz 1\\n\"\n    \"swH st 1\\n\"\n    \"nBb an 1\\n\"\n    \"zhQ th 1\\n\"\n    \"yRn an 1\\n\"\n    \"fnX an 1\\n\"\n    \"qoQ qu 1\\n\"\n    \"mxP me 1\\n\"\n    \"bwR wa 1\\n\"\n    \"gJj ng 1\\n\"\n    \"qnk an 1\\n\"\n    \"tMk th 1\\n\"\n    \"dxO de 1\\n\"\n    \"rzV er 1\\n\"\n    \"vpP va 1\\n\"\n    \"Nvz sz 1\\n\"\n    \"Nfp pr 1\\n\"\n    \"Cnz an 1\\n\"\n    \"oTd on 1\\n\"\n    \"dqG qu 1\\n\"\n    \"Hmx me 1\\n\"\n    \"psX st 1\\n\"\n    \"swM st 1\\n\"\n    \"dqC qu 1\\n\"\n    \"Vwx wa 1\\n\"\n    \"nXf an 1\\n\"\n    \"wkY ka 1\\n\"\n    \"wfC wa 1\\n\"\n    \"qSr qu 1\\n\"\n    \"qVc ch 1\\n\"\n    \"kDn an 1\\n\"\n    \"Yvb va 1\\n\"\n    \"zqH qu 1\\n\"\n    \"qxJ qu 1\\n\"\n    \"zKj sz 1\\n\"\n    \"jcN ch 1\\n\"\n    \"tWk th 1\\n\"\n    \"Rrz er 1\\n\"\n    \"bmG me 1\\n\"\n    \"srZ er 1\\n\"\n    \"wWq qu 1\\n\"\n    \"Cfh th 1\\n\"\n    \"lNt th 1\\n\"\n    \"hcV th 1\\n\"\n    \"Znf an 1\\n\"\n    \"Jhv th 1\\n\"\n    \"qIp qu 1\\n\"\n    \"vSz sz 1\\n\"\n    \"feU er 1\\n\"\n    \"xIi in 1\\n\"\n    \"Zmq qu 1\\n\"\n    \"eGf er 1\\n\"\n    \"bQk ka 1\\n\"\n    \"Xcb ch 1\\n\"\n    \"nlK an 1\\n\"\n    \"tmJ th 1\\n\"\n    \"jlL le 1\\n\"\n    \"mwC me 1\\n\"\n    \"qjr qu 1\\n\"\n    \"zBb sz 1\\n\"\n    \"fhU th 1\\n\"\n    \"sPq qu 1\\n\"\n    \"sBf st 1\\n\"\n    \"uXy qu 1\\n\"\n    \"Lkx ka 1\\n\"\n    \"rGz er 1\\n\"\n    \"hXz th 1\\n\"\n    \"zuW qu 1\\n\"\n    \"Rvx va 1\\n\"\n    \"bcJ ch 1\\n\"\n    \"Eoj on 1\\n\"\n    \"iVt in 1\\n\"\n    \"yhH th 1\\n\"\n    \"xVv va 1\\n\"\n    \"pMr er 1\\n\"\n    \"vZd de 1\\n\"\n    \"Vvn an 1\\n\"\n    \"iCv in 1\\n\"\n    \"vQp va 1\\n\"\n    \"vlB le 1\\n\"\n    \"wVt th 1\\n\"\n    \"Ugk ng 1\\n\"\n    \"ktQ th 1\\n\"\n    \"jCr er 1\\n\"\n    \"qvz qu 
1\\n\"\n    \"bVf be 1\\n\"\n    \"rPv er 1\\n\"\n    \"wfH wa 1\\n\"\n    \"hbU th 1\\n\"\n    \"pjF ij 1\\n\"\n    \"oXg ng 1\\n\"\n    \"zSr er 1\\n\"\n    \"wRb wa 1\\n\"\n    \"Hcu ch 1\\n\"\n    \"yxJ ny 1\\n\"\n    \"lTc ch 1\\n\"\n    \"bYb be 1\\n\"\n    \"Wxz sz 1\\n\"\n    \"vrE er 1\\n\"\n    \"zGy sz 1\\n\"\n    \"Jqm qu 1\\n\"\n    \"rzI er 1\\n\"\n    \"xgV gi 1\\n\"\n    \"Rvw va 1\\n\"\n    \"Vnx an 1\\n\"\n    \"uJg ng 1\\n\"\n    \"hFq th 1\\n\"\n    \"Tgz ng 1\\n\"\n    \"aQc an 1\\n\"\n    \"xzJ sz 1\\n\"\n    \"tNc th 1\\n\"\n    \"jfA ij 1\\n\"\n    \"ycO ch 1\\n\"\n    \"Wkj ij 1\\n\"\n    \"yBp pr 1\\n\"\n    \"hgD th 1\\n\"\n    \"iSx in 1\\n\"\n    \"xCm me 1\\n\"\n    \"yjX ij 1\\n\"\n    \"uIh th 1\\n\"\n    \"qgq ng 1\\n\"\n    \"Tzj sz 1\\n\"\n    \"yjO ij 1\\n\"\n    \"yrY er 1\\n\"\n    \"bmZ me 1\\n\"\n    \"zqT qu 1\\n\"\n    \"mBd de 1\\n\"\n    \"qvK qu 1\\n\"\n    \"zcA ch 1\\n\"\n    \"xrX er 1\\n\"\n    \"mJm me 1\\n\"\n    \"Xqf qu 1\\n\"\n    \"Pxk ka 1\\n\"\n    \"aDb an 1\\n\"\n    \"qXg ng 1\\n\"\n    \"eGw er 1\\n\"\n    \"hjD th 1\\n\"\n    \"tTx th 1\\n\"\n    \"oMd on 1\\n\"\n    \"fKg ng 1\\n\"\n    \"Npn an 1\\n\"\n    \"kqU qu 1\\n\"\n    \"lbF le 1\\n\"\n    \"Hvj ij 1\\n\"\n    \"qZe qu 1\\n\"\n    \"lQj le 1\\n\"\n    \"dkY de 1\\n\"\n    \"dZl le 1\\n\"\n    \"zZh th 1\\n\"\n    \"qyM qu 1\\n\"\n    \"dmJ de 1\\n\"\n    \"kfK ka 1\\n\"\n    \"iPq qu 1\\n\"\n    \"zwU sz 1\\n\"\n    \"pvS va 1\\n\"\n    \"ihJ th 1\\n\"\n    \"ucW ch 1\\n\"\n    \"Jjz sz 1\\n\"\n    \"mMd de 1\\n\"\n    \"vpw va 1\\n\"\n    \"xCg ng 1\\n\"\n    \"hKs th 1\\n\"\n    \"vlI le 1\\n\"\n    \"Nmc ch 1\\n\"\n    \"xzV sz 1\\n\"\n    \"gZs ng 1\\n\"\n    \"rRp er 1\\n\"\n    \"Ufd de 1\\n\"\n    \"fpF pr 1\\n\"\n    \"fwY wa 1\\n\"\n    \"Gxr er 1\\n\"\n    \"xLr er 1\\n\"\n    \"vzE sz 1\\n\"\n    \"jRf ij 1\\n\"\n    \"brR er 1\\n\"\n    \"gkZ ng 1\\n\"\n    \"dUy de 1\\n\"\n    \"Xji in 1\\n\"\n    \"Kdb de 1\\n\"\n    \"jpC ij 
1\\n\"\n    \"oUj on 1\\n\"\n    \"qmh th 1\\n\"\n    \"qjL qu 1\\n\"\n    \"wRs sz 1\\n\"\n    \"jhM th 1\\n\"\n    \"Rhr th 1\\n\"\n    \"btN th 1\\n\"\n    \"Pjq ij 1\\n\"\n    \"xwU wa 1\\n\"\n    \"qyE qu 1\\n\"\n    \"Jxd de 1\\n\"\n    \"Pqr qu 1\\n\"\n    \"lRd le 1\\n\"\n    \"jqI qu 1\\n\"\n    \"qFs qu 1\\n\"\n    \"Mwk ka 1\\n\"\n    \"jEb ij 1\\n\"\n    \"Nxy ny 1\\n\"\n    \"Pzm sz 1\\n\"\n    \"tfL th 1\\n\"\n    \"vFc ch 1\\n\"\n    \"jQg ng 1\\n\"\n    \"Bnx an 1\\n\"\n    \"lMv le 1\\n\"\n    \"tKq th 1\\n\"\n    \"eVq qu 1\\n\"\n    \"Tyq qu 1\\n\"\n    \"drJ er 1\\n\"\n    \"oHw on 1\\n\"\n    \"lFk le 1\\n\"\n    \"jpW ij 1\\n\"\n    \"Qjw ij 1\\n\"\n    \"cNx ch 1\\n\"\n    \"Bhz th 1\\n\"\n    \"bhB th 1\\n\"\n    \"pDx pr 1\\n\"\n    \"xpY pr 1\\n\"\n    \"tnH th 1\\n\"\n    \"dfL de 1\\n\"\n    \"hzL th 1\\n\"\n    \"zNk sz 1\\n\"\n    \"lBm le 1\\n\"\n    \"lXl le 1\\n\"\n    \"yPv va 1\\n\"\n    \"Zcl ch 1\\n\"\n    \"hMq th 1\\n\"\n    \"rJj ri 1\\n\"\n    \"aXw an 1\\n\"\n    \"zsQ sz 1\\n\"\n    \"cQm ch 1\\n\"\n    \"Sqc ch 1\\n\"\n    \"tKm th 1\\n\"\n    \"hvO th 1\\n\"\n    \"hGd th 1\\n\"\n    \"Wbn an 1\\n\"\n    \"vCf va 1\\n\"\n    \"lGg ng 1\\n\"\n    \"vDh th 1\\n\"\n    \"wDq qu 1\\n\"\n    \"xRy ny 1\\n\"\n    \"vXi in 1\\n\"\n    \"qiQ qu 1\\n\"\n    \"cFs ch 1\\n\"\n    \"Lhp th 1\\n\"\n    \"xEp pr 1\\n\"\n    \"fQt th 1\\n\"\n    \"cJv ch 1\\n\"\n    \"lzO le 1\\n\"\n    \"Fxk ka 1\\n\"\n    \"tDd th 1\\n\"\n    \"Xnx an 1\\n\"\n    \"txC th 1\\n\"\n    \"tGb th 1\\n\"\n    \"zvG sz 1\\n\"\n    \"gpC ng 1\\n\"\n    \"pxD pr 1\\n\"\n    \"Zfp pr 1\\n\"\n    \"oWt th 1\\n\"\n    \"vvV va 1\\n\"\n    \"Gwf wa 1\\n\"\n    \"Ycv ch 1\\n\"\n    \"gcZ ch 1\\n\"\n    \"mMw me 1\\n\"\n    \"yQl le 1\\n\"\n    \"uGp qu 1\\n\"\n    \"lNj le 1\\n\"\n    \"Ycm ch 1\\n\"\n    \"vIx va 1\\n\"\n    \"yLp pr 1\\n\"\n    \"mRx me 1\\n\"\n    \"nrK an 1\\n\"\n    \"Zyh th 1\\n\"\n    \"Nct th 1\\n\"\n    \"Qml le 1\\n\"\n    \"zPd de 
1\\n\"\n    \"dWq qu 1\\n\"\n    \"Egx ng 1\\n\"\n    \"vNs st 1\\n\"\n    \"sNl le 1\\n\"\n    \"pdW de 1\\n\"\n    \"Snh th 1\\n\"\n    \"yrP er 1\\n\"\n    \"fJl le 1\\n\"\n    \"tVg th 1\\n\"\n    \"jvC ij 1\\n\"\n    \"yhN th 1\\n\"\n    \"qdC qu 1\\n\"\n    \"pmT me 1\\n\"\n    \"Lbg ng 1\\n\"\n    \"xpJ pr 1\\n\"\n    \"mYt th 1\\n\"\n    \"bwV wa 1\\n\"\n    \"wjD ij 1\\n\"\n    \"fqC qu 1\\n\"\n    \"xUf fo 1\\n\"\n    \"dhU th 1\\n\"\n    \"bZb be 1\\n\"\n    \"twD th 1\\n\"\n    \"bbM be 1\\n\"\n    \"hgC th 1\\n\"\n    \"dKb de 1\\n\"\n    \"vJm va 1\\n\"\n    \"wEq qu 1\\n\"\n    \"Ofq qu 1\\n\"\n    \"cXl ch 1\\n\"\n    \"wpV pr 1\\n\"\n    \"tqM th 1\\n\"\n    \"pUf pr 1\\n\"\n    \"Twx wa 1\\n\"\n    \"Mgq ng 1\\n\"\n    \"vQo on 1\\n\"\n    \"yjT ij 1\\n\"\n    \"aVd an 1\\n\"\n    \"eHp er 1\\n\"\n    \"vGv va 1\\n\"\n    \"srG er 1\\n\"\n    \"qVb qu 1\\n\"\n    \"tlM th 1\\n\"\n    \"nrT an 1\\n\"\n    \"zRh th 1\\n\"\n    \"cLr ch 1\\n\"\n    \"lrH er 1\\n\"\n    \"wTl le 1\\n\"\n    \"cvI ch 1\\n\"\n    \"kqN qu 1\\n\"\n    \"Ixp pr 1\\n\"\n    \"xeQ er 1\\n\"\n    \"cNy ch 1\\n\"\n    \"kRh th 1\\n\"\n    \"ruY qu 1\\n\"\n    \"Xcq ch 1\\n\"\n    \"Kzb bi 1\\n\"\n    \"Wxh th 1\\n\"\n    \"pjM ij 1\\n\"\n    \"jdO de 1\\n\"\n    \"Jfy ny 1\\n\"\n    \"bVz sz 1\\n\"\n    \"dQo on 1\\n\"\n    \"ncQ an 1\\n\"\n    \"pVw pr 1\\n\"\n    \"Sxj ij 1\\n\"\n    \"Ubp pr 1\\n\"\n    \"wvC va 1\\n\"\n    \"khG th 1\\n\"\n    \"cqF ch 1\\n\"\n    \"Nxj ij 1\\n\"\n    \"wDm me 1\\n\"\n    \"yDd de 1\\n\"\n    \"iyI in 1\\n\"\n    \"eXq qu 1\\n\"\n    \"hqP th 1\\n\"\n    \"Kxr er 1\\n\"\n    \"vsY st 1\\n\"\n    \"Twb wa 1\\n\"\n    \"fqw qu 1\\n\"\n    \"wmC me 1\\n\"\n    \"vFx va 1\\n\"\n    \"vnC an 1\\n\"\n    \"nWq an 1\\n\"\n    \"hzB th 1\\n\"\n    \"Kfk ka 1\\n\"\n    \"tQe th 1\\n\"\n    \"juW qu 1\\n\"\n    \"qlX qu 1\\n\"\n    \"hGw th 1\\n\"\n    \"Oqd qu 1\\n\"\n    \"Npw pr 1\\n\"\n    \"hgW th 1\\n\"\n    \"fxM fo 1\\n\"\n    \"jSy ij 
1\\n\"\n    \"fJt th 1\\n\"\n    \"mjG ij 1\\n\"\n    \"tgV th 1\\n\"\n    \"Ogx ng 1\\n\"\n    \"Hbx be 1\\n\"\n    \"Ljl le 1\\n\"\n    \"ivZ in 1\\n\"\n    \"bmY me 1\\n\"\n    \"Qfp pr 1\\n\"\n    \"wfQ wa 1\\n\"\n    \"hCg th 1\\n\"\n    \"vuU qu 1\\n\"\n    \"ydZ de 1\\n\"\n    \"vVk ka 1\\n\"\n    \"mZf me 1\\n\"\n    \"lOq qu 1\\n\"\n    \"qIv qu 1\\n\"\n    \"xZb be 1\\n\"\n    \"xqk qu 1\\n\"\n    \"Wmy me 1\\n\"\n    \"Jqi qu 1\\n\"\n    \"cxL ch 1\\n\"\n    \"Ztq th 1\\n\"\n    \"tdT th 1\\n\"\n    \"uWt th 1\\n\"\n    \"xGz sz 1\\n\"\n    \"Wwk ka 1\\n\"\n    \"pBk ka 1\\n\"\n    \"yqg ng 1\\n\"\n    \"cYl ch 1\\n\"\n    \"ynW an 1\\n\"\n    \"wyJ wa 1\\n\"\n    \"qGy qu 1\\n\"\n    \"fNp pr 1\\n\"\n    \"hFs th 1\\n\"\n    \"Yxu qu 1\\n\"\n    \"kvJ ka 1\\n\"\n    \"Fxz sz 1\\n\"\n    \"twG th 1\\n\"\n    \"qvG qu 1\\n\"\n    \"vRp va 1\\n\"\n    \"Qqi qu 1\\n\"\n    \"gzE ng 1\\n\"\n    \"pNl le 1\\n\"\n    \"zpW sz 1\\n\"\n    \"dcP ch 1\\n\"\n    \"cPx ch 1\\n\"\n    \"wcQ ch 1\\n\"\n    \"pQc ch 1\\n\"\n    \"qyF qu 1\\n\"\n    \"zcX ch 1\\n\"\n    \"wqk qu 1\\n\"\n    \"kmY ka 1\\n\"\n    \"qlG qu 1\\n\"\n    \"xEz sz 1\\n\"\n    \"pqV qu 1\\n\"\n    \"Ohp th 1\\n\"\n    \"xdM de 1\\n\"\n    \"fLp pr 1\\n\"\n    \"qAe qu 1\\n\"\n    \"Xwv va 1\\n\"\n    \"Lzi in 1\\n\"\n    \"qOk qu 1\\n\"\n    \"cXn an 1\\n\"\n    \"Kds de 1\\n\"\n    \"gvU ng 1\\n\"\n    \"fPk ka 1\\n\"\n    \"nZr an 1\\n\"\n    \"Hxq qu 1\\n\"\n    \"fCm me 1\\n\"\n    \"qfD qu 1\\n\"\n    \"Wfv va 1\\n\"\n    \"qfb qu 1\\n\"\n    \"jqC qu 1\\n\"\n    \"fuX qu 1\\n\"\n    \"qfA qu 1\\n\"\n    \"Rlt th 1\\n\"\n    \"xjD ij 1\\n\"\n    \"wtF th 1\\n\"\n    \"Xmz sz 1\\n\"\n    \"pWp pr 1\\n\"\n    \"Qxv va 1\\n\"\n    \"zVf sz 1\\n\"\n    \"gmZ ng 1\\n\"\n    \"qdU qu 1\\n\"\n    \"jqV qu 1\\n\"\n    \"gXc ch 1\\n\"\n    \"qmK qu 1\\n\"\n    \"Gfj ij 1\\n\"\n    \"cQr ch 1\\n\"\n    \"Yhr th 1\\n\"\n    \"vvS va 1\\n\"\n    \"uDb qu 1\\n\"\n    \"cdB ch 1\\n\"\n    \"bvE va 
1\\n\"\n    \"xvS va 1\\n\"\n    \"jRq qu 1\\n\"\n    \"rvD er 1\\n\"\n    \"Xyy ny 1\\n\"\n    \"Jfi in 1\\n\"\n    \"aBw an 1\\n\"\n    \"nWc an 1\\n\"\n    \"xBq qu 1\\n\"\n    \"kgY ng 1\\n\"\n    \"bGb bi 1\\n\"\n    \"gjE ng 1\\n\"\n    \"Rlw le 1\\n\"\n    \"wrT er 1\\n\"\n    \"bQr er 1\\n\"\n    \"ljY le 1\\n\"\n    \"qvU qu 1\\n\"\n    \"fKm me 1\\n\"\n    \"pTt th 1\\n\"\n    \"zTw sz 1\\n\"\n    \"qnV an 1\\n\"\n    \"rWx er 1\\n\"\n    \"nWd an 1\\n\"\n    \"nKf an 1\\n\"\n    \"kMf ka 1\\n\"\n    \"fkG ka 1\\n\"\n    \"bwX wa 1\\n\"\n    \"cwV ch 1\\n\"\n    \"uwK qu 1\\n\"\n    \"rLv er 1\\n\"\n    \"zMb sz 1\\n\"\n    \"zpZ sz 1\\n\"\n    \"rMq qu 1\\n\"\n    \"Ttj th 1\\n\"\n    \"gvO ng 1\\n\"\n    \"Jcz ch 1\\n\"\n    \"Cyx ny 1\\n\"\n    \"njX an 1\\n\"\n    \"aVx an 1\\n\"\n    \"qXn an 1\\n\"\n    \"Uqs qu 1\\n\"\n    \"dVz de 1\\n\"\n    \"Rcp ch 1\\n\"\n    \"eKg ng 1\\n\"\n    \"Xzn in 1\\n\"\n    \"vyF va 1\\n\"\n    \"Klc ch 1\\n\"\n    \"xdI de 1\\n\"\n    \"Hqb qu 1\\n\"\n    \"xEe er 1\\n\"\n    \"qpI qu 1\\n\"\n    \"gDx ng 1\\n\"\n    \"Jhf th 1\\n\"\n    \"quK un 1\\n\"\n    \"vgU ng 1\\n\"\n    \"rWv er 1\\n\"\n    \"Pnm an 1\\n\"\n    \"nLm an 1\\n\"\n    \"Bhj th 1\\n\"\n    \"bPt th 1\\n\"\n    \"jpI ij 1\\n\"\n    \"tLz th 1\\n\"\n    \"vpS va 1\\n\"\n    \"Fxj ij 1\\n\"\n    \"qDs qu 1\\n\"\n    \"wzM sz 1\\n\"\n    \"gwJ ng 1\\n\"\n    \"zBw sz 1\\n\"\n    \"qGv qu 1\\n\"\n    \"rLh th 1\\n\"\n    \"Bjl le 1\\n\"\n    \"hfH th 1\\n\"\n    \"clW ch 1\\n\"\n    \"Rgk ng 1\\n\"\n    \"Gsg ng 1\\n\"\n    \"Uvx va 1\\n\"\n    \"Qgv ng 1\\n\"\n    \"gfX ng 1\\n\"\n    \"rQv er 1\\n\"\n    \"xvG va 1\\n\"\n    \"kjx ij 1\\n\"\n    \"dGf de 1\\n\"\n    \"fcA ch 1\\n\"\n    \"Ehq th 1\\n\"\n    \"zBz sz 1\\n\"\n    \"Gpk ka 1\\n\"\n    \"tBv th 1\\n\"\n    \"Xfg ng 1\\n\"\n    \"yJm me 1\\n\"\n    \"sqT qu 1\\n\"\n    \"prY er 1\\n\"\n    \"Dqo qu 1\\n\"\n    \"Jzg ng 1\\n\"\n    \"qMp qu 1\\n\"\n    \"yfM ny 1\\n\"\n    \"Gxf fo 
1\\n\"\n    \"wzP sz 1\\n\"\n    \"zNm sz 1\\n\"\n    \"wKg ng 1\\n\"\n    \"Rrd er 1\\n\"\n    \"Hvw va 1\\n\"\n    \"gfD ng 1\\n\"\n    \"Wmz sz 1\\n\"\n    \"cJn an 1\\n\"\n    \"nTf an 1\\n\"\n    \"uvW qu 1\\n\"\n    \"uPf qu 1\\n\"\n    \"vwR va 1\\n\"\n    \"bMf be 1\\n\"\n    \"wIu qu 1\\n\"\n    \"kxY ka 1\\n\"\n    \"gZk ng 1\\n\"\n    \"qFd qu 1\\n\"\n    \"bMl le 1\\n\"\n    \"wHl le 1\\n\"\n    \"wVg ng 1\\n\"\n    \"wlX le 1\\n\"\n    \"fsL st 1\\n\"\n    \"pRf pr 1\\n\"\n    \"zsX st 1\\n\"\n    \"qBk qu 1\\n\"\n    \"Xzp sz 1\\n\"\n    \"jdR de 1\\n\"\n    \"Zlz le 1\\n\"\n    \"Wfc ch 1\\n\"\n    \"Rjv ij 1\\n\"\n    \"vFz sz 1\\n\"\n    \"tkV th 1\\n\"\n    \"Xbw wa 1\\n\"\n    \"xQc ch 1\\n\"\n    \"Kxy ny 1\\n\"\n    \"xCv va 1\\n\"\n    \"nqV an 1\\n\"\n    \"Wwx wa 1\\n\"\n    \"kdW de 1\\n\"\n    \"pkI ka 1\\n\"\n    \"ohS th 1\\n\"\n    \"Zdc ch 1\\n\"\n    \"mCg ng 1\\n\"\n    \"sxL st 1\\n\"\n    \"Qrx er 1\\n\"\n    \"qXw qu 1\\n\"\n    \"wqQ qu 1\\n\"\n    \"ijK in 1\\n\"\n    \"sFz st 1\\n\"\n    \"Hlw le 1\\n\"\n    \"Gqn an 1\\n\"\n    \"xPk ka 1\\n\"\n    \"wZq qu 1\\n\"\n    \"jqm qu 1\\n\"\n    \"Lzp sz 1\\n\"\n    \"Bdz de 1\\n\"\n    \"wQl le 1\\n\"\n    \"wtJ th 1\\n\"\n    \"Uyi in 1\\n\"\n    \"Wcy ch 1\\n\"\n    \"wqH qu 1\\n\"\n    \"Bns an 1\\n\"\n    \"cDt th 1\\n\"\n    \"xJv va 1\\n\"\n    \"Wfz sz 1\\n\"\n    \"xhP th 1\\n\"\n    \"cWp ch 1\\n\"\n    \"rqZ qu 1\\n\"\n    \"bkB ka 1\\n\"\n    \"Wtl th 1\\n\"\n    \"gzf ng 1\\n\"\n    \"bMr er 1\\n\"\n    \"pxN pr 1\\n\"\n    \"vhV th 1\\n\"\n    \"kqX qu 1\\n\"\n    \"Kdq qu 1\\n\"\n    \"vQl le 1\\n\"\n    \"ykC ka 1\\n\"\n    \"zMh th 1\\n\"\n    \"Eqz qu 1\\n\"\n    \"lXq qu 1\\n\"\n    \"zmZ sz 1\\n\"\n    \"qpB qu 1\\n\"\n    \"vGj ij 1\\n\"\n    \"Tjx zj 1\\n\"\n    \"tvK th 1\\n\"\n    \"gYc ch 1\\n\"\n    \"lFc ch 1\\n\"\n    \"iJt th 1\\n\"\n    \"Pkx ka 1\\n\"\n    \"cDv ch 1\\n\"\n    \"Yyd de 1\\n\"\n    \"Vcq ch 1\\n\"\n    \"Xhq th 1\\n\"\n    \"zNf sz 
1\\n\"\n    \"vcD ch 1\\n\"\n    \"bnW an 1\\n\"\n    \"uvQ qu 1\\n\"\n    \"Zzj sz 1\\n\"\n    \"gPj ng 1\\n\"\n    \"jwD ij 1\\n\"\n    \"jpO ij 1\\n\"\n    \"bDx be 1\\n\"\n    \"vEi in 1\\n\"\n    \"Zct th 1\\n\"\n    \"wrX er 1\\n\"\n    \"dhS th 1\\n\"\n    \"zjJ sz 1\\n\"\n    \"dDk de 1\\n\"\n    \"srJ er 1\\n\"\n    \"aWg an 1\\n\"\n    \"mvJ va 1\\n\"\n    \"Ytc th 1\\n\"\n    \"jiQ in 1\\n\"\n    \"tFz th 1\\n\"\n    \"sJl le 1\\n\"\n    \"vZq qu 1\\n\"\n    \"xUd de 1\\n\"\n    \"oqB qu 1\\n\"\n    \"xDh th 1\\n\"\n    \"hfE th 1\\n\"\n    \"mSb me 1\\n\"\n    \"jmR ij 1\\n\"\n    \"rFp er 1\\n\"\n    \"Xjy ij 1\\n\"\n    \"bPp pr 1\\n\"\n    \"iqQ ti 1\\n\"\n    \"mfq qu 1\\n\"\n    \"txL th 1\\n\"\n    \"jBd de 1\\n\"\n    \"Xvq qu 1\\n\"\n    \"dvY de 1\\n\"\n    \"sdM de 1\\n\"\n    \"xgY ng 1\\n\"\n    \"rYh th 1\\n\"\n    \"vlA le 1\\n\"\n    \"pFb pr 1\\n\"\n    \"yFz sz 1\\n\"\n    \"gcK ch 1\\n\"\n    \"xfZ fo 1\\n\"\n    \"jDc ch 1\\n\"\n    \"yNv va 1\\n\"\n    \"tKt th 1\\n\"\n    \"wtU th 1\\n\"\n    \"bHk ka 1\\n\"\n    \"qCw qu 1\\n\"\n    \"Zca an 1\\n\"\n    \"kDw ka 1\\n\"\n    \"Ywc ch 1\\n\"\n    \"pXs st 1\\n\"\n    \"yMm me 1\\n\"\n    \"Gwq qu 1\\n\"\n    \"mYv va 1\\n\"\n    \"wCx wa 1\\n\"\n    \"jZx ij 1\\n\"\n    \"oQd on 1\\n\"\n    \"Fzk sz 1\\n\"\n    \"lwF le 1\\n\"\n    \"Xzk sz 1\\n\"\n    \"Njx ij 1\\n\"\n    \"yoI on 1\\n\"\n    \"sJm st 1\\n\"\n    \"wKk ka 1\\n\"\n    \"Qth ch 1\\n\"\n    \"Llz le 1\\n\"\n    \"gVf gi 1\\n\"\n    \"pPq qu 1\\n\"\n    \"lGy le 1\\n\"\n    \"gzR ng 1\\n\"\n    \"rXg ng 1\\n\"\n    \"Npf pr 1\\n\"\n    \"wvR va 1\\n\"\n    \"yXs st 1\\n\"\n    \"mMl li 1\\n\"\n    \"bYx be 1\\n\"\n    \"fzZ sz 1\\n\"\n    \"vrG er 1\\n\"\n    \"Kdk de 1\\n\"\n    \"yqw qu 1\\n\"\n    \"Lkq qu 1\\n\"\n    \"jKs st 1\\n\"\n    \"Zqx qu 1\\n\"\n    \"Pfm me 1\\n\"\n    \"rlW er 1\\n\"\n    \"hPv th 1\\n\"\n    \"Ojx ij 1\\n\"\n    \"Gtq th 1\\n\"\n    \"vtJ th 1\\n\"\n    \"Wly le 1\\n\"\n    \"yHd de 
1\\n\"\n    \"kQb ka 1\\n\"\n    \"Ldc de 1\\n\"\n    \"sUx st 1\\n\"\n    \"cJg ch 1\\n\"\n    \"fLd de 1\\n\"\n    \"Mjq qu 1\\n\"\n    \"Cjm ij 1\\n\"\n    \"awX an 1\\n\"\n    \"Gtl th 1\\n\"\n    \"wzN sz 1\\n\"\n    \"bqx qu 1\\n\"\n    \"fAq qu 1\\n\"\n    \"ezX er 1\\n\"\n    \"cBx ch 1\\n\"\n    \"csX ch 1\\n\"\n    \"cUf ch 1\\n\"\n    \"qsJ qu 1\\n\"\n    \"hsZ th 1\\n\"\n    \"qzg ng 1\\n\"\n    \"Qgk ng 1\\n\"\n    \"Nxg ng 1\\n\"\n    \"Hqa an 1\\n\"\n    \"rXl er 1\\n\"\n    \"nlP an 1\\n\"\n    \"aVg an 1\\n\"\n    \"yhG th 1\\n\"\n    \"kfA ka 1\\n\"\n    \"Vmk mG 1\\n\"\n    \"jKm ij 1\\n\"\n    \"hPd th 1\\n\"\n    \"aPd an 1\\n\"\n    \"bYy be 1\\n\"\n    \"bnZ an 1\\n\"\n    \"Gsj st 1\\n\"\n    \"kxQ ka 1\\n\"\n    \"vkF ka 1\\n\"\n    \"jzS sz 1\\n\"\n    \"fWm me 1\\n\"\n    \"Qcu ch 1\\n\"\n    \"rZf er 1\\n\"\n    \"jbZ ij 1\\n\"\n    \"aQj an 1\\n\"\n    \"bzO sz 1\\n\"\n    \"fZq qu 1\\n\"\n    \"lrN er 1\\n\"\n    \"fkL ka 1\\n\"\n    \"Dqv qu 1\\n\"\n    \"zkC sz 1\\n\"\n    \"sLw st 1\\n\"\n    \"Nvr er 1\\n\"\n    \"Nby be 1\\n\"\n    \"eMh th 1\\n\"\n    \"wFc ch 1\\n\"\n    \"Cxz sz 1\\n\"\n    \"iZp in 1\\n\"\n    \"dvZ de 1\\n\"\n    \"vIh th 1\\n\"\n    \"qCl qu 1\\n\"\n    \"Pzo on 1\\n\"\n    \"vNq qu 1\\n\"\n    \"zqK qu 1\\n\"\n    \"Lmx me 1\\n\"\n    \"xVt th 1\\n\"\n    \"glD ng 1\\n\"\n    \"Gbf be 1\\n\"\n    \"Jvq qu 1\\n\"\n    \"zFw sz 1\\n\"\n    \"tMq th 1\\n\"\n    \"vkJ ka 1\\n\"\n    \"Sxu qu 1\\n\"\n    \"afU an 1\\n\"\n    \"mHb me 1\\n\"\n    \"jxU ij 1\\n\"\n    \"cJl ch 1\\n\"\n    \"uqE qu 1\\n\"\n    \"Nqq qu 1\\n\"\n    \"xGt th 1\\n\"\n    \"czG ch 1\\n\"\n    \"Kfg ng 1\\n\"\n    \"zWh th 1\\n\"\n    \"yXm me 1\\n\"\n    \"fnD an 1\\n\"\n    \"Jrd er 1\\n\"\n    \"oxZ on 1\\n\"\n    \"hXn th 1\\n\"\n    \"fqI qu 1\\n\"\n    \"wAo on 1\\n\"\n    \"iGk in 1\\n\"\n    \"xEw wa 1\\n\"\n    \"fVq qu 1\\n\"\n    \"ytU th 1\\n\"\n    \"bhG th 1\\n\"\n    \"oQz on 1\\n\"\n    \"pgO ng 1\\n\"\n    \"Yqm qu 
1\\n\"\n    \"bJi in 1\\n\"\n    \"kcV ch 1\\n\"\n    \"knM an 1\\n\"\n    \"Cwr er 1\\n\"\n    \"Wgd ng 1\\n\"\n    \"bpT pr 1\\n\"\n    \"Jdj de 1\\n\"\n    \"Nbq qu 1\\n\"\n    \"twJ th 1\\n\"\n    \"Qep er 1\\n\"\n    \"Kdc ch 1\\n\"\n    \"kQq qu 1\\n\"\n    \"rPq qu 1\\n\"\n    \"lWp le 1\\n\"\n    \"Fbq qu 1\\n\"\n    \"bVk ka 1\\n\"\n    \"zlI le 1\\n\"\n    \"Bzp sz 1\\n\"\n    \"jfK ij 1\\n\"\n    \"Yvm va 1\\n\"\n    \"Ftm th 1\\n\"\n    \"aMj an 1\\n\"\n    \"zzV sz 1\\n\"\n    \"zOa an 1\\n\"\n    \"mHc ch 1\\n\"\n    \"xWn an 1\\n\"\n    \"fFh th 1\\n\"\n    \"sDv st 1\\n\"\n    \"vmD va 1\\n\"\n    \"xjL ij 1\\n\"\n    \"iBq qu 1\\n\"\n    \"jqT qu 1\\n\"\n    \"hsR th 1\\n\"\n    \"Qxo on 1\\n\"\n    \"jsG st 1\\n\"\n    \"cXb ch 1\\n\"\n    \"Ybj ij 1\\n\"\n    \"xeJ er 1\\n\"\n    \"oPq qu 1\\n\"\n    \"yXt th 1\\n\"\n    \"xvL va 1\\n\"\n    \"jcF ch 1\\n\"\n    \"kFb ka 1\\n\"\n    \"jXv ij 1\\n\"\n    \"Aox on 1\\n\"\n    \"zkQ sz 1\\n\"\n    \"fPd de 1\\n\"\n    \"Fvx va 1\\n\"\n    \"fbX be 1\\n\"\n    \"oCf on 1\\n\"\n    \"Yjd de 1\\n\"\n    \"Ppf pr 1\\n\"\n    \"Njs st 1\\n\"\n    \"cZh th 1\\n\"\n    \"vnG an 1\\n\"\n    \"cwJ cm 1\\n\"\n    \"qJl qu 1\\n\"\n    \"gNf ng 1\\n\"\n    \"Tfv va 1\\n\"\n    \"vwK va 1\\n\"\n    \"Zcs ch 1\\n\"\n    \"eBv er 1\\n\"\n    \"qLf qu 1\\n\"\n    \"Yqt th 1\\n\"\n    \"crD ch 1\\n\"\n    \"Icj ch 1\\n\"\n    \"qBl qu 1\\n\"\n    \"gzX ng 1\\n\"\n    \"ujF qu 1\\n\"\n    \"vxU va 1\\n\"\n    \"kZt th 1\\n\"\n    \"Ldh th 1\\n\"\n    \"bfM be 1\\n\"\n    \"mQm QO 1\\n\"\n    \"zlQ le 1\\n\"\n    \"jbU ij 1\\n\"\n    \"Kvz sz 1\\n\"\n    \"Uxw wa 1\\n\"\n    \"pjS ij 1\\n\"\n    \"Xvv va 1\\n\"\n    \"kjI ij 1\\n\"\n    \"cYi ch 1\\n\"\n    \"nJn an 1\\n\"\n    \"Qxz sz 1\\n\"\n    \"aNw an 1\\n\"\n    \"Jfp pr 1\\n\"\n    \"bNz sz 1\\n\"\n    \"xdQ de 1\\n\"\n    \"Bzk sz 1\\n\"\n    \"qZz qu 1\\n\"\n    \"Ycp ch 1\\n\"\n    \"pGs st 1\\n\"\n    \"kCf ka 1\\n\"\n    \"gwP ng 1\\n\"\n    \"wbV wa 
1\\n\"\n    \"Eqt eq 1\\n\"\n    \"Xhn th 1\\n\"\n    \"oUf on 1\\n\"\n    \"dKc ch 1\\n\"\n    \"sxN st 1\\n\"\n    \"Ofz sz 1\\n\"\n    \"gCp ng 1\\n\"\n    \"bhI th 1\\n\"\n    \"hgU th 1\\n\"\n    \"knU an 1\\n\"\n    \"kjT ij 1\\n\"\n    \"fsZ st 1\\n\"\n    \"lGv le 1\\n\"\n    \"wMd de 1\\n\"\n    \"ukQ qu 1\\n\"\n    \"Ghk th 1\\n\"\n    \"kRw ka 1\\n\"\n    \"zRc ch 1\\n\"\n    \"gwK ng 1\\n\"\n    \"vJp va 1\\n\"\n    \"tVc th 1\\n\"\n    \"pqT qu 1\\n\"\n    \"iYl in 1\\n\"\n    \"xLv va 1\\n\"\n    \"Xdq qu 1\\n\"\n    \"zcO ch 1\\n\"\n    \"plM le 1\\n\"\n    \"bDz sz 1\\n\"\n    \"Nmx me 1\\n\"\n    \"dKv de 1\\n\"\n    \"hPk th 1\\n\"\n    \"Tjy ij 1\\n\"\n    \"wYs st 1\\n\"\n    \"nfJ an 1\\n\"\n    \"tfC th 1\\n\"\n    \"zJt th 1\\n\"\n    \"lKp le 1\\n\"\n    \"Iyc ch 1\\n\"\n    \"xuB qu 1\\n\"\n    \"eKx er 1\\n\"\n    \"sZf st 1\\n\"\n    \"zpQ sz 1\\n\"\n    \"sfL st 1\\n\"\n    \"mjT ij 1\\n\"\n    \"zXw sz 1\\n\"\n    \"yKt th 1\\n\"\n    \"rwV er 1\\n\"\n    \"pjB ij 1\\n\"\n    \"qYb qu 1\\n\"\n    \"bYz sz 1\\n\"\n    \"qqY eq 1\\n\"\n    \"uIf qu 1\\n\"\n    \"jTc ch 1\\n\"\n    \"sqC qu 1\\n\"\n    \"uJc ch 1\\n\"\n    \"dGx de 1\\n\"\n    \"swF st 1\\n\"\n    \"Hfn an 1\\n\"\n    \"Htb th 1\\n\"\n    \"pfW hW 1\\n\"\n    \"iyG in 1\\n\"\n    \"zPc ch 1\\n\"\n    \"yzV sz 1\\n\"\n    \"pVz sz 1\\n\"\n    \"sPg ng 1\\n\"\n    \"fKj ij 1\\n\"\n    \"eFb er 1\\n\"\n    \"Qji jS 1\\n\"\n    \"mtH th 1\\n\"\n    \"wgZ ng 1\\n\"\n    \"hHd th 1\\n\"\n    \"fTt th 1\\n\"\n    \"gxZ ng 1\\n\"\n    \"Ktg th 1\\n\"\n    \"hWd th 1\\n\"\n    \"fWq qu 1\\n\"\n    \"wSv va 1\\n\"\n    \"Fzn an 1\\n\"\n    \"ghH th 1\\n\"\n    \"npW an 1\\n\"\n    \"jvP ij 1\\n\"\n    \"uYk qu 1\\n\"\n    \"Uxn an 1\\n\"\n    \"Sqg ng 1\\n\"\n    \"zcJ ch 1\\n\"\n    \"dMr er 1\\n\"\n    \"Zgc ch 1\\n\"\n    \"qGp qu 1\\n\"\n    \"oVq qu 1\\n\"\n    \"oUa an 1\\n\"\n    \"oqV qu 1\\n\"\n    \"jGs st 1\\n\"\n    \"Ybq qu 1\\n\"\n    \"qRf qu 1\\n\"\n    \"brZ er 
1\\n\"\n    \"qTv qu 1\\n\"\n    \"wZf wa 1\\n\"\n    \"gOj ng 1\\n\"\n    \"Jji in 1\\n\"\n    \"Ppx pr 1\\n\"\n    \"qwB qu 1\\n\"\n    \"qcJ ch 1\\n\"\n    \"fFz sz 1\\n\"\n    \"wwY wa 1\\n\"\n    \"kTc ch 1\\n\"\n    \"uGn an 1\\n\"\n    \"eQq qu 1\\n\"\n    \"qGk qu 1\\n\"\n    \"dpV de 1\\n\"\n    \"vTm va 1\\n\"\n    \"Ojq qu 1\\n\"\n    \"dpX de 1\\n\"\n    \"bYf be 1\\n\"\n    \"tjV th 1\\n\"\n    \"Lzn LG 1\\n\"\n    \"Yjm ij 1\\n\"\n    \"uYw qu 1\\n\"\n    \"Zdg ng 1\\n\"\n    \"hXs th 1\\n\"\n    \"Iwp pr 1\\n\"\n    \"hJw th 1\\n\"\n    \"Tfd de 1\\n\"\n    \"cxO ch 1\\n\"\n    \"Qqy qu 1\\n\"\n    \"lDv le 1\\n\"\n    \"zsO st 1\\n\"\n    \"mrG er 1\\n\"\n    \"cjJ ch 1\\n\"\n    \"dgD ng 1\\n\"\n    \"cUw ch 1\\n\"\n    \"zdB de 1\\n\"\n    \"jlU le 1\\n\"\n    \"bBf be 1\\n\"\n    \"qbJ qu 1\\n\"\n    \"qlR qu 1\\n\"\n    \"cWc ch 1\\n\"\n    \"Xgb ng 1\\n\"\n    \"zrU er 1\\n\"\n    \"bgI ng 1\\n\"\n    \"wjJ ij 1\\n\"\n    \"mvU va 1\\n\"\n    \"rCp GC 1\\n\"\n    \"nVx an 1\\n\"\n    \"xbG be 1\\n\"\n    \"tdN th 1\\n\"\n    \"yjR ij 1\\n\"\n    \"wQj ij 1\\n\"\n    \"xzZ sz 1\\n\"\n    \"qUk qu 1\\n\"\n    \"xjY ij 1\\n\"\n    \"Jxz sz 1\\n\"\n    \"xZs st 1\\n\"\n    \"vZx va 1\\n\"\n    \"lRs le 1\\n\"\n    \"vwp va 1\\n\"\n    \"wpj ij 1\\n\"\n    \"swS st 1\\n\"\n    \"Eqx qu 1\\n\"\n    \"vEw va 1\\n\"\n    \"tkQ th 1\\n\"\n    \"vgX ng 1\\n\"\n    \"Rwb wa 1\\n\"\n    \"sjW st 1\\n\"\n    \"dXm de 1\\n\"\n    \"fvY vK 1\\n\"\n    \"lrO er 1\\n\"\n    \"Ldx de 1\\n\"\n    \"cxV ch 1\\n\"\n    \"qFh th 1\\n\"\n    \"qVw qu 1\\n\"\n    \"Pyf ny 1\\n\"\n    \"Kxz sz 1\\n\"\n    \"hwJ th 1\\n\"\n    \"cpL ch 1\\n\"\n    \"Hge ng 1\\n\"\n    \"Wbh th 1\\n\"\n    \"lQq qu 1\\n\"\n    \"hDl th 1\\n\"\n    \"Zph th 1\\n\"\n    \"wZj ij 1\\n\"\n    \"Zqt th 1\\n\"\n    \"xmU me 1\\n\"\n    \"tUf th 1\\n\"\n    \"qWo qu 1\\n\"\n    \"Lrd er 1\\n\"\n    \"pQs st 1\\n\"\n    \"rZv er 1\\n\"\n    \"mjI ij 1\\n\"\n    \"xQy ny 1\\n\"\n    \"vGy va 
1\\n\"\n    \"jwY ij 1\\n\"\n    \"cNn an 1\\n\"\n    \"zpP sz 1\\n\"\n    \"vKd de 1\\n\"\n    \"wVk ka 1\\n\"\n    \"tMh ch 1\\n\"\n    \"Ktd th 1\\n\"\n    \"tpG th 1\\n\"\n    \"iDf in 1\\n\"\n    \"qKl qu 1\\n\"\n    \"jLc ch 1\\n\"\n    \"Jjl le 1\\n\"\n    \"hcQ th 1\\n\"\n    \"Tqg qu 1\\n\"\n    \"bGk ka 1\\n\"\n    \"jxV ij 1\\n\"\n    \"fcC ch 1\\n\"\n    \"Fwx wa 1\\n\"\n    \"qPy qu 1\\n\"\n    \"jmE ij 1\\n\"\n    \"xmT me 1\\n\"\n    \"lxC GC 1\\n\"\n    \"lRr er 1\\n\"\n    \"Qkl le 1\\n\"\n    \"ihF th 1\\n\"\n    \"Llt th 1\\n\"\n    \"Kqe qu 1\\n\"\n    \"Hhf th 1\\n\"\n    \"nPq an 1\\n\"\n    \"zvQ QO 1\\n\"\n    \"jGy ij 1\\n\"\n    \"lMk le 1\\n\"\n    \"uOj qu 1\\n\"\n    \"fdT de 1\\n\"\n    \"qvH qu 1\\n\"\n    \"pcZ ch 1\\n\"\n    \"qkc ch 1\\n\"\n    \"cbJ ch 1\\n\"\n    \"gfK ng 1\\n\"\n    \"pMt th 1\\n\"\n    \"vpF va 1\\n\"\n    \"dgP ng 1\\n\"\n    \"mxF me 1\\n\"\n    \"rZp er 1\\n\"\n    \"cGd ch 1\\n\"\n    \"sPx st 1\\n\"\n    \"rGd er 1\\n\"\n    \"gbQ ng 1\\n\"\n    \"Dfz sz 1\\n\"\n    \"sjC st 1\\n\"\n    \"zSx sz 1\\n\"\n    \"qIo qu 1\\n\"\n    \"dIw de 1\\n\"\n    \"kpF ka 1\\n\"\n    \"eUw er 1\\n\"\n    \"Hxc ch 1\\n\"\n    \"yvG va 1\\n\"\n    \"vUf va 1\\n\"\n    \"fjF ij 1\\n\"\n    \"kLq qu 1\\n\"\n    \"Zjt th 1\\n\"\n    \"fLq qu 1\\n\"\n    \"ydS de 1\\n\"\n    \"zwK sz 1\\n\"\n    \"hHy th 1\\n\"\n    \"Ssw st 1\\n\"\n    \"hjG th 1\\n\"\n    \"Ddp de 1\\n\"\n    \"bPs st 1\\n\"\n    \"Wpq qu 1\\n\"\n    \"crW ch 1\\n\"\n    \"Xpj ij 1\\n\"\n    \"oXr er 1\\n\"\n    \"vjK ij 1\\n\"\n    \"Vzf sz 1\\n\"\n    \"lYd le 1\\n\"\n    \"Odx de 1\\n\"\n    \"hVt th 1\\n\"\n    \"gRc ch 1\\n\"\n    \"Ztf th 1\\n\"\n    \"hVj th 1\\n\"\n    \"Jjf ij 1\\n\"\n    \"jFb ij 1\\n\"\n    \"Lhf th 1\\n\"\n    \"jlO le 1\\n\"\n    \"jvB ij 1\\n\"\n    \"gbN ng 1\\n\"\n    \"vPm va 1\\n\"\n    \"tQd th 1\\n\"\n    \"Vvj ij 1\\n\"\n    \"rqX qu 1\\n\"\n    \"zEo on 1\\n\"\n    \"jsB st 1\\n\"\n    \"qmH qu 1\\n\"\n    \"btE th 
1\\n\"\n    \"Wdd de 1\\n\"\n    \"Dmj ij 1\\n\"\n    \"ywI wa 1\\n\"\n    \"jpQ ij 1\\n\"\n    \"uXs qu 1\\n\"\n    \"bYm me 1\\n\"\n    \"oFz on 1\\n\"\n    \"tBg th 1\\n\"\n    \"cCn ch 1\\n\"\n    \"dZg ng 1\\n\"\n    \"wrL er 1\\n\"\n    \"Jry er 1\\n\"\n    \"iKd in 1\\n\"\n    \"vcN ch 1\\n\"\n    \"zNp sz 1\\n\"\n    \"nRf an 1\\n\"\n    \"dcH ch 1\\n\"\n    \"qaO an 1\\n\"\n    \"uaQ an 1\\n\"\n    \"jxL ij 1\\n\"\n    \"mUf me 1\\n\"\n    \"vOk ka 1\\n\"\n    \"Pxt th 1\\n\"\n    \"fuQ qu 1\\n\"\n    \"sfN st 1\\n\"\n    \"Qlv le 1\\n\"\n    \"bZy be 1\\n\"\n    \"vEq vK 1\\n\"\n    \"Xvg ng 1\\n\"\n    \"Jxb be 1\\n\"\n    \"zGz sz 1\\n\"\n    \"Cqf qu 1\\n\"\n    \"sPp st 1\\n\"\n    \"vAq qu 1\\n\"\n    \"kWd de 1\\n\"\n    \"rcZ cm 1\\n\"\n    \"lDs le 1\\n\"\n    \"xDd de 1\\n\"\n    \"pSj ij 1\\n\"\n    \"vwS va 1\\n\"\n    \"kgQ ng 1\\n\"\n    \"crT ch 1\\n\"\n    \"fKs st 1\\n\"\n    \"qhc th 1\\n\"\n    \"gMl ng 1\\n\"\n    \"zKt th 1\\n\"\n    \"jdF de 1\\n\"\n    \"cfN ch 1\\n\"\n    \"sdO st 1\\n\"\n    \"kHh th 1\\n\"\n    \"xvE va 1\\n\"\n    \"bPf be 1\\n\"\n    \"rzX er 1\\n\"\n    \"vSj ij 1\\n\"\n    \"dFf de 1\\n\"\n    \"vXl le 1\\n\"\n    \"bRv va 1\\n\"\n    \"Zxw wa 1\\n\"\n    \"Xzw sz 1\\n\"\n    \"vrR er 1\\n\"\n    \"xHb be 1\\n\"\n    \"qeE qu 1\\n\"\n    \"jrQ er 1\\n\"\n    \"vkI ka 1\\n\"\n    \"frY er 1\\n\"\n    \"jqL qu 1\\n\"\n    \"cZj ch 1\\n\"\n    \"Tmg ng 1\\n\"\n    \"mHw me 1\\n\"\n    \"dqS qu 1\\n\"\n    \"qlI qu 1\\n\"\n    \"Zvb va 1\\n\"\n    \"Klx le 1\\n\"\n    \"gbS ng 1\\n\"\n    \"sbQ st 1\\n\"\n    \"quF un 1\\n\"\n    \"qzT qu 1\\n\"\n    \"qaI an 1\\n\"\n    \"Vmd de 1\\n\"\n    \"qaQ an 1\\n\"\n    \"Qkb ka 1\\n\"\n    \"Xjb ij 1\\n\"\n    \"oCq GC 1\\n\"\n    \"qQh QO 1\\n\"\n    \"cwO ch 1\\n\"\n    \"tMf th 1\\n\"\n    \"zrK er 1\\n\"\n    \"wKy wa 1\\n\"\n    \"wKb wa 1\\n\"\n    \"cqS ch 1\\n\"\n    \"iGv in 1\\n\"\n    \"xXw wa 1\\n\"\n    \"fMx fo 1\\n\"\n    \"Zmv va 1\\n\"\n    \"Yqq qu 
1\\n\"\n    \"kDh th 1\\n\"\n    \"Jxy ny 1\\n\"\n    \"yyE ny 1\\n\"\n    \"sUv st 1\\n\"\n    \"cVr ch 1\\n\"\n    \"bqH qu 1\\n\"\n    \"Wgq qu 1\\n\"\n    \"uqQ qu 1\\n\"\n    \"bTg ng 1\\n\"\n    \"iMv in 1\\n\"\n    \"qWk qu 1\\n\"\n    \"fdV de 1\\n\"\n    \"oQq qu 1\\n\"\n    \"nZp an 1\\n\"\n    \"zoY on 1\\n\"\n    \"jRk ij 1\\n\"\n    \"qPj qu 1\\n\"\n    \"uqL qu 1\\n\"\n    \"cqX ch 1\\n\"\n    \"lBq qu 1\\n\"\n    \"fpX pr 1\\n\"\n    \"bYw wa 1\\n\"\n    \"Yeq qu 1\\n\"\n    \"hjN th 1\\n\"\n    \"tqW th 1\\n\"\n    \"jhT th 1\\n\"\n    \"cvF ch 1\\n\"\n    \"Ycx ch 1\\n\"\n    \"jFs st 1\\n\"\n    \"Hdy de 1\\n\"\n    \"lrZ er 1\\n\"\n    \"fZv va 1\\n\"\n    \"Tfw wa 1\\n\"\n    \"zrI er 1\\n\"\n    \"dDv de 1\\n\"\n    \"xeH er 1\\n\"\n    \"lzH le 1\\n\"\n    \"sLr er 1\\n\"\n    \"iKq qu 1\\n\"\n    \"Fzc cm 1\\n\"\n    \"xRd de 1\\n\"\n    \"fSd de 1\\n\"\n    \"qwF qu 1\\n\"\n    \"wxY wa 1\\n\"\n    \"Ykw ka 1\\n\"\n    \"oVp on 1\\n\"\n    \"cgB ch 1\\n\"\n    \"bFh th 1\\n\"\n    \"njT an 1\\n\"\n    \"dZz de 1\\n\"\n    \"bhS th 1\\n\"\n    \"Fzu qu 1\\n\"\n    \"fHm me 1\\n\"\n    \"vNz sz 1\\n\"\n    \"qlF qu 1\\n\"\n    \"Lvf va 1\\n\"\n    \"zpU sz 1\\n\"\n    \"jtL th 1\\n\"\n    \"cQq ch 1\\n\"\n    \"mKm me 1\\n\"\n    \"Rwc ch 1\\n\"\n    \"jrO er 1\\n\"\n    \"npB an 1\\n\"\n    \"Qtx th 1\\n\"\n    \"Mqj qu 1\\n\"\n    \"Oqx qu 1\\n\"\n    \"Dzp sz 1\\n\"\n    \"hVg th 1\\n\"\n    \"pTn an 1\\n\"\n    \"gQj ng 1\\n\"\n    \"mTn an 1\\n\"\n    \"tQv th 1\\n\"\n    \"lZh th 1\\n\"\n    \"kJj ij 1\\n\"\n    \"crP ch 1\\n\"\n    \"mqC qu 1\\n\"\n    \"Dwl le 1\\n\"\n    \"vVj ij 1\\n\"\n    \"hqT th 1\\n\"\n    \"mJw me 1\\n\"\n    \"txT th 1\\n\"\n    \"wZm me 1\\n\"\n    \"Xnq an 1\\n\"\n    \"hfU th 1\\n\"\n    \"kVr er 1\\n\"\n    \"gVp ng 1\\n\"\n    \"nBp an 1\\n\"\n    \"xnZ an 1\\n\"\n    \"jqA qu 1\\n\"\n    \"Pzk sz 1\\n\"\n    \"fJq qu 1\\n\"\n    \"Gnf an 1\\n\"\n    \"Kxp pr 1\\n\"\n    \"dXl Xm 1\\n\"\n    \"hwL th 
1\\n\"\n    \"Rrn an 1\\n\"\n    \"klL le 1\\n\"\n    \"fOg ng 1\\n\"\n    \"Qwx wa 1\\n\"\n    \"Cmx me 1\\n\"\n    \"Fbf be 1\\n\"\n    \"hWq th 1\\n\"\n    \"bSw wa 1\\n\"\n    \"Bxr er 1\\n\"\n    \"zcB ch 1\\n\"\n    \"lvX le 1\\n\"\n    \"Kkx ka 1\\n\"\n    \"qfI qu 1\\n\"\n    \"uKg qu 1\\n\"\n    \"Yku qu 1\\n\"\n    \"jJz sz 1\\n\"\n    \"uIp qu 1\\n\"\n    \"qAd qu 1\\n\"\n    \"pfH pr 1\\n\"\n    \"Qwf wa 1\\n\"\n    \"wbU wa 1\\n\"\n    \"vDv va 1\\n\"\n    \"gJn an 1\\n\"\n    \"zlR le 1\\n\"\n    \"mXr er 1\\n\"\n    \"rHx er 1\\n\"\n    \"oVz on 1\\n\"\n    \"gtG th 1\\n\"\n    \"lrK HK 1\\n\"\n    \"Wxe er 1\\n\"\n    \"pnJ an 1\\n\"\n    \"Fqy qu 1\\n\"\n    \"jVl le 1\\n\"\n    \"cbP ch 1\\n\"\n    \"Gjc jS 1\\n\"\n    \"jQs st 1\\n\"\n    \"tvV th 1\\n\"\n    \"Hzk sz 1\\n\"\n    \"jyW ij 1\\n\"\n    \"Xbf be 1\\n\"\n    \"qfS qu 1\\n\"\n    \"Wvp va 1\\n\"\n    \"wbL wa 1\\n\"\n    \"mkO ka 1\\n\"\n    \"eqB qu 1\\n\"\n    \"dvS de 1\\n\"\n    \"zGh th 1\\n\"\n    \"vWu qu 1\\n\"\n    \"flX le 1\\n\"\n    \"xJq qu 1\\n\"\n    \"qLk qu 1\\n\"\n    \"vNl le 1\\n\"\n    \"kzQ sz 1\\n\"\n    \"Czv sz 1\\n\"\n    \"knV an 1\\n\"\n    \"Rjb ij 1\\n\"\n    \"bNq qu 1\\n\"\n    \"zPm sz 1\\n\"\n    \"qxB qu 1\\n\"\n    \"Lhh th 1\\n\"\n    \"Uvt th 1\\n\"\n    \"xfU fo 1\\n\"\n    \"iNp in 1\\n\"\n    \"yYg ng 1\\n\"\n    \"oPb on 1\\n\"\n    \"qiW qu 1\\n\"\n    \"ycD ch 1\\n\"\n    \"wVz sz 1\\n\"\n    \"wGq qu 1\\n\"\n    \"hRb th 1\\n\"\n    \"xbB be 1\\n\"\n    \"sZl le 1\\n\"\n    \"gxO ng 1\\n\"\n    \"wFk ka 1\\n\"\n    \"Mxd de 1\\n\"\n    \"dxP de 1\\n\"\n    \"lRq qu 1\\n\"\n    \"hbZ th 1\\n\"\n    \"Eao an 1\\n\"\n    \"zgA ng 1\\n\"\n    \"qcW ch 1\\n\"\n    \"vmQ va 1\\n\"\n    \"Yqf qu 1\\n\"\n    \"wiO in 1\\n\"\n    \"xOe er 1\\n\"\n    \"Hfy ny 1\\n\"\n    \"bfS be 1\\n\"\n    \"Qhn th 1\\n\"\n    \"Cmk ka 1\\n\"\n    \"lYs le 1\\n\"\n    \"Nqt th 1\\n\"\n    \"qeJ qu 1\\n\"\n    \"ztJ th 1\\n\"\n    \"pMv va 1\\n\"\n    \"uhW th 
1\\n\"\n    \"jSb ij 1\\n\"\n    \"dYh th 1\\n\"\n    \"cfW ch 1\\n\"\n    \"gSx ng 1\\n\"\n    \"qSv qu 1\\n\"\n    \"jCs st 1\\n\"\n    \"pwC pr 1\\n\"\n    \"Gxq qu 1\\n\"\n    \"fMq qu 1\\n\"\n    \"kkC ka 1\\n\"\n    \"uqI qu 1\\n\"\n    \"zBk sz 1\\n\"\n    \"zsW st 1\\n\"\n    \"fZb be 1\\n\"\n    \"xjb ij 1\\n\"\n    \"vHq qu 1\\n\"\n    \"fwN wa 1\\n\"\n    \"vMw va 1\\n\"\n    \"Hhq th 1\\n\"\n    \"csJ ch 1\\n\"\n    \"brJ er 1\\n\"\n    \"xvM va 1\\n\"\n    \"mXn an 1\\n\"\n    \"qWw wa 1\\n\"\n    \"dxZ de 1\\n\"\n    \"sVj st 1\\n\"\n    \"xrF er 1\\n\"\n    \"pbU pr 1\\n\"\n    \"Tfz sz 1\\n\"\n    \"wqT qu 1\\n\"\n    \"vcF ch 1\\n\"\n    \"nrS an 1\\n\"\n    \"Whz th 1\\n\"\n    \"kgX ng 1\\n\"\n    \"yXk ka 1\\n\"\n    \"kJb ka 1\\n\"\n    \"rZk er 1\\n\"\n    \"pBc ch 1\\n\"\n    \"gUv ng 1\\n\"\n    \"Hqe qu 1\\n\"\n    \"Kqj qu 1\\n\"\n    \"oFj on 1\\n\"\n    \"xbN be 1\\n\"\n    \"pnK an 1\\n\"\n    \"Lbw wa 1\\n\"\n    \"dMb de 1\\n\"\n    \"qSp qu 1\\n\"\n    \"Zsv st 1\\n\"\n    \"wrV er 1\\n\"\n    \"uKf qu 1\\n\"\n    \"mlY le 1\\n\"\n    \"gxF ng 1\\n\"\n    \"tjL th 1\\n\"\n    \"Xrc ch 1\\n\"\n    \"rvF er 1\\n\"\n    \"mLq qu 1\\n\"\n    \"jrK er 1\\n\"\n    \"Qlz le 1\\n\"\n    \"zxD sz 1\\n\"\n    \"fdY de 1\\n\"\n    \"jvD ij 1\\n\"\n    \"xQg ng 1\\n\"\n    \"qFu un 1\\n\"\n    \"sfJ st 1\\n\"\n    \"pIf pr 1\\n\"\n    \"hxJ th 1\\n\"\n    \"cNc ch 1\\n\"\n    \"Idq qu 1\\n\"\n    \"yHf ny 1\\n\"\n    \"qXm qu 1\\n\"\n    \"ylD le 1\\n\"\n    \"zFq qu 1\\n\"\n    \"jWp ij 1\\n\"\n    \"eKp er 1\\n\"\n    \"xhf th 1\\n\"\n    \"ybV be 1\\n\"\n    \"xXs st 1\\n\"\n    \"Yhk th 1\\n\"\n    \"fwX wa 1\\n\"\n    \"bqK qu 1\\n\"\n    \"nvY an 1\\n\"\n    \"xvk ka 1\\n\"\n    \"rbP er 1\\n\"\n    \"sXl le 1\\n\"\n    \"Uwt th 1\\n\"\n    \"wmW me 1\\n\"\n    \"pxV pr 1\\n\"\n    \"njZ an 1\\n\"\n    \"Tqk qu 1\\n\"\n    \"zmE sz 1\\n\"\n    \"Rqu un 1\\n\"\n    \"qqM qu 1\\n\"\n    \"dhQ th 1\\n\"\n    \"uJz qu 1\\n\"\n    \"Vqd qu 
1\\n\"\n    \"yCk ka 1\\n\"\n    \"pWu qu 1\\n\"\n    \"Vdy de 1\\n\"\n    \"iRx in 1\\n\"\n    \"Vcm ch 1\\n\"\n    \"wIg ng 1\\n\"\n    \"Xbh th 1\\n\"\n    \"vcG ch 1\\n\"\n    \"jjX ij 1\\n\"\n    \"nmO an 1\\n\"\n    \"dQj de 1\\n\"\n    \"dfV de 1\\n\"\n    \"dbK de 1\\n\"\n    \"gqk qu 1\\n\"\n    \"nFd an 1\\n\"\n    \"oWv on 1\\n\"\n    \"nHp an 1\\n\"\n    \"knK an 1\\n\"\n    \"bxZ be 1\\n\"\n    \"wmH me 1\\n\"\n    \"fgX ng 1\\n\"\n    \"gzH ng 1\\n\"\n    \"Zbv va 1\\n\"\n    \"vgM ng 1\\n\"\n    \"dmK de 1\\n\"\n    \"cvB ch 1\\n\"\n    \"eQs er 1\\n\"\n    \"cHm ch 1\\n\"\n    \"sBt th 1\\n\"\n    \"bHx be 1\\n\"\n    \"vqd qu 1\\n\"\n    \"Npy pr 1\\n\"\n    \"xzL sz 1\\n\"\n    \"gMx ng 1\\n\"\n    \"vwU va 1\\n\"\n    \"pfX pr 1\\n\"\n    \"nFg an 1\\n\"\n    \"sFs st 1\\n\"\n    \"Vqh th 1\\n\"\n    \"Emq qu 1\\n\"\n    \"tXy th 1\\n\"\n    \"uVd qu 1\\n\"\n    \"Yvj ij 1\\n\"\n    \"qHo qu 1\\n\"\n    \"pWm me 1\\n\"\n    \"xcK ch 1\\n\"\n    \"pUv va 1\\n\"\n    \"pLn an 1\\n\"\n    \"uVn an 1\\n\"\n    \"Fsq qu 1\\n\"\n    \"cGj ch 1\\n\"\n    \"Xwy wa 1\\n\"\n    \"gzT ng 1\\n\"\n    \"dNq qu 1\\n\"\n    \"jrU er 1\\n\"\n    \"qtA th 1\\n\"\n    \"gqT qu 1\\n\"\n    \"pwM pr 1\\n\"\n    \"lrP er 1\\n\"\n    \"jmC ij 1\\n\"\n    \"pmP me 1\\n\"\n    \"yiY in 1\\n\"\n    \"pTs st 1\\n\"\n    \"Zwj ij 1\\n\"\n    \"qpF qu 1\\n\"\n    \"fhJ ch 1\\n\"\n    \"fOv va 1\\n\"\n    \"wcK ch 1\\n\"\n    \"kqk qu 1\\n\"\n    \"Ugz ng 1\\n\"\n    \"xfF fo 1\\n\"\n    \"cTv ch 1\\n\"\n    \"gpX ng 1\\n\"\n    \"Lfx fo 1\\n\"\n    \"gwU ng 1\\n\"\n    \"Dzx sz 1\\n\"\n    \"kDc ch 1\\n\"\n    \"Pvh th 1\\n\"\n    \"kdY de 1\\n\"\n    \"wWv va 1\\n\"\n    \"sQq qu 1\\n\"\n    \"mjY ij 1\\n\"\n    \"yCb be 1\\n\"\n    \"rSq qu 1\\n\"\n    \"Sfv va 1\\n\"\n    \"fZh th 1\\n\"\n    \"dMd de 1\\n\"\n    \"dNs st 1\\n\"\n    \"jTv ij 1\\n\"\n    \"tmW th 1\\n\"\n    \"cxJ ch 1\\n\"\n    \"uAo qu 1\\n\"\n    \"mHx me 1\\n\"\n    \"fgA ng 1\\n\"\n    \"Rhx th 
1\\n\"\n    \"wWt th 1\\n\"\n    \"pfU pr 1\\n\"\n    \"oIj on 1\\n\"\n    \"lhQ th 1\\n\"\n    \"vDk ka 1\\n\"\n    \"vJd de 1\\n\"\n    \"sDp st 1\\n\"\n    \"qiU qu 1\\n\"\n    \"Yfs st 1\\n\"\n    \"qxW qu 1\\n\"\n    \"sFh th 1\\n\"\n    \"vhP th 1\\n\"\n    \"Vjj ij 1\\n\"\n    \"tmQ th 1\\n\"\n    \"wmM me 1\\n\"\n    \"cVy ch 1\\n\"\n    \"Kzw sz 1\\n\"\n    \"tfA th 1\\n\"\n    \"gjR ij 1\\n\"\n    \"xyQ ny 1\\n\"\n    \"mBv va 1\\n\"\n    \"fQy ny 1\\n\"\n    \"dZc ch 1\\n\"\n    \"eVh th 1\\n\"\n    \"Nvc ch 1\\n\"\n    \"qFb qu 1\\n\"\n    \"qhl th 1\\n\"\n    \"Zcn ch 1\\n\"\n    \"qwW qu 1\\n\"\n    \"xZq qu 1\\n\"\n    \"jhL th 1\\n\"\n    \"lWf le 1\\n\"\n    \"jJx ij 1\\n\"\n    \"Yzt th 1\\n\"\n    \"Eoq qu 1\\n\"\n    \"Njm ij 1\\n\"\n    \"Zgd ng 1\\n\"\n    \"pGq qu 1\\n\"\n    \"sgY ng 1\\n\"\n    \"jyE ij 1\\n\"\n    \"jzE sz 1\\n\"\n    \"ujK qu 1\\n\"\n    \"qbm qu 1\\n\"\n    \"Wsf st 1\\n\"\n    \"mQn an 1\\n\"\n    \"sQs st 1\\n\"\n    \"yXg ng 1\\n\"\n    \"vYe er 1\\n\"\n    \"ePv er 1\\n\"\n    \"aCv an 1\\n\"\n    \"pVm me 1\\n\"\n    \"zxO sz 1\\n\"\n    \"jjW ij 1\\n\"\n    \"vgI ng 1\\n\"\n    \"tZc th 1\\n\"\n    \"Qtg th 1\\n\"\n    \"vMt th 1\\n\"\n    \"kTt th 1\\n\"\n    \"Mxj ij 1\\n\"\n    \"fbI be 1\\n\"\n    \"qAu un 1\\n\"\n    \"wfT wa 1\\n\"\n    \"fcF ch 1\\n\"\n    \"pfK pr 1\\n\"\n    \"bOq qu 1\\n\"\n    \"huX th 1\\n\"\n    \"cJm ch 1\\n\"\n    \"Xpg ng 1\\n\"\n    \"tqJ th 1\\n\"\n    \"Ovf va 1\\n\"\n    \"Xlj le 1\\n\"\n    \"Nrl er 1\\n\"\n    \"fxW fo 1\\n\"\n    \"Swq qu 1\\n\"\n    \"qvE qu 1\\n\"\n    \"qpY qu 1\\n\"\n    \"oNw on 1\\n\"\n    \"kYc ch 1\\n\"\n    \"jXb ij 1\\n\"\n    \"Qfk ka 1\\n\"\n    \"eDp er 1\\n\"\n    \"Vqb qu 1\\n\"\n    \"sKz us 1\\n\"\n    \"qjp qu 1\\n\"\n    \"Uxl le 1\\n\"\n    \"Lky ka 1\\n\"\n    \"zFy sz 1\\n\"\n    \"nMl an 1\\n\"\n    \"yYi in 1\\n\"\n    \"cQe ch 1\\n\"\n    \"oYj on 1\\n\"\n    \"tbB th 1\\n\"\n    \"Ybg ng 1\\n\"\n    \"nVk nd 1\\n\"\n    \"bXc ch 
1\\n\"\n    \"Lqn an 1\\n\"\n    \"mdK de 1\\n\"\n    \"pdP de 1\\n\"\n    \"tqS th 1\\n\"\n    \"Zjf ij 1\\n\"\n    \"kcC ch 1\\n\"\n    \"qZq qu 1\\n\"\n    \"aSd an 1\\n\"\n    \"Cmh th 1\\n\"\n    \"hzG th 1\\n\"\n    \"wQm me 1\\n\"\n    \"Gqg qu 1\\n\"\n    \"yWp pr 1\\n\"\n    \"Xrw er 1\\n\"\n    \"yJy ny 1\\n\"\n    \"sqD qu 1\\n\"\n    \"dWb de 1\\n\"\n    \"nbQ an 1\\n\"\n    \"iwP in 1\\n\"\n    \"lWs le 1\\n\"\n    \"Tsg ng 1\\n\"\n    \"dHz de 1\\n\"\n    \"tcF th 1\\n\"\n    \"Qkt th 1\\n\"\n    \"Bdd de 1\\n\"\n    \"Mxq qu 1\\n\"\n    \"pjV ij 1\\n\"\n    \"kQr er 1\\n\"\n    \"dnI an 1\\n\"\n    \"fyY ny 1\\n\"\n    \"aFq an 1\\n\"\n    \"Ylx le 1\\n\"\n    \"Yym me 1\\n\"\n    \"jbV ij 1\\n\"\n    \"qcV ch 1\\n\"\n    \"pzX sz 1\\n\"\n    \"qRh th 1\\n\"\n    \"djA de 1\\n\"\n    \"bnI an 1\\n\"\n    \"Llv le 1\\n\"\n    \"tmZ th 1\\n\"\n    \"hQo th 1\\n\"\n    \"ztW th 1\\n\"\n    \"Rxz sz 1\\n\"\n    \"dxW de 1\\n\"\n    \"qtW th 1\\n\"\n    \"kqO qu 1\\n\"\n    \"lHc ch 1\\n\"\n    \"lRj le 1\\n\"\n    \"hNf th 1\\n\"\n    \"Giq qu 1\\n\"\n    \"cYq ch 1\\n\"\n    \"Ydp de 1\\n\"\n    \"qWn an 1\\n\"\n    \"xkB ka 1\\n\"\n    \"kxC ka 1\\n\"\n    \"ljA le 1\\n\"\n    \"Qwp pr 1\\n\"\n    \"mCp me 1\\n\"\n    \"fJd de 1\\n\"\n    \"vCt th 1\\n\"\n    \"Vcz ch 1\\n\"\n    \"vBf va 1\\n\"\n    \"cYx ch 1\\n\"\n    \"fHw wa 1\\n\"\n    \"kvW ka 1\\n\"\n    \"Jmz sz 1\\n\"\n    \"hQj th 1\\n\"\n    \"rbQ er 1\\n\"\n    \"vxX va 1\\n\"\n    \"wFh th 1\\n\"\n    \"Tjz sz 1\\n\"\n    \"hxR th 1\\n\"\n    \"vdY de 1\\n\"\n    \"pmF me 1\\n\"\n    \"sDl le 1\\n\"\n    \"rVh th 1\\n\"\n    \"wDc ch 1\\n\"\n    \"gBw ng 1\\n\"\n    \"cHf ch 1\\n\"\n    \"pzQ sz 1\\n\"\n    \"lVp le 1\\n\"\n    \"gfH ng 1\\n\"\n    \"oGc ch 1\\n\"\n    \"tvJ th 1\\n\"\n    \"cMv ch 1\\n\"\n    \"xnS an 1\\n\"\n    \"vQx va 1\\n\"\n    \"uoM qu 1\\n\"\n    \"zkX sz 1\\n\"\n    \"zHp sz 1\\n\"\n    \"yuW qu 1\\n\"\n    \"Qbv va 1\\n\"\n    \"zwG sz 1\\n\"\n    \"cpX ch 
1\\n\"\n    \"Rpv va 1\\n\"\n    \"zKq qu 1\\n\"\n    \"wUb wa 1\\n\"\n    \"qnJ an 1\\n\"\n    \"Rpy pr 1\\n\"\n    \"bcS ch 1\\n\"\n    \"qxK qu 1\\n\"\n    \"qjD qu 1\\n\"\n    \"lQg ng 1\\n\"\n    \"krX er 1\\n\"\n    \"Fcg ch 1\\n\"\n    \"oVx on 1\\n\"\n    \"vJf va 1\\n\"\n    \"Bvk ka 1\\n\"\n    \"dmX de 1\\n\"\n    \"Wdj de 1\\n\"\n    \"Yzp sz 1\\n\"\n    \"Ycd ch 1\\n\"\n    \"jKx ij 1\\n\"\n    \"krH er 1\\n\"\n    \"Lnm an 1\\n\"\n    \"zCm sz 1\\n\"\n    \"Uwj ij 1\\n\"\n    \"Uvk ka 1\\n\"\n    \"Mfj ij 1\\n\"\n    \"yqJ qu 1\\n\"\n    \"Lfq qu 1\\n\"\n    \"yHz sz 1\\n\"\n    \"kgJ ng 1\\n\"\n    \"aGq an 1\\n\"\n    \"tjH th 1\\n\"\n    \"Zkc ch 1\\n\"\n    \"wHv va 1\\n\"\n    \"Nzp sz 1\\n\"\n    \"cZx ch 1\\n\"\n    \"jvK ij 1\\n\"\n    \"clF ch 1\\n\"\n    \"xmD me 1\\n\"\n    \"Ypz sz 1\\n\"\n    \"pFy pr 1\\n\"\n    \"hvF th 1\\n\"\n    \"mtW th 1\\n\"\n    \"hqG th 1\\n\"\n    \"kvN ka 1\\n\"\n    \"tcZ th 1\\n\"\n    \"tkR th 1\\n\"\n    \"pdH de 1\\n\"\n    \"qEs qu 1\\n\"\n    \"Zcw ch 1\\n\"\n    \"Vwu un 1\\n\"\n    \"gXz ng 1\\n\"\n    \"mWj ij 1\\n\"\n    \"mWv va 1\\n\"\n    \"Jqx qu 1\\n\"\n    \"oSj on 1\\n\"\n    \"lwY le 1\\n\"\n    \"Tkf ka 1\\n\"\n    \"pcC ch 1\\n\"\n    \"ohG th 1\\n\"\n    \"dzG de 1\\n\"\n    \"fdN de 1\\n\"\n    \"xrS er 1\\n\"\n    \"hHk th 1\\n\"\n    \"Fjz sz 1\\n\"\n    \"vbZ va 1\\n\"\n    \"Udx de 1\\n\"\n    \"wzX sz 1\\n\"\n    \"uNq qu 1\\n\"\n    \"wfZ wa 1\\n\"\n    \"swB st 1\\n\"\n    \"dmQ de 1\\n\"\n    \"dcA ch 1\\n\"\n    \"qzP qu 1\\n\"\n    \"jJj ij 1\\n\"\n    \"qWq qu 1\\n\"\n    \"tVk th 1\\n\"\n    \"gwB ng 1\\n\"\n    \"bIw wa 1\\n\"\n    \"bpU pr 1\\n\"\n    \"bwM wa 1\\n\"\n    \"fkA ka 1\\n\"\n    \"xUc ch 1\\n\"\n    \"xTd de 1\\n\"\n    \"fKl le 1\\n\"\n    \"lxS le 1\\n\"\n    \"xaS an 1\\n\"\n    \"yvQ va 1\\n\"\n    \"dhV th 1\\n\"\n    \"mdW de 1\\n\"\n    \"wfJ wa 1\\n\"\n    \"Wqq qu 1\\n\"\n    \"sZj st 1\\n\"\n    \"Lxy ny 1\\n\"\n    \"xXy ny 1\\n\"\n    \"qDm qu 
1\\n\"\n    \"gKq qu 1\\n\"\n    \"Qvj ij 1\\n\"\n    \"kfH ka 1\\n\"\n    \"aQp an 1\\n\"\n    \"xFz sz 1\\n\"\n    \"njW an 1\\n\"\n    \"Rpn an 1\\n\"\n    \"Mmn an 1\\n\"\n    \"fhD th 1\\n\"\n    \"jKk ij 1\\n\"\n    \"zAq qu 1\\n\"\n    \"qfL qu 1\\n\"\n    \"ywN wa 1\\n\"\n    \"qpz qu 1\\n\"\n    \"hxP th 1\\n\"\n    \"Gdq qu 1\\n\"\n    \"tMx th 1\\n\"\n    \"jwL ij 1\\n\"\n    \"kBb ka 1\\n\"\n    \"fAw wa 1\\n\"\n    \"Sdx de 1\\n\"\n    \"Jmv va 1\\n\"\n    \"bgX ng 1\\n\"\n    \"xWp pr 1\\n\"\n    \"hHt th 1\\n\"\n    \"Gww wa 1\\n\"\n    \"Fbb be 1\\n\"\n    \"zoT on 1\\n\"\n    \"yjG ij 1\\n\"\n    \"Rlg ng 1\\n\"\n    \"vFn an 1\\n\"\n    \"zcK ch 1\\n\"\n    \"xdC de 1\\n\"\n    \"wvO va 1\\n\"\n    \"oQl le 1\\n\"\n    \"nIw an 1\\n\"\n    \"wzA sz 1\\n\"\n    \"Rzj sz 1\\n\"\n    \"Qzn an 1\\n\"\n    \"Yjt th 1\\n\"\n    \"xkQ ku 1\\n\"\n    \"lrq qu 1\\n\"\n    \"nwZ an 1\\n\"\n    \"pGk ka 1\\n\"\n    \"mnL an 1\\n\"\n    \"Rlq qu 1\\n\"\n    \"ccD ch 1\\n\"\n    \"rRd er 1\\n\"\n    \"Ofj ij 1\\n\"\n    \"Fjh th 1\\n\"\n    \"uuO qu 1\\n\"\n    \"zZx sz 1\\n\"\n    \"Nbj ij 1\\n\"\n    \"znW an 1\\n\"\n    \"jbH ij 1\\n\"\n    \"rDx er 1\\n\"\n    \"Qmc ch 1\\n\"\n    \"dwV de 1\\n\"\n    \"Oqv qu 1\\n\"\n    \"Zqe qu 1\\n\"\n    \"fwI wa 1\\n\"\n    \"njP an 1\\n\"\n    \"Oqq qu 1\\n\"\n    \"pVv va 1\\n\"\n    \"fqx qu 1\\n\"\n    \"gfO ng 1\\n\"\n    \"hqU th 1\\n\"\n    \"gDj ng 1\\n\"\n    \"Tmj ij 1\\n\"\n    \"vcK ch 1\\n\"\n    \"qmV qu 1\\n\"\n    \"sVx st 1\\n\"\n    \"Wfh th 1\\n\"\n    \"mJk ka 1\\n\"\n    \"fuK qu 1\\n\"\n    \"bfN be 1\\n\"\n    \"qfT qu 1\\n\"\n    \"Fmj ij 1\\n\"\n    \"tbN th 1\\n\"\n    \"kjN ij 1\\n\"\n    \"yhZ th 1\\n\"\n    \"Nxk ka 1\\n\"\n    \"wxU wa 1\\n\"\n    \"zXb sz 1\\n\"\n    \"Nzd de 1\\n\"\n    \"ohL th 1\\n\"\n    \"pVt th 1\\n\"\n    \"Zsx st 1\\n\"\n    \"Zqj qu 1\\n\"\n    \"wUj ij 1\\n\"\n    \"yjC ij 1\\n\"\n    \"kTn an 1\\n\"\n    \"vqV qu 1\\n\"\n    \"Fyc ch 1\\n\"\n    \"Icd ch 
1\\n\"\n    \"svN st 1\\n\"\n    \"Jjv ij 1\\n\"\n    \"bVp pr 1\\n\"\n    \"fdI de 1\\n\"\n    \"nbX an 1\\n\"\n    \"cfU ch 1\\n\"\n    \"lGm le 1\\n\"\n    \"Ovg ng 1\\n\"\n    \"zDc ch 1\\n\"\n    \"jgq qu 1\\n\"\n    \"lYr er 1\\n\"\n    \"hjR th 1\\n\"\n    \"qPm qu 1\\n\"\n    \"iRq qu 1\\n\"\n    \"Zrx er 1\\n\"\n    \"wpT pr 1\\n\"\n    \"xsB st 1\\n\"\n    \"qxT qu 1\\n\"\n    \"gFx ng 1\\n\"\n    \"qoJ qu 1\\n\"\n    \"smD st 1\\n\"\n    \"lbM le 1\\n\"\n    \"wCc ch 1\\n\"\n    \"wFm me 1\\n\"\n    \"Xlv le 1\\n\"\n    \"zyU sz 1\\n\"\n    \"vFk ka 1\\n\"\n    \"tjR th 1\\n\"\n    \"iYx in 1\\n\"\n    \"uJk qu 1\\n\"\n    \"Qeh th 1\\n\"\n    \"Xrv er 1\\n\"\n    \"Bqq qu 1\\n\"\n    \"Vdb de 1\\n\"\n    \"znR an 1\\n\"\n    \"pmL me 1\\n\"\n    \"tvH th 1\\n\"\n    \"Tmd de 1\\n\"\n    \"Dgb ng 1\\n\"\n    \"ozO on 1\\n\"\n    \"fQb be 1\\n\"\n    \"Pqb qu 1\\n\"\n    \"qYn an 1\\n\"\n    \"xPm me 1\\n\"\n    \"gWf ng 1\\n\"\n    \"cCv ch 1\\n\"\n    \"qeP qu 1\\n\"\n    \"qZm qu 1\\n\"\n    \"dgZ ng 1\\n\"\n    \"mjO ij 1\\n\"\n    \"gCw ng 1\\n\"\n    \"svQ st 1\\n\"\n    \"Rqq qu 1\\n\"\n    \"Qbt th 1\\n\"\n    \"Lkj ij 1\\n\"\n    \"Fza an 1\\n\"\n    \"jlB le 1\\n\"\n    \"iWj in 1\\n\"\n    \"Zxi in 1\\n\"\n    \"Kxw wa 1\\n\"\n    \"jcJ ij 1\\n\"\n    \"uCf qu 1\\n\"\n    \"cAx ch 1\\n\"\n    \"Vjw ij 1\\n\"\n    \"vUs st 1\\n\"\n    \"Mnq an 1\\n\"\n    \"jjM ij 1\\n\"\n    \"vUx va 1\\n\"\n    \"uZr qu 1\\n\"\n    \"twU th 1\\n\"\n    \"Ytv th 1\\n\"\n    \"hRp th 1\\n\"\n    \"kzV sz 1\\n\"\n    \"mvY va 1\\n\"\n    \"jFj ij 1\\n\"\n    \"jBp ij 1\\n\"\n    \"kGz sz 1\\n\"\n    \"qUq qu 1\\n\"\n    \"qgR qu 1\\n\"\n    \"lWb le 1\\n\"\n    \"wwP wa 1\\n\"\n    \"wvE va 1\\n\"\n    \"Fsx st 1\\n\"\n    \"Izx sz 1\\n\"\n    \"bwC wa 1\\n\"\n    \"Fmq qu 1\\n\"\n    \"cLd ch 1\\n\"\n    \"bRl le 1\\n\"\n    \"iXf in 1\\n\"\n    \"yMq qu 1\\n\"\n    \"cqP ch 1\\n\"\n    \"jsL st 1\\n\"\n    \"jIq qu 1\\n\"\n    \"wuG qu 1\\n\"\n    \"Lbv va 
1\\n\"\n    \"Eqf qu 1\\n\"\n    \"Ogf ng 1\\n\"\n    \"kGv ka 1\\n\"\n    \"pjK ij 1\\n\"\n    \"vcQ ch 1\\n\"\n    \"Xzh th 1\\n\"\n    \"jUv ij 1\\n\"\n    \"wGd de 1\\n\"\n    \"hmX th 1\\n\"\n    \"yqm qu 1\\n\"\n    \"qkE qu 1\\n\"\n    \"zgX ng 1\\n\"\n    \"vwO va 1\\n\"\n    \"wmS me 1\\n\"\n    \"vhT th 1\\n\"\n    \"syX st 1\\n\"\n    \"nbC an 1\\n\"\n    \"zgW ng 1\\n\"\n    \"vqM qu 1\\n\"\n    \"dWf de 1\\n\"\n    \"cwF ch 1\\n\"\n    \"dnF an 1\\n\"\n    \"qDi qu 1\\n\"\n    \"qSw qu 1\\n\"\n    \"jQf ij 1\\n\"\n    \"crZ ch 1\\n\"\n    \"qGl qu 1\\n\"\n    \"Wxu qu 1\\n\"\n    \"grW ng 1\\n\"\n    \"glX ng 1\\n\"\n    \"vFd de 1\\n\"\n    \"pbF pr 1\\n\"\n    \"bNf be 1\\n\"\n    \"Qcf ch 1\\n\"\n    \"fVx fo 1\\n\"\n    \"pPf pr 1\\n\"\n    \"pVq qu 1\\n\"\n    \"xlG le 1\\n\"\n    \"Dwj ij 1\\n\"\n    \"xQj ij 1\\n\"\n    \"lkQ le 1\\n\"\n    \"sqH qu 1\\n\"\n    \"Yyx ny 1\\n\"\n    \"vFm va 1\\n\"\n    \"tQo th 1\\n\"\n    \"zlU le 1\\n\"\n    \"vlW le 1\\n\"\n    \"glW ng 1\\n\"\n    \"qmW qu 1\\n\"\n    \"aWl an 1\\n\"\n    \"zmV sz 1\\n\"\n    \"gLm ng 1\\n\"\n    \"glB ng 1\\n\"\n    \"tqA th 1\\n\"\n    \"hgJ th 1\\n\"\n    \"cGb ch 1\\n\"\n    \"qwE qu 1\\n\"\n    \"Ffy ny 1\\n\"\n    \"wmL me 1\\n\"\n    \"xLh th 1\\n\"\n    \"sbE st 1\\n\"\n    \"bQl le 1\\n\"\n    \"xkR ka 1\\n\"\n    \"yFd de 1\\n\"\n    \"Omq qu 1\\n\"\n    \"Xfj ij 1\\n\"\n    \"wJj ij 1\\n\"\n    \"Lws st 1\\n\"\n    \"wfU wa 1\\n\"\n    \"zfk sz 1\\n\"\n    \"lNv le 1\\n\"\n    \"ykQ ka 1\\n\"\n    \"xDt th 1\\n\"\n    \"jDw ij 1\\n\"\n    \"zbx sz 1\\n\"\n    \"vQs st 1\\n\"\n    \"vvM va 1\\n\"\n    \"Xqq qu 1\\n\"\n    \"jLq qu 1\\n\"\n    \"zkZ sz 1\\n\"\n    \"qAg qu 1\\n\"\n    \"Xjw ij 1\\n\"\n    \"cFw ch 1\\n\"\n    \"rwQ er 1\\n\"\n    \"mWk ka 1\\n\"\n    \"Yrx er 1\\n\"\n    \"eUo er 1\\n\"\n    \"uDm qu 1\\n\"\n    \"Mhw th 1\\n\"\n    \"fGp pr 1\\n\"\n    \"Rpz sz 1\\n\"\n    \"sbF st 1\\n\"\n    \"nfX an 1\\n\"\n    \"Wfu qu 1\\n\"\n    \"Mwq qu 
1\\n\"\n    \"qDj qu 1\\n\"\n    \"Wpw pr 1\\n\"\n    \"zFv sz 1\\n\"\n    \"qXc ch 1\\n\"\n    \"qsT qu 1\\n\"\n    \"pZh th 1\\n\"\n    \"lLc ch 1\\n\"\n    \"pqB qu 1\\n\"\n    \"Xjo on 1\\n\"\n    \"kDk ka 1\\n\"\n    \"Jxf fo 1\\n\"\n    \"Vqz qu 1\\n\"\n    \"Hvq qu 1\\n\"\n    \"Zqw qu 1\\n\"\n    \"kRc ch 1\\n\"\n    \"tvR th 1\\n\"\n    \"dNx de 1\\n\"\n    \"jWq qu 1\\n\"\n    \"nRw an 1\\n\"\n    \"rGb er 1\\n\"\n    \"vZz sz 1\\n\"\n    \"Xtz th 1\\n\"\n    \"kZn an 1\\n\"\n    \"Vmj ij 1\\n\"\n    \"dMp de 1\\n\"\n    \"cPy ch 1\\n\"\n    \"uzR qu 1\\n\"\n    \"yjE ij 1\\n\"\n    \"gzF ng 1\\n\"\n    \"tCp th 1\\n\"\n    \"qfC qu 1\\n\"\n    \"vcq ch 1\\n\"\n    \"Zfg ng 1\\n\"\n    \"kwC ka 1\\n\"\n    \"fkM ko 1\\n\"\n    \"vJh th 1\\n\"\n    \"eCq qu 1\\n\"\n    \"wPp pr 1\\n\"\n    \"qJy qu 1\\n\"\n    \"dmY de 1\\n\"\n    \"uMj qu 1\\n\"\n    \"fKh th 1\\n\"\n    \"sqU qu 1\\n\"\n    \"vNp va 1\\n\"\n    \"Crj er 1\\n\"\n    \"hsH th 1\\n\"\n    \"Vwn an 1\\n\"\n    \"Sdy de 1\\n\"\n    \"Fpw pr 1\\n\"\n    \"Wcq ch 1\\n\"\n    \"pjW ij 1\\n\"\n    \"dwW de 1\\n\"\n    \"gjX ng 1\\n\"\n    \"yZk ka 1\\n\"\n    \"cKg ch 1\\n\"\n    \"xdR de 1\\n\"\n    \"wqW qu 1\\n\"\n    \"khD th 1\\n\"\n    \"vgG ng 1\\n\"\n    \"vMl le 1\\n\"\n    \"qnQ an 1\\n\"\n    \"hJt th 1\\n\"\n    \"fvC va 1\\n\"\n    \"cpR ch 1\\n\"\n    \"Wtt th 1\\n\"\n    \"uyX qu 1\\n\"\n    \"cXf ch 1\\n\"\n    \"uKv qu 1\\n\"\n    \"gVv ng 1\\n\"\n    \"xzg ng 1\\n\"\n    \"cPq ch 1\\n\"\n    \"fTn an 1\\n\"\n    \"sFj st 1\\n\"\n    \"mzX sz 1\\n\"\n    \"gMq qu 1\\n\"\n    \"rxI er 1\\n\"\n    \"eYf er 1\\n\"\n    \"kwB ka 1\\n\"\n    \"eQk er 1\\n\"\n    \"jBq qu 1\\n\"\n    \"lbH le 1\\n\"\n    \"qCt th 1\\n\"\n    \"Wnv an 1\\n\"\n    \"gYd ng 1\\n\"\n    \"Zxe er 1\\n\"\n    \"fZj ij 1\\n\"\n    \"Hgj ng 1\\n\"\n    \"bRj ij 1\\n\"\n    \"fpR pr 1\\n\"\n    \"cbR ch 1\\n\"\n    \"lqT qu 1\\n\"\n    \"cMt th 1\\n\"\n    \"tQy to 1\\n\"\n    \"vxG va 1\\n\"\n    \"gpB ng 
1\\n\"\n    \"Gkw ka 1\\n\"\n    \"zqX qu 1\\n\"\n    \"tPw th 1\\n\"\n    \"fnN an 1\\n\"\n    \"Gkp ka 1\\n\"\n    \"mvQ va 1\\n\"\n    \"hHf th 1\\n\"\n    \"wfS wa 1\\n\"\n    \"qCx qu 1\\n\"\n    \"mqH qu 1\\n\"\n    \"hgR th 1\\n\"\n    \"Mwg ng 1\\n\"\n    \"bqQ qu 1\\n\"\n    \"Fkz sz 1\\n\"\n    \"oFv on 1\\n\"\n    \"Ddq qu 1\\n\"\n    \"uIo qu 1\\n\"\n    \"Yfh th 1\\n\"\n    \"ygQ ng 1\\n\"\n    \"fxh th 1\\n\"\n    \"Zqd qu 1\\n\"\n    \"Htn th 1\\n\"\n    \"Gvz sz 1\\n\"\n    \"zRw sz 1\\n\"\n    \"vCb va 1\\n\"\n    \"rjT ro 1\\n\"\n    \"rjD er 1\\n\"\n    \"Qpm me 1\\n\"\n    \"Xdb de 1\\n\"\n    \"Lkf ka 1\\n\"\n    \"Ajx ij 1\\n\"\n    \"Ylz le 1\\n\"\n    \"Qtb th 1\\n\"\n    \"bHz sz 1\\n\"\n    \"bDg ng 1\\n\"\n    \"Lqx qu 1\\n\"\n    \"yhW th 1\\n\"\n    \"zLv sz 1\\n\"\n    \"xgK ng 1\\n\"\n    \"eWq qu 1\\n\"\n    \"sjS st 1\\n\"\n    \"qVe qu 1\\n\"\n    \"Okq qu 1\\n\"\n    \"Ewj ij 1\\n\"\n    \"Dsv st 1\\n\"\n    \"jhI th 1\\n\"\n    \"xGf fo 1\\n\"\n    \"Okx ka 1\\n\"\n    \"Fqx qu 1\\n\"\n    \"dPv de 1\\n\"\n    \"zsK st 1\\n\"\n    \"qLn an 1\\n\"\n    \"fkB ka 1\\n\"\n    \"cCb ch 1\\n\"\n    \"gNp ng 1\\n\"\n    \"Qwd de 1\\n\"\n    \"zTf sz 1\\n\"\n    \"Pqq qu 1\\n\"\n    \"rFv ro 1\\n\"\n    \"Rwt th 1\\n\"\n    \"uKc ch 1\\n\"\n    \"hqN th 1\\n\"\n    \"kmK ka 1\\n\"\n    \"wuC qu 1\\n\"\n    \"pnZ an 1\\n\"\n    \"tgM th 1\\n\"\n    \"Qds st 1\\n\"\n    \"Axq qu 1\\n\"\n    \"xwO wa 1\\n\"\n    \"eQg ng 1\\n\"\n    \"mFj ij 1\\n\"\n    \"Dpm me 1\\n\"\n    \"pQm me 1\\n\"\n    \"aFp an 1\\n\"\n    \"mfB me 1\\n\"\n    \"fpA pr 1\\n\"\n    \"jgZ ng 1\\n\"\n    \"lGk le 1\\n\"\n    \"xcA ch 1\\n\"\n    \"gWw ng 1\\n\"\n    \"lzF le 1\\n\"\n    \"xsQ st 1\\n\"\n    \"bQx be 1\\n\"\n    \"wjc ch 1\\n\"\n    \"bDc ch 1\\n\"\n    \"Wpz sz 1\\n\"\n    \"rfV er 1\\n\"\n    \"Zbs st 1\\n\"\n    \"hKq th 1\\n\"\n    \"qXa ar 1\\n\"\n    \"wjA ij 1\\n\"\n    \"vzS sz 1\\n\"\n    \"cWy ch 1\\n\"\n    \"gjK ng 1\\n\"\n    \"yRb be 
1\\n\"\n    \"qgU qu 1\\n\"\n    \"pqF qu 1\\n\"\n    \"qnU an 1\\n\"\n    \"Zqc ch 1\\n\"\n    \"Xqg qu 1\\n\"\n    \"zLq qu 1\\n\"\n    \"gzV ng 1\\n\"\n    \"Kqs qu 1\\n\"\n    \"zgZ ng 1\\n\"\n    \"jqG qu 1\\n\"\n    \"pqJ qu 1\\n\"\n    \"Ieq qu 1\\n\"\n    \"hjH th 1\\n\"\n    \"vmN va 1\\n\"\n    \"iuF qu 1\\n\"\n    \"wGy wa 1\\n\"\n    \"Kdh th 1\\n\"\n    \"hQb th 1\\n\"\n    \"jWr er 1\\n\"\n    \"Cxy ny 1\\n\"\n    \"Kqz qu 1\\n\"\n    \"wXr er 1\\n\"\n    \"xoQ on 1\\n\"\n    \"wBh th 1\\n\"\n    \"qyI qu 1\\n\"\n    \"qhC th 1\\n\"\n    \"Vpy pr 1\\n\"\n    \"nJb an 1\\n\"\n    \"uGw qu 1\\n\"\n    \"hhX th 1\\n\"\n    \"mjS ij 1\\n\"\n    \"Scv ch 1\\n\"\n    \"hFw th 1\\n\"\n    \"bKg ng 1\\n\"\n    \"Xmn an 1\\n\"\n    \"bdT de 1\\n\"\n    \"sJq qu 1\\n\"\n    \"xTm me 1\\n\"\n    \"qjz qu 1\\n\"\n    \"Mqp qu 1\\n\"\n    \"dHp de 1\\n\"\n    \"rRn ar 1\\n\"\n    \"Xlf le 1\\n\"\n    \"cNs ch 1\\n\"\n    \"Xql qu 1\\n\"\n    \"iFz in 1\\n\"\n    \"Nlk le 1\\n\"\n    \"sPw st 1\\n\"\n    \"vWq qu 1\\n\"\n    \"wXt th 1\\n\"\n    \"Fnq an 1\\n\"\n    \"ozJ on 1\\n\"\n    \"zIg ng 1\\n\"\n    \"lSf le 1\\n\"\n    \"wRc ch 1\\n\"\n    \"Bvp va 1\\n\"\n    \"Wwr er 1\\n\"\n    \"pWg pr 1\\n\"\n    \"pLk ka 1\\n\"\n    \"krJ er 1\\n\"\n    \"Zfv va 1\\n\"\n    \"yIx ny 1\\n\"\n    \"oKx on 1\\n\"\n    \"qLb qu 1\\n\"\n    \"dHj de 1\\n\"\n    \"oqK qu 1\\n\"\n    \"cxC ch 1\\n\"\n    \"wJh th 1\\n\"\n    \"wZd de 1\\n\"\n    \"cWz ch 1\\n\"\n    \"yqS qu 1\\n\"\n    \"kXq qu 1\\n\"\n    \"fYd de 1\\n\"\n    \"dGy de 1\\n\"\n    \"dDt th 1\\n\"\n    \"pKg ng 1\\n\"\n    \"Xjd de 1\\n\"\n    \"sjM st 1\\n\"\n    \"sfC st 1\\n\"\n    \"dMh th 1\\n\"\n    \"dZp de 1\\n\"\n    \"wcD ch 1\\n\"\n    \"Qoj on 1\\n\"\n    \"gxC ng 1\\n\"\n    \"Zfn an 1\\n\"\n    \"hYv th 1\\n\"\n    \"xWq qu 1\\n\"\n    \"gZw ng 1\\n\"\n    \"pQi in 1\\n\"\n    \"Xlb le 1\\n\"\n    \"gQz ng 1\\n\"\n    \"nbZ an 1\\n\"\n    \"Ezx sz 1\\n\"\n    \"wNg ng 1\\n\"\n    \"Xrj er 
1\\n\"\n    \"cxX ch 1\\n\"\n    \"dQp de 1\\n\"\n    \"Ypn an 1\\n\"\n    \"pNp pr 1\\n\"\n    \"pbQ pr 1\\n\"\n    \"gMv ng 1\\n\"\n    \"qeF qu 1\\n\"\n    \"uVv qu 1\\n\"\n    \"dVk de 1\\n\"\n    \"uMv qu 1\\n\"\n    \"jQn an 1\\n\"\n    \"mhP th 1\\n\"\n    \"iTb in 1\\n\"\n    \"Pvw va 1\\n\"\n    \"zCw sz 1\\n\"\n    \"wcR ch 1\\n\"\n    \"svU st 1\\n\"\n    \"nMz an 1\\n\"\n    \"cjE ch 1\\n\"\n    \"jmH ij 1\\n\"\n    \"Qzc ch 1\\n\"\n    \"mqc ch 1\\n\"\n    \"qlU qu 1\\n\"\n    \"Zvp va 1\\n\"\n    \"xHl le 1\\n\"\n    \"gqB qu 1\\n\"\n    \"xsN st 1\\n\"\n    \"kCj ij 1\\n\"\n    \"Olx le 1\\n\"\n    \"Gxw wa 1\\n\"\n    \"xwV wa 1\\n\"\n    \"fPb be 1\\n\"\n    \"Rhv th 1\\n\"\n    \"pgV ng 1\\n\"\n    \"Qdp de 1\\n\"\n    \"zFs st 1\\n\"\n    \"klQ le 1\\n\"\n    \"yJd de 1\\n\"\n    \"rxE er 1\\n\"\n    \"uHv qu 1\\n\"\n    \"wKl le 1\\n\"\n    \"wpJ pr 1\\n\"\n    \"Cjr er 1\\n\"\n    \"tYg th 1\\n\"\n    \"Vpz sz 1\\n\"\n    \"Zxh th 1\\n\"\n    \"pQl le 1\\n\"\n    \"Fxe er 1\\n\"\n    \"Qok on 1\\n\"\n    \"plK le 1\\n\"\n    \"lpX le 1\\n\"\n    \"jdP de 1\\n\"\n    \"Zqy qu 1\\n\"\n    \"yRz sz 1\\n\"\n    \"nDg an 1\\n\"\n    \"kqL qu 1\\n\"\n    \"ugW qu 1\\n\"\n    \"Mbf be 1\\n\"\n    \"Kql qu 1\\n\"\n    \"Nqw qu 1\\n\"\n    \"Jzw sz 1\\n\"\n    \"sGn an 1\\n\"\n    \"wDv va 1\\n\"\n    \"Jjk ij 1\\n\"\n    \"ztQ th 1\\n\"\n    \"hwP th 1\\n\"\n    \"wDp pr 1\\n\"\n    \"gfG ng 1\\n\"\n    \"qhL th 1\\n\"\n    \"cUv ch 1\\n\"\n    \"Wbk ka 1\\n\"\n    \"fkF ko 1\\n\"\n    \"Pqv qu 1\\n\"\n    \"nbK an 1\\n\"\n    \"qSz qu 1\\n\"\n    \"vwI va 1\\n\"\n    \"cFc ch 1\\n\"\n    \"qfG qu 1\\n\"\n    \"rhF th 1\\n\"\n    \"xzl le 1\\n\"\n    \"dNc ch 1\\n\"\n    \"zwR sz 1\\n\"\n    \"wzK sz 1\\n\"\n    \"bQa an 1\\n\"\n    \"hLq th 1\\n\"\n    \"fUv va 1\\n\"\n    \"rHg ng 1\\n\"\n    \"uJj qu 1\\n\"\n    \"Fhz th 1\\n\"\n    \"Nzm sz 1\\n\"\n    \"gRz ng 1\\n\"\n    \"qXf qu 1\\n\"\n    \"Tzm sz 1\\n\"\n    \"Zkx ka 1\\n\"\n    \"hLx th 
1\\n\"\n    \"Ukd de 1\\n\"\n    \"fMf fo 1\\n\"\n    \"vGp va 1\\n\"\n    \"jtI th 1\\n\"\n    \"hxE th 1\\n\"\n    \"jrH er 1\\n\"\n    \"Fgh th 1\\n\"\n    \"dlF le 1\\n\"\n    \"jcO ja 1\\n\"\n    \"sCw st 1\\n\"\n    \"Bqh th 1\\n\"\n    \"kZy ka 1\\n\"\n    \"fOh th 1\\n\"\n    \"rJb er 1\\n\"\n    \"rjV er 1\\n\"\n    \"Kwq qu 1\\n\"\n    \"Hcw ch 1\\n\"\n    \"mCw ma 1\\n\"\n    \"hxM th 1\\n\"\n    \"jTb ij 1\\n\"\n    \"mmQ me 1\\n\"\n    \"pjR ij 1\\n\"\n    \"cdP ch 1\\n\"\n    \"Zjs st 1\\n\"\n    \"jqF qu 1\\n\"\n    \"vMn an 1\\n\"\n    \"Mqs qu 1\\n\"\n    \"svX st 1\\n\"\n    \"iXn an 1\\n\"\n    \"nwR an 1\\n\"\n    \"ytR th 1\\n\"\n    \"Vjb ij 1\\n\"\n    \"Cjl le 1\\n\"\n    \"pXd de 1\\n\"\n    \"Gwu qu 1\\n\"\n    \"qIj qu 1\\n\"\n    \"kQn an 1\\n\"\n    \"fYm me 1\\n\"\n    \"vtZ th 1\\n\"\n    \"Usx st 1\\n\"\n    \"nfP an 1\\n\"\n    \"dQx de 1\\n\"\n    \"oXf on 1\\n\"\n    \"fEw wa 1\\n\"\n    \"sgX ng 1\\n\"\n    \"cPp ch 1\\n\"\n    \"ybW be 1\\n\"\n    \"kcW ch 1\\n\"\n    \"kHf ka 1\\n\"\n    \"vcU ch 1\\n\"\n    \"tXo th 1\\n\"\n    \"Kzh th 1\\n\"\n    \"Cfq qu 1\\n\"\n    \"Ujy ij 1\\n\"\n    \"Fxa an 1\\n\"\n    \"hxS th 1\\n\"\n    \"tWx th 1\\n\"\n    \"mlK le 1\\n\"\n    \"nZj an 1\\n\"\n    \"qOv qu 1\\n\"\n    \"Xkt th 1\\n\"\n    \"Fzf sz 1\\n\"\n    \"uTd qu 1\\n\"\n    \"qrS qu 1\\n\"\n    \"Ptw th 1\\n\"\n    \"dDs st 1\\n\"\n    \"rNm er 1\\n\"\n    \"Ewf wa 1\\n\"\n    \"hJk th 1\\n\"\n    \"Hdq qu 1\\n\"\n    \"Jtw th 1\\n\"\n    \"kqc ch 1\\n\"\n    \"nHq an 1\\n\"\n    \"rhH th 1\\n\"\n    \"oqH qu 1\\n\"\n    \"vpZ va 1\\n\"\n    \"Dgd ng 1\\n\"\n    \"qxV qu 1\\n\"\n    \"Cxv va 1\\n\"\n    \"plV pr 1\\n\"\n    \"kIi in 1\\n\"\n    \"Khc th 1\\n\"\n    \"jsY st 1\\n\"\n    \"fLh th 1\\n\"\n    \"Ykq qu 1\\n\"\n    \"Qmx me 1\\n\"\n    \"zvI sz 1\\n\"\n    \"yhS th 1\\n\"\n    \"qfg qu 1\\n\"\n    \"wxZ wa 1\\n\"\n    \"jVy ij 1\\n\"\n    \"kQw ka 1\\n\"\n    \"zXv sz 1\\n\"\n    \"Lhs th 1\\n\"\n    \"Mkq qu 
1\\n\"\n    \"jkU ij 1\\n\"\n    \"Yhq th 1\\n\"\n    \"zrH er 1\\n\"\n    \"vhG va 1\\n\"\n    \"drD er 1\\n\"\n    \"Psj st 1\\n\"\n    \"gDf ng 1\\n\"\n    \"Xjj ij 1\\n\"\n    \"pLm me 1\\n\"\n    \"klC le 1\\n\"\n    \"hTx th 1\\n\"\n    \"zrJ er 1\\n\"\n    \"Xgk ng 1\\n\"\n    \"Wxf fo 1\\n\"\n    \"fdD de 1\\n\"\n    \"jHp ij 1\\n\"\n    \"yDw wa 1\\n\"\n    \"kPv ka 1\\n\"\n    \"Rkm ka 1\\n\"\n    \"mzg ng 1\\n\"\n    \"lHz le 1\\n\"\n    \"vpR va 1\\n\"\n    \"wZt th 1\\n\"\n    \"pBd de 1\\n\"\n    \"qPf qu 1\\n\"\n    \"hNw th 1\\n\"\n    \"Nvj ij 1\\n\"\n    \"pyU pr 1\\n\"\n    \"Sjh th 1\\n\"\n    \"Kzx sz 1\\n\"\n    \"oQp on 1\\n\"\n    \"xdL de 1\\n\"\n    \"dnZ an 1\\n\"\n    \"qfB qu 1\\n\"\n    \"kJc ch 1\\n\"\n    \"fWn an 1\\n\"\n    \"Xmc ch 1\\n\"\n    \"rGx er 1\\n\"\n    \"sFf st 1\\n\"\n    \"Vwv va 1\\n\"\n    \"tKd th 1\\n\"\n    \"sQx st 1\\n\"\n    \"oNm on 1\\n\"\n    \"uXj qu 1\\n\"\n    \"Xsq qu 1\\n\"\n    \"yWc ch 1\\n\"\n    \"hfC th 1\\n\"\n    \"Ijd de 1\\n\"\n    \"dkW de 1\\n\"\n    \"Nxn an 1\\n\"\n    \"juC qu 1\\n\"\n    \"bPy be 1\\n\"\n    \"lKs le 1\\n\"\n    \"aLq an 1\\n\"\n    \"jPp ij 1\\n\"\n    \"wpZ pr 1\\n\"\n    \"fjE ij 1\\n\"\n    \"zNt th 1\\n\"\n    \"mhN th 1\\n\"\n    \"bQn an 1\\n\"\n    \"bxB be 1\\n\"\n    \"fdX de 1\\n\"\n    \"Jcv va 1\\n\"\n    \"Fdp de 1\\n\"\n    \"wVx wa 1\\n\"\n    \"tmU th 1\\n\"\n    \"njJ an 1\\n\"\n    \"qzK qu 1\\n\"\n    \"jtD th 1\\n\"\n    \"bcX ch 1\\n\"\n    \"Ghx th 1\\n\"\n    \"xZj ij 1\\n\"\n    \"vKw va 1\\n\"\n    \"pvO va 1\\n\"\n    \"gXs ng 1\\n\"\n    \"wRv va 1\\n\"\n    \"hgN th 1\\n\"\n    \"gpO ng 1\\n\"\n    \"hWc th 1\\n\"\n    \"Upq qu 1\\n\"\n    \"vwD va 1\\n\"\n    \"mxE me 1\\n\"\n    \"Zvm va 1\\n\"\n    \"ozM on 1\\n\"\n    \"fbJ be 1\\n\"\n    \"tpQ th 1\\n\"\n    \"yeV er 1\\n\"\n    \"Znb an 1\\n\"\n    \"wXv va 1\\n\"\n    \"bcY ch 1\\n\"\n    \"sgZ ng 1\\n\"\n    \"qfM qu 1\\n\"\n    \"fcL ch 1\\n\"\n    \"mXl le 1\\n\"\n    \"uBq qu 
1\\n\"\n    \"jxW ij 1\\n\"\n    \"mtU th 1\\n\"\n    \"qgJ qu 1\\n\"\n    \"dAq qu 1\\n\"\n    \"jBv ij 1\\n\"\n    \"Gty th 1\\n\"\n    \"Jfm me 1\\n\"\n    \"xqQ qu 1\\n\"\n    \"cBp ch 1\\n\"\n    \"Xqd qu 1\\n\"\n    \"fvM va 1\\n\"\n    \"uWm qu 1\\n\"\n    \"rSb er 1\\n\"\n    \"Xqj qu 1\\n\"\n    \"qTd qu 1\\n\"\n    \"lLg ng 1\\n\"\n    \"Jrp er 1\\n\"\n    \"oJb on 1\\n\"\n    \"pXy pr 1\\n\"\n    \"zrQ er 1\\n\"\n    \"cnT ch 1\\n\"\n    \"qsE qu 1\\n\"\n    \"pZc ch 1\\n\"\n    \"bVy be 1\\n\"\n    \"qIz qu 1\\n\"\n    \"dgR ng 1\\n\"\n    \"mLv va 1\\n\"\n    \"hVl th 1\\n\"\n    \"qRj qu 1\\n\"\n    \"fhA th 1\\n\"\n    \"zLc ch 1\\n\"\n    \"Sgq qu 1\\n\"\n    \"pLc ch 1\\n\"\n    \"Txq qu 1\\n\"\n    \"ypY pr 1\\n\"\n    \"tXz th 1\\n\"\n    \"dcC ch 1\\n\"\n    \"iYf in 1\\n\"\n    \"Wwm me 1\\n\"\n    \"kZk ka 1\\n\"\n    \"Ywr er 1\\n\"\n    \"gFv ng 1\\n\"\n    \"Fmz sz 1\\n\"\n    \"uQq qu 1\\n\"\n    \"xwR wa 1\\n\"\n    \"Yfc ch 1\\n\"\n    \"aIo an 1\\n\"\n    \"sBq qu 1\\n\"\n    \"Gzb sz 1\\n\"\n    \"jwI ij 1\\n\"\n    \"cFf ch 1\\n\"\n    \"aWv an 1\\n\"\n    \"Eaw an 1\\n\"\n    \"vkW ka 1\\n\"\n    \"Nfh th 1\\n\"\n    \"flN le 1\\n\"\n    \"Lpm me 1\\n\"\n    \"ylK le 1\\n\"\n    \"Znr an 1\\n\"\n    \"mcQ ch 1\\n\"\n    \"kfE ka 1\\n\"\n    \"Iyf ny 1\\n\"\n    \"qrV qu 1\\n\"\n    \"fPx fo 1\\n\"\n    \"fgJ ng 1\\n\"\n    \"jIi in 1\\n\"\n    \"bPw wa 1\\n\"\n    \"Qyx ny 1\\n\"\n    \"Qnb an 1\\n\"\n    \"Wdm de 1\\n\"\n    \"nJt th 1\\n\"\n    \"qCd qu 1\\n\"\n    \"gZl ng 1\\n\"\n    \"Nlz le 1\\n\"\n    \"Zwh th 1\\n\"\n    \"iWl in 1\\n\"\n    \"bUu qu 1\\n\"\n    \"lbJ le 1\\n\"\n    \"sNq qu 1\\n\"\n    \"qjU qu 1\\n\"\n    \"wbT wa 1\\n\"\n    \"yNc ch 1\\n\"\n    \"mxM me 1\\n\"\n    \"pHk ka 1\\n\"\n    \"Rdq qu 1\\n\"\n    \"gkE ng 1\\n\"\n    \"hbN th 1\\n\"\n    \"Tgq qu 1\\n\"\n    \"gjV ng 1\\n\"\n    \"Gjw ij 1\\n\"\n    \"gqX qu 1\\n\"\n    \"qXx qu 1\\n\"\n    \"vQq qu 1\\n\"\n    \"pNb pr 1\\n\"\n    \"fJy ny 
1\\n\"\n    \"yvZ va 1\\n\"\n    \"zNl le 1\\n\"\n    \"zDb sz 1\\n\"\n    \"lUz le 1\\n\"\n    \"Dxy ny 1\\n\"\n    \"Wwn an 1\\n\"\n    \"hPn th 1\\n\"\n    \"kNb ko 1\\n\"\n    \"Wdb de 1\\n\"\n    \"zXt th 1\\n\"\n    \"pjL ij 1\\n\"\n    \"tJg th 1\\n\"\n    \"jmM ij 1\\n\"\n    \"bXg ng 1\\n\"\n    \"hTv th 1\\n\"\n    \"Ysf st 1\\n\"\n    \"hmQ th 1\\n\"\n    \"Vyq qu 1\\n\"\n    \"Fpd de 1\\n\"\n    \"yQw wa 1\\n\"\n    \"Pbn an 1\\n\"\n    \"xVj ij 1\\n\"\n    \"whP th 1\\n\"\n    \"fSg ng 1\\n\"\n    \"Gxz ze 1\\n\"\n    \"Dfw wa 1\\n\"\n    \"rMx er 1\\n\"\n    \"zMf sz 1\\n\"\n    \"vJw va 1\\n\"\n    \"xJl le 1\\n\"\n    \"xfN fo 1\\n\"\n    \"dQw de 1\\n\"\n    \"fuD qu 1\\n\"\n    \"xjB ij 1\\n\"\n    \"lPj le 1\\n\"\n    \"mqA qu 1\\n\"\n    \"mfM me 1\\n\"\n    \"kwG ka 1\\n\"\n    \"eaY an 1\\n\"\n    \"Vmm me 1\\n\"\n    \"zfS sz 1\\n\"\n    \"Fmy me 1\\n\"\n    \"sqP qu 1\\n\"\n    \"fKk ka 1\\n\"\n    \"Qdv de 1\\n\"\n    \"djZ de 1\\n\"\n    \"qrR qu 1\\n\"\n    \"txK th 1\\n\"\n    \"bxH be 1\\n\"\n    \"jRb ij 1\\n\"\n    \"cjD ch 1\\n\"\n    \"Sxw wa 1\\n\"\n    \"Sxh th 1\\n\"\n    \"vrZ er 1\\n\"\n    \"xmH me 1\\n\"\n    \"dfH de 1\\n\"\n    \"fJw wa 1\\n\"\n    \"mwZ me 1\\n\"\n    \"vRm va 1\\n\"\n    \"xwj ij 1\\n\"\n    \"Xqr er 1\\n\"\n    \"Gvj ij 1\\n\"\n    \"hzF th 1\\n\"\n    \"xnK an 1\\n\"\n    \"xhU th 1\\n\"\n    \"Nls le 1\\n\"\n    \"zbV sz 1\\n\"\n    \"fTq qu 1\\n\"\n    \"Wxv va 1\\n\"\n    \"upG qu 1\\n\"\n    \"qAo qu 1\\n\"\n    \"kKx ka 1\\n\"\n    \"zlD le 1\\n\"\n    \"hTl th 1\\n\"\n    \"Gqr qu 1\\n\"\n    \"Gxm me 1\\n\"\n    \"zPj sz 1\\n\"\n    \"bvZ va 1\\n\"\n    \"jHc ch 1\\n\"\n    \"iXg ng 1\\n\"\n    \"Kgz ng 1\\n\"\n    \"Jyi in 1\\n\"\n    \"vFh th 1\\n\"\n    \"ytW th 1\\n\"\n    \"qBd qu 1\\n\"\n    \"Xjq qu 1\\n\"\n    \"dgO ng 1\\n\"\n    \"mjN ij 1\\n\"\n    \"Djg ng 1\\n\"\n    \"zIj sz 1\\n\"\n    \"uDx qu 1\\n\"\n    \"qJf qu 1\\n\"\n    \"fAx fo 1\\n\"\n    \"Fsj st 1\\n\"\n    \"yDf ny 
1\\n\"\n    \"xjV ij 1\\n\"\n    \"hdB th 1\\n\"\n    \"dwG de 1\\n\"\n    \"slW le 1\\n\"\n    \"zYb sz 1\\n\"\n    \"vzO sz 1\\n\"\n    \"vqO qu 1\\n\"\n    \"Jzv sz 1\\n\"\n    \"xmG me 1\\n\"\n    \"Kdw de 1\\n\"\n    \"xVq qu 1\\n\"\n    \"jtE th 1\\n\"\n    \"kJy ka 1\\n\"\n    \"xjW ij 1\\n\"\n    \"mwR me 1\\n\"\n    \"zVx sz 1\\n\"\n    \"tMj th 1\\n\"\n    \"qqb qu 1\\n\"\n    \"nlQ le 1\\n\"\n    \"bxQ be 1\\n\"\n    \"hJv th 1\\n\"\n    \"jnY an 1\\n\"\n    \"yfS ny 1\\n\"\n    \"Mdw de 1\\n\"\n    \"zZc ch 1\\n\"\n    \"ysJ st 1\\n\"\n    \"Qqv qu 1\\n\"\n    \"zxl le 1\\n\"\n    \"jAq qu 1\\n\"\n    \"lJw le 1\\n\"\n    \"kwJ ka 1\\n\"\n    \"sxC st 1\\n\"\n    \"hJr th 1\\n\"\n    \"xGp pr 1\\n\"\n    \"ccF ch 1\\n\"\n    \"vGq qu 1\\n\"\n    \"qSc ch 1\\n\"\n    \"fqq qu 1\\n\"\n    \"kkV ka 1\\n\"\n    \"gVq qu 1\\n\"\n    \"Wqg qu 1\\n\"\n    \"kJp ka 1\\n\"\n    \"Wlr er 1\\n\"\n    \"Jwz sz 1\\n\"\n    \"qEa an 1\\n\"\n    \"krL er 1\\n\"\n    \"tqE th 1\\n\"\n    \"eJz er 1\\n\"\n    \"Whx th 1\\n\"\n    \"vWw va 1\\n\"\n    \"Qzh th 1\\n\"\n    \"pcF ch 1\\n\"\n    \"Vmx me 1\\n\"\n    \"dvC de 1\\n\"\n    \"qjZ qu 1\\n\"\n    \"pkF ka 1\\n\"\n    \"cvO ch 1\\n\"\n    \"Qyv va 1\\n\"\n    \"hNs th 1\\n\"\n    \"snJ an 1\\n\"\n    \"yjU ij 1\\n\"\n    \"Yfq qu 1\\n\"\n    \"xLw wa 1\\n\"\n    \"rVz er 1\\n\"\n    \"gOw ng 1\\n\"\n    \"fxL fo 1\\n\"\n    \"snW an 1\\n\"\n    \"yWk ka 1\\n\"\n    \"wgK ng 1\\n\"\n    \"aTf an 1\\n\"\n    \"eVf er 1\\n\"\n    \"vZp va 1\\n\"\n    \"uVp qu 1\\n\"\n    \"Vjh th 1\\n\"\n    \"zwT sz 1\\n\"\n    \"wSn an 1\\n\"\n    \"nNp an 1\\n\"\n    \"gfF ng 1\\n\"\n    \"hcW th 1\\n\"\n    \"gTf ng 1\\n\"\n    \"qaJ an 1\\n\"\n    \"kzY sz 1\\n\"\n    \"ljX le 1\\n\"\n    \"wMm me 1\\n\"\n    \"btB st 1\\n\"\n    \"zfE sz 1\\n\"\n    \"bxO be 1\\n\"\n    \"wPc ch 1\\n\"\n    \"fgK ng 1\\n\"\n    \"fzW sz 1\\n\"\n    \"dcX ch 1\\n\"\n    \"qqR qu 1\\n\"\n    \"kjq qu 1\\n\"\n    \"vMh th 1\\n\"\n    \"gZj ng 
1\\n\"\n    \"qtw th 1\\n\"\n    \"vkY ka 1\\n\"\n    \"lCb le 1\\n\"\n    \"dpO de 1\\n\"\n    \"mXm me 1\\n\"\n    \"vWc ch 1\\n\"\n    \"fOq qu 1\\n\"\n    \"Vgy ng 1\\n\"\n    \"dkD de 1\\n\"\n    \"fQh th 1\\n\"\n    \"vIq qu 1\\n\"\n    \"lZr er 1\\n\"\n    \"zKn an 1\\n\"\n    \"Vpt th 1\\n\"\n    \"Dmw me 1\\n\"\n    \"Nwf wa 1\\n\"\n    \"kYl le 1\\n\"\n    \"jpJ ij 1\\n\"\n    \"qXi qu 1\\n\"\n    \"Bnj an 1\\n\"\n    \"xfK fo 1\\n\"\n    \"fCc ch 1\\n\"\n    \"vPd de 1\\n\"\n    \"Qnp an 1\\n\"\n    \"ypW pr 1\\n\"\n    \"uwJ qu 1\\n\"\n    \"Pvb va 1\\n\"\n    \"cnC ch 1\\n\"\n    \"hvA th 1\\n\"\n    \"hGz th 1\\n\"\n    \"nZx an 1\\n\"\n    \"kbS ka 1\\n\"\n    \"Swx wa 1\\n\"\n    \"hvP th 1\\n\"\n    \"kqG qu 1\\n\"\n    \"bLq qu 1\\n\"\n    \"qjP qu 1\\n\"\n    \"sUo on 1\\n\"\n    \"lDq qu 1\\n\"\n    \"Zlp le 1\\n\"\n    \"dwQ de 1\\n\"\n    \"dlN le 1\\n\"\n    \"fTl le 1\\n\"\n    \"Npv va 1\\n\"\n    \"bMn an 1\\n\"\n    \"dNz sz 1\\n\"\n    \"efV er 1\\n\"\n    \"aCw an 1\\n\"\n    \"aWf an 1\\n\"\n    \"Lqo qu 1\\n\"\n    \"fzT sz 1\\n\"\n    \"Jjr er 1\\n\"\n    \"zvK sz 1\\n\"\n    \"nwT an 1\\n\"\n    \"fXr er 1\\n\"\n    \"cGm ch 1\\n\"\n    \"lvS le 1\\n\"\n    \"qDq qu 1\\n\"\n    \"qRm qu 1\\n\"\n    \"vYt th 1\\n\"\n    \"iQv in 1\\n\"\n    \"fkH ka 1\\n\"\n    \"fcO ch 1\\n\"\n    \"rNn an 1\\n\"\n    \"qmS qu 1\\n\"\n    \"kzR sz 1\\n\"\n    \"Dfc ch 1\\n\"\n    \"qUs qu 1\\n\"\n    \"xqP qu 1\\n\"\n    \"sXk st 1\\n\"\n    \"Xyt th 1\\n\"\n    \"pWt th 1\\n\"\n    \"jbL ij 1\\n\"\n    \"jYd ij 1\\n\"\n    \"kqV qu 1\\n\"\n    \"Fqm qu 1\\n\"\n    \"xoX on 1\\n\"\n    \"zuX qu 1\\n\"\n    \"xUq qu 1\\n\"\n    \"cgC ch 1\\n\"\n    \"wBq qu 1\\n\"\n    \"gQp ng 1\\n\"\n    \"jnE an 1\\n\"\n    \"yZs st 1\\n\"\n    \"fkD ka 1\\n\"\n    \"sVk st 1\\n\"\n    \"qyX qu 1\\n\"\n    \"cBf ch 1\\n\"\n    \"Cjy ij 1\\n\"\n    \"dPq qu 1\\n\"\n    \"wDg ng 1\\n\"\n    \"dxB de 1\\n\"\n    \"Dkm ka 1\\n\"\n    \"kPp ka 1\\n\"\n    \"hWz th 
1\\n\"\n    \"Bjv ij 1\\n\"\n    \"Izf sz 1\\n\"\n    \"Hnk an 1\\n\"\n    \"rQc ch 1\\n\"\n    \"Jwu qu 1\\n\"\n    \"fbP be 1\\n\"\n    \"frQ er 1\\n\"\n    \"Aov on 1\\n\"\n    \"yqQ qu 1\\n\"\n    \"jfY ij 1\\n\"\n    \"xsH st 1\\n\"\n    \"zxh th 1\\n\"\n    \"Jbj ij 1\\n\"\n    \"Mjz sz 1\\n\"\n    \"gRp ng 1\\n\"\n    \"Gvw va 1\\n\"\n    \"mzF sz 1\\n\"\n    \"oqF qu 1\\n\"\n    \"ejU er 1\\n\"\n    \"xmQ me 1\\n\"\n    \"hOq th 1\\n\"\n    \"pwX pr 1\\n\"\n    \"zgK ng 1\\n\"\n    \"wLk ka 1\\n\"\n    \"fqc ch 1\\n\"\n    \"dPm de 1\\n\"\n    \"tCg th 1\\n\"\n    \"qrF qu 1\\n\"\n    \"pWl le 1\\n\"\n    \"rDf er 1\\n\"\n    \"Ynw an 1\\n\"\n    \"jnQ an 1\\n\"\n    \"tFb th 1\\n\"\n    \"rpU er 1\\n\"\n    \"pPj ij 1\\n\"\n    \"yjM ij 1\\n\"\n    \"jmY ij 1\\n\"\n    \"Cpz sz 1\\n\"\n    \"uDn an 1\\n\"\n    \"uqY qu 1\\n\"\n    \"Pjx ij 1\\n\"\n    \"qFv qu 1\\n\"\n    \"Ktf th 1\\n\"\n    \"Jcj ch 1\\n\"\n    \"kpO pr 1\\n\"\n    \"pgZ ng 1\\n\"\n    \"kfO ka 1\\n\"\n    \"tZv th 1\\n\"\n    \"jHq qu 1\\n\"\n    \"cRq ch 1\\n\"\n    \"zDm sz 1\\n\"\n    \"lPm le 1\\n\"\n    \"svP st 1\\n\"\n    \"qkx qu 1\\n\"\n    \"bNp pr 1\\n\"\n    \"Kjq qu 1\\n\"\n    \"vqS qu 1\\n\"\n    \"fQp pr 1\\n\"\n    \"txR th 1\\n\"\n    \"Hpf pr 1\\n\"\n    \"iQg ng 1\\n\"\n    \"vvP va 1\\n\"\n    \"iGf in 1\\n\"\n    \"tjI th 1\\n\"\n    \"pWn an 1\\n\"\n    \"Qqg qu 1\\n\"\n    \"qiF ti 1\\n\"\n    \"Zzr er 1\\n\"\n    \"aYf an 1\\n\"\n    \"zjA sz 1\\n\"\n    \"kwR ka 1\\n\"\n    \"gkM ng 1\\n\"\n    \"Cjf ij 1\\n\"\n    \"zgM ng 1\\n\"\n    \"Rxk ka 1\\n\"\n    \"bCd de 1\\n\"\n    \"Ypv va 1\\n\"\n    \"wyE wa 1\\n\"\n    \"iyB in 1\\n\"\n    \"hQp th 1\\n\"\n    \"ipQ in 1\\n\"\n    \"Ucj ch 1\\n\"\n    \"qkW qu 1\\n\"\n    \"krK er 1\\n\"\n    \"Hpp pr 1\\n\"\n    \"xnN an 1\\n\"\n    \"jwB ij 1\\n\"\n    \"Zdm de 1\\n\"\n    \"mYj ij 1\\n\"\n    \"tQx th 1\\n\"\n    \"qwS qu 1\\n\"\n    \"Hxo on 1\\n\"\n    \"qDx qu 1\\n\"\n    \"cXd ch 1\\n\"\n    \"gdO ng 
1\\n\"\n    \"aEo an 1\\n\"\n    \"Twd de 1\\n\"\n    \"avQ an 1\\n\"\n    \"lhZ th 1\\n\"\n    \"lzV le 1\\n\"\n    \"bHf be 1\\n\"\n    \"bJn an 1\\n\"\n    \"Uqz qu 1\\n\"\n    \"uFy qu 1\\n\"\n    \"jNl le 1\\n\"\n    \"xBp pr 1\\n\"\n    \"dRb de 1\\n\"\n    \"nlT an 1\\n\"\n    \"wrO er 1\\n\"\n    \"lzW le 1\\n\"\n    \"fYf fo 1\\n\"\n    \"mRw me 1\\n\"\n    \"rXy er 1\\n\"\n    \"qyR qu 1\\n\"\n    \"fGv va 1\\n\"\n    \"Uwk ka 1\\n\"\n    \"kXm ka 1\\n\"\n    \"hJy th 1\\n\"\n    \"Xgv ng 1\\n\"\n    \"xYv va 1\\n\"\n    \"yYd de 1\\n\"\n    \"xzC sz 1\\n\"\n    \"gjB ng 1\\n\"\n    \"jzI sz 1\\n\"\n    \"zrO er 1\\n\"\n    \"tqF th 1\\n\"\n    \"vwM va 1\\n\"\n    \"zCq qu 1\\n\"\n    \"ljL le 1\\n\"\n    \"vnZ an 1\\n\"\n    \"eDq qu 1\\n\"\n    \"Qvq qu 1\\n\"\n    \"pfL pr 1\\n\"\n    \"iRb in 1\\n\"\n    \"gdR ng 1\\n\"\n    \"qAv qu 1\\n\"\n    \"vnL an 1\\n\"\n    \"mkT ka 1\\n\"\n    \"pVk ka 1\\n\"\n    \"xKh th 1\\n\"\n    \"jNk ij 1\\n\"\n    \"jLt th 1\\n\"\n    \"cNp ch 1\\n\"\n    \"tmP th 1\\n\"\n    \"vVt th 1\\n\"\n    \"qfP qu 1\\n\"\n    \"Uqo qu 1\\n\"\n    \"Dnp an 1\\n\"\n    \"yGb be 1\\n\"\n    \"sHd st 1\\n\"\n    \"pwF pr 1\\n\"\n    \"fPy ny 1\\n\"\n    \"Drq qu 1\\n\"\n    \"bJh th 1\\n\"\n    \"sQp st 1\\n\"\n    \"Iws st 1\\n\"\n    \"uCw qu 1\\n\"\n    \"Lwj ij 1\\n\"\n    \"rFw er 1\\n\"\n    \"sJp st 1\\n\"\n    \"xiI in 1\\n\"\n    \"Rqv qu 1\\n\"\n    \"bkQ ka 1\\n\"\n    \"qNp qu 1\\n\"\n    \"dYl le 1\\n\"\n    \"Vmf me 1\\n\"\n    \"lYc ch 1\\n\"\n    \"oPw on 1\\n\"\n    \"kjO ij 1\\n\"\n    \"mKb me 1\\n\"\n    \"fDf fo 1\\n\"\n    \"fFb be 1\\n\"\n    \"Vhv th 1\\n\"\n    \"Hjq qu 1\\n\"\n    \"qfK qu 1\\n\"\n    \"Kjp ij 1\\n\"\n    \"vTg ng 1\\n\"\n    \"pBq qu 1\\n\"\n    \"Htd th 1\\n\"\n    \"pNd de 1\\n\"\n    \"bQv va 1\\n\"\n    \"aSx an 1\\n\"\n    \"jwx ij 1\\n\"\n    \"Uyx ny 1\\n\"\n    \"wVj ij 1\\n\"\n    \"Ioq qu 1\\n\"\n    \"Nhm th 1\\n\"\n    \"Hqh th 1\\n\"\n    \"rUq qu 1\\n\"\n    \"bBx be 
1\\n\"\n    \"Gqb qu 1\\n\"\n    \"Ccw ch 1\\n\"\n    \"hZw th 1\\n\"\n    \"Qbl le 1\\n\"\n    \"xFv va 1\\n\"\n    \"sZv st 1\\n\"\n    \"qzY qu 1\\n\"\n    \"pDb pr 1\\n\"\n    \"cfR ch 1\\n\"\n    \"rqk qu 1\\n\"\n    \"fzP sz 1\\n\"\n    \"hqO th 1\\n\"\n    \"pzH sz 1\\n\"\n    \"qSj qu 1\\n\"\n    \"pxJ pr 1\\n\"\n    \"xbq qu 1\\n\"\n    \"sXf st 1\\n\"\n    \"ybT be 1\\n\"\n    \"sHn an 1\\n\"\n    \"vTz sz 1\\n\"\n    \"Pgf ng 1\\n\"\n    \"hKw th 1\\n\"\n    \"jPj ij 1\\n\"\n    \"wTx wa 1\\n\"\n    \"jSj ij 1\\n\"\n    \"Fgz ng 1\\n\"\n    \"bKk ka 1\\n\"\n    \"eUj er 1\\n\"\n    \"cDf ch 1\\n\"\n    \"xFg ng 1\\n\"\n    \"cnW an 1\\n\"\n    \"tUy th 1\\n\"\n    \"Jgx ng 1\\n\"\n    \"yuF qu 1\\n\"\n    \"vyQ va 1\\n\"\n    \"xCz sz 1\\n\"\n    \"jRh th 1\\n\"\n    \"cXx ch 1\\n\"\n    \"kGk ka 1\\n\"\n    \"Xnh th 1\\n\"\n    \"qPh th 1\\n\"\n    \"lfZ le 1\\n\"\n    \"qVa an 1\\n\"\n    \"xws st 1\\n\"\n    \"Dzt th 1\\n\"\n    \"xfG fo 1\\n\"\n    \"fXh th 1\\n\"\n    \"jgV ng 1\\n\"\n    \"vJj ij 1\\n\"\n    \"bXj ij 1\\n\"\n    \"cgG ch 1\\n\"\n    \"vuW qu 1\\n\"\n    \"txG th 1\\n\"\n    \"Zxz sz 1\\n\"\n    \"fNc ch 1\\n\"\n    \"oBq qu 1\\n\"\n    \"Wgv ng 1\\n\"\n    \"Hwz sz 1\\n\"\n    \"oaW an 1\\n\"\n    \"vRg ng 1\\n\"\n    \"uXz qu 1\\n\"\n    \"fzQ sz 1\\n\"\n    \"bcB ch 1\\n\"\n    \"Bnw an 1\\n\"\n    \"gvB ng 1\\n\"\n    \"rQm er 1\\n\"\n    \"cvU ch 1\\n\"\n    \"xhR th 1\\n\"\n    \"zxR sz 1\\n\"\n    \"btZ th 1\\n\"\n    \"Kkf ka 1\\n\"\n    \"zJw sz 1\\n\"\n    \"uwq qu 1\\n\"\n    \"pSx pr 1\\n\"\n    \"yRv va 1\\n\"\n    \"nCq an 1\\n\"\n    \"tGv th 1\\n\"\n    \"wgT ng 1\\n\"\n    \"kNz sz 1\\n\"\n    \"oHk on 1\\n\"\n    \"Wzw sz 1\\n\"\n    \"hvU th 1\\n\"\n    \"skX st 1\\n\"\n    \"vYz sz 1\\n\"\n    \"joZ on 1\\n\"\n    \"nGq an 1\\n\"\n    \"qmM qu 1\\n\"\n    \"Bmr er 1\\n\"\n    \"sVg ng 1\\n\"\n    \"uCv qu 1\\n\"\n    \"iXz in 1\\n\"\n    \"vKp va 1\\n\"\n    \"lEw le 1\\n\"\n    \"hhF th 1\\n\"\n    \"iwS in 
1\\n\"\n    \"qyU qu 1\\n\"\n    \"jjY ij 1\\n\"\n    \"Ygm ng 1\\n\"\n    \"wJd de 1\\n\"\n    \"eQp er 1\\n\"\n    \"Yfb be 1\\n\"\n    \"Wpg ng 1\\n\"\n    \"jdS de 1\\n\"\n    \"vmG va 1\\n\"\n    \"mdT de 1\\n\"\n    \"grZ ng 1\\n\"\n    \"yqN qu 1\\n\"\n    \"pBp po 1\\n\"\n    \"fkZ ka 1\\n\"\n    \"qeB qu 1\\n\"\n    \"cGs ch 1\\n\"\n    \"Eqg qu 1\\n\"\n    \"cfO ch 1\\n\"\n    \"uSx qu 1\\n\"\n    \"Dhf th 1\\n\"\n    \"Qjr er 1\\n\"\n    \"xqZ qu 1\\n\"\n    \"yQf ny 1\\n\"\n    \"npY an 1\\n\"\n    \"xDc ch 1\\n\"\n    \"bmQ me 1\\n\"\n    \"kMb ka 1\\n\"\n    \"aqC an 1\\n\"\n    \"jYl le 1\\n\"\n    \"wkD ka 1\\n\"\n    \"cWs ch 1\\n\"\n    \"yyJ ny 1\\n\"\n    \"wvV va 1\\n\"\n    \"lYb le 1\\n\"\n    \"qrW qu 1\\n\"\n    \"bqz qu 1\\n\"\n    \"wjC ij 1\\n\"\n    \"vKy va 1\\n\"\n    \"vjD ij 1\\n\"\n    \"sDs st 1\\n\"\n    \"fKf fo 1\\n\"\n    \"zsT st 1\\n\"\n    \"jYc ch 1\\n\"\n    \"Ywt th 1\\n\"\n    \"Hjw ij 1\\n\"\n    \"wIy wa 1\\n\"\n    \"ffU fo 1\\n\"\n    \"Wnx an 1\\n\"\n    \"eHq qu 1\\n\"\n    \"fWy ny 1\\n\"\n    \"Nwv va 1\\n\"\n    \"ySj ij 1\\n\"\n    \"jfC ij 1\\n\"\n    \"xXq qu 1\\n\"\n    \"grI ng 1\\n\"\n    \"oVf on 1\\n\"\n    \"Vfy ny 1\\n\"\n    \"jgY ng 1\\n\"\n    \"Hjp ij 1\\n\"\n    \"zqC qu 1\\n\"\n    \"qyH qu 1\\n\"\n    \"kcQ ch 1\\n\"\n    \"zsE st 1\\n\"\n    \"pCx pr 1\\n\"\n    \"kwP ka 1\\n\"\n    \"jfQ ij 1\\n\"\n    \"wZg ng 1\\n\"\n    \"Vxm me 1\\n\"\n    \"Jvb va 1\\n\"\n    \"sEw sz 1\\n\"\n    \"jLl le 1\\n\"\n    \"dOx de 1\\n\"\n    \"wpS pr 1\\n\"\n    \"yIo on 1\\n\"\n    \"tGt th 1\\n\"\n    \"vHz sz 1\\n\"\n    \"xGj ij 1\\n\"\n    \"gvQ ng 1\\n\"\n    \"pNr er 1\\n\"\n    \"gqY qu 1\\n\"\n    \"sfK st 1\\n\"\n    \"dYd de 1\\n\"\n    \"sMm st 1\\n\"\n    \"oBx on 1\\n\"\n    \"qsF qu 1\\n\"\n    \"bmI me 1\\n\"\n    \"tmC th 1\\n\"\n    \"wlW le 1\\n\"\n    \"Twg ng 1\\n\"\n    \"srV er 1\\n\"\n    \"rNz er 1\\n\"\n    \"Uuc ch 1\\n\"\n    \"Gjg ng 1\\n\"\n    \"njY an 1\\n\"\n    \"vOh th 
1\\n\"\n    \"Qmh th 1\\n\"\n    \"Fnf an 1\\n\"\n    \"yvY va 1\\n\"\n    \"pGf pr 1\\n\"\n    \"lHp al 1\\n\"\n    \"qgZ qu 1\\n\"\n    \"jbS ij 1\\n\"\n    \"xQi in 1\\n\"\n    \"tqG th 1\\n\"\n    \"nwI an 1\\n\"\n    \"qkY qu 1\\n\"\n    \"Wxy ny 1\\n\"\n    \"hDm th 1\\n\"\n    \"qQe qu 1\\n\"\n    \"iJp in 1\\n\"\n    \"xrN er 1\\n\"\n    \"dGg ng 1\\n\"\n    \"kQx ka 1\\n\"\n    \"Jqg qu 1\\n\"\n    \"hMk th 1\\n\"\n    \"ljT le 1\\n\"\n    \"Xkn an 1\\n\"\n    \"ztq th 1\\n\"\n    \"qNd qu 1\\n\"\n    \"suY qu 1\\n\"\n    \"Uoa an 1\\n\"\n    \"djR de 1\\n\"\n    \"mFf me 1\\n\"\n    \"jzq qu 1\\n\"\n    \"zjR sz 1\\n\"\n    \"Nnl an 1\\n\"\n    \"tJp th 1\\n\"\n    \"gZr ng 1\\n\"\n    \"Bwx wa 1\\n\"\n    \"dWz sz 1\\n\"\n    \"lwM le 1\\n\"\n    \"Iqk qu 1\\n\"\n    \"twZ th 1\\n\"\n    \"Mwt th 1\\n\"\n    \"kjY ij 1\\n\"\n    \"zBv sz 1\\n\"\n    \"iwF in 1\\n\"\n    \"rHz er 1\\n\"\n    \"Sqh th 1\\n\"\n    \"oKq qu 1\\n\"\n    \"qjO qu 1\\n\"\n    \"htQ th 1\\n\"\n    \"cKx ch 1\\n\"\n    \"bqW qu 1\\n\"\n    \"kYh th 1\\n\"\n    \"tBq th 1\\n\"\n    \"gmJ ng 1\\n\"\n    \"eYx er 1\\n\"\n    \"hGv th 1\\n\"\n    \"hQd th 1\\n\"\n    \"pnX an 1\\n\"\n    \"bvJ va 1\\n\"\n    \"sxM st 1\\n\"\n    \"qNt th 1\\n\"\n    \"Wlj le 1\\n\"\n    \"kqD qu 1\\n\"\n    \"qdZ qu 1\\n\"\n    \"mhY th 1\\n\"\n    \"tlC th 1\\n\"\n    \"pqI qu 1\\n\"\n    \"ybD be 1\\n\"\n    \"xAe er 1\\n\"\n    \"pLt th 1\\n\"\n    \"lHb le 1\\n\"\n    \"xVc ch 1\\n\"\n    \"dhN th 1\\n\"\n    \"qxU qu 1\\n\"\n    \"dVf de 1\\n\"\n    \"Zkm ka 1\\n\"\n    \"kpD ka 1\\n\"\n    \"pjH ij 1\\n\"\n    \"yGm me 1\\n\"\n    \"iyP in 1\\n\"\n    \"wmK me 1\\n\"\n    \"mJz sz 1\\n\"\n    \"fmL me 1\\n\"\n    \"cBv ch 1\\n\"\n    \"Vvf va 1\\n\"\n    \"Eql qu 1\\n\"\n    \"ohV th 1\\n\"\n    \"lCx le 1\\n\"\n    \"oWc ch 1\\n\"\n    \"nzX an 1\\n\"\n    \"fIj ij 1\\n\"\n    \"kPt th 1\\n\"\n    \"pYm me 1\\n\"\n    \"zhG th 1\\n\"\n    \"cqN ch 1\\n\"\n    \"umQ qu 1\\n\"\n    \"wXs st 
1\\n\"\n    \"lZj le 1\\n\"\n    \"Sxs st 1\\n\"\n    \"Kqd qu 1\\n\"\n    \"tWc th 1\\n\"\n    \"Kcc ch 1\\n\"\n    \"pvB po 1\\n\"\n    \"tgR th 1\\n\"\n    \"yrN er 1\\n\"\n    \"xQr er 1\\n\"\n    \"Xvz sz 1\\n\"\n    \"lJh th 1\\n\"\n    \"Xfk ka 1\\n\"\n    \"Fvr er 1\\n\"\n    \"fUb be 1\\n\"\n    \"lZb le 1\\n\"\n    \"gdI ng 1\\n\"\n    \"joI on 1\\n\"\n    \"yKq qu 1\\n\"\n    \"twz th 1\\n\"\n    \"qJj qu 1\\n\"\n    \"vxM va 1\\n\"\n    \"Vzs st 1\\n\"\n    \"fjR ij 1\\n\"\n    \"Kmz sz 1\\n\"\n    \"qIw qu 1\\n\"\n    \"jyD ij 1\\n\"\n    \"qbU qu 1\\n\"\n    \"qkZ qu 1\\n\"\n    \"jVg ng 1\\n\"\n    \"Fhj th 1\\n\"\n    \"qJq qu 1\\n\"\n    \"wPq qu 1\\n\"\n    \"Ueo er 1\\n\"\n    \"zXd sz 1\\n\"\n    \"gFb ng 1\\n\"\n    \"jJy ij 1\\n\"\n    \"Nsj st 1\\n\"\n    \"lMb le 1\\n\"\n    \"yQn an 1\\n\"\n    \"dnM an 1\\n\"\n    \"yRg ng 1\\n\"\n    \"Fjc ch 1\\n\"\n    \"dKg ng 1\\n\"\n    \"gqV ng 1\\n\"\n    \"gCk ng 1\\n\"\n    \"sOz st 1\\n\"\n    \"hlO th 1\\n\"\n    \"qbN qu 1\\n\"\n    \"sjN st 1\\n\"\n    \"Ujz sz 1\\n\"\n    \"rVm er 1\\n\"\n    \"Wjs st 1\\n\"\n    \"bmM me 1\\n\"\n    \"Vzx sz 1\\n\"\n    \"hZg th 1\\n\"\n    \"zFt th 1\\n\"\n    \"yhJ th 1\\n\"\n    \"vNk ka 1\\n\"\n    \"zbT sz 1\\n\"\n    \"xmJ me 1\\n\"\n    \"Fcs ch 1\\n\"\n    \"yTc ch 1\\n\"\n    \"cSg ch 1\\n\"\n    \"qmP qu 1\\n\"\n    \"mFz sz 1\\n\"\n    \"bdI de 1\\n\"\n    \"jlK le 1\\n\"\n    \"bnB an 1\\n\"\n    \"qyQ qu 1\\n\"\n    \"Vjk ij 1\\n\"\n    \"hzU th 1\\n\"\n    \"qgp qu 1\\n\"\n    \"lqW qu 1\\n\"\n    \"fNn an 1\\n\"\n    \"Tjp ij 1\\n\"\n    \"vlV le 1\\n\"\n    \"rVp er 1\\n\"\n    \"bLd de 1\\n\"\n    \"ydQ de 1\\n\"\n    \"gYg ng 1\\n\"\n    \"qhE th 1\\n\"\n    \"Gsq qu 1\\n\"\n    \"gWz ng 1\\n\"\n    \"Qtk th 1\\n\"\n    \"Hzw sz 1\\n\"\n    \"kIo ho 1\\n\"\n    \"kfC ka 1\\n\"\n    \"zBg ng 1\\n\"\n    \"jJp ij 1\\n\"\n    \"eIq qu 1\\n\"\n    \"vuB qu 1\\n\"\n    \"Wbg ng 1\\n\"\n    \"Jjp ij 1\\n\"\n    \"lXk le 1\\n\"\n    \"Tfx fo 
1\\n\"\n    \"zLl le 1\\n\"\n    \"dqT qu 1\\n\"\n    \"oZq qu 1\\n\"\n    \"Jfu qu 1\\n\"\n    \"Qhh th 1\\n\"\n    \"qkK qu 1\\n\"\n    \"Ejc ch 1\\n\"\n    \"zwN sz 1\\n\"\n    \"yQq qu 1\\n\"\n    \"dDp de 1\\n\"\n    \"Pww wa 1\\n\"\n    \"ztC th 1\\n\"\n    \"jtH th 1\\n\"\n    \"yrX er 1\\n\"\n    \"vwT va 1\\n\"\n    \"yRh th 1\\n\"\n    \"wQt th 1\\n\"\n    \"lXz le 1\\n\"\n    \"cfL ch 1\\n\"\n    \"Fwl le 1\\n\"\n    \"rNw er 1\\n\"\n    \"Bhx th 1\\n\"\n    \"glZ ng 1\\n\"\n    \"gcD ch 1\\n\"\n    \"Sfs st 1\\n\"\n    \"Uzf sz 1\\n\"\n    \"Tdl le 1\\n\"\n    \"dRn an 1\\n\"\n    \"vYw va 1\\n\"\n    \"xcD ch 1\\n\"\n    \"xcC ch 1\\n\"\n    \"lBx le 1\\n\"\n    \"gHq qu 1\\n\"\n    \"wJy wa 1\\n\"\n    \"yrO er 1\\n\"\n    \"vqF qu 1\\n\"\n    \"tYb th 1\\n\"\n    \"Zjw ij 1\\n\"\n    \"jLk ij 1\\n\"\n    \"Hvf va 1\\n\"\n    \"pnS an 1\\n\"\n    \"pcT ch 1\\n\"\n    \"sFk st 1\\n\"\n    \"dcO ch 1\\n\"\n    \"zPw sz 1\\n\"\n    \"vNf va 1\\n\"\n    \"Gdx de 1\\n\"\n    \"dlP le 1\\n\"\n    \"jLx jo 1\\n\"\n    \"jZj ij 1\\n\"\n    \"wwT wa 1\\n\"\n    \"tGx th 1\\n\"\n    \"fhS th 1\\n\"\n    \"Xtk th 1\\n\"\n    \"xnW on 1\\n\"\n    \"pkJ ka 1\\n\"\n    \"xIo on 1\\n\"\n    \"Zxb be 1\\n\"\n    \"nOj an 1\\n\"\n    \"wHj ij 1\\n\"\n    \"fjS ij 1\\n\"\n    \"wdL de 1\\n\"\n    \"jbN ij 1\\n\"\n    \"ykO ka 1\\n\"\n    \"xqB qu 1\\n\"\n    \"qzN qu 1\\n\"\n    \"Qbq qu 1\\n\"\n    \"Fqw qu 1\\n\"\n    \"jWw ij 1\\n\"\n    \"nxM an 1\\n\"\n    \"tpX th 1\\n\"\n    \"Ttz th 1\\n\"\n    \"zsH st 1\\n\"\n    \"fjz sz 1\\n\"\n    \"xIg ng 1\\n\"\n    \"xkY ka 1\\n\"\n    \"Fqa an 1\\n\"\n    \"oGk on 1\\n\"\n    \"Hnc an 1\\n\"\n    \"jPq qu 1\\n\"\n    \"zlW le 1\\n\"\n    \"uRx qu 1\\n\"\n    \"uGx qu 1\\n\"\n    \"jYv ij 1\\n\"\n    \"Kpz sz 1\\n\"\n    \"gQo ng 1\\n\"\n    \"Kwx wa 1\\n\"\n    \"jNw ij 1\\n\"\n    \"tdD th 1\\n\"\n    \"yGj ij 1\\n\"\n    \"Lbq qu 1\\n\"\n    \"Rrc ch 1\\n\"\n    \"qvX qu 1\\n\"\n    \"hhK th 1\\n\"\n    \"kZx ka 
1\\n\"\n    \"xDf fo 1\\n\"\n    \"Pjf ij 1\\n\"\n    \"cgF ch 1\\n\"\n    \"vCk ka 1\\n\"\n    \"fWw ow 1\\n\"\n    \"mJp me 1\\n\"\n    \"fXe er 1\\n\"\n    \"uYp qu 1\\n\"\n    \"jHk ij 1\\n\"\n    \"wdP de 1\\n\"\n    \"qFk qu 1\\n\"\n    \"jrG er 1\\n\"\n    \"fgD ng 1\\n\"\n    \"fsG st 1\\n\"\n    \"Vgb ng 1\\n\"\n    \"xAa an 1\\n\"\n    \"gtZ th 1\\n\"\n    \"tlq th 1\\n\"\n    \"Tmw me 1\\n\"\n    \"gyY ng 1\\n\"\n    \"Qxt th 1\\n\"\n    \"Uxz sz 1\\n\"\n    \"iVr in 1\\n\"\n    \"zqI qu 1\\n\"\n    \"Nbw wa 1\\n\"\n    \"Dhd th 1\\n\"\n    \"mOq qu 1\\n\"\n    \"iBd in 1\\n\"\n    \"cqB ch 1\\n\"\n    \"zQq qu 1\\n\"\n    \"Wbv va 1\\n\"\n    \"Qks ka 1\\n\"\n    \"qPa an 1\\n\"\n    \"tfI th 1\\n\"\n    \"mZs st 1\\n\"\n    \"pDs st 1\\n\"\n    \"nJj an 1\\n\"\n    \"zcp ch 1\\n\"\n    \"tWj th 1\\n\"\n    \"Zxp pr 1\\n\"\n    \"vPy va 1\\n\"\n    \"dxK de 1\\n\"\n    \"oPv on 1\\n\"\n    \"rjN er 1\\n\"\n    \"oQh th 1\\n\"\n    \"vwH va 1\\n\"\n    \"Qhp th 1\\n\"\n    \"xsU st 1\\n\"\n    \"kGq qu 1\\n\"\n    \"wjW ij 1\\n\"\n    \"Pwx wa 1\\n\"\n    \"Bbn an 1\\n\"\n    \"xOq qu 1\\n\"\n    \"qpN qu 1\\n\"\n    \"nbq an 1\\n\"\n    \"zpM sz 1\\n\"\n    \"jmB ij 1\\n\"\n    \"Nqj qu 1\\n\"\n    \"zYd sz 1\\n\"\n    \"Ybc ch 1\\n\"\n    \"xcW ch 1\\n\"\n    \"gPg ng 1\\n\"\n    \"Qys st 1\\n\"\n    \"Bhq th 1\\n\"\n    \"yGx ny 1\\n\"\n    \"qxL qu 1\\n\"\n    \"Jfd de 1\\n\"\n    \"mbV me 1\\n\"\n    \"pkY ka 1\\n\"\n    \"cWl ch 1\\n\"\n    \"wBg ng 1\\n\"\n    \"vOw va 1\\n\"\n    \"Gpb pr 1\\n\"\n    \"Ppq qu 1\\n\"\n    \"fsX st 1\\n\"\n    \"vtQ th 1\\n\"\n    \"yCj ij 1\\n\"\n    \"yoY on 1\\n\"\n    \"pwQ pr 1\\n\"\n    \"yGd de 1\\n\"\n    \"qtJ th 1\\n\"\n    \"nrZ an 1\\n\"\n    \"eVx er 1\\n\"\n    \"Nrq qu 1\\n\"\n    \"wtA th 1\\n\"\n    \"fHf fo 1\\n\"\n    \"gsQ ng 1\\n\"\n    \"hlC th 1\\n\"\n    \"dLc ch 1\\n\"\n    \"zjC sz 1\\n\"\n    \"jvY ij 1\\n\"\n    \"tIj th 1\\n\"\n    \"pvL va 1\\n\"\n    \"Hhg th 1\\n\"\n    \"yMv va 
1\\n\"\n    \"xMn an 1\\n\"\n    \"tYx th 1\\n\"\n    \"vVp va 1\\n\"\n    \"Ynb an 1\\n\"\n    \"vmX va 1\\n\"\n    \"qjQ qu 1\\n\"\n    \"vQr er 1\\n\"\n    \"hQz th 1\\n\"\n    \"mNf me 1\\n\"\n    \"zfY sz 1\\n\"\n    \"xjS ij 1\\n\"\n    \"jBm ij 1\\n\"\n    \"jpq qu 1\\n\"\n    \"nJq an 1\\n\"\n    \"Knz an 1\\n\"\n    \"gGf ng 1\\n\"\n    \"pZx pr 1\\n\"\n    \"Gql qu 1\\n\"\n    \"Uqm qu 1\\n\"\n    \"eWv er 1\\n\"\n    \"fGg ng 1\\n\"\n    \"qsA qu 1\\n\"\n    \"uhY th 1\\n\"\n    \"xhH th 1\\n\"\n    \"yxS ny 1\\n\"\n    \"rxK er 1\\n\"\n    \"hNc th 1\\n\"\n    \"Vwh th 1\\n\"\n    \"aNv an 1\\n\"\n    \"Qzv sz 1\\n\"\n    \"fQn an 1\\n\"\n    \"jzH sz 1\\n\"\n    \"Rvh th 1\\n\"\n    \"Qpt th 1\\n\"\n    \"qXv qu 1\\n\"\n    \"phQ th 1\\n\"\n    \"Qlb le 1\\n\"\n    \"bnQ an 1\\n\"\n    \"njK an 1\\n\"\n    \"Jjs st 1\\n\"\n    \"tJx th 1\\n\"\n    \"iwX in 1\\n\"\n    \"nVd an 1\\n\"\n    \"kzA sz 1\\n\"\n    \"uwE qu 1\\n\"\n    \"Tsq qu 1\\n\"\n    \"hqM th 1\\n\"\n    \"Rnq an 1\\n\"\n    \"rDn an 1\\n\"\n    \"yNb be 1\\n\"\n    \"uqN qu 1\\n\"\n    \"fKw wa 1\\n\"\n    \"Iqn an 1\\n\"\n    \"xHc ch 1\\n\"\n    \"Wwq qu 1\\n\"\n    \"gMw ng 1\\n\"\n    \"yWf ny 1\\n\"\n    \"vcO ch 1\\n\"\n    \"Gkm ka 1\\n\"\n    \"fRh th 1\\n\"\n    \"dMc nd 1\\n\"\n    \"Zhx th 1\\n\"\n    \"qlH qu 1\\n\"\n    \"qUl qu 1\\n\"\n    \"zHf sz 1\\n\"\n    \"wCk ka 1\\n\"\n    \"Qfj ij 1\\n\"\n    \"Qkw ka 1\\n\"\n    \"mYh th 1\\n\"\n    \"dcU ch 1\\n\"\n    \"jTf ij 1\\n\"\n    \"rjF er 1\\n\"\n    \"hxQ th 1\\n\"\n    \"wNf wa 1\\n\"\n    \"Lgg ng 1\\n\"\n    \"Fdu qu 1\\n\"\n    \"tJw th 1\\n\"\n    \"ycQ ch 1\\n\"\n    \"xXf fo 1\\n\"\n    \"wwQ wa 1\\n\"\n    \"evQ er 1\\n\"\n    \"Fcj ch 1\\n\"\n    \"Cyq qu 1\\n\"\n    \"tpF th 1\\n\"\n    \"Axj ij 1\\n\"\n    \"zGg ng 1\\n\"\n    \"Qbb be 1\\n\"\n    \"vfY va 1\\n\"\n    \"oXd on 1\\n\"\n    \"wAq qu 1\\n\"\n    \"Xbk ka 1\\n\"\n    \"wmR me 1\\n\"\n    \"rzN er 1\\n\"\n    \"fcB ch 1\\n\"\n    \"Bwc ch 
1\\n\"\n    \"xgS ng 1\\n\"\n    \"dQr er 1\\n\"\n    \"kJw ka 1\\n\"\n    \"bgx ng 1\\n\"\n    \"pZs sz 1\\n\"\n    \"wfA wa 1\\n\"\n    \"jmX ij 1\\n\"\n    \"dNp de 1\\n\"\n    \"Vxr er 1\\n\"\n    \"Rvb va 1\\n\"\n    \"wZl le 1\\n\"\n    \"wgA ng 1\\n\"\n    \"Wrq qu 1\\n\"\n    \"Jcq ch 1\\n\"\n    \"ljW le 1\\n\"\n    \"qPt th 1\\n\"\n    \"gjY ng 1\\n\"\n    \"jUo on 1\\n\"\n    \"mIj ij 1\\n\"\n    \"Hpy pr 1\\n\"\n    \"Mpj ij 1\\n\"\n    \"bkO ka 1\\n\"\n    \"Avz sz 1\\n\"\n    \"vKk ka 1\\n\"\n    \"Bfz sz 1\\n\"\n    \"yYj ij 1\\n\"\n    \"Egq qu 1\\n\"\n    \"wxH wa 1\\n\"\n    \"zHh th 1\\n\"\n    \"svA st 1\\n\"\n    \"zcP ch 1\\n\"\n    \"Bxo on 1\\n\"\n    \"hSv th 1\\n\"\n    \"Lxt th 1\\n\"\n    \"hBz th 1\\n\"\n    \"cWk ch 1\\n\"\n    \"xBv va 1\\n\"\n    \"hwN th 1\\n\"\n    \"mkJ ka 1\\n\"\n    \"oNj on 1\\n\"\n    \"Ugq qu 1\\n\"\n    \"jZq qu 1\\n\"\n    \"xfP fo 1\\n\"\n    \"bYv va 1\\n\"\n    \"qxF qu 1\\n\"\n    \"dcI ch 1\\n\"\n    \"dhY th 1\\n\"\n    \"cvP ch 1\\n\"\n    \"qUy qu 1\\n\"\n    \"mxC me 1\\n\"\n    \"zPx sz 1\\n\"\n    \"Nql qu 1\\n\"\n    \"Yfw wa 1\\n\"\n    \"Wgp ng 1\\n\"\n    \"jgD ng 1\\n\"\n    \"Qfq qu 1\\n\"\n    \"lcW ch 1\\n\"\n    \"qxy qu 1\\n\"\n    \"Xpq qu 1\\n\"\n    \"wrD er 1\\n\"\n    \"bEo on 1\\n\"\n    \"bzV sz 1\\n\"\n    \"fwS wa 1\\n\"\n    \"mLj ij 1\\n\"\n    \"wMr er 1\\n\"\n    \"vFb va 1\\n\"\n    \"zfT sz 1\\n\"\n    \"nRk an 1\\n\"\n    \"kJh th 1\\n\"\n    \"Rmw me 1\\n\"\n    \"nqR an 1\\n\"\n    \"qpO qu 1\\n\"\n    \"bHb be 1\\n\"\n    \"Tkq qu 1\\n\"\n    \"sjG st 1\\n\"\n    \"qaT an 1\\n\"\n    \"Pql qu 1\\n\"\n    \"hlQ th 1\\n\"\n    \"kzW sz 1\\n\"\n    \"yFc ch 1\\n\"\n    \"uBv qu 1\\n\"\n    \"vxO va 1\\n\"\n    \"qvC qu 1\\n\"\n    \"Yqx qu 1\\n\"\n    \"jCb ij 1\\n\"\n    \"Qjk ij 1\\n\"\n    \"fBh th 1\\n\"\n    \"vKq qu 1\\n\"\n    \"rMg ng 1\\n\"\n    \"hRw th 1\\n\"\n    \"ykU ka 1\\n\"\n    \"bUq qu 1\\n\"\n    \"vYv va 1\\n\"\n    \"Pdx de 1\\n\"\n    \"oGv on 
1\\n\"\n    \"jLy ij 1\\n\"\n    \"duY qu 1\\n\"\n    \"Wcp ch 1\\n\"\n    \"oGx on 1\\n\"\n    \"vGl le 1\\n\"\n    \"Jdz sz 1\\n\"\n    \"ijH in 1\\n\"\n    \"mlX le 1\\n\"\n    \"jNr er 1\\n\"\n    \"kCq qu 1\\n\"\n    \"Ghh th 1\\n\"\n    \"rMv er 1\\n\"\n    \"Bgp ng 1\\n\"\n    \"bFt th 1\\n\"\n    \"uWl qu 1\\n\"\n    \"dXg ng 1\\n\"\n    \"Wcf ch 1\\n\"\n    \"dbI de 1\\n\"\n    \"bGx be 1\\n\"\n    \"exQ er 1\\n\"\n    \"jWj jo 1\\n\"\n    \"pQb pr 1\\n\"\n    \"jcH ch 1\\n\"\n    \"qOl qu 1\\n\"\n    \"mtL th 1\\n\"\n    \"crC ch 1\\n\"\n    \"pBh th 1\\n\"\n    \"Wlz le 1\\n\"\n    \"nHn an 1\\n\"\n    \"Hfp pr 1\\n\"\n    \"Xpc ch 1\\n\"\n    \"Uxp pr 1\\n\"\n    \"Ksq qu 1\\n\"\n    \"xWk ka 1\\n\"\n    \"nqZ an 1\\n\"\n    \"Cxd de 1\\n\"\n    \"zJx sz 1\\n\"\n    \"rWq qu 1\\n\"\n    \"Cbq qu 1\\n\"\n    \"qqP qu 1\\n\"\n    \"lhU th 1\\n\"\n    \"Ufv va 1\\n\"\n    \"Uxg ng 1\\n\"\n    \"hJf th 1\\n\"\n    \"nvQ an 1\\n\"\n    \"dhF th 1\\n\"\n    \"Cvb va 1\\n\"\n    \"aPf an 1\\n\"\n    \"Jxj ij 1\\n\"\n    \"Dwp pr 1\\n\"\n    \"Ixw wa 1\\n\"\n    \"kfS ka 1\\n\"\n    \"rZm er 1\\n\"\n    \"fmE me 1\\n\"\n    \"sLq qu 1\\n\"\n    \"bmR me 1\\n\"\n    \"uCs qu 1\\n\"\n    \"kFm ka 1\\n\"\n    \"Kqk qu 1\\n\"\n    \"xQk ka 1\\n\"\n    \"Sfn an 1\\n\"\n    \"fgU ng 1\\n\"\n    \"vvT va 1\\n\"\n    \"mQe er 1\\n\"\n    \"Gbt th 1\\n\"\n    \"tbY th 1\\n\"\n    \"lQk le 1\\n\"\n    \"cIh th 1\\n\"\n    \"Tjq qu 1\\n\"\n    \"nQg an 1\\n\"\n    \"yYp pr 1\\n\"\n    \"qPw qu 1\\n\"\n    \"xOa an 1\\n\"\n    \"pNw pr 1\\n\"\n    \"fJz sz 1\\n\"\n    \"zHb sz 1\\n\"\n    \"kBh th 1\\n\"\n    \"fdE de 1\\n\"\n    \"wPg ng 1\\n\"\n    \"lVv le 1\\n\"\n    \"mPw me 1\\n\"\n    \"Rmg ng 1\\n\"\n    \"xoE on 1\\n\"\n    \"hnJ th 1\\n\"\n    \"uvE qu 1\\n\"\n    \"Woq qu 1\\n\"\n    \"ucX ch 1\\n\"\n    \"nmD an 1\\n\"\n    \"pcX ch 1\\n\"\n    \"hDw th 1\\n\"\n    \"dgI ng 1\\n\"\n    \"vVd de 1\\n\"\n    \"tDh ch 1\\n\"\n    \"jHn an 1\\n\"\n    \"hkX th 
1\\n\"\n    \"pxT pr 1\\n\"\n    \"xYz sz 1\\n\"\n    \"rTp er 1\\n\"\n    \"Ubz sz 1\\n\"\n    \"Llm le 1\\n\"\n    \"yjZ ij 1\\n\"\n    \"Qss st 1\\n\"\n    \"cfM ch 1\\n\"\n    \"jbG be 1\\n\"\n    \"Jfz sz 1\\n\"\n    \"mWb me 1\\n\"\n    \"jDp ij 1\\n\"\n    \"lWz le 1\\n\"\n    \"cXy ch 1\\n\"\n    \"oQr er 1\\n\"\n    \"ucZ ch 1\\n\"\n    \"cvN ch 1\\n\"\n    \"cvK ch 1\\n\"\n    \"zDk sz 1\\n\"\n    \"bLr er 1\\n\"\n    \"dDl le 1\\n\"\n    \"hhD th 1\\n\"\n    \"vmK va 1\\n\"\n    \"hLt th 1\\n\"\n    \"mqW qu 1\\n\"\n    \"Bfs st 1\\n\"\n    \"Acj ch 1\\n\"\n    \"dcG ch 1\\n\"\n    \"yJc ch 1\\n\"\n    \"mfS me 1\\n\"\n    \"drL er 1\\n\"\n    \"qyK qu 1\\n\"\n    \"tQz th 1\\n\"\n    \"jrL er 1\\n\"\n    \"ccJ ch 1\\n\"\n    \"wpX pr 1\\n\"\n    \"Zzf sz 1\\n\"\n    \"snU an 1\\n\"\n    \"qEw qu 1\\n\"\n    \"tQb th 1\\n\"\n    \"mPd de 1\\n\"\n    \"vJq qu 1\\n\"\n    \"vpU va 1\\n\"\n    \"vzM sz 1\\n\"\n    \"uZb qu 1\\n\"\n    \"ywU wa 1\\n\"\n    \"Rjs st 1\\n\"\n    \"hKt th 1\\n\"\n    \"Bfb be 1\\n\"\n    \"wuQ qu 1\\n\"\n    \"bvM va 1\\n\"\n    \"yiW in 1\\n\"\n    \"hqC th 1\\n\"\n    \"iUq qu 1\\n\"\n    \"lBd le 1\\n\"\n    \"Zxj ij 1\\n\"\n    \"wpW pr 1\\n\"\n    \"rHm er 1\\n\"\n    \"mhQ th 1\\n\"\n    \"fMb be 1\\n\"\n    \"vWf va 1\\n\"\n    \"Fdq qu 1\\n\"\n    \"jGb ij 1\\n\"\n    \"Dhw th 1\\n\"\n    \"cjR ch 1\\n\"\n    \"kvD ka 1\\n\"\n    \"qvD qu 1\\n\"\n    \"Xmk ka 1\\n\"\n    \"Cjj ij 1\\n\"\n    \"kkX ka 1\\n\"\n    \"qkF qu 1\\n\"\n    \"vWg ng 1\\n\"\n    \"Msq qu 1\\n\"\n    \"nNv an 1\\n\"\n    \"Hzu qu 1\\n\"\n    \"zrY er 1\\n\"\n    \"hgB th 1\\n\"\n    \"pwB pr 1\\n\"\n    \"Jxc ch 1\\n\"\n    \"vcJ ch 1\\n\"\n    \"sYw st 1\\n\"\n    \"Tqx qu 1\\n\"\n    \"eJf le 1\\n\"\n    \"czJ ch 1\\n\"\n    \"Qyh th 1\\n\"\n    \"bvV va 1\\n\"\n    \"Xyh th 1\\n\"\n    \"fjq qu 1\\n\"\n    \"dYc ch 1\\n\"\n    \"pBx pr 1\\n\"\n    \"jvR ij 1\\n\"\n    \"gbH ng 1\\n\"\n    \"ygH ng 1\\n\"\n    \"hbV th 1\\n\"\n    \"lwU le 
1\\n\"\n    \"tJk th 1\\n\"\n    \"pIw pr 1\\n\"\n    \"Vjl le 1\\n\"\n    \"Dgm ng 1\\n\"\n    \"nvR an 1\\n\"\n    \"yRp pr 1\\n\"\n    \"fOj ij 1\\n\"\n    \"Ecf ch 1\\n\"\n    \"Zrf er 1\\n\"\n    \"mxD me 1\\n\"\n    \"Iqf qu 1\\n\"\n    \"zBj sz 1\\n\"\n    \"tTs th 1\\n\"\n    \"lqB qu 1\\n\"\n    \"kCv ka 1\\n\"\n    \"nVh th 1\\n\"\n    \"jGq qu 1\\n\"\n    \"cgQ ch 1\\n\"\n    \"Ppd de 1\\n\"\n    \"Jcd ch 1\\n\"\n    \"hhP th 1\\n\"\n    \"sLg ng 1\\n\"\n    \"xYt th 1\\n\"\n    \"Qps st 1\\n\"\n    \"sfE st 1\\n\"\n    \"wxR wa 1\\n\"\n    \"pFp pr 1\\n\"\n    \"Ymf me 1\\n\"\n    \"Jgy ng 1\\n\"\n    \"yvI va 1\\n\"\n    \"Ncz ch 1\\n\"\n    \"wBf wa 1\\n\"\n    \"rVx er 1\\n\"\n    \"jvX ij 1\\n\"\n    \"nYp an 1\\n\"\n    \"nNb an 1\\n\"\n    \"cQi ch 1\\n\"\n    \"Qwy wa 1\\n\"\n    \"vPf va 1\\n\"\n    \"qvd qu 1\\n\"\n    \"hkD th 1\\n\"\n    \"Wmr er 1\\n\"\n    \"gdY ng 1\\n\"\n    \"Kjj ij 1\\n\"\n    \"qsN qu 1\\n\"\n    \"vJg ng 1\\n\"\n    \"mDc ch 1\\n\"\n    \"kvF ka 1\\n\"\n    \"kWx ka 1\\n\"\n    \"xYu qu 1\\n\"\n    \"eMq qu 1\\n\"\n    \"mYy me 1\\n\"\n    \"Hxt th 1\\n\"\n    \"pbM pr 1\\n\"\n    \"Hwd de 1\\n\"\n    \"mWu qu 1\\n\"\n    \"zNs st 1\\n\"\n    \"Qjh th 1\\n\"\n    \"aqD an 1\\n\"\n    \"Gcd ch 1\\n\"\n    \"btX th 1\\n\"\n    \"Zql qu 1\\n\"\n    \"Ujw ij 1\\n\"\n    \"yvM va 1\\n\"\n    \"Hhw th 1\\n\"\n    \"zWd sz 1\\n\"\n    \"pYj ij 1\\n\"\n    \"xWt th 1\\n\"\n    \"ylO le 1\\n\"\n    \"cnX ch 1\\n\"\n    \"cMf ch 1\\n\"\n    \"pKb pr 1\\n\"\n    \"woV on 1\\n\"\n    \"fzG sz 1\\n\"\n    \"Lqb qu 1\\n\"\n    \"eOj er 1\\n\"\n    \"Gtb th 1\\n\"\n    \"clX ch 1\\n\"\n    \"kdC de 1\\n\"\n    \"cfq ch 1\\n\"\n    \"hKk th 1\\n\"\n    \"cJi ch 1\\n\"\n    \"uSb qu 1\\n\"\n    \"jgT ng 1\\n\"\n    \"tcG th 1\\n\"\n    \"qNv qu 1\\n\"\n    \"fpB pr 1\\n\"\n    \"vPw va 1\\n\"\n    \"jmA ij 1\\n\"\n    \"dxI de 1\\n\"\n    \"jGg ng 1\\n\"\n    \"Bvg ng 1\\n\"\n    \"qrC qu 1\\n\"\n    \"nPx an 1\\n\"\n    \"Qmn an 
1\\n\"\n    \"cqC ch 1\\n\"\n    \"kFh th 1\\n\"\n    \"Jtf th 1\\n\"\n    \"Cqz qu 1\\n\"\n    \"rCd er 1\\n\"\n    \"Zms st 1\\n\"\n    \"dVq qu 1\\n\"\n    \"Gwg ng 1\\n\"\n    \"cwP ch 1\\n\"\n    \"wVu qu 1\\n\"\n    \"dNg ng 1\\n\"\n    \"jXc ch 1\\n\"\n    \"Mbz sz 1\\n\"\n    \"wvG ve 1\\n\"\n    \"Vpw pr 1\\n\"\n    \"yXq qu 1\\n\"\n    \"hlK th 1\\n\"\n    \"pYv va 1\\n\"\n    \"Fbd de 1\\n\"\n    \"zcV ch 1\\n\"\n    \"rQk er 1\\n\"\n    \"wtN th 1\\n\"\n    \"qeI qu 1\\n\"\n    \"eGt th 1\\n\"\n    \"kMq qu 1\\n\"\n    \"kqS qu 1\\n\"\n    \"cqd ch 1\\n\"\n    \"pLf po 1\\n\"\n    \"xvO va 1\\n\"\n    \"rfH er 1\\n\"\n    \"gIq qu 1\\n\"\n    \"Pqk qu 1\\n\"\n    \"xCn an 1\\n\"\n    \"dVs st 1\\n\"\n    \"iqY qu 1\\n\"\n    \"bsJ st 1\\n\"\n    \"Vww wa 1\\n\"\n    \"Znm an 1\\n\"\n    \"Yrz er 1\\n\"\n    \"Rvz sz 1\\n\"\n    \"dzK de 1\\n\"\n    \"zbW sz 1\\n\"\n    \"tkx th 1\\n\"\n    \"xkP ka 1\\n\"\n    \"kzS sz 1\\n\"\n    \"gXq qu 1\\n\"\n    \"Lxf fo 1\\n\"\n    \"Fwr er 1\\n\"\n    \"lHs le 1\\n\"\n    \"zrB er 1\\n\"\n    \"jNb ij 1\\n\"\n    \"Hxy ny 1\\n\"\n    \"Gfw wa 1\\n\"\n    \"Egw ng 1\\n\"\n    \"Jxw wa 1\\n\"\n    \"tVm th 1\\n\"\n    \"bwQ wa 1\\n\"\n    \"gIx ng 1\\n\"\n    \"Wqu un 1\\n\"\n    \"jvI ij 1\\n\"\n    \"cGc ch 1\\n\"\n    \"kSb ka 1\\n\"\n    \"hxG th 1\\n\"\n    \"zHm sz 1\\n\"\n    \"Jpk ka 1\\n\"\n    \"fVb be 1\\n\"\n    \"Ukf ka 1\\n\"\n    \"rxF er 1\\n\"\n    \"dVu qu 1\\n\"\n    \"sdX st 1\\n\"\n    \"mjM ij 1\\n\"\n    \"xwq qu 1\\n\"\n    \"Ogk ng 1\\n\"\n    \"qhr th 1\\n\"\n    \"vfA va 1\\n\"\n    \"qbA qu 1\\n\"\n    \"Lfu qu 1\\n\"\n    \"hzY th 1\\n\"\n    \"iHf in 1\\n\"\n    \"jxb ij 1\\n\"\n    \"vmP va 1\\n\"\n    \"bvI va 1\\n\"\n    \"fmH me 1\\n\"\n    \"qtx th 1\\n\"\n    \"bvQ va 1\\n\"\n    \"qzX qu 1\\n\"\n    \"bVn an 1\\n\"\n    \"Xmt th 1\\n\"\n    \"qXo qu 1\\n\"\n    \"pfD pr 1\\n\"\n    \"fCd de 1\\n\"\n    \"vbx va 1\\n\"\n    \"Zhz th 1\\n\"\n    \"Kwg ng 1\\n\"\n    \"rcJ ch 
1\\n\"\n    \"jlT le 1\\n\"\n    \"jzM sz 1\\n\"\n    \"rpP er 1\\n\"\n    \"tmA th 1\\n\"\n    \"aYw an 1\\n\"\n    \"zBq qu 1\\n\"\n    \"xhT th 1\\n\"\n    \"yLq qu 1\\n\"\n    \"cKf ch 1\\n\"\n    \"qdP qu 1\\n\"\n    \"Ybx be 1\\n\"\n    \"dHs st 1\\n\"\n    \"jhH th 1\\n\"\n    \"Bsv st 1\\n\"\n    \"rZt th 1\\n\"\n    \"mhJ th 1\\n\"\n    \"Zwq qu 1\\n\"\n    \"kXf ka 1\\n\"\n    \"zvT sz 1\\n\"\n    \"yiC in 1\\n\"\n    \"gkT ng 1\\n\"\n    \"nJw an 1\\n\"\n    \"zpV sz 1\\n\"\n    \"tPq th 1\\n\"\n    \"cVt th 1\\n\"\n    \"dBg ng 1\\n\"\n    \"cRf ch 1\\n\"\n    \"vRq qu 1\\n\"\n    \"jgA ng 1\\n\"\n    \"bMz sz 1\\n\"\n    \"hJh th 1\\n\"\n    \"mHd de 1\\n\"\n    \"Ckq qu 1\\n\"\n    \"qcj ch 1\\n\"\n    \"yIb be 1\\n\"\n    \"wqE qu 1\\n\"\n    \"pMh th 1\\n\"\n    \"Hqj qu 1\\n\"\n    \"jZu qu 1\\n\"\n    \"iqO qu 1\\n\"\n    \"tqC th 1\\n\"\n    \"qoK qu 1\\n\"\n    \"Knq an 1\\n\"\n    \"bQm me 1\\n\"\n    \"uuX qu 1\\n\"\n    \"Wzc ch 1\\n\"\n    \"Pxy ny 1\\n\"\n    \"Qgf ng 1\\n\"\n    \"sFw st 1\\n\"\n    \"gHf ng 1\\n\"\n    \"kgN ng 1\\n\"\n    \"rCw er 1\\n\"\n    \"Yjy ij 1\\n\"\n    \"pnV an 1\\n\"\n    \"fbS be 1\\n\"\n    \"iHz in 1\\n\"\n    \"kGx ka 1\\n\"\n    \"kwS ka 1\\n\"\n    \"sDm st 1\\n\"\n    \"Vhk th 1\\n\"\n    \"phN th 1\\n\"\n    \"Jbf be 1\\n\"\n    \"pWz sz 1\\n\"\n    \"vvQ va 1\\n\"\n    \"vNm va 1\\n\"\n    \"lYw le 1\\n\"\n    \"zHx sz 1\\n\"\n    \"Zzc ch 1\\n\"\n    \"bDt th 1\\n\"\n    \"Fcv ch 1\\n\"\n    \"dJg ng 1\\n\"\n    \"Qwb wa 1\\n\"\n    \"qFw qu 1\\n\"\n    \"wmO me 1\\n\"\n    \"Bvy va 1\\n\"\n    \"qgY qu 1\\n\"\n    \"vYs st 1\\n\"\n    \"xwF wa 1\\n\"\n    \"qwP qu 1\\n\"\n    \"uEc ch 1\\n\"\n    \"mWq qu 1\\n\"\n    \"fzO sz 1\\n\"\n    \"bPg ng 1\\n\"\n    \"pnW an 1\\n\"\n    \"hGx th 1\\n\"\n    \"Vkk ka 1\\n\"\n    \"Xrx er 1\\n\"\n    \"gJd ng 1\\n\"\n    \"Llq qu 1\\n\"\n    \"Vqu un 1\\n\"\n    \"fgH ng 1\\n\"\n    \"Vcy ch 1\\n\"\n    \"hVc th 1\\n\"\n    \"rwZ er 1\\n\"\n    \"Xlc ch 
1\\n\"\n    \"xJd de 1\\n\"\n    \"Fnn an 1\\n\"\n    \"Ypj ij 1\\n\"\n    \"lhJ th 1\\n\"\n    \"aUj an 1\\n\"\n    \"lBp pr 1\\n\"\n    \"dlW le 1\\n\"\n    \"pvV va 1\\n\"\n    \"Mwr er 1\\n\"\n    \"Zwc ch 1\\n\"\n    \"wcU ch 1\\n\"\n    \"cVq ch 1\\n\"\n    \"ycU ch 1\\n\"\n    \"Lcq ch 1\\n\"\n    \"rvQ er 1\\n\"\n    \"eYm er 1\\n\"\n    \"qCn an 1\\n\"\n    \"dBx de 1\\n\"\n    \"Iwq qu 1\\n\"\n    \"gMt th 1\\n\"\n    \"bhC th 1\\n\"\n    \"bDs st 1\\n\"\n    \"Vhz th 1\\n\"\n    \"kJz sz 1\\n\"\n    \"Ohz th 1\\n\"\n    \"kDz sz 1\\n\"\n    \"hTn th 1\\n\"\n    \"eqG qu 1\\n\"\n    \"gJr ng 1\\n\"\n    \"Zpz sz 1\\n\"\n    \"hwQ th 1\\n\"\n    \"fgY ng 1\\n\"\n    \"sdV st 1\\n\"\n    \"ljV le 1\\n\"\n    \"yGg ng 1\\n\"\n    \"uWg qu 1\\n\"\n    \"sbO st 1\\n\"\n    \"qdD qu 1\\n\"\n    \"yJj ij 1\\n\"\n    \"nwq an 1\\n\"\n    \"Apq qu 1\\n\"\n    \"ccK ch 1\\n\"\n    \"Qwl le 1\\n\"\n    \"oyQ on 1\\n\"\n    \"lPw le 1\\n\"\n    \"cYt th 1\\n\"\n    \"brG er 1\\n\"\n    \"xkT ka 1\\n\"\n    \"dUj de 1\\n\"\n    \"rhR th 1\\n\"\n    \"xPw wa 1\\n\"\n    \"xoF on 1\\n\"\n    \"hYj th 1\\n\"\n    \"hYw th 1\\n\"\n    \"lPn an 1\\n\"\n    \"zCg ng 1\\n\"\n    \"sJt th 1\\n\"\n    \"wDs st 1\\n\"\n    \"fVh th 1\\n\"\n    \"zwW sz 1\\n\"\n    \"yLj ij 1\\n\"\n    \"aBx an 1\\n\"\n    \"Dvv va 1\\n\"\n    \"tKb th 1\\n\"\n    \"jfG ij 1\\n\"\n    \"xMm me 1\\n\"\n    \"bLp pr 1\\n\"\n    \"xwW wa 1\\n\"\n    \"bzH sz 1\\n\"\n    \"cIw ch 1\\n\"\n    \"zdN sz 1\\n\"\n    \"Ggv va 1\\n\"\n    \"lwV le 1\\n\"\n    \"qyV qu 1\\n\"\n    \"vBv va 1\\n\"\n    \"Owm me 1\\n\"\n    \"Ltx th 1\\n\"\n    \"mqE qu 1\\n\"\n    \"Xjc ch 1\\n\"\n    \"pzY sz 1\\n\"\n    \"Jds st 1\\n\"\n    \"kMl le 1\\n\"\n    \"Ddj de 1\\n\"\n    \"tfX th 1\\n\"\n    \"cqT ch 1\\n\"\n    \"buG qu 1\\n\"\n    \"oHb po 1\\n\"\n    \"vRx va 1\\n\"\n    \"qyq qu 1\\n\"\n    \"kpY ka 1\\n\"\n    \"vqN qu 1\\n\"\n    \"jNq qu 1\\n\"\n    \"cWb ch 1\\n\"\n    \"gbJ ng 1\\n\"\n    \"oZw on 
1\\n\"\n    \"cBz ch 1\\n\"\n    \"Pvv va 1\\n\"\n    \"ljI le 1\\n\"\n    \"hvQ th 1\\n\"\n    \"kwY ka 1\\n\"\n    \"hBg th 1\\n\"\n    \"kdN de 1\\n\"\n    \"yxH ny 1\\n\"\n    \"fxH fo 1\\n\"\n    \"tXj th 1\\n\"\n    \"uBx qu 1\\n\"\n    \"uJm qu 1\\n\"\n    \"Gxh th 1\\n\"\n    \"fjK ij 1\\n\"\n    \"gqO qu 1\\n\"\n    \"dMt th 1\\n\"\n    \"lVx le 1\\n\"\n    \"Rhp th 1\\n\"\n    \"cDn ch 1\\n\"\n    \"Xkv ka 1\\n\"\n    \"zmB sz 1\\n\"\n    \"qaY an 1\\n\"\n    \"Ivq qu 1\\n\"\n    \"wmP me 1\\n\"\n    \"bjq qu 1\\n\"\n    \"cmU ch 1\\n\"\n    \"slC le 1\\n\"\n    \"Krx er 1\\n\"\n    \"iVv in 1\\n\"\n    \"Zwz sz 1\\n\"\n    \"yPd de 1\\n\"\n    \"qUv qu 1\\n\"\n    \"Pdz sz 1\\n\"\n    \"Qzk sz 1\\n\"\n    \"zoU on 1\\n\"\n    \"xJf fo 1\\n\"\n    \"Udq qu 1\\n\"\n    \"Qwj ij 1\\n\"\n    \"Kvd de 1\\n\"\n    \"vQw va 1\\n\"\n    \"Rdk de 1\\n\"\n    \"sIj st 1\\n\"\n    \"Ggt th 1\\n\"\n    \"lNw le 1\\n\"\n    \"qvr qu 1\\n\"\n    \"yqD qu 1\\n\"\n    \"fXl le 1\\n\"\n    \"jqg qu 1\\n\"\n    \"qmA qu 1\\n\"\n    \"Tgd ng 1\\n\"\n    \"zpO po 1\\n\"\n    \"tEz th 1\\n\"\n    \"Bqz qu 1\\n\"\n    \"wfL wa 1\\n\"\n    \"vYu qu 1\\n\"\n    \"Dxw wa 1\\n\"\n    \"qWl qu 1\\n\"\n    \"Rzc ch 1\\n\"\n    \"mQo on 1\\n\"\n    \"Ttc th 1\\n\"\n    \"tVv th 1\\n\"\n    \"Rqn an 1\\n\"\n    \"Wcn ch 1\\n\"\n    \"Nwu qu 1\\n\"\n    \"xoJ on 1\\n\"\n    \"vDf va 1\\n\"\n    \"phH th 1\\n\"\n    \"fJs st 1\\n\"\n    \"Pxm me 1\\n\"\n    \"rFb er 1\\n\"\n    \"hlM th 1\\n\"\n    \"mkX ka 1\\n\"\n    \"nnQ an 1\\n\"\n    \"Xfn an 1\\n\"\n    \"sbZ st 1\\n\"\n    \"Yyf ny 1\\n\"\n    \"Bjw ij 1\\n\"\n    \"Ilx le 1\\n\"\n    \"qpA qu 1\\n\"\n    \"Mqc ch 1\\n\"\n    \"gqZ qu 1\\n\"\n    \"sNv st 1\\n\"\n    \"Zvq qu 1\\n\"\n    \"kSx ka 1\\n\"\n    \"vBd de 1\\n\"\n    \"wvZ va 1\\n\"\n    \"Uoe er 1\\n\"\n    \"Fjy ij 1\\n\"\n    \"zKb sz 1\\n\"\n    \"pvI va 1\\n\"\n    \"Zll le 1\\n\"\n    \"hdE th 1\\n\"\n    \"Fpv va 1\\n\"\n    \"lhV th 1\\n\"\n    \"rqQ qu 
1\\n\"\n    \"wjG ij 1\\n\"\n    \"pLq qu 1\\n\"\n    \"bpJ pr 1\\n\"\n    \"wzV sz 1\\n\"\n    \"Hgq ng 1\\n\"\n    \"zhW th 1\\n\"\n    \"Lvq qu 1\\n\"\n    \"Xhr th 1\\n\"\n    \"quY un 1\\n\"\n    \"jqZ qu 1\\n\"\n    \"vuH qu 1\\n\"\n    \"Fzj sz 1\\n\"\n    \"gzG ng 1\\n\"\n    \"tFc th 1\\n\"\n    \"vfE va 1\\n\"\n    \"Igx ng 1\\n\"\n    \"fqY qu 1\\n\"\n    \"gYb ng 1\\n\"\n    \"lJg ng 1\\n\"\n    \"wcO ch 1\\n\"\n    \"Qvk ka 1\\n\"\n    \"Tqq qu 1\\n\"\n    \"bdY de 1\\n\"\n    \"wuT qu 1\\n\"\n    \"lHw le 1\\n\"\n    \"zRm sz 1\\n\"\n    \"Hgw ng 1\\n\"\n    \"tPk th 1\\n\"\n    \"Jqv qu 1\\n\"\n    \"tKx th 1\\n\"\n    \"xpA pr 1\\n\"\n    \"bkI ka 1\\n\"\n    \"bSj ij 1\\n\"\n    \"mxW me 1\\n\"\n    \"mjR ij 1\\n\"\n    \"Oip in 1\\n\"\n    \"wyY wa 1\\n\"\n    \"dFc ch 1\\n\"\n    \"qDg qu 1\\n\"\n    \"wXp pr 1\\n\"\n    \"Vbp pr 1\\n\"\n    \"jyN ij 1\\n\"\n    \"yvP va 1\\n\"\n    \"yVr er 1\\n\"\n    \"aWm an 1\\n\"\n    \"Gjk ij 1\\n\"\n    \"Apw pr 1\\n\"\n    \"Zsw st 1\\n\"\n    \"jQv ij 1\\n\"\n    \"jbT ij 1\\n\"\n    \"bdB de 1\\n\"\n    \"kcY ch 1\\n\"\n    \"rqC qu 1\\n\"\n    \"bxD be 1\\n\"\n    \"vlx le 1\\n\"\n    \"kjJ ij 1\\n\"\n    \"xqW qu 1\\n\"\n    \"zxE sz 1\\n\"\n    \"sHf st 1\\n\"\n    \"juF qu 1\\n\"\n    \"kwX ka 1\\n\"\n    \"oqW qu 1\\n\"\n    \"qWt th 1\\n\"\n    \"fHc ch 1\\n\"\n    \"cHc ch 1\\n\"\n    \"Jjm ij 1\\n\"\n    \"xbA be 1\\n\"\n    \"Rqj qu 1\\n\"\n    \"Ijy ij 1\\n\"\n    \"vSx va 1\\n\"\n    \"pVj ij 1\\n\"\n    \"rQx er 1\\n\"\n    \"fmK me 1\\n\"\n    \"fnA an 1\\n\"\n    \"Phv th 1\\n\"\n    \"bhN th 1\\n\"\n    \"Hxp pr 1\\n\"\n    \"Vjq qu 1\\n\"\n    \"lqC qu 1\\n\"\n    \"Whd th 1\\n\"\n    \"zsF st 1\\n\"\n    \"tYt th 1\\n\"\n    \"Jzq qu 1\\n\"\n    \"Nff fo 1\\n\"\n    \"qXs qu 1\\n\"\n    \"xJj ij 1\\n\"\n    \"lXn an 1\\n\"\n    \"Zpv va 1\\n\"\n    \"qTh th 1\\n\"\n    \"npH an 1\\n\"\n    \"kYx ka 1\\n\"\n    \"bBs st 1\\n\"\n    \"vEa an 1\\n\"\n    \"pjq qu 1\\n\"\n    \"qIi qu 
1\\n\"\n    \"Fdk de 1\\n\"\n    \"fNx fo 1\\n\"\n    \"Ofh th 1\\n\"\n    \"wXe er 1\\n\"\n    \"mvZ va 1\\n\"\n    \"Cjs st 1\\n\"\n    \"Fmm me 1\\n\"\n    \"pkR ka 1\\n\"\n    \"zfZ sz 1\\n\"\n    \"Zpm me 1\\n\"\n    \"cbA ch 1\\n\"\n    \"tvY th 1\\n\"\n    \"Lmp me 1\\n\"\n    \"gFd ng 1\\n\"\n    \"bFx be 1\\n\"\n    \"Fjm ij 1\\n\"\n    \"wjF ij 1\\n\"\n    \"bjv ij 1\\n\"\n    \"dbT de 1\\n\"\n    \"jmQ ij 1\\n\"\n    \"xFw wa 1\\n\"\n    \"cDk ch 1\\n\"\n    \"hFz th 1\\n\"\n    \"uGm qu 1\\n\"\n    \"Yhx th 1\\n\"\n    \"Vtl th 1\\n\"\n    \"azV an 1\\n\"\n    \"xJs st 1\\n\"\n    \"Mxw wa 1\\n\"\n    \"vgK ng 1\\n\"\n    \"cwQ ch 1\\n\"\n    \"Gnx an 1\\n\"\n    \"lbP le 1\\n\"\n    \"kdS de 1\\n\"\n    \"kDt th 1\\n\"\n    \"Pvq qu 1\\n\"\n    \"yHs st 1\\n\"\n    \"Lgq qu 1\\n\"\n    \"Xmj ij 1\\n\"\n    \"pvA va 1\\n\"\n    \"vUu qu 1\\n\"\n    \"Qju qu 1\\n\"\n    \"qDf qu 1\\n\"\n    \"Gxj ij 1\\n\"\n    \"Gfz sz 1\\n\"\n    \"gbY ng 1\\n\"\n    \"Sjf ij 1\\n\"\n    \"Ogw ng 1\\n\"\n    \"hGt th 1\\n\"\n    \"btT th 1\\n\"\n    \"gwH ng 1\\n\"\n    \"Mwj ij 1\\n\"\n    \"fvU va 1\\n\"\n    \"frG er 1\\n\"\n    \"cMx ch 1\\n\"\n    \"Ydv de 1\\n\"\n    \"xkZ ka 1\\n\"\n    \"fjL ij 1\\n\"\n    \"yPx ny 1\\n\"\n    \"drX er 1\\n\"\n    \"jxR ij 1\\n\"\n    \"hYq th 1\\n\"\n    \"xHn an 1\\n\"\n    \"jrP er 1\\n\"\n    \"tcJ th 1\\n\"\n    \"qJz qu 1\\n\"\n    \"zUd sz 1\\n\"\n    \"jXj ij 1\\n\"\n    \"qDd qu 1\\n\"\n    \"Bjh th 1\\n\"\n    \"qFz sz 1\\n\"\n    \"mxG me 1\\n\"\n    \"xOd de 1\\n\"\n    \"hgL th 1\\n\"\n    \"cpD ch 1\\n\"\n    \"jhS th 1\\n\"\n    \"Zqp qu 1\\n\"\n    \"yNq qu 1\\n\"\n    \"pHq qu 1\\n\"\n    \"rZq qu 1\\n\"\n    \"Wjy ij 1\\n\"\n    \"Tfb be 1\\n\"\n    \"Nwb wa 1\\n\"\n    \"zQk sz 1\\n\"\n    \"Rkc ch 1\\n\"\n    \"Qvw va 1\\n\"\n    \"wlJ le 1\\n\"\n    \"cFp ch 1\\n\"\n    \"oDb on 1\\n\"\n    \"lsY le 1\\n\"\n    \"Zbn an 1\\n\"\n    \"wCd de 1\\n\"\n    \"zxN sz 1\\n\"\n    \"bQf be 1\\n\"\n    \"Kjy ij 
1\\n\"\n    \"Ovk ka 1\\n\"\n    \"cxA ch 1\\n\"\n    \"Hqw qu 1\\n\"\n    \"hwY th 1\\n\"\n    \"sGv st 1\\n\"\n    \"Rwn an 1\\n\"\n    \"zvH sz 1\\n\"\n    \"yVw wa 1\\n\"\n    \"zmX sz 1\\n\"\n    \"qdM qu 1\\n\"\n    \"dJv de 1\\n\"\n    \"wDj ij 1\\n\"\n    \"Vhm th 1\\n\"\n    \"fLt th 1\\n\"\n    \"bvC va 1\\n\"\n    \"xVn an 1\\n\"\n    \"Hfx fo 1\\n\"\n    \"tQl th 1\\n\"\n    \"lhW th 1\\n\"\n    \"oqS qu 1\\n\"\n    \"Qya an 1\\n\"\n    \"gZf ng 1\\n\"\n    \"bKy be 1\\n\"\n    \"tjX th 1\\n\"\n    \"Vkc ch 1\\n\"\n    \"yjv ij 1\\n\"\n    \"bgN ng 1\\n\"\n    \"lNm le 1\\n\"\n    \"Jzl le 1\\n\"\n    \"Lwx wa 1\\n\"\n    \"vcL ch 1\\n\"\n    \"yXh th 1\\n\"\n    \"ztZ th 1\\n\"\n    \"yJx ny 1\\n\"\n    \"npV an 1\\n\"\n    \"swG st 1\\n\"\n    \"sXn an 1\\n\"\n    \"eJb er 1\\n\"\n    \"dcR ch 1\\n\"\n    \"Zrg ng 1\\n\"\n    \"Pgv ng 1\\n\"\n    \"xYr er 1\\n\"\n    \"jlI le 1\\n\"\n    \"Fmf me 1\\n\"\n    \"Gqk qu 1\\n\"\n    \"vlZ le 1\\n\"\n    \"Csq qu 1\\n\"\n    \"uQj qu 1\\n\"\n    \"lLm le 1\\n\"\n    \"hwK th 1\\n\"\n    \"cQv ch 1\\n\"\n    \"qfH qu 1\\n\"\n    \"rRw er 1\\n\"\n    \"aUo an 1\\n\"\n    \"qpE qu 1\\n\"\n    \"lPc ch 1\\n\"\n    \"dHd de 1\\n\"\n    \"gqL qu 1\\n\"\n    \"zWp sz 1\\n\"\n    \"bBq be 1\\n\"\n    \"wWp pr 1\\n\"\n    \"cfK ch 1\\n\"\n    \"fWx fo 1\\n\"\n    \"rvV er 1\\n\"\n    \"zhR th 1\\n\"\n    \"Klh th 1\\n\"\n    \"cbQ ch 1\\n\"\n    \"Jmg ng 1\\n\"\n    \"fPg ng 1\\n\"\n    \"Qnn an 1\\n\"\n    \"sMq qu 1\\n\"\n    \"aFz an 1\\n\"\n    \"sJs st 1\\n\"\n    \"Pwj ij 1\\n\"\n    \"jcL ch 1\\n\"\n    \"gmQ ng 1\\n\"\n    \"Yqr qu 1\\n\"\n    \"Cgz ng 1\\n\"\n    \"wqz qu 1\\n\"\n    \"fnI nt 1\\n\"\n    \"qOt th 1\\n\"\n    \"vyU va 1\\n\"\n    \"wQz sz 1\\n\"\n    \"vUa an 1\\n\"\n    \"xBt th 1\\n\"\n    \"dNm de 1\\n\"\n    \"Ewx wa 1\\n\"\n    \"ypD pr 1\\n\"\n    \"wxL wa 1\\n\"\n    \"qeN qu 1\\n\"\n    \"vkB ka 1\\n\"\n    \"jBj ij 1\\n\"\n    \"gUj ng 1\\n\"\n    \"kQk ka 1\\n\"\n    \"fwO wa 
1\\n\"\n    \"qQt th 1\\n\"\n    \"Qrl er 1\\n\"\n    \"dTx de 1\\n\"\n    \"fWd de 1\\n\"\n    \"jxK ij 1\\n\"\n    \"fHl le 1\\n\"\n    \"jcY ch 1\\n\"\n    \"oJs on 1\\n\"\n    \"sRx st 1\\n\"\n    \"uQg qu 1\\n\"\n    \"hhY th 1\\n\"\n    \"sdN st 1\\n\"\n    \"mxR me 1\\n\"\n    \"Xsv st 1\\n\"\n    \"Pcq ch 1\\n\"\n    \"pkZ ka 1\\n\"\n    \"zDl le 1\\n\"\n    \"rIh th 1\\n\"\n    \"Hnv an 1\\n\"\n    \"jpA ij 1\\n\"\n    \"hZj th 1\\n\"\n    \"Znd an 1\\n\"\n    \"hZd th 1\\n\"\n    \"qrO qu 1\\n\"\n    \"Sbx be 1\\n\"\n    \"tWp th 1\\n\"\n    \"Hpd de 1\\n\"\n    \"Hjz sz 1\\n\"\n    \"zcS ch 1\\n\"\n    \"kPz sz 1\\n\"\n    \"Htq th 1\\n\"\n    \"gcG ch 1\\n\"\n    \"Xqx qu 1\\n\"\n    \"mZc ch 1\\n\"\n    \"Xzv sz 1\\n\"\n    \"Kgw ng 1\\n\"\n    \"aUf an 1\\n\"\n    \"Ymq qu 1\\n\"\n    \"wcY ch 1\\n\"\n    \"oVh th 1\\n\"\n    \"pdM de 1\\n\"\n    \"vzK sz 1\\n\"\n    \"lrX er 1\\n\"\n    \"ydV de 1\\n\"\n    \"uqP qu 1\\n\"\n    \"fmN me 1\\n\"\n    \"Ocg ch 1\\n\"\n    \"fLk ka 1\\n\"\n    \"cJs ch 1\\n\"\n    \"uGf qu 1\\n\"\n    \"cMk ch 1\\n\"\n    \"gTx ng 1\\n\"\n    \"xNc ch 1\\n\"\n    \"bHl le 1\\n\"\n    \"uWp qu 1\\n\"\n    \"dxL de 1\\n\"\n    \"zxG sz 1\\n\"\n    \"dVn an 1\\n\"\n    \"Nbh th 1\\n\"\n    \"Cxs st 1\\n\"\n    \"cvG ch 1\\n\"\n    \"wCf wa 1\\n\"\n    \"kjC ij 1\\n\"\n    \"cfY ch 1\\n\"\n    \"zcf ch 1\\n\"\n    \"dpW de 1\\n\"\n    \"Pqy qu 1\\n\"\n    \"tlN th 1\\n\"\n    \"sIi in 1\\n\"\n    \"qxC qu 1\\n\"\n    \"Kjm ij 1\\n\"\n    \"zZk sz 1\\n\"\n    \"Fks st 1\\n\"\n    \"gWb ng 1\\n\"\n    \"tqK th 1\\n\"\n    \"Jlv le 1\\n\"\n    \"kCk ka 1\\n\"\n    \"whT th 1\\n\"\n    \"Owv va 1\\n\"\n    \"zKm sz 1\\n\"\n    \"jql qu 1\\n\"\n    \"tGz th 1\\n\"\n    \"dCw de 1\\n\"\n    \"ymQ me 1\\n\"\n    \"xnF an 1\\n\"\n    \"wuF qu 1\\n\"\n    \"pFq qu 1\\n\"\n    \"jyS ij 1\\n\"\n    \"pjX ij 1\\n\"\n    \"lOj le 1\\n\"\n    \"Jmd de 1\\n\"\n    \"Zvz sz 1\\n\"\n    \"jqM qu 1\\n\"\n    \"jTd de 1\\n\"\n    \"qOi qu 
1\\n\"\n    \"oJg ng 1\\n\"\n    \"Mjx ij 1\\n\"\n    \"Tpb pr 1\\n\"\n    \"Wtv th 1\\n\"\n    \"jxO ij 1\\n\"\n    \"dBs st 1\\n\"\n    \"tNv th 1\\n\"\n    \"qTb qu 1\\n\"\n    \"vnU an 1\\n\"\n    \"zDx sz 1\\n\"\n    \"pSq qu 1\\n\"\n    \"xRm me 1\\n\"\n    \"qUf qu 1\\n\"\n    \"mBb me 1\\n\"\n    \"qjI qu 1\\n\"\n    \"sIy st 1\\n\"\n    \"dCg ng 1\\n\"\n    \"qIx qu 1\\n\"\n    \"pZp pr 1\\n\"\n    \"qDt th 1\\n\"\n    \"xrM er 1\\n\"\n    \"uOe qu 1\\n\"\n    \"xgO ng 1\\n\"\n    \"grX ng 1\\n\"\n    \"Pgg ng 1\\n\"\n    \"yVq qu 1\\n\"\n    \"qEu un 1\\n\"\n    \"kBc ch 1\\n\"\n    \"Sgz ng 1\\n\"\n    \"hjX th 1\\n\"\n    \"gOq qu 1\\n\"\n    \"pmW me 1\\n\"\n    \"Gnw an 1\\n\"\n    \"xZl le 1\\n\"\n    \"hTd th 1\\n\"\n    \"Gfq qu 1\\n\"\n    \"sLf st 1\\n\"\n    \"Pgj ng 1\\n\"\n    \"twF th 1\\n\"\n    \"mDk ka 1\\n\"\n    \"qdY qu 1\\n\"\n    \"vsZ st 1\\n\"\n    \"vcC ch 1\\n\"\n    \"Dcj ch 1\\n\"\n    \"wUh th 1\\n\"\n    \"qId qu 1\\n\"\n    \"qrZ qu 1\\n\"\n    \"cbS ch 1\\n\"\n    \"Xzc ch 1\\n\"\n    \"vWj ij 1\\n\"\n    \"pvC va 1\\n\"\n    \"Jrw er 1\\n\"\n    \"yxI ny 1\\n\"\n    \"dqI qu 1\\n\"\n    \"uCm qu 1\\n\"\n    \"vXd de 1\\n\"\n    \"Wdp de 1\\n\"\n    \"Dzc ch 1\\n\"\n    \"hdV th 1\\n\"\n    \"qbO qu 1\\n\"\n    \"Jwk ka 1\\n\"\n    \"Wqm qu 1\\n\"\n    \"iXw in 1\\n\"\n    \"fYl le 1\\n\"\n    \"quQ un 1\\n\"\n    \"kjD ij 1\\n\"\n    \"mIh th 1\\n\"\n    \"xWw wa 1\\n\"\n    \"oCw on 1\\n\"\n    \"Zcv ch 1\\n\"\n    \"jdN de 1\\n\"\n    \"uYb qu 1\\n\"\n    \"Srx er 1\\n\"\n    \"pgU ng 1\\n\"\n    \"rQg ng 1\\n\"\n    \"mHf me 1\\n\"\n    \"fBt th 1\\n\"\n    \"jVx ij 1\\n\"\n    \"vYc ch 1\\n\"\n    \"Vgj ng 1\\n\"\n    \"qaS an 1\\n\"\n    \"pxW pr 1\\n\"\n    \"mnJ an 1\\n\"\n    \"Bww wa 1\\n\"\n    \"Tqz qu 1\\n\"\n    \"jFv ij 1\\n\"\n    \"xwM wa 1\\n\"\n    \"Dqw qu 1\\n\"\n    \"mwI me 1\\n\"\n    \"vhW th 1\\n\"\n    \"sqX qu 1\\n\"\n    \"tlR th 1\\n\"\n    \"aBh th 1\\n\"\n    \"qnZ an 1\\n\"\n    \"gXg ng 
1\\n\"\n    \"sCj st 1\\n\"\n    \"grN ng 1\\n\"\n    \"tYv th 1\\n\"\n    \"Wwg ng 1\\n\"\n    \"fYi in 1\\n\"\n    \"btF th 1\\n\"\n    \"wQn an 1\\n\"\n    \"Zlt th 1\\n\"\n    \"cJz ch 1\\n\"\n    \"Xbn an 1\\n\"\n    \"tLm th 1\\n\"\n    \"Zlx le 1\\n\"\n    \"Nmj ij 1\\n\"\n    \"hcG th 1\\n\"\n    \"Wrk er 1\\n\"\n    \"Nhc th 1\\n\"\n    \"vqD qu 1\\n\"\n    \"ujY qu 1\\n\"\n    \"iJd in 1\\n\"\n    \"dLf de 1\\n\"\n    \"cQn ch 1\\n\"\n    \"Wfx fo 1\\n\"\n    \"hkZ th 1\\n\"\n    \"mhC th 1\\n\"\n    \"zMq qu 1\\n\"\n    \"zLz sz 1\\n\"\n    \"Xgt th 1\\n\"\n    \"qKr qu 1\\n\"\n    \"yjJ ij 1\\n\"\n    \"rJm er 1\\n\"\n    \"Vxc ch 1\\n\"\n    \"Bxn an 1\\n\"\n    \"cnQ ch 1\\n\"\n    \"qkQ qu 1\\n\"\n    \"Nlw le 1\\n\"\n    \"hWv th 1\\n\"\n    \"wdU de 1\\n\"\n    \"qtB th 1\\n\"\n    \"qIe qu 1\\n\"\n    \"qeY qu 1\\n\"\n    \"Zrp er 1\\n\"\n    \"Nhd th 1\\n\"\n    \"fDp po 1\\n\"\n    \"Cnj an 1\\n\"\n    \"kxU ka 1\\n\"\n    \"Bqv qu 1\\n\"\n    \"vXr er 1\\n\"\n    \"kBx ka 1\\n\"\n    \"fBn an 1\\n\"\n    \"pMx pr 1\\n\"\n    \"kxR ka 1\\n\"\n    \"Lzg ng 1\\n\"\n    \"jBh th 1\\n\"\n    \"Fjn an 1\\n\"\n    \"wpC pr 1\\n\"\n    \"fKy ny 1\\n\"\n    \"hwD th 1\\n\"\n    \"fqf qu 1\\n\"\n    \"qBy qu 1\\n\"\n    \"Ycq ch 1\\n\"\n    \"Nns an 1\\n\"\n    \"jmZ ij 1\\n\"\n    \"gKw ng 1\\n\"\n    \"dqA qu 1\\n\"\n    \"Bjg ng 1\\n\"\n    \"fGx fo 1\\n\"\n    \"Lnp an 1\\n\"\n    \"whU th 1\\n\"\n    \"qPd qu 1\\n\"\n    \"yMx ny 1\\n\"\n    \"wEj ij 1\\n\"\n    \"kmJ ka 1\\n\"\n    \"Qsx st 1\\n\"\n    \"lCw le 1\\n\"\n    \"Qqb qu 1\\n\"\n    \"hvJ th 1\\n\"\n    \"xkN ka 1\\n\"\n    \"uVg qu 1\\n\"\n    \"sQm st 1\\n\"\n    \"uJp qu 1\\n\"\n    \"Yzn an 1\\n\"\n    \"cXh th 1\\n\"\n    \"srI er 1\\n\"\n    \"tBz th 1\\n\"\n    \"cRj ch 1\\n\"\n    \"yIw wa 1\\n\"\n    \"jHg ng 1\\n\"\n    \"xFp pr 1\\n\"\n    \"wJq qu 1\\n\"\n    \"qdF qu 1\\n\"\n    \"vKv va 1\\n\"\n    \"sHc ch 1\\n\"\n    \"hBf th 1\\n\"\n    \"jDy ij 1\\n\"\n    \"Gjx ij 
1\\n\"\n    \"Fkd de 1\\n\"\n    \"Hhz th 1\\n\"\n    \"xSg ng 1\\n\"\n    \"jFf ij 1\\n\"\n    \"qvM qu 1\\n\"\n    \"oRw on 1\\n\"\n    \"xgX ng 1\\n\"\n    \"gjF ng 1\\n\"\n    \"qDz qu 1\\n\"\n    \"Ycf ch 1\\n\"\n    \"Xcw ch 1\\n\"\n    \"nfQ an 1\\n\"\n    \"qGs qu 1\\n\"\n    \"kGs st 1\\n\"\n    \"fxV fo 1\\n\"\n    \"iPj in 1\\n\"\n    \"qgP qu 1\\n\"\n    \"jIv ij 1\\n\"\n    \"Vhu th 1\\n\"\n    \"Bzj sz 1\\n\"\n    \"Jvg ng 1\\n\"\n    \"Vjf ij 1\\n\"\n    \"wTq qu 1\\n\"\n    \"pDw pr 1\\n\"\n    \"Ysv st 1\\n\"\n    \"ztV th 1\\n\"\n    \"mtZ th 1\\n\"\n    \"jFy ij 1\\n\"\n    \"gqC qu 1\\n\"\n    \"Vsg ng 1\\n\"\n    \"gjS ng 1\\n\"\n    \"vXz sz 1\\n\"\n    \"bpK pr 1\\n\"\n    \"nDq an 1\\n\"\n    \"sKx st 1\\n\"\n    \"xYg ng 1\\n\"\n    \"fZd de 1\\n\"\n    \"pxf pr 1\\n\"\n    \"jqS qu 1\\n\"\n    \"hTb th 1\\n\"\n    \"Nkq qu 1\\n\"\n    \"qpH qu 1\\n\"\n    \"vEz sz 1\\n\"\n    \"vqP qu 1\\n\"\n    \"vHw va 1\\n\"\n    \"Dkp ka 1\\n\"\n    \"cqY ch 1\\n\"\n    \"mqS qu 1\\n\"\n    \"sVt th 1\\n\"\n    \"Pxh th 1\\n\"\n    \"hxN th 1\\n\"\n    \"yTf ny 1\\n\"\n    \"wCj ij 1\\n\"\n    \"qQw qu 1\\n\"\n    \"Vfv va 1\\n\"\n    \"yQd de 1\\n\"\n    \"gUc ch 1\\n\"\n    \"wsQ st 1\\n\"\n    \"fGw wa 1\\n\"\n    \"wKf wa 1\\n\"\n    \"wwB wa 1\\n\"\n    \"vFt th 1\\n\"\n    \"twQ th 1\\n\"\n    \"nrB an 1\\n\"\n    \"lpY le 1\\n\"\n    \"xlR le 1\\n\"\n    \"fdK de 1\\n\"\n    \"eFz er 1\\n\"\n    \"jyQ ij 1\\n\"\n    \"lwT le 1\\n\"\n    \"xCw wa 1\\n\"\n    \"cgM ch 1\\n\"\n    \"wtV th 1\\n\"\n    \"aqJ an 1\\n\"\n    \"bXu qu 1\\n\"\n    \"qdQ qu 1\\n\"\n    \"Yxd de 1\\n\"\n    \"xcS ch 1\\n\"\n    \"nmV an 1\\n\"\n    \"rQd er 1\\n\"\n    \"Glk le 1\\n\"\n    \"qEm qu 1\\n\"\n    \"uvO qu 1\\n\"\n    \"svF st 1\\n\"\n    \"sJx st 1\\n\"\n    \"Qyg ng 1\\n\"\n    \"mXh th 1\\n\"\n    \"btD th 1\\n\"\n    \"wGc ch 1\\n\"\n    \"fZo on 1\\n\"\n    \"Evx va 1\\n\"\n    \"vzD sz 1\\n\"\n    \"ufC qu 1\\n\"\n    \"Pxq qu 1\\n\"\n    \"qdt th 
1\\n\"\n    \"rKz er 1\\n\"\n    \"Jhh th 1\\n\"\n    \"Cxk ka 1\\n\"\n    \"qxR qu 1\\n\"\n    \"gTl ng 1\\n\"\n    \"qGf qu 1\\n\"\n    \"wYh th 1\\n\"\n    \"cEh th 1\\n\"\n    \"bzU sz 1\\n\"\n    \"zWq qu 1\\n\"\n    \"rWb er 1\\n\"\n    \"Wrp er 1\\n\"\n    \"sLc ch 1\\n\"\n    \"Jpu qu 1\\n\"\n    \"Jkf ka 1\\n\"\n    \"vgE ng 1\\n\"\n    \"Bqk qu 1\\n\"\n    \"oQs on 1\\n\"\n    \"kbZ ka 1\\n\"\n    \"rVf er 1\\n\"\n    \"qLw qu 1\\n\"\n    \"Lrc ch 1\\n\"\n    \"xsR st 1\\n\"\n    \"hwB th 1\\n\"\n    \"Qnk an 1\\n\"\n    \"cPz ch 1\\n\"\n    \"Ucq ch 1\\n\"\n    \"egJ ng 1\\n\"\n    \"Qyq qu 1\\n\"\n    \"Xwr pr 1\\n\"\n    \"xfD fo 1\\n\"\n    \"wyH wa 1\\n\"\n    \"lBw le 1\\n\"\n    \"Mdx de 1\\n\"\n    \"Qsy st 1\\n\"\n    \"zqV qu 1\\n\"\n    \"vpY va 1\\n\"\n    \"slY le 1\\n\"\n    \"wgL ng 1\\n\"\n    \"snN an 1\\n\"\n    \"hVd th 1\\n\"\n    \"yKx ny 1\\n\"\n    \"bdW de 1\\n\"\n    \"lqL qu 1\\n\"\n    \"yhD th 1\\n\"\n    \"tNz th 1\\n\"\n    \"zJg ng 1\\n\"\n    \"kIx ka 1\\n\"\n    \"fHp pr 1\\n\"\n    \"yrJ er 1\\n\"\n    \"lrR er 1\\n\"\n    \"wzY sz 1\\n\"\n    \"pgB pr 1\\n\"\n    \"mfC me 1\\n\"\n    \"qkL qu 1\\n\"\n    \"jUu qu 1\\n\"\n    \"qCh th 1\\n\"\n    \"zlN le 1\\n\"\n    \"Bgj ng 1\\n\"\n    \"gcE ch 1\\n\"\n    \"zRx sz 1\\n\"\n    \"jhN th 1\\n\"\n    \"eGz er 1\\n\"\n    \"Fpq qu 1\\n\"\n    \"Wvi in 1\\n\"\n    \"mBf me 1\\n\"\n    \"hhW th 1\\n\"\n    \"oUq qu 1\\n\"\n    \"dxQ de 1\\n\"\n    \"Whq th 1\\n\"\n    \"rMk er 1\\n\"\n    \"lWd le 1\\n\"\n    \"xWz sz 1\\n\"\n    \"oQn an 1\\n\"\n    \"mWx me 1\\n\"\n    \"nuV an 1\\n\"\n    \"wWz sz 1\\n\"\n    \"hvR th 1\\n\"\n    \"Zwd de 1\\n\"\n    \"smJ st 1\\n\"\n    \"Hlh th 1\\n\"\n    \"sJh th 1\\n\"\n    \"zmY sz 1\\n\"\n    \"hZn th 1\\n\"\n    \"Vjg ng 1\\n\"\n    \"Jhz th 1\\n\"\n    \"mqR qu 1\\n\"\n    \"hcO th 1\\n\"\n    \"dqL qu 1\\n\"\n    \"Bfh th 1\\n\"\n    \"pkV ka 1\\n\"\n    \"tBx th 1\\n\"\n    \"Hkc ch 1\\n\"\n    \"Kqm qu 1\\n\"\n    \"qWv qu 
1\\n\"\n    \"lXy le 1\\n\"\n    \"yRd de 1\\n\"\n    \"mjH ij 1\\n\"\n    \"qzA qu 1\\n\"\n    \"qxm qu 1\\n\"\n    \"Qvm va 1\\n\"\n    \"gcM ch 1\\n\"\n    \"xqx qu 1\\n\"\n    \"kKv ka 1\\n\"\n    \"yoX po 1\\n\"\n    \"xrT er 1\\n\"\n    \"cWq ch 1\\n\"\n    \"jqW qu 1\\n\"\n    \"sWj st 1\\n\"\n    \"Sdw de 1\\n\"\n    \"dfR de 1\\n\"\n    \"Kqn an 1\\n\"\n    \"Gjd do 1\\n\"\n    \"Qbd de 1\\n\"\n    \"yyK ny 1\\n\"\n    \"xmX me 1\\n\"\n    \"xuF qu 1\\n\"\n    \"yVg ng 1\\n\"\n    \"qoO qu 1\\n\"\n    \"Glq qu 1\\n\"\n    \"Mkx ka 1\\n\"\n    \"xLb be 1\\n\"\n    \"gMr ng 1\\n\"\n    \"sCp st 1\\n\"\n    \"bGh th 1\\n\"\n    \"cXo ch 1\\n\"\n    \"zTz sz 1\\n\"\n    \"qkC qu 1\\n\"\n    \"hTp th 1\\n\"\n    \"qNf qu 1\\n\"\n    \"mXk ka 1\\n\"\n    \"xcZ ch 1\\n\"\n    \"jVm ij 1\\n\"\n    \"bIi in 1\\n\"\n    \"qnH an 1\\n\"\n    \"nwC an 1\\n\"\n    \"dSg ng 1\\n\"\n    \"qoD qu 1\\n\"\n    \"tDx th 1\\n\"\n    \"jdU de 1\\n\"\n    \"Xmw me 1\\n\"\n    \"kNh th 1\\n\"\n    \"jYr er 1\\n\"\n    \"Ygp ng 1\\n\"\n    \"blJ le 1\\n\"\n    \"mFv va 1\\n\"\n    \"Sxr er 1\\n\"\n    \"Fzl le 1\\n\"\n    \"jTq qu 1\\n\"\n    \"cIp pr 1\\n\"\n    \"ajY an 1\\n\"\n    \"yYb be 1\\n\"\n    \"rKb er 1\\n\"\n    \"pzB sz 1\\n\"\n    \"eIy er 1\\n\"\n    \"wfK wa 1\\n\"\n    \"Fmh th 1\\n\"\n    \"ufL qu 1\\n\"\n    \"Xlm le 1\\n\"\n    \"Czg ng 1\\n\"\n    \"lPq qu 1\\n\"\n    \"tqV th 1\\n\"\n    \"wFy wa 1\\n\"\n    \"bQc ch 1\\n\"\n    \"kVw ka 1\\n\"\n    \"nMh th 1\\n\"\n    \"cCj ch 1\\n\"\n    \"oeE er 1\\n\"\n    \"wHf wa 1\\n\"\n    \"fNf fo 1\\n\"\n    \"mXv va 1\\n\"\n    \"Nkg ng 1\\n\"\n    \"jWc ch 1\\n\"\n    \"zFj sz 1\\n\"\n    \"Kfx fo 1\\n\"\n    \"bgY ng 1\\n\"\n    \"lYz le 1\\n\"\n    \"cgD ch 1\\n\"\n    \"pgM ng 1\\n\"\n    \"fhH th 1\\n\"\n    \"jrD er 1\\n\"\n    \"jwA ij 1\\n\"\n    \"jyM ij 1\\n\"\n    \"vzC sz 1\\n\"\n    \"lQd le 1\\n\"\n    \"zcH ch 1\\n\"\n    \"lbX le 1\\n\"\n    \"vzG sz 1\\n\"\n    \"mSr er 1\\n\"\n    \"xYf fo 
1\\n\"\n    \"qgB qu 1\\n\"\n    \"jYk ij 1\\n\"\n    \"dIq qu 1\\n\"\n    \"wpG pr 1\\n\"\n    \"hVk th 1\\n\"\n    \"Tjb ij 1\\n\"\n    \"zvP sz 1\\n\"\n    \"bZg ng 1\\n\"\n    \"bFg ng 1\\n\"\n    \"kfU ka 1\\n\"\n    \"Sxz sz 1\\n\"\n    \"fwF wa 1\\n\"\n    \"Qwg ng 1\\n\"\n    \"fWb be 1\\n\"\n    \"jqQ ij 1\\n\"\n    \"Vfx fo 1\\n\"\n    \"cJj ch 1\\n\"\n    \"zwJ sz 1\\n\"\n    \"xBg ng 1\\n\"\n    \"Ddm de 1\\n\"\n    \"bWv va 1\\n\"\n    \"zpG sz 1\\n\"\n    \"xrQ er 1\\n\"\n    \"hcS th 1\\n\"\n    \"wHn an 1\\n\"\n    \"hIy th 1\\n\"\n    \"Yxj ij 1\\n\"\n    \"sdC st 1\\n\"\n    \"yVu qu 1\\n\"\n    \"qjf qu 1\\n\"\n    \"Tzy sz 1\\n\"\n    \"Ffn an 1\\n\"\n    \"zzX sz 1\\n\"\n    \"Hdx de 1\\n\"\n    \"gLg ng 1\\n\"\n    \"Yqg qu 1\\n\"\n    \"fLb be 1\\n\"\n    \"lQc ch 1\\n\"\n    \"vjG ij 1\\n\"\n    \"wpL pr 1\\n\"\n    \"cJr ch 1\\n\"\n    \"aJq an 1\\n\"\n    \"Ynq an 1\\n\"\n    \"Wvc ch 1\\n\"\n    \"lKy le 1\\n\"\n    \"eYq qu 1\\n\"\n    \"kxL ka 1\\n\"\n    \"gCb ng 1\\n\"\n    \"sRd st 1\\n\"\n    \"rMd er 1\\n\"\n    \"Bvh th 1\\n\"\n    \"kKg ng 1\\n\"\n    \"wlK le 1\\n\"\n    \"mDd de 1\\n\"\n    \"zkJ sz 1\\n\"\n    \"vRc ch 1\\n\"\n    \"Xlh th 1\\n\"\n    \"pRk ka 1\\n\"\n    \"xvN va 1\\n\"\n    \"nxI an 1\\n\"\n    \"fCx fo 1\\n\"\n    \"Ybt th 1\\n\"\n    \"Ebq qu 1\\n\"\n    \"bkN ka 1\\n\"\n    \"bQy be 1\\n\"\n    \"rDw er 1\\n\"\n    \"djJ de 1\\n\"\n    \"tmM th 1\\n\"\n    \"nwH an 1\\n\"\n    \"hJz th 1\\n\"\n    \"lcM ch 1\\n\"\n    \"ozV on 1\\n\"\n    \"mLd de 1\\n\"\n    \"bKc ch 1\\n\"\n    \"eZf er 1\\n\"\n    \"Fhg th 1\\n\"\n    \"Zcj ch 1\\n\"\n    \"pLr er 1\\n\"\n    \"wqs qu 1\\n\"\n    \"bXi in 1\\n\"\n    \"tgD th 1\\n\"\n    \"hQc th 1\\n\"\n    \"zDp sz 1\\n\"\n    \"oDg ng 1\\n\"\n    \"sgM ng 1\\n\"\n    \"bnD an 1\\n\"\n    \"gHp ng 1\\n\"\n    \"Wkf ka 1\\n\"\n    \"qIs qu 1\\n\"\n    \"wLd de 1\\n\"\n    \"ztN th 1\\n\"\n    \"gdQ ng 1\\n\"\n    \"wCm ow 1\\n\"\n    \"vVf va 1\\n\"\n    \"Jmw me 
1\\n\"\n    \"hbC th 1\\n\"\n    \"srW er 1\\n\"\n    \"nxN an 1\\n\"\n    \"pVs st 1\\n\"\n    \"uWq qu 1\\n\"\n    \"hgM th 1\\n\"\n    \"lBc ch 1\\n\"\n    \"wUo on 1\\n\"\n    \"flH le 1\\n\"\n    \"yWg ng 1\\n\"\n    \"jjN ij 1\\n\"\n    \"Uwn an 1\\n\"\n    \"nYj an 1\\n\"\n    \"mtN th 1\\n\"\n    \"Pgp ng 1\\n\"\n    \"zFc ch 1\\n\"\n    \"oXz on 1\\n\"\n    \"iCg ng 1\\n\"\n    \"Lpc ch 1\\n\"\n    \"Gqd qu 1\\n\"\n    \"rYc ch 1\\n\"\n    \"vqA qu 1\\n\"\n    \"Vhc th 1\\n\"\n    \"zmF sz 1\\n\"\n    \"Bpc ch 1\\n\"\n    \"Jfq qu 1\\n\"\n    \"oXv on 1\\n\"\n    \"lgX ng 1\\n\"\n    \"Jfx fo 1\\n\"\n    \"zpS sz 1\\n\"\n    \"gcO ch 1\\n\"\n    \"xwQ wa 1\\n\"\n    \"pkQ ka 1\\n\"\n    \"wOc ch 1\\n\"\n    \"Wgm ng 1\\n\"\n    \"cOj ch 1\\n\"\n    \"Nft th 1\\n\"\n    \"pqN qu 1\\n\"\n    \"qsB qu 1\\n\"\n    \"ydH de 1\\n\"\n    \"qRs qu 1\\n\"\n    \"ykX ka 1\\n\"\n    \"cDq ch 1\\n\"\n    \"mfU me 1\\n\"\n    \"xzM sz 1\\n\"\n    \"vGt th 1\\n\"\n    \"fuW qu 1\\n\"\n    \"lqG qu 1\\n\"\n    \"Tqp qu 1\\n\"\n    \"zvD sz 1\\n\"\n    \"wWb wa 1\\n\"\n    \"Fzi in 1\\n\"\n    \"qpK qu 1\\n\"\n    \"oyq qu 1\\n\"\n    \"gQe ng 1\\n\"\n    \"Zmw me 1\\n\"\n    \"qYp qu 1\\n\"\n    \"Wvf va 1\\n\"\n    \"aQl an 1\\n\"\n    \"oqO qu 1\\n\"\n    \"eqJ qu 1\\n\"\n    \"nvT an 1\\n\"\n    \"fUk ka 1\\n\"\n    \"ibH in 1\\n\"\n    \"jvZ ij 1\\n\"\n    \"Wwz sz 1\\n\"\n    \"lgY ng 1\\n\"\n    \"eFp er 1\\n\"\n    \"Xgx ng 1\\n\"\n    \"fYs st 1\\n\"\n    \"kZs st 1\\n\"\n    \"vpD va 1\\n\"\n    \"qcZ ch 1\\n\"\n    \"Bqo qu 1\\n\"\n    \"jLb ij 1\\n\"\n    \"rwX er 1\\n\"\n    \"fyK ny 1\\n\"\n    \"Sxv va 1\\n\"\n    \"sxZ st 1\\n\"\n    \"wkK ka 1\\n\"\n    \"yJp pr 1\\n\"\n    \"tjT th 1\\n\"\n    \"qPv qu 1\\n\"\n    \"yZj ij 1\\n\"\n    \"Rrm er 1\\n\"\n    \"nhJ th 1\\n\"\n    \"vqJ qu 1\\n\"\n    \"yxY ny 1\\n\"\n    \"vsE st 1\\n\"\n    \"fkK ka 1\\n\"\n    \"fuY qu 1\\n\"\n    \"zQo on 1\\n\"\n    \"Xvr er 1\\n\"\n    \"mMq qu 1\\n\"\n    \"Oqm qu 
1\\n\"\n    \"Dxs st 1\\n\"\n    \"Lqa an 1\\n\"\n    \"Wnh th 1\\n\"\n    \"jmG ij 1\\n\"\n    \"Wqa an 1\\n\"\n    \"mhT th 1\\n\"\n    \"bgZ ng 1\\n\"\n    \"vmO va 1\\n\"\n    \"zFm sz 1\\n\"\n    \"Khk th 1\\n\"\n    \"yqB qu 1\\n\"\n    \"nVv an 1\\n\"\n    \"Rft th 1\\n\"\n    \"zmL sz 1\\n\"\n    \"hdD th 1\\n\"\n    \"nWp an 1\\n\"\n    \"vvO va 1\\n\"\n    \"dYp de 1\\n\"\n    \"ohX th 1\\n\"\n    \"qoU qu 1\\n\"\n    \"rjB er 1\\n\"\n    \"Dwc ch 1\\n\"\n    \"aWq an 1\\n\"\n    \"clD ch 1\\n\"\n    \"Vdk de 1\\n\"\n    \"twM th 1\\n\"\n    \"fZz sz 1\\n\"\n    \"wQp pr 1\\n\"\n    \"dwD de 1\\n\"\n    \"iYv in 1\\n\"\n    \"Awv va 1\\n\"\n    \"pgG ng 1\\n\"\n    \"Xoq qu 1\\n\"\n    \"krQ er 1\\n\"\n    \"Vxg ng 1\\n\"\n    \"lwB le 1\\n\"\n    \"Pxw wa 1\\n\"\n    \"Jwf wa 1\\n\"\n    \"zLh th 1\\n\"\n    \"btH th 1\\n\"\n    \"pwY pr 1\\n\"\n    \"Mjd de 1\\n\"\n    \"Xrh th 1\\n\"\n    \"qXu un 1\\n\"\n    \"Eqy qu 1\\n\"\n    \"Bpy pr 1\\n\"\n    \"znY an 1\\n\"\n    \"Rqd qu 1\\n\"\n    \"nQf an 1\\n\"\n    \"Zvw va 1\\n\"\n    \"zjO sz 1\\n\"\n    \"wNd de 1\\n\"\n    \"lIq qu 1\\n\"\n    \"vMq qu 1\\n\"\n    \"Gqt th 1\\n\"\n    \"lMf le 1\\n\"\n    \"Jqn an 1\\n\"\n    \"fVw wa 1\\n\"\n    \"qvQ qu 1\\n\"\n    \"eHk er 1\\n\"\n    \"jbK ij 1\\n\"\n    \"fWs st 1\\n\"\n    \"qTk qu 1\\n\"\n    \"znF an 1\\n\"\n    \"yxO ny 1\\n\"\n    \"Fqr qu 1\\n\"\n    \"nFb an 1\\n\"\n    \"oDp on 1\\n\"\n    \"jUc ch 1\\n\"\n    \"qHg qu 1\\n\"\n    \"gGq qu 1\\n\"\n    \"qPs qu 1\\n\"\n    \"jHv ij 1\\n\"\n    \"Iwj ij 1\\n\"\n    \"vzV sz 1\\n\"\n    \"yUq qu 1\\n\"\n    \"jQt th 1\\n\"\n    \"sFb st 1\\n\"\n    \"Lvg ng 1\\n\"\n    \"zTt th 1\\n\"\n    \"bvK va 1\\n\"\n    \"Ccx ch 1\\n\"\n    \"jyA ij 1\\n\"\n    \"yEj ij 1\\n\"\n    \"zdG sz 1\\n\"\n    \"tqT th 1\\n\"\n    \"qbH qu 1\\n\"\n    \"nHd an 1\\n\"\n    \"Hhj th 1\\n\"\n    \"jVb ij 1\\n\"\n    \"uHw un 1\\n\"\n    \"Zck ch 1\\n\"\n    \"gPq qu 1\\n\"\n    \"mxq qu 1\\n\"\n    \"wHs st 
1\\n\"\n    \"fDy ny 1\\n\"\n    \"tlV th 1\\n\"\n    \"Lsv st 1\\n\"\n    \"zvF va 1\\n\"\n    \"mqx qu 1\\n\"\n    \"nqF an 1\\n\"\n    \"xgM ng 1\\n\"\n    \"gyq qu 1\\n\"\n    \"grJ ng 1\\n\"\n    \"jSq qu 1\\n\"\n    \"Mmw me 1\\n\"\n    \"Cgx ng 1\\n\"\n    \"Rlr er 1\\n\"\n    \"mvG va 1\\n\"\n    \"fuA qu 1\\n\"\n    \"uVh th 1\\n\"\n    \"sMz st 1\\n\"\n    \"wWr er 1\\n\"\n    \"qpD qu 1\\n\"\n    \"hQw th 1\\n\"\n    \"xBc ch 1\\n\"\n    \"fcW ch 1\\n\"\n    \"hxL th 1\\n\"\n    \"rfK er 1\\n\"\n    \"mFn an 1\\n\"\n    \"Qnw an 1\\n\"\n    \"tjB th 1\\n\"\n    \"Rkx ka 1\\n\"\n    \"srE er 1\\n\"\n    \"drG er 1\\n\"\n    \"Cfy ny 1\\n\"\n    \"yZw wa 1\\n\"\n    \"Wxw wa 1\\n\"\n    \"zCp sz 1\\n\"\n    \"jZt th 1\\n\"\n    \"Nqf qu 1\\n\"\n    \"jgO ng 1\\n\"\n    \"fWc ch 1\\n\"\n    \"qrN qu 1\\n\"\n    \"Nzj sz 1\\n\"\n    \"Hjy ij 1\\n\"\n    \"Uxy ny 1\\n\"\n    \"oIy on 1\\n\"\n    \"rfX er 1\\n\"\n    \"oBw on 1\\n\"\n    \"yyV ny 1\\n\"\n    \"Qiv in 1\\n\"\n    \"dKh th 1\\n\"\n    \"qDk qu 1\\n\"\n    \"tgQ th 1\\n\"\n    \"xNw wa 1\\n\"\n    \"qdL qu 1\\n\"\n    \"ovY on 1\\n\"\n    \"fbZ be 1\\n\"\n    \"qiI qu 1\\n\"\n    \"bvT va 1\\n\"\n    \"jYq qu 1\\n\"\n    \"kbK ka 1\\n\"\n    \"Mfn an 1\\n\"\n    \"Rpd de 1\\n\"\n    \"pHb pr 1\\n\"\n    \"qqO qu 1\\n\"\n    \"vkV ka 1\\n\"\n    \"sWp st 1\\n\"\n    \"kPf ka 1\\n\"\n    \"qLy qu 1\\n\"\n    \"qoE qu 1\\n\"\n    \"wLh th 1\\n\"\n    \"zhV th 1\\n\"\n    \"bpL pr 1\\n\"\n    \"Tqf qu 1\\n\"\n    \"pzG sz 1\\n\"\n    \"kcT ch 1\\n\"\n    \"wjX ij 1\\n\"\n    \"kPy ku 1\\n\"\n    \"fdB de 1\\n\"\n    \"Qxs st 1\\n\"\n    \"gYf ng 1\\n\"\n    \"Ypx pr 1\\n\"\n    \"zSk sz 1\\n\"\n    \"tDg th 1\\n\"\n    \"xbJ be 1\\n\"\n    \"yfO ny 1\\n\"\n    \"uQf qu 1\\n\"\n    \"bpQ pr 1\\n\"\n    \"dXc ch 1\\n\"\n    \"lwP le 1\\n\"\n    \"vTs st 1\\n\"\n    \"Jlq qu 1\\n\"\n    \"Cqw qu 1\\n\"\n    \"bWy be 1\\n\"\n    \"cUq ch 1\\n\"\n    \"Ybk ka 1\\n\"\n    \"wyq qu 1\\n\"\n    \"jhq th 
1\\n\"\n    \"xUy ny 1\\n\"\n    \"Ncj ch 1\\n\"\n    \"kMh th 1\\n\"\n    \"vZy va 1\\n\"\n    \"zcq ch 1\\n\"\n    \"Qsr er 1\\n\"\n    \"Lhx th 1\\n\"\n    \"Gcj ch 1\\n\"\n    \"uQt th 1\\n\"\n    \"wYn an 1\\n\"\n    \"dYm de 1\\n\"\n    \"Qvx va 1\\n\"\n    \"Rcg ch 1\\n\"\n    \"qGz qu 1\\n\"\n    \"bxJ be 1\\n\"\n    \"jFg ng 1\\n\"\n    \"xLp pr 1\\n\"\n    \"lDn an 1\\n\"\n    \"wqS qu 1\\n\"\n    \"bIq qu 1\\n\"\n    \"tBm th 1\\n\"\n    \"bQs st 1\\n\"\n    \"zJb sz 1\\n\"\n    \"jfJ ij 1\\n\"\n    \"qTc ch 1\\n\"\n    \"kbX ka 1\\n\"\n    \"Hlz le 1\\n\"\n    \"puQ qu 1\\n\"\n    \"hKb th 1\\n\"\n    \"rBb er 1\\n\"\n    \"vpW va 1\\n\"\n    \"Yjk ij 1\\n\"\n    \"Wnm an 1\\n\"\n    \"pZr er 1\\n\"\n    \"ldZ le 1\\n\"\n    \"gMm ng 1\\n\"\n    \"pZf pi 1\\n\"\n    \"eYp er 1\\n\"\n    \"vTp va 1\\n\"\n    \"Gkc ch 1\\n\"\n    \"Cgy ng 1\\n\"\n    \"qDw qu 1\\n\"\n    \"gxW ng 1\\n\"\n    \"Cwz sz 1\\n\"\n    \"jhY th 1\\n\"\n    \"Fvk ka 1\\n\"\n    \"nfH an 1\\n\"\n    \"zcW ch 1\\n\"\n    \"zgC ng 1\\n\"\n    \"Dfk ka 1\\n\"\n    \"vpJ va 1\\n\"\n    \"Wpj ij 1\\n\"\n    \"sCb st 1\\n\"\n    \"fgF ng 1\\n\"\n    \"tPx th 1\\n\"\n    \"oCp on 1\\n\"\n    \"Nrx er 1\\n\"\n    \"Hwm me 1\\n\"\n    \"fRp pr 1\\n\"\n    \"aeX an 1\\n\"\n    \"jdI de 1\\n\"\n    \"sBv st 1\\n\"\n    \"vOv va 1\\n\"\n    \"gQt th 1\\n\"\n    \"Wmk ka 1\\n\"\n    \"Pqj qu 1\\n\"\n    \"khV th 1\\n\"\n    \"Hkj ij 1\\n\"\n    \"hbB th 1\\n\"\n    \"vzF sz 1\\n\"\n    \"Ybz sz 1\\n\"\n    \"sXb st 1\\n\"\n    \"yQr er 1\\n\"\n    \"hhV th 1\\n\"\n    \"tgW th 1\\n\"\n    \"bXo on 1\\n\"\n    \"Nxp pr 1\\n\"\n    \"aOx an 1\\n\"\n    \"zfb sz 1\\n\"\n    \"Qxp pr 1\\n\"\n    \"qwQ qu 1\\n\"\n    \"fjV ij 1\\n\"\n    \"hjY ij 1\\n\"\n    \"wtX th 1\\n\"\n    \"jgU ng 1\\n\"\n    \"nMq an 1\\n\"\n    \"Nwx wa 1\\n\"\n    \"vPg ng 1\\n\"\n    \"Xfh th 1\\n\"\n    \"yFf ny 1\\n\"\n    \"fHz sz 1\\n\"\n    \"nZf an 1\\n\"\n    \"jPt th 1\\n\"\n    \"Jgb ng 1\\n\"\n    \"xBb bi 
1\\n\"\n    \"sjO st 1\\n\"\n    \"wDx wa 1\\n\"\n    \"njN an 1\\n\"\n    \"ohF th 1\\n\"\n    \"pqR qu 1\\n\"\n    \"Fzw sz 1\\n\"\n    \"qrU qu 1\\n\"\n    \"cjG ch 1\\n\"\n    \"kFv ka 1\\n\"\n    \"zQd sz 1\\n\"\n    \"vbE vi 1\\n\"\n    \"Ujt th 1\\n\"\n    \"qIb qu 1\\n\"\n    \"cFt th 1\\n\"\n    \"bvY va 1\\n\"\n    \"Szq qu 1\\n\"\n    \"wlH le 1\\n\"\n    \"qcY ch 1\\n\"\n    \"gEw ng 1\\n\"\n    \"xhL th 1\\n\"\n    \"kVg ng 1\\n\"\n    \"bfH be 1\\n\"\n    \"Nrz er 1\\n\"\n    \"sJn an 1\\n\"\n    \"bWn an 1\\n\"\n    \"nvK an 1\\n\"\n    \"qiH qu 1\\n\"\n    \"qbS qu 1\\n\"\n    \"vxB va 1\\n\"\n    \"tvT th 1\\n\"\n    \"Nrh th 1\\n\"\n    \"lYx le 1\\n\"\n    \"tkX th 1\\n\"\n    \"Gzx sz 1\\n\"\n    \"vCx vi 1\\n\"\n    \"Zbj ij 1\\n\"\n    \"mWp me 1\\n\"\n    \"Dqx qu 1\\n\"\n    \"pfE pr 1\\n\"\n    \"hvW th 1\\n\"\n    \"Eox on 1\\n\"\n    \"dbZ de 1\\n\"\n    \"lNb le 1\\n\"\n    \"rTd er 1\\n\"\n    \"ljQ le 1\\n\"\n    \"Vvp va 1\\n\"\n    \"gJw ng 1\\n\"\n    \"uqW qu 1\\n\"\n    \"Gjf ij 1\\n\"\n    \"pDd de 1\\n\"\n    \"sgQ ng 1\\n\"\n    \"hkQ th 1\\n\"\n    \"fJc ch 1\\n\"\n    \"mdI de 1\\n\"\n    \"Gcp ch 1\\n\"\n    \"pXa an 1\\n\"\n    \"pQj ij 1\\n\"\n    \"bgE ng 1\\n\"\n    \"Kzv sz 1\\n\"\n    \"cPb ch 1\\n\"\n    \"Hcz ch 1\\n\"\n    \"djQ de 1\\n\"\n    \"pGd de 1\\n\"\n    \"fyE ny 1\\n\"\n    \"dBb de 1\\n\"\n    \"ePj er 1\\n\"\n    \"fgO ng 1\\n\"\n    \"xRq qu 1\\n\"\n    \"xqK qu 1\\n\"\n    \"pKp pr 1\\n\"\n    \"xmY me 1\\n\"\n    \"hgO th 1\\n\"\n    \"wdG de 1\\n\"\n    \"hvZ th 1\\n\"\n    \"srF er 1\\n\"\n    \"Bvf vi 1\\n\"\n    \"yvD va 1\\n\"\n    \"xVg ng 1\\n\"\n    \"fYg ng 1\\n\"\n    \"bqd qu 1\\n\"\n    \"eFq qu 1\\n\"\n    \"cwZ ch 1\\n\"\n    \"cqG ch 1\\n\"\n    \"sKp st 1\\n\"\n    \"hJq th 1\\n\"\n    \"vLd de 1\\n\"\n    \"hdK th 1\\n\"\n    \"pcN ch 1\\n\"\n    \"tNf th 1\\n\"\n    \"xlK le 1\\n\"\n    \"rJx er 1\\n\"\n    \"qaN an 1\\n\"\n    \"zKf sz 1\\n\"\n    \"sNf st 1\\n\"\n    \"qPz qu 
1\\n\"\n    \"bzL sz 1\\n\"\n    \"Jdw de 1\\n\"\n    \"nRb an 1\\n\"\n    \"jNs st 1\\n\"\n    \"tnV th 1\\n\"\n    \"ynI an 1\\n\"\n    \"tZp th 1\\n\"\n    \"fZp pr 1\\n\"\n    \"wMq qu 1\\n\"\n    \"Onq an 1\\n\"\n    \"zIh th 1\\n\"\n    \"bvH va 1\\n\"\n    \"Uvc ch 1\\n\"\n    \"zxJ sz 1\\n\"\n    \"Vmq qu 1\\n\"\n    \"uPm qu 1\\n\"\n    \"mwD me 1\\n\"\n    \"jQc ch 1\\n\"\n    \"gPk ng 1\\n\"\n    \"vfV va 1\\n\"\n    \"Tql qu 1\\n\"\n    \"bJl le 1\\n\"\n    \"lwO le 1\\n\"\n    \"wbG wa 1\\n\"\n    \"fTd de 1\\n\"\n    \"Xtq th 1\\n\"\n    \"hzX th 1\\n\"\n    \"Pzv sz 1\\n\"\n    \"Pmx me 1\\n\"\n    \"xZm me 1\\n\"\n    \"jCp ij 1\\n\"\n    \"bKm me 1\\n\"\n    \"Tmq qu 1\\n\"\n    \"Hnf an 1\\n\"\n    \"kjX ij 1\\n\"\n    \"vgH ng 1\\n\"\n    \"fSm me 1\\n\"\n    \"ylN le 1\\n\"\n    \"gvq qu 1\\n\"\n    \"jTz sz 1\\n\"\n    \"tWw th 1\\n\"\n    \"ywB wa 1\\n\"\n    \"bCq qu 1\\n\"\n    \"dNk de 1\\n\"\n    \"yCq qu 1\\n\"\n    \"Rxj ij 1\\n\"\n    \"nTq an 1\\n\"\n    \"gFs ng 1\\n\"\n    \"Xwq qu 1\\n\"\n    \"gJl ng 1\\n\"\n    \"vcR ch 1\\n\"\n    \"fbT be 1\\n\"\n    \"Fcd ch 1\\n\"\n    \"Wxm me 1\\n\"\n    \"qwv qu 1\\n\"\n    \"Sfh th 1\\n\"\n    \"lcK ch 1\\n\"\n    \"sbV st 1\\n\"\n    \"fSf fo 1\\n\"\n    \"lbB le 1\\n\"\n    \"Ocw ch 1\\n\"\n    \"jgM ng 1\\n\"\n    \"nbI an 1\\n\"\n    \"qsK qu 1\\n\"\n    \"Xyf ny 1\\n\"\n    \"pxv va 1\\n\"\n    \"mRc ch 1\\n\"\n    \"Ogq qu 1\\n\"\n    \"zuY qu 1\\n\"\n    \"fXu qu 1\\n\"\n    \"Wbj ij 1\\n\"\n    \"Tbw wa 1\\n\"\n    \"zrR er 1\\n\"\n    \"gmP ng 1\\n\"\n    \"cCm ch 1\\n\"\n    \"gtQ th 1\\n\"\n    \"phG th 1\\n\"\n    \"qjV qu 1\\n\"\n    \"ygG ng 1\\n\"\n    \"wFb wa 1\\n\"\n    \"rqL qu 1\\n\"\n    \"qSx qu 1\\n\"\n    \"ybK be 1\\n\"\n    \"mqJ qu 1\\n\"\n    \"Qrq qu 1\\n\"\n    \"qdI qu 1\\n\"\n    \"bcG ch 1\\n\"\n    \"iFb in 1\\n\"\n    \"mcZ ch 1\\n\"\n    \"vCz sz 1\\n\"\n    \"xHz tz 1\\n\"\n    \"hjM th 1\\n\"\n    \"qtL th 1\\n\"\n    \"tmH th 1\\n\"\n    \"slD le 
1\\n\"\n    \"vRz sz 1\\n\"\n    \"gCd ng 1\\n\"\n    \"Xxc ch 1\\n\"\n    \"qKc ch 1\\n\"\n    \"sIw st 1\\n\"\n    \"fsY st 1\\n\"\n    \"xrJ er 1\\n\"\n    \"tNs th 1\\n\"\n    \"gbD ng 1\\n\"\n    \"wLl le 1\\n\"\n    \"hFf th 1\\n\"\n    \"Nxi in 1\\n\"\n    \"fRb be 1\\n\"\n    \"Jrb er 1\\n\"\n    \"jEq qu 1\\n\"\n    \"hwM th 1\\n\"\n    \"uVw qu 1\\n\"\n    \"fgN ng 1\\n\"\n    \"mAo on 1\\n\"\n    \"Pjb ij 1\\n\"\n    \"npP in 1\\n\"\n    \"Jcy ch 1\\n\"\n    \"yJb bi 1\\n\"\n    \"jxI ij 1\\n\"\n    \"Kkc ch 1\\n\"\n    \"kwV ka 1\\n\"\n    \"gRf ng 1\\n\"\n    \"Wfm me 1\\n\"\n    \"Tdp po 1\\n\"\n    \"wEz sz 1\\n\"\n    \"Lvk ka 1\\n\"\n    \"Dqn an 1\\n\"\n    \"tqL th 1\\n\"\n    \"jJq qu 1\\n\"\n    \"vdC de 1\\n\"\n    \"hxU th 1\\n\"\n    \"xUe er 1\\n\"\n    \"tQc th 1\\n\"\n    \"Lzk sz 1\\n\"\n    \"dTj de 1\\n\"\n    \"Tlz le 1\\n\"\n    \"xQw wa 1\\n\"\n    \"Fcq ch 1\\n\"\n    \"wgE ng 1\\n\"\n    \"Ckd de 1\\n\"\n    \"yKs st 1\\n\"\n    \"xwS wa 1\\n\"\n    \"wRt th 1\\n\"\n    \"gkK ng 1\\n\"\n    \"hQv th 1\\n\"\n    \"sLp st 1\\n\"\n    \"jAi in 1\\n\"\n    \"dmG de 1\\n\"\n    \"jKn an 1\\n\"\n    \"qUb qu 1\\n\"\n    \"wXy wa 1\\n\"\n    \"bzJ sz 1\\n\"\n    \"gzJ ng 1\\n\"\n    \"hNz th 1\\n\"\n    \"ygY ng 1\\n\"\n    \"qhU th 1\\n\"\n    \"afX an 1\\n\"\n    \"jZw ij 1\\n\"\n    \"Xdx de 1\\n\"\n    \"Tdx de 1\\n\"\n    \"jNn an 1\\n\"\n    \"vXf va 1\\n\"\n    \"qcE ch 1\\n\"\n    \"Mnw an 1\\n\"\n    \"qDh th 1\\n\"\n    \"Tdj de 1\\n\"\n    \"dgJ ng 1\\n\"\n    \"sdR st 1\\n\"\n    \"qGn an 1\\n\"\n    \"Mjj ij 1\\n\"\n    \"sxH st 1\\n\"\n    \"Ppz sz 1\\n\"\n    \"gfV ng 1\\n\"\n    \"fOy ny 1\\n\"\n    \"Nvx vi 1\\n\"\n    \"qaV an 1\\n\"\n    \"xjl le 1\\n\"\n    \"xgZ ng 1\\n\"\n    \"cGv ch 1\\n\"\n    \"Zxu qu 1\\n\"\n    \"Mfp pr 1\\n\"\n    \"zFp sz 1\\n\"\n    \"jgJ ng 1\\n\"\n    \"bpG pr 1\\n\"\n    \"vKz sz 1\\n\"\n    \"hqI th 1\\n\"\n    \"Qgw ng 1\\n\"\n    \"Qyy ny 1\\n\"\n    \"jmI ij 1\\n\"\n    \"Vgd ng 
1\\n\"\n    \"xCt th 1\\n\"\n    \"yVs st 1\\n\"\n    \"uEq qu 1\\n\"\n    \"dcN ch 1\\n\"\n    \"Bzb sz 1\\n\"\n    \"gVl ng 1\\n\"\n    \"sXg ng 1\\n\"\n    \"kQf ka 1\\n\"\n    \"lrY er 1\\n\"\n    \"Vtd th 1\\n\"\n    \"nHs an 1\\n\"\n    \"wjN ij 1\\n\"\n    \"rzJ er 1\\n\"\n    \"sYy st 1\\n\"\n    \"wxQ wa 1\\n\"\n    \"Ztb th 1\\n\"\n    \"tWf th 1\\n\"\n    \"tCx th 1\\n\"\n    \"aFb an 1\\n\"\n    \"lqf qu 1\\n\"\n    \"feZ er 1\\n\"\n    \"fPz sz 1\\n\"\n    \"cjY ch 1\\n\"\n    \"wKh th 1\\n\"\n    \"Qhy th 1\\n\"\n    \"dCj de 1\\n\"\n    \"bkH ka 1\\n\"\n    \"yjD ij 1\\n\"\n    \"jTs st 1\\n\"\n    \"hxI th 1\\n\"\n    \"lvK vi 1\\n\"\n    \"Lwz sz 1\\n\"\n    \"swQ st 1\\n\"\n    \"dTk di 1\\n\"\n    \"fsO st 1\\n\"\n    \"ljE le 1\\n\"\n    \"wjM ij 1\\n\"\n    \"uQk qu 1\\n\"\n    \"xPg ng 1\\n\"\n    \"vmC va 1\\n\"\n    \"qsD qu 1\\n\"\n    \"gDw ng 1\\n\"\n    \"wJk ka 1\\n\"\n    \"Zpq qu 1\\n\"\n    \"Yhg th 1\\n\"\n    \"kNc ch 1\\n\"\n    \"bWl le 1\\n\"\n    \"Fwh th 1\\n\"\n    \"fHx fo 1\\n\"\n    \"Fnv an 1\\n\"\n    \"fdL de 1\\n\"\n    \"oqD qu 1\\n\"\n    \"aYx an 1\\n\"\n    \"Vqx qu 1\\n\"\n    \"vKf va 1\\n\"\n    \"Cbw wa 1\\n\"\n    \"vyq qu 1\\n\"\n    \"cqZ ch 1\\n\"\n    \"Rfh th 1\\n\"\n    \"Swc ch 1\\n\"\n    \"qNi qu 1\\n\"\n    \"qoW qu 1\\n\"\n    \"jhD th 1\\n\"\n    \"kJq qu 1\\n\"\n    \"gdF ng 1\\n\"\n    \"pvF va 1\\n\"\n    \"cpV ch 1\\n\"\n    \"qtC th 1\\n\"\n    \"gWm ng 1\\n\"\n    \"gPc ch 1\\n\"\n    \"jBs st 1\\n\"\n    \"rlV er 1\\n\"\n    \"gZc ch 1\\n\"\n    \"kTk ka 1\\n\"\n    \"hfJ th 1\\n\"\n    \"Svv va 1\\n\"\n    \"kmG ka 1\\n\"\n    \"sDq qu 1\\n\"\n    \"hGb th 1\\n\"\n    \"Blq qu 1\\n\"\n    \"Qry er 1\\n\"\n    \"hHz th 1\\n\"\n    \"yLx ny 1\\n\"\n    \"lqF qu 1\\n\"\n    \"wbB bi 1\\n\"\n    \"iYr in 1\\n\"\n    \"wDz tz 1\\n\"\n    \"xsJ st 1\\n\"\n    \"bzY sz 1\\n\"\n    \"pMw pr 1\\n\"\n    \"Uuj qu 1\\n\"\n    \"hxK th 1\\n\"\n    \"Xvf va 1\\n\"\n    \"krZ er 1\\n\"\n    \"fwV wa 
1\\n\"\n    \"gPw ng 1\\n\"\n    \"qVn an 1\\n\"\n    \"Qnq an 1\\n\"\n    \"gDb ng 1\\n\"\n    \"hVr th 1\\n\"\n    \"zKh th 1\\n\"\n    \"Fxy ny 1\\n\"\n    \"oZj on 1\\n\"\n    \"zAy sz 1\\n\"\n    \"jMm ij 1\\n\"\n    \"mvI va 1\\n\"\n    \"Fwm me 1\\n\"\n    \"zql qu 1\\n\"\n    \"eVv er 1\\n\"\n    \"yWq qu 1\\n\"\n    \"Lwk ka 1\\n\"\n    \"Lmw me 1\\n\"\n    \"vXb va 1\\n\"\n    \"Xhs th 1\\n\"\n    \"hlR th 1\\n\"\n    \"Qqw qu 1\\n\"\n    \"zbK sz 1\\n\"\n    \"Pxl le 1\\n\"\n    \"nPm an 1\\n\"\n    \"wQo on 1\\n\"\n    \"Dcb ch 1\\n\"\n    \"hjT th 1\\n\"\n    \"rjJ er 1\\n\"\n    \"bMc ch 1\\n\"\n    \"iYb in 1\\n\"\n    \"Fqj qu 1\\n\"\n    \"Uoq qu 1\\n\"\n    \"Xvp va 1\\n\"\n    \"Lwb wa 1\\n\"\n    \"Jpd de 1\\n\"\n    \"qUg qu 1\\n\"\n    \"lJx le 1\\n\"\n    \"Xwd de 1\\n\"\n    \"xKf fo 1\\n\"\n    \"Znq an 1\\n\"\n    \"qCb qu 1\\n\"\n    \"Zbz sz 1\\n\"\n    \"Qux qu 1\\n\"\n    \"qNq qu 1\\n\"\n    \"fvV va 1\\n\"\n    \"Qqz qu 1\\n\"\n    \"Hdf de 1\\n\"\n    \"ySx ny 1\\n\"\n    \"qSm qu 1\\n\"\n    \"Lhb th 1\\n\"\n    \"Mvf va 1\\n\"\n    \"cDp ch 1\\n\"\n    \"bHq qu 1\\n\"\n    \"Wmg ng 1\\n\"\n    \"ytG th 1\\n\"\n    \"dbJ de 1\\n\"\n    \"Ffg ng 1\\n\"\n    \"hvM th 1\\n\"\n    \"Wqy qu 1\\n\"\n    \"gXd ng 1\\n\"\n    \"uFg qu 1\\n\"\n    \"jpR ij 1\\n\"\n    \"Xcc ch 1\\n\"\n    \"Tbp pr 1\\n\"\n    \"Qwq qu 1\\n\"\n    \"tPp th 1\\n\"\n    \"fMh th 1\\n\"\n    \"qiV qu 1\\n\"\n    \"dcB ch 1\\n\"\n    \"dFx de 1\\n\"\n    \"Ymj ij 1\\n\"\n    \"Ldq qu 1\\n\"\n    \"lxV le 1\\n\"\n    \"cCk ch 1\\n\"\n    \"hVx th 1\\n\"\n    \"dlT le 1\\n\"\n    \"khP th 1\\n\"\n    \"qVg qu 1\\n\"\n    \"Ljj ij 1\\n\"\n    \"zCv sz 1\\n\"\n    \"ywV wa 1\\n\"\n    \"ybZ be 1\\n\"\n    \"vGh th 1\\n\"\n    \"Bvj ij 1\\n\"\n    \"Zqq qu 1\\n\"\n    \"Gwk ka 1\\n\"\n    \"qLq qu 1\\n\"\n    \"fkX ka 1\\n\"\n    \"Nbz sz 1\\n\"\n    \"bXm me 1\\n\"\n    \"dQh th 1\\n\"\n    \"uYd qu 1\\n\"\n    \"xYs st 1\\n\"\n    \"zSs st 1\\n\"\n    \"ycZ ch 
1\\n\"\n    \"lnU an 1\\n\"\n    \"tCj th 1\\n\"\n    \"xnY an 1\\n\"\n    \"ptQ th 1\\n\"\n    \"swO st 1\\n\"\n    \"hXu th 1\\n\"\n    \"mBw mb 1\\n\"\n    \"wmF me 1\\n\"\n    \"xJx xe 1\\n\"\n    \"dXj de 1\\n\"\n    \"eqg qu 1\\n\"\n    \"nBf an 1\\n\"\n    \"Xbd de 1\\n\"\n    \"fcQ ch 1\\n\"\n    \"xkS ka 1\\n\"\n    \"tOq th 1\\n\"\n    \"uQb qu 1\\n\"\n    \"cvV ch 1\\n\"\n    \"sBh th 1\\n\"\n    \"dCk de 1\\n\"\n    \"cKv ch 1\\n\"\n    \"cVf ch 1\\n\"\n    \"wZx wa 1\\n\"\n    \"Bvm va 1\\n\"\n    \"lqJ qu 1\\n\"\n    \"fxR fo 1\\n\"\n    \"vmF va 1\\n\"\n    \"xnq an 1\\n\"\n    \"bBg ng 1\\n\"\n    \"tPd th 1\\n\"\n    \"fNs st 1\\n\"\n    \"Fkp ka 1\\n\"\n    \"Yye er 1\\n\"\n    \"Ubq qu 1\\n\"\n    \"xzP sz 1\\n\"\n    \"fmQ me 1\\n\"\n    \"qcA ch 1\\n\"\n    \"yKc ch 1\\n\"\n    \"xvZ va 1\\n\"\n    \"cbN ch 1\\n\"\n    \"yYl le 1\\n\"\n    \"Pmw me 1\\n\"\n    \"wFx wa 1\\n\"\n    \"hRh th 1\\n\"\n    \"qpS qu 1\\n\"\n    \"Vqf qu 1\\n\"\n    \"Ghg th 1\\n\"\n    \"Wvq qu 1\\n\"\n    \"xkC ka 1\\n\"\n    \"ytM th 1\\n\"\n    \"Lnh th 1\\n\"\n    \"dxD de 1\\n\"\n    \"bMw wa 1\\n\"\n    \"xvU va 1\\n\"\n    \"Qzx sz 1\\n\"\n    \"srM er 1\\n\"\n    \"vLg ng 1\\n\"\n    \"cGq ch 1\\n\"\n    \"Vmy me 1\\n\"\n    \"hcL th 1\\n\"\n    \"pKx pr 1\\n\"\n    \"Jxs st 1\\n\"\n    \"blW le 1\\n\"\n    \"pQo on 1\\n\"\n    \"bEq qu 1\\n\"\n    \"fWt th 1\\n\"\n    \"sYm st 1\\n\"\n    \"nKw an 1\\n\"\n    \"dtF th 1\\n\"\n    \"kTz sz 1\\n\"\n    \"epX er 1\\n\"\n    \"fCp pr 1\\n\"\n    \"bFk ka 1\\n\"\n    \"Rzb sz 1\\n\"\n    \"vqI qu 1\\n\"\n    \"Zhc th 1\\n\"\n    \"Hvv va 1\\n\"\n    \"mVt th 1\\n\"\n    \"Iwx wa 1\\n\"\n    \"phR th 1\\n\"\n    \"wNb wa 1\\n\"\n    \"fRc ch 1\\n\"\n    \"ljq qu 1\\n\"\n    \"lvY le 1\\n\"\n    \"jcA ch 1\\n\"\n    \"dGw de 1\\n\"\n    \"Cqn an 1\\n\"\n    \"mBx me 1\\n\"\n    \"Mmx me 1\\n\"\n    \"Vxa an 1\\n\"\n    \"Xhw th 1\\n\"\n    \"eqK qu 1\\n\"\n    \"tCw th 1\\n\"\n    \"zvU sz 1\\n\"\n    \"lxQ le 
1\\n\"\n    \"vMv va 1\\n\"\n    \"gqA qu 1\\n\"\n    \"Jbn an 1\\n\"\n    \"gCj ng 1\\n\"\n    \"oTf on 1\\n\"\n    \"kbW ka 1\\n\"\n    \"qjY qu 1\\n\"\n    \"Rqf qu 1\\n\"\n    \"hYh th 1\\n\"\n    \"yhE th 1\\n\"\n    \"gYj ng 1\\n\"\n    \"jcI ch 1\\n\"\n    \"qvJ qu 1\\n\"\n    \"qoC qu 1\\n\"\n    \"qFc ch 1\\n\"\n    \"qqH qu 1\\n\"\n    \"Nxq qu 1\\n\"\n    \"wVo on 1\\n\"\n    \"zHv sz 1\\n\"\n    \"ybS be 1\\n\"\n    \"Hwc ch 1\\n\"\n    \"Mxa an 1\\n\"\n    \"xkL ka 1\\n\"\n    \"qmO qu 1\\n\"\n    \"qbR qu 1\\n\"\n    \"Zfy ny 1\\n\"\n    \"Rkf ka 1\\n\"\n    \"vgV ng 1\\n\"\n    \"hBw th 1\\n\"\n    \"pXx pr 1\\n\"\n    \"brQ er 1\\n\"\n    \"fvO va 1\\n\"\n    \"hDc th 1\\n\"\n    \"xQa an 1\\n\"\n    \"wfF wa 1\\n\"\n    \"hZx th 1\\n\"\n    \"Jgz ng 1\\n\"\n    \"qnY an 1\\n\"\n    \"qXl le 1\\n\"\n    \"eNb er 1\\n\"\n    \"fxS fo 1\\n\"\n    \"sNk st 1\\n\"\n    \"mFc ch 1\\n\"\n    \"Uux qu 1\\n\"\n    \"Ydg ng 1\\n\"\n    \"ozW on 1\\n\"\n    \"Xzd de 1\\n\"\n    \"Jfe er 1\\n\"\n    \"Ftx th 1\\n\"\n    \"vzR sz 1\\n\"\n    \"wZk ka 1\\n\"\n    \"oHz on 1\\n\"\n    \"qvT qu 1\\n\"\n    \"qoA qu 1\\n\"\n    \"Sdq qu 1\\n\"\n    \"txW th 1\\n\"\n    \"Egf ng 1\\n\"\n    \"dMf de 1\\n\"\n    \"Rhh th 1\\n\"\n    \"vRn an 1\\n\"\n    \"ujX qu 1\\n\"\n    \"fRj ij 1\\n\"\n    \"gjA ng 1\\n\"\n    \"gDg ng 1\\n\"\n    \"smZ st 1\\n\"\n    \"jId de 1\\n\"\n    \"qkM qu 1\\n\"\n    \"bKz sz 1\\n\"\n    \"sCg ng 1\\n\"\n    \"uTp qu 1\\n\"\n    \"lVs le 1\\n\"\n    \"uQo qu 1\\n\"\n    \"Jfs st 1\\n\"\n    \"vKm va 1\\n\"\n    \"jQh th 1\\n\"\n    \"fUf fo 1\\n\"\n    \"uTf qu 1\\n\"\n    \"Bnv an 1\\n\"\n    \"tdU th 1\\n\"\n    \"dxY de 1\\n\"\n    \"hgV th 1\\n\"\n    \"Zdf de 1\\n\"\n    \"hqS th 1\\n\"\n    \"eJg ng 1\\n\"\n    \"qGu un 1\\n\"\n    \"vmE va 1\\n\"\n    \"gKz ng 1\\n\"\n    \"mUg ng 1\\n\"\n    \"Vjy ij 1\\n\"\n    \"uvJ qu 1\\n\"\n    \"mHr er 1\\n\"\n    \"Mhv th 1\\n\"\n    \"zsZ st 1\\n\"\n    \"Vzy sz 1\\n\"\n    \"jKb ij 
1\\n\"\n    \"zPp sz 1\\n\"\n    \"qgD qu 1\\n\"\n    \"Xhf th 1\\n\"\n    \"Ogp ng 1\\n\"\n    \"jwX ij 1\\n\"\n    \"lYy le 1\\n\"\n    \"qzD qu 1\\n\"\n    \"wXj jo 1\\n\"\n    \"Kpx pr 1\\n\"\n    \"ydY de 1\\n\"\n    \"vBq qu 1\\n\"\n    \"Zpp pr 1\\n\"\n    \"bDd de 1\\n\"\n    \"Fjk ij 1\\n\"\n    \"kdA de 1\\n\"\n    \"zWt th 1\\n\"\n    \"wSd de 1\\n\"\n    \"kFd de 1\\n\"\n    \"Sxl le 1\\n\"\n    \"Fvh th 1\\n\"\n    \"pbR pr 1\\n\"\n    \"qrD qu 1\\n\"\n    \"vZs st 1\\n\"\n    \"vUm va 1\\n\"\n    \"wEy wa 1\\n\"\n    \"jjH jo 1\\n\"\n    \"sDg ng 1\\n\"\n    \"Ujc ch 1\\n\"\n    \"knI an 1\\n\"\n    \"fOa an 1\\n\"\n    \"Cjg ng 1\\n\"\n    \"tbV th 1\\n\"\n    \"gqd qu 1\\n\"\n    \"ePx er 1\\n\"\n    \"wRm me 1\\n\"\n    \"pvG va 1\\n\"\n    \"Qyl le 1\\n\"\n    \"cwG ch 1\\n\"\n    \"Dtq th 1\\n\"\n    \"Pbz sz 1\\n\"\n    \"Rgq qu 1\\n\"\n    \"fjU ij 1\\n\"\n    \"jJf ij 1\\n\"\n    \"Rxq qu 1\\n\"\n    \"Jtx th 1\\n\"\n    \"qvZ qu 1\\n\"\n    \"kKm ka 1\\n\"\n    \"hFm th 1\\n\"\n    \"kcX ch 1\\n\"\n    \"fNm me 1\\n\"\n    \"bpB pr 1\\n\"\n    \"xqY qu 1\\n\"\n    \"hYy th 1\\n\"\n    \"gGp ng 1\\n\"\n    \"Vfs st 1\\n\"\n    \"wDt th 1\\n\"\n    \"bTs st 1\\n\"\n    \"hfV th 1\\n\"\n    \"qzp qu 1\\n\"\n    \"yUv va 1\\n\"\n    \"qGc ch 1\\n\"\n    \"Vdl le 1\\n\"\n    \"Xjt th 1\\n\"\n    \"kMj ij 1\\n\"\n    \"hTg th 1\\n\"\n    \"Hlc ch 1\\n\"\n    \"tKz th 1\\n\"\n    \"Wvt th 1\\n\"\n    \"lMz le 1\\n\"\n    \"Mwx wa 1\\n\"\n    \"Wlv le 1\\n\"\n    \"xzG sz 1\\n\"\n    \"gmD ng 1\\n\"\n    \"zOi in 1\\n\"\n    \"bbI be 1\\n\"\n    \"bpI pr 1\\n\"\n    \"fQg ng 1\\n\"\n    \"pQv va 1\\n\"\n    \"vEb va 1\\n\"\n    \"jFz sz 1\\n\"\n    \"Whf th 1\\n\"\n    \"jvQ ij 1\\n\"\n    \"qYx qu 1\\n\"\n    \"rxM er 1\\n\"\n    \"vPp va 1\\n\"\n    \"fjD ij 1\\n\"\n    \"Vwy wa 1\\n\"\n    \"Yqc ch 1\\n\"\n    \"tcW th 1\\n\"\n    \"jYg ng 1\\n\"\n    \"gJb ng 1\\n\"\n    \"Tkc ch 1\\n\"\n    \"qhj th 1\\n\"\n    \"jxF ij 1\\n\"\n    \"Fpz sz 
1\\n\"\n    \"kXh th 1\\n\"\n    \"lgZ ng 1\\n\"\n    \"znI an 1\\n\"\n    \"qyN qu 1\\n\"\n    \"vBj ij 1\\n\"\n    \"jSx ij 1\\n\"\n    \"cqI ch 1\\n\"\n    \"qYv qu 1\\n\"\n    \"Zrr er 1\\n\"\n    \"sHr er 1\\n\"\n    \"vrK er 1\\n\"\n    \"pbH pr 1\\n\"\n    \"zVh th 1\\n\"\n    \"dQb de 1\\n\"\n    \"lxF le 1\\n\"\n    \"sgW ng 1\\n\"\n    \"Ghf th 1\\n\"\n    \"xpq qu 1\\n\"\n    \"qhN th 1\\n\"\n    \"Fsf st 1\\n\"\n    \"Qga an 1\\n\"\n    \"Rdp de 1\\n\"\n    \"fvK va 1\\n\"\n    \"Ydz de 1\\n\"\n    \"wvW va 1\\n\"\n    \"cPm ch 1\\n\"\n    \"cQy ch 1\\n\"\n    \"ywF wa 1\\n\"\n    \"Ypq qu 1\\n\"\n    \"Rsj st 1\\n\"\n    \"Ygw ng 1\\n\"\n    \"xVp pr 1\\n\"\n    \"yxL ny 1\\n\"\n    \"Ywl le 1\\n\"\n    \"jMc ch 1\\n\"\n    \"zTl le 1\\n\"\n    \"aIq an 1\\n\"\n    \"qQi qu 1\\n\"\n    \"tqI th 1\\n\"\n    \"Hvp va 1\\n\"\n    \"wQd de 1\\n\"\n    \"hfG th 1\\n\"\n    \"cTd ch 1\\n\"\n    \"bfQ be 1\\n\"\n    \"Kfd de 1\\n\"\n    \"cXs ch 1\\n\"\n    \"vYx va 1\\n\"\n    \"Qoc ro 1\\n\"\n    \"vrL er 1\\n\"\n    \"pZk ka 1\\n\"\n    \"cdX ch 1\\n\"\n    \"Ygn an 1\\n\"\n    \"lnO an 1\\n\"\n    \"mfY me 1\\n\"\n    \"fnV an 1\\n\"\n    \"mbZ me 1\\n\"\n    \"gbE ng 1\\n\"\n    \"xjZ ij 1\\n\"\n    \"Fpy pr 1\\n\"\n    \"npE an 1\\n\"\n    \"Rxy ny 1\\n\"\n    \"oWp on 1\\n\"\n    \"hVh th 1\\n\"\n    \"yJf ny 1\\n\"\n    \"sQd st 1\\n\"\n    \"Zvg ng 1\\n\"\n    \"bDm me 1\\n\"\n    \"pLv va 1\\n\"\n    \"wwF wa 1\\n\"\n    \"xBh th 1\\n\"\n    \"qKm qu 1\\n\"\n    \"wXx wa 1\\n\"\n    \"Iux qu 1\\n\"\n    \"dgB ng 1\\n\"\n    \"gJp ng 1\\n\"\n    \"qgx qu 1\\n\"\n    \"fNh ho 1\\n\"\n    \"cvE ch 1\\n\"\n    \"cgH ch 1\\n\"\n    \"lNs le 1\\n\"\n    \"vDj ij 1\\n\"\n    \"zcG ch 1\\n\"\n    \"fZn on 1\\n\"\n    \"uUx qu 1\\n\"\n    \"clQ le 1\\n\"\n    \"fdH de 1\\n\"\n    \"eZj er 1\\n\"\n    \"Vqc ch 1\\n\"\n    \"Rcx ch 1\\n\"\n    \"jGh th 1\\n\"\n    \"qzM sz 1\\n\"\n    \"Qpw pr 1\\n\"\n    \"Spx pr 1\\n\"\n    \"cGx ch 1\\n\"\n    \"cqA ch 
1\\n\"\n    \"vbK va 1\\n\"\n    \"xeW er 1\\n\"\n    \"vkC ka 1\\n\"\n    \"xzB sz 1\\n\"\n    \"xuR qu 1\\n\"\n    \"Oyq qu 1\\n\"\n    \"Mqx qu 1\\n\"\n    \"qqj qu 1\\n\"\n    \"yqY qu 1\\n\"\n    \"cwL ch 1\\n\"\n    \"pPt th 1\\n\"\n    \"dSx de 1\\n\"\n    \"dPk de 1\\n\"\n    \"uzH qu 1\\n\"\n    \"fvH va 1\\n\"\n    \"pcH ch 1\\n\"\n    \"hlY le 1\\n\"\n    \"qtX th 1\\n\"\n    \"Nvs st 1\\n\"\n    \"hvL th 1\\n\"\n    \"zRk sz 1\\n\"\n    \"tNj th 1\\n\"\n    \"Dbv va 1\\n\"\n    \"jKc ch 1\\n\"\n    \"dKy de 1\\n\"\n    \"yVz sz 1\\n\"\n    \"iqJ qu 1\\n\"\n    \"zgJ ng 1\\n\"\n    \"eJs er 1\\n\"\n    \"wOx wa 1\\n\"\n    \"rXh th 1\\n\"\n    \"Hqp qu 1\\n\"\n    \"vWx va 1\\n\"\n    \"bTt th 1\\n\"\n    \"fCy ny 1\\n\"\n    \"aOq an 1\\n\"\n    \"oCg ng 1\\n\"\n    \"pnE an 1\\n\"\n    \"Fwc ch 1\\n\"\n    \"zrT er 1\\n\"\n    \"xHs st 1\\n\"\n    \"ydX de 1\\n\"\n    \"dkV de 1\\n\"\n    \"Rqy qu 1\\n\"\n    \"Zyq qu 1\\n\"\n    \"kXl le 1\\n\"\n    \"oJt th 1\\n\"\n    \"sxI st 1\\n\"\n    \"qZw qu 1\\n\"\n    \"zqx qu 1\\n\"\n    \"clZ ch 1\\n\"\n    \"swX sz 1\\n\"\n    \"aHw an 1\\n\"\n    \"rWc ch 1\\n\"\n    \"cQp ch 1\\n\"\n    \"Jwj ij 1\\n\"\n    \"qeV qu 1\\n\"\n    \"sQj st 1\\n\"\n    \"Rpb pr 1\\n\"\n    \"mZq qu 1\\n\"\n    \"rBx er 1\\n\"\n    \"mxV me 1\\n\"\n    \"Mvy ny 1\\n\"\n    \"cRl ch 1\\n\"\n    \"Fzv sz 1\\n\"\n    \"pBs sz 1\\n\"\n    \"jWs st 1\\n\"\n    \"vqK qu 1\\n\"\n    \"Ixl le 1\\n\"\n    \"yhw th 1\\n\"\n    \"wyQ wa 1\\n\"\n    \"uCb qu 1\\n\"\n    \"zrF sz 1\\n\"\n    \"iyQ in 1\\n\"\n    \"qsP qu 1\\n\"\n    \"hLr er 1\\n\"\n    \"cvX ch 1\\n\"\n    \"Scq ch 1\\n\"\n    \"zrL er 1\\n\"\n    \"ecU ch 1\\n\"\n    \"Vxz sz 1\\n\"\n    \"fCq qu 1\\n\"\n    \"ovX on 1\\n\"\n    \"Uqn an 1\\n\"\n    \"sVw st 1\\n\"\n    \"spX st 1\\n\"\n    \"Qkv ka 1\\n\"\n    \"fyW ny 1\\n\"\n    \"rBc ch 1\\n\"\n    \"mdC de 1\\n\"\n    \"Wjk ij 1\\n\"\n    \"jYh th 1\\n\"\n    \"hXq th 1\\n\"\n    \"xkm ka 1\\n\"\n    \"hhU th 
1\\n\"\n    \"Dvz sz 1\\n\"\n    \"tcq th 1\\n\"\n    \"wZy wa 1\\n\"\n    \"jtC th 1\\n\"\n    \"qnD an 1\\n\"\n    \"vmB va 1\\n\"\n    \"kjB ij 1\\n\"\n    \"cdG ch 1\\n\"\n    \"Vkt th 1\\n\"\n    \"hNq th 1\\n\"\n    \"Jft th 1\\n\"\n    \"iWv in 1\\n\"\n    \"Wtn th 1\\n\"\n    \"lfE le 1\\n\"\n    \"dZb de 1\\n\"\n    \"eqQ qu 1\\n\"\n    \"gUq qu 1\\n\"\n    \"qwL qu 1\\n\"\n    \"hUq th 1\\n\"\n    \"hGc th 1\\n\"\n    \"nwX an 1\\n\"\n    \"Nbt th 1\\n\"\n    \"jjP ij 1\\n\"\n    \"sqJ qu 1\\n\"\n    \"lQf le 1\\n\"\n    \"jZz sz 1\\n\"\n    \"wWn an 1\\n\"\n    \"Mxu qu 1\\n\"\n    \"qFi qu 1\\n\"\n    \"mjX ij 1\\n\"\n    \"vDx va 1\\n\"\n    \"vDn an 1\\n\"\n    \"wUc ch 1\\n\"\n    \"zhU th 1\\n\"\n    \"zHw sz 1\\n\"\n    \"Tjl le 1\\n\"\n    \"xuX qu 1\\n\"\n    \"jZp ij 1\\n\"\n    \"wVc ch 1\\n\"\n    \"gFp ng 1\\n\"\n    \"Gyq qu 1\\n\"\n    \"Jlh th 1\\n\"\n    \"Bkf ka 1\\n\"\n    \"hhJ th 1\\n\"\n    \"tvW th 1\\n\"\n    \"bIy ny 1\\n\"\n    \"Llg ng 1\\n\"\n    \"zJz sz 1\\n\"\n    \"qeQ qu 1\\n\"\n    \"nlX an 1\\n\"\n    \"tcQ th 1\\n\"\n    \"qtU th 1\\n\"\n    \"fkW ka 1\\n\"\n    \"gJk ng 1\\n\"\n    \"gQy ng 1\\n\"\n    \"sPz st 1\\n\"\n    \"bmO me 1\\n\"\n    \"Ytx th 1\\n\"\n    \"yqF qu 1\\n\"\n    \"iBk in 1\\n\"\n    \"uzV qu 1\\n\"\n    \"xNp pr 1\\n\"\n    \"zRz sz 1\\n\"\n    \"qHq qu 1\\n\"\n    \"yuY qu 1\\n\"\n    \"jqh th 1\\n\"\n    \"xBd de 1\\n\"\n    \"vvA va 1\\n\"\n    \"eVj er 1\\n\"\n    \"zGp sz 1\\n\"\n    \"vcB ch 1\\n\"\n    \"kpH ka 1\\n\"\n    \"mDw me 1\\n\"\n    \"vuG qu 1\\n\"\n    \"vVy ny 1\\n\"\n    \"mzS sz 1\\n\"\n    \"jvM ij 1\\n\"\n    \"sfV st 1\\n\"\n    \"hQq th 1\\n\"\n    \"wTm me 1\\n\"\n    \"Plq qu 1\\n\"\n    \"fxJ fo 1\\n\"\n    \"qQq qu 1\\n\"\n    \"Fnw an 1\\n\"\n    \"qJo qu 1\\n\"\n    \"Nsg ng 1\\n\"\n    \"Ljx ij 1\\n\"\n    \"sRb st 1\\n\"\n    \"pcY ch 1\\n\"\n    \"vVm va 1\\n\"\n    \"sQg ng 1\\n\"\n    \"Ywz sz 1\\n\"\n    \"hqJ th 1\\n\"\n    \"sjK st 1\\n\"\n    \"Zks st 
1\\n\"\n    \"Mjt th 1\\n\"\n    \"Dwh th 1\\n\"\n    \"wbN wa 1\\n\"\n    \"mvK va 1\\n\"\n    \"rLp er 1\\n\"\n    \"Lbm me 1\\n\"\n    \"wjO ij 1\\n\"\n    \"lQz le 1\\n\"\n    \"Kwf wa 1\\n\"\n    \"qmB qu 1\\n\"\n    \"Xbv va 1\\n\"\n    \"cKq ch 1\\n\"\n    \"hqR th 1\\n\"\n    \"yVb be 1\\n\"\n    \"xcF ch 1\\n\"\n    \"Ewv va 1\\n\"\n    \"Gpq qu 1\\n\"\n    \"Gbh th 1\\n\"\n    \"yHj ij 1\\n\"\n    \"gXk ng 1\\n\"\n    \"qOx qu 1\\n\"\n    \"Kbw wa 1\\n\"\n    \"qHx qu 1\\n\"\n    \"wjP ij 1\\n\"\n    \"jQl le 1\\n\"\n    \"Ffq qu 1\\n\"\n    \"oYb on 1\\n\"\n    \"Fqo qu 1\\n\"\n    \"wXz sz 1\\n\"\n    \"fIp pr 1\\n\"\n    \"pMf pr 1\\n\"\n    \"nqP an 1\\n\"\n    \"bbZ be 1\\n\"\n    \"hsX th 1\\n\"\n    \"Wjr er 1\\n\"\n    \"Zqn an 1\\n\"\n    \"Pxb be 1\\n\"\n    \"Bzs st 1\\n\"\n    \"pbI pr 1\\n\"\n    \"Yvp va 1\\n\"\n    \"jxM ij 1\\n\"\n    \"jyZ ij 1\\n\"\n    \"mzJ sz 1\\n\"\n    \"vYg ng 1\\n\"\n    \"qMm qu 1\\n\"\n    \"fhL th 1\\n\"\n    \"qOg qu 1\\n\"\n    \"Mnp an 1\\n\"\n    \"Ifv va 1\\n\"\n    \"qYm qu 1\\n\"\n    \"gxv ng 1\\n\"\n    \"zfG sz 1\\n\"\n    \"fqG qu 1\\n\"\n    \"lLq qu 1\\n\"\n    \"hkK th 1\\n\"\n    \"oYk on 1\\n\"\n    \"lRg le 1\\n\"\n    \"lOx le 1\\n\"\n    \"Vxv va 1\\n\"\n    \"qAs qu 1\\n\"\n    \"tKk th 1\\n\"\n    \"lhF th 1\\n\"\n    \"dCv de 1\\n\"\n    \"wvY va 1\\n\"\n    \"wiV in 1\\n\"\n    \"crF ch 1\\n\"\n    \"fEp pr 1\\n\"\n    \"Rrl er 1\\n\"\n    \"Zjy ij 1\\n\"\n    \"qbY qu 1\\n\"\n    \"kMw ka 1\\n\"\n    \"vZi in 1\\n\"\n    \"Fxi in 1\\n\"\n    \"zkS sz 1\\n\"\n    \"vKb va 1\\n\"\n    \"zbI sz 1\\n\"\n    \"uHg qu 1\\n\"\n    \"qzG qu 1\\n\"\n    \"jMk ij 1\\n\"\n    \"Fkc ch 1\\n\"\n    \"dKm de 1\\n\"\n    \"nHh th 1\\n\"\n    \"xGc ch 1\\n\"\n    \"qpU qu 1\\n\"\n    \"rcU ch 1\\n\"\n    \"aWx an 1\\n\"\n    \"xdS de 1\\n\"\n    \"qhV th 1\\n\"\n    \"aHc ch 1\\n\"\n    \"vmI va 1\\n\"\n    \"Wcc ch 1\\n\"\n    \"zBn an 1\\n\"\n    \"kQe er 1\\n\"\n    \"awJ an 1\\n\"\n    \"xdD de 
1\\n\"\n    \"yZx ny 1\\n\"\n    \"Kkd de 1\\n\"\n    \"wBz sz 1\\n\"\n    \"lzA le 1\\n\"\n    \"yyT ny 1\\n\"\n    \"qeK qu 1\\n\"\n    \"zpE sz 1\\n\"\n    \"zFn an 1\\n\"\n    \"yyG ny 1\\n\"\n    \"lLw le 1\\n\"\n    \"bvS va 1\\n\"\n    \"mvX va 1\\n\"\n    \"hlW th 1\\n\"\n    \"pgX ng 1\\n\"\n    \"lQt th 1\\n\"\n    \"ymY me 1\\n\"\n    \"mjJ ij 1\\n\"\n    \"mVc ch 1\\n\"\n    \"Xqs qu 1\\n\"\n    \"bKr er 1\\n\"\n    \"bHt th 1\\n\"\n    \"jRv ij 1\\n\"\n    \"Lpw pr 1\\n\"\n    \"zPb sz 1\\n\"\n    \"wkR ka 1\\n\"\n    \"kxS ka 1\\n\"\n    \"jWf ij 1\\n\"\n    \"Nkx ka 1\\n\"\n    \"Kcj ch 1\\n\"\n    \"bJb be 1\\n\"\n    \"xwZ wa 1\\n\"\n    \"Rqc ch 1\\n\"\n    \"Qzg ng 1\\n\"\n    \"jwH ij 1\\n\"\n    \"Dqd qu 1\\n\"\n    \"vLf va 1\\n\"\n    \"hXd th 1\\n\"\n    \"cfD ch 1\\n\"\n    \"sjX st 1\\n\"\n    \"hzI th 1\\n\"\n    \"qUd qu 1\\n\"\n    \"tSx th 1\\n\"\n    \"hxA th 1\\n\"\n    \"gxK ng 1\\n\"\n    \"hVm th 1\\n\"\n    \"yzX sz 1\\n\"\n    \"Ucs ch 1\\n\"\n    \"qaH an 1\\n\"\n    \"Yfy ny 1\\n\"\n    \"sJg ng 1\\n\"\n    \"iHp in 1\\n\"\n    \"iyC in 1\\n\"\n    \"Tjf ij 1\\n\"\n    \"dJp de 1\\n\"\n    \"Jgv ng 1\\n\"\n    \"uJf qu 1\\n\"\n    \"nNl an 1\\n\"\n    \"zdA sz 1\\n\"\n    \"xIq qu 1\\n\"\n    \"qjK qu 1\\n\"\n    \"vzY sz 1\\n\"\n    \"wqv qu 1\\n\"\n    \"Xvx va 1\\n\"\n    \"fJr er 1\\n\"\n    \"nqH an 1\\n\"\n    \"qGd qu 1\\n\"\n    \"vQg ng 1\\n\"\n    \"iQz in 1\\n\"\n    \"tLn th 1\\n\"\n    \"lVj le 1\\n\"\n    \"vqW qu 1\\n\"\n    \"zrN er 1\\n\"\n    \"xKz sz 1\\n\"\n    \"waV an 1\\n\"\n    \"Ydq qu 1\\n\"\n    \"dkq qu 1\\n\"\n    \"fCn an 1\\n\"\n    \"Xcy ch 1\\n\"\n    \"pIl le 1\\n\"\n    \"hXl th 1\\n\"\n    \"aFs an 1\\n\"\n    \"iwM in 1\\n\"\n    \"Gwx wa 1\\n\"\n    \"Xlp le 1\\n\"\n    \"Qfu qu 1\\n\"\n    \"jqE qu 1\\n\"\n    \"lqP qu 1\\n\"\n    \"kVq qu 1\\n\"\n    \"xqJ qu 1\\n\"\n    \"Mzf sz 1\\n\"\n    \"mNw me 1\\n\"\n    \"Wsv st 1\\n\"\n    \"fnM an 1\\n\"\n    \"uSf qu 1\\n\"\n    \"hCf th 
1\\n\"\n    \"zjH sz 1\\n\"\n    \"mTs st 1\\n\"\n    \"jWz sz 1\\n\"\n    \"Dxk ka 1\\n\"\n    \"Ztd th 1\\n\"\n    \"Rvv va 1\\n\"\n    \"gBx ng 1\\n\"\n    \"Lzx sz 1\\n\"\n    \"ezU er 1\\n\"\n    \"jqH qu 1\\n\"\n    \"Rjh th 1\\n\"\n    \"Dcg ch 1\\n\"\n    \"bBh th 1\\n\"\n    \"fhO th 1\\n\"\n    \"hpH th 1\\n\"\n    \"Zqa an 1\\n\"\n    \"kCx ka 1\\n\"\n    \"rRv er 1\\n\"\n    \"dkZ de 1\\n\"\n    \"Ggx ng 1\\n\"\n    \"pQh th 1\\n\"\n    \"Gcv ch 1\\n\"\n    \"Scg ch 1\\n\"\n    \"vDb va 1\\n\"\n    \"pbD pr 1\\n\"\n    \"vEh th 1\\n\"\n    \"vlE le 1\\n\"\n    \"Rjl le 1\\n\"\n    \"lFw le 1\\n\"\n    \"zqN qu 1\\n\"\n    \"aPq an 1\\n\"\n    \"gjD ng 1\\n\"\n    \"jcE ch 1\\n\"\n    \"wSw wa 1\\n\"\n    \"Dgj ng 1\\n\"\n    \"huZ th 1\\n\"\n    \"gPv ng 1\\n\"\n    \"pJj ij 1\\n\"\n    \"cQh th 1\\n\"\n    \"mwq qu 1\\n\"\n    \"vpA va 1\\n\"\n    \"hGf th 1\\n\"\n    \"cXz ch 1\\n\"\n    \"Lcb ch 1\\n\"\n    \"fJm me 1\\n\"\n    \"Qzy sz 1\\n\"\n    \"zQm sz 1\\n\"\n    \"Hhn th 1\\n\"\n    \"xdY de 1\\n\"\n    \"uYl qu 1\\n\"\n    \"Xkj ij 1\\n\"\n    \"jvA ij 1\\n\"\n    \"Jvp va 1\\n\"\n    \"iwZ in 1\\n\"\n    \"zkq qu 1\\n\"\n    \"Nhb th 1\\n\"\n    \"kmV ka 1\\n\"\n    \"qKd qu 1\\n\"\n    \"Bcq ch 1\\n\"\n    \"pfY pr 1\\n\"\n    \"qUj qu 1\\n\"\n    \"gqR qu 1\\n\"\n    \"gwO ng 1\\n\"\n    \"gXm ng 1\\n\"\n    \"jHh th 1\\n\"\n    \"rBn an 1\\n\"\n    \"uPw qu 1\\n\"\n    \"pJk ka 1\\n\"\n    \"Ipj ij 1\\n\"\n    \"yqM qu 1\\n\"\n    \"Yqn an 1\\n\"\n    \"Kbz sz 1\\n\"\n    \"vfL va 1\\n\"\n    \"npZ an 1\\n\"\n    \"oqY qu 1\\n\"\n    \"Zqf qu 1\\n\"\n    \"jzU sz 1\\n\"\n    \"vNx va 1\\n\"\n    \"hXf th 1\\n\"\n    \"fCg ng 1\\n\"\n    \"nzJ an 1\\n\"\n    \"mKj ij 1\\n\"\n    \"wmB me 1\\n\"\n    \"Wjq qu 1\\n\"\n    \"Dbq qu 1\\n\"\n    \"zXy sz 1\\n\"\n    \"xYw wa 1\\n\"\n    \"fQf fo 1\\n\"\n    \"dqP qu 1\\n\"\n    \"Kxq qu 1\\n\"\n    \"jdZ de 1\\n\"\n    \"qrX qu 1\\n\"\n    \"Lxb be 1\\n\"\n    \"yfL ny 1\\n\"\n    \"yYm me 
1\\n\"\n    \"sbH st 1\\n\"\n    \"wlV le 1\\n\"\n    \"uKp qu 1\\n\"\n    \"hhN th 1\\n\"\n    \"Xxq qu 1\\n\"\n    \"jLg ng 1\\n\"\n    \"nQh th 1\\n\"\n    \"Wqp qu 1\\n\"\n    \"Nqd qu 1\\n\"\n    \"jfD ij 1\\n\"\n    \"Jnq an 1\\n\"\n    \"Bzn an 1\\n\"\n    \"mJr er 1\\n\"\n    \"qaX an 1\\n\"\n    \"pJw pr 1\\n\"\n    \"jHz sz 1\\n\"\n    \"yaX an 1\\n\"\n    \"Whs th 1\\n\"\n    \"hYr th 1\\n\"\n    \"tmS th 1\\n\"\n    \"Fhy th 1\\n\"\n    \"Ggd ng 1\\n\"\n    \"Xmy me 1\\n\"\n    \"Rqh th 1\\n\"\n    \"Fsn an 1\\n\"\n    \"qhA th 1\\n\"\n    \"fhX th 1\\n\"\n    \"Hqx qu 1\\n\"\n    \"wIo on 1\\n\"\n    \"Ibx be 1\\n\"\n    \"cFx ch 1\\n\"\n    \"dRg ng 1\\n\"\n    \"snV an 1\\n\"\n    \"kqz qu 1\\n\"\n    \"eqO er 1\\n\"\n    \"Gkz sz 1\\n\"\n    \"Nnz an 1\\n\"\n    \"yqE qu 1\\n\"\n    \"cJh th 1\\n\"\n    \"xvA va 1\\n\"\n    \"qMx qu 1\\n\"\n    \"dwS de 1\\n\"\n    \"yAj ij 1\\n\"\n    \"xCq qu 1\\n\"\n    \"gmE ng 1\\n\"\n    \"bhP th 1\\n\"\n    \"rwE er 1\\n\"\n    \"Xnz an 1\\n\"\n    \"Uhw th 1\\n\"\n    \"xnR an 1\\n\"\n    \"nfZ an 1\\n\"\n    \"Qpx pr 1\\n\"\n    \"qxO qu 1\\n\"\n    \"lGt th 1\\n\"\n    \"qRc ch 1\\n\"\n    \"Rwx wa 1\\n\"\n    \"tcM th 1\\n\"\n    \"fBd de 1\\n\"\n    \"Rjc ch 1\\n\"\n    \"dfY de 1\\n\"\n    \"hhR th 1\\n\"\n    \"bCj ij 1\\n\"\n    \"fqL qu 1\\n\"\n    \"lzS le 1\\n\"\n    \"Lrm er 1\\n\"\n    \"eqE qu 1\\n\"\n    \"vgL ng 1\\n\"\n    \"wQr er 1\\n\"\n    \"bwB wa 1\\n\"\n    \"lGf le 1\\n\"\n    \"Nwq qu 1\\n\"\n    \"sdU st 1\\n\"\n    \"Zxv va 1\\n\"\n    \"yDm me 1\\n\"\n    \"Lsw st 1\\n\"\n    \"cNq ch 1\\n\"\n    \"Dqc ch 1\\n\"\n    \"vLz sz 1\\n\"\n    \"dWv de 1\\n\"\n    \"fkQ ka 1\\n\"\n    \"zjD sz 1\\n\"\n    \"yYv va 1\\n\"\n    \"qeT qu 1\\n\"\n    \"cvL ch 1\\n\"\n    \"wkA ka 1\\n\"\n    \"Nvb va 1\\n\"\n    \"djM de 1\\n\"\n    \"hgK th 1\\n\"\n    \"pXb pr 1\\n\"\n    \"Tlw le 1\\n\"\n    \"Rhz ha 1\\n\"\n    \"wkP ka 1\\n\"\n    \"wDk ka 1\\n\"\n    \"eFc ch 1\\n\"\n    \"ehU th 
1\\n\"\n    \"Xly le 1\\n\"\n    \"wxK wa 1\\n\"\n    \"dPw de 1\\n\"\n    \"sFd st 1\\n\"\n    \"vcI ch 1\\n\"\n    \"Fxd de 1\\n\"\n    \"fvR va 1\\n\"\n    \"jqs qu 1\\n\"\n    \"rMj er 1\\n\"\n    \"qbW qu 1\\n\"\n    \"kpP ka 1\\n\"\n    \"Bvw va 1\\n\"\n    \"Tmk ka 1\\n\"\n    \"hbP th 1\\n\"\n    \"hMx th 1\\n\"\n    \"jgL ng 1\\n\"\n    \"efU er 1\\n\"\n    \"cQb ch 1\\n\"\n    \"mcA ch 1\\n\"\n    \"Ewq qu 1\\n\"\n    \"xmV me 1\\n\"\n    \"Qcq ch 1\\n\"\n    \"mzG sz 1\\n\"\n    \"pKm me 1\\n\"\n    \"Fwq qu 1\\n\"\n    \"lRn an 1\\n\"\n    \"jPk ij 1\\n\"\n    \"jMb ij 1\\n\"\n    \"mzO sz 1\\n\"\n    \"oFw on 1\\n\"\n    \"hJb th 1\\n\"\n    \"sVq qu 1\\n\"\n    \"iVz in 1\\n\"\n    \"oqU qu 1\\n\"\n    \"bhW th 1\\n\"\n    \"Oxq qu 1\\n\"\n    \"mQk ka 1\\n\"\n    \"Xfb be 1\\n\"\n    \"cNw ch 1\\n\"\n    \"fgZ ng 1\\n\"\n    \"Tvf va 1\\n\"\n    \"sIx st 1\\n\"\n    \"uZs qu 1\\n\"\n    \"xzX sz 1\\n\"\n    \"Ylq qu 1\\n\"\n    \"oHf on 1\\n\"\n    \"csU ch 1\\n\"\n    \"Qzs st 1\\n\"\n    \"Bfq qu 1\\n\"\n    \"yJn an 1\\n\"\n    \"pgQ ng 1\\n\"\n    \"wxk ka 1\\n\"\n    \"Tnw an 1\\n\"\n    \"bKx be 1\\n\"\n    \"bqX qu 1\\n\"\n    \"Qjs st 1\\n\"\n    \"pFh th 1\\n\"\n    \"Xvl le 1\\n\"\n    \"kfB ka 1\\n\"\n    \"mZl le 1\\n\"\n    \"Csg ng 1\\n\"\n    \"vrJ er 1\\n\"\n    \"Gfy ny 1\\n\"\n    \"jbP ij 1\\n\"\n    \"Yvl le 1\\n\"\n    \"Hxb be 1\\n\"\n    \"lrD er 1\\n\"\n    \"qTl qu 1\\n\"\n    \"aBc ch 1\\n\"\n    \"fGb be 1\\n\"\n    \"mhS th 1\\n\"\n    \"zTp sz 1\\n\"\n    \"kRd de 1\\n\"\n    \"Wph th 1\\n\"\n    \"Npj ij 1\\n\"\n    \"lwS le 1\\n\"\n    \"mGm me 1\\n\"\n    \"nqT an 1\\n\"\n    \"Ujn an 1\\n\"\n    \"xjO ij 1\\n\"\n    \"dMz sz 1\\n\"\n    \"wKj ij 1\\n\"\n    \"yZr er 1\\n\"\n    \"Njb ij 1\\n\"\n    \"Ylr er 1\\n\"\n    \"mVf me 1\\n\"\n    \"gZg ng 1\\n\"\n    \"Hcb ch 1\\n\"\n    \"xcB ch 1\\n\"\n    \"kMm ka 1\\n\"\n    \"lwC le 1\\n\"\n    \"Dnf an 1\\n\"\n    \"hjW th 1\\n\"\n    \"rTk er 1\\n\"\n    \"Vzj sz 
1\\n\"\n    \"Vxy ny 1\\n\"\n    \"wlQ le 1\\n\"\n    \"Nrv er 1\\n\"\n    \"pjP ij 1\\n\"\n    \"fwZ wa 1\\n\"\n    \"tnW th 1\\n\"\n    \"oJw on 1\\n\"\n    \"kJx ka 1\\n\"\n    \"Vpj ij 1\\n\"\n    \"qAw qu 1\\n\"\n    \"Qht th 1\\n\"\n    \"bCn an 1\\n\"\n    \"vrU er 1\\n\"\n    \"hRc th 1\\n\"\n    \"clC ch 1\\n\"\n    \"rFd er 1\\n\"\n    \"twH th 1\\n\"\n    \"kCw ka 1\\n\"\n    \"mSd de 1\\n\"\n    \"Xnw an 1\\n\"\n    \"fXm me 1\\n\"\n    \"Twf wa 1\\n\"\n    \"Fwj ij 1\\n\"\n    \"bjJ ij 1\\n\"\n    \"lbQ le 1\\n\"\n    \"kvS ka 1\\n\"\n    \"Smz sz 1\\n\"\n    \"fBp pr 1\\n\"\n    \"Nzz sz 1\\n\"\n    \"bQp pr 1\\n\"\n    \"vLx va 1\\n\"\n    \"hVf th 1\\n\"\n    \"yUj ij 1\\n\"\n    \"cZd ch 1\\n\"\n    \"gIy eg 1\\n\"\n    \"hVq th 1\\n\"\n    \"aQx an 1\\n\"\n    \"Qfv va 1\\n\"\n    \"lKb le 1\\n\"\n    \"zhN th 1\\n\"\n    \"Zbm me 1\\n\"\n    \"Gcq ch 1\\n\"\n    \"gbT ng 1\\n\"\n    \"pYk ka 1\\n\"\n    \"Xvd de 1\\n\"\n    \"xMl le 1\\n\"\n    \"uHb qu 1\\n\"\n    \"bXf be 1\\n\"\n    \"sNc ch 1\\n\"\n    \"qVy qu 1\\n\"\n    \"cpO ch 1\\n\"\n    \"Fgb ng 1\\n\"\n    \"eWl er 1\\n\"\n    \"kKd de 1\\n\"\n    \"Cbj ij 1\\n\"\n    \"mfH me 1\\n\"\n    \"qIa an 1\\n\"\n    \"sfX st 1\\n\"\n    \"snH an 1\\n\"\n    \"Hjg ng 1\\n\"\n    \"Lmf me 1\\n\"\n    \"xgf ng 1\\n\"\n    \"Evw va 1\\n\"\n    \"wOk ka 1\\n\"\n    \"Hjf ij 1\\n\"\n    \"zuJ qu 1\\n\"\n    \"fZm me 1\\n\"\n    \"lNq qu 1\\n\"\n    \"xUg ng 1\\n\"\n    \"nLs an 1\\n\"\n    \"jkS ij 1\\n\"\n    \"Gvp va 1\\n\"\n    \"jPd de 1\\n\"\n    \"ywQ wa 1\\n\"\n    \"qrG qu 1\\n\"\n    \"bbH be 1\\n\"\n    \"ghJ th 1\\n\"\n    \"mMh th 1\\n\"\n    \"Yvt th 1\\n\"\n    \"xLq qu 1\\n\"\n    \"Bdq qu 1\\n\"\n    \"zJd sz 1\\n\"\n    \"xRs st 1\\n\"\n    \"vgP ng 1\\n\"\n    \"Hhb th 1\\n\"\n    \"npL an 1\\n\"\n    \"vFp va 1\\n\"\n    \"hSj th 1\\n\"\n    \"bdC de 1\\n\"\n    \"kGg ng 1\\n\"\n    \"kVf ka 1\\n\"\n    \"qvP qu 1\\n\"\n    \"kwO ka 1\\n\"\n    \"Jqt th 1\\n\"\n    \"zWx sz 
1\\n\"\n    \"sQk st 1\\n\"\n    \"hnV th 1\\n\"\n    \"rrD er 1\\n\"\n    \"jVh th 1\\n\"\n    \"vvY va 1\\n\"\n    \"bfI be 1\\n\"\n    \"fSz sz 1\\n\"\n    \"Czf sz 1\\n\"\n    \"kWl le 1\\n\"\n    \"jJc ch 1\\n\"\n    \"Gwj ij 1\\n\"\n    \"lFh th 1\\n\"\n    \"Vpf fo 1\\n\"\n    \"fkV ka 1\\n\"\n    \"cYj ch 1\\n\"\n    \"mrW er 1\\n\"\n    \"hBb th 1\\n\"\n    \"hJx th 1\\n\"\n    \"wIq qu 1\\n\"\n    \"cdA ch 1\\n\"\n    \"wQy wa 1\\n\"\n    \"wCq qu 1\\n\"\n    \"wqZ qu 1\\n\"\n    \"jfX ij 1\\n\"\n    \"jtG th 1\\n\"\n    \"xkJ ka 1\\n\"\n    \"Qzf sz 1\\n\"\n    \"gKs ng 1\\n\"\n    \"Qzo on 1\\n\"\n    \"bwI wa 1\\n\"\n    \"Tsb st 1\\n\"\n    \"vvX va 1\\n\"\n    \"jlR le 1\\n\"\n    \"qlQ qu 1\\n\"\n    \"dbX de 1\\n\"\n    \"Hfc ch 1\\n\"\n    \"Bsj st 1\\n\"\n    \"Yqk qu 1\\n\"\n    \"Xnc ch 1\\n\"\n    \"bzZ sz 1\\n\"\n    \"dGt th 1\\n\"\n    \"Xgg ng 1\\n\"\n    \"jwE wa 1\\n\"\n    \"Oyc ch 1\\n\"\n    \"pQd de 1\\n\"\n    \"jRy ij 1\\n\"\n    \"pmX me 1\\n\"\n    \"lZx le 1\\n\"\n    \"gFq qu 1\\n\"\n    \"mJd de 1\\n\"\n    \"sKq qu 1\\n\"\n    \"Ikj ij 1\\n\"\n    \"zkG sz 1\\n\"\n    \"wGf wa 1\\n\"\n    \"qRp qu 1\\n\"\n    \"xDn an 1\\n\"\n    \"gvL ng 1\\n\"\n    \"mGx me 1\\n\"\n    \"iIj in 1\\n\"\n    \"Gzd sz 1\\n\"\n    \"bLx be 1\\n\"\n    \"jUl le 1\\n\"\n    \"Qvc ch 1\\n\"\n    \"mVh th 1\\n\"\n    \"uhF th 1\\n\"\n    \"fVk ka 1\\n\"\n    \"cnM ch 1\\n\"\n    \"uFh th 1\\n\"\n    \"mXf me 1\\n\"\n    \"rCb er 1\\n\"\n    \"nLw an 1\\n\"\n    \"vfH fo 1\\n\"\n    \"iqV qu 1\\n\"\n    \"qhD th 1\\n\"\n    \"sHx st 1\\n\"\n    \"Ywy wa 1\\n\"\n    \"mDx me 1\\n\"\n    \"cBt th 1\\n\"\n    \"Bmq qu 1\\n\"\n    \"xRc ch 1\\n\"\n    \"bSz sz 1\\n\"\n    \"vCj ij 1\\n\"\n    \"Tcv ch 1\\n\"\n    \"aZq an 1\\n\"\n    \"Jcx ch 1\\n\"\n    \"nbF an 1\\n\"\n    \"Qzb sz 1\\n\"\n    \"vkQ ka 1\\n\"\n    \"hzD th 1\\n\"\n    \"xHp pr 1\\n\"\n    \"hqX th 1\\n\"\n    \"fEv va 1\\n\"\n    \"yjF ij 1\\n\"\n    \"Pjk ij 1\\n\"\n    \"sfU st 
1\\n\"\n    \"bGc ch 1\\n\"\n    \"mcX ch 1\\n\"\n    \"pXc ch 1\\n\"\n    \"yvS va 1\\n\"\n    \"pMl le 1\\n\"\n    \"wJs st 1\\n\"\n    \"Vwq qu 1\\n\"\n    \"yCw wa 1\\n\"\n    \"qds qu 1\\n\"\n    \"rRj er 1\\n\"\n    \"Qhv th 1\\n\"\n    \"ucG ch 1\\n\"\n    \"oEh th 1\\n\"\n    \"wQi in 1\\n\"\n    \"lSg ng 1\\n\"\n    \"Lqt th 1\\n\"\n    \"nlH an 1\\n\"\n    \"uqG qu 1\\n\"\n    \"Oao an 1\\n\"\n    \"hlX th 1\\n\"\n    \"fPw wa 1\\n\"\n    \"tIb th 1\\n\"\n    \"zIq qu 1\\n\"\n    \"qmG qu 1\\n\"\n    \"xJm me 1\\n\"\n    \"Vgw ng 1\\n\"\n    \"Ukx ka 1\\n\"\n    \"ztH th 1\\n\"\n    \"lhP th 1\\n\"\n    \"Jtk th 1\\n\"\n    \"Hzd sz 1\\n\"\n    \"yxQ ny 1\\n\"\n    \"nrP an 1\\n\"\n    \"fHh th 1\\n\"\n    \"Yct th 1\\n\"\n    \"Gqa an 1\\n\"\n    \"Fgy ng 1\\n\"\n    \"oBn an 1\\n\"\n    \"vuC qu 1\\n\"\n    \"Bnz an 1\\n\"\n    \"vPu qu 1\\n\"\n    \"xFf fo 1\\n\"\n    \"jdJ de 1\\n\"\n    \"fGf fo 1\\n\"\n    \"Yjq qu 1\\n\"\n    \"Qjp ij 1\\n\"\n    \"xTj ij 1\\n\"\n    \"vOq qu 1\\n\"\n    \"vLw va 1\\n\"\n    \"sMf st 1\\n\"\n    \"oVl on 1\\n\"\n    \"cwN ch 1\\n\"\n    \"sgR ng 1\\n\"\n    \"jjQ ij 1\\n\"\n    \"wzR sz 1\\n\"\n    \"zhY th 1\\n\"\n    \"vbR va 1\\n\"\n    \"wgW ng 1\\n\"\n    \"qwX qu 1\\n\"\n    \"Nxw wa 1\\n\"\n    \"eQo er 1\\n\"\n    \"mQp me 1\\n\"\n    \"Kqh th 1\\n\"\n    \"tvA th 1\\n\"\n    \"dlJ le 1\\n\"\n    \"yVx ny 1\\n\"\n    \"sPf st 1\\n\"\n    \"dQz sz 1\\n\"\n    \"sZb st 1\\n\"\n    \"zhS th 1\\n\"\n    \"kWb ka 1\\n\"\n    \"mqP qu 1\\n\"\n    \"Ffk ka 1\\n\"\n    \"xql qu 1\\n\"\n    \"gqH qu 1\\n\"\n    \"Tly le 1\\n\"\n    \"kpL ka 1\\n\"\n    \"qEg qu 1\\n\"\n    \"bMg ng 1\\n\"\n    \"xRj ij 1\\n\"\n    \"xsC st 1\\n\"\n    \"jlS le 1\\n\"\n    \"lzM le 1\\n\"\n    \"Pfb be 1\\n\"\n    \"uJv qu 1\\n\"\n    \"yVf ny 1\\n\"\n    \"Zgq qu 1\\n\"\n    \"xbS be 1\\n\"\n    \"oFh th 1\\n\"\n    \"xvb va 1\\n\"\n    \"hcU th 1\\n\"\n    \"wwU wa 1\\n\"\n    \"yCg ng 1\\n\"\n    \"mPz sz 1\\n\"\n    \"sJd st 
1\\n\"\n    \"bmN me 1\\n\"\n    \"uVc ch 1\\n\"\n    \"qdS qu 1\\n\"\n    \"Vwp pr 1\\n\"\n    \"Vml le 1\\n\"\n    \"Hqy qu 1\\n\"\n    \"Lfz sz 1\\n\"\n    \"Ayj ij 1\\n\"\n    \"yxK ny 1\\n\"\n    \"Hwv va 1\\n\"\n    \"gIp ng 1\\n\"\n    \"Zgt th 1\\n\"\n    \"Xtw th 1\\n\"\n    \"hLf th 1\\n\"\n    \"Nkd de 1\\n\"\n    \"jMs st 1\\n\"\n    \"xFt th 1\\n\"\n    \"xBw wa 1\\n\"\n    \"wHd de 1\\n\"\n    \"Qzz sz 1\\n\"\n    \"gYt th 1\\n\"\n    \"Pvk ka 1\\n\"\n    \"pvY va 1\\n\"\n    \"Jxt th 1\\n\"\n    \"ugQ qu 1\\n\"\n    \"Lqq qu 1\\n\"\n    \"xlL le 1\\n\"\n    \"wMb wa 1\\n\"\n    \"Sbz sz 1\\n\"\n    \"vEv va 1\\n\"\n    \"qfz qu 1\\n\"\n    \"gcS ch 1\\n\"\n    \"tCq th 1\\n\"\n    \"yHp pr 1\\n\"\n    \"zkF sz 1\\n\"\n    \"xuK qu 1\\n\"\n    \"Tbf be 1\\n\"\n    \"Ipg ng 1\\n\"\n    \"Yzk sz 1\\n\"\n    \"Qwz sz 1\\n\"\n    \"pFj ij 1\\n\"\n    \"jPm ij 1\\n\"\n    \"Dpq qu 1\\n\"\n    \"pJz sz 1\\n\"\n    \"wpN pr 1\\n\"\n    \"wzE sz 1\\n\"\n    \"gqD qu 1\\n\"\n    \"Xwm me 1\\n\"\n    \"oQx on 1\\n\"\n    \"lCp le 1\\n\"\n    \"Mhk th 1\\n\"\n    \"dTq qu 1\\n\"\n    \"xUw wa 1\\n\"\n    \"hgE th 1\\n\"\n    \"gcB ch 1\\n\"\n    \"hpJ th 1\\n\"\n    \"mqK qu 1\\n\"\n    \"gBn an 1\\n\"\n    \"hIv th 1\\n\"\n    \"lqD qu 1\\n\"\n    \"wPx wa 1\\n\"\n    \"sMt th 1\\n\"\n    \"yXw wa 1\\n\"\n    \"jKq qu 1\\n\"\n    \"Lrz er 1\\n\"\n    \"Hwj ij 1\\n\"\n    \"yfW ny 1\\n\"\n    \"Yyu qu 1\\n\"\n    \"qYs qu 1\\n\"\n    \"yvR va 1\\n\"\n    \"sRz st 1\\n\"\n    \"Kyx ny 1\\n\"\n    \"nxR an 1\\n\"\n    \"cdJ ch 1\\n\"\n    \"Nwc ch 1\\n\"\n    \"tbE th 1\\n\"\n    \"oeZ er 1\\n\"\n    \"bcQ ch 1\\n\"\n    \"Swb wa 1\\n\"\n    \"Ikq qu 1\\n\"\n    \"Bvz sz 1\\n\"\n    \"zhF th 1\\n\"\n    \"Xqy qu 1\\n\"\n    \"kKb ka 1\\n\"\n    \"Wdk de 1\\n\"\n    \"wpP pr 1\\n\"\n    \"kQy ka 1\\n\"\n    \"Bqe qu 1\\n\"\n    \"qfZ qu 1\\n\"\n    \"pPw pr 1\\n\"\n    \"Aoh th 1\\n\"\n    \"plJ le 1\\n\"\n    \"Ynv an 1\\n\"\n    \"jMh th 1\\n\"\n    \"bQg ng 
1\\n\"\n    \"afM an 1\\n\"\n    \"jvO ij 1\\n\"\n    \"eHf er 1\\n\"\n    \"hQg th 1\\n\"\n    \"kqY qu 1\\n\"\n    \"zJq qu 1\\n\"\n    \"pYh th 1\\n\"\n    \"qeM qu 1\\n\"\n    \"Kpk ka 1\\n\"\n    \"kfW ka 1\\n\"\n    \"Wds st 1\\n\"\n    \"bNc ch 1\\n\"\n    \"vBx va 1\\n\"\n    \"suJ qu 1\\n\"\n    \"qEx qu 1\\n\"\n    \"rfZ er 1\\n\"\n    \"oHg ng 1\\n\"\n    \"eFw er 1\\n\"\n    \"fPp pr 1\\n\"\n    \"kDb ka 1\\n\"\n    \"tZn th 1\\n\"\n    \"dcK ch 1\\n\"\n    \"yWv va 1\\n\"\n    \"Uxv va 1\\n\"\n    \"yQe er 1\\n\"\n    \"Zjq qu 1\\n\"\n    \"Wjv ij 1\\n\"\n    \"ygO ng 1\\n\"\n    \"ojQ on 1\\n\"\n    \"Kwc ch 1\\n\"\n    \"pFg ng 1\\n\"\n    \"sMd st 1\\n\"\n    \"Mfq qu 1\\n\"\n    \"Mzy sz 1\\n\"\n    \"Nwp pr 1\\n\"\n    \"ywT wa 1\\n\"\n    \"wLq qu 1\\n\"\n    \"Hqm qu 1\\n\"\n    \"qsC qu 1\\n\"\n    \"bNn an 1\\n\"\n    \"bUv va 1\\n\"\n    \"nRc ch 1\\n\"\n    \"Rlk le 1\\n\"\n    \"Bqp qu 1\\n\"\n    \"cfI ch 1\\n\"\n    \"mVq qu 1\\n\"\n    \"qGj qu 1\\n\"\n    \"vlX le 1\\n\"\n    \"kfG ka 1\\n\"\n    \"wVd de 1\\n\"\n    \"cdE ch 1\\n\"\n    \"hzE th 1\\n\"\n    \"Dhv th 1\\n\"\n    \"bzj sz 1\\n\"\n    \"vvL va 1\\n\"\n    \"bzQ sz 1\\n\"\n    \"wVb wa 1\\n\"\n    \"Zxl le 1\\n\"\n    \"zLw sz 1\\n\"\n    \"hTq th 1\\n\"\n    \"Vqp qu 1\\n\"\n    \"hmW th 1\\n\"\n    \"flD le 1\\n\"\n    \"Kcd ch 1\\n\"\n    \"pDq qu 1\\n\"\n    \"kvY ka 1\\n\"\n    \"cQl ch 1\\n\"\n    \"Ixk ka 1\\n\"\n    \"sGf st 1\\n\"\n    \"gFh th 1\\n\"\n    \"Rkd de 1\\n\"\n    \"qHl qu 1\\n\"\n    \"rCg ng 1\\n\"\n    \"qBn an 1\\n\"\n    \"sJw st 1\\n\"\n    \"cWj ch 1\\n\"\n    \"zXp sz 1\\n\"\n    \"Hhl th 1\\n\"\n    \"hjP th 1\\n\"\n    \"qlZ qu 1\\n\"\n    \"Hxr er 1\\n\"\n    \"zrE er 1\\n\"\n    \"gkH ng 1\\n\"\n    \"uHk qu 1\\n\"\n    \"Gzm sz 1\\n\"\n    \"cBc ch 1\\n\"\n    \"zff sz 1\\n\"\n    \"zLs st 1\\n\"\n    \"Uqy qu 1\\n\"\n    \"vkD ka 1\\n\"\n    \"fqX qu 1\\n\"\n    \"hLj th 1\\n\"\n    \"fYu qu 1\\n\"\n    \"jKw ij 1\\n\"\n    \"jIb ij 
1\\n\"\n    \"nrU an 1\\n\"\n    \"fFp pr 1\\n\"\n    \"sbC st 1\\n\"\n    \"mGv va 1\\n\"\n    \"fXp pr 1\\n\"\n    \"Pkv ka 1\\n\"\n    \"Cqe qu 1\\n\"\n    \"cCx ch 1\\n\"\n    \"rNq qu 1\\n\"\n    \"Zwf wa 1\\n\"\n    \"Jgc ch 1\\n\"\n    \"xlQ le 1\\n\"\n    \"gBz ng 1\\n\"\n    \"cIx ch 1\\n\"\n    \"odQ on 1\\n\"\n    \"Qnz an 1\\n\"\n    \"Uzx sz 1\\n\"\n    \"Jpt th 1\\n\"\n    \"gxX ng 1\\n\"\n    \"Zkd de 1\\n\"\n    \"Xkk ka 1\\n\"\n    \"hRv th 1\\n\"\n    \"ycV ch 1\\n\"\n    \"zMm sz 1\\n\"\n    \"eBq qu 1\\n\"\n    \"gHd ng 1\\n\"\n    \"bxU be 1\\n\"\n    \"xdK de 1\\n\"\n    \"mQc ch 1\\n\"\n    \"tYj th 1\\n\"\n    \"hlF th 1\\n\"\n    \"cRz ch 1\\n\"\n    \"lGz le 1\\n\"\n    \"zFz ze 1\\n\"\n    \"qOp qu 1\\n\"\n    \"Ggc ch 1\\n\"\n    \"oGm on 1\\n\"\n    \"Xnp an 1\\n\"\n    \"wYg ng 1\\n\"\n    \"wuJ qu 1\\n\"\n    \"sNs st 1\\n\"\n    \"zqU qu 1\\n\"\n    \"kCp ka 1\\n\"\n    \"Whw th 1\\n\"\n    \"nQx an 1\\n\"\n    \"vwA va 1\\n\"\n    \"Vcg ch 1\\n\"\n    \"kWj ij 1\\n\"\n    \"Hqd qu 1\\n\"\n    \"Cpy pr 1\\n\"\n    \"zcL ch 1\\n\"\n    \"cfF ch 1\\n\"\n    \"kXn an 1\\n\"\n    \"aXj an 1\\n\"\n    \"Swk ka 1\\n\"\n    \"fhq th 1\\n\"\n    \"Vxi in 1\\n\"\n    \"Gqu un 1\\n\"\n    \"Uxd de 1\\n\"\n    \"zdK sz 1\\n\"\n    \"hZq th 1\\n\"\n    \"mwJ me 1\\n\"\n    \"cvD ch 1\\n\"\n    \"lbZ le 1\\n\"\n    \"Pzl le 1\\n\"\n    \"hdO th 1\\n\"\n    \"hJn th 1\\n\"\n    \"qWp qu 1\\n\"\n    \"dXy de 1\\n\"\n    \"fuU qu 1\\n\"\n    \"fXy ny 1\\n\"\n    \"xnL an 1\\n\"\n    \"gMf ng 1\\n\"\n    \"rNf er 1\\n\"\n    \"xQh th 1\\n\"\n    \"kqH qu 1\\n\"\n    \"rFz er 1\\n\"\n    \"vpT va 1\\n\"\n    \"Nwy wa 1\\n\"\n    \"yqA qu 1\\n\"\n    \"vhO th 1\\n\"\n    \"kVh th 1\\n\"\n    \"nYb an 1\\n\"\n    \"jvN ij 1\\n\"\n    \"bIf be 1\\n\"\n    \"qqS qu 1\\n\"\n    \"jbF ij 1\\n\"\n    \"gMk ng 1\\n\"\n    \"bTd de 1\\n\"\n    \"Rhd th 1\\n\"\n    \"tWq th 1\\n\"\n    \"gLz ng 1\\n\"\n    \"fsD st 1\\n\"\n    \"uMt th 1\\n\"\n    \"yHq qu 
1\\n\"\n    \"Xgj ng 1\\n\"\n    \"Lmm me 1\\n\"\n    \"vkU ka 1\\n\"\n    \"lAx le 1\\n\"\n    \"Kzd sz 1\\n\"\n    \"hKm th 1\\n\"\n    \"kQd de 1\\n\"\n    \"gFc ch 1\\n\"\n    \"wyX wa 1\\n\"\n    \"zfU sz 1\\n\"\n    \"xpU pr 1\\n\"\n    \"ywJ wa 1\\n\"\n    \"Ayq qu 1\\n\"\n    \"gIu qu 1\\n\"\n    \"zuQ qu 1\\n\"\n    \"Vfn an 1\\n\"\n    \"vBn an 1\\n\"\n    \"Hty th 1\\n\"\n    \"gRv ng 1\\n\"\n    \"pTb pr 1\\n\"\n    \"Uqx qu 1\\n\"\n    \"vTn an 1\\n\"\n    \"vJc ch 1\\n\"\n    \"Uiw in 1\\n\"\n    \"Jlp le 1\\n\"\n    \"zPq qu 1\\n\"\n    \"rCx er 1\\n\"\n    \"lqS qu 1\\n\"\n    \"zlZ le 1\\n\"\n    \"zOw sz 1\\n\"\n    \"klK le 1\\n\"\n    \"kfQ ka 1\\n\"\n    \"uJx qu 1\\n\"\n    \"pkP ka 1\\n\"\n    \"Gqz qu 1\\n\"\n    \"Jlc ch 1\\n\"\n    \"yyD ny 1\\n\"\n    \"jhX th 1\\n\"\n    \"crV ch 1\\n\"\n    \"Dww wa 1\\n\"\n    \"yjw ij 1\\n\"\n    \"qpX qu 1\\n\"\n    \"Qmd de 1\\n\"\n    \"yWz sz 1\\n\"\n    \"wPd de 1\\n\"\n    \"Uqk qu 1\\n\"\n    \"nbR an 1\\n\"\n    \"Ydc ch 1\\n\"\n    \"qQl qu 1\\n\"\n    \"pmD me 1\\n\"\n    \"Jkj ka 1\\n\"\n    \"jTk ka 1\\n\"\n    \"wYf wa 1\\n\"\n    \"Zzx sz 1\\n\"\n    \"rkQ er 1\\n\"\n    \"bDp pr 1\\n\"\n    \"qSs qu 1\\n\"\n    \"gXr ng 1\\n\"\n    \"cZb ch 1\\n\"\n    \"Ngp ng 1\\n\"\n    \"hqQ th 1\\n\"\n    \"Wvw va 1\\n\"\n    \"Wbw wa 1\\n\"\n    \"wvK va 1\\n\"\n    \"cJf ch 1\\n\"\n    \"Mwd de 1\\n\"\n    \"ddJ de 1\\n\"\n    \"iwE in 1\\n\"\n    \"bxX be 1\\n\"\n    \"jxT ij 1\\n\"\n    \"Ycn ch 1\\n\"\n    \"wMf wa 1\\n\"\n    \"bqD qu 1\\n\"\n    \"yqI qu 1\\n\"\n    \"dRj de 1\\n\"\n    \"wYy wa 1\\n\"\n    \"Txz sz 1\\n\"\n    \"vrN er 1\\n\"\n    \"qVu un 1\\n\"\n    \"mRj ij 1\\n\"\n    \"Fjx ij 1\\n\"\n    \"fyQ ny 1\\n\"\n    \"xeI er 1\\n\"\n    \"Wqf qu 1\\n\"\n    \"Jly le 1\\n\"\n    \"jDb ij 1\\n\"\n    \"Yzu qu 1\\n\"\n    \"Bxm me 1\\n\"\n    \"wLj ij 1\\n\"\n    \"bqc ch 1\\n\"\n    \"sgK ng 1\\n\"\n    \"kqW qu 1\\n\"\n    \"Zsn an 1\\n\"\n    \"Fqq qu 1\\n\"\n    \"rXz er 
1\\n\"\n    \"lJq qu 1\\n\"\n    \"jEh th 1\\n\"\n    \"nCb an 1\\n\"\n    \"Xrd er 1\\n\"\n    \"Rzh th 1\\n\"\n    \"gfW ng 1\\n\"\n    \"Xtl th 1\\n\"\n    \"mTx me 1\\n\"\n    \"ufA qu 1\\n\"\n    \"wjQ ij 1\\n\"\n    \"xlW le 1\\n\"\n    \"dqH qu 1\\n\"\n    \"xhM th 1\\n\"\n    \"Xwt th 1\\n\"\n    \"dnW an 1\\n\"\n    \"Rfz sz 1\\n\"\n    \"fKp pr 1\\n\"\n    \"kFw ka 1\\n\"\n    \"Quv qu 1\\n\"\n    \"mXw me 1\\n\"\n    \"Vkw ka 1\\n\"\n    \"tFh ch 1\\n\"\n    \"hIu th 1\\n\"\n    \"lTf le 1\\n\"\n    \"Mwv va 1\\n\"\n    \"wvT va 1\\n\"\n    \"kKp ka 1\\n\"\n    \"tRv th 1\\n\"\n    \"wXo on 1\\n\"\n    \"vzL sz 1\\n\"\n    \"Jcf ch 1\\n\"\n    \"Tbq qu 1\\n\"\n    \"jdQ de 1\\n\"\n    \"Rbx be 1\\n\"\n    \"Jrm er 1\\n\"\n    \"sRj st 1\\n\"\n    \"zWz sz 1\\n\"\n    \"qnE an 1\\n\"\n    \"Kcf ch 1\\n\"\n    \"Qqm qu 1\\n\"\n    \"fpI pr 1\\n\"\n    \"iNw in 1\\n\"\n    \"ujE qu 1\\n\"\n    \"qHv qu 1\\n\"\n    \"Jvx va 1\\n\"\n    \"hHc th 1\\n\"\n    \"fvJ va 1\\n\"\n    \"nqY an 1\\n\"\n    \"wpE wa 1\\n\"\n    \"Hws st 1\\n\"\n    \"xzI sz 1\\n\"\n    \"Cgg ng 1\\n\"\n    \"cWd ch 1\\n\"\n    \"quV un 1\\n\"\n    \"bjN ij 1\\n\"\n    \"xQp pr 1\\n\"\n    \"bxE be 1\\n\"\n    \"uVk qu 1\\n\"\n    \"Wrl er 1\\n\"\n    \"Lrx er 1\\n\"\n    \"Iwl le 1\\n\"\n    \"aqB an 1\\n\"\n    \"Vcp ch 1\\n\"\n    \"Wwt th 1\\n\"\n    \"aGx an 1\\n\"\n    \"fPn an 1\\n\"\n    \"mFq qu 1\\n\"\n    \"qgd qu 1\\n\"\n    \"Zsd st 1\\n\"\n    \"Vxs sz 1\\n\"\n    \"Khq th 1\\n\"\n    \"wSs st 1\\n\"\n    \"oGq qu 1\\n\"\n    \"Yzv sz 1\\n\"\n    \"dqX qu 1\\n\"\n    \"mpQ me 1\\n\"\n    \"Kcp ch 1\\n\"\n    \"swD st 1\\n\"\n    \"rZg ng 1\\n\"\n    \"jYm ij 1\\n\"\n    \"uJl qu 1\\n\"\n    \"vWv va 1\\n\"\n    \"svO st 1\\n\"\n    \"pFd de 1\\n\"\n    \"Yjx ij 1\\n\"\n    \"tpI th 1\\n\"\n    \"dVt th 1\\n\"\n    \"sNm st 1\\n\"\n    \"lKt th 1\\n\"\n    \"nvU an 1\\n\"\n    \"Hxf fo 1\\n\"\n    \"puW qu 1\\n\"\n    \"wJg ng 1\\n\"\n    \"gxR ng 1\\n\"\n    \"fAg ng 
1\\n\"\n    \"Yqe qu 1\\n\"\n    \"Pwz sz 1\\n\"\n    \"hmC th 1\\n\"\n    \"ylJ le 1\\n\"\n    \"mqT qu 1\\n\"\n    \"cCf ch 1\\n\"\n    \"pZg ng 1\\n\"\n    \"aFx an 1\\n\"\n    \"oYq qu 1\\n\"\n    \"fPj ij 1\\n\"\n    \"dJt th 1\\n\"\n    \"xwn an 1\\n\"\n    \"Ccb ch 1\\n\"\n    \"wFn an 1\\n\"\n    \"wrY er 1\\n\"\n    \"Cdh th 1\\n\"\n    \"hLc th 1\\n\"\n    \"Zxg ng 1\\n\"\n    \"Mxc ch 1\\n\"\n    \"hcY th 1\\n\"\n    \"zVw sz 1\\n\"\n    \"hkV th 1\\n\"\n    \"txE th 1\\n\"\n    \"yvT va 1\\n\"\n    \"Mlw le 1\\n\"\n    \"ztF th 1\\n\"\n    \"fGd de 1\\n\"\n    \"zjE sz 1\\n\"\n    \"gjM ng 1\\n\"\n    \"jwP ij 1\\n\"\n    \"Kxt th 1\\n\"\n    \"yFg ng 1\\n\"\n    \"Wcg ch 1\\n\"\n    \"thZ ch 1\\n\"\n    \"hzQ th 1\\n\"\n    \"Jtg th 1\\n\"\n    \"yvK va 1\\n\"\n    \"zVz sz 1\\n\"\n    \"Pwb wa 1\\n\"\n    \"xqD qu 1\\n\"\n    \"uyQ qu 1\\n\"\n    \"gCm ng 1\\n\"\n    \"zjU sz 1\\n\"\n    \"xGq qu 1\\n\"\n    \"Mqy qu 1\\n\"\n    \"Ocx ch 1\\n\"\n    \"sqM qu 1\\n\"\n    \"lRb le 1\\n\"\n    \"tfU th 1\\n\"\n    \"vZg ng 1\\n\"\n    \"fZc ch 1\\n\"\n    \"gpZ ng 1\\n\"\n    \"Fpf pr 1\\n\"\n    \"qtQ th 1\\n\"\n    \"mhZ th 1\\n\"\n    \"bqF qu 1\\n\"\n    \"fgG ng 1\\n\"\n    \"woT on 1\\n\"\n    \"zSb sz 1\\n\"\n    \"wxS wa 1\\n\"\n    \"Wrf er 1\\n\"\n    \"Oqk qu 1\\n\"\n    \"xLc ch 1\\n\"\n    \"Qzj sz 1\\n\"\n    \"wXk ka 1\\n\"\n    \"tdX th 1\\n\"\n    \"Jqc ch 1\\n\"\n    \"fXk ka 1\\n\"\n    \"kBd de 1\\n\"\n    \"iqW qu 1\\n\"\n    \"Ocb ch 1\\n\"\n    \"fUo on 1\\n\"\n    \"jXk ij 1\\n\"\n    \"hbI th 1\\n\"\n    \"Zcg ch 1\\n\"\n    \"zwS wa 1\\n\"\n    \"cVm ch 1\\n\"\n    \"vwj ij 1\\n\"\n    \"gwG ng 1\\n\"\n    \"zsM st 1\\n\"\n    \"Pqo qu 1\\n\"\n    \"hPj th 1\\n\"\n    \"fwG wa 1\\n\"\n    \"Xwh th 1\\n\"\n    \"Wwh th 1\\n\"\n    \"Vqw qu 1\\n\"\n    \"vmY va 1\\n\"\n    \"uvF qu 1\\n\"\n    \"tfK th 1\\n\"\n    \"Xbg ng 1\\n\"\n    \"Nfn an 1\\n\"\n    \"wpH pr 1\\n\"\n    \"yJq qu 1\\n\"\n    \"wqO qu 1\\n\"\n    \"ncV ch 
1\\n\"\n    \"wgM ng 1\\n\"\n    \"fQk ka 1\\n\"\n    \"hvK th 1\\n\"\n    \"qLr qu 1\\n\"\n    \"Wce ch 1\\n\"\n    \"kFn an 1\\n\"\n    \"rBm er 1\\n\"\n    \"mdV de 1\\n\"\n    \"jFc ch 1\\n\"\n    \"knX an 1\\n\"\n    \"nMf an 1\\n\"\n    \"sCc ch 1\\n\"\n    \"pCq qu 1\\n\"\n    \"uJt th 1\\n\"\n    \"Cfk ka 1\\n\"\n    \"Cxb be 1\\n\"\n    \"fOw wa 1\\n\"\n    \"aJz an 1\\n\"\n    \"gLt th 1\\n\"\n    \"bmX me 1\\n\"\n    \"Yfo on 1\\n\"\n    \"dJf de 1\\n\"\n    \"Eay an 1\\n\"\n    \"qSd qu 1\\n\"\n    \"mjQ ij 1\\n\"\n    \"pNk ka 1\\n\"\n    \"Nvh th 1\\n\"\n    \"xkX ka 1\\n\"\n    \"Jwx wa 1\\n\"\n    \"jvL ij 1\\n\"\n    \"fpH pr 1\\n\"\n    \"pxO pr 1\\n\"\n    \"vPx va 1\\n\"\n    \"dWu qu 1\\n\"\n    \"hbR th 1\\n\"\n    \"woE on 1\\n\"\n    \"gtX th 1\\n\"\n    \"bfF be 1\\n\"\n    \"mvW va 1\\n\"\n    \"xsM st 1\\n\"\n    \"wLv va 1\\n\"\n    \"wHh th 1\\n\"\n    \"sCn an 1\\n\"\n    \"pLw pr 1\\n\"\n    \"kXw ka 1\\n\"\n    \"xVl le 1\\n\"\n    \"hCc th 1\\n\"\n    \"oUk on 1\\n\"\n    \"zcF ch 1\\n\"\n    \"sMv st 1\\n\"\n    \"drZ er 1\\n\"\n    \"wfO wa 1\\n\"\n    \"yFv va 1\\n\"\n    \"hXa th 1\\n\"\n    \"qMu un 1\\n\"\n    \"fCv va 1\\n\"\n    \"fwC wa 1\\n\"\n    \"oTg ng 1\\n\"\n    \"Fkm ka 1\\n\"\n    \"eQt th 1\\n\"\n    \"Pxd de 1\\n\"\n    \"kjG ij 1\\n\"\n    \"tGs th 1\\n\"\n    \"dqB qu 1\\n\"\n    \"fmX me 1\\n\"\n    \"xYi in 1\\n\"\n    \"kIk ka 1\\n\"\n    \"vDd de 1\\n\"\n    \"kvC ka 1\\n\"\n    \"qtZ th 1\\n\"\n    \"fPc ch 1\\n\"\n    \"dpN de 1\\n\"\n    \"hNr th 1\\n\"\n    \"Znj an 1\\n\"\n    \"Hke er 1\\n\"\n    \"Iqp qu 1\\n\"\n    \"wfN wa 1\\n\"\n    \"Vhx th 1\\n\"\n    \"Dgk ng 1\\n\"\n    \"mkQ ka 1\\n\"\n    \"Wxd de 1\\n\"\n    \"Icx ch 1\\n\"\n    \"yYt th 1\\n\"\n    \"tqx th 1\\n\"\n    \"Zvf va 1\\n\"\n    \"sxU st 1\\n\"\n    \"Lqk qu 1\\n\"\n    \"nfI an 1\\n\"\n    \"jyq qu 1\\n\"\n    \"Wvn an 1\\n\"\n    \"Sdv de 1\\n\"\n    \"uYc ch 1\\n\"\n    \"Qgm ng 1\\n\"\n    \"cXa ch 1\\n\"\n    \"wBx wa 
1\\n\"\n    \"pYx pr 1\\n\"\n    \"jWl le 1\\n\"\n    \"Kfw wa 1\\n\"\n    \"qjJ qu 1\\n\"\n    \"Pjj ij 1\\n\"\n    \"ajX an 1\\n\"\n    \"sXd st 1\\n\"\n    \"xHg ng 1\\n\"\n    \"xhA th 1\\n\"\n    \"rGm er 1\\n\"\n    \"Qtm th 1\\n\"\n    \"srY er 1\\n\"\n    \"qPx qu 1\\n\"\n    \"wRz sz 1\\n\"\n    \"wOg wa 1\\n\"\n    \"fLg ng 1\\n\"\n    \"hQt th 1\\n\"\n    \"jhW th 1\\n\"\n    \"Cwk ka 1\\n\"\n    \"zWl le 1\\n\"\n    \"wJc ch 1\\n\"\n    \"Pxv va 1\\n\"\n    \"npI an 1\\n\"\n    \"lnW an 1\\n\"\n    \"kqy qu 1\\n\"\n    \"ywg ng 1\\n\"\n    \"sCd st 1\\n\"\n    \"qfF qu 1\\n\"\n    \"qpg qu 1\\n\"\n    \"Mbx be 1\\n\"\n    \"nwN an 1\\n\"\n    \"wLs st 1\\n\"\n    \"Wcv ch 1\\n\"\n    \"Vvr er 1\\n\"\n    \"Vkx ka 1\\n\"\n    \"dmU de 1\\n\"\n    \"fGs st 1\\n\"\n    \"gJz ng 1\\n\"\n    \"dFz sz 1\\n\"\n    \"qCf qu 1\\n\"\n    \"lvW le 1\\n\"\n    \"Svb va 1\\n\"\n    \"xJr er 1\\n\"\n    \"uZf qu 1\\n\"\n    \"Tjc ch 1\\n\"\n    \"pIj ij 1\\n\"\n    \"bVg ng 1\\n\"\n    \"vdO de 1\\n\"\n    \"lTq qu 1\\n\"\n    \"bMh th 1\\n\"\n    \"nDm an 1\\n\"\n    \"Tzb sz 1\\n\"\n    \"pCw pr 1\\n\"\n    \"Qkg ng 1\\n\"\n    \"fpY pr 1\\n\"\n    \"yQj ij 1\\n\"\n    \"qiC qu 1\\n\"\n    \"mQi in 1\\n\"\n    \"wUq qu 1\\n\"\n    \"kVj ij 1\\n\"\n    \"tjQ th 1\\n\"\n    \"mXj ij 1\\n\"\n    \"Xfd de 1\\n\"\n    \"cgI ch 1\\n\"\n    \"Pkj ij 1\\n\"\n    \"jjF ij 1\\n\"\n    \"jrJ er 1\\n\"\n    \"qwZ qu 1\\n\"\n    \"Rtz th 1\\n\"\n    \"fHb be 1\\n\"\n    \"Hgx ng 1\\n\"\n    \"Dzf sz 1\\n\"\n    \"cbE ch 1\\n\"\n    \"Xfs st 1\\n\"\n    \"Rjm ij 1\\n\"\n    \"fmY me 1\\n\"\n    \"wYj ij 1\\n\"\n    \"uFp qu 1\\n\"\n    \"vWm va 1\\n\"\n    \"yVc ch 1\\n\"\n    \"cgL ch 1\\n\"\n    \"zmR sz 1\\n\"\n    \"zfB sz 1\\n\"\n    \"znH an 1\\n\"\n    \"hgG th 1\\n\"\n    \"xuE qu 1\\n\"\n    \"Bsl le 1\\n\"\n    \"oWx on 1\\n\"\n    \"Pjl le 1\\n\"\n    \"Jdf de 1\\n\"\n    \"Xmp me 1\\n\"\n    \"sgO ng 1\\n\"\n    \"hCj th 1\\n\"\n    \"wtR th 1\\n\"\n    \"fDs st 
1\\n\"\n    \"bQb be 1\\n\"\n    \"quM un 1\\n\"\n    \"fLl le 1\\n\"\n    \"Nhp th 1\\n\"\n    \"znU an 1\\n\"\n    \"sdS st 1\\n\"\n    \"wWu qu 1\\n\"\n    \"tFq th 1\\n\"\n    \"cFq ch 1\\n\"\n    \"Wwl le 1\\n\"\n    \"Lqy qu 1\\n\"\n    \"nqQ an 1\\n\"\n    \"zmD sz 1\\n\"\n    \"Gyx ny 1\\n\"\n    \"bkR ka 1\\n\"\n    \"lQw le 1\\n\"\n    \"Pqm qu 1\\n\"\n    \"Fwk ka 1\\n\"\n    \"tHt th 1\\n\"\n    \"jyL ij 1\\n\"\n    \"qxA qu 1\\n\"\n    \"mrC er 1\\n\"\n    \"qzL qu 1\\n\"\n    \"jJg ng 1\\n\"\n    \"jfS ij 1\\n\"\n    \"qMh th 1\\n\"\n    \"mlV le 1\\n\"\n    \"bkJ ka 1\\n\"\n    \"knH an 1\\n\"\n    \"Uqt th 1\\n\"\n    \"cuF ch 1\\n\"\n    \"iYq qu 1\\n\"\n    \"fUe er 1\\n\"\n    \"sBb st 1\\n\"\n    \"Nhx th 1\\n\"\n    \"rhP th 1\\n\"\n    \"dWp de 1\\n\"\n    \"Yvf va 1\\n\"\n    \"Rxr er 1\\n\"\n    \"kzG sz 1\\n\"\n    \"xuZ qu 1\\n\"\n    \"xvD va 1\\n\"\n    \"fwq qu 1\\n\"\n    \"hjJ th 1\\n\"\n    \"kZr er 1\\n\"\n    \"vJn an 1\\n\"\n    \"xnO an 1\\n\"\n    \"vcA ch 1\\n\"\n    \"mfK me 1\\n\"\n    \"vjS ij 1\\n\"\n    \"Nvp va 1\\n\"\n    \"dfB de 1\\n\"\n    \"Qsb st 1\\n\"\n    \"dXp pr 1\\n\"\n    \"zRl le 1\\n\"\n    \"Ejq qu 1\\n\"\n    \"aGz an 1\\n\"\n    \"nHg an 1\\n\"\n    \"bvA va 1\\n\"\n    \"Bfd de 1\\n\"\n    \"zVg ng 1\\n\"\n    \"zsY st 1\\n\"\n    \"hVz th 1\\n\"\n    \"Pjm ij 1\\n\"\n    \"sXi in 1\\n\"\n    \"iKj in 1\\n\"\n    \"qaE an 1\\n\"\n    \"Cfj ij 1\\n\"\n    \"zMc ch 1\\n\"\n    \"mgZ ng 1\\n\"\n    \"vgA ng 1\\n\"\n    \"iwJ in 1\\n\"\n    \"vGx va 1\\n\"\n    \"tfY th 1\\n\"\n    \"ljH le 1\\n\"\n    \"zGj sz 1\\n\"\n    \"bmK me 1\\n\"\n    \"nUq an 1\\n\"\n    \"zRt th 1\\n\"\n    \"tGj th 1\\n\"\n    \"zVd sz 1\\n\"\n    \"jSr er 1\\n\"\n    \"fNq qu 1\\n\"\n    \"xTg ng 1\\n\"\n    \"nqE an 1\\n\"\n    \"Wng an 1\\n\"\n    \"zVv sz 1\\n\"\n    \"gVs ng 1\\n\"\n    \"fNd de 1\\n\"\n    \"qNw qu 1\\n\"\n    \"Znc ch 1\\n\"\n    \"uJs qu 1\\n\"\n    \"yvJ va 1\\n\"\n    \"xlM le 1\\n\"\n    \"Jzc ch 
1\\n\"\n    \"vRh th 1\\n\"\n    \"fcK ch 1\\n\"\n    \"wVn an 1\\n\"\n    \"rWw er 1\\n\"\n    \"cHk ch 1\\n\"\n    \"vOx va 1\\n\"\n    \"iUa an 1\\n\"\n    \"nWn an 1\\n\"\n    \"zqZ qu 1\\n\"\n    \"xFj ij 1\\n\"\n    \"nCg an 1\\n\"\n    \"fYj ij 1\\n\"\n    \"Vsx st 1\\n\"\n    \"mtM th 1\\n\"\n    \"mhG th 1\\n\"\n    \"jtN th 1\\n\"\n    \"hcC th 1\\n\"\n    \"Nwk ka 1\\n\"\n    \"dXu qu 1\\n\"\n    \"mJq qu 1\\n\"\n    \"xsO st 1\\n\"\n    \"qRn an 1\\n\"\n    \"Rnj an 1\\n\"\n    \"kmP ka 1\\n\"\n    \"Xtg th 1\\n\"\n    \"Gvh th 1\\n\"\n    \"jqv qu 1\\n\"\n    \"cVl ch 1\\n\"\n    \"cdI ch 1\\n\"\n    \"zdE sz 1\\n\"\n    \"hZk th 1\\n\"\n    \"Bdx de 1\\n\"\n    \"hHn th 1\\n\"\n    \"hkG th 1\\n\"\n    \"vxJ va 1\\n\"\n    \"lrA er 1\\n\"\n    \"lrT er 1\\n\"\n    \"hjV th 1\\n\"\n    \"qbI qu 1\\n\"\n    \"mTg ng 1\\n\"\n    \"fmV me 1\\n\"\n    \"rDk er 1\\n\"\n    \"dNd de 1\\n\"\n    \"Gzj sz 1\\n\"\n    \"aVj an 1\\n\"\n    \"vNr er 1\\n\"\n    \"kXa an 1\\n\"\n    \"rGs er 1\\n\"\n    \"xaX an 1\\n\"\n    \"crG ch 1\\n\"\n    \"qJa an 1\\n\"\n    \"jDt th 1\\n\"\n    \"Mfx fo 1\\n\"\n    \"xEa an 1\\n\"\n    \"Qvz sz 1\\n\"\n    \"wRg ng 1\\n\"\n    \"pFc ch 1\\n\"\n    \"Cpv va 1\\n\"\n    \"rJk er 1\\n\"\n    \"fbQ be 1\\n\"\n    \"Xzg ng 1\\n\"\n    \"qFy qu 1\\n\"\n    \"Zfj ij 1\\n\"\n    \"twE th 1\\n\"\n    \"Oaq an 1\\n\"\n    \"ysY st 1\\n\"\n    \"wdZ de 1\\n\"\n    \"gmO ng 1\\n\"\n    \"wGn an 1\\n\"\n    \"wRk ka 1\\n\"\n    \"gqS qu 1\\n\"\n    \"Agq qu 1\\n\"\n    \"Twv va 1\\n\"\n    \"Qnv an 1\\n\"\n    \"bVv va 1\\n\"\n    \"cDw ch 1\\n\"\n    \"tGq th 1\\n\"\n    \"fbq qu 1\\n\"\n    \"Tvw va 1\\n\"\n    \"mNv va 1\\n\"\n    \"dtE th 1\\n\"\n    \"pzP sz 1\\n\"\n    \"Vsw sz 1\\n\"\n    \"qGq qu 1\\n\"\n    \"qPc ch 1\\n\"\n    \"qyC qu 1\\n\"\n    \"nxF an 1\\n\"\n    \"jDl le 1\\n\"\n    \"jHt th 1\\n\"\n    \"fxZ fo 1\\n\"\n    \"sQc ch 1\\n\"\n    \"nmH an 1\\n\"\n    \"xrD er 1\\n\"\n    \"hMh th 1\\n\"\n    \"vHk ka 
1\\n\"\n    \"hmS th 1\\n\"\n    \"Xdt th 1\\n\"\n    \"Xwl le 1\\n\"\n    \"uJr qu 1\\n\"\n    \"sPk st 1\\n\"\n    \"Xjp ij 1\\n\"\n    \"Uqi qu 1\\n\"\n    \"kgD ng 1\\n\"\n    \"jgI ng 1\\n\"\n    \"uFw qu 1\\n\"\n    \"xNd de 1\\n\"\n    \"dhI th 1\\n\"\n    \"Lxo on 1\\n\"\n    \"Sfq qu 1\\n\"\n    \"zRp sz 1\\n\"\n    \"xwK wa 1\\n\"\n    \"fmB me 1\\n\"\n    \"vrV er 1\\n\"\n    \"qSf qu 1\\n\"\n    \"jPn an 1\\n\"\n    \"Hbp pr 1\\n\"\n    \"bJt th 1\\n\"\n    \"lqQ qu 1\\n\"\n    \"xSd de 1\\n\"\n    \"dMk de 1\\n\"\n    \"vVz sz 1\\n\"\n    \"vkK ka 1\\n\"\n    \"Xds de 1\\n\"\n    \"ybB be 1\\n\"\n    \"gpE ng 1\\n\"\n    \"qcC ch 1\\n\"\n    \"pxL pr 1\\n\"\n    \"gPm ng 1\\n\"\n    \"Bpd de 1\\n\"\n    \"dpB de 1\\n\"\n    \"jlJ le 1\\n\"\n    \"pkC ka 1\\n\"\n    \"ypP pr 1\\n\"\n    \"Nqm qu 1\\n\"\n    \"tgZ th 1\\n\"\n    \"Eqo qu 1\\n\"\n    \"dRk de 1\\n\"\n    \"Ubc ch 1\\n\"\n    \"xhY th 1\\n\"\n    \"lJd le 1\\n\"\n    \"pvN va 1\\n\"\n    \"Qfc ch 1\\n\"\n    \"Dbw wa 1\\n\"\n    \"sFc ch 1\\n\"\n    \"wkX ka 1\\n\"\n    \"xpR pr 1\\n\"\n    \"pjJ ij 1\\n\"\n    \"gkQ ng 1\\n\"\n    \"rMf er 1\\n\"\n    \"Jsn an 1\\n\"\n    \"xOw wa 1\\n\"\n    \"Dqu un 1\\n\"\n    \"nbJ an 1\\n\"\n    \"gvF ng 1\\n\"\n    \"Fnp an 1\\n\"\n    \"jpV ij 1\\n\"\n    \"qtD th 1\\n\"\n    \"uEj qu 1\\n\"\n    \"yhY th 1\\n\"\n    \"Ohq th 1\\n\"\n    \"nXy an 1\\n\"\n    \"pdU de 1\\n\"\n    \"mDz sz 1\\n\"\n    \"iVk in 1\\n\"\n    \"Hqq qu 1\\n\"\n    \"xpZ po 1\\n\"\n    \"aeU an 1\\n\"\n    \"sjZ st 1\\n\"\n    \"sGp st 1\\n\"\n    \"Wqn an 1\\n\"\n    \"xqS qu 1\\n\"\n    \"Jjc ch 1\\n\"\n    \"qPp qu 1\\n\"\n    \"sXz st 1\\n\"\n    \"xvP va 1\\n\"\n    \"Wbq qu 1\\n\"\n    \"tjK th 1\\n\"\n    \"lhH th 1\\n\"\n    \"hqV th 1\\n\"\n    \"dYf de 1\\n\"\n    \"pFk ka 1\\n\"\n    \"sFq qu 1\\n\"\n    \"uHq qu 1\\n\"\n    \"vhA th 1\\n\"\n    \"jlE le 1\\n\"\n    \"sqB qu 1\\n\"\n    \"qnr an 1\\n\"\n    \"Fxq qu 1\\n\"\n    \"zHn an 1\\n\"\n    \"pdB de 
1\\n\"\n    \"wHc ch 1\\n\"\n    \"Pxj ij 1\\n\"\n    \"gHx ng 1\\n\"\n    \"nqJ an 1\\n\"\n    \"oqX qu 1\\n\"\n    \"Xby be 1\\n\"\n    \"tbI th 1\\n\"\n    \"kSf ka 1\\n\"\n    \"vhD th 1\\n\"\n    \"qHj qu 1\\n\"\n    \"Npx pr 1\\n\"\n    \"Qzp sz 1\\n\"\n    \"xiU in 1\\n\"\n    \"rjZ er 1\\n\"\n    \"wjU ij 1\\n\"\n    \"jtB th 1\\n\"\n    \"Ygq qu 1\\n\"\n    \"aQf an 1\\n\"\n    \"xWu qu 1\\n\"\n    \"aVf an 1\\n\"\n    \"pQx pr 1\\n\"\n    \"Lnw an 1\\n\"\n    \"qWa an 1\\n\"\n    \"uHp qu 1\\n\"\n    \"Lvp va 1\\n\"\n    \"Jxp pr 1\\n\"\n    \"zHk sz 1\\n\"\n    \"wvU va 1\\n\"\n    \"Wqh th 1\\n\"\n    \"hVs th 1\\n\"\n    \"Xgy ng 1\\n\"\n    \"dZj de 1\\n\"\n    \"uCq qu 1\\n\"\n    \"Gxl le 1\\n\"\n    \"Hlg ng 1\\n\"\n    \"Wqd qu 1\\n\"\n    \"Dxz sz 1\\n\"\n    \"hdN th 1\\n\"\n    \"pvM va 1\\n\"\n    \"Wxk ka 1\\n\"\n    \"qWd qu 1\\n\"\n    \"fiO in 1\\n\"\n    \"fDw wa 1\\n\"\n    \"bHj ij 1\\n\"\n    \"iVh th 1\\n\"\n    \"Pmg ng 1\\n\"\n    \"fXc ch 1\\n\"\n    \"xfL fo 1\\n\"\n    \"yGc ch 1\\n\"\n    \"yBn an 1\\n\"\n    \"hCk th 1\\n\"\n    \"Llk le 1\\n\"\n    \"yMh th 1\\n\"\n    \"qrY qu 1\\n\"\n    \"gdX ng 1\\n\"\n    \"qxG qu 1\\n\"\n    \"Zmt th 1\\n\"\n    \"Rzw sz 1\\n\"\n    \"nBd an 1\\n\"\n    \"mWl le 1\\n\"\n    \"xuI qu 1\\n\"\n    \"jyF ij 1\\n\"\n    \"bVu qu 1\\n\"\n    \"ygP ng 1\\n\"\n    \"dFq qu 1\\n\"\n    \"jFm ij 1\\n\"\n    \"Rml le 1\\n\"\n    \"klH le 1\\n\"\n    \"Vff fo 1\\n\"\n    \"Kzk sz 1\\n\"\n    \"Lhv th 1\\n\"\n    \"cSj ch 1\\n\"\n    \"Qrh th 1\\n\"\n    \"uBw qu 1\\n\"\n    \"sCk ka 1\\n\"\n    \"qyS qu 1\\n\"\n    \"cXu ch 1\\n\"\n    \"wfM wa 1\\n\"\n    \"kdK de 1\\n\"\n    \"cXj ch 1\\n\"\n    \"ctZ th 1\\n\"\n    \"fjI ij 1\\n\"\n    \"cgS ch 1\\n\"\n    \"mwL me 1\\n\"\n    \"kzU sz 1\\n\"\n    \"cZr ch 1\\n\"\n    \"fqU qu 1\\n\"\n    \"qJi qu 1\\n\"\n    \"gDd ng 1\\n\"\n    \"bKq qu 1\\n\"\n    \"aUw an 1\\n\"\n    \"sxE st 1\\n\"\n    \"mxU me 1\\n\"\n    \"cwY ch 1\\n\"\n    \"fpC pr 
1\\n\"\n    \"sRw st 1\\n\"\n    \"Kkq qu 1\\n\"\n    \"wxA wa 1\\n\"\n    \"gQf ng 1\\n\"\n    \"pPb pr 1\\n\"\n    \"Hwu ku 1\\n\"\n    \"suX qu 1\\n\"\n    \"lqY qu 1\\n\"\n    \"sxW st 1\\n\"\n    \"aFh th 1\\n\"\n    \"lWq qu 1\\n\"\n    \"pbZ pr 1\\n\"\n    \"bqm qu 1\\n\"\n    \"kJk ka 1\\n\"\n    \"qtT th 1\\n\"\n    \"zMd sz 1\\n\"\n    \"hGs th 1\\n\"\n    \"xlH le 1\\n\"\n    \"dmq qu 1\\n\"\n    \"Xrk er 1\\n\"\n    \"Ocf ch 1\\n\"\n    \"mKc ch 1\\n\"\n    \"zrA er 1\\n\"\n    \"gxE ng 1\\n\"\n    \"qWu un 1\\n\"\n    \"xQf fo 1\\n\"\n    \"Xoz on 1\\n\"\n    \"fmP me 1\\n\"\n    \"kdD de 1\\n\"\n    \"bBz sz 1\\n\"\n    \"wpA pr 1\\n\"\n    \"nMb an 1\\n\"\n    \"tHq th 1\\n\"\n    \"jMt th 1\\n\"\n    \"Svq qu 1\\n\"\n    \"jMl le 1\\n\"\n    \"wBc ch 1\\n\"\n    \"ymX me 1\\n\"\n    \"hcB th 1\\n\"\n    \"brU er 1\\n\"\n    \"paX an 1\\n\"\n    \"hdG th 1\\n\"\n    \"Fwp pr 1\\n\"\n    \"sbY st 1\\n\"\n    \"mhB th 1\\n\"\n    \"pfZ pr 1\\n\"\n    \"Vmh th 1\\n\"\n    \"sCq qu 1\\n\"\n    \"Zfw wa 1\\n\"\n    \"Ljm ij 1\\n\"\n    \"pqG qu 1\\n\"\n    \"dpK de 1\\n\"\n    \"tfG th 1\\n\"\n    \"ijR in 1\\n\"\n    \"iJy in 1\\n\"\n    \"qfN qu 1\\n\"\n    \"crS ch 1\\n\"\n    \"cgT ch 1\\n\"\n    \"wOt th 1\\n\"\n    \"fnE an 1\\n\"\n    \"hWp th 1\\n\"\n    \"Zpw pr 1\\n\"\n    \"wdO de 1\\n\"\n    \"vYy va 1\\n\"\n    \"qrI qu 1\\n\"\n    \"dmF de 1\\n\"\n    \"jhJ th 1\\n\"\n    \"wHr er 1\\n\"\n    \"Jzb sz 1\\n\"\n    \"fEy ny 1\\n\"\n    \"hhZ th 1\\n\"\n    \"wpQ pr 1\\n\"\n    \"qYg qu 1\\n\"\n    \"qtY th 1\\n\"\n    \"Kdx de 1\\n\"\n    \"qfj qu 1\\n\"\n    \"Rbv va 1\\n\"\n    \"bbO be 1\\n\"\n    \"Xcn ch 1\\n\"\n    \"kCd de 1\\n\"\n    \"Gcx ch 1\\n\"\n    \"zmC sz 1\\n\"\n    \"wJl le 1\\n\"\n    \"qDc ch 1\\n\"\n    \"Jzr er 1\\n\"\n    \"Yrw er 1\\n\"\n    \"Ksx st 1\\n\"\n    \"uKx qu 1\\n\"\n    \"jSc ch 1\\n\"\n    \"Ljz sz 1\\n\"\n    \"xdB de 1\\n\"\n    \"zWb sz 1\\n\"\n    \"vwY va 1\\n\"\n    \"vMd de 1\\n\"\n    \"dbH de 
1\\n\"\n    \"Qsu qu 1\\n\"\n    \"wHq qu 1\\n\"\n    \"gJh th 1\\n\"\n    \"wZp pr 1\\n\"\n    \"btO th 1\\n\"\n    \"Xmv va 1\\n\"\n    \"qpd qu 1\\n\"\n    \"Jnw an 1\\n\"\n    \"vlD le 1\\n\"\n    \"xcX ch 1\\n\"\n    \"Yvv va 1\\n\"\n    \"Zft th 1\\n\"\n    \"Hqz qu 1\\n\"\n    \"xqM qu 1\\n\"\n    \"Hth ch 1\\n\"\n    \"ztL th 1\\n\"\n    \"iOj in 1\\n\"\n    \"cIz ch 1\\n\"\n    \"hhC th 1\\n\"\n    \"tvX th 1\\n\"\n    \"Fgk ng 1\\n\"\n    \"mjC ij 1\\n\"\n    \"Ojp ij 1\\n\"\n    \"kvI ka 1\\n\"\n    \"zqb qu 1\\n\"\n    \"qqW qu 1\\n\"\n    \"iHg ng 1\\n\"\n    \"jxJ ij 1\\n\"\n    \"Gbz sz 1\\n\"\n    \"nQc ch 1\\n\"\n    \"pXq qu 1\\n\"\n    \"jDd de 1\\n\"\n    \"qQr qu 1\\n\"\n    \"vJx va 1\\n\"\n    \"zbY sz 1\\n\"\n    \"fRm me 1\\n\"\n    \"qEl qu 1\\n\"\n    \"oaZ an 1\\n\"\n    \"vjF ij 1\\n\"\n    \"lqX qu 1\\n\"\n    \"pSd de 1\\n\"\n    \"bXq qu 1\\n\"\n    \"jJv ij 1\\n\"\n    \"Wrv er 1\\n\"\n    \"Kpw pr 1\\n\"\n    \"xaY an 1\\n\"\n    \"jCv ij 1\\n\"\n    \"fbR be 1\\n\"\n    \"pTp pr 1\\n\"\n    \"wdI de 1\\n\"\n    \"qfQ qu 1\\n\"\n    \"Rrq qu 1\\n\"\n    \"dbF de 1\\n\"\n    \"bzF sz 1\\n\"\n    \"qwO qu 1\\n\"\n    \"vrY er 1\\n\"\n    \"twI th 1\\n\"\n    \"zLf sz 1\\n\"\n    \"bVc ch 1\\n\"\n    \"Xnl an 1\\n\"\n    \"Wgb ng 1\\n\"\n    \"fuS qu 1\\n\"\n    \"vIf va 1\\n\"\n    \"Twt th 1\\n\"\n    \"nKd an 1\\n\"\n    \"Dkh th 1\\n\"\n    \"uBd qu 1\\n\"\n    \"kOz ka 1\\n\"\n    \"zOj sz 1\\n\"\n    \"nzE an 1\\n\"\n    \"Zbh th 1\\n\"\n    \"qMg qu 1\\n\"\n    \"gfC ng 1\\n\"\n    \"vgD ng 1\\n\"\n    \"ytC th 1\\n\"\n    \"mqM qu 1\\n\"\n    \"Kjn an 1\\n\"\n    \"xbX be 1\\n\"\n    \"zfH sz 1\\n\"\n    \"mwH me 1\\n\"\n    \"zQb sz 1\\n\"\n    \"Gzk sz 1\\n\"\n    \"qsW qu 1\\n\"\n    \"kNs st 1\\n\"\n    \"Lqz qu 1\\n\"\n    \"nmW an 1\\n\"\n    \"qNx qu 1\\n\"\n    \"zcQ ch 1\\n\"\n    \"qMz qu 1\\n\"\n    \"wGz sz 1\\n\"\n    \"uCd qu 1\\n\"\n    \"Bpv pr 1\\n\"\n    \"qNe qu 1\\n\"\n    \"bpP pr 1\\n\"\n    \"lXf le 
1\\n\"\n    \"cLq ch 1\\n\"\n    \"pdX de 1\\n\"\n    \"qzU qu 1\\n\"\n    \"Kxd de 1\\n\"\n    \"jvF ij 1\\n\"\n    \"rFn an 1\\n\"\n    \"Etq th 1\\n\"\n    \"zYh th 1\\n\"\n    \"Ksv st 1\\n\"\n    \"fJk ka 1\\n\"\n    \"fkC ka 1\\n\"\n    \"mxK me 1\\n\"\n    \"fbz sz 1\\n\"\n    \"vrW er 1\\n\"\n    \"mPq qu 1\\n\"\n    \"yBt th 1\\n\"\n    \"iCf in 1\\n\"\n    \"srH er 1\\n\"\n    \"hjB th 1\\n\"\n    \"fcG ch 1\\n\"\n    \"Ftg th 1\\n\"\n    \"uBp qu 1\\n\"\n    \"yqT qu 1\\n\"\n    \"djF de 1\\n\"\n    \"tgU th 1\\n\"\n    \"Wrj er 1\\n\"\n    \"xFc ch 1\\n\"\n    \"ycC ch 1\\n\"\n    \"eqA qu 1\\n\"\n    \"pbG pr 1\\n\"\n    \"Cwh th 1\\n\"\n    \"fDk ka 1\\n\"\n    \"wTz sz 1\\n\"\n    \"xrW er 1\\n\"\n    \"kQs st 1\\n\"\n    \"wMl le 1\\n\"\n    \"yCn nd 1\\n\"\n    \"eGp er 1\\n\"\n    \"uPv qu 1\\n\"\n    \"Wqe qu 1\\n\"\n    \"yiI in 1\\n\"\n    \"rqF qu 1\\n\"\n    \"Kjs st 1\\n\"\n    \"lwK le 1\\n\"\n    \"fjQ ij 1\\n\"\n    \"uIq qu 1\\n\"\n    \"dxR de 1\\n\"\n    \"Gqj qu 1\\n\"\n    \"nLb an 1\\n\"\n    \"gRd ng 1\\n\"\n    \"qyv qu 1\\n\"\n    \"wtZ th 1\\n\"\n    \"cRk ch 1\\n\"\n    \"iKf in 1\\n\"\n    \"hbK th 1\\n\"\n    \"rqT qu 1\\n\"\n    \"xmF me 1\\n\"\n    \"vHt th 1\\n\"\n    \"tqN th 1\\n\"\n    \"vLv va 1\\n\"\n    \"xvJ va 1\\n\"\n    \"bgJ ng 1\\n\"\n    \"Qjq qu 1\\n\"\n    \"Lvb va 1\\n\"\n    \"Hxg ng 1\\n\"\n    \"tVq th 1\\n\"\n    \"rhZ th 1\\n\"\n    \"slL le 1\\n\"\n    \"kdH de 1\\n\"\n    \"Kfb be 1\\n\"\n    \"Dfh th 1\\n\"\n    \"Cqq qu 1\\n\"\n    \"nQk an 1\\n\"\n    \"Wnz an 1\\n\"\n    \"Njj ij 1\\n\"\n    \"bJf be 1\\n\"\n    \"wRh th 1\\n\"\n    \"Dpb pr 1\\n\"\n    \"sPj st 1\\n\"\n    \"Zpn an 1\\n\"\n    \"mPj ij 1\\n\"\n    \"Qcl ch 1\\n\"\n    \"zCd sz 1\\n\"\n    \"yrC er 1\\n\"\n    \"hCb th 1\\n\"\n    \"aBv an 1\\n\"\n    \"yuG qu 1\\n\"\n    \"fcN ch 1\\n\"\n    \"bZp pr 1\\n\"\n    \"Gtf th 1\\n\"\n    \"wbW wa 1\\n\"\n    \"vPq qu 1\\n\"\n    \"Vtj th 1\\n\"\n    \"kWq qu 1\\n\"\n    \"Jbm me 
1\\n\"\n    \"Wmb me 1\\n\"\n    \"pxY pr 1\\n\"\n    \"hQx th 1\\n\"\n    \"tNn th 1\\n\"\n    \"qdx qu 1\\n\"\n    \"cYv ch 1\\n\"\n    \"zlX le 1\\n\"\n    \"rwF er 1\\n\"\n    \"cZm ch 1\\n\"\n    \"ybJ be 1\\n\"\n    \"qaB an 1\\n\"\n    \"tVj th 1\\n\"\n    \"zUg ng 1\\n\"\n    \"cfC ch 1\\n\"\n    \"hxB th 1\\n\"\n    \"Tbz sz 1\\n\"\n    \"oFn an 1\\n\"\n    \"bTp pr 1\\n\"\n    \"hBk th 1\\n\"\n    \"hQe th 1\\n\"\n    \"qBe de 1\\n\"\n    \"dpC de 1\\n\"\n    \"kpW ka 1\\n\"\n    \"Zkj ij 1\\n\"\n    \"Nwn an 1\\n\"\n    \"grC ng 1\\n\"\n    \"uXq qu 1\\n\"\n    \"Uoy on 1\\n\"\n    \"Zfu qu 1\\n\"\n    \"xKb be 1\\n\"\n    \"hSb th 1\\n\"\n    \"bPc ch 1\\n\"\n    \"qcg ch 1\\n\"\n    \"xIu qu 1\\n\"\n    \"gBv ng 1\\n\"\n    \"gZm me 1\\n\"\n    \"qPu un 1\\n\"\n    \"Bfp pr 1\\n\"\n    \"rxC er 1\\n\"\n    \"sLk st 1\\n\"\n    \"hGj th 1\\n\"\n    \"qvR qu 1\\n\"\n    \"qpR qu 1\\n\"\n    \"vNn an 1\\n\"\n    \"Dft th 1\\n\"\n    \"nRq an 1\\n\"\n    \"khR th 1\\n\"\n    \"pqP qu 1\\n\"\n    \"tNp th 1\\n\"\n    \"Vwt th 1\\n\"\n    \"xwA wa 1\\n\"\n    \"wMn an 1\\n\"\n    \"Snq an 1\\n\"\n    \"dfD de 1\\n\"\n    \"vGw va 1\\n\"\n    \"Xqb qu 1\\n\"\n    \"Kww wa 1\\n\"\n    \"Qhx th 1\\n\"\n    \"Oyx ny 1\\n\"\n    \"dvB de 1\\n\"\n    \"sVh th 1\\n\"\n    \"Hcn ch 1\\n\"\n    \"sbU st 1\\n\"\n    \"fFw wa 1\\n\"\n    \"kfT ka 1\\n\"\n    \"rvW er 1\\n\"\n    \"Yxw wa 1\\n\"\n    \"nFk an 1\\n\"\n    \"Lqd qu 1\\n\"\n    \"hoQ th 1\\n\"\n    \"Nfj ij 1\\n\"\n    \"grH ng 1\\n\"\n    \"cJk ch 1\\n\"\n    \"Pnv an 1\\n\"\n    \"Nqx qu 1\\n\"\n    \"yfE ny 1\\n\"\n    \"kmI ka 1\\n\"\n    \"Gmz sz 1\\n\"\n    \"bxS be 1\\n\"\n    \"quU un 1\\n\"\n    \"qYf qu 1\\n\"\n    \"zKw sz 1\\n\"\n    \"whK th 1\\n\"\n    \"ofY on 1\\n\"\n    \"prH er 1\\n\"\n    \"jXz sz 1\\n\"\n    \"vQm va 1\\n\"\n    \"iWx in 1\\n\"\n    \"bzC sz 1\\n\"\n    \"nYx an 1\\n\"\n    \"qaK an 1\\n\"\n    \"Ggb ng 1\\n\"\n    \"zSf sz 1\\n\"\n    \"rQz er 1\\n\"\n    \"hkW th 
1\\n\"\n    \"Vnl an 1\\n\"\n    \"Gtd th 1\\n\"\n    \"rMw er 1\\n\"\n    \"wvX va 1\\n\"\n    \"jyU ij 1\\n\"\n    \"Qqp qu 1\\n\"\n    \"Hnq an 1\\n\"\n    \"bFb be 1\\n\"\n    \"qkH qu 1\\n\"\n    \"Wck ch 1\\n\"\n    \"fMw wa 1\\n\"\n    \"zgE ng 1\\n\"\n    \"oJz on 1\\n\"\n    \"xvH va 1\\n\"\n    \"hQy th 1\\n\"\n    \"cYf ch 1\\n\"\n    \"cxD ch 1\\n\"\n    \"yDs st 1\\n\"\n    \"qBh th 1\\n\"\n    \"cJx ch 1\\n\"\n    \"dPj de 1\\n\"\n    \"wWd de 1\\n\"\n    \"rHn an 1\\n\"\n    \"iyM in 1\\n\"\n    \"yxD ny 1\\n\"\n    \"kPc ch 1\\n\"\n    \"cXv ch 1\\n\"\n    \"Nmg ng 1\\n\"\n    \"vkN ka 1\\n\"\n    \"lFj le 1\\n\"\n    \"ymU me 1\\n\"\n    \"pZv va 1\\n\"\n    \"gZt th 1\\n\"\n    \"Jqy qu 1\\n\"\n    \"qAz qu 1\\n\"\n    \"Bcy ch 1\\n\"\n    \"pqj qu 1\\n\"\n    \"cqE ch 1\\n\"\n    \"Rwv va 1\\n\"\n    \"crM ch 1\\n\"\n    \"Axz sz 1\\n\"\n    \"Zjp ij 1\\n\"\n    \"yxF ny 1\\n\"\n    \"vZh th 1\\n\"\n    \"sPb st 1\\n\"\n    \"vCs st 1\\n\"\n    \"fQq qu 1\\n\"\n    \"qYq qu 1\\n\"\n    \"hBp th 1\\n\"\n    \"Jbk ka 1\\n\"\n    \"gqK qu 1\\n\"\n    \"krq qu 1\\n\"\n    \"Cfz sz 1\\n\"\n    \"mbJ me 1\\n\"\n    \"fRq qu 1\\n\"\n    \"Iwv va 1\\n\"\n    \"uFn an 1\\n\"\n    \"cYz ch 1\\n\"\n    \"qDb qu 1\\n\"\n    \"xHd de 1\\n\"\n    \"qmI qu 1\\n\"\n    \"ycE ch 1\\n\"\n    \"Mhf th 1\\n\"\n    \"iuE qu 1\\n\"\n    \"gXf ng 1\\n\"\n    \"lPy le 1\\n\"\n    \"bPv va 1\\n\"\n    \"jXh th 1\\n\"\n    \"gOx ng 1\\n\"\n    \"Nmv va 1\\n\"\n    \"xDg ng 1\\n\"\n    \"Cwd de 1\\n\"\n    \"ljP le 1\\n\"\n    \"wqV qu 1\\n\"\n    \"nrE an 1\\n\"\n    \"Kmw me 1\\n\"\n    \"gJt th 1\\n\"\n    \"tgB th 1\\n\"\n    \"xzR sz 1\\n\"\n    \"vJr er 1\\n\"\n    \"aUi an 1\\n\"\n    \"ynY an 1\\n\"\n    \"bZv va 1\\n\"\n    \"fFq qu 1\\n\"\n    \"Sxg ng 1\\n\"\n    \"qAc ch 1\\n\"\n    \"iZv in 1\\n\"\n    \"jXu qu 1\\n\"\n    \"gpR ng 1\\n\"\n    \"wVl le 1\\n\"\n    \"dNj de 1\\n\"\n    \"fBw wa 1\\n\"\n    \"Mjy ij 1\\n\"\n    \"kjZ ij 1\\n\"\n    \"tLs th 
1\\n\"\n    \"iYj in 1\\n\"\n    \"wbO wa 1\\n\"\n    \"qXb qu 1\\n\"\n    \"uJq qu 1\\n\"\n    \"qKt th 1\\n\"\n    \"vjO ij 1\\n\"\n    \"wuD qu 1\\n\"\n    \"blQ le 1\\n\"\n    \"yfB ny 1\\n\"\n    \"Qsk st 1\\n\"\n    \"Uwm me 1\\n\"\n    \"Zqg qu 1\\n\"\n    \"nmY an 1\\n\"\n    \"pXw pr 1\\n\"\n    \"yVj ij 1\\n\"\n    \"gIw ng 1\\n\"\n    \"Hxk ka 1\\n\"\n    \"Pgy ng 1\\n\"\n    \"lQv le 1\\n\"\n    \"bnK an 1\\n\"\n    \"xtZ th 1\\n\"\n    \"Qce ch 1\\n\"\n    \"Njq qu 1\\n\"\n    \"mvq qu 1\\n\"\n    \"Mwz sz 1\\n\"\n    \"Gtn th 1\\n\"\n    \"fJh th 1\\n\"\n    \"vJz sz 1\\n\"\n    \"gDk ng 1\\n\"\n    \"dLw de 1\\n\"\n    \"oeU er 1\\n\"\n    \"cvY ch 1\\n\"\n    \"Gbb be 1\\n\"\n    \"Tqd qu 1\\n\"\n    \"aTp an 1\\n\"\n    \"Ywg ng 1\\n\"\n    \"jdT de 1\\n\"\n    \"Wkm ka 1\\n\"\n    \"pxA pr 1\\n\"\n    \"vDl le 1\\n\"\n    \"sfD st 1\\n\"\n    \"rqV qu 1\\n\"\n    \"cHb ch 1\\n\"\n    \"iVc ch 1\\n\"\n    \"Mfh th 1\\n\"\n    \"sVm st 1\\n\"\n    \"nzR an 1\\n\"\n    \"Qvs st 1\\n\"\n    \"kZg ng 1\\n\"\n    \"Wnw an 1\\n\"\n    \"qZb qu 1\\n\"\n    \"Gvq qu 1\\n\"\n    \"vPk ka 1\\n\"\n    \"Sxq qu 1\\n\"\n    \"vNg ng 1\\n\"\n    \"qrH qu 1\\n\"\n    \"fLc ch 1\\n\"\n    \"wVs st 1\\n\"\n    \"qEh th 1\\n\"\n    \"uqC qu 1\\n\"\n    \"tZx th 1\\n\"\n    \"yhI th 1\\n\"\n    \"wNh th 1\\n\"\n    \"rFj er 1\\n\"\n    \"xPq qu 1\\n\"\n    \"pqW qu 1\\n\"\n    \"Pjc ch 1\\n\"\n    \"jYj ij 1\\n\"\n    \"pFv va 1\\n\"\n    \"vLr er 1\\n\"\n    \"lqq qu 1\\n\"\n    \"xJg ng 1\\n\"\n    \"lVz le 1\\n\"\n    \"cZc ch 1\\n\"\n    \"hcF th 1\\n\"\n    \"uhJ th 1\\n\"\n    \"cLj ch 1\\n\"\n    \"qyW qu 1\\n\"\n    \"zhT th 1\\n\"\n    \"mtK th 1\\n\"\n    \"pRb pr 1\\n\"\n    \"bCx be 1\\n\"\n    \"nJf an 1\\n\"\n    \"jwF ij 1\\n\"\n    \"Pdj de 1\\n\"\n    \"jxE ij 1\\n\"\n    \"slZ le 1\\n\"\n    \"Lxn an 1\\n\"\n    \"znL an 1\\n\"\n    \"mzV sz 1\\n\"\n    \"lGq le 1\\n\"\n    \"Qbw wa 1\\n\"\n    \"jbY ij 1\\n\"\n    \"zSm sz 1\\n\"\n    \"Qqx qu 
1\\n\"\n    \"ypR pr 1\\n\"\n    \"gCc ch 1\\n\"\n    \"Yvx va 1\\n\"\n    \"ihI th 1\\n\"\n    \"Zfx fo 1\\n\"\n    \"njI nd 1\\n\"\n    \"Ypt th 1\\n\"\n    \"lxT le 1\\n\"\n    \"fVv va 1\\n\"\n    \"Jzm sz 1\\n\"\n    \"jxA ij 1\\n\"\n    \"gDl ng 1\\n\"\n    \"Eaq an 1\\n\"\n    \"Qcn an 1\\n\"\n    \"zGb sz 1\\n\"\n    \"jLh th 1\\n\"\n    \"qkX qu 1\\n\"\n    \"wbK wa 1\\n\"\n    \"nNx an 1\\n\"\n    \"sqW qu 1\\n\"\n    \"wRx wa 1\\n\"\n    \"xrU er 1\\n\"\n    \"fnQ an 1\\n\"\n    \"kzB sz 1\\n\"\n    \"Rcn ch 1\\n\"\n    \"qbL qu 1\\n\"\n    \"srD er 1\\n\"\n    \"Vxu qu 1\\n\"\n    \"qvF qu 1\\n\"\n    \"wJr er 1\\n\"\n    \"Yxg ng 1\\n\"\n    \"qiY qu 1\\n\"\n    \"fMc ch 1\\n\"\n    \"hbY th 1\\n\"\n    \"hgH th 1\\n\"\n    \"dmS de 1\\n\"\n    \"jTn an 1\\n\"\n    \"Zjm ij 1\\n\"\n    \"Njl le 1\\n\"\n    \"dqV qu 1\\n\"\n    \"Yjh th 1\\n\"\n    \"rKw er 1\\n\"\n    \"cxU ch 1\\n\"\n    \"Ckj ij 1\\n\"\n    \"zfJ sz 1\\n\"\n    \"ytF th 1\\n\"\n    \"xrP er 1\\n\"\n    \"qEj qu 1\\n\"\n    \"rxO er 1\\n\"\n    \"rZn an 1\\n\"\n    \"bZq qu 1\\n\"\n    \"cXq ch 1\\n\"\n    \"wvD va 1\\n\"\n    \"hcX th 1\\n\"\n    \"zkO sz 1\\n\"\n    \"hNx th 1\\n\"\n    \"wFg ng 1\\n\"\n    \"kXu qu 1\\n\"\n    \"Vkn an 1\\n\"\n    \"Gjz sz 1\\n\"\n    \"Qcd ch 1\\n\"\n    \"yvF va 1\\n\"\n    \"xFx xe 1\\n\"\n    \"dSj de 1\\n\"\n    \"xPb be 1\\n\"\n    \"oFp on 1\\n\"\n    \"qAk qu 1\\n\"\n    \"rqU qu 1\\n\"\n    \"pGv va 1\\n\"\n    \"hzC th 1\\n\"\n    \"qIk qu 1\\n\"\n    \"Lhl th 1\\n\"\n    \"Fwb wa 1\\n\"\n    \"pgE ng 1\\n\"\n    \"Awz sz 1\\n\"\n    \"fBk ka 1\\n\"\n    \"xKd de 1\\n\"\n    \"Pfw wa 1\\n\"\n    \"uqK qu 1\\n\"\n    \"pJc ch 1\\n\"\n    \"bTc ch 1\\n\"\n    \"tWg th 1\\n\"\n    \"gdN ng 1\\n\"\n    \"jrN er 1\\n\"\n    \"klS le 1\\n\"\n    \"qEi qu 1\\n\"\n    \"sFn an 1\\n\"\n    \"tqR th 1\\n\"\n    \"Fnm an 1\\n\"\n    \"hXv th 1\\n\"\n    \"fxN fo 1\\n\"\n    \"bvL va 1\\n\"\n    \"oGf on 1\\n\"\n    \"hZm th 1\\n\"\n    \"yfH ny 
1\\n\"\n    \"dcE ch 1\\n\"\n    \"pgW ng 1\\n\"\n    \"wrB er 1\\n\"\n    \"kWm ka 1\\n\"\n    \"Shx th 1\\n\"\n    \"twP th 1\\n\"\n    \"Qvd de 1\\n\"\n    \"Qgu qu 1\\n\"\n    \"pJt th 1\\n\"\n    \"zNv sz 1\\n\"\n    \"Hph th 1\\n\"\n    \"klF le 1\\n\"\n    \"vqz qu 1\\n\"\n    \"sgG ng 1\\n\"\n    \"kdZ de 1\\n\"\n    \"ejX er 1\\n\"\n    \"Pxu qu 1\\n\"\n    \"pvT va 1\\n\"\n    \"Kqx qu 1\\n\"\n    \"Qmb me 1\\n\"\n    \"xFk ka 1\\n\"\n    \"wQb wa 1\\n\"\n    \"Pgx ng 1\\n\"\n    \"ypL pr 1\\n\"\n    \"bwE wa 1\\n\"\n    \"xHt th 1\\n\"\n    \"kVz sz 1\\n\"\n    \"jmF ij 1\\n\"\n    \"Ixq qu 1\\n\"\n    \"qyP qu 1\\n\"\n    \"rVv er 1\\n\"\n    \"Ytw th 1\\n\"\n    \"qpZ qu 1\\n\"\n    \"tpZ th 1\\n\"\n    \"zjX sz 1\\n\"\n    \"Khg th 1\\n\"\n    \"qfV qu 1\\n\"\n    \"Jzx sz 1\\n\"\n    \"kTj ij 1\\n\"\n    \"Bzq qu 1\\n\"\n    \"njR an 1\\n\"\n    \"cgW ch 1\\n\"\n    \"cmI ch 1\\n\"\n    \"kCb ka 1\\n\"\n    \"pYp pr 1\\n\"\n    \"vkZ ka 1\\n\"\n    \"wvk ka 1\\n\"\n    \"Vfq qu 1\\n\"\n    \"nlZ an 1\\n\"\n    \"qNj qu 1\\n\"\n    \"rCq qu 1\\n\"\n    \"kbV ka 1\\n\"\n    \"Dqj qu 1\\n\"\n    \"brD er 1\\n\"\n    \"lbG le 1\\n\"\n    \"xhF th 1\\n\"\n    \"kxZ ka 1\\n\"\n    \"Iuq qu 1\\n\"\n    \"yFx ny 1\\n\"\n    \"qVl qu 1\\n\"\n    \"lcG ch 1\\n\"\n    \"vWr er 1\\n\"\n    \"aBq an 1\\n\"\n    \"yJk ka 1\\n\"\n    \"czL ch 1\\n\"\n    \"jIu qu 1\\n\"\n    \"vUl le 1\\n\"\n    \"pZq qu 1\\n\"\n    \"vtW th 1\\n\"\n    \"Qxw wa 1\\n\"\n    \"dYv de 1\\n\"\n    \"iqH qu 1\\n\"\n    \"Xws st 1\\n\"\n    \"fDj ij 1\\n\"\n    \"xVz sz 1\\n\"\n    \"dKq qu 1\\n\"\n    \"vfQ va 1\\n\"\n    \"hvD th 1\\n\"\n    \"wdY de 1\\n\"\n    \"Hzz sz 1\\n\"\n    \"cYs ch 1\\n\"\n    \"Ftj th 1\\n\"\n    \"dpU de 1\\n\"\n    \"Lld le 1\\n\"\n    \"Gqw qu 1\\n\"\n    \"kdR de 1\\n\"\n    \"vXg ng 1\\n\"\n    \"qsY qu 1\\n\"\n    \"jNf ij 1\\n\"\n    \"Qjj ij 1\\n\"\n    \"pVl le 1\\n\"\n    \"Jmx me 1\\n\"\n    \"pDj ij 1\\n\"\n    \"iBc ch 1\\n\"\n    \"kLj ij 
1\\n\"\n    \"xnG an 1\\n\"\n    \"vTl le 1\\n\"\n    \"Ndg ng 1\\n\"\n    \"pqU qu 1\\n\"\n    \"Uaw an 1\\n\"\n    \"fzN sz 1\\n\"\n    \"gNq qu 1\\n\"\n    \"kjM ij 1\\n\"\n    \"lnK an 1\\n\"\n    \"zxb sz 1\\n\"\n    \"kcS ch 1\\n\"\n    \"njM an 1\\n\"\n    \"Gdw de 1\\n\"\n    \"lnZ an 1\\n\"\n    \"Ygj ng 1\\n\"\n    \"hKd th 1\\n\"\n    \"gpT ng 1\\n\"\n    \"yqP qu 1\\n\"\n    \"ijX in 1\\n\"\n    \"jGf ij 1\\n\"\n    \"bxI be 1\\n\"\n    \"vXx va 1\\n\"\n    \"Vrw er 1\\n\"\n    \"Cwx wa 1\\n\"\n    \"nBh th 1\\n\"\n    \"qvy qu 1\\n\"\n    \"sxB st 1\\n\"\n    \"mVk ka 1\\n\"\n    \"Czx sz 1\\n\"\n    \"fyV ny 1\\n\"\n    \"cXw ch 1\\n\"\n    \"Qnf an 1\\n\"\n    \"Yqd qu 1\\n\"\n    \"lqH qu 1\\n\"\n    \"dbY de 1\\n\"\n    \"Sqb qu 1\\n\"\n    \"Kqw qu 1\\n\"\n    \"zpJ sz 1\\n\"\n    \"cbM ch 1\\n\"\n    \"zFg ng 1\\n\"\n    \"sKb st 1\\n\"\n    \"qrK qu 1\\n\"\n    \"zJc ch 1\\n\"\n    \"nRn an 1\\n\"\n    \"fqN qu 1\\n\"\n    \"hfA th 1\\n\"\n    \"qoG qu 1\\n\"\n    \"Owz sz 1\\n\"\n    \"nlG an 1\\n\"\n    \"wIx wa 1\\n\"\n    \"qrP qu 1\\n\"\n    \"Nwg ng 1\\n\"\n    \"qaW an 1\\n\"\n    \"hcT th 1\\n\"\n    \"wkB ka 1\\n\"\n    \"Ndt th 1\\n\"\n    \"Kzq qu 1\\n\"\n    \"gxB ng 1\\n\"\n    \"Bjz sz 1\\n\"\n    \"vTf va 1\\n\"\n    \"jFq qu 1\\n\"\n    \"qMe qu 1\\n\"\n    \"ufQ qu 1\\n\"\n    \"npG an 1\\n\"\n    \"uZk qu 1\\n\"\n    \"qTw qu 1\\n\"\n    \"Glw le 1\\n\"\n    \"Kqq qu 1\\n\"\n    \"Cxr er 1\\n\"\n    \"jZs st 1\\n\"\n    \"Sqv qu 1\\n\"\n    \"yPm me 1\\n\"\n    \"eQj er 1\\n\"\n    \"aIh th 1\\n\"\n    \"gDq qu 1\\n\"\n    \"lIp le 1\\n\"\n    \"jNj ij 1\\n\"\n    \"qOd qu 1\\n\"\n    \"vkM ka 1\\n\"\n    \"vFy va 1\\n\"\n    \"cfV ch 1\\n\"\n    \"Kjh th 1\\n\"\n    \"gkP ng 1\\n\"\n    \"rJc ch 1\\n\"\n    \"uPq qu 1\\n\"\n    \"ozQ on 1\\n\"\n    \"Dlk le 1\\n\"\n    \"vXh th 1\\n\"\n    \"ktY th 1\\n\"\n    \"vWy va 1\\n\"\n    \"gQv ng 1\\n\"\n    \"Yww wa 1\\n\"\n    \"Tpz sz 1\\n\"\n    \"Qhc th 1\\n\"\n    \"xuT qu 
1\\n\"\n    \"nbS an 1\\n\"\n    \"zQg ng 1\\n\"\n    \"vgZ ng 1\\n\"\n    \"pUo on 1\\n\"\n    \"uWb qu 1\\n\"\n    \"mMf me 1\\n\"\n    \"Zcd ch 1\\n\"\n    \"iBp in 1\\n\"\n    \"fwp pr 1\\n\"\n    \"zYf sz 1\\n\"\n    \"wCp pr 1\\n\"\n    \"Cqy qu 1\\n\"\n    \"cjF ch 1\\n\"\n    \"Gfh th 1\\n\"\n    \"mcW ch 1\\n\"\n    \"cqV ch 1\\n\"\n    \"uJd qu 1\\n\"\n    \"iUj in 1\\n\"\n    \"vkR ka 1\\n\"\n    \"wgI ng 1\\n\"\n    \"vUg ng 1\\n\"\n    \"Wdn de 1\\n\"\n    \"sjF st 1\\n\"\n    \"tPv th 1\\n\"\n    \"xRn an 1\\n\"\n    \"klV le 1\\n\"\n    \"sbM st 1\\n\"\n    \"mfT me 1\\n\"\n    \"dbV de 1\\n\"\n    \"Fmn an 1\\n\"\n    \"gfU ng 1\\n\"\n    \"cbB ch 1\\n\"\n    \"Yxz sz 1\\n\"\n    \"Kxk ka 1\\n\"\n    \"Dwq qu 1\\n\"\n    \"wgX ng 1\\n\"\n    \"sPv st 1\\n\"\n    \"vHd de 1\\n\"\n    \"nbH an 1\\n\"\n    \"cFn an 1\\n\"\n    \"qqX qu 1\\n\"\n    \"jFe er 1\\n\"\n    \"qEb qu 1\\n\"\n    \"dFh th 1\\n\"\n    \"uEo qu 1\\n\"\n    \"lcI ch 1\\n\"\n    \"bMm me 1\\n\"\n    \"zZw sz 1\\n\"\n    \"hjO th 1\\n\"\n    \"hKx th 1\\n\"\n    \"jgC ng 1\\n\"\n    \"cnL an 1\\n\"\n    \"Fdg ng 1\\n\"\n    \"bGf be 1\\n\"\n    \"Sjz sz 1\\n\"\n    \"bMj ij 1\\n\"\n    \"vXw va 1\\n\"\n    \"Gff fo 1\\n\"\n    \"Cww wa 1\\n\"\n    \"jsQ st 1\\n\"\n    \"Zgv ng 1\\n\"\n    \"lPf le 1\\n\"\n    \"nmQ an 1\\n\"\n    \"Vdq qu 1\\n\"\n    \"lcX ch 1\\n\"\n    \"gjT ng 1\\n\"\n    \"mwE me 1\\n\"\n    \"qLm qu 1\\n\"\n    \"cHq ch 1\\n\"\n    \"Xtn th 1\\n\"\n    \"Ntq th 1\\n\"\n    \"gWk ng 1\\n\"\n    \"Pqd qu 1\\n\"\n    \"qpP qu 1\\n\"\n    \"sRf st 1\\n\"\n    \"qpL qu 1\\n\"\n    \"cnD an 1\\n\"\n    \"qpG qu 1\\n\"\n    \"dzS sz 1\\n\"\n    \"tZb th 1\\n\"\n    \"ygM ng 1\\n\"\n    \"bxC be 1\\n\"\n    \"dfU de 1\\n\"\n    \"bmB me 1\\n\"\n    \"lBz le 1\\n\"\n    \"gJx ng 1\\n\"\n    \"Ykv ka 1\\n\"\n    \"Zdk de 1\\n\"\n    \"wnQ an 1\\n\"\n    \"tZj th 1\\n\"\n    \"Zzm sz 1\\n\"\n    \"Vfh th 1\\n\"\n    \"Mwc ch 1\\n\"\n    \"rUo on 1\\n\"\n    \"qwp qu 
1\\n\"\n    \"tcI th 1\\n\"\n    \"tfD th 1\\n\"\n    \"uoZ qu 1\\n\"\n    \"fCw wa 1\\n\"\n    \"iQq qu 1\\n\"\n    \"qBg qu 1\\n\"\n    \"sVb st 1\\n\"\n    \"pjU ij 1\\n\"\n    \"scQ ch 1\\n\"\n    \"pqQ qu 1\\n\"\n    \"svZ st 1\\n\"\n    \"Zpj ij 1\\n\"\n    \"piV in 1\\n\"\n    \"kbP ka 1\\n\"\n    \"wqM qu 1\\n\"\n    \"rVb er 1\\n\"\n    \"qZr qu 1\\n\"\n    \"hxO th 1\\n\"\n    \"wTn an 1\\n\"\n    \"Jzf sz 1\\n\"\n    \"Qjb ij 1\\n\"\n    \"uYv qu 1\\n\"\n    \"pwK pr 1\\n\"\n    \"hvH th 1\\n\"\n    \"Dqe qu 1\\n\"\n    \"pfI pr 1\\n\"\n    \"mhV th 1\\n\"\n    \"jgE ng 1\\n\"\n    \"rcQ ch 1\\n\"\n    \"kmT ka 1\\n\"\n    \"Wzj sz 1\\n\"\n    \"xNs st 1\\n\"\n    \"Pbj ij 1\\n\"\n    \"zvB sz 1\\n\"\n    \"xhJ th 1\\n\"\n    \"svq qu 1\\n\"\n    \"Nvn an 1\\n\"\n    \"swZ st 1\\n\"\n    \"jgF ng 1\\n\"\n    \"mfL me 1\\n\"\n    \"zkL sz 1\\n\"\n    \"jVp ij 1\\n\"\n    \"Dkj ij 1\\n\"\n    \"xuY qu 1\\n\"\n    \"hHq th 1\\n\"\n    \"cSf ch 1\\n\"\n    \"Jzd sz 1\\n\"\n    \"lqU qu 1\\n\"\n    \"qMd qu 1\\n\"\n    \"Qgj ng 1\\n\"\n    \"fxk ka 1\\n\"\n    \"tRt th 1\\n\"\n    \"zFk sz 1\\n\"\n    \"qEo qu 1\\n\"\n    \"voY on 1\\n\"\n    \"Awj ij 1\\n\"\n    \"Txj ij 1\\n\"\n    \"cIg ch 1\\n\"\n    \"xUu qu 1\\n\"\n    \"sRr er 1\\n\"\n    \"Jxn an 1\\n\"\n    \"iPf in 1\\n\"\n    \"ejY er 1\\n\"\n    \"Xts th 1\\n\"\n    \"pfT pr 1\\n\"\n    \"Pqa an 1\\n\"\n    \"zsV st 1\\n\"\n    \"ypC pr 1\\n\"\n    \"wMs st 1\\n\"\n    \"qEc ch 1\\n\"\n    \"vxY va 1\\n\"\n    \"fUg ng 1\\n\"\n    \"Dff fo 1\\n\"\n    \"gqQ qu 1\\n\"\n    \"zMv sz 1\\n\"\n    \"vJi in 1\\n\"\n    \"fPv va 1\\n\"\n    \"dLz sz 1\\n\"\n    \"cdM ch 1\\n\"\n    \"gNx ng 1\\n\"\n    \"aGv an 1\\n\"\n    \"vvD va 1\\n\"\n    \"dJh th 1\\n\"\n    \"rxY er 1\\n\"\n    \"rWj er 1\\n\"\n    \"Pvx va 1\\n\"\n    \"rhD th 1\\n\"\n    \"zRd sz 1\\n\"\n    \"Kgv ng 1\\n\"\n    \"Xvy va 1\\n\"\n    \"kZj ij 1\\n\"\n    \"kpK ka 1\\n\"\n    \"Pfn an 1\\n\"\n    \"wUe er 1\\n\"\n    \"wWx wa 
1\\n\"\n    \"jPw ij 1\\n\"\n    \"gLq qu 1\\n\"\n    \"iJq qu 1\\n\"\n    \"gPx ng 1\\n\"\n    \"jHd de 1\\n\"\n    \"vJb va 1\\n\"\n    \"xhB th 1\\n\"\n    \"xQv va 1\\n\"\n    \"Eoa an 1\\n\"\n    \"pjO ij 1\\n\"\n    \"yFj ij 1\\n\"\n    \"sXo on 1\\n\"\n    \"wbY wa 1\\n\"\n    \"cjO ch 1\\n\"\n    \"mlZ le 1\\n\"\n    \"bNv va 1\\n\"\n    \"kjP ij 1\\n\"\n    \"yXn an 1\\n\"\n    \"qVj qu 1\\n\"\n    \"fNv va 1\\n\"\n    \"gjW ng 1\\n\"\n    \"nXj an 1\\n\"\n    \"dqJ qu 1\\n\"\n    \"Hnh th 1\\n\"\n    \"Qyk ka 1\\n\"\n    \"kvB ka 1\\n\"\n    \"qyB qu 1\\n\"\n    \"mDt th 1\\n\"\n    \"zgP ng 1\\n\"\n    \"Zzk sz 1\\n\"\n    \"fMk ka 1\\n\"\n    \"xzY sz 1\\n\"\n    \"qbT qu 1\\n\"\n    \"xOt th 1\\n\"\n    \"xsA st 1\\n\"\n    \"gLj ng 1\\n\"\n    \"zxH sz 1\\n\"\n    \"cLm ch 1\\n\"\n    \"Dnk an 1\\n\"\n    \"zIu qu 1\\n\"\n    \"kpJ ka 1\\n\"\n    \"xrK er 1\\n\"\n    \"eIb er 1\\n\"\n    \"Jbp pr 1\\n\"\n    \"Bqg qu 1\\n\"\n    \"tXg th 1\\n\"\n    \"Zjk ij 1\\n\"\n    \"dRd de 1\\n\"\n    \"tjZ th 1\\n\"\n    \"hQl th 1\\n\"\n    \"iyW in 1\\n\"\n    \"Jwd de 1\\n\"\n    \"qZt th 1\\n\"\n    \"cJp ch 1\\n\"\n    \"jBg ng 1\\n\"\n    \"zrG er 1\\n\"\n    \"hWf th 1\\n\"\n    \"Zds st 1\\n\"\n    \"qsZ qu 1\\n\"\n    \"cQx ch 1\\n\"\n    \"ccN ch 1\\n\"\n    \"ywM wa 1\\n\"\n    \"gbX ng 1\\n\"\n    \"tfT th 1\\n\"\n    \"vwt th 1\\n\"\n    \"Qbp pr 1\\n\"\n    \"yeY er 1\\n\"\n    \"aUb an 1\\n\"\n    \"qHw qu 1\\n\"\n    \"Fhq th 1\\n\"\n    \"Fng an 1\\n\"\n    \"lvI le 1\\n\"\n    \"jCf ij 1\\n\"\n    \"hqH th 1\\n\"\n    \"tTq th 1\\n\"\n    \"sfI st 1\\n\"\n    \"vsM st 1\\n\"\n    \"lDp le 1\\n\"\n    \"wJb wa 1\\n\"\n    \"bhX th 1\\n\"\n    \"rRq qu 1\\n\"\n    \"qtS th 1\\n\"\n    \"Zwp pr 1\\n\"\n    \"Jbh th 1\\n\"\n    \"hHb th 1\\n\"\n    \"pDy pr 1\\n\"\n    \"sjD st 1\\n\"\n    \"Oyp pr 1\\n\"\n    \"qwD qu 1\\n\"\n    \"jbD ij 1\\n\"\n    \"vpG va 1\\n\"\n    \"Wjb ij 1\\n\"\n    \"vpB va 1\\n\"\n    \"aXq an 1\\n\"\n    \"mWz sz 
1\\n\"\n    \"qHi qu 1\\n\"\n    \"fyN ny 1\\n\"\n    \"mbQ me 1\\n\"\n    \"ywC wa 1\\n\"\n    \"oVg ng 1\\n\"\n    \"xmZ me 1\\n\"\n    \"slO le 1\\n\"\n    \"fXn an 1\\n\"\n    \"kYs st 1\\n\"\n    \"pVu qu 1\\n\"\n    \"bkU ka 1\\n\"\n    \"Brq qu 1\\n\"\n    \"qCq qu 1\\n\"\n    \"Xcx ch 1\\n\"\n    \"zMt th 1\\n\"\n    \"cRw ch 1\\n\"\n    \"gzQ ng 1\\n\"\n    \"Qbg ng 1\\n\"\n    \"juU qu 1\\n\"\n    \"xSz sz 1\\n\"\n    \"Vgz ng 1\\n\"\n    \"oMw on 1\\n\"\n    \"fpE pr 1\\n\"\n    \"xjX ij 1\\n\"\n    \"qCg qu 1\\n\"\n    \"zwM sz 1\\n\"\n    \"uQl qu 1\\n\"\n    \"qPk qu 1\\n\"\n    \"pjD ij 1\\n\"\n    \"Qzm sz 1\\n\"\n    \"sIp st 1\\n\"\n    \"uoG qu 1\\n\"\n    \"rVl er 1\\n\"\n    \"cbK ch 1\\n\"\n    \"hXm th 1\\n\"\n    \"Ksf st 1\\n\"\n    \"kbF ka 1\\n\"\n    \"wBm me 1\\n\"\n    \"iYt th 1\\n\"\n    \"sgH ng 1\\n\"\n    \"Gzv sz 1\\n\"\n    \"yvE va 1\\n\"\n    \"xKq qu 1\\n\"\n    \"sWf st 1\\n\"\n    \"zBc ch 1\\n\"\n    \"ykH ka 1\\n\"\n    \"vjH ij 1\\n\"\n    \"whI th 1\\n\"\n    \"vPj ij 1\\n\"\n    \"Zht th 1\\n\"\n    \"iJx in 1\\n\"\n    \"cZt th 1\\n\"\n    \"dqU qu 1\\n\"\n    \"hMd th 1\\n\"\n    \"cUj ch 1\\n\"\n    \"vMg ng 1\\n\"\n    \"pcJ ch 1\\n\"\n    \"Bcm ch 1\\n\"\n    \"jXi in 1\\n\"\n    \"xoI on 1\\n\"\n    \"Zkq qu 1\\n\"\n    \"Xzr er 1\\n\"\n    \"yzM sz 1\\n\"\n    \"qjX qu 1\\n\"\n    \"mNq qu 1\\n\"\n    \"hpX th 1\\n\"\n    \"fBq qu 1\\n\"\n    \"tXd th 1\\n\"\n    \"Xki in 1\\n\"\n    \"Hsq qu 1\\n\"\n    \"bqU qu 1\\n\"\n    \"sgF ng 1\\n\"\n    \"dPc ch 1\\n\"\n    \"Jxi in 1\\n\"\n    \"Ugp ng 1\\n\"\n    \"Rxi in 1\\n\"\n    \"Kwm me 1\\n\"\n    \"zkD sz 1\\n\"\n    \"Rql qu 1\\n\"\n    \"pJb pr 1\\n\"\n    \"fcV ch 1\\n\"\n    \"iVd in 1\\n\"\n    \"bBp be 1\\n\"\n    \"Ojw ij 1\\n\"\n    \"vZl le 1\\n\"\n    \"Iyj ij 1\\n\"\n    \"fkU ka 1\\n\"\n    \"Kcq ch 1\\n\"\n    \"dBq qu 1\\n\"\n    \"Mqq qu 1\\n\"\n    \"iMg ng 1\\n\"\n    \"Wws st 1\\n\"\n    \"tqX th 1\\n\"\n    \"xhD th 1\\n\"\n    \"rNl er 
1\\n\"\n    \"pWd de 1\\n\"\n    \"jrV er 1\\n\"\n    \"Bmj ij 1\\n\"\n    \"Hmq qu 1\\n\"\n    \"vlH le 1\\n\"\n    \"Mxb be 1\\n\"\n    \"yyS ny 1\\n\"\n    \"qvW qu 1\\n\"\n    \"fvX va 1\\n\"\n    \"Vfe er 1\\n\"\n    \"Cdw de 1\\n\"\n    \"Kge ng 1\\n\"\n    \"Qej er 1\\n\"\n    \"rvZ er 1\\n\"\n    \"vzI sz 1\\n\"\n    \"dDn an 1\\n\"\n    \"nwS an 1\\n\"\n    \"Qcb ch 1\\n\"\n    \"wkV ka 1\\n\"\n    \"uCx qu 1\\n\"\n    \"Igk ng 1\\n\"\n    \"Vpm me 1\\n\"\n    \"hBm th 1\\n\"\n    \"pdQ de 1\\n\"\n    \"fgQ ng 1\\n\"\n    \"yQm me 1\\n\"\n    \"gxH ng 1\\n\"\n    \"pqK qu 1\\n\"\n    \"lRc ch 1\\n\"\n    \"Xdv de 1\\n\"\n    \"hDz th 1\\n\"\n    \"dFw de 1\\n\"\n    \"qQu un 1\\n\"\n    \"xbD be 1\\n\"\n    \"qmE qu 1\\n\"\n    \"mWm me 1\\n\"\n    \"jBb ij 1\\n\"\n    \"jXt th 1\\n\"\n    \"fxU fo 1\\n\"\n    \"Xwc ch 1\\n\"\n    \"Lqf qu 1\\n\"\n    \"hcP th 1\\n\"\n    \"pfB pr 1\\n\"\n    \"vSg ng 1\\n\"\n    \"xJw wa 1\\n\"\n    \"mRf me 1\\n\"\n    \"hqW th 1\\n\"\n    \"nVb an 1\\n\"\n    \"cEu ch 1\\n\"\n    \"nfN an 1\\n\"\n    \"nVj an 1\\n\"\n    \"Rwk ka 1\\n\"\n    \"nmG an 1\\n\"\n    \"oDt th 1\\n\"\n    \"kPb ka 1\\n\"\n    \"gqW qu 1\\n\"\n    \"Qhf th 1\\n\"\n    \"qZl qu 1\\n\"\n    \"zHq qu 1\\n\"\n    \"iXl in 1\\n\"\n#endif\n};\n#ifndef _MSC_VER\n#pragma GCC diagnostic pop\n#endif\n\ninline const int ksizeofUniversalAmbigsFile = sizeof(kUniversalAmbigsFile);\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_\n"
  },
  {
    "path": "src/classify/adaptive.cpp",
    "content": "/******************************************************************************\n ** Filename:    adaptive.c\n ** Purpose:     Adaptive matcher.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"adaptive.h\"\n\n#include \"classify.h\"\n\n#include <cassert>\n#include <cstdio>\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n/*---------------------------------------------------------------------------*/\n/**\n * This routine adds a new adapted class to an existing\n * set of adapted templates.\n *\n * @param Templates set of templates to add new class to\n * @param Class new class to add to templates\n * @param ClassId class id to associate with new class\n *\n * @note Globals: none\n */\nvoid AddAdaptedClass(ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_CLASS_STRUCT *Class, CLASS_ID ClassId) {\n  assert(Templates != nullptr);\n  assert(Class != nullptr);\n  assert(LegalClassId(ClassId));\n  assert(UnusedClassIdIn(Templates->Templates, ClassId));\n  assert(Class->NumPermConfigs == 0);\n\n  auto IntClass = new INT_CLASS_STRUCT(1, 1);\n  AddIntClass(Templates->Templates, ClassId, IntClass);\n\n  
assert(Templates->Class[ClassId] == nullptr);\n  Templates->Class[ClassId] = Class;\n\n} /* AddAdaptedClass */\n\n/*---------------------------------------------------------------------------*/\n\nPERM_CONFIG_STRUCT::~PERM_CONFIG_STRUCT() {\n  delete[] Ambigs;\n}\n\nADAPT_CLASS_STRUCT::ADAPT_CLASS_STRUCT() :\n  NumPermConfigs(0),\n  MaxNumTimesSeen(0),\n  PermProtos(NewBitVector(MAX_NUM_PROTOS)),\n  PermConfigs(NewBitVector(MAX_NUM_CONFIGS)),\n  TempProtos(NIL_LIST) {\n  zero_all_bits(PermProtos, WordsInVectorOfSize(MAX_NUM_PROTOS));\n  zero_all_bits(PermConfigs, WordsInVectorOfSize(MAX_NUM_CONFIGS));\n\n  for (int i = 0; i < MAX_NUM_CONFIGS; i++) {\n    TempConfigFor(this, i) = nullptr;\n  }\n}\n\nADAPT_CLASS_STRUCT::~ADAPT_CLASS_STRUCT() {\n  for (int i = 0; i < MAX_NUM_CONFIGS; i++) {\n    if (ConfigIsPermanent(this, i) && PermConfigFor(this, i) != nullptr) {\n      delete PermConfigFor(this, i);\n    } else if (!ConfigIsPermanent(this, i) && TempConfigFor(this, i) != nullptr) {\n      delete TempConfigFor(this, i);\n    }\n  }\n  FreeBitVector(PermProtos);\n  FreeBitVector(PermConfigs);\n  auto list = TempProtos;\n  while (list != nullptr) {\n    delete reinterpret_cast<TEMP_PROTO_STRUCT *>(list->node);\n    list = pop(list);\n  }\n}\n\n/// Constructor for adapted templates.\n/// Add an empty class for each char in unicharset to the newly created templates.\nADAPT_TEMPLATES_STRUCT::ADAPT_TEMPLATES_STRUCT(UNICHARSET &unicharset) {\n  Templates = new INT_TEMPLATES_STRUCT;\n  NumPermClasses = 0;\n  NumNonEmptyClasses = 0;\n\n  /* Insert an empty class for each unichar id in unicharset */\n  for (unsigned i = 0; i < MAX_NUM_CLASSES; i++) {\n    Class[i] = nullptr;\n    if (i < unicharset.size()) {\n      AddAdaptedClass(this, new ADAPT_CLASS_STRUCT, i);\n    }\n  }\n}\n\nADAPT_TEMPLATES_STRUCT::~ADAPT_TEMPLATES_STRUCT() {\n  for (unsigned i = 0; i < (Templates)->NumClasses; i++) {\n    delete Class[i];\n  }\n  delete Templates;\n}\n\n// Returns FontinfoId of the 
given config of the given adapted class.\nint Classify::GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId) {\n  return (ConfigIsPermanent(Class, ConfigId) ? PermConfigFor(Class, ConfigId)->FontinfoId\n                                             : TempConfigFor(Class, ConfigId)->FontinfoId);\n}\n\n/// This constructor allocates and returns a new temporary config.\n///\n/// @param MaxProtoId  max id of any proto in new config\n/// @param FontinfoId font information from pre-trained templates\nTEMP_CONFIG_STRUCT::TEMP_CONFIG_STRUCT(int maxProtoId, int fontinfoId) :\n  NumTimesSeen(1),\n  ProtoVectorSize(WordsInVectorOfSize(maxProtoId + 1)),\n  MaxProtoId(maxProtoId),\n  Protos(NewBitVector(maxProtoId + 1)),\n  FontinfoId(fontinfoId) {\n  zero_all_bits(Protos, ProtoVectorSize);\n}\n\nTEMP_CONFIG_STRUCT::~TEMP_CONFIG_STRUCT() {\n  FreeBitVector(Protos);\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine prints a summary of the adapted templates\n *  in Templates to File.\n *\n * @param File    open text file to print Templates to\n * @param Templates adapted templates to print to File\n *\n * @note Globals: none\n */\nvoid Classify::PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) {\n  INT_CLASS_STRUCT *IClass;\n  ADAPT_CLASS_STRUCT *AClass;\n\n  fprintf(File, \"\\n\\nSUMMARY OF ADAPTED TEMPLATES:\\n\\n\");\n  fprintf(File, \"Num classes = %d;  Num permanent classes = %d\\n\\n\", Templates->NumNonEmptyClasses,\n          Templates->NumPermClasses);\n  fprintf(File, \"   Id  NC NPC  NP NPP\\n\");\n  fprintf(File, \"------------------------\\n\");\n\n  for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {\n    IClass = Templates->Templates->Class[i];\n    AClass = Templates->Class[i];\n    if (!IsEmptyAdaptedClass(AClass)) {\n      fprintf(File, \"%5u  %s %3d %3d %3d %3zd\\n\", i, unicharset.id_to_unichar(i), IClass->NumConfigs,\n              AClass->NumPermConfigs, 
IClass->NumProtos,\n              IClass->NumProtos - AClass->TempProtos->size());\n    }\n  }\n  fprintf(File, \"\\n\");\n\n} /* PrintAdaptedTemplates */\n\n/*---------------------------------------------------------------------------*/\n/**\n * Read an adapted class description from file and return\n * a ptr to the adapted class.\n *\n * @param fp open file to read adapted class from\n * @return Ptr to new adapted class.\n *\n * @note Globals: none\n */\nADAPT_CLASS_STRUCT *ReadAdaptedClass(TFile *fp) {\n  int NumTempProtos;\n  int NumConfigs;\n  int i;\n  ADAPT_CLASS_STRUCT *Class;\n\n  /* first read high level adapted class structure */\n  Class = new ADAPT_CLASS_STRUCT;\n  fp->FRead(Class, sizeof(ADAPT_CLASS_STRUCT), 1);\n\n  /* then read in the definitions of the permanent protos and configs */\n  Class->PermProtos = NewBitVector(MAX_NUM_PROTOS);\n  Class->PermConfigs = NewBitVector(MAX_NUM_CONFIGS);\n  fp->FRead(Class->PermProtos, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_PROTOS));\n  fp->FRead(Class->PermConfigs, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_CONFIGS));\n\n  /* then read in the list of temporary protos */\n  fp->FRead(&NumTempProtos, sizeof(int), 1);\n  Class->TempProtos = NIL_LIST;\n  for (i = 0; i < NumTempProtos; i++) {\n    auto TempProto = new TEMP_PROTO_STRUCT;\n    fp->FRead(TempProto, sizeof(TEMP_PROTO_STRUCT), 1);\n    Class->TempProtos = push_last(Class->TempProtos, TempProto);\n  }\n\n  /* then read in the adapted configs */\n  fp->FRead(&NumConfigs, sizeof(int), 1);\n  for (i = 0; i < NumConfigs; i++) {\n    if (test_bit(Class->PermConfigs, i)) {\n      Class->Config[i].Perm = ReadPermConfig(fp);\n    } else {\n      Class->Config[i].Temp = ReadTempConfig(fp);\n    }\n  }\n\n  return (Class);\n\n} /* ReadAdaptedClass */\n\n/*---------------------------------------------------------------------------*/\n/**\n * Read a set of adapted templates from file and return\n * a ptr to the templates.\n *\n * @param fp open text file to 
read adapted templates from\n * @return Ptr to adapted templates read from file.\n *\n * @note Globals: none\n */\nADAPT_TEMPLATES_STRUCT *Classify::ReadAdaptedTemplates(TFile *fp) {\n  auto Templates = new ADAPT_TEMPLATES_STRUCT;\n\n  /* first read the high level adaptive template struct */\n  fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);\n\n  /* then read in the basic integer templates */\n  Templates->Templates = ReadIntTemplates(fp);\n\n  /* then read in the adaptive info for each class */\n  for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {\n    Templates->Class[i] = ReadAdaptedClass(fp);\n  }\n  return (Templates);\n\n} /* ReadAdaptedTemplates */\n\n/*---------------------------------------------------------------------------*/\n/**\n * Read a permanent configuration description from file\n * and return a ptr to it.\n *\n * @param fp open file to read permanent config from\n * @return Ptr to new permanent configuration description.\n *\n * @note Globals: none\n */\nPERM_CONFIG_STRUCT *ReadPermConfig(TFile *fp) {\n  auto Config = new PERM_CONFIG_STRUCT;\n  uint8_t NumAmbigs;\n  fp->FRead(&NumAmbigs, sizeof(NumAmbigs), 1);\n  Config->Ambigs = new UNICHAR_ID[NumAmbigs + 1];\n  fp->FRead(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs);\n  Config->Ambigs[NumAmbigs] = -1;\n  fp->FRead(&(Config->FontinfoId), sizeof(int), 1);\n\n  return (Config);\n\n} /* ReadPermConfig */\n\n/*---------------------------------------------------------------------------*/\n/**\n * Read a temporary configuration description from file\n * and return a ptr to it.\n *\n * @param fp open file to read temporary config from\n * @return Ptr to new temporary configuration description.\n *\n * @note Globals: none\n */\nTEMP_CONFIG_STRUCT *ReadTempConfig(TFile *fp) {\n  auto Config = new TEMP_CONFIG_STRUCT;\n  fp->FRead(Config, sizeof(TEMP_CONFIG_STRUCT), 1);\n\n  Config->Protos = NewBitVector(Config->ProtoVectorSize * BITSINLONG);\n  fp->FRead(Config->Protos, 
sizeof(uint32_t), Config->ProtoVectorSize);\n\n  return (Config);\n\n} /* ReadTempConfig */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine writes a binary representation of Class\n * to File.\n *\n * @param File    open file to write Class to\n * @param Class   adapted class to write to File\n * @param NumConfigs  number of configs in Class\n *\n * @note Globals: none\n */\nvoid WriteAdaptedClass(FILE *File, ADAPT_CLASS_STRUCT *Class, int NumConfigs) {\n  /* first write high level adapted class structure */\n  fwrite(Class, sizeof(ADAPT_CLASS_STRUCT), 1, File);\n\n  /* then write out the definitions of the permanent protos and configs */\n  fwrite(Class->PermProtos, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_PROTOS), File);\n  fwrite(Class->PermConfigs, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_CONFIGS), File);\n\n  /* then write out the list of temporary protos */\n  uint32_t NumTempProtos = Class->TempProtos->size();\n  fwrite(&NumTempProtos, sizeof(NumTempProtos), 1, File);\n  auto TempProtos = Class->TempProtos;\n  iterate(TempProtos) {\n    void *proto = TempProtos->node;\n    fwrite(proto, sizeof(TEMP_PROTO_STRUCT), 1, File);\n  }\n\n  /* then write out the adapted configs */\n  fwrite(&NumConfigs, sizeof(int), 1, File);\n  for (int i = 0; i < NumConfigs; i++) {\n    if (test_bit(Class->PermConfigs, i)) {\n      WritePermConfig(File, Class->Config[i].Perm);\n    } else {\n      WriteTempConfig(File, Class->Config[i].Temp);\n    }\n  }\n\n} /* WriteAdaptedClass */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine saves Templates to File in a binary format.\n *\n * @param File    open text file to write Templates to\n * @param Templates set of adapted templates to write to File\n *\n * @note Globals: none\n */\nvoid Classify::WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) {\n  /* first write the high level adaptive 
template struct */\n  fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);\n\n  /* then write out the basic integer templates */\n  WriteIntTemplates(File, Templates->Templates, unicharset);\n\n  /* then write out the adaptive info for each class */\n  for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {\n    WriteAdaptedClass(File, Templates->Class[i], Templates->Templates->Class[i]->NumConfigs);\n  }\n} /* WriteAdaptedTemplates */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine writes a binary representation of a\n * permanent configuration to File.\n *\n * @param File  open file to write Config to\n * @param Config  permanent config to write to File\n *\n * @note Globals: none\n */\nvoid WritePermConfig(FILE *File, PERM_CONFIG_STRUCT *Config) {\n  uint8_t NumAmbigs = 0;\n\n  assert(Config != nullptr);\n  while (Config->Ambigs[NumAmbigs] > 0) {\n    ++NumAmbigs;\n  }\n\n  fwrite(&NumAmbigs, sizeof(uint8_t), 1, File);\n  fwrite(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs, File);\n  fwrite(&(Config->FontinfoId), sizeof(int), 1, File);\n} /* WritePermConfig */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine writes a binary representation of a\n * temporary configuration to File.\n *\n * @param File  open file to write Config to\n * @param Config  temporary config to write to File\n *\n * @note Globals: none\n */\nvoid WriteTempConfig(FILE *File, TEMP_CONFIG_STRUCT *Config) {\n  assert(Config != nullptr);\n\n  fwrite(Config, sizeof(TEMP_CONFIG_STRUCT), 1, File);\n  fwrite(Config->Protos, sizeof(uint32_t), Config->ProtoVectorSize, File);\n\n} /* WriteTempConfig */\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/adaptive.h",
    "content": "/******************************************************************************\n ** Filename:   adaptive.h\n ** Purpose:    Interface to adaptive matcher.\n ** Author:     Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n#ifndef ADAPTIVE_H\n#define ADAPTIVE_H\n\n#include \"intproto.h\"\n#include \"oldlist.h\"\n\n#include <cstdio>\n\nnamespace tesseract {\n\nstruct TEMP_PROTO_STRUCT {\n  uint16_t ProtoId;\n  PROTO_STRUCT Proto;\n};\n\nstruct TEMP_CONFIG_STRUCT {\n  TEMP_CONFIG_STRUCT() = default;\n  TEMP_CONFIG_STRUCT(int MaxProtoId, int FontinfoId);\n  ~TEMP_CONFIG_STRUCT();\n  uint8_t NumTimesSeen;\n  uint8_t ProtoVectorSize;\n  PROTO_ID MaxProtoId;\n  BIT_VECTOR Protos;\n  int FontinfoId; // font information inferred from pre-trained templates\n};\n\nstruct PERM_CONFIG_STRUCT {\n  PERM_CONFIG_STRUCT() = default;\n  ~PERM_CONFIG_STRUCT();\n  UNICHAR_ID *Ambigs;\n  int FontinfoId; // font information inferred from pre-trained templates\n};\n\nunion ADAPTED_CONFIG {\n  TEMP_CONFIG_STRUCT *Temp;\n  PERM_CONFIG_STRUCT *Perm;\n};\n\nstruct ADAPT_CLASS_STRUCT {\n  ADAPT_CLASS_STRUCT();\n  ~ADAPT_CLASS_STRUCT();\n  uint8_t NumPermConfigs;\n  uint8_t MaxNumTimesSeen; // maximum number of times any TEMP_CONFIG_STRUCT was seen\n                           // (cut at 
matcher_min_examples_for_prototyping)\n  BIT_VECTOR PermProtos;\n  BIT_VECTOR PermConfigs;\n  LIST TempProtos;\n  ADAPTED_CONFIG Config[MAX_NUM_CONFIGS];\n};\n\nclass ADAPT_TEMPLATES_STRUCT {\npublic:\n  ADAPT_TEMPLATES_STRUCT() = default;\n  ADAPT_TEMPLATES_STRUCT(UNICHARSET &unicharset);\n  ~ADAPT_TEMPLATES_STRUCT();\n  INT_TEMPLATES_STRUCT *Templates;\n  int NumNonEmptyClasses;\n  uint8_t NumPermClasses;\n  ADAPT_CLASS_STRUCT *Class[MAX_NUM_CLASSES];\n};\n\n/*----------------------------------------------------------------------------\n          Public Function Prototypes\n----------------------------------------------------------------------------*/\n#define NumNonEmptyClassesIn(Template) ((Template)->NumNonEmptyClasses)\n\n#define IsEmptyAdaptedClass(Class) ((Class)->NumPermConfigs == 0 && (Class)->TempProtos == NIL_LIST)\n\n#define ConfigIsPermanent(Class, ConfigId) (test_bit((Class)->PermConfigs, ConfigId))\n\n#define MakeConfigPermanent(Class, ConfigId) (SET_BIT((Class)->PermConfigs, ConfigId))\n\n#define MakeProtoPermanent(Class, ProtoId) (SET_BIT((Class)->PermProtos, ProtoId))\n\n#define TempConfigFor(Class, ConfigId) ((Class)->Config[ConfigId].Temp)\n\n#define PermConfigFor(Class, ConfigId) ((Class)->Config[ConfigId].Perm)\n\n#define IncreaseConfidence(TempConfig) ((TempConfig)->NumTimesSeen++)\n\nvoid AddAdaptedClass(ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_CLASS_STRUCT *Class, CLASS_ID ClassId);\n\nADAPT_CLASS_STRUCT *ReadAdaptedClass(tesseract::TFile *File);\n\nPERM_CONFIG_STRUCT *ReadPermConfig(tesseract::TFile *File);\n\nTEMP_CONFIG_STRUCT *ReadTempConfig(tesseract::TFile *File);\n\nvoid WriteAdaptedClass(FILE *File, ADAPT_CLASS_STRUCT *Class, int NumConfigs);\n\nvoid WritePermConfig(FILE *File, PERM_CONFIG_STRUCT *Config);\n\nvoid WriteTempConfig(FILE *File, TEMP_CONFIG_STRUCT *Config);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/adaptmatch.cpp",
    "content": "/******************************************************************************\n ** Filename:    adaptmatch.cpp\n ** Purpose:     High level adaptive matcher.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n/*-----------------------------------------------------------------------------\n          Include Files and Type Defines\n-----------------------------------------------------------------------------*/\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"adaptive.h\"        // for ADAPT_CLASS\n#include \"ambigs.h\"          // for UnicharIdVector, UnicharAmbigs\n#include \"bitvec.h\"          // for FreeBitVector, NewBitVector, BIT_VECTOR\n#include \"blobs.h\"           // for TBLOB, TWERD\n#include \"classify.h\"        // for Classify, CST_FRAGMENT, CST_WHOLE\n#include \"dict.h\"            // for Dict\n#include \"errcode.h\"         // for ASSERT_HOST\n#include \"featdefs.h\"        // for CharNormDesc\n#include \"float2int.h\"       // for BASELINE_Y_SHIFT\n#include \"fontinfo.h\"        // for ScoredFont, FontSet\n#include \"intfx.h\"           // for BlobToTrainingSample, INT_FX_RESULT_S...\n#include \"intmatcher.h\"      // for CP_RESULT_STRUCT, IntegerMatcher\n#include \"intproto.h\"        // for INT_FEATURE_STRUCT, (anonymous), 
Clas...\n#include \"matchdefs.h\"       // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO\n#include \"mfoutline.h\"       // for baseline, character, MF_SCALE_FACTOR\n#include \"normalis.h\"        // for DENORM, kBlnBaselineOffset, kBlnXHeight\n#include \"normfeat.h\"        // for ActualOutlineLength, CharNormLength\n#include \"ocrfeatures.h\"     // for FEATURE_STRUCT, FEATURE\n#include \"oldlist.h\"         // for push, delete_d\n#include \"outfeat.h\"         // for OutlineFeatDir, OutlineFeatLength\n#include \"pageres.h\"         // for WERD_RES\n#include \"params.h\"          // for IntParam, BoolParam, DoubleParam, Str...\n#include \"picofeat.h\"        // for PicoFeatDir, PicoFeatX, PicoFeatY\n#include \"protos.h\"          // for PROTO_STRUCT, FillABC\n#include \"ratngs.h\"          // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO...\n#include \"rect.h\"            // for TBOX\n#include \"scrollview.h\"      // for ScrollView, ScrollView::BROWN, Scroll...\n#include \"seam.h\"            // for SEAM\n#include \"shapeclassifier.h\" // for ShapeClassifier\n#include \"shapetable.h\"      // for UnicharRating, ShapeTable, Shape, Uni...\n#include \"tessclassifier.h\"  // for TessClassifier\n#include \"tessdatamanager.h\" // for TessdataManager, TESSDATA_INTTEMP\n#include \"tprintf.h\"         // for tprintf\n#include \"trainingsample.h\"  // for TrainingSample\n#include \"unicharset.h\"      // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE\n#include \"unicity_table.h\"   // for UnicityTable\n\n#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID\n#include \"helpers.h\"           // for IntCastRounded, ClipToRange\n#include \"serialis.h\"          // for TFile\n\n#include <algorithm> // for max, min\n#include <cassert>   // for assert\n#include <cmath>     // for fabs\n#include <cstdint>   // for INT32_MAX, UINT8_MAX\n#include <cstdio>    // for fflush, fclose, fopen, stdout, FILE\n#include <cstring>   // for strstr, memset, strcmp\n\nnamespace 
tesseract {\n\n// TODO: The parameter classify_enable_adaptive_matcher can cause\n// a segmentation fault if it is set to false (issue #256),\n// so override it here.\n#define CLASSIFY_ENABLE_ADAPTIVE_MATCHER_OVERRIDE true\n\n#define ADAPT_TEMPLATE_SUFFIX \".a\"\n\n#define MAX_MATCHES 10\n#define UNLIKELY_NUM_FEAT 200\n#define NO_DEBUG 0\n#define MAX_ADAPTABLE_WERD_SIZE 40\n\n#define ADAPTABLE_WERD_ADJUSTMENT (0.05)\n\n#define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)\n\n#define WORST_POSSIBLE_RATING (0.0f)\n\nstruct ADAPT_RESULTS {\n  int32_t BlobLength;\n  bool HasNonfragment;\n  UNICHAR_ID best_unichar_id;\n  int best_match_index;\n  float best_rating;\n  std::vector<UnicharRating> match;\n  std::vector<CP_RESULT_STRUCT> CPResults;\n\n  /// Initializes data members to the default values. Sets the initial\n  /// rating of each class to be the worst possible rating (1.0).\n  inline void Initialize() {\n    BlobLength = INT32_MAX;\n    HasNonfragment = false;\n    ComputeBest();\n  }\n  // Computes best_unichar_id, best_match_index and best_rating.\n  void ComputeBest() {\n    best_unichar_id = INVALID_UNICHAR_ID;\n    best_match_index = -1;\n    best_rating = WORST_POSSIBLE_RATING;\n    for (unsigned i = 0; i < match.size(); ++i) {\n      if (match[i].rating > best_rating) {\n        best_rating = match[i].rating;\n        best_unichar_id = match[i].unichar_id;\n        best_match_index = i;\n      }\n    }\n  }\n};\n\nstruct PROTO_KEY {\n  ADAPT_TEMPLATES_STRUCT *Templates;\n  CLASS_ID ClassId;\n  int ConfigId;\n};\n\n// Sort function to sort ratings appropriately by descending rating.\nstatic bool SortDescendingRating(const UnicharRating &a, const UnicharRating &b) {\n  if (a.rating != b.rating) {\n    return a.rating > b.rating;\n  } else {\n    return a.unichar_id < b.unichar_id;\n  }\n}\n\n/*-----------------------------------------------------------------------------\n          Private 
Macros\n-----------------------------------------------------------------------------*/\ninline bool MarginalMatch(float confidence, float matcher_great_threshold) {\n  return (1.0f - confidence) > matcher_great_threshold;\n}\n\n/*-----------------------------------------------------------------------------\n          Private Function Prototypes\n-----------------------------------------------------------------------------*/\n// Returns the index of the given id in results, if present, or the size of the\n// vector (index it will go at) if not present.\nstatic unsigned FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {\n  for (unsigned i = 0; i < results.match.size(); i++) {\n    if (results.match[i].unichar_id == id) {\n      return i;\n    }\n  }\n  return results.match.size();\n}\n\n// Returns the current rating for a unichar id if we have rated it, defaulting\n// to WORST_POSSIBLE_RATING.\nstatic float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {\n  unsigned index = FindScoredUnichar(id, results);\n  if (index >= results.match.size()) {\n    return WORST_POSSIBLE_RATING;\n  }\n  return results.match[index].rating;\n}\n\nvoid InitMatcherRatings(float *Rating);\n\nint MakeTempProtoPerm(void *item1, void *item2);\n\nvoid SetAdaptiveThreshold(float Threshold);\n\n/*-----------------------------------------------------------------------------\n              Public Code\n-----------------------------------------------------------------------------*/\n/**\n * This routine calls the adaptive matcher\n * which returns (in an array) the class id of each\n * class matched.\n *\n * It also returns the number of classes matched.\n * For each class matched it places the best rating\n * found for that class into the Ratings array.\n *\n * Bad matches are then removed so that they don't\n * need to be sorted.  
The remaining good matches are\n * then sorted and converted to choices.\n *\n * This routine also performs some simple speckle\n * filtering.\n *\n * @param Blob    blob to be classified\n * @param[out] Choices    List of choices found by adaptive matcher.\n * filled on return with the choices found by the\n * class pruner and the ratings there from. Also\n * contains the detailed results of the integer matcher.\n *\n */\nvoid Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {\n  assert(Choices != nullptr);\n  auto *Results = new ADAPT_RESULTS;\n  Results->Initialize();\n\n  ASSERT_HOST(AdaptedTemplates != nullptr);\n\n  DoAdaptiveMatch(Blob, Results);\n\n  RemoveBadMatches(Results);\n  std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);\n  RemoveExtraPuncs(Results);\n  Results->ComputeBest();\n  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, Choices);\n\n  // TODO(rays) Move to before ConvertMatchesToChoices!\n  if (LargeSpeckle(*Blob) || Choices->empty()) {\n    AddLargeSpeckleTo(Results->BlobLength, Choices);\n  }\n\n  if (matcher_debug_level >= 1) {\n    tprintf(\"AD Matches =  \");\n    PrintAdaptiveMatchResults(*Results);\n  }\n\n#ifndef GRAPHICS_DISABLED\n  if (classify_enable_adaptive_debugger) {\n    DebugAdaptiveClassifier(Blob, Results);\n  }\n#endif\n\n  delete Results;\n} /* AdaptiveClassifier */\n\n#ifndef GRAPHICS_DISABLED\n\n// If *win is nullptr, sets it to a new ScrollView() object with title msg.\n// Clears the window and draws baselines.\nvoid Classify::RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset,\n                                  const TBOX &wbox) {\n  const int kSampleSpaceWidth = 500;\n  if (*win == nullptr) {\n    *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200, kSampleSpaceWidth * 2,\n                          200, true);\n  }\n  (*win)->Clear();\n  (*win)->Pen(64, 64, 64);\n  (*win)->Line(-kSampleSpaceWidth, 
kBlnBaselineOffset, kSampleSpaceWidth, kBlnBaselineOffset);\n  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset, kSampleSpaceWidth,\n               kBlnXHeight + kBlnBaselineOffset);\n  (*win)->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Learns the given word using its chopped_word, seam_array, denorm,\n// box_word, best_state, and correct_text to learn both correctly and\n// incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob\n// is called and the data will be saved in an internal buffer.\n// Otherwise AdaptToBlob is called for adaption within a document.\nvoid Classify::LearnWord(const char *fontname, WERD_RES *word) {\n  int word_len = word->correct_text.size();\n  if (word_len == 0) {\n    return;\n  }\n\n  float *thresholds = nullptr;\n  if (fontname == nullptr) {\n    // Adaption mode.\n    if (!EnableLearning || word->best_choice == nullptr) {\n      return; // Can't or won't adapt.\n    }\n\n    if (classify_learning_debug_level >= 1) {\n      tprintf(\"\\n\\nAdapting to word = %s\\n\", word->best_choice->debug_string().c_str());\n    }\n    thresholds = new float[word_len];\n    word->ComputeAdaptionThresholds(getDict().certainty_scale, matcher_perfect_threshold,\n                                    matcher_good_threshold, matcher_rating_margin, thresholds);\n  }\n  int start_blob = 0;\n\n#ifndef GRAPHICS_DISABLED\n  if (classify_debug_character_fragments) {\n    if (learn_fragmented_word_debug_win_ != nullptr) {\n      learn_fragmented_word_debug_win_->Wait();\n    }\n    RefreshDebugWindow(&learn_fragments_debug_win_, \"LearnPieces\", 400,\n                       word->chopped_word->bounding_box());\n    RefreshDebugWindow(&learn_fragmented_word_debug_win_, \"LearnWord\", 200,\n                       word->chopped_word->bounding_box());\n    word->chopped_word->plot(learn_fragmented_word_debug_win_);\n    ScrollView::Update();\n  }\n#endif // 
!GRAPHICS_DISABLED\n\n  for (int ch = 0; ch < word_len; ++ch) {\n    if (classify_debug_character_fragments) {\n      tprintf(\"\\nLearning %s\\n\", word->correct_text[ch].c_str());\n    }\n    if (word->correct_text[ch].length() > 0) {\n      float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;\n\n      LearnPieces(fontname, start_blob, word->best_state[ch], threshold, CST_WHOLE,\n                  word->correct_text[ch].c_str(), word);\n\n      if (word->best_state[ch] > 1 && !disable_character_fragments) {\n        // Check that the character breaks into meaningful fragments\n        // that each match a whole character with at least\n        // classify_character_fragments_garbage_certainty_threshold\n        bool garbage = false;\n        int frag;\n        for (frag = 0; frag < word->best_state[ch]; ++frag) {\n          TBLOB *frag_blob = word->chopped_word->blobs[start_blob + frag];\n          if (classify_character_fragments_garbage_certainty_threshold < 0) {\n            garbage |= LooksLikeGarbage(frag_blob);\n          }\n        }\n        // Learn the fragments.\n        if (!garbage) {\n          bool pieces_all_natural = word->PiecesAllNatural(start_blob, word->best_state[ch]);\n          if (pieces_all_natural || !prioritize_division) {\n            for (frag = 0; frag < word->best_state[ch]; ++frag) {\n              std::vector<std::string> tokens = split(word->correct_text[ch], ' ');\n\n              tokens[0] = CHAR_FRAGMENT::to_string(tokens[0].c_str(), frag, word->best_state[ch],\n                                                   pieces_all_natural);\n\n              std::string full_string;\n              for (unsigned i = 0; i < tokens.size(); i++) {\n                full_string += tokens[i];\n                if (i != tokens.size() - 1) {\n                  full_string += ' ';\n                }\n              }\n              LearnPieces(fontname, start_blob + frag, 1, threshold, CST_FRAGMENT,\n                          
full_string.c_str(), word);\n            }\n          }\n        }\n      }\n\n      // TODO(rays): re-enable this part of the code when we switch to the\n      // new classifier that needs to see examples of garbage.\n      /*\nif (word->best_state[ch] > 1) {\n  // If the next blob is good, make junk with the rightmost fragment.\n  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {\n    LearnPieces(fontname, start_blob + word->best_state[ch] - 1,\n                word->best_state[ch + 1] + 1,\n                threshold, CST_IMPROPER, INVALID_UNICHAR, word);\n  }\n  // If the previous blob is good, make junk with the leftmost fragment.\n  if (ch > 0 && word->correct_text[ch - 1].length() > 0) {\n    LearnPieces(fontname, start_blob - word->best_state[ch - 1],\n                word->best_state[ch - 1] + 1,\n                threshold, CST_IMPROPER, INVALID_UNICHAR, word);\n  }\n}\n// If the next blob is good, make a join with it.\nif (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {\n  std::string joined_text = word->correct_text[ch];\n  joined_text += word->correct_text[ch + 1];\n  LearnPieces(fontname, start_blob,\n              word->best_state[ch] + word->best_state[ch + 1],\n              threshold, CST_NGRAM, joined_text.c_str(), word);\n}\n*/\n    }\n    start_blob += word->best_state[ch];\n  }\n  delete[] thresholds;\n} // LearnWord.\n\n// Builds a blob of length fragments, from the word, starting at start,\n// and then learns it, as having the given correct_text.\n// If fontname is not nullptr, then LearnBlob is called and the data will be\n// saved in an internal buffer for static training.\n// Otherwise AdaptToBlob is called for adaption within a document.\n// threshold is a magic number required by AdaptToChar and generated by\n// ComputeAdaptionThresholds.\n// Although it can be partly inferred from the string, segmentation is\n// provided to explicitly clarify the character segmentation.\nvoid Classify::LearnPieces(const 
char *fontname, int start, int length, float threshold,\n                           CharSegmentationType segmentation, const char *correct_text,\n                           WERD_RES *word) {\n  // TODO(daria) Remove/modify this if/when we want\n  // to train and/or adapt to n-grams.\n  if (segmentation != CST_WHOLE && (segmentation != CST_FRAGMENT || disable_character_fragments)) {\n    return;\n  }\n\n  if (length > 1) {\n    SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);\n  }\n  TBLOB *blob = word->chopped_word->blobs[start];\n  // Rotate the blob if needed for classification.\n  TBLOB *rotated_blob = blob->ClassifyNormalizeIfNeeded();\n  if (rotated_blob == nullptr) {\n    rotated_blob = blob;\n  }\n\n#ifndef GRAPHICS_DISABLED\n  // Draw debug windows showing the blob that is being learned if needed.\n  if (strcmp(classify_learn_debug_str.c_str(), correct_text) == 0) {\n    RefreshDebugWindow(&learn_debug_win_, \"LearnPieces\", 600, word->chopped_word->bounding_box());\n    rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);\n    learn_debug_win_->Update();\n    learn_debug_win_->Wait();\n  }\n  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {\n    ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord\n    blob->plot(learn_fragments_debug_win_, ScrollView::BLUE, ScrollView::BROWN);\n    learn_fragments_debug_win_->Update();\n  }\n#endif // !GRAPHICS_DISABLED\n\n  if (fontname != nullptr) {\n    classify_norm_method.set_value(character); // force char norm spc 30/11/93\n    tess_bn_matching.set_value(false);         // turn it off\n    tess_cn_matching.set_value(false);\n    DENORM bl_denorm, cn_denorm;\n    INT_FX_RESULT_STRUCT fx_info;\n    SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, &bl_denorm, &cn_denorm, &fx_info);\n    LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);\n  } else if 
(unicharset.contains_unichar(correct_text)) {\n    UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);\n    int font_id = word->fontinfo != nullptr ? fontinfo_table_.get_index(*word->fontinfo) : 0;\n    if (classify_learning_debug_level >= 1) {\n      tprintf(\"Adapting to char = %s, thr= %g font_id= %d\\n\", unicharset.id_to_unichar(class_id),\n              threshold, font_id);\n    }\n    // If filename is not nullptr we are doing recognition\n    // (as opposed to training), so we must have already set word fonts.\n    AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);\n    if (BackupAdaptedTemplates != nullptr) {\n      // Adapt the backup templates too. They will be used if the primary gets\n      // too full.\n      AdaptToChar(rotated_blob, class_id, font_id, threshold, BackupAdaptedTemplates);\n    }\n  } else if (classify_debug_level >= 1) {\n    tprintf(\"Can't adapt to %s not in unicharset\\n\", correct_text);\n  }\n  if (rotated_blob != blob) {\n    delete rotated_blob;\n  }\n\n  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);\n} // LearnPieces.\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine performs cleanup operations\n * on the adaptive classifier.  It should be called\n * before the program is terminated.  
Its main function\n * is to save the adapted templates to a file.\n *\n * Globals:\n * - #AdaptedTemplates current set of adapted templates\n * - #classify_save_adapted_templates true if templates should be saved\n * - #classify_enable_adaptive_matcher true if adaptive matcher is enabled\n */\nvoid Classify::EndAdaptiveClassifier() {\n  std::string Filename;\n  FILE *File;\n\n  if (AdaptedTemplates != nullptr && CLASSIFY_ENABLE_ADAPTIVE_MATCHER_OVERRIDE &&\n      classify_save_adapted_templates) {\n    Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;\n    File = fopen(Filename.c_str(), \"wb\");\n    if (File == nullptr) {\n      tprintf(\"Unable to save adapted templates to %s!\\n\", Filename.c_str());\n    } else {\n      tprintf(\"\\nSaving adapted templates to %s ...\", Filename.c_str());\n      fflush(stdout);\n      WriteAdaptedTemplates(File, AdaptedTemplates);\n      tprintf(\"\\n\");\n      fclose(File);\n    }\n  }\n\n  delete AdaptedTemplates;\n  AdaptedTemplates = nullptr;\n  delete BackupAdaptedTemplates;\n  BackupAdaptedTemplates = nullptr;\n\n  if (PreTrainedTemplates != nullptr) {\n    delete PreTrainedTemplates;\n    PreTrainedTemplates = nullptr;\n  }\n  getDict().EndDangerousAmbigs();\n  FreeNormProtos();\n  if (AllProtosOn != nullptr) {\n    FreeBitVector(AllProtosOn);\n    FreeBitVector(AllConfigsOn);\n    FreeBitVector(AllConfigsOff);\n    FreeBitVector(TempProtoMask);\n    AllProtosOn = nullptr;\n    AllConfigsOn = nullptr;\n    AllConfigsOff = nullptr;\n    TempProtoMask = nullptr;\n  }\n  delete shape_table_;\n  shape_table_ = nullptr;\n  delete static_classifier_;\n  static_classifier_ = nullptr;\n} /* EndAdaptiveClassifier */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine reads in the training\n * information needed by the adaptive classifier\n * and saves it into global variables.\n *  Parameters:\n *      load_pre_trained_templates  Indicates whether the pre-trained\n *               
      templates (inttemp, normproto and pffmtable components)\n *                     should be loaded. Should only be set to true if the\n *                     necessary classifier components are present in the\n *                     [lang].traineddata file.\n *  Globals:\n *      BuiltInTemplatesFile  file to get built-in temps from\n *      BuiltInCutoffsFile    file to get avg. feat per class from\n *      classify_use_pre_adapted_templates\n *                            enables use of pre-adapted templates\n */\nvoid Classify::InitAdaptiveClassifier(TessdataManager *mgr) {\n  if (!CLASSIFY_ENABLE_ADAPTIVE_MATCHER_OVERRIDE) {\n    return;\n  }\n  if (AllProtosOn != nullptr) {\n    EndAdaptiveClassifier(); // Don't leak with multiple inits.\n  }\n\n  // If there is no language_data_path_prefix, the classifier will be\n  // adaptive only.\n  if (language_data_path_prefix.length() > 0 && mgr != nullptr) {\n    TFile fp;\n    ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));\n    PreTrainedTemplates = ReadIntTemplates(&fp);\n\n    if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {\n      shape_table_ = new ShapeTable(unicharset);\n      if (!shape_table_->DeSerialize(&fp)) {\n        tprintf(\"Error loading shape table!\\n\");\n        delete shape_table_;\n        shape_table_ = nullptr;\n      }\n    }\n\n    ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));\n    ReadNewCutoffs(&fp, CharNormCutoffs);\n\n    ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));\n    NormProtos = ReadNormProtos(&fp);\n    static_classifier_ = new TessClassifier(false, this);\n  }\n\n  InitIntegerFX();\n\n  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);\n  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);\n  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);\n  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);\n  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));\n  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));\n  zero_all_bits(AllConfigsOff, 
WordsInVectorOfSize(MAX_NUM_CONFIGS));\n\n  for (uint16_t &BaselineCutoff : BaselineCutoffs) {\n    BaselineCutoff = 0;\n  }\n\n  if (classify_use_pre_adapted_templates) {\n    TFile fp;\n    std::string Filename = imagefile;\n    Filename += ADAPT_TEMPLATE_SUFFIX;\n    if (!fp.Open(Filename.c_str(), nullptr)) {\n      AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);\n    } else {\n      tprintf(\"\\nReading pre-adapted templates from %s ...\\n\", Filename.c_str());\n      fflush(stdout);\n      AdaptedTemplates = ReadAdaptedTemplates(&fp);\n      tprintf(\"\\n\");\n      PrintAdaptedTemplates(stdout, AdaptedTemplates);\n\n      for (unsigned i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {\n        BaselineCutoffs[i] = CharNormCutoffs[i];\n      }\n    }\n  } else {\n    delete AdaptedTemplates;\n    AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);\n  }\n} /* InitAdaptiveClassifier */\n\nvoid Classify::ResetAdaptiveClassifierInternal() {\n  if (classify_learning_debug_level > 0) {\n    tprintf(\"Resetting adaptive classifier (NumAdaptationsFailed=%d)\\n\", NumAdaptationsFailed);\n  }\n  delete AdaptedTemplates;\n  AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);\n  delete BackupAdaptedTemplates;\n  BackupAdaptedTemplates = nullptr;\n  NumAdaptationsFailed = 0;\n}\n\n// If there are backup adapted templates, switches to those, otherwise resets\n// the main adaptive classifier (because it is full.)\nvoid Classify::SwitchAdaptiveClassifier() {\n  if (BackupAdaptedTemplates == nullptr) {\n    ResetAdaptiveClassifierInternal();\n    return;\n  }\n  if (classify_learning_debug_level > 0) {\n    tprintf(\"Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\\n\",\n            NumAdaptationsFailed);\n  }\n  delete AdaptedTemplates;\n  AdaptedTemplates = BackupAdaptedTemplates;\n  BackupAdaptedTemplates = nullptr;\n  NumAdaptationsFailed = 0;\n}\n\n// Resets the backup adaptive classifier to empty.\nvoid 
Classify::StartBackupAdaptiveClassifier() {\n  delete BackupAdaptedTemplates;\n  BackupAdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine prepares the adaptive\n * matcher for the start\n * of the first pass.  Learning is enabled (unless it\n * is disabled for the whole program).\n *\n * @note this is somewhat redundant, it simply says that if learning is\n * enabled then it will remain enabled on the first pass.  If it is\n * disabled, then it will remain disabled.  This is only put here to\n * make it very clear that learning is controlled directly by the global\n * setting of EnableLearning.\n *\n * Globals:\n * - #EnableLearning\n * set to true by this routine\n */\nvoid Classify::SetupPass1() {\n  EnableLearning = classify_enable_learning;\n\n  getDict().SetupStopperPass1();\n\n} /* SetupPass1 */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine prepares the adaptive\n * matcher for the start of the second pass.  
Further
 * learning is disabled.
 *
 * Globals:
 * - #EnableLearning set to false by this routine
 */
void Classify::SetupPass2() {
  EnableLearning = false;
  getDict().SetupStopperPass2();

} /* SetupPass2 */

/*---------------------------------------------------------------------------*/
/**
 * This routine creates a new adapted
 * class and uses Blob as the model for the first
 * config in that class.
 *
 * @param Blob blob to model new class after
 * @param ClassId id of the class to be initialized
 * @param FontinfoId font information inferred from pre-trained templates
 * @param Class adapted class to be initialized
 * @param Templates adapted templates to add new class to
 *
 * Globals:
 * - #AllProtosOn dummy mask with all 1's
 * - BaselineCutoffs kludge needed to get cutoffs
 * - #PreTrainedTemplates kludge needed to get cutoffs
 */
void Classify::InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class,
                                ADAPT_TEMPLATES_STRUCT *Templates) {
  FEATURE_SET Features;
  int Fid, Pid;
  FEATURE Feature;
  int NumFeatures;
  PROTO_STRUCT *Proto;
  INT_CLASS_STRUCT *IClass;
  TEMP_CONFIG_STRUCT *Config;

  // Extract baseline-normalized outline features from the model blob.
  classify_norm_method.set_value(baseline);
  Features = ExtractOutlineFeatures(Blob);
  NumFeatures = Features->NumFeatures;
  // Reject degenerate or implausibly large feature sets outright.
  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
    delete Features;
    return;
  }

  Config = new TEMP_CONFIG_STRUCT(NumFeatures - 1, FontinfoId);
  TempConfigFor(Class, 0) = Config;

  /* this is a kludge to construct cutoffs for adapted templates */
  if (Templates == AdaptedTemplates) {
    BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
  }

  IClass = ClassForClassId(Templates->Templates, ClassId);

  // Turn every outline feature into one proto of the new class, enabling it
  // both in the temp config's proto mask and in the proto pruner.
  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
    Pid = AddIntProto(IClass);
    assert(Pid != NO_PROTO);

    Feature = Features->Features[Fid];
    auto TempProto = new TEMP_PROTO_STRUCT;
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
   ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
   instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Angle = Feature->Params[OutlineFeatDir];
    Proto->X = Feature->Params[OutlineFeatX];
    Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
    Proto->Length = Feature->Params[OutlineFeatLength];
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT(Config->Protos, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);

    Class->TempProtos = push(Class->TempProtos, TempProto);
  }
  delete Features;

  AddIntConfig(IClass);
  ConvertConfig(AllProtosOn, 0, IClass);

  if (classify_learning_debug_level >= 1) {
    tprintf("Added new class '%s' with class id %d and %d protos.\n",
            unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
#ifndef GRAPHICS_DISABLED
    if (classify_learning_debug_level > 1) {
      DisplayAdaptedChar(Blob, IClass);
    }
#endif
  }

  // NOTE(review): temp protos were already pushed onto Class above, so it is
  // unclear whether IsEmptyAdaptedClass can still hold here — verify the
  // macro's definition and the intended NumNonEmptyClasses accounting.
  if (IsEmptyAdaptedClass(Class)) {
    (Templates->NumNonEmptyClasses)++;
  }
} /* InitAdaptedClass */

/*---------------------------------------------------------------------------*/
/**
 * This routine sets up the feature
 * extractor to extract baseline normalized
 * pico-features.
 *
 * The extracted pico-features are converted
 * to integer form and placed in IntFeatures. The
 * original floating-pt. 
features are returned in
 * FloatFeatures.
 *
 * Globals: none
 * @param Blob blob to extract features from
 * @param[out] IntFeatures array to fill with integer features
 * @param[out] FloatFeatures place to return actual floating-pt features;
 *             on success the caller owns this set and must delete it
 *
 * @return Number of pico-features returned (0 if
 * an error occurred)
 */
int Classify::GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures,
                                  FEATURE_SET *FloatFeatures) {
  FEATURE_SET Features;
  int NumFeatures;

  classify_norm_method.set_value(baseline);
  Features = ExtractPicoFeatures(Blob);

  NumFeatures = Features->NumFeatures;
  // On failure the feature set is freed here, so callers only own it when
  // a positive count is returned.
  if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
    delete Features;
    return 0;
  }

  ComputeIntFeatures(Features, IntFeatures);
  *FloatFeatures = Features;

  return NumFeatures;
} /* GetAdaptiveFeatures */

/*-----------------------------------------------------------------------------
              Private Code
-----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/**
 * Return true if the specified word is acceptable for adaptation.
 *
 * Globals: none
 *
 * @param word current word
 *
 * @return true or false
 */
bool Classify::AdaptableWord(WERD_RES *word) {
  if (word->best_choice == nullptr) {
    return false;
  }
  auto BestChoiceLength = word->best_choice->length();
  float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
  return // rules that apply in general - simplest to compute first
      BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() &&
      BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
      // This basically ensures that the word is at least a dictionary match
      // (freq word, user word, system dawg word, etc).
      // Since all the other adjustments will make adjust factor 
higher
      // than adaptable_score=1.1+0.05=1.15,
      // and these are other flags that ensure that the word is a dict word,
      // this check could be at times redundant.
      word->best_choice->adjust_factor() <= adaptable_score &&
      // Make sure that alternative choices are not dictionary words.
      word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
}

/*---------------------------------------------------------------------------*/
/**
 * Adapts the templates for ClassId to the given Blob: either seeds a brand
 * new class from the blob, reinforces the best-matching temporary config,
 * or creates a new temporary config when no config matches well enough.
 *
 * @param Blob blob to add to templates for ClassId
 * @param ClassId class to add blob to
 * @param FontinfoId font information from pre-trained templates
 * @param Threshold minimum match rating to existing template
 * @param adaptive_templates current set of adapted templates
 *
 * Globals:
 * - AllProtosOn dummy mask to match against all protos
 * - AllConfigsOn dummy mask to match against all configs
 */
void Classify::AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold,
                           ADAPT_TEMPLATES_STRUCT *adaptive_templates) {
  int NumFeatures;
  INT_FEATURE_ARRAY IntFeatures;
  UnicharRating int_result;
  INT_CLASS_STRUCT *IClass;
  ADAPT_CLASS_STRUCT *Class;
  TEMP_CONFIG_STRUCT *TempConfig;
  FEATURE_SET FloatFeatures;
  int NewTempConfigId;

  if (!LegalClassId(ClassId)) {
    return;
  }

  int_result.unichar_id = ClassId;
  Class = adaptive_templates->Class[ClassId];
  assert(Class != nullptr);
  if (IsEmptyAdaptedClass(Class)) {
    // No adapted model for this class yet: build one from this blob.
    InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
  } else {
    IClass = ClassForClassId(adaptive_templates->Templates, ClassId);

    NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
    if (NumFeatures <= 0) {
      return; // Features already freed by GetAdaptiveFeatures.
    }

    // Only match configs with the matching font.
    BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
    for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
      if (GetFontinfoId(Class, cfg) == FontinfoId) {
        SET_BIT(MatchingFontConfigs, cfg);
      } else {
        reset_bit(MatchingFontConfigs, cfg);
      }
    }
    im_.Match(IClass, AllProtosOn, MatchingFontConfigs, NumFeatures, IntFeatures, &int_result,
              classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
    FreeBitVector(MatchingFontConfigs);

    SetAdaptiveThreshold(Threshold);

    if (1.0f - int_result.rating <= Threshold) {
      // Good enough match to an existing config: reinforce it.
      if (ConfigIsPermanent(Class, int_result.config)) {
        if (classify_learning_debug_level >= 1) {
          tprintf("Found good match to perm config %d = %4.1f%%.\n", int_result.config,
                  int_result.rating * 100.0);
        }
        delete FloatFeatures;
        return;
      }

      TempConfig = TempConfigFor(Class, int_result.config);
      IncreaseConfidence(TempConfig);
      if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
        Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
      }
      if (classify_learning_debug_level >= 1) {
        tprintf("Increasing reliability of temp config %d to %d.\n", int_result.config,
                TempConfig->NumTimesSeen);
      }

      // Promote the temp config once it has been seen often enough.
      if (TempConfigReliable(ClassId, TempConfig)) {
        MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }
    } else {
      // Poor match: start a new temporary config from this blob.
      if (classify_learning_debug_level >= 1) {
        tprintf("Found poor match to temp config %d = %4.1f%%.\n", int_result.config,
                int_result.rating * 100.0);
#ifndef GRAPHICS_DISABLED
        if (classify_learning_debug_level > 2) {
          DisplayAdaptedChar(Blob, IClass);
        }
#endif
      }
      NewTempConfigId = MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId, NumFeatures,
                                               IntFeatures, FloatFeatures);
      if (NewTempConfigId >= 0 &&
          TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
        MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }

#ifndef GRAPHICS_DISABLED
      if (classify_learning_debug_level > 1) {
        DisplayAdaptedChar(Blob, IClass);
      }
#endif
    }
    delete FloatFeatures;
  }
} /* AdaptToChar */

#ifndef GRAPHICS_DISABLED

// Debug helper: matches blob against int_class and renders the best
// matching config in the match display windows.
void Classify::DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class) {
  INT_FX_RESULT_STRUCT fx_info;
  std::vector<INT_FEATURE_STRUCT> bl_features;
  TrainingSample *sample =
      BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info, &bl_features);
  if (sample == nullptr) {
    return;
  }

  UnicharRating int_result;
  im_.Match(int_class, AllProtosOn, AllConfigsOn, bl_features.size(), &bl_features[0], &int_result,
            classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
  tprintf("Best match to temp config %d = %4.1f%%.\n", int_result.config,
          int_result.rating * 100.0);
  if (classify_learning_debug_level >= 2) {
    uint32_t ConfigMask;
    // Restrict the second, displayed match to just the winning config.
    ConfigMask = 1 << int_result.config;
    ShowMatchDisplay();
    im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask), bl_features.size(),
              &bl_features[0], &int_result, classify_adapt_feature_threshold, 6 | 0x19,
              matcher_debug_separate_windows);
    UpdateMatchDisplay();
  }

  delete sample;
}

#endif

/**
 * This routine adds the result of a classification into
 * Results.  If the new rating is much worse than the current
 * best rating, it is not entered into results because it
 * would end up being stripped later anyway.  If the new rating
 * is better than the old rating for the class, it replaces the
 * old rating.  
If this is the first rating for the class, the
 * class is added to the list of matched classes in Results.
 * If the new rating is better than the best so far, it
 * becomes the best so far.
 *
 * Globals:
 * - #matcher_bad_match_pad defines limits of an acceptable match
 *
 * @param new_result new result to add
 * @param[out] results results to add new result to
 */
void Classify::AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results) {
  // old_match == results->match.size() means the unichar is not yet present.
  auto old_match = FindScoredUnichar(new_result.unichar_id, *results);

  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
      (old_match < results->match.size() &&
       new_result.rating <= results->match[old_match].rating)) {
    return; // New one not good enough.
  }

  if (!unicharset.get_fragment(new_result.unichar_id)) {
    results->HasNonfragment = true;
  }

  if (old_match < results->match.size()) {
    results->match[old_match].rating = new_result.rating;
  } else {
    // After this push_back, old_match is the index of the new entry.
    results->match.push_back(new_result);
  }

  if (new_result.rating > results->best_rating &&
      // Ensure that fragments do not affect best rating, class and config.
      // This is needed so that at least one non-fragmented character is
      // always present in the results.
      // TODO(daria): verify that this helps accuracy and does not
      // hurt performance.
      !unicharset.get_fragment(new_result.unichar_id)) {
    results->best_match_index = old_match;
    results->best_rating = new_result.rating;
    results->best_unichar_id = new_result.unichar_id;
  }
} /* AddNewResult */

/*---------------------------------------------------------------------------*/
/**
 * This routine is identical to CharNormClassifier()
 * except that it does no class pruning.  
It simply matches
 * the unknown blob against the classes listed in
 * Ambiguities.
 *
 * Globals:
 * - #AllProtosOn mask that enables all protos
 * - #AllConfigsOn mask that enables all configs
 *
 * @param blob blob to be classified
 * @param templates built-in templates to classify against
 * @param classes adapted class templates
 * @param ambiguities array of unichar id's to match against,
 *        terminated by a negative id
 * @param[out] results place to put match results
 * @param int_features
 * @param fx_info
 */
void Classify::AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features,
                               const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob,
                               INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes,
                               UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) {
  if (int_features.empty()) {
    return;
  }
  auto *CharNormArray = new uint8_t[unicharset.size()];
  UnicharRating int_result;

  results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr, CharNormArray);
  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
  if (debug) {
    tprintf("AM Matches =  ");
  }

  int top = blob->bounding_box().top();
  int bottom = blob->bounding_box().bottom();
  // Match against each candidate class until the negative terminator.
  while (*ambiguities >= 0) {
    CLASS_ID class_id = *ambiguities;

    int_result.unichar_id = class_id;
    im_.Match(ClassForClassId(templates, class_id), AllProtosOn, AllConfigsOn, int_features.size(),
              &int_features[0], &int_result, classify_adapt_feature_threshold, NO_DEBUG,
              matcher_debug_separate_windows);

    ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0, results->BlobLength,
                                    classify_integer_matcher_multiplier, CharNormArray, &int_result,
                                    results);
    ambiguities++;
  }
  delete[] CharNormArray;
} /* AmbigClassifier 
*/

/*---------------------------------------------------------------------------*/
/// Factored-out calls to IntegerMatcher based on class pruner results.
/// Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.
/// When classes is non-null the match is restricted to each class's
/// permanent protos/configs; otherwise all protos/configs are enabled.
void Classify::MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features,
                             const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors,
                             ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier,
                             const TBOX &blob_box, const std::vector<CP_RESULT_STRUCT> &results,
                             ADAPT_RESULTS *final_results) {
  int top = blob_box.top();
  int bottom = blob_box.bottom();
  UnicharRating int_result;
  for (auto &&result : results) {
    CLASS_ID class_id = result.Class;
    BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos : AllProtosOn;
    BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs : AllConfigsOn;

    int_result.unichar_id = class_id;
    im_.Match(ClassForClassId(templates, class_id), protos, configs, num_features, features,
              &int_result, classify_adapt_feature_threshold, debug, matcher_debug_separate_windows);
    bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
    ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top, result.Rating,
                                    final_results->BlobLength, matcher_multiplier, norm_factors,
                                    &int_result, final_results);
  }
}

// Converts configs to fonts, and if the result is not adapted, and a
// shape_table_ is present, the shape is expanded to include all
// unichar_ids represented, before applying a set of corrections to the
// distance rating in int_result, (see ComputeCorrectedRating.)
// The results are added to the final_results output.
void Classify::ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT 
**classes, bool debug, int class_id,
                                               int bottom, int top, float cp_rating,
                                               int blob_length, int matcher_multiplier,
                                               const uint8_t *cn_factors, UnicharRating *int_result,
                                               ADAPT_RESULTS *final_results) {
  if (classes != nullptr) {
    // Adapted result. Convert configs to fontinfo_ids.
    int_result->adapted = true;
    for (auto &font : int_result->fonts) {
      font.fontinfo_id = GetFontinfoId(classes[class_id], font.fontinfo_id);
    }
  } else {
    // Pre-trained result. Map fonts using font_sets_.
    int_result->adapted = false;
    for (auto &font : int_result->fonts) {
      font.fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, font.fontinfo_id);
    }
    if (shape_table_ != nullptr) {
      // Two possible cases:
      // 1. Flat shapetable. All unichar-ids of the shapes referenced by
      // int_result->fonts are the same. In this case build a new vector of
      // mapped fonts and replace the fonts in int_result.
      // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
      // by int_result. In this case, build a vector of UnicharRating to
      // gather together different font-ids for each unichar. Also covers case1.
      std::vector<UnicharRating> mapped_results;
      for (auto &f : int_result->fonts) {
        int shape_id = f.fontinfo_id;
        const Shape &shape = shape_table_->GetShape(shape_id);
        for (int c = 0; c < shape.size(); ++c) {
          int unichar_id = shape[c].unichar_id;
          if (!unicharset.get_enabled(unichar_id)) {
            continue;
          }
          // Find the mapped_result for unichar_id.
          // Linear scan; mapped_results stays small (one entry per unichar).
          unsigned r = 0;
          for (r = 0; r < mapped_results.size() && mapped_results[r].unichar_id != unichar_id;
               ++r) {
          }
          if (r == mapped_results.size()) {
            // First time this unichar is seen: clone the rating, reset fonts.
            mapped_results.push_back(*int_result);
            mapped_results[r].unichar_id = unichar_id;
            mapped_results[r].fonts.clear();
          }
          for (int font_id : shape[c].font_ids) {
            mapped_results[r].fonts.emplace_back(font_id, f.score);
          }
        }
      }
      for (auto &m : mapped_results) {
        m.rating = ComputeCorrectedRating(debug, m.unichar_id, cp_rating, int_result->rating,
                                          int_result->feature_misses, bottom, top, blob_length,
                                          matcher_multiplier, cn_factors);
        AddNewResult(m, final_results);
      }
      return;
    }
  }
  if (unicharset.get_enabled(class_id)) {
    int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating, int_result->rating,
                                                int_result->feature_misses, bottom, top,
                                                blob_length, matcher_multiplier, cn_factors);
    AddNewResult(*int_result, final_results);
  }
}

// Applies a set of corrections to the confidence im_rating,
// including the cn_correction, miss penalty and additional penalty
// for non-alnums being vertical misfits. 
Returns the corrected confidence.
double Classify::ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
                                        double im_rating, int feature_misses, int bottom, int top,
                                        int blob_length, int matcher_multiplier,
                                        const uint8_t *cn_factors) {
  // Compute class feature corrections.
  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length, cn_factors[unichar_id],
                                              matcher_multiplier);
  double miss_penalty = tessedit_class_miss_scale * feature_misses;
  double vertical_penalty = 0.0;
  // Penalize non-alnums for being vertical misfits.
  if (!unicharset.get_isalpha(unichar_id) && !unicharset.get_isdigit(unichar_id) &&
      cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
    if (debug) {
      tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n", top, min_top, max_top, bottom,
              min_bottom, max_bottom);
    }
    if (top < min_top || top > max_top || bottom < min_bottom || bottom > max_bottom) {
      vertical_penalty = classify_misfit_junk_penalty;
    }
  }
  // Combine the penalties (all in distance space) and clip to the worst
  // representable rating.
  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
  if (result < WORST_POSSIBLE_RATING) {
    result = WORST_POSSIBLE_RATING;
  }
  if (debug) {
    tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
            unicharset.id_to_unichar(unichar_id), result * 100.0, cp_rating * 100.0,
            (1.0 - im_rating) * 100.0, (cn_corrected - (1.0 - im_rating)) * 100.0,
            cn_factors[unichar_id], miss_penalty * 100.0, vertical_penalty * 100.0);
  }
  return result;
}

/*---------------------------------------------------------------------------*/
/**
 * This 
routine extracts baseline normalized features
 * from the unknown character and matches them against the
 * specified set of templates.  The classes which match
 * are added to Results.
 *
 * Globals:
 * - BaselineCutoffs expected num features for each class
 *
 * @param Blob blob to be classified
 * @param Templates current set of adapted templates
 * @param Results place to put match results
 * @param int_features
 * @param fx_info
 *
 * @return Array of possible ambiguous chars that should be checked,
 * or nullptr if there is no usable best match.
 */
UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob,
                                         const std::vector<INT_FEATURE_STRUCT> &int_features,
                                         const INT_FX_RESULT_STRUCT &fx_info,
                                         ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results) {
  if (int_features.empty()) {
    return nullptr;
  }
  auto *CharNormArray = new uint8_t[unicharset.size()];
  ClearCharNormArray(CharNormArray);

  Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0], CharNormArray,
               BaselineCutoffs, &Results->CPResults);

  if (matcher_debug_level >= 2 || classify_debug_level > 1) {
    tprintf("BL Matches =  ");
  }

  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0], CharNormArray,
                Templates->Class, matcher_debug_flags, 0, Blob->bounding_box(), Results->CPResults,
                Results);

  delete[] CharNormArray;
  CLASS_ID ClassId = Results->best_unichar_id;
  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0) {
    return nullptr;
  }

  // Return the ambiguity list of the best matching config of the best class.
  return Templates->Class[ClassId]
      ->Config[Results->match[Results->best_match_index].config]
      .Perm->Ambigs;
} /* BaselineClassifier */

/*---------------------------------------------------------------------------*/
/**
 * 
This routine extracts character normalized features
 * from the unknown character and matches them against the
 * specified set of templates.  The classes which match
 * are added to Results.
 *
 * @param blob blob to be classified
 * @param sample templates to classify unknown against
 * @param adapt_results place to put match results
 *
 * Globals:
 * - CharNormCutoffs expected num features for each class
 * - AllProtosOn mask that enables all protos
 * - AllConfigsOn mask that enables all configs
 */
int Classify::CharNormClassifier(TBLOB *blob, const TrainingSample &sample,
                                 ADAPT_RESULTS *adapt_results) {
  // This is the length that is used for scaling ratings vs certainty.
  adapt_results->BlobLength = IntCastRounded(sample.outline_length() / kStandardFeatureLength);
  std::vector<UnicharRating> unichar_results;
  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0, -1, &unichar_results);
  // Convert results to the format used internally by AdaptiveClassifier.
  for (auto &r : unichar_results) {
    AddNewResult(r, adapt_results);
  }
  return sample.num_features();
} /* CharNormClassifier */

// As CharNormClassifier, but operates on a TrainingSample and outputs to
// a vector of ShapeRating without conversion to classes.
int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
                                     std::vector<UnicharRating> *results) {
  results->clear();
  std::unique_ptr<ADAPT_RESULTS> adapt_results(new ADAPT_RESULTS());
  adapt_results->Initialize();
  // Compute the bounding box of the features.
  uint32_t num_features = sample.num_features();
  // Only the top and bottom of the blob_box are used by MasterMatcher, so
  // fabricate right and left using top and bottom.
  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
                sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
  // Compute the char_norm_array from the saved cn_feature.
  FEATURE norm_feature = sample.GetCNFeature();
  std::vector<uint8_t> char_norm_array(unicharset.size());
  // Size the pruner array for whichever is larger: the unicharset or the
  // pre-trained template class count.
  auto num_pruner_classes = std::max(static_cast<unsigned>(unicharset.size()), PreTrainedTemplates->NumClasses);
  std::vector<uint8_t> pruner_norm_array(num_pruner_classes);
  adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5f);
  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, &char_norm_array[0], &pruner_norm_array[0]);

  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), &pruner_norm_array[0],
               shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
               &adapt_results->CPResults);
  if (keep_this >= 0) {
    // Caller asked for a single specific class: keep only that one.
    adapt_results->CPResults[0].Class = keep_this;
    adapt_results->CPResults.resize(1);
  }
  if (pruner_only) {
    // Convert pruner results to output format.
    for (auto &it : adapt_results->CPResults) {
      int class_id = it.Class;
      results->push_back(UnicharRating(class_id, 1.0f - it.Rating));
    }
  } else {
    MasterMatcher(PreTrainedTemplates, num_features, sample.features(), &char_norm_array[0], nullptr,
                  matcher_debug_flags, classify_integer_matcher_multiplier, blob_box,
                  adapt_results->CPResults, adapt_results.get());
    // Convert master matcher results to output format.
    for (auto &i : adapt_results->match) {
      results->push_back(i);
    }
    if (results->size() > 1) {
      std::sort(results->begin(), results->end(), SortDescendingRating);
    }
  }
  return num_features;
} /* CharNormTrainingSample */

/*---------------------------------------------------------------------------*/
/**
 * This routine computes a rating which reflects the
 * likelihood that the blob being classified is a noise
 * blob.  
NOTE: assumes that the blob length has already been\n * computed and placed into Results.\n *\n * @param results results to add noise classification to\n *\n * Globals:\n * - matcher_avg_noise_size avg. length of a noise blob\n */\nvoid Classify::ClassifyAsNoise(ADAPT_RESULTS *results) {\n  float rating = results->BlobLength / matcher_avg_noise_size;\n  rating *= rating;\n  rating /= 1 + rating;\n\n  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);\n} /* ClassifyAsNoise */\n\n/// The function converts the given match ratings to the list of blob\n/// choices with ratings and certainties (used by the context checkers).\n/// If character fragments are present in the results, this function also makes\n/// sure that there is at least one non-fragmented classification included.\n/// For each classification result check the unicharset for \"definite\"\n/// ambiguities and modify the resulting Choices accordingly.\nvoid Classify::ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box,\n                                       ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) {\n  assert(Choices != nullptr);\n  float Rating;\n  float Certainty;\n  BLOB_CHOICE_IT temp_it;\n  bool contains_nonfrag = false;\n  temp_it.set_to_list(Choices);\n  int choices_length = 0;\n  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum\n  // number of returned results, but with a shape_table_ we want to have room\n  // for at least the biggest shape (which might contain hundreds of Indic\n  // grapheme fragments) and more, so use double the size of the biggest shape\n  // if that is more than the default.\n  int max_matches = MAX_MATCHES;\n  if (shape_table_ != nullptr) {\n    max_matches = shape_table_->MaxNumUnichars() * 2;\n    if (max_matches < MAX_MATCHES) {\n      max_matches = MAX_MATCHES;\n    }\n  }\n\n  float best_certainty = -FLT_MAX;\n  for (auto &it : Results->match) {\n    const UnicharRating &result = it;\n    bool adapted = 
result.adapted;\n    bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);\n    if (temp_it.length() + 1 == max_matches && !contains_nonfrag && current_is_frag) {\n      continue; // look for a non-fragmented character to fill the\n                // last spot in Choices if only fragments are present\n    }\n    // BlobLength can never be legally 0, this means recognition failed.\n    // But we must return a classification result because some invoking\n    // functions (chopper/permuter) do not anticipate a null blob choice.\n    // So we need to assign a poor, but not infinitely bad score.\n    if (Results->BlobLength == 0) {\n      Certainty = -20;\n      Rating = 100; // should be -certainty * real_blob_length\n    } else {\n      Rating = Certainty = (1.0f - result.rating);\n      Rating *= rating_scale * Results->BlobLength;\n      Certainty *= -(getDict().certainty_scale);\n    }\n    // Adapted results, by their very nature, should have good certainty.\n    // Those that don't are at best misleading, and often lead to errors,\n    // so don't accept adapted results that are too far behind the best result,\n    // whether adapted or static.\n    // TODO(rays) find some way of automatically tuning these constants.\n    if (Certainty > best_certainty) {\n      best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));\n    } else if (adapted && Certainty / classify_adapted_pruning_factor < best_certainty) {\n      continue; // Don't accept bad adapted results.\n    }\n\n    float min_xheight, max_xheight, yshift;\n    denorm.XHeightRange(result.unichar_id, unicharset, box, &min_xheight, &max_xheight, &yshift);\n    auto *choice = new BLOB_CHOICE(\n        result.unichar_id, Rating, Certainty, unicharset.get_script(result.unichar_id), min_xheight,\n        max_xheight, yshift, adapted ? 
BCC_ADAPTED_CLASSIFIER : BCC_STATIC_CLASSIFIER);\n    choice->set_fonts(result.fonts);\n    temp_it.add_to_end(choice);\n    contains_nonfrag |= !current_is_frag; // update contains_nonfrag\n    choices_length++;\n    if (choices_length >= max_matches) {\n      break;\n    }\n  }\n  Results->match.resize(choices_length);\n} // ConvertMatchesToChoices\n\n/*---------------------------------------------------------------------------*/\n#ifndef GRAPHICS_DISABLED\n/**\n *\n * @param blob blob whose classification is being debugged\n * @param Results results of match being debugged\n *\n * Globals: none\n */\nvoid Classify::DebugAdaptiveClassifier(TBLOB *blob, ADAPT_RESULTS *Results) {\n  if (static_classifier_ == nullptr) {\n    return;\n  }\n  INT_FX_RESULT_STRUCT fx_info;\n  std::vector<INT_FEATURE_STRUCT> bl_features;\n  TrainingSample *sample = BlobToTrainingSample(*blob, false, &fx_info, &bl_features);\n  if (sample == nullptr) {\n    return;\n  }\n  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), Results->best_unichar_id);\n} /* DebugAdaptiveClassifier */\n#endif\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine performs an adaptive classification.\n * If we have not yet adapted to enough classes, a simple\n * classification to the pre-trained templates is performed.\n * Otherwise, we match the blob against the adapted templates.\n * If the adapted templates do not match well, we try a\n * match against the pre-trained templates.  If an adapted\n * template match is found, we do a match to any pre-trained\n * templates which could be ambiguous.  
The results from all\n * of these classifications are merged together into Results.\n *\n * @param Blob blob to be classified\n * @param Results place to put match results\n *\n * Globals:\n * - PreTrainedTemplates built-in training templates\n * - AdaptedTemplates templates adapted for this page\n * - matcher_reliable_adaptive_result rating limit for a great match\n */\nvoid Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) {\n  UNICHAR_ID *Ambiguities;\n\n  INT_FX_RESULT_STRUCT fx_info;\n  std::vector<INT_FEATURE_STRUCT> bl_features;\n  TrainingSample *sample =\n      BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);\n  if (sample == nullptr) {\n    return;\n  }\n\n  // TODO: With LSTM, static_classifier_ is nullptr.\n  // Return to avoid crash in CharNormClassifier.\n  if (static_classifier_ == nullptr) {\n    delete sample;\n    return;\n  }\n\n  if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min || tess_cn_matching) {\n    CharNormClassifier(Blob, *sample, Results);\n  } else {\n    Ambiguities = BaselineClassifier(Blob, bl_features, fx_info, AdaptedTemplates, Results);\n    if ((!Results->match.empty() &&\n         MarginalMatch(Results->best_rating, matcher_reliable_adaptive_result) &&\n         !tess_bn_matching) ||\n        Results->match.empty()) {\n      CharNormClassifier(Blob, *sample, Results);\n    } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {\n      AmbigClassifier(bl_features, fx_info, Blob, PreTrainedTemplates, AdaptedTemplates->Class,\n                      Ambiguities, Results);\n    }\n  }\n\n  // Force the blob to be classified as noise\n  // if the results contain only fragments.\n  // TODO(daria): verify that this is better than\n  // just adding a nullptr classification.\n  if (!Results->HasNonfragment || Results->match.empty()) {\n    ClassifyAsNoise(Results);\n  }\n  delete sample;\n} /* DoAdaptiveMatch 
*/\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine matches blob to the built-in templates\n * to find out if there are any classes other than the correct\n * class which are potential ambiguities.\n *\n * @param Blob blob to get classification ambiguities for\n * @param CorrectClass correct class for Blob\n *\n * Globals:\n * - CurrentRatings used by qsort compare routine\n * - PreTrainedTemplates built-in templates\n *\n * @return String containing all possible ambiguous classes.\n */\nUNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass) {\n  auto *Results = new ADAPT_RESULTS();\n  UNICHAR_ID *Ambiguities;\n\n  Results->Initialize();\n  INT_FX_RESULT_STRUCT fx_info;\n  std::vector<INT_FEATURE_STRUCT> bl_features;\n  TrainingSample *sample =\n      BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);\n  if (sample == nullptr) {\n    delete Results;\n    return nullptr;\n  }\n\n  CharNormClassifier(Blob, *sample, Results);\n  delete sample;\n  RemoveBadMatches(Results);\n  std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);\n\n  /* copy the class id's into an string of ambiguities - don't copy if\n   the correct class is the only class id matched */\n  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];\n  if (Results->match.size() > 1 ||\n      (Results->match.size() == 1 && Results->match[0].unichar_id != CorrectClass)) {\n    unsigned i;\n    for (i = 0; i < Results->match.size(); i++) {\n      Ambiguities[i] = Results->match[i].unichar_id;\n    }\n    Ambiguities[i] = -1;\n  } else {\n    Ambiguities[0] = -1;\n  }\n\n  delete Results;\n  return Ambiguities;\n} /* GetAmbiguities */\n\n// Returns true if the given blob looks too dissimilar to any character\n// present in the classifier templates.\nbool Classify::LooksLikeGarbage(TBLOB *blob) {\n  auto *ratings = new BLOB_CHOICE_LIST();\n  AdaptiveClassifier(blob, ratings);\n  
BLOB_CHOICE_IT ratings_it(ratings);\n  const UNICHARSET &unicharset = getDict().getUnicharset();\n  if (classify_debug_character_fragments) {\n    print_ratings_list(\"======================\\nLooksLikeGarbage() got \", ratings, unicharset);\n  }\n  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list(); ratings_it.forward()) {\n    if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {\n      continue;\n    }\n    float certainty = ratings_it.data()->certainty();\n    delete ratings;\n    return certainty < classify_character_fragments_garbage_certainty_threshold;\n  }\n  delete ratings;\n  return true; // no whole characters in ratings\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine calls the integer (Hardware) feature\n * extractor if it has not been called before for this blob.\n *\n * The results from the feature extractor are placed into\n * globals so that they can be used in other routines without\n * re-extracting the features.\n *\n * It then copies the char norm features into the IntFeatures\n * array provided by the caller.\n *\n * @param templates used to compute char norm adjustments\n * @param pruner_norm_array Array of factors from blob normalization\n *        process\n * @param char_norm_array array to fill with dummy char norm adjustments\n * @param fx_info\n *\n * Globals:\n *\n * @return Number of features extracted or 0 if an error occurred.\n */\nint Classify::GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates,\n                                 uint8_t *pruner_norm_array, uint8_t *char_norm_array) {\n  auto norm_feature = new FEATURE_STRUCT(&CharNormDesc);\n  float baseline = kBlnBaselineOffset;\n  float scale = MF_SCALE_FACTOR;\n  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;\n  norm_feature->Params[CharNormLength] = fx_info.Length * scale / LENGTH_COMPRESSION;\n  norm_feature->Params[CharNormRx] = 
fx_info.Rx * scale;\n  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;\n  // Deletes norm_feature.\n  ComputeCharNormArrays(norm_feature, templates, char_norm_array, pruner_norm_array);\n  return IntCastRounded(fx_info.Length / kStandardFeatureLength);\n} /* GetCharNormFeature */\n\n// Computes the char_norm_array for the unicharset and, if not nullptr, the\n// pruner_array as appropriate according to the existence of the shape_table.\nvoid Classify::ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,\n                                     uint8_t *char_norm_array, uint8_t *pruner_array) {\n  ComputeIntCharNormArray(*norm_feature, char_norm_array);\n  //if (pruner_array != nullptr) {\n    if (shape_table_ == nullptr) {\n      ComputeIntCharNormArray(*norm_feature, pruner_array);\n    } else {\n      memset(&pruner_array[0], UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0]));\n      // Each entry in the pruner norm array is the MIN of all the entries of\n      // the corresponding unichars in the CharNormArray.\n      for (unsigned id = 0; id < templates->NumClasses; ++id) {\n        int font_set_id = templates->Class[id]->font_set_id;\n        const FontSet &fs = fontset_table_.at(font_set_id);\n        for (auto f : fs) {\n          const Shape &shape = shape_table_->GetShape(f);\n          for (int c = 0; c < shape.size(); ++c) {\n            if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) {\n              pruner_array[id] = char_norm_array[shape[c].unichar_id];\n            }\n          }\n        }\n      }\n    }\n  //}\n  delete norm_feature;\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n *\n * @param Templates adapted templates to add new config to\n * @param ClassId class id to associate with new config\n * @param FontinfoId font information inferred from pre-trained templates\n * @param NumFeatures number of features in IntFeatures\n * @param Features 
features describing model for new config\n * @param FloatFeatures floating-pt representation of features\n *\n * @return The id of the new config created, a negative integer in\n * case of error.\n */\nint Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId,\n                                     int NumFeatures, INT_FEATURE_ARRAY Features,\n                                     FEATURE_SET FloatFeatures) {\n  INT_CLASS_STRUCT *IClass;\n  ADAPT_CLASS_STRUCT *Class;\n  PROTO_ID OldProtos[MAX_NUM_PROTOS];\n  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];\n  int NumOldProtos;\n  int NumBadFeatures;\n  int MaxProtoId, OldMaxProtoId;\n  int MaskSize;\n  int ConfigId;\n  int i;\n  int debug_level = NO_DEBUG;\n\n  if (classify_learning_debug_level >= 3) {\n    debug_level = PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;\n  }\n\n  IClass = ClassForClassId(Templates->Templates, ClassId);\n  Class = Templates->Class[ClassId];\n\n  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {\n    ++NumAdaptationsFailed;\n    if (classify_learning_debug_level >= 1) {\n      tprintf(\"Cannot make new temporary config: maximum number exceeded.\\n\");\n    }\n    return -1;\n  }\n\n  OldMaxProtoId = IClass->NumProtos - 1;\n\n  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff, NumFeatures, Features,\n                                    OldProtos, classify_adapt_proto_threshold, debug_level);\n\n  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);\n  zero_all_bits(TempProtoMask, MaskSize);\n  for (i = 0; i < NumOldProtos; i++) {\n    SET_BIT(TempProtoMask, OldProtos[i]);\n  }\n\n  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn, NumFeatures, Features,\n                                       BadFeatures, classify_adapt_feature_threshold, debug_level);\n\n  MaxProtoId =\n      MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures, IClass, Class, TempProtoMask);\n  if (MaxProtoId == 
NO_PROTO) {\n    ++NumAdaptationsFailed;\n    if (classify_learning_debug_level >= 1) {\n      tprintf(\"Cannot make new temp protos: maximum number exceeded.\\n\");\n    }\n    return -1;\n  }\n\n  ConfigId = AddIntConfig(IClass);\n  ConvertConfig(TempProtoMask, ConfigId, IClass);\n  auto Config = new TEMP_CONFIG_STRUCT(MaxProtoId, FontinfoId);\n  TempConfigFor(Class, ConfigId) = Config;\n  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);\n\n  if (classify_learning_debug_level >= 1) {\n    tprintf(\n        \"Making new temp config %d fontinfo id %d\"\n        \" using %d old and %d new protos.\\n\",\n        ConfigId, Config->FontinfoId, NumOldProtos, MaxProtoId - OldMaxProtoId);\n  }\n\n  return ConfigId;\n} /* MakeNewTemporaryConfig */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine finds sets of sequential bad features\n * that all have the same angle and converts each set into\n * a new temporary proto.  
The temp proto is added to the\n * proto pruner for IClass, pushed onto the list of temp\n * protos in Class, and added to TempProtoMask.\n *\n * @param Features floating-pt features describing new character\n * @param NumBadFeat number of bad features to turn into protos\n * @param BadFeat feature id's of bad features\n * @param IClass integer class templates to add new protos to\n * @param Class adapted class templates to add new protos to\n * @param TempProtoMask proto mask to add new protos to\n *\n * Globals: none\n *\n * @return Max proto id in class after all protos have been added.\n */\nPROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[],\n                                     INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class,\n                                     BIT_VECTOR TempProtoMask) {\n  FEATURE_ID *ProtoStart;\n  FEATURE_ID *ProtoEnd;\n  FEATURE_ID *LastBad;\n  PROTO_STRUCT *Proto;\n  FEATURE F1, F2;\n  float X1, X2, Y1, Y2;\n  float A1, A2, AngleDelta;\n  float SegmentLength;\n  PROTO_ID Pid;\n\n  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat; ProtoStart < LastBad;\n       ProtoStart = ProtoEnd) {\n    F1 = Features->Features[*ProtoStart];\n    X1 = F1->Params[PicoFeatX];\n    Y1 = F1->Params[PicoFeatY];\n    A1 = F1->Params[PicoFeatDir];\n\n    for (ProtoEnd = ProtoStart + 1, SegmentLength = GetPicoFeatureLength(); ProtoEnd < LastBad;\n         ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {\n      F2 = Features->Features[*ProtoEnd];\n      X2 = F2->Params[PicoFeatX];\n      Y2 = F2->Params[PicoFeatY];\n      A2 = F2->Params[PicoFeatDir];\n\n      AngleDelta = std::fabs(A1 - A2);\n      if (AngleDelta > 0.5f) {\n        AngleDelta = 1 - AngleDelta;\n      }\n\n      if (AngleDelta > matcher_clustering_max_angle_delta || std::fabs(X1 - X2) > SegmentLength ||\n          std::fabs(Y1 - Y2) > SegmentLength) {\n        break;\n      }\n    }\n\n    F2 = Features->Features[*(ProtoEnd - 
1)];\n    X2 = F2->Params[PicoFeatX];\n    Y2 = F2->Params[PicoFeatY];\n    A2 = F2->Params[PicoFeatDir];\n\n    Pid = AddIntProto(IClass);\n    if (Pid == NO_PROTO) {\n      return (NO_PROTO);\n    }\n\n    auto TempProto = new TEMP_PROTO_STRUCT;\n    Proto = &(TempProto->Proto);\n\n    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because\n   ConvertProto assumes that the Y dimension varies from -0.5 to 0.5\n   instead of the -0.25 to 0.75 used in baseline normalization */\n    Proto->Length = SegmentLength;\n    Proto->Angle = A1;\n    Proto->X = (X1 + X2) / 2;\n    Proto->Y = (Y1 + Y2) / 2 - Y_DIM_OFFSET;\n    FillABC(Proto);\n\n    TempProto->ProtoId = Pid;\n    SET_BIT(TempProtoMask, Pid);\n\n    ConvertProto(Proto, Pid, IClass);\n    AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);\n\n    Class->TempProtos = push(Class->TempProtos, TempProto);\n  }\n  return IClass->NumProtos - 1;\n} /* MakeNewTempProtos */\n\n/*---------------------------------------------------------------------------*/\n/**\n *\n * @param Templates current set of adaptive templates\n * @param ClassId class containing config to be made permanent\n * @param ConfigId config to be made permanent\n * @param Blob current blob being adapted to\n *\n * Globals: none\n */\nvoid Classify::MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId,\n                             TBLOB *Blob) {\n  UNICHAR_ID *Ambigs;\n  PROTO_KEY ProtoKey;\n\n  auto Class = Templates->Class[ClassId];\n  auto Config = TempConfigFor(Class, ConfigId);\n\n  MakeConfigPermanent(Class, ConfigId);\n  if (Class->NumPermConfigs == 0) {\n    Templates->NumPermClasses++;\n  }\n  Class->NumPermConfigs++;\n\n  // Initialize permanent config.\n  Ambigs = GetAmbiguities(Blob, ClassId);\n  auto Perm = new PERM_CONFIG_STRUCT;\n  Perm->Ambigs = Ambigs;\n  Perm->FontinfoId = Config->FontinfoId;\n\n  // Free memory associated with temporary config (since ADAPTED_CONFIG\n  
// is a union we need to clean up before we record permanent config).\n  ProtoKey.Templates = Templates;\n  ProtoKey.ClassId = ClassId;\n  ProtoKey.ConfigId = ConfigId;\n  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);\n  delete Config;\n\n  // Record permanent config.\n  PermConfigFor(Class, ConfigId) = Perm;\n\n  if (classify_learning_debug_level >= 1) {\n    tprintf(\n        \"Making config %d for %s (ClassId %d) permanent:\"\n        \" fontinfo id %d, ambiguities '\",\n        ConfigId, getDict().getUnicharset().debug_str(ClassId).c_str(), ClassId,\n        PermConfigFor(Class, ConfigId)->FontinfoId);\n    for (UNICHAR_ID *AmbigsPointer = Ambigs; *AmbigsPointer >= 0; ++AmbigsPointer) {\n      tprintf(\"%s\", unicharset.id_to_unichar(*AmbigsPointer));\n    }\n    tprintf(\"'.\\n\");\n  }\n} /* MakePermanent */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine converts TempProto to be permanent if\n * its proto id is used by the configuration specified in\n * ProtoKey.\n *\n * @param item1 (TEMP_PROTO) temporary proto to compare to key\n * @param item2 (PROTO_KEY) defines which protos to make permanent\n *\n * Globals: none\n *\n * @return true if TempProto is converted, false otherwise\n */\nint MakeTempProtoPerm(void *item1, void *item2) {\n  auto TempProto = static_cast<TEMP_PROTO_STRUCT *>(item1);\n  auto ProtoKey = static_cast<PROTO_KEY *>(item2);\n\n  auto Class = ProtoKey->Templates->Class[ProtoKey->ClassId];\n  auto Config = TempConfigFor(Class, ProtoKey->ConfigId);\n\n  if (TempProto->ProtoId > Config->MaxProtoId || !test_bit(Config->Protos, TempProto->ProtoId)) {\n    return false;\n  }\n\n  MakeProtoPermanent(Class, TempProto->ProtoId);\n  AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId, ProtoKey->Templates->Templates);\n  delete TempProto;\n\n  return true;\n} /* MakeTempProtoPerm 
*/\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine writes the matches in Results to File.\n *\n * @param results match results to write to File\n *\n * Globals: none\n */\nvoid Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS &results) {\n  for (auto &it : results.match) {\n    tprintf(\"%s  \", unicharset.debug_str(it.unichar_id).c_str());\n    it.Print();\n  }\n} /* PrintAdaptiveMatchResults */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine steps through each matching class in Results\n * and removes it from the match list if its rating\n * is worse than the BestRating plus a pad.  In other words,\n * all good matches get moved to the front of the classes\n * array.\n *\n * @param Results contains matches to be filtered\n *\n * Globals:\n * - matcher_bad_match_pad defines a \"bad match\"\n */\nvoid Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {\n  unsigned Next, NextGood;\n  float BadMatchThreshold;\n  static const char *romans = \"i v x I V X\";\n  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;\n\n  if (classify_bln_numeric_mode) {\n    UNICHAR_ID unichar_id_one =\n        unicharset.contains_unichar(\"1\") ? unicharset.unichar_to_id(\"1\") : -1;\n    UNICHAR_ID unichar_id_zero =\n        unicharset.contains_unichar(\"0\") ? 
unicharset.unichar_to_id(\"0\") : -1;\n    float scored_one = ScoredUnichar(unichar_id_one, *Results);\n    float scored_zero = ScoredUnichar(unichar_id_zero, *Results);\n\n    for (Next = NextGood = 0; Next < Results->match.size(); Next++) {\n      const UnicharRating &match = Results->match[Next];\n      if (match.rating >= BadMatchThreshold) {\n        if (!unicharset.get_isalpha(match.unichar_id) ||\n            strstr(romans, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {\n        } else if (unicharset.eq(match.unichar_id, \"l\") && scored_one < BadMatchThreshold) {\n          Results->match[Next].unichar_id = unichar_id_one;\n        } else if (unicharset.eq(match.unichar_id, \"O\") && scored_zero < BadMatchThreshold) {\n          Results->match[Next].unichar_id = unichar_id_zero;\n        } else {\n          Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.\n        }\n        if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {\n          if (NextGood == Next) {\n            ++NextGood;\n          } else {\n            Results->match[NextGood++] = Results->match[Next];\n          }\n        }\n      }\n    }\n  } else {\n    for (Next = NextGood = 0; Next < Results->match.size(); Next++) {\n      if (Results->match[Next].rating >= BadMatchThreshold) {\n        if (NextGood == Next) {\n          ++NextGood;\n        } else {\n          Results->match[NextGood++] = Results->match[Next];\n        }\n      }\n    }\n  }\n  Results->match.resize(NextGood);\n} /* RemoveBadMatches */\n\n/*----------------------------------------------------------------------------*/\n/**\n * This routine discards extra digits or punctuation from the results.\n * We keep only the top 2 punctuation answers and the top 1 digit answer if\n * present.\n *\n * @param Results contains matches to be filtered\n */\nvoid Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {\n  unsigned Next, NextGood;\n  int punc_count; /*no of garbage characters */\n  
int digit_count;\n  /*garbage characters */\n  static char punc_chars[] = \". , ; : / ` ~ ' - = \\\\ | \\\" ! _ ^\";\n  static char digit_chars[] = \"0 1 2 3 4 5 6 7 8 9\";\n\n  punc_count = 0;\n  digit_count = 0;\n  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {\n    const UnicharRating &match = Results->match[Next];\n    bool keep = true;\n    if (strstr(punc_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {\n      if (punc_count >= 2) {\n        keep = false;\n      }\n      punc_count++;\n    } else {\n      if (strstr(digit_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {\n        if (digit_count >= 1) {\n          keep = false;\n        }\n        digit_count++;\n      }\n    }\n    if (keep) {\n      if (NextGood == Next) {\n        ++NextGood;\n      } else {\n        Results->match[NextGood++] = match;\n      }\n    }\n  }\n  Results->match.resize(NextGood);\n} /* RemoveExtraPuncs */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine resets the internal thresholds inside\n * the integer matcher to correspond to the specified\n * threshold.\n *\n * @param Threshold threshold for creating new templates\n *\n * Globals:\n * - matcher_good_threshold default good match rating\n */\nvoid Classify::SetAdaptiveThreshold(float Threshold) {\n  Threshold = (Threshold == matcher_good_threshold) ? 
0.9f : (1 - Threshold);\n  classify_adapt_proto_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));\n  classify_adapt_feature_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));\n} /* SetAdaptiveThreshold */\n\n#ifndef GRAPHICS_DISABLED\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine displays debug information for the best config\n * of the given shape_id for the given set of features.\n *\n * @param shape_id classifier id to work with\n * @param features features of the unknown character\n * @param num_features Number of features in the features array.\n */\n\nvoid Classify::ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features,\n                                int num_features) {\n  uint32_t config_mask;\n  if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {\n    tprintf(\"No built-in templates for class/shape %d\\n\", shape_id);\n    return;\n  }\n  if (num_features <= 0) {\n    tprintf(\"Illegal blob (char norm features)!\\n\");\n    return;\n  }\n  UnicharRating cn_result;\n  classify_norm_method.set_value(character);\n  im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, AllConfigsOn, num_features,\n            features, &cn_result, classify_adapt_feature_threshold, NO_DEBUG,\n            matcher_debug_separate_windows);\n  tprintf(\"\\n\");\n  config_mask = 1 << cn_result.config;\n\n  tprintf(\"Static Shape ID: %d\\n\", shape_id);\n  ShowMatchDisplay();\n  im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, &config_mask, num_features,\n            features, &cn_result, classify_adapt_feature_threshold, matcher_debug_flags,\n            matcher_debug_separate_windows);\n  UpdateMatchDisplay();\n} /* ShowBestMatchFor */\n\n#endif // !GRAPHICS_DISABLED\n\n// Returns a string for the classifier class_id: either the corresponding\n// unicharset debug_str or the shape_table_ debug str.\nstd::string Classify::ClassIDToDebugStr(const 
INT_TEMPLATES_STRUCT *templates, int class_id,\n                                        int config_id) const {\n  std::string class_string;\n  if (templates == PreTrainedTemplates && shape_table_ != nullptr) {\n    int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);\n    class_string = shape_table_->DebugStr(shape_id);\n  } else {\n    class_string = unicharset.debug_str(class_id);\n  }\n  return class_string;\n}\n\n// Converts a classifier class_id index to a shape_table_ index\nint Classify::ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const {\n  int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;\n  // Older inttemps have no font_ids.\n  if (font_set_id < 0) {\n    return kBlankFontinfoId;\n  }\n  const FontSet &fs = fontset_table_.at(font_set_id);\n  return fs.at(int_result_config);\n}\n\n// Converts a shape_table_ index to a classifier class_id index (not a\n// unichar-id!). Uses a search, so not fast.\nint Classify::ShapeIDToClassID(int shape_id) const {\n  for (unsigned id = 0; id < PreTrainedTemplates->NumClasses; ++id) {\n    int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;\n    ASSERT_HOST(font_set_id >= 0);\n    const FontSet &fs = fontset_table_.at(font_set_id);\n    for (auto f : fs) {\n      if (f == shape_id) {\n        return id;\n      }\n    }\n  }\n  tprintf(\"Shape %d not found\\n\", shape_id);\n  return -1;\n}\n\n// Returns true if the given TEMP_CONFIG_STRUCT is good enough to make it\n// a permanent config.\nbool Classify::TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config) {\n  if (classify_learning_debug_level >= 1) {\n    tprintf(\"NumTimesSeen for config of %s is %d\\n\",\n            getDict().getUnicharset().debug_str(class_id).c_str(), config->NumTimesSeen);\n  }\n  if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {\n    return true;\n  } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {\n    return 
false;\n  } else if (use_ambigs_for_adaption) {\n    // Go through the ambigs vector and see whether we have already seen\n    // enough times all the characters represented by the ambigs vector.\n    const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);\n    int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();\n    for (int ambig = 0; ambig < ambigs_size; ++ambig) {\n      ADAPT_CLASS_STRUCT *ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];\n      assert(ambig_class != nullptr);\n      if (ambig_class->NumPermConfigs == 0 &&\n          ambig_class->MaxNumTimesSeen < matcher_min_examples_for_prototyping) {\n        if (classify_learning_debug_level >= 1) {\n          tprintf(\n              \"Ambig %s has not been seen enough times,\"\n              \" not making config for %s permanent\\n\",\n              getDict().getUnicharset().debug_str((*ambigs)[ambig]).c_str(),\n              getDict().getUnicharset().debug_str(class_id).c_str());\n        }\n        return false;\n      }\n    }\n  }\n  return true;\n}\n\nvoid Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob) {\n  const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);\n  int ambigs_size = (ambigs == nullptr) ? 
0 : ambigs->size();\n  if (classify_learning_debug_level >= 1) {\n    tprintf(\"Running UpdateAmbigsGroup for %s class_id=%d\\n\",\n            getDict().getUnicharset().debug_str(class_id).c_str(), class_id);\n  }\n  for (int ambig = 0; ambig < ambigs_size; ++ambig) {\n    CLASS_ID ambig_class_id = (*ambigs)[ambig];\n    const ADAPT_CLASS_STRUCT *ambigs_class = AdaptedTemplates->Class[ambig_class_id];\n    for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {\n      if (ConfigIsPermanent(ambigs_class, cfg)) {\n        continue;\n      }\n      const TEMP_CONFIG_STRUCT *config = TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);\n      if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {\n        if (classify_learning_debug_level >= 1) {\n          tprintf(\"Making config %d of %s permanent\\n\", cfg,\n                  getDict().getUnicharset().debug_str(ambig_class_id).c_str());\n        }\n        MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);\n      }\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/blobclass.cpp",
    "content": "/******************************************************************************\n **      Filename:       blobclass.c\n **      Purpose:        High level blob classification and training routines.\n **      Author:         Dan Johnson\n **\n **      (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include <cstdio>\n\n#include \"classify.h\"\n#include \"featdefs.h\"\n#include \"mf.h\"\n#include \"normfeat.h\"\n\nnamespace tesseract {\n\n/*---------------------------------------------------------------------------*/\n\n// Extracts features from the given blob and saves them in the tr_file_data_\n// member variable.\n// fontname:  Name of font that this blob was printed in.\n// cn_denorm: Character normalization transformation to apply to the blob.\n// fx_info:   Character normalization parameters computed with cn_denorm.\n// blob_text: Ground truth text for the blob.\nvoid Classify::LearnBlob(const std::string &fontname, TBLOB *blob, const DENORM &cn_denorm,\n                         const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text) {\n  std::unique_ptr<CHAR_DESC_STRUCT> CharDesc(new CHAR_DESC_STRUCT(feature_defs_));\n  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);\n  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);\n  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, 
fx_info);\n  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);\n\n  if (ValidCharDescription(feature_defs_, CharDesc.get())) {\n    // Label the features with a class name and font name.\n    tr_file_data_ += \"\\n\";\n    tr_file_data_ += fontname;\n    tr_file_data_ += \" \";\n    tr_file_data_ += blob_text;\n    tr_file_data_ += \"\\n\";\n\n    // write micro-features to file and clean up\n    WriteCharDescription(feature_defs_, CharDesc.get(), tr_file_data_);\n  } else {\n    tprintf(\"Blob learned was invalid!\\n\");\n  }\n} // LearnBlob\n\n// Writes stored training data to a .tr file based on the given filename.\n// Returns false on error.\nbool Classify::WriteTRFile(const char *filename) {\n  bool result = false;\n  std::string tr_filename = filename;\n  tr_filename += \".tr\";\n  FILE *fp = fopen(tr_filename.c_str(), \"wb\");\n  if (fp) {\n    result = tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());\n    fclose(fp);\n  }\n  tr_file_data_.resize(0);\n  return result;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/classify.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        classify.cpp\n// Description: classify class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"classify.h\"\n\n#ifdef DISABLED_LEGACY_ENGINE\n\n#  include <string.h>\n\nnamespace tesseract {\n\nClassify::Classify()\n    : INT_MEMBER(classify_debug_level, 0, \"Classify debug level\", this->params())\n    ,\n\n    BOOL_MEMBER(classify_bln_numeric_mode, 0, \"Assume the input is numbers [0-9].\", this->params())\n    ,\n\n    double_MEMBER(classify_max_rating_ratio, 1.5, \"Veto ratio between classifier ratings\",\n                  this->params())\n    ,\n\n    double_MEMBER(classify_max_certainty_margin, 5.5,\n                  \"Veto difference between classifier certainties\", this->params())\n    ,\n\n    dict_(this) {}\n\nClassify::~Classify() {}\n\n} // namespace tesseract\n\n#else // DISABLED_LEGACY_ENGINE not defined\n\n#  include <cstring>\n#  include \"fontinfo.h\"\n#  include \"intproto.h\"\n#  include \"mfoutline.h\"\n#  include \"scrollview.h\"\n#  include \"shapeclassifier.h\"\n#  include \"shapetable.h\"\n#  include \"unicity_table.h\"\n\nnamespace tesseract {\nClassify::Classify()\n    : BOOL_MEMBER(allow_blob_division, true, \"Use divisible blobs chopping\", this->params())\n    , 
BOOL_MEMBER(prioritize_division, false, \"Prioritize blob division over chopping\",\n                  this->params())\n    , BOOL_MEMBER(classify_enable_learning, true, \"Enable adaptive classifier\", this->params())\n    , INT_MEMBER(classify_debug_level, 0, \"Classify debug level\", this->params())\n    , INT_MEMBER(classify_norm_method, character, \"Normalization Method   ...\", this->params())\n    , double_MEMBER(classify_char_norm_range, 0.2, \"Character Normalization Range ...\",\n                    this->params())\n    , double_MEMBER(classify_max_rating_ratio, 1.5, \"Veto ratio between classifier ratings\",\n                    this->params())\n    , double_MEMBER(classify_max_certainty_margin, 5.5,\n                    \"Veto difference between classifier certainties\", this->params())\n    , BOOL_MEMBER(tess_cn_matching, 0, \"Character Normalized Matching\", this->params())\n    , BOOL_MEMBER(tess_bn_matching, 0, \"Baseline Normalized Matching\", this->params())\n    , BOOL_MEMBER(classify_enable_adaptive_matcher, 1, \"Enable adaptive classifier\", this->params())\n    , BOOL_MEMBER(classify_use_pre_adapted_templates, 0, \"Use pre-adapted classifier templates\",\n                  this->params())\n    , BOOL_MEMBER(classify_save_adapted_templates, 0, \"Save adapted templates to a file\",\n                  this->params())\n    , BOOL_MEMBER(classify_enable_adaptive_debugger, 0, \"Enable match debugger\", this->params())\n    , BOOL_MEMBER(classify_nonlinear_norm, 0, \"Non-linear stroke-density normalization\",\n                  this->params())\n    , INT_MEMBER(matcher_debug_level, 0, \"Matcher Debug Level\", this->params())\n    , INT_MEMBER(matcher_debug_flags, 0, \"Matcher Debug Flags\", this->params())\n    , INT_MEMBER(classify_learning_debug_level, 0, \"Learning Debug Level: \", this->params())\n    , double_MEMBER(matcher_good_threshold, 0.125, \"Good Match (0-1)\", this->params())\n    , double_MEMBER(matcher_reliable_adaptive_result, 0.0, 
\"Great Match (0-1)\", this->params())\n    , double_MEMBER(matcher_perfect_threshold, 0.02, \"Perfect Match (0-1)\", this->params())\n    , double_MEMBER(matcher_bad_match_pad, 0.15, \"Bad Match Pad (0-1)\", this->params())\n    , double_MEMBER(matcher_rating_margin, 0.1, \"New template margin (0-1)\", this->params())\n    , double_MEMBER(matcher_avg_noise_size, 12.0, \"Avg. noise blob length\", this->params())\n    , INT_MEMBER(matcher_permanent_classes_min, 1, \"Min # of permanent classes\", this->params())\n    , INT_MEMBER(matcher_min_examples_for_prototyping, 3, \"Reliable Config Threshold\",\n                 this->params())\n    , INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,\n                 \"Enable adaption even if the ambiguities have not been seen\", this->params())\n    , double_MEMBER(matcher_clustering_max_angle_delta, 0.015,\n                    \"Maximum angle delta for prototype clustering\", this->params())\n    , double_MEMBER(classify_misfit_junk_penalty, 0.0,\n                    \"Penalty to apply when a non-alnum is vertically out of \"\n                    \"its expected textline position\",\n                    this->params())\n    , double_MEMBER(rating_scale, 1.5, \"Rating scaling factor\", this->params())\n    , double_MEMBER(tessedit_class_miss_scale, 0.00390625, \"Scale factor for features not used\",\n                    this->params())\n    , double_MEMBER(classify_adapted_pruning_factor, 2.5,\n                    \"Prune poor adapted results this much worse than best result\", this->params())\n    , double_MEMBER(classify_adapted_pruning_threshold, -1.0,\n                    \"Threshold at which classify_adapted_pruning_factor starts\", this->params())\n    , INT_MEMBER(classify_adapt_proto_threshold, 230,\n                 \"Threshold for good protos during adaptive 0-255\", this->params())\n    , INT_MEMBER(classify_adapt_feature_threshold, 230,\n                 \"Threshold for good features during adaptive 
0-255\", this->params())\n    , BOOL_MEMBER(disable_character_fragments, true,\n                  \"Do not include character fragments in the\"\n                  \" results of the classifier\",\n                  this->params())\n    , double_MEMBER(classify_character_fragments_garbage_certainty_threshold, -3.0,\n                    \"Exclude fragments that do not look like whole\"\n                    \" characters from training and adaption\",\n                    this->params())\n    , BOOL_MEMBER(classify_debug_character_fragments, false,\n                  \"Bring up graphical debugging windows for fragments training\", this->params())\n    , BOOL_MEMBER(matcher_debug_separate_windows, false,\n                  \"Use two different windows for debugging the matching: \"\n                  \"One for the protos and one for the features.\",\n                  this->params())\n    , STRING_MEMBER(classify_learn_debug_str, \"\", \"Class str to debug learning\", this->params())\n    , INT_MEMBER(classify_class_pruner_threshold, 229, \"Class Pruner Threshold 0-255\",\n                 this->params())\n    , INT_MEMBER(classify_class_pruner_multiplier, 15,\n                 \"Class Pruner Multiplier 0-255:       \", this->params())\n    , INT_MEMBER(classify_cp_cutoff_strength, 7,\n                 \"Class Pruner CutoffStrength:         \", this->params())\n    , INT_MEMBER(classify_integer_matcher_multiplier, 10,\n                 \"Integer Matcher Multiplier  0-255:   \", this->params())\n    , BOOL_MEMBER(classify_bln_numeric_mode, 0, \"Assume the input is numbers [0-9].\",\n                  this->params())\n    , double_MEMBER(speckle_large_max_size, 0.30, \"Max large speckle size\", this->params())\n    , double_MEMBER(speckle_rating_penalty, 10.0, \"Penalty to add to worst rating for noise\",\n                    this->params())\n    , im_(&classify_debug_level)\n    , dict_(this) {\n  using namespace std::placeholders; // for _1, _2\n  
fontinfo_table_.set_clear_callback(std::bind(FontInfoDeleteCallback, _1));\n\n  InitFeatureDefs(&feature_defs_);\n}\n\nClassify::~Classify() {\n  EndAdaptiveClassifier();\n#ifndef GRAPHICS_DISABLED\n  delete learn_debug_win_;\n  delete learn_fragmented_word_debug_win_;\n  delete learn_fragments_debug_win_;\n#endif\n}\n\n// Takes ownership of the given classifier, and uses it for future calls\n// to CharNormClassifier.\nvoid Classify::SetStaticClassifier(ShapeClassifier *static_classifier) {\n  delete static_classifier_;\n  static_classifier_ = static_classifier;\n}\n\n// Moved from speckle.cpp\n// Adds a noise classification result that is a bit worse than the worst\n// current result, or the worst possible result if no current results.\nvoid Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {\n  BLOB_CHOICE_IT bc_it(choices);\n  // If there is no classifier result, we will use the worst possible certainty\n  // and corresponding rating.\n  float certainty = -getDict().certainty_scale;\n  float rating = rating_scale * blob_length;\n  if (!choices->empty() && blob_length > 0) {\n    bc_it.move_to_last();\n    BLOB_CHOICE *worst_choice = bc_it.data();\n    // Add speckle_rating_penalty to worst rating, matching old value.\n    rating = worst_choice->rating() + speckle_rating_penalty;\n    // Compute the rating to correspond to the certainty. 
(Used to be kept\n    // the same, but that messes up the language model search.)\n    certainty = -rating * getDict().certainty_scale / (rating_scale * blob_length);\n  }\n  auto *blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty, -1, 0.0f, FLT_MAX, 0,\n                                      BCC_SPECKLE_CLASSIFIER);\n  bc_it.add_to_end(blob_choice);\n}\n\n// Returns true if the blob is small enough to be a large speckle.\nbool Classify::LargeSpeckle(const TBLOB &blob) {\n  double speckle_size = kBlnXHeight * speckle_large_max_size;\n  TBOX bbox = blob.bounding_box();\n  return bbox.width() < speckle_size && bbox.height() < speckle_size;\n}\n\n} // namespace tesseract\n\n#endif // def DISABLED_LEGACY_ENGINE\n"
  },
  {
    "path": "src/classify/classify.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        classify.h\n// Description: classify class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CLASSIFY_CLASSIFY_H_\n#define TESSERACT_CLASSIFY_CLASSIFY_H_\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#ifdef DISABLED_LEGACY_ENGINE\n\n#  include \"ccstruct.h\"\n#  include \"dict.h\"\n\nnamespace tesseract {\n\nclass Classify : public CCStruct {\npublic:\n  Classify();\n  virtual ~Classify();\n  virtual Dict &getDict() {\n    return dict_;\n  }\n\n  // Member variables.\n\n  INT_VAR_H(classify_debug_level);\n  BOOL_VAR_H(classify_bln_numeric_mode);\n  double_VAR_H(classify_max_rating_ratio);\n  double_VAR_H(classify_max_certainty_margin);\n\nprivate:\n  Dict dict_;\n};\n\n} // namespace tesseract\n\n#else // DISABLED_LEGACY_ENGINE not defined\n\n#  include \"adaptive.h\"\n#  include \"ccstruct.h\"\n#  include \"dict.h\"\n#  include \"featdefs.h\"\n#  include \"fontinfo.h\"\n#  include \"intfx.h\"\n#  include \"intmatcher.h\"\n#  include \"normalis.h\"\n#  include \"ocrfeatures.h\"\n#  include \"ratngs.h\"\n#  include \"unicity_table.h\"\n\nnamespace tesseract {\n\nclass ScrollView;\nclass 
WERD_CHOICE;\nclass WERD_RES;\nstruct ADAPT_RESULTS;\nstruct NORM_PROTOS;\n\nstatic const int kUnknownFontinfoId = -1;\nstatic const int kBlankFontinfoId = -2;\n\nclass ShapeClassifier;\nstruct ShapeRating;\nclass ShapeTable;\nstruct UnicharRating;\n\n// How segmented is a blob. In this enum, character refers to a classifiable\n// unit, but that is too long and character is usually easier to understand.\nenum CharSegmentationType {\n  CST_FRAGMENT, // A partial character.\n  CST_WHOLE,    // A correctly segmented character.\n  CST_IMPROPER, // More than one but less than 2 characters.\n  CST_NGRAM     // Multiple characters.\n};\n\nclass TESS_API Classify : public CCStruct {\npublic:\n  Classify();\n  ~Classify() override;\n  virtual Dict &getDict() {\n    return dict_;\n  }\n\n  const ShapeTable *shape_table() const {\n    return shape_table_;\n  }\n\n  // Takes ownership of the given classifier, and uses it for future calls\n  // to CharNormClassifier.\n  void SetStaticClassifier(ShapeClassifier *static_classifier);\n\n  // Adds a noise classification result that is a bit worse than the worst\n  // current result, or the worst possible result if no current results.\n  void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices);\n\n  // Returns true if the blob is small enough to be a large speckle.\n  bool LargeSpeckle(const TBLOB &blob);\n\n  /* adaptive.cpp ************************************************************/\n  int GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId);\n  // Runs the class pruner from int_templates on the given features, returning\n  // the number of classes output in results.\n  //    int_templates          Class pruner tables\n  //    num_features           Number of features in blob\n  //    features               Array of features\n  //    normalization_factors  (input) Array of int_templates->NumClasses fudge\n  //                           factors from blob normalization process.\n  //                           
(Indexed by CLASS_INDEX)\n  //    expected_num_features  (input) Array of int_templates->NumClasses\n  //                           expected number of features for each class.\n  //                           (Indexed by CLASS_INDEX)\n  //    results                (output) Sorted Array of pruned classes.\n  //                           Array must be sized to take the maximum possible\n  //                           number of outputs : int_templates->NumClasses.\n  int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this,\n                   const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors,\n                   const uint16_t *expected_num_features, std::vector<CP_RESULT_STRUCT> *results);\n  void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs);\n  void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates);\n  void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates);\n  ADAPT_TEMPLATES_STRUCT *ReadAdaptedTemplates(TFile *File);\n  /* normmatch.cpp ************************************************************/\n  float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch);\n  void FreeNormProtos();\n  NORM_PROTOS *ReadNormProtos(TFile *fp);\n  /* protos.cpp ***************************************************************/\n  void ConvertProto(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class);\n  INT_TEMPLATES_STRUCT *CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset);\n  /* adaptmatch.cpp ***********************************************************/\n\n  // Learns the given word using its chopped_word, seam_array, denorm,\n  // box_word, best_state, and correct_text to learn both correctly and\n  // incorrectly segmented blobs. 
If fontname is not nullptr, then LearnBlob\n  // is called and the data will be saved in an internal buffer.\n  // Otherwise AdaptToBlob is called for adaption within a document.\n  void LearnWord(const char *fontname, WERD_RES *word);\n\n  // Builds a blob of length fragments, from the word, starting at start,\n  // and then learns it, as having the given correct_text.\n  // If fontname is not nullptr, then LearnBlob is called and the data will be\n  // saved in an internal buffer for static training.\n  // Otherwise AdaptToBlob is called for adaption within a document.\n  // threshold is a magic number required by AdaptToChar and generated by\n  // ComputeAdaptionThresholds.\n  // Although it can be partly inferred from the string, segmentation is\n  // provided to explicitly clarify the character segmentation.\n  void LearnPieces(const char *fontname, int start, int length, float threshold,\n                   CharSegmentationType segmentation, const char *correct_text, WERD_RES *word);\n  void InitAdaptiveClassifier(TessdataManager *mgr);\n  void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class,\n                        ADAPT_TEMPLATES_STRUCT *Templates);\n  void AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features,\n                       const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob,\n                       INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities,\n                       ADAPT_RESULTS *results);\n  void MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features,\n                     const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors,\n                     ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box,\n                     const std::vector<CP_RESULT_STRUCT> &results, ADAPT_RESULTS *final_results);\n  // Converts configs to fonts, and if the result is not adapted, and a\n  // 
shape_table_ is present, the shape is expanded to include all\n  // unichar_ids represented, before applying a set of corrections to the\n  // distance rating in int_result, (see ComputeCorrectedRating.)\n  // The results are added to the final_results output.\n  void ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom,\n                                       int top, float cp_rating, int blob_length,\n                                       int matcher_multiplier, const uint8_t *cn_factors,\n                                       UnicharRating *int_result, ADAPT_RESULTS *final_results);\n  // Applies a set of corrections to the distance im_rating,\n  // including the cn_correction, miss penalty and additional penalty\n  // for non-alnums being vertical misfits. Returns the corrected distance.\n  double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating,\n                                int feature_misses, int bottom, int top, int blob_length,\n                                int matcher_multiplier, const uint8_t *cn_factors);\n  void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results,\n                               BLOB_CHOICE_LIST *Choices);\n  void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results);\n  int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures);\n\n#  ifndef GRAPHICS_DISABLED\n  void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results);\n#  endif\n  PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[],\n                             INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask);\n  int MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId,\n                             int NumFeatures, INT_FEATURE_ARRAY Features,\n                             FEATURE_SET 
FloatFeatures);\n  void MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob);\n  void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results);\n  void RemoveExtraPuncs(ADAPT_RESULTS *Results);\n  void RemoveBadMatches(ADAPT_RESULTS *Results);\n  void SetAdaptiveThreshold(float Threshold);\n  void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features);\n  // Returns a string for the classifier class_id: either the corresponding\n  // unicharset debug_str or the shape_table_ debug str.\n  std::string ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id,\n                                int config_id) const;\n  // Converts a classifier class_id index with a config ID to:\n  // shape_table_ present: a shape_table_ index OR\n  // No shape_table_: a font ID.\n  // Without shape training, each class_id, config pair represents a single\n  // unichar id/font combination, so this function looks up the corresponding\n  // font id.\n  // With shape training, each class_id, config pair represents a single\n  // shape table index, so the fontset_table stores the shape table index,\n  // and the shape_table_ must be consulted to obtain the actual unichar_id/\n  // font combinations that the shape represents.\n  int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const;\n  // Converts a shape_table_ index to a classifier class_id index (not a\n  // unichar-id!). 
Uses a search, so not fast.\n  int ShapeIDToClassID(int shape_id) const;\n  UNICHAR_ID *BaselineClassifier(TBLOB *Blob, const std::vector<INT_FEATURE_STRUCT> &int_features,\n                                 const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates,\n                                 ADAPT_RESULTS *Results);\n  int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results);\n\n  // As CharNormClassifier, but operates on a TrainingSample and outputs to\n  // a vector of ShapeRating without conversion to classes.\n  int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,\n                             std::vector<UnicharRating> *results);\n  UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass);\n  void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results);\n  void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold,\n                   ADAPT_TEMPLATES_STRUCT *adaptive_templates);\n  void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class);\n  bool AdaptableWord(WERD_RES *word);\n  void EndAdaptiveClassifier();\n  void SetupPass1();\n  void SetupPass2();\n  void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices);\n  void ClassifyAsNoise(ADAPT_RESULTS *Results);\n  void ResetAdaptiveClassifierInternal();\n  void SwitchAdaptiveClassifier();\n  void StartBackupAdaptiveClassifier();\n\n  int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates,\n                         uint8_t *pruner_norm_array, uint8_t *char_norm_array);\n  // Computes the char_norm_array for the unicharset and, if not nullptr, the\n  // pruner_array as appropriate according to the existence of the shape_table.\n  // The norm_feature is deleted as it is almost certainly no longer needed.\n  void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,\n                             uint8_t 
*char_norm_array, uint8_t *pruner_array);\n\n  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config);\n  void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);\n\n  bool AdaptiveClassifierIsFull() const {\n    return NumAdaptationsFailed > 0;\n  }\n  bool AdaptiveClassifierIsEmpty() const {\n    return AdaptedTemplates->NumPermClasses == 0;\n  }\n  bool LooksLikeGarbage(TBLOB *blob);\n#ifndef GRAPHICS_DISABLED\n  void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox);\n#endif\n  // intfx.cpp\n  // Computes the DENORMS for bl(baseline) and cn(character) normalization\n  // during feature extraction. The input denorm describes the current state\n  // of the blob, which is usually a baseline-normalized word.\n  // The Transforms setup are as follows:\n  // Baseline Normalized (bl) Output:\n  //   We center the grapheme by aligning the x-coordinate of its centroid with\n  //   x=128 and leaving the already-baseline-normalized y as-is.\n  //\n  // Character Normalized (cn) Output:\n  //   We align the grapheme's centroid at the origin and scale it\n  //   asymmetrically in x and y so that the 2nd moments are a standard value\n  //   (51.2) ie the result is vaguely square.\n  // If classify_nonlinear_norm is true:\n  //   A non-linear normalization is setup that attempts to evenly distribute\n  //   edges across x and y.\n  //\n  // Some of the fields of fx_info are also setup:\n  // Length: Total length of outline.\n  // Rx:     Rounded y second moment. 
(Reversed by convention.)\n  // Ry:     rounded x second moment.\n  // Xmean:  Rounded x center of mass of the blob.\n  // Ymean:  Rounded y center of mass of the blob.\n  static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm,\n                               DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info);\n\n  // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as\n  // (x,y) position and angle as measured counterclockwise from the vector\n  // <-1, 0>, from blob using two normalizations defined by bl_denorm and\n  // cn_denorm. See SetpuBLCNDenorms for definitions.\n  // If outline_cn_counts is not nullptr, on return it contains the cumulative\n  // number of cn features generated for each outline in the blob (in order).\n  // Thus after the first outline, there were (*outline_cn_counts)[0] features,\n  // after the second outline, there were (*outline_cn_counts)[1] features etc.\n  static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm,\n                              std::vector<INT_FEATURE_STRUCT> *bl_features,\n                              std::vector<INT_FEATURE_STRUCT> *cn_features,\n                              INT_FX_RESULT_STRUCT *results, std::vector<int> *outline_cn_counts);\n  /* float2int.cpp ************************************************************/\n  void ClearCharNormArray(uint8_t *char_norm_array);\n  void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array);\n  void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);\n  /* intproto.cpp *************************************************************/\n  INT_TEMPLATES_STRUCT *ReadIntTemplates(TFile *fp);\n  void WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset);\n  CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on,\n                           int *shape_id);\n  void ShowMatchDisplay();\n  
/* font detection ***********************************************************/\n  UnicityTable<FontInfo> &get_fontinfo_table() {\n    return fontinfo_table_;\n  }\n  const UnicityTable<FontInfo> &get_fontinfo_table() const {\n    return fontinfo_table_;\n  }\n  UnicityTable<FontSet> &get_fontset_table() {\n    return fontset_table_;\n  }\n  /* mfoutline.cpp ***********************************************************/\n  void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale);\n  /* outfeat.cpp ***********************************************************/\n  FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);\n  /* picofeat.cpp ***********************************************************/\n  FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);\n  FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info);\n  FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info);\n  /* blobclass.cpp ***********************************************************/\n  // Extracts features from the given blob and saves them in the tr_file_data_\n  // member variable.\n  // fontname:  Name of font that this blob was printed in.\n  // cn_denorm: Character normalization transformation to apply to the blob.\n  // fx_info:   Character normalization parameters computed with cn_denorm.\n  // blob_text: Ground truth text for the blob.\n  void LearnBlob(const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm,\n                 const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text);\n  // Writes stored training data to a .tr file based on the given filename.\n  // Returns false on error.\n  bool WriteTRFile(const char *filename);\n\n  // Member variables.\n\n  // Parameters.\n  // Set during training (in lang.config) to indicate whether the divisible\n  // blobs chopper should be used (true for latin script.)\n  BOOL_VAR_H(allow_blob_division);\n  // Set during training (in lang.config) to indicate whether the 
divisible\n  // blobs chopper should be used in preference to chopping. Set to true for\n  // southern Indic scripts.\n  BOOL_VAR_H(prioritize_division);\n  BOOL_VAR_H(classify_enable_learning);\n  INT_VAR_H(classify_debug_level);\n\n  /* mfoutline.cpp ***********************************************************/\n  /* control knobs used to control normalization of outlines */\n  INT_VAR_H(classify_norm_method);\n  double_VAR_H(classify_char_norm_range);\n  double_VAR_H(classify_max_rating_ratio);\n  double_VAR_H(classify_max_certainty_margin);\n\n  /* adaptmatch.cpp ***********************************************************/\n  BOOL_VAR_H(tess_cn_matching);\n  BOOL_VAR_H(tess_bn_matching);\n  BOOL_VAR_H(classify_enable_adaptive_matcher);\n  BOOL_VAR_H(classify_use_pre_adapted_templates);\n  BOOL_VAR_H(classify_save_adapted_templates);\n  BOOL_VAR_H(classify_enable_adaptive_debugger);\n  BOOL_VAR_H(classify_nonlinear_norm);\n  INT_VAR_H(matcher_debug_level);\n  INT_VAR_H(matcher_debug_flags);\n  INT_VAR_H(classify_learning_debug_level);\n  double_VAR_H(matcher_good_threshold);\n  double_VAR_H(matcher_reliable_adaptive_result);\n  double_VAR_H(matcher_perfect_threshold);\n  double_VAR_H(matcher_bad_match_pad);\n  double_VAR_H(matcher_rating_margin);\n  double_VAR_H(matcher_avg_noise_size);\n  INT_VAR_H(matcher_permanent_classes_min);\n  INT_VAR_H(matcher_min_examples_for_prototyping);\n  INT_VAR_H(matcher_sufficient_examples_for_prototyping);\n  double_VAR_H(matcher_clustering_max_angle_delta);\n  double_VAR_H(classify_misfit_junk_penalty);\n  double_VAR_H(rating_scale);\n  double_VAR_H(tessedit_class_miss_scale);\n  double_VAR_H(classify_adapted_pruning_factor);\n  double_VAR_H(classify_adapted_pruning_threshold);\n  INT_VAR_H(classify_adapt_proto_threshold);\n  INT_VAR_H(classify_adapt_feature_threshold);\n  BOOL_VAR_H(disable_character_fragments);\n  double_VAR_H(classify_character_fragments_garbage_certainty_threshold);\n  
BOOL_VAR_H(classify_debug_character_fragments);\n  BOOL_VAR_H(matcher_debug_separate_windows);\n  STRING_VAR_H(classify_learn_debug_str);\n\n  /* intmatcher.cpp **********************************************************/\n  INT_VAR_H(classify_class_pruner_threshold);\n  INT_VAR_H(classify_class_pruner_multiplier);\n  INT_VAR_H(classify_cp_cutoff_strength);\n  INT_VAR_H(classify_integer_matcher_multiplier);\n\n  BOOL_VAR_H(classify_bln_numeric_mode);\n  double_VAR_H(speckle_large_max_size);\n  double_VAR_H(speckle_rating_penalty);\n\n  // Use class variables to hold onto built-in templates and adapted templates.\n  INT_TEMPLATES_STRUCT *PreTrainedTemplates = nullptr;\n  ADAPT_TEMPLATES_STRUCT *AdaptedTemplates = nullptr;\n  // The backup adapted templates are created from the previous page (only)\n  // so they are always ready and reasonably well trained if the primary\n  // adapted templates become full.\n  ADAPT_TEMPLATES_STRUCT *BackupAdaptedTemplates = nullptr;\n\n  // Create dummy proto and config masks for use with the built-in templates.\n  BIT_VECTOR AllProtosOn = nullptr;\n  BIT_VECTOR AllConfigsOn = nullptr;\n  BIT_VECTOR AllConfigsOff = nullptr;\n  BIT_VECTOR TempProtoMask = nullptr;\n  /* normmatch.cpp */\n  NORM_PROTOS *NormProtos = nullptr;\n  /* font detection ***********************************************************/\n  UnicityTable<FontInfo> fontinfo_table_;\n  // Without shape training, each class_id, config pair represents a single\n  // unichar id/font combination, so each fontset_table_ entry holds font ids\n  // for each config in the class.\n  // With shape training, each class_id, config pair represents a single\n  // shape_table_ index, so the fontset_table_ stores the shape_table_ index,\n  // and the shape_table_ must be consulted to obtain the actual unichar_id/\n  // font combinations that the shape represents.\n  UnicityTable<FontSet> fontset_table_;\n\nprotected:\n  IntegerMatcher im_;\n  FEATURE_DEFS_STRUCT feature_defs_;\n  // If a 
shape_table_ is present, it is used to remap classifier output in\n  // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually\n  // mean an index to the shape_table_ and the choices returned are *all* the\n  // shape_table_ entries at that index.\n  ShapeTable *shape_table_ = nullptr;\n\nprivate:\n  // The currently active static classifier.\n  ShapeClassifier *static_classifier_ = nullptr;\n#ifndef GRAPHICS_DISABLED\n  ScrollView *learn_debug_win_ = nullptr;\n  ScrollView *learn_fragmented_word_debug_win_ = nullptr;\n  ScrollView *learn_fragments_debug_win_ = nullptr;\n#endif\n\n  // Training data gathered here for all the images in a document.\n  std::string tr_file_data_;\n\n  Dict dict_;\n\n  std::vector<uint16_t> shapetable_cutoffs_;\n\n  /* variables used to hold performance statistics */\n  int NumAdaptationsFailed = 0;\n\n  // Expected number of features in the class pruner, used to penalize\n  // unknowns that have too few features (like a c being classified as e) so\n  // it doesn't recognize everything as '@' or '#'.\n  // CharNormCutoffs is for the static classifier (with no shapetable).\n  // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real\n  // value in the adaptive classifier. Both are indexed by unichar_id.\n  // shapetable_cutoffs_ provides a similar value for each shape in the\n  // shape_table_\n  uint16_t CharNormCutoffs[MAX_NUM_CLASSES];\n  uint16_t BaselineCutoffs[MAX_NUM_CLASSES];\n\npublic:\n  bool EnableLearning = true;\n};\n\n} // namespace tesseract\n\n#endif // DISABLED_LEGACY_ENGINE\n\n#endif // TESSERACT_CLASSIFY_CLASSIFY_H_\n"
  },
  {
    "path": "src/classify/cluster.cpp",
    "content": "/******************************************************************************\n ** Filename: cluster.cpp\n ** Purpose:  Routines for clustering points in N-D space\n ** Author:   Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#define _USE_MATH_DEFINES // for M_PI\n\n#include \"cluster.h\"\n\n#include \"genericheap.h\"\n#include \"kdpair.h\"\n#include \"matrix.h\"\n#include \"tprintf.h\"\n\n#include \"helpers.h\"\n\n#include <cfloat> // for FLT_MAX\n#include <cmath>  // for M_PI\n#include <vector> // for std::vector\n\nnamespace tesseract {\n\n#define HOTELLING 1  // If true use Hotelling's test to decide where to split.\n#define FTABLE_X 10  // Size of FTable.\n#define FTABLE_Y 100 // Size of FTable.\n\n// Table of values approximating the cumulative F-distribution for a confidence\n// of 1%.\nconst double FTable[FTABLE_Y][FTABLE_X] = {\n    {\n        4052.19,\n        4999.52,\n        5403.34,\n        5624.62,\n        5763.65,\n        5858.97,\n        5928.33,\n        5981.10,\n        6022.50,\n        6055.85,\n    },\n    {\n        98.502,\n        99.000,\n        99.166,\n        99.249,\n        99.300,\n        99.333,\n        99.356,\n        99.374,\n        99.388,\n        99.399,\n    },\n    {\n        34.116,\n        30.816,\n        29.457,\n        28.710,\n      
  28.237,\n        27.911,\n        27.672,\n        27.489,\n        27.345,\n        27.229,\n    },\n    {\n        21.198,\n        18.000,\n        16.694,\n        15.977,\n        15.522,\n        15.207,\n        14.976,\n        14.799,\n        14.659,\n        14.546,\n    },\n    {\n        16.258,\n        13.274,\n        12.060,\n        11.392,\n        10.967,\n        10.672,\n        10.456,\n        10.289,\n        10.158,\n        10.051,\n    },\n    {\n        13.745,\n        10.925,\n        9.780,\n        9.148,\n        8.746,\n        8.466,\n        8.260,\n        8.102,\n        7.976,\n        7.874,\n    },\n    {\n        12.246,\n        9.547,\n        8.451,\n        7.847,\n        7.460,\n        7.191,\n        6.993,\n        6.840,\n        6.719,\n        6.620,\n    },\n    {\n        11.259,\n        8.649,\n        7.591,\n        7.006,\n        6.632,\n        6.371,\n        6.178,\n        6.029,\n        5.911,\n        5.814,\n    },\n    {\n        10.561,\n        8.022,\n        6.992,\n        6.422,\n        6.057,\n        5.802,\n        5.613,\n        5.467,\n        5.351,\n        5.257,\n    },\n    {\n        10.044,\n        7.559,\n        6.552,\n        5.994,\n        5.636,\n        5.386,\n        5.200,\n        5.057,\n        4.942,\n        4.849,\n    },\n    {\n        9.646,\n        7.206,\n        6.217,\n        5.668,\n        5.316,\n        5.069,\n        4.886,\n        4.744,\n        4.632,\n        4.539,\n    },\n    {\n        9.330,\n        6.927,\n        5.953,\n        5.412,\n        5.064,\n        4.821,\n        4.640,\n        4.499,\n        4.388,\n        4.296,\n    },\n    {\n        9.074,\n        6.701,\n        5.739,\n        5.205,\n        4.862,\n        4.620,\n        4.441,\n        4.302,\n        4.191,\n        4.100,\n    },\n    {\n        8.862,\n        6.515,\n        5.564,\n        5.035,\n        4.695,\n        4.456,\n        4.278,\n 
       4.140,\n        4.030,\n        3.939,\n    },\n    {\n        8.683,\n        6.359,\n        5.417,\n        4.893,\n        4.556,\n        4.318,\n        4.142,\n        4.004,\n        3.895,\n        3.805,\n    },\n    {\n        8.531,\n        6.226,\n        5.292,\n        4.773,\n        4.437,\n        4.202,\n        4.026,\n        3.890,\n        3.780,\n        3.691,\n    },\n    {\n        8.400,\n        6.112,\n        5.185,\n        4.669,\n        4.336,\n        4.102,\n        3.927,\n        3.791,\n        3.682,\n        3.593,\n    },\n    {\n        8.285,\n        6.013,\n        5.092,\n        4.579,\n        4.248,\n        4.015,\n        3.841,\n        3.705,\n        3.597,\n        3.508,\n    },\n    {\n        8.185,\n        5.926,\n        5.010,\n        4.500,\n        4.171,\n        3.939,\n        3.765,\n        3.631,\n        3.523,\n        3.434,\n    },\n    {\n        8.096,\n        5.849,\n        4.938,\n        4.431,\n        4.103,\n        3.871,\n        3.699,\n        3.564,\n        3.457,\n        3.368,\n    },\n    {\n        8.017,\n        5.780,\n        4.874,\n        4.369,\n        4.042,\n        3.812,\n        3.640,\n        3.506,\n        3.398,\n        3.310,\n    },\n    {\n        7.945,\n        5.719,\n        4.817,\n        4.313,\n        3.988,\n        3.758,\n        3.587,\n        3.453,\n        3.346,\n        3.258,\n    },\n    {\n        7.881,\n        5.664,\n        4.765,\n        4.264,\n        3.939,\n        3.710,\n        3.539,\n        3.406,\n        3.299,\n        3.211,\n    },\n    {\n        7.823,\n        5.614,\n        4.718,\n        4.218,\n        3.895,\n        3.667,\n        3.496,\n        3.363,\n        3.256,\n        3.168,\n    },\n    {\n        7.770,\n        5.568,\n        4.675,\n        4.177,\n        3.855,\n        3.627,\n        3.457,\n        3.324,\n        3.217,\n        3.129,\n    },\n    {\n        
7.721,\n        5.526,\n        4.637,\n        4.140,\n        3.818,\n        3.591,\n        3.421,\n        3.288,\n        3.182,\n        3.094,\n    },\n    {\n        7.677,\n        5.488,\n        4.601,\n        4.106,\n        3.785,\n        3.558,\n        3.388,\n        3.256,\n        3.149,\n        3.062,\n    },\n    {\n        7.636,\n        5.453,\n        4.568,\n        4.074,\n        3.754,\n        3.528,\n        3.358,\n        3.226,\n        3.120,\n        3.032,\n    },\n    {\n        7.598,\n        5.420,\n        4.538,\n        4.045,\n        3.725,\n        3.499,\n        3.330,\n        3.198,\n        3.092,\n        3.005,\n    },\n    {\n        7.562,\n        5.390,\n        4.510,\n        4.018,\n        3.699,\n        3.473,\n        3.305,\n        3.173,\n        3.067,\n        2.979,\n    },\n    {\n        7.530,\n        5.362,\n        4.484,\n        3.993,\n        3.675,\n        3.449,\n        3.281,\n        3.149,\n        3.043,\n        2.955,\n    },\n    {\n        7.499,\n        5.336,\n        4.459,\n        3.969,\n        3.652,\n        3.427,\n        3.258,\n        3.127,\n        3.021,\n        2.934,\n    },\n    {\n        7.471,\n        5.312,\n        4.437,\n        3.948,\n        3.630,\n        3.406,\n        3.238,\n        3.106,\n        3.000,\n        2.913,\n    },\n    {\n        7.444,\n        5.289,\n        4.416,\n        3.927,\n        3.611,\n        3.386,\n        3.218,\n        3.087,\n        2.981,\n        2.894,\n    },\n    {\n        7.419,\n        5.268,\n        4.396,\n        3.908,\n        3.592,\n        3.368,\n        3.200,\n        3.069,\n        2.963,\n        2.876,\n    },\n    {\n        7.396,\n        5.248,\n        4.377,\n        3.890,\n        3.574,\n        3.351,\n        3.183,\n        3.052,\n        2.946,\n        2.859,\n    },\n    {\n        7.373,\n        5.229,\n        4.360,\n        3.873,\n        3.558,\n   
     3.334,\n        3.167,\n        3.036,\n        2.930,\n        2.843,\n    },\n    {\n        7.353,\n        5.211,\n        4.343,\n        3.858,\n        3.542,\n        3.319,\n        3.152,\n        3.021,\n        2.915,\n        2.828,\n    },\n    {\n        7.333,\n        5.194,\n        4.327,\n        3.843,\n        3.528,\n        3.305,\n        3.137,\n        3.006,\n        2.901,\n        2.814,\n    },\n    {\n        7.314,\n        5.179,\n        4.313,\n        3.828,\n        3.514,\n        3.291,\n        3.124,\n        2.993,\n        2.888,\n        2.801,\n    },\n    {\n        7.296,\n        5.163,\n        4.299,\n        3.815,\n        3.501,\n        3.278,\n        3.111,\n        2.980,\n        2.875,\n        2.788,\n    },\n    {\n        7.280,\n        5.149,\n        4.285,\n        3.802,\n        3.488,\n        3.266,\n        3.099,\n        2.968,\n        2.863,\n        2.776,\n    },\n    {\n        7.264,\n        5.136,\n        4.273,\n        3.790,\n        3.476,\n        3.254,\n        3.087,\n        2.957,\n        2.851,\n        2.764,\n    },\n    {\n        7.248,\n        5.123,\n        4.261,\n        3.778,\n        3.465,\n        3.243,\n        3.076,\n        2.946,\n        2.840,\n        2.754,\n    },\n    {\n        7.234,\n        5.110,\n        4.249,\n        3.767,\n        3.454,\n        3.232,\n        3.066,\n        2.935,\n        2.830,\n        2.743,\n    },\n    {\n        7.220,\n        5.099,\n        4.238,\n        3.757,\n        3.444,\n        3.222,\n        3.056,\n        2.925,\n        2.820,\n        2.733,\n    },\n    {\n        7.207,\n        5.087,\n        4.228,\n        3.747,\n        3.434,\n        3.213,\n        3.046,\n        2.916,\n        2.811,\n        2.724,\n    },\n    {\n        7.194,\n        5.077,\n        4.218,\n        3.737,\n        3.425,\n        3.204,\n        3.037,\n        2.907,\n        2.802,\n        
2.715,\n    },\n    {\n        7.182,\n        5.066,\n        4.208,\n        3.728,\n        3.416,\n        3.195,\n        3.028,\n        2.898,\n        2.793,\n        2.706,\n    },\n    {\n        7.171,\n        5.057,\n        4.199,\n        3.720,\n        3.408,\n        3.186,\n        3.020,\n        2.890,\n        2.785,\n        2.698,\n    },\n    {\n        7.159,\n        5.047,\n        4.191,\n        3.711,\n        3.400,\n        3.178,\n        3.012,\n        2.882,\n        2.777,\n        2.690,\n    },\n    {\n        7.149,\n        5.038,\n        4.182,\n        3.703,\n        3.392,\n        3.171,\n        3.005,\n        2.874,\n        2.769,\n        2.683,\n    },\n    {\n        7.139,\n        5.030,\n        4.174,\n        3.695,\n        3.384,\n        3.163,\n        2.997,\n        2.867,\n        2.762,\n        2.675,\n    },\n    {\n        7.129,\n        5.021,\n        4.167,\n        3.688,\n        3.377,\n        3.156,\n        2.990,\n        2.860,\n        2.755,\n        2.668,\n    },\n    {\n        7.119,\n        5.013,\n        4.159,\n        3.681,\n        3.370,\n        3.149,\n        2.983,\n        2.853,\n        2.748,\n        2.662,\n    },\n    {\n        7.110,\n        5.006,\n        4.152,\n        3.674,\n        3.363,\n        3.143,\n        2.977,\n        2.847,\n        2.742,\n        2.655,\n    },\n    {\n        7.102,\n        4.998,\n        4.145,\n        3.667,\n        3.357,\n        3.136,\n        2.971,\n        2.841,\n        2.736,\n        2.649,\n    },\n    {\n        7.093,\n        4.991,\n        4.138,\n        3.661,\n        3.351,\n        3.130,\n        2.965,\n        2.835,\n        2.730,\n        2.643,\n    },\n    {\n        7.085,\n        4.984,\n        4.132,\n        3.655,\n        3.345,\n        3.124,\n        2.959,\n        2.829,\n        2.724,\n        2.637,\n    },\n    {\n        7.077,\n        4.977,\n        4.126,\n    
    3.649,\n        3.339,\n        3.119,\n        2.953,\n        2.823,\n        2.718,\n        2.632,\n    },\n    {\n        7.070,\n        4.971,\n        4.120,\n        3.643,\n        3.333,\n        3.113,\n        2.948,\n        2.818,\n        2.713,\n        2.626,\n    },\n    {\n        7.062,\n        4.965,\n        4.114,\n        3.638,\n        3.328,\n        3.108,\n        2.942,\n        2.813,\n        2.708,\n        2.621,\n    },\n    {\n        7.055,\n        4.959,\n        4.109,\n        3.632,\n        3.323,\n        3.103,\n        2.937,\n        2.808,\n        2.703,\n        2.616,\n    },\n    {\n        7.048,\n        4.953,\n        4.103,\n        3.627,\n        3.318,\n        3.098,\n        2.932,\n        2.803,\n        2.698,\n        2.611,\n    },\n    {\n        7.042,\n        4.947,\n        4.098,\n        3.622,\n        3.313,\n        3.093,\n        2.928,\n        2.798,\n        2.693,\n        2.607,\n    },\n    {\n        7.035,\n        4.942,\n        4.093,\n        3.618,\n        3.308,\n        3.088,\n        2.923,\n        2.793,\n        2.689,\n        2.602,\n    },\n    {\n        7.029,\n        4.937,\n        4.088,\n        3.613,\n        3.304,\n        3.084,\n        2.919,\n        2.789,\n        2.684,\n        2.598,\n    },\n    {\n        7.023,\n        4.932,\n        4.083,\n        3.608,\n        3.299,\n        3.080,\n        2.914,\n        2.785,\n        2.680,\n        2.593,\n    },\n    {\n        7.017,\n        4.927,\n        4.079,\n        3.604,\n        3.295,\n        3.075,\n        2.910,\n        2.781,\n        2.676,\n        2.589,\n    },\n    {\n        7.011,\n        4.922,\n        4.074,\n        3.600,\n        3.291,\n        3.071,\n        2.906,\n        2.777,\n        2.672,\n        2.585,\n    },\n    {\n        7.006,\n        4.917,\n        4.070,\n        3.596,\n        3.287,\n        3.067,\n        2.902,\n        
2.773,\n        2.668,\n        2.581,\n    },\n    {\n        7.001,\n        4.913,\n        4.066,\n        3.591,\n        3.283,\n        3.063,\n        2.898,\n        2.769,\n        2.664,\n        2.578,\n    },\n    {\n        6.995,\n        4.908,\n        4.062,\n        3.588,\n        3.279,\n        3.060,\n        2.895,\n        2.765,\n        2.660,\n        2.574,\n    },\n    {\n        6.990,\n        4.904,\n        4.058,\n        3.584,\n        3.275,\n        3.056,\n        2.891,\n        2.762,\n        2.657,\n        2.570,\n    },\n    {\n        6.985,\n        4.900,\n        4.054,\n        3.580,\n        3.272,\n        3.052,\n        2.887,\n        2.758,\n        2.653,\n        2.567,\n    },\n    {\n        6.981,\n        4.896,\n        4.050,\n        3.577,\n        3.268,\n        3.049,\n        2.884,\n        2.755,\n        2.650,\n        2.563,\n    },\n    {\n        6.976,\n        4.892,\n        4.047,\n        3.573,\n        3.265,\n        3.046,\n        2.881,\n        2.751,\n        2.647,\n        2.560,\n    },\n    {\n        6.971,\n        4.888,\n        4.043,\n        3.570,\n        3.261,\n        3.042,\n        2.877,\n        2.748,\n        2.644,\n        2.557,\n    },\n    {\n        6.967,\n        4.884,\n        4.040,\n        3.566,\n        3.258,\n        3.039,\n        2.874,\n        2.745,\n        2.640,\n        2.554,\n    },\n    {\n        6.963,\n        4.881,\n        4.036,\n        3.563,\n        3.255,\n        3.036,\n        2.871,\n        2.742,\n        2.637,\n        2.551,\n    },\n    {\n        6.958,\n        4.877,\n        4.033,\n        3.560,\n        3.252,\n        3.033,\n        2.868,\n        2.739,\n        2.634,\n        2.548,\n    },\n    {\n        6.954,\n        4.874,\n        4.030,\n        3.557,\n        3.249,\n        3.030,\n        2.865,\n        2.736,\n        2.632,\n        2.545,\n    },\n    {\n        6.950,\n    
    4.870,\n        4.027,\n        3.554,\n        3.246,\n        3.027,\n        2.863,\n        2.733,\n        2.629,\n        2.542,\n    },\n    {\n        6.947,\n        4.867,\n        4.024,\n        3.551,\n        3.243,\n        3.025,\n        2.860,\n        2.731,\n        2.626,\n        2.539,\n    },\n    {\n        6.943,\n        4.864,\n        4.021,\n        3.548,\n        3.240,\n        3.022,\n        2.857,\n        2.728,\n        2.623,\n        2.537,\n    },\n    {\n        6.939,\n        4.861,\n        4.018,\n        3.545,\n        3.238,\n        3.019,\n        2.854,\n        2.725,\n        2.621,\n        2.534,\n    },\n    {\n        6.935,\n        4.858,\n        4.015,\n        3.543,\n        3.235,\n        3.017,\n        2.852,\n        2.723,\n        2.618,\n        2.532,\n    },\n    {\n        6.932,\n        4.855,\n        4.012,\n        3.540,\n        3.233,\n        3.014,\n        2.849,\n        2.720,\n        2.616,\n        2.529,\n    },\n    {\n        6.928,\n        4.852,\n        4.010,\n        3.538,\n        3.230,\n        3.012,\n        2.847,\n        2.718,\n        2.613,\n        2.527,\n    },\n    {\n        6.925,\n        4.849,\n        4.007,\n        3.535,\n        3.228,\n        3.009,\n        2.845,\n        2.715,\n        2.611,\n        2.524,\n    },\n    {\n        6.922,\n        4.846,\n        4.004,\n        3.533,\n        3.225,\n        3.007,\n        2.842,\n        2.713,\n        2.609,\n        2.522,\n    },\n    {\n        6.919,\n        4.844,\n        4.002,\n        3.530,\n        3.223,\n        3.004,\n        2.840,\n        2.711,\n        2.606,\n        2.520,\n    },\n    {\n        6.915,\n        4.841,\n        3.999,\n        3.528,\n        3.221,\n        3.002,\n        2.838,\n        2.709,\n        2.604,\n        2.518,\n    },\n    {\n        6.912,\n        4.838,\n        3.997,\n        3.525,\n        3.218,\n        
3.000,\n        2.835,\n        2.706,\n        2.602,\n        2.515,\n    },\n    {\n        6.909,\n        4.836,\n        3.995,\n        3.523,\n        3.216,\n        2.998,\n        2.833,\n        2.704,\n        2.600,\n        2.513,\n    },\n    {\n        6.906,\n        4.833,\n        3.992,\n        3.521,\n        3.214,\n        2.996,\n        2.831,\n        2.702,\n        2.598,\n        2.511,\n    },\n    {\n        6.904,\n        4.831,\n        3.990,\n        3.519,\n        3.212,\n        2.994,\n        2.829,\n        2.700,\n        2.596,\n        2.509,\n    },\n    {\n        6.901,\n        4.829,\n        3.988,\n        3.517,\n        3.210,\n        2.992,\n        2.827,\n        2.698,\n        2.594,\n        2.507,\n    },\n    {\n        6.898,\n        4.826,\n        3.986,\n        3.515,\n        3.208,\n        2.990,\n        2.825,\n        2.696,\n        2.592,\n        2.505,\n    },\n    {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};\n\n/** define the variance which will be used as a minimum variance for any\n  dimension of any feature. Since most features are calculated from numbers\n  with a precision no better than 1 in 128, the variance should never be\n  less than the square of this number for parameters whose range is 1. */\n#define MINVARIANCE 0.0004\n\n/** define the absolute minimum number of samples which must be present in\n  order to accurately test hypotheses about underlying probability\n  distributions.  Define separately the minimum samples that are needed\n  before a statistical analysis is attempted; this number should be\n  equal to MINSAMPLES but can be set to a lower number for early testing\n  when very few samples are available. */\n#define MINSAMPLESPERBUCKET 5\n#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)\n#define MINSAMPLESNEEDED 1\n\n/** define the size of the table which maps normalized samples to\n  histogram buckets.  
Also define the number of standard deviations\n  in a normal distribution which are considered to be significant.\n  The mapping table will be defined in such a way that it covers\n  the specified number of standard deviations on either side of\n  the mean.  BUCKETTABLESIZE should always be even. */\n#define BUCKETTABLESIZE 1024\n#define NORMALEXTENT 3.0\n\nstruct TEMPCLUSTER {\n  CLUSTER *Cluster;\n  CLUSTER *Neighbor;\n};\n\nusing ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>;\nusing ClusterHeap = tesseract::GenericHeap<ClusterPair>;\n\nstruct STATISTICS {\n  STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) {\n  }\n  float AvgVariance = 1.0f;\n  std::vector<float> CoVariance;\n  std::vector<float> Min; // largest negative distance from the mean\n  std::vector<float> Max; // largest positive distance from the mean\n};\n\nstruct BUCKETS {\n  BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) {\n  }\n  ~BUCKETS() {\n  }\n  DISTRIBUTION Distribution = normal; // distribution being tested for\n  uint32_t SampleCount = 0;         // # of samples in histogram\n  double Confidence = 0.0;          // confidence level of test\n  double ChiSquared = 0.0;          // test threshold\n  uint16_t NumberOfBuckets;         // number of cells in histogram\n  uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets\n  std::vector<uint32_t> Count;      // frequency of occurrence histogram\n  std::vector<float> ExpectedCount; // expected histogram\n};\n\nstruct CHISTRUCT {\n  /// This constructor allocates a new data structure which is used\n  /// to hold a chi-squared value along with its associated\n  /// number of degrees of freedom and alpha value.\n  ///\n  /// @param degreesOfFreedom  degrees of freedom for new chi value\n  /// @param alpha     confidence level for new chi value\n  CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {\n  }\n  uint16_t DegreesOfFreedom = 0;\n  double 
Alpha = 0.0;\n  double ChiSquared = 0.0;\n};\n\n// For use with KDWalk / MakePotentialClusters\nstruct ClusteringContext {\n  ClusterHeap *heap;       // heap used to hold temp clusters, \"best\" on top\n  TEMPCLUSTER *candidates; // array of potential clusters\n  KDTREE *tree;            // kd-tree to be searched for neighbors\n  int32_t next;            // next candidate to be used\n};\n\nusing DENSITYFUNC = double (*)(int32_t);\nusing SOLVEFUNC = double (*)(CHISTRUCT *, double);\n\n#define Odd(N) ((N) % 2)\n#define Mirror(N, R) ((R) - (N)-1)\n#define Abs(N) (((N) < 0) ? (-(N)) : (N))\n\n//--------------Global Data Definitions and Declarations----------------------\n/** the following variables describe a discrete normal distribution\n  which is used by NormalDensity() and NormalBucket().  The\n  constant NORMALEXTENT determines how many standard\n  deviations of the distribution are mapped onto the fixed\n  discrete range of x.  x=0 is mapped to -NORMALEXTENT standard\n  deviations and x=BUCKETTABLESIZE is mapped to\n  +NORMALEXTENT standard deviations. */\n#define SqrtOf2Pi 2.506628275\nstatic const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);\nstatic const double kNormalVariance =\n    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);\nstatic const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);\nstatic const double kNormalMean = BUCKETTABLESIZE / 2;\n\n/** define lookup tables used to compute the number of histogram buckets\n  that should be used for a given number of samples. 
*/\n#define LOOKUPTABLESIZE 8\n#define MAXDEGREESOFFREEDOM MAXBUCKETS\n\nstatic const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200,  400, 600, 800,\n                                                      1000,       1500, 2000}; // number of samples\n\nstatic const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {\n    MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets\n\n/*-------------------------------------------------------------------------\n          Private Function Prototypes\n--------------------------------------------------------------------------*/\nstatic void CreateClusterTree(CLUSTERER *Clusterer);\n\nstatic void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);\n\nstatic CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance);\n\nstatic CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);\n\nstatic void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);\n\nstatic PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster);\n\nstatic PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,\n                                      PROTOSTYLE Style, int32_t MinSamples);\n\nstatic PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,\n                                      STATISTICS *Statistics);\n\nstatic PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,\n                                     BUCKETS *Buckets);\n\nstatic PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,\n                                      STATISTICS *Statistics, BUCKETS *Buckets);\n\nstatic PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,\n                                 BUCKETS *NormalBuckets, double Confidence);\n\nstatic void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, 
PARAM_DESC *ParamDesc);\n\nstatic void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);\n\nstatic STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);\n\nstatic PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);\n\nstatic PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);\n\nstatic PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);\n\nstatic PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);\n\nstatic bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence);\n\nstatic BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,\n                           double Confidence);\n\nstatic BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence);\n\nstatic uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);\n\nstatic double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);\n\nstatic double NormalDensity(int32_t x);\n\nstatic double UniformDensity(int32_t x);\n\nstatic double Integral(double f1, double f2, double Dx);\n\nstatic void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,\n                        float Mean, float StdDev);\n\nstatic uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);\n\nstatic uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);\n\nstatic bool DistributionOK(BUCKETS *Buckets);\n\nstatic uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);\n\nstatic void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);\n\nstatic void InitBuckets(BUCKETS *Buckets);\n\nstatic int AlphaMatch(void *arg1,  // CHISTRUCT *ChiStruct,\n                      void *arg2); // CHISTRUCT *SearchKey);\n\nstatic double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double 
Accuracy);\n\nstatic double ChiArea(CHISTRUCT *ChiParams, double x);\n\nstatic bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal);\n\nstatic double InvertMatrix(const float *input, int size, float *inv);\n\n//--------------------------Public Code--------------------------------------\n/**\n * This routine creates a new clusterer data structure,\n * initializes it, and returns a pointer to it.\n *\n * @param SampleSize  number of dimensions in feature space\n * @param ParamDesc description of each dimension\n * @return pointer to the new clusterer data structure\n */\nCLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {\n  int i;\n\n  // allocate main clusterer data structure and init simple fields\n  auto Clusterer = new CLUSTERER;\n  Clusterer->SampleSize = SampleSize;\n  Clusterer->NumberOfSamples = 0;\n  Clusterer->NumChar = 0;\n\n  // init fields which will not be used initially\n  Clusterer->Root = nullptr;\n  Clusterer->ProtoList = NIL_LIST;\n\n  // maintain a copy of param descriptors in the clusterer data structure\n  Clusterer->ParamDesc = new PARAM_DESC[SampleSize];\n  for (i = 0; i < SampleSize; i++) {\n    Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;\n    Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;\n    Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;\n    Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;\n    Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;\n    Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;\n    Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;\n  }\n\n  // allocate a kd tree to hold the samples\n  Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc);\n\n  // Initialize cache of histogram buckets to minimize recomputing them.\n  for (auto &d : Clusterer->bucket_cache) {\n    for (auto &c : d) {\n      c = nullptr;\n    }\n  }\n\n  return Clusterer;\n} // MakeClusterer\n\n/**\n 
* This routine creates a new sample data structure to hold\n * the specified feature.  This sample is added to the clusterer\n * data structure (so that it knows which samples are to be\n * clustered later), and a pointer to the sample is returned to\n * the caller.\n *\n * @param Clusterer clusterer data structure to add sample to\n * @param Feature feature to be added to clusterer\n * @param CharID  unique ident. of char that sample came from\n *\n * @return    Pointer to the new sample data structure\n */\nSAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {\n  int i;\n\n  // see if the samples have already been clustered - if so trap an error\n  // Can't add samples after they have been clustered.\n  ASSERT_HOST(Clusterer->Root == nullptr);\n\n  // allocate the new sample and initialize it\n  auto Sample = new SAMPLE(Clusterer->SampleSize);\n  Sample->Clustered = false;\n  Sample->Prototype = false;\n  Sample->SampleCount = 1;\n  Sample->Left = nullptr;\n  Sample->Right = nullptr;\n  Sample->CharID = CharID;\n\n  for (i = 0; i < Clusterer->SampleSize; i++) {\n    Sample->Mean[i] = Feature[i];\n  }\n\n  // add the sample to the KD tree - keep track of the total # of samples\n  Clusterer->NumberOfSamples++;\n  KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample);\n  if (CharID >= Clusterer->NumChar) {\n    Clusterer->NumChar = CharID + 1;\n  }\n\n  // execute hook for monitoring clustering operation\n  // (*SampleCreationHook)(Sample);\n\n  return (Sample);\n} // MakeSample\n\n/**\n * This routine first checks to see if the samples in this\n * clusterer have already been clustered before; if so, it does\n * not bother to recreate the cluster tree.  
It simply recomputes\n * the prototypes based on the new Config info.\n *\n * If the samples have not been clustered before, the\n * samples in the KD tree are formed into a cluster tree and then\n * the prototypes are computed from the cluster tree.\n *\n * In either case this routine returns a pointer to a\n * list of prototypes that best represent the samples given\n * the constraints specified in Config.\n *\n * @param Clusterer data struct containing samples to be clustered\n * @param Config  parameters which control clustering process\n *\n * @return Pointer to a list of prototypes\n */\nLIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {\n  // only create cluster tree if samples have never been clustered before\n  if (Clusterer->Root == nullptr) {\n    CreateClusterTree(Clusterer);\n  }\n\n  // deallocate the old prototype list if one exists\n  FreeProtoList(&Clusterer->ProtoList);\n  Clusterer->ProtoList = NIL_LIST;\n\n  // compute prototypes starting at the root node in the tree\n  ComputePrototypes(Clusterer, Config);\n  // We don't need the cluster pointers in the protos any more, so null them\n  // out, which makes it safe to delete the clusterer.\n  LIST proto_list = Clusterer->ProtoList;\n  iterate(proto_list) {\n    auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());\n    proto->Cluster = nullptr;\n  }\n  return Clusterer->ProtoList;\n} // ClusterSamples\n\n/**\n * This routine frees all of the memory allocated to the\n * specified data structure.  It will not, however, free\n * the memory used by the prototype list.  The pointers to\n * the clusters for each prototype in the list will be set\n * to nullptr to indicate that the cluster data structures no\n * longer exist.  
Any sample lists that have been obtained\n * via calls to GetSamples are no longer valid.\n * @param Clusterer pointer to data structure to be freed\n */\nvoid FreeClusterer(CLUSTERER *Clusterer) {\n  if (Clusterer != nullptr) {\n    delete[] Clusterer->ParamDesc;\n    delete Clusterer->KDTree;\n    delete Clusterer->Root;\n    // Free up all used buckets structures.\n    for (auto &d : Clusterer->bucket_cache) {\n      for (auto &c : d) {\n        delete c;\n      }\n    }\n\n    delete Clusterer;\n  }\n} // FreeClusterer\n\n/**\n * This routine frees all of the memory allocated to the\n * specified list of prototypes.  The clusters which are\n * pointed to by the prototypes are not freed.\n * @param ProtoList pointer to list of prototypes to be freed\n */\nvoid FreeProtoList(LIST *ProtoList) {\n  destroy_nodes(*ProtoList, FreePrototype);\n} // FreeProtoList\n\n/**\n * This routine deallocates the memory consumed by the specified\n * prototype and modifies the corresponding cluster so that it\n * is no longer marked as a prototype.  The cluster is NOT\n * deallocated by this routine.\n * @param arg prototype data structure to be deallocated\n */\nvoid FreePrototype(void *arg) { // PROTOTYPE     *Prototype)\n  auto *Prototype = static_cast<PROTOTYPE *>(arg);\n\n  // unmark the corresponding cluster (if there is one\n  if (Prototype->Cluster != nullptr) {\n    Prototype->Cluster->Prototype = false;\n  }\n\n  // deallocate the prototype statistics and then the prototype itself\n  if (Prototype->Style != spherical) {\n    delete[] Prototype->Variance.Elliptical;\n    delete[] Prototype->Magnitude.Elliptical;\n    delete[] Prototype->Weight.Elliptical;\n  }\n  delete Prototype;\n} // FreePrototype\n\n/**\n * This routine is used to find all of the samples which\n * belong to a cluster.  It starts by removing the top\n * cluster on the cluster list (SearchState).  If this cluster is\n * a leaf it is returned.  
Otherwise, the right subcluster\n * is pushed on the list and we continue the search in the\n * left subcluster.  This continues until a leaf is found.\n * If all samples have been found, nullptr is returned.\n * InitSampleSearch() must be called\n * before NextSample() to initialize the search.\n * @param SearchState ptr to list containing clusters to be searched\n * @return  Pointer to the next leaf cluster (sample) or nullptr.\n */\nCLUSTER *NextSample(LIST *SearchState) {\n  CLUSTER *Cluster;\n\n  if (*SearchState == NIL_LIST) {\n    return (nullptr);\n  }\n  Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node());\n  *SearchState = pop(*SearchState);\n  for (;;) {\n    if (Cluster->Left == nullptr) {\n      return (Cluster);\n    }\n    *SearchState = push(*SearchState, Cluster->Right);\n    Cluster = Cluster->Left;\n  }\n} // NextSample\n\n/**\n * This routine returns the mean of the specified\n * prototype in the indicated dimension.\n * @param Proto prototype to return mean of\n * @param Dimension dimension whose mean is to be returned\n * @return  Mean of Prototype in Dimension\n */\nfloat Mean(PROTOTYPE *Proto, uint16_t Dimension) {\n  return (Proto->Mean[Dimension]);\n} // Mean\n\n/**\n * This routine returns the standard deviation of the\n * prototype in the indicated dimension.\n * @param Proto   prototype to return standard deviation of\n * @param Dimension dimension whose stddev is to be returned\n * @return  Standard deviation of Prototype in Dimension\n */\nfloat StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {\n  switch (Proto->Style) {\n    case spherical:\n      return std::sqrt(Proto->Variance.Spherical);\n    case elliptical:\n      return std::sqrt(Proto->Variance.Elliptical[Dimension]);\n    case mixed:\n      switch (Proto->Distrib[Dimension]) {\n        case normal:\n          return std::sqrt(Proto->Variance.Elliptical[Dimension]);\n        case uniform:\n        case D_random:\n          return 
Proto->Variance.Elliptical[Dimension];\n        case DISTRIBUTION_COUNT:\n          ASSERT_HOST(!\"Distribution count not allowed!\");\n      }\n  }\n  return 0.0f;\n} // StandardDeviation\n\n/*---------------------------------------------------------------------------\n            Private Code\n----------------------------------------------------------------------------*/\n/**\n * This routine performs a bottoms-up clustering on the samples\n * held in the kd-tree of the Clusterer data structure.  The\n * result is a cluster tree.  Each node in the tree represents\n * a cluster which conceptually contains a subset of the samples.\n * More precisely, the cluster contains all of the samples which\n * are contained in its two sub-clusters.  The leaves of the\n * tree are the individual samples themselves; they have no\n * sub-clusters.  The root node of the tree conceptually contains\n * all of the samples.\n * The Clusterer data structure is changed.\n * @param Clusterer data structure holdings samples to be clustered\n */\nstatic void CreateClusterTree(CLUSTERER *Clusterer) {\n  ClusteringContext context;\n  ClusterPair HeapEntry;\n\n  // each sample and its nearest neighbor form a \"potential\" cluster\n  // save these in a heap with the \"best\" potential clusters on top\n  context.tree = Clusterer->KDTree;\n  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];\n  context.next = 0;\n  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);\n  KDWalk(context.tree, MakePotentialClusters, &context);\n\n  // form potential clusters into actual clusters - always do \"best\" first\n  while (context.heap->Pop(&HeapEntry)) {\n    TEMPCLUSTER *PotentialCluster = HeapEntry.data();\n\n    // if main cluster of potential cluster is already in another cluster\n    // then we don't need to worry about it\n    if (PotentialCluster->Cluster->Clustered) {\n      continue;\n    }\n\n    // if main cluster is not yet clustered, but its nearest neighbor is\n    // 
then we must find a new nearest neighbor\n    else if (PotentialCluster->Neighbor->Clustered) {\n      PotentialCluster->Neighbor =\n          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());\n      if (PotentialCluster->Neighbor != nullptr) {\n        context.heap->Push(&HeapEntry);\n      }\n    }\n\n    // if neither cluster is already clustered, form permanent cluster\n    else {\n      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);\n      PotentialCluster->Neighbor =\n          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());\n      if (PotentialCluster->Neighbor != nullptr) {\n        context.heap->Push(&HeapEntry);\n      }\n    }\n  }\n\n  // the root node in the cluster tree is now the only node in the kd-tree\n  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);\n\n  // free up the memory used by the K-D tree, heap, and temp clusters\n  delete context.tree;\n  Clusterer->KDTree = nullptr;\n  delete context.heap;\n  delete[] context.candidates;\n} // CreateClusterTree\n\n/**\n * This routine is designed to be used in concert with the\n * KDWalk routine.  It will create a potential cluster for\n * each sample in the kd-tree that is being walked.  
This\n * potential cluster will then be pushed on the heap.\n * @param context  ClusteringContext (see definition above)\n * @param Cluster  current cluster being visited in kd-tree walk\n * @param Level  level of this cluster in the kd-tree\n */\nstatic void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) {\n  ClusterPair HeapEntry;\n  int next = context->next;\n  context->candidates[next].Cluster = Cluster;\n  HeapEntry.data() = &(context->candidates[next]);\n  context->candidates[next].Neighbor =\n      FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());\n  if (context->candidates[next].Neighbor != nullptr) {\n    context->heap->Push(&HeapEntry);\n    context->next++;\n  }\n} // MakePotentialClusters\n\n/**\n * This routine searches the specified kd-tree for the nearest\n * neighbor of the specified cluster.  It actually uses the\n * kd routines to find the 2 nearest neighbors since one of them\n * will be the original cluster.  A pointer to the nearest\n * neighbor is returned, if it can be found, otherwise nullptr is\n * returned.  
The distance between the 2 nodes is placed\n * in the specified variable.\n * @param Tree    kd-tree to search in for nearest neighbor\n * @param Cluster cluster whose nearest neighbor is to be found\n * @param Distance  ptr to variable to report distance found\n * @return  Pointer to the nearest neighbor of Cluster, or nullptr\n */\nstatic CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)\n#define MAXNEIGHBORS 2\n#define MAXDISTANCE FLT_MAX\n{\n  CLUSTER *Neighbor[MAXNEIGHBORS];\n  float Dist[MAXNEIGHBORS];\n  int NumberOfNeighbors;\n  int32_t i;\n  CLUSTER *BestNeighbor;\n\n  // find the 2 nearest neighbors of the cluster\n  KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,\n                          reinterpret_cast<void **>(Neighbor), Dist);\n\n  // search for the nearest neighbor that is not the cluster itself\n  *Distance = MAXDISTANCE;\n  BestNeighbor = nullptr;\n  for (i = 0; i < NumberOfNeighbors; i++) {\n    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {\n      *Distance = Dist[i];\n      BestNeighbor = Neighbor[i];\n    }\n  }\n  return BestNeighbor;\n} // FindNearestNeighbor\n\n/**\n * This routine creates a new permanent cluster from the\n * clusters specified in TempCluster.  The 2 clusters in\n * TempCluster are marked as \"clustered\" and deleted from\n * the kd-tree.  
The new cluster is then added to the kd-tree.\n * @param Clusterer current clustering environment\n * @param TempCluster potential cluster to make permanent\n * @return Pointer to the new permanent cluster\n */\nstatic CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {\n  // allocate the new cluster and initialize it\n  auto Cluster = new CLUSTER(Clusterer->SampleSize);\n  Cluster->Clustered = false;\n  Cluster->Prototype = false;\n  Cluster->Left = TempCluster->Cluster;\n  Cluster->Right = TempCluster->Neighbor;\n  Cluster->CharID = -1;\n\n  // mark the old clusters as \"clustered\" and delete them from the kd-tree\n  Cluster->Left->Clustered = true;\n  Cluster->Right->Clustered = true;\n  KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);\n  KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);\n\n  // compute the mean and sample count for the new cluster\n  Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,\n                                       Cluster->Left->SampleCount, Cluster->Right->SampleCount,\n                                       &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);\n\n  // add the new cluster to the KD tree\n  KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);\n  return Cluster;\n} // MakeNewCluster\n\n/**\n * This routine merges two clusters into one larger cluster.\n * To do this it computes the number of samples in the new\n * cluster and the mean of the new cluster.  
The ParamDesc\n * information is used to ensure that circular dimensions\n * are handled correctly.\n * @param N # of dimensions (size of arrays)\n * @param ParamDesc array of dimension descriptions\n * @param n1, n2  number of samples in each old cluster\n * @param m array to hold mean of new cluster\n * @param m1, m2  arrays containing means of old clusters\n * @return  The number of samples in the new cluster.\n */\nint32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],\n                      float m1[], float m2[]) {\n  int32_t i, n;\n\n  n = n1 + n2;\n  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {\n    if (ParamDesc->Circular) {\n      // if distance between means is greater than allowed\n      // reduce upper point by one \"rotation\" to compute mean\n      // then normalize the mean back into the accepted range\n      if ((*m2 - *m1) > ParamDesc->HalfRange) {\n        *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;\n        if (*m < ParamDesc->Min) {\n          *m += ParamDesc->Range;\n        }\n      } else if ((*m1 - *m2) > ParamDesc->HalfRange) {\n        *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;\n        if (*m < ParamDesc->Min) {\n          *m += ParamDesc->Range;\n        }\n      } else {\n        *m = (n1 * *m1 + n2 * *m2) / n;\n      }\n    } else {\n      *m = (n1 * *m1 + n2 * *m2) / n;\n    }\n  }\n  return n;\n} // MergeClusters\n\n/**\n * This routine decides which clusters in the cluster tree\n * should be represented by prototypes, forms a list of these\n * prototypes, and places the list in the Clusterer data\n * structure.\n * @param Clusterer data structure holding cluster tree\n * @param Config    parameters used to control prototype generation\n */\nstatic void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {\n  LIST ClusterStack = NIL_LIST;\n  CLUSTER *Cluster;\n  PROTOTYPE *Prototype;\n\n  // use a stack to keep track of clusters waiting to be processed\n  
// initially the only cluster on the stack is the root cluster\n  if (Clusterer->Root != nullptr) {\n    ClusterStack = push(NIL_LIST, Clusterer->Root);\n  }\n\n  // loop until we have analyzed all clusters which are potential prototypes\n  while (ClusterStack != NIL_LIST) {\n    // remove the next cluster to be analyzed from the stack\n    // try to make a prototype from the cluster\n    // if successful, put it on the proto list, else split the cluster\n    Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());\n    ClusterStack = pop(ClusterStack);\n    Prototype = MakePrototype(Clusterer, Config, Cluster);\n    if (Prototype != nullptr) {\n      Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);\n    } else {\n      ClusterStack = push(ClusterStack, Cluster->Right);\n      ClusterStack = push(ClusterStack, Cluster->Left);\n    }\n  }\n} // ComputePrototypes\n\n/**\n * This routine attempts to create a prototype from the\n * specified cluster that conforms to the distribution\n * specified in Config.  If there are too few samples in the\n * cluster to perform a statistical analysis, then a prototype\n * is generated but labelled as insignificant.  If the\n * dimensions of the cluster are not independent, no prototype\n * is generated and nullptr is returned.  
If a prototype can be\n * found that matches the desired distribution then a pointer\n * to it is returned, otherwise nullptr is returned.\n * @param Clusterer data structure holding cluster tree\n * @param Config  parameters used to control prototype generation\n * @param Cluster cluster to be made into a prototype\n * @return  Pointer to new prototype or nullptr\n */\nstatic PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {\n  PROTOTYPE *Proto;\n  BUCKETS *Buckets;\n\n  // filter out clusters which contain samples from the same character\n  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {\n    return nullptr;\n  }\n\n  // compute the covariance matrix and ranges for the cluster\n  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);\n\n  // check for degenerate clusters which need not be analyzed further\n  // note that the MinSamples test assumes that all clusters with multiple\n  // character samples have been removed (as above)\n  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,\n                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));\n  if (Proto != nullptr) {\n    delete Statistics;\n    return Proto;\n  }\n  // check to ensure that all dimensions are independent\n  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],\n                   Config->Independence)) {\n    delete Statistics;\n    return nullptr;\n  }\n\n  if (HOTELLING && Config->ProtoStyle == elliptical) {\n    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);\n    if (Proto != nullptr) {\n      delete Statistics;\n      return Proto;\n    }\n  }\n\n  // create a histogram data structure used to evaluate distributions\n  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);\n\n  // create a prototype based on the statistics and test it\n  
switch (Config->ProtoStyle) {\n    case spherical:\n      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);\n      break;\n    case elliptical:\n      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);\n      break;\n    case mixed:\n      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);\n      break;\n    case automatic:\n      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);\n      if (Proto != nullptr) {\n        break;\n      }\n      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);\n      if (Proto != nullptr) {\n        break;\n      }\n      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);\n      break;\n  }\n  delete Statistics;\n  return Proto;\n} // MakePrototype\n\n/**\n * This routine checks for clusters which are degenerate and\n * therefore cannot be analyzed in a statistically valid way.\n * A cluster is defined as degenerate if it does not have at\n * least MINSAMPLESNEEDED samples in it.  If the cluster is\n * found to be degenerate, a prototype of the specified style\n * is generated and marked as insignificant.  
A cluster is\n * also degenerate if it does not have at least MinSamples\n * samples in it.\n *\n * If the cluster is not degenerate, nullptr is returned.\n *\n * @param N   number of dimensions\n * @param Cluster   cluster being analyzed\n * @param Statistics  statistical info about cluster\n * @param Style   type of prototype to be generated\n * @param MinSamples  minimum number of samples in a cluster\n * @return  Pointer to degenerate prototype or nullptr.\n */\nstatic PROTOTYPE *MakeDegenerateProto( // this was MinSample\n    uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) {\n  PROTOTYPE *Proto = nullptr;\n\n  if (MinSamples < MINSAMPLESNEEDED) {\n    MinSamples = MINSAMPLESNEEDED;\n  }\n\n  if (Cluster->SampleCount < MinSamples) {\n    switch (Style) {\n      case spherical:\n        Proto = NewSphericalProto(N, Cluster, Statistics);\n        break;\n      case elliptical:\n      case automatic:\n        Proto = NewEllipticalProto(N, Cluster, Statistics);\n        break;\n      case mixed:\n        Proto = NewMixedProto(N, Cluster, Statistics);\n        break;\n    }\n    Proto->Significant = false;\n  }\n  return (Proto);\n} // MakeDegenerateProto\n\n/**\n * This routine tests the specified cluster to see if **\n * there is a statistically significant difference between\n * the sub-clusters that would be made if the cluster were to\n * be split. If not, then a new prototype is formed and\n * returned to the caller. 
If there is, then nullptr is returned\n * to the caller.\n * @param Clusterer data struct containing samples being clustered\n * @param Config provides the magic number of samples that make a good cluster\n * @param Cluster   cluster to be made into an elliptical prototype\n * @param Statistics  statistical info about cluster\n * @return Pointer to new elliptical prototype or nullptr.\n */\nstatic PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,\n                                      STATISTICS *Statistics) {\n  // Fraction of the number of samples used as a range around 1 within\n  // which a cluster has the magic size that allows a boost to the\n  // FTable by kFTableBoostMargin, thus allowing clusters near the\n  // magic size (equal to the number of sample characters) to be more\n  // likely to stay together.\n  const double kMagicSampleMargin = 0.0625;\n  const double kFTableBoostMargin = 2.0;\n\n  int N = Clusterer->SampleSize;\n  CLUSTER *Left = Cluster->Left;\n  CLUSTER *Right = Cluster->Right;\n  if (Left == nullptr || Right == nullptr) {\n    return nullptr;\n  }\n  int TotalDims = Left->SampleCount + Right->SampleCount;\n  if (TotalDims < N + 1 || TotalDims < 2) {\n    return nullptr;\n  }\n  std::vector<float> Covariance(static_cast<size_t>(N) * N);\n  std::vector<float> Inverse(static_cast<size_t>(N) * N);\n  std::vector<float> Delta(N);\n  // Compute a new covariance matrix that only uses essential features.\n  for (int i = 0; i < N; ++i) {\n    int row_offset = i * N;\n    if (!Clusterer->ParamDesc[i].NonEssential) {\n      for (int j = 0; j < N; ++j) {\n        if (!Clusterer->ParamDesc[j].NonEssential) {\n          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];\n        } else {\n          Covariance[j + row_offset] = 0.0f;\n        }\n      }\n    } else {\n      for (int j = 0; j < N; ++j) {\n        if (i == j) {\n          Covariance[j + row_offset] = 1.0f;\n        } else {\n   
       Covariance[j + row_offset] = 0.0f;\n        }\n      }\n    }\n  }\n  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);\n  if (err > 1) {\n    tprintf(\"Clustering error: Matrix inverse failed with error %g\\n\", err);\n  }\n  int EssentialN = 0;\n  for (int dim = 0; dim < N; ++dim) {\n    if (!Clusterer->ParamDesc[dim].NonEssential) {\n      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];\n      ++EssentialN;\n    } else {\n      Delta[dim] = 0.0f;\n    }\n  }\n  // Compute Hotelling's T-squared.\n  double Tsq = 0.0;\n  for (int x = 0; x < N; ++x) {\n    double temp = 0.0;\n    for (int y = 0; y < N; ++y) {\n      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];\n    }\n    Tsq += Delta[x] * temp;\n  }\n  // Changed this function to match the formula in\n  // Statistical Methods in Medical Research p 473\n  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.\n  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;\n  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);\n  int Fx = EssentialN;\n  if (Fx > FTABLE_X) {\n    Fx = FTABLE_X;\n  }\n  --Fx;\n  int Fy = TotalDims - EssentialN - 1;\n  if (Fy > FTABLE_Y) {\n    Fy = FTABLE_Y;\n  }\n  --Fy;\n  double FTarget = FTable[Fy][Fx];\n  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&\n      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {\n    // Give magic-sized clusters a magic FTable boost.\n    FTarget += kFTableBoostMargin;\n  }\n  if (F < FTarget) {\n    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);\n  }\n  return nullptr;\n}\n\n/**\n * This routine tests the specified cluster to see if it can\n * be approximated by a spherical normal distribution.  If it\n * can be, then a new prototype is formed and returned to the\n * caller.  
If it can't be, then nullptr is returned to the caller.\n * @param Clusterer data struct containing samples being clustered\n * @param Cluster   cluster to be made into a spherical prototype\n * @param Statistics  statistical info about cluster\n * @param Buckets   histogram struct used to analyze distribution\n * @return  Pointer to new spherical prototype or nullptr.\n */\nstatic PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,\n                                     BUCKETS *Buckets) {\n  PROTOTYPE *Proto = nullptr;\n  int i;\n\n  // check that each dimension is a normal distribution\n  for (i = 0; i < Clusterer->SampleSize; i++) {\n    if (Clusterer->ParamDesc[i].NonEssential) {\n      continue;\n    }\n\n    FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],\n                sqrt(static_cast<double>(Statistics->AvgVariance)));\n    if (!DistributionOK(Buckets)) {\n      break;\n    }\n  }\n  // if all dimensions matched a normal distribution, make a proto\n  if (i >= Clusterer->SampleSize) {\n    Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);\n  }\n  return (Proto);\n} // MakeSphericalProto\n\n/**\n * This routine tests the specified cluster to see if it can\n * be approximated by an elliptical normal distribution.  If it\n * can be, then a new prototype is formed and returned to the\n * caller.  
If it can't be, then nullptr is returned to the caller.\n * @param Clusterer data struct containing samples being clustered\n * @param Cluster   cluster to be made into an elliptical prototype\n * @param Statistics  statistical info about cluster\n * @param Buckets   histogram struct used to analyze distribution\n * @return  Pointer to new elliptical prototype or nullptr.\n */\nstatic PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,\n                                      STATISTICS *Statistics, BUCKETS *Buckets) {\n  PROTOTYPE *Proto = nullptr;\n  int i;\n\n  // check that each dimension is a normal distribution\n  for (i = 0; i < Clusterer->SampleSize; i++) {\n    if (Clusterer->ParamDesc[i].NonEssential) {\n      continue;\n    }\n\n    FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],\n                sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));\n    if (!DistributionOK(Buckets)) {\n      break;\n    }\n  }\n  // if all dimensions matched a normal distribution, make a proto\n  if (i >= Clusterer->SampleSize) {\n    Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);\n  }\n  return (Proto);\n} // MakeEllipticalProto\n\n/**\n * This routine tests each dimension of the specified cluster to\n * see what distribution would best approximate that dimension.\n * Each dimension is compared to the following distributions\n * in order: normal, random, uniform.  If each dimension can\n * be represented by one of these distributions,\n * then a new prototype is formed and returned to the\n * caller.  
If it can't be, then nullptr is returned to the caller.\n * @param Clusterer data struct containing samples being clustered\n * @param Cluster   cluster to be made into a prototype\n * @param Statistics  statistical info about cluster\n * @param NormalBuckets histogram struct used to analyze distribution\n * @param Confidence  confidence level for alternate distributions\n * @return  Pointer to new mixed prototype or nullptr.\n */\nstatic PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,\n                                 BUCKETS *NormalBuckets, double Confidence) {\n  PROTOTYPE *Proto;\n  int i;\n  BUCKETS *UniformBuckets = nullptr;\n  BUCKETS *RandomBuckets = nullptr;\n\n  // create a mixed proto to work on - initially assume all dimensions normal\n  Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);\n\n  // find the proper distribution for each dimension\n  for (i = 0; i < Clusterer->SampleSize; i++) {\n    if (Clusterer->ParamDesc[i].NonEssential) {\n      continue;\n    }\n\n    FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],\n                std::sqrt(Proto->Variance.Elliptical[i]));\n    if (DistributionOK(NormalBuckets)) {\n      continue;\n    }\n\n    if (RandomBuckets == nullptr) {\n      RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);\n    }\n    MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));\n    FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],\n                Proto->Variance.Elliptical[i]);\n    if (DistributionOK(RandomBuckets)) {\n      continue;\n    }\n\n    if (UniformBuckets == nullptr) {\n      UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);\n    }\n    MakeDimUniform(i, Proto, Statistics);\n    FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],\n                Proto->Variance.Elliptical[i]);\n    if 
(DistributionOK(UniformBuckets)) {\n      continue;\n    }\n    break;\n  }\n  // if any dimension failed to match a distribution, discard the proto\n  if (i < Clusterer->SampleSize) {\n    FreePrototype(Proto);\n    Proto = nullptr;\n  }\n  return (Proto);\n} // MakeMixedProto\n\n/**\n * This routine alters the ith dimension of the specified\n * mixed prototype to be D_random.\n * @param i index of dimension to be changed\n * @param Proto prototype whose dimension is to be altered\n * @param ParamDesc description of specified dimension\n */\nstatic void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {\n  Proto->Distrib[i] = D_random;\n  Proto->Mean[i] = ParamDesc->MidRange;\n  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;\n\n  // subtract out the previous magnitude of this dimension from the total\n  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];\n  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;\n  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];\n  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));\n\n  // note that the proto Weight is irrelevant for D_random protos\n} // MakeDimRandom\n\n/**\n * This routine alters the ith dimension of the specified\n * mixed prototype to be uniform.\n * @param i index of dimension to be changed\n * @param Proto   prototype whose dimension is to be altered\n * @param Statistics  statistical info about prototype\n */\nstatic void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {\n  Proto->Distrib[i] = uniform;\n  Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;\n  Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;\n  if (Proto->Variance.Elliptical[i] < MINVARIANCE) {\n    Proto->Variance.Elliptical[i] = MINVARIANCE;\n  }\n\n  // subtract out the previous magnitude of this dimension from the total\n  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];\n  
Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);\n  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];\n  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));\n\n  // note that the proto Weight is irrelevant for uniform protos\n} // MakeDimUniform\n\n/**\n * This routine searches the cluster tree for all leaf nodes\n * which are samples in the specified cluster.  It computes\n * a full covariance matrix for these samples as well as\n * keeping track of the ranges (min and max) for each\n * dimension.  A special data structure is allocated to\n * return this information to the caller.  An incremental\n * algorithm for computing statistics is not used because\n * it will not work with circular dimensions.\n * @param N number of dimensions\n * @param ParamDesc array of dimension descriptions\n * @param Cluster cluster whose stats are to be computed\n * @return  Pointer to new data structure containing statistics\n */\nstatic STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {\n  int i, j;\n  LIST SearchState;\n  SAMPLE *Sample;\n  uint32_t SampleCountAdjustedForBias;\n\n  // allocate memory to hold the statistics results\n  auto Statistics = new STATISTICS(N);\n\n  // allocate temporary memory to hold the sample to mean distances\n  std::vector<float> Distance(N);\n\n  // find each sample in the cluster and merge it into the statistics\n  InitSampleSearch(SearchState, Cluster);\n  while ((Sample = NextSample(&SearchState)) != nullptr) {\n    for (i = 0; i < N; i++) {\n      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];\n      if (ParamDesc[i].Circular) {\n        if (Distance[i] > ParamDesc[i].HalfRange) {\n          Distance[i] -= ParamDesc[i].Range;\n        }\n        if (Distance[i] < -ParamDesc[i].HalfRange) {\n          Distance[i] += ParamDesc[i].Range;\n        }\n      }\n      if (Distance[i] < Statistics->Min[i]) {\n        Statistics->Min[i] = Distance[i];\n      }\n  
    if (Distance[i] > Statistics->Max[i]) {\n        Statistics->Max[i] = Distance[i];\n      }\n    }\n    auto CoVariance = &Statistics->CoVariance[0];\n    for (i = 0; i < N; i++) {\n      for (j = 0; j < N; j++, CoVariance++) {\n        *CoVariance += Distance[i] * Distance[j];\n      }\n    }\n  }\n  // normalize the variances by the total number of samples\n  // use SampleCount-1 instead of SampleCount to get an unbiased estimate\n  // also compute the geometic mean of the diagonal variances\n  // ensure that clusters with only 1 sample are handled correctly\n  if (Cluster->SampleCount > 1) {\n    SampleCountAdjustedForBias = Cluster->SampleCount - 1;\n  } else {\n    SampleCountAdjustedForBias = 1;\n  }\n  auto CoVariance = &Statistics->CoVariance[0];\n  for (i = 0; i < N; i++) {\n    for (j = 0; j < N; j++, CoVariance++) {\n      *CoVariance /= SampleCountAdjustedForBias;\n      if (j == i) {\n        if (*CoVariance < MINVARIANCE) {\n          *CoVariance = MINVARIANCE;\n        }\n        Statistics->AvgVariance *= *CoVariance;\n      }\n    }\n  }\n  Statistics->AvgVariance =\n      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));\n\n  return Statistics;\n} // ComputeStatistics\n\n/**\n * This routine creates a spherical prototype data structure to\n * approximate the samples in the specified cluster.\n * Spherical prototypes have a single variance which is\n * common across all dimensions.  
All dimensions are normally\n * distributed and independent.\n * @param N number of dimensions\n * @param Cluster cluster to be made into a spherical prototype\n * @param Statistics  statistical info about samples in cluster\n * @return  Pointer to a new spherical prototype data structure\n */\nstatic PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {\n  PROTOTYPE *Proto;\n\n  Proto = NewSimpleProto(N, Cluster);\n\n  Proto->Variance.Spherical = Statistics->AvgVariance;\n  if (Proto->Variance.Spherical < MINVARIANCE) {\n    Proto->Variance.Spherical = MINVARIANCE;\n  }\n\n  Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);\n  Proto->TotalMagnitude = static_cast<float>(\n      pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N)));\n  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;\n  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));\n\n  return (Proto);\n} // NewSphericalProto\n\n/**\n * This routine creates an elliptical prototype data structure to\n * approximate the samples in the specified cluster.\n * Elliptical prototypes have a variance for each dimension.\n * All dimensions are normally distributed and independent.\n * @param N number of dimensions\n * @param Cluster cluster to be made into an elliptical prototype\n * @param Statistics  statistical info about samples in cluster\n * @return  Pointer to a new elliptical prototype data structure\n */\nstatic PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {\n  PROTOTYPE *Proto;\n  int i;\n\n  Proto = NewSimpleProto(N, Cluster);\n  Proto->Variance.Elliptical = new float[N];\n  Proto->Magnitude.Elliptical = new float[N];\n  Proto->Weight.Elliptical = new float[N];\n\n  auto CoVariance = &Statistics->CoVariance[0];\n  Proto->TotalMagnitude = 1.0;\n  for (i = 0; i < N; i++, CoVariance += N + 1) {\n    Proto->Variance.Elliptical[i] = *CoVariance;\n    if 
(Proto->Variance.Elliptical[i] < MINVARIANCE) {\n      Proto->Variance.Elliptical[i] = MINVARIANCE;\n    }\n\n    Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);\n    Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];\n    Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];\n  }\n  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));\n  Proto->Style = elliptical;\n  return (Proto);\n} // NewEllipticalProto\n\n/**\n * This routine creates a mixed prototype data structure to\n * approximate the samples in the specified cluster.\n * Mixed prototypes can have different distributions for\n * each dimension.  All dimensions are independent.  The\n * structure is initially filled in as though it were an\n * elliptical prototype.  The actual distributions of the\n * dimensions can be altered by other routines.\n * @param N number of dimensions\n * @param Cluster cluster to be made into a mixed prototype\n * @param Statistics  statistical info about samples in cluster\n * @return  Pointer to a new mixed prototype data structure\n */\nstatic PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {\n  auto Proto = NewEllipticalProto(N, Cluster, Statistics);\n  Proto->Distrib.clear();\n  Proto->Distrib.resize(N, normal);\n  Proto->Style = mixed;\n  return Proto;\n} // NewMixedProto\n\n/**\n * This routine allocates memory to hold a simple prototype\n * data structure, i.e. 
one without independent distributions\n * and variances for each dimension.\n * @param N number of dimensions\n * @param Cluster cluster to be made into a prototype\n * @return  Pointer to new simple prototype\n */\nstatic PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {\n  auto Proto = new PROTOTYPE;\n  Proto->Mean = Cluster->Mean;\n  Proto->Distrib.clear();\n  Proto->Significant = true;\n  Proto->Merged = false;\n  Proto->Style = spherical;\n  Proto->NumSamples = Cluster->SampleCount;\n  Proto->Cluster = Cluster;\n  Proto->Cluster->Prototype = true;\n  return Proto;\n} // NewSimpleProto\n\n/**\n * This routine returns true if the specified covariance\n * matrix indicates that all N dimensions are independent of\n * one another.  One dimension is judged to be independent of\n * another when the magnitude of the corresponding correlation\n * coefficient is\n * less than the specified Independence factor.  The\n * correlation coefficient is calculated as: (see Duda and\n * Hart, pg. 
247)\n * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])\n * The covariance matrix is assumed to be symmetric (which\n * should always be true).\n * @param ParamDesc descriptions of each feature space dimension\n * @param N number of dimensions\n * @param CoVariance  ptr to a covariance matrix\n * @param Independence  max off-diagonal correlation coefficient\n * @return true if dimensions are independent, false otherwise\n */\nstatic bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) {\n  int i, j;\n  float *VARii; // points to ith on-diagonal element\n  float *VARjj; // points to jth on-diagonal element\n  float CorrelationCoeff;\n\n  VARii = CoVariance;\n  for (i = 0; i < N; i++, VARii += N + 1) {\n    if (ParamDesc[i].NonEssential) {\n      continue;\n    }\n\n    VARjj = VARii + N + 1;\n    CoVariance = VARii + 1;\n    for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {\n      if (ParamDesc[j].NonEssential) {\n        continue;\n      }\n\n      if ((*VARii == 0.0) || (*VARjj == 0.0)) {\n        CorrelationCoeff = 0.0;\n      } else {\n        CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));\n      }\n      if (CorrelationCoeff > Independence) {\n        return false;\n      }\n    }\n  }\n  return true;\n} // Independent\n\n/**\n * This routine returns a histogram data structure which can\n * be used by other routines to place samples into histogram\n * buckets, and then apply a goodness of fit test to the\n * histogram data to determine if the samples belong to the\n * specified probability distribution.  
The routine keeps\n * a list of bucket data structures which have already been\n * created so that it minimizes the computation time needed\n * to create a new bucket.\n * @param clusterer  which keeps a bucket_cache for us.\n * @param Distribution  type of probability distribution to test for\n * @param SampleCount number of samples that are available\n * @param Confidence  probability of a Type I error\n * @return  Bucket data structure\n */\nstatic BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,\n                           double Confidence) {\n  // Get an old bucket structure with the same number of buckets.\n  uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);\n  BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];\n\n  // If a matching bucket structure is not found, make one and save it.\n  if (Buckets == nullptr) {\n    Buckets = MakeBuckets(Distribution, SampleCount, Confidence);\n    clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets;\n  } else {\n    // Just adjust the existing buckets.\n    if (SampleCount != Buckets->SampleCount) {\n      AdjustBuckets(Buckets, SampleCount);\n    }\n    if (Confidence != Buckets->Confidence) {\n      Buckets->Confidence = Confidence;\n      Buckets->ChiSquared =\n          ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);\n    }\n    InitBuckets(Buckets);\n  }\n  return Buckets;\n} // GetBuckets\n\n/**\n * This routine creates a histogram data structure which can\n * be used by other routines to place samples into histogram\n * buckets, and then apply a goodness of fit test to the\n * histogram data to determine if the samples belong to the\n * specified probability distribution.  The buckets are\n * allocated in such a way that the expected frequency of\n * samples in each bucket is approximately the same.  
In\n * order to make this possible, a mapping table is\n * computed which maps \"normalized\" samples into the\n * appropriate bucket.\n * @param Distribution  type of probability distribution to test for\n * @param SampleCount number of samples that are available\n * @param Confidence  probability of a Type I error\n * @return Pointer to new histogram data structure\n */\nstatic BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {\n  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};\n  int i, j;\n  double BucketProbability;\n  double NextBucketBoundary;\n  double Probability;\n  double ProbabilityDelta;\n  double LastProbDensity;\n  double ProbDensity;\n  uint16_t CurrentBucket;\n  bool Symmetrical;\n\n  // allocate memory needed for data structure\n  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));\n  Buckets->SampleCount = SampleCount;\n  Buckets->Confidence = Confidence;\n\n  // initialize simple fields\n  Buckets->Distribution = Distribution;\n\n  // all currently defined distributions are symmetrical\n  Symmetrical = true;\n  Buckets->ChiSquared =\n      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);\n\n  if (Symmetrical) {\n    // allocate buckets so that all have approx. 
equal probability\n    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);\n\n    // distribution is symmetric so fill in upper half then copy\n    CurrentBucket = Buckets->NumberOfBuckets / 2;\n    if (Odd(Buckets->NumberOfBuckets)) {\n      NextBucketBoundary = BucketProbability / 2;\n    } else {\n      NextBucketBoundary = BucketProbability;\n    }\n\n    Probability = 0.0;\n    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);\n    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {\n      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);\n      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);\n      Probability += ProbabilityDelta;\n      if (Probability > NextBucketBoundary) {\n        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {\n          CurrentBucket++;\n        }\n        NextBucketBoundary += BucketProbability;\n      }\n      Buckets->Bucket[i] = CurrentBucket;\n      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);\n      LastProbDensity = ProbDensity;\n    }\n    // place any leftover probability into the last bucket\n    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);\n\n    // copy upper half of distribution to lower half\n    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {\n      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);\n    }\n\n    // copy upper half of expected counts to lower half\n    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {\n      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];\n    }\n  }\n  return Buckets;\n} // MakeBuckets\n\n/**\n * This routine computes the optimum number of histogram\n * buckets that should be used in a chi-squared goodness of\n * fit test for the specified number of samples.  The optimum\n * number is computed based on Table 4.1 on pg. 
147 of\n * \"Measurement and Analysis of Random Data\" by Bendat & Piersol.\n * Linear interpolation is used to interpolate between table\n * values.  The table is intended for a 0.05 level of\n * significance (alpha).  This routine assumes that it is\n * equally valid for other alpha's, which may not be true.\n * @param SampleCount number of samples to be tested\n * @return Optimum number of histogram buckets\n */\nstatic uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {\n  uint8_t Last, Next;\n  float Slope;\n\n  if (SampleCount < kCountTable[0]) {\n    return kBucketsTable[0];\n  }\n\n  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {\n    if (SampleCount <= kCountTable[Next]) {\n      Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /\n              static_cast<float>(kCountTable[Next] - kCountTable[Last]);\n      return (\n          static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));\n    }\n  }\n  return kBucketsTable[Last];\n} // OptimumNumberOfBuckets\n\n/**\n * This routine computes the chi-squared value which will\n * leave a cumulative probability of Alpha in the right tail\n * of a chi-squared distribution with the specified number of\n * degrees of freedom.  Alpha must be between 0 and 1.\n * DegreesOfFreedom must be even.  The routine maintains an\n * array of lists.  Each list corresponds to a different\n * number of degrees of freedom.  Each entry in the list\n * corresponds to a different alpha value and its corresponding\n * chi-squared value.  
Therefore, once a particular chi-squared\n * value is computed, it is stored in the list and never\n * needs to be computed again.\n * @param DegreesOfFreedom  determines shape of distribution\n * @param Alpha probability of right tail\n * @return Desired chi-squared value\n */\nstatic double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)\n#define CHIACCURACY 0.01\n#define MINALPHA (1e-200)\n{\n  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];\n\n  // limit the minimum alpha that can be used - if alpha is too small\n  //      it may not be possible to compute chi-squared.\n  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);\n  if (Odd(DegreesOfFreedom)) {\n    DegreesOfFreedom++;\n  }\n\n  /* find the list of chi-squared values which have already been computed\n   for the specified number of degrees of freedom.  Search the list for\n   the desired chi-squared. */\n  CHISTRUCT SearchKey(0.0, Alpha);\n  auto *found = search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch);\n  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(found ? 
found->first_node() : nullptr);\n\n  if (OldChiSquared == nullptr) {\n    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);\n    OldChiSquared->ChiSquared =\n        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);\n    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);\n  } else {\n    // further optimization might move OldChiSquared to front of list\n  }\n\n  return (OldChiSquared->ChiSquared);\n\n} // ComputeChiSquared\n\n/**\n * This routine computes the probability density function\n * of a discrete normal distribution defined by the global\n * variables kNormalMean, kNormalVariance, and kNormalMagnitude.\n * Normal magnitude could, of course, be computed in terms of\n * the normal variance but it is precomputed for efficiency.\n * @param x number to compute the normal probability density for\n * @note Globals:\n *    kNormalMean mean of a discrete normal distribution\n *    kNormalVariance variance of a discrete normal distribution\n *    kNormalMagnitude  magnitude of a discrete normal distribution\n * @return  The value of the normal distribution at x.\n */\nstatic double NormalDensity(int32_t x) {\n  double Distance;\n\n  Distance = x - kNormalMean;\n  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);\n} // NormalDensity\n\n/**\n * This routine computes the probability density function\n * of a uniform distribution at the specified point.  
The\n * range of the distribution is from 0 to BUCKETTABLESIZE.\n * @param x number to compute the uniform probability density for\n * @return The value of the uniform distribution at x.\n */\nstatic double UniformDensity(int32_t x) {\n  constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;\n\n  if ((x >= 0) && (x <= BUCKETTABLESIZE)) {\n    return UniformDistributionDensity;\n  } else {\n    return 0.0;\n  }\n} // UniformDensity\n\n/**\n * This routine computes a trapezoidal approximation to the\n * integral of a function over a small delta in x.\n * @param f1  value of function at x1\n * @param f2  value of function at x2\n * @param Dx  x2 - x1 (should always be positive)\n * @return Approximation of the integral of the function from x1 to x2.\n */\nstatic double Integral(double f1, double f2, double Dx) {\n  return (f1 + f2) * Dx / 2.0;\n} // Integral\n\n/**\n * This routine counts the number of cluster samples which\n * fall within the various histogram buckets in Buckets.  Only\n * one dimension of each sample is examined.  The exact meaning\n * of the Mean and StdDev parameters depends on the\n * distribution which is being analyzed (this info is in the\n * Buckets data structure).  For normal distributions, Mean\n * and StdDev have the expected meanings.  For uniform and\n * random distributions the Mean is the center point of the\n * range and the StdDev is 1/2 the range.  
A dimension with\n * zero standard deviation cannot be statistically analyzed.\n * In this case, a pseudo-analysis is used.\n * The Buckets data structure is filled in.\n * @param Buckets histogram buckets to count samples\n * @param Cluster cluster whose samples are being analyzed\n * @param Dim dimension of samples which is being analyzed\n * @param ParamDesc description of the dimension\n * @param Mean  \"mean\" of the distribution\n * @param StdDev  \"standard deviation\" of the distribution\n */\nstatic void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,\n                        float Mean, float StdDev) {\n  uint16_t BucketID;\n  int i;\n  LIST SearchState;\n  SAMPLE *Sample;\n\n  // initialize the histogram bucket counts to 0\n  for (i = 0; i < Buckets->NumberOfBuckets; i++) {\n    Buckets->Count[i] = 0;\n  }\n\n  if (StdDev == 0.0) {\n    /* if the standard deviation is zero, then we can't statistically\n   analyze the cluster.  Use a pseudo-analysis: samples exactly on\n   the mean are distributed evenly across all buckets.  Samples greater\n   than the mean are placed in the last bucket; samples less than the\n   mean are placed in the first bucket. 
*/\n\n    InitSampleSearch(SearchState, Cluster);\n    i = 0;\n    while ((Sample = NextSample(&SearchState)) != nullptr) {\n      if (Sample->Mean[Dim] > Mean) {\n        BucketID = Buckets->NumberOfBuckets - 1;\n      } else if (Sample->Mean[Dim] < Mean) {\n        BucketID = 0;\n      } else {\n        BucketID = i;\n      }\n      Buckets->Count[BucketID] += 1;\n      i++;\n      if (i >= Buckets->NumberOfBuckets) {\n        i = 0;\n      }\n    }\n  } else {\n    // search for all samples in the cluster and add to histogram buckets\n    InitSampleSearch(SearchState, Cluster);\n    while ((Sample = NextSample(&SearchState)) != nullptr) {\n      switch (Buckets->Distribution) {\n        case normal:\n          BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);\n          break;\n        case D_random:\n        case uniform:\n          BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);\n          break;\n        default:\n          BucketID = 0;\n      }\n      Buckets->Count[Buckets->Bucket[BucketID]] += 1;\n    }\n  }\n} // FillBuckets\n\n/**\n * This routine determines which bucket x falls into in the\n * discrete normal distribution defined by kNormalMean\n * and kNormalStdDev.  
x values which exceed the range of\n * the discrete distribution are clipped.\n * @param ParamDesc used to identify circular dimensions\n * @param x value to be normalized\n * @param Mean  mean of normal distribution\n * @param StdDev  standard deviation of normal distribution\n * @return Bucket number into which x falls\n */\nstatic uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {\n  float X;\n\n  // wraparound circular parameters if necessary\n  if (ParamDesc->Circular) {\n    if (x - Mean > ParamDesc->HalfRange) {\n      x -= ParamDesc->Range;\n    } else if (x - Mean < -ParamDesc->HalfRange) {\n      x += ParamDesc->Range;\n    }\n  }\n\n  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;\n  if (X < 0) {\n    return 0;\n  }\n  if (X > BUCKETTABLESIZE - 1) {\n    return (static_cast<uint16_t>(BUCKETTABLESIZE - 1));\n  }\n  return static_cast<uint16_t>(floor(static_cast<double>(X)));\n} // NormalBucket\n\n/**\n * This routine determines which bucket x falls into in the\n * discrete uniform distribution defined by\n * BUCKETTABLESIZE.  
x values which exceed the range of\n * the discrete distribution are clipped.\n * @param ParamDesc used to identify circular dimensions\n * @param x value to be normalized\n * @param Mean  center of range of uniform distribution\n * @param StdDev  1/2 the range of the uniform distribution\n * @return Bucket number into which x falls\n */\nstatic uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {\n  float X;\n\n  // wraparound circular parameters if necessary\n  if (ParamDesc->Circular) {\n    if (x - Mean > ParamDesc->HalfRange) {\n      x -= ParamDesc->Range;\n    } else if (x - Mean < -ParamDesc->HalfRange) {\n      x += ParamDesc->Range;\n    }\n  }\n\n  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);\n  if (X < 0) {\n    return 0;\n  }\n  if (X > BUCKETTABLESIZE - 1) {\n    return static_cast<uint16_t>(BUCKETTABLESIZE - 1);\n  }\n  return static_cast<uint16_t>(floor(static_cast<double>(X)));\n} // UniformBucket\n\n/**\n * This routine performs a chi-square goodness of fit test\n * on the histogram data in the Buckets data structure.\n * true is returned if the histogram matches the probability\n * distribution which was specified when the Buckets\n * structure was originally created.  
Otherwise false is\n * returned.\n * @param Buckets   histogram data to perform chi-square test on\n * @return true if samples match distribution, false otherwise\n */\nstatic bool DistributionOK(BUCKETS *Buckets) {\n  float FrequencyDifference;\n  float TotalDifference;\n  int i;\n\n  // compute how well the histogram matches the expected histogram\n  TotalDifference = 0.0;\n  for (i = 0; i < Buckets->NumberOfBuckets; i++) {\n    FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];\n    TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];\n  }\n\n  // test to see if the difference is more than expected\n  if (TotalDifference > Buckets->ChiSquared) {\n    return false;\n  } else {\n    return true;\n  }\n} // DistributionOK\n\n/**\n * This routine computes the degrees of freedom that should\n * be used in a chi-squared test with the specified number of\n * histogram buckets.  The result is always rounded up to\n * the next even number so that the value of chi-squared can be\n * computed more easily.  
This will cause the value of\n * chi-squared to be higher than the optimum value, resulting\n * in the chi-square test being more lenient than optimum.\n * @param Distribution    distribution being tested for\n * @param HistogramBuckets  number of buckets in chi-square test\n * @return The number of degrees of freedom for a chi-square test\n */\nstatic uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {\n  static uint8_t DegreeOffsets[] = {3, 3, 1};\n\n  uint16_t AdjustedNumBuckets;\n\n  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];\n  if (Odd(AdjustedNumBuckets)) {\n    AdjustedNumBuckets++;\n  }\n  return (AdjustedNumBuckets);\n\n} // DegreesOfFreedom\n\n/**\n * This routine multiplies each ExpectedCount histogram entry\n * by NewSampleCount/OldSampleCount so that the histogram\n * is now adjusted to the new sample count.\n * @param Buckets histogram data structure to adjust\n * @param NewSampleCount  new sample count to adjust to\n */\nstatic void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {\n  int i;\n  double AdjustFactor;\n\n  AdjustFactor =\n      ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount)));\n\n  for (i = 0; i < Buckets->NumberOfBuckets; i++) {\n    Buckets->ExpectedCount[i] *= AdjustFactor;\n  }\n\n  Buckets->SampleCount = NewSampleCount;\n\n} // AdjustBuckets\n\n/**\n * This routine sets the bucket counts in the specified histogram\n * to zero.\n * @param Buckets histogram data structure to init\n */\nstatic void InitBuckets(BUCKETS *Buckets) {\n  int i;\n\n  for (i = 0; i < Buckets->NumberOfBuckets; i++) {\n    Buckets->Count[i] = 0;\n  }\n\n} // InitBuckets\n\n/**\n * This routine is used to search a list of structures which\n * hold pre-computed chi-squared values for a chi-squared\n * value whose corresponding alpha field matches the alpha\n * field of SearchKey.\n *\n * It is called by the list search routines.\n *\n * @param 
arg1 chi-squared struct being tested for a match\n * @param arg2 chi-squared struct that is the search key\n * @return true if ChiStruct's Alpha matches SearchKey's Alpha\n */\nstatic int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,\n                      void *arg2) { // CHISTRUCT *SearchKey)\n  auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);\n  auto *SearchKey = static_cast<CHISTRUCT *>(arg2);\n\n  return (ChiStruct->Alpha == SearchKey->Alpha);\n\n} // AlphaMatch\n\n/**\n * This routine attempts to find an x value at which Function\n * goes to zero (i.e. a root of the function).  It will only\n * work correctly if a solution actually exists and there\n * are no extrema between the solution and the InitialGuess.\n * The algorithms used are extremely primitive.\n *\n * @param Function  function whose zero is to be found\n * @param FunctionParams  arbitrary data to pass to function\n * @param InitialGuess  point to start solution search at\n * @param Accuracy  maximum allowed error\n * @return Solution of function (x for which f(x) = 0).\n */\nstatic double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)\n#define INITIALDELTA 0.1\n#define DELTARATIO 0.1\n{\n  double x;\n  double f;\n  double Slope;\n  double Delta;\n  double NewDelta;\n  double xDelta;\n  double LastPosX, LastNegX;\n\n  x = InitialGuess;\n  Delta = INITIALDELTA;\n  LastPosX = FLT_MAX;\n  LastNegX = -FLT_MAX;\n  f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);\n  while (Abs(LastPosX - LastNegX) > Accuracy) {\n    // keep track of outer bounds of current estimate\n    if (f < 0) {\n      LastNegX = x;\n    } else {\n      LastPosX = x;\n    }\n\n    // compute the approx. 
slope of f(x) at the current point\n    Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;\n\n    // compute the next solution guess */\n    xDelta = f / Slope;\n    x -= xDelta;\n\n    // reduce the delta used for computing slope to be a fraction of\n    // the amount moved to get to the new guess\n    NewDelta = Abs(xDelta) * DELTARATIO;\n    if (NewDelta < Delta) {\n      Delta = NewDelta;\n    }\n\n    // compute the value of the function at the new guess\n    f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);\n  }\n  return (x);\n\n} // Solve\n\n/**\n * This routine computes the area under a chi density curve\n * from 0 to x, minus the desired area under the curve.  The\n * number of degrees of freedom of the chi curve is specified\n * in the ChiParams structure.  The desired area is also\n * specified in the ChiParams structure as Alpha (or 1 minus\n * the desired area).  This routine is intended to be passed\n * to the Solve() function to find the value of chi-squared\n * which will yield a desired area under the right tail of\n * the chi density curve.  The function will only work for\n * even degrees of freedom.  
The equations are based on\n * integrating the chi density curve in parts to obtain\n * a series that can be used to compute the area under the\n * curve.\n * @param ChiParams contains degrees of freedom and alpha\n * @param x   value of chi-squared to evaluate\n * @return Error between actual and desired area under the chi curve.\n */\nstatic double ChiArea(CHISTRUCT *ChiParams, double x) {\n  int i, N;\n  double SeriesTotal;\n  double Denominator;\n  double PowerOfx;\n\n  N = ChiParams->DegreesOfFreedom / 2 - 1;\n  SeriesTotal = 1;\n  Denominator = 1;\n  PowerOfx = 1;\n  for (i = 1; i <= N; i++) {\n    Denominator *= 2 * i;\n    PowerOfx *= x;\n    SeriesTotal += PowerOfx / Denominator;\n  }\n  return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha);\n\n} // ChiArea\n\n/**\n * This routine looks at all samples in the specified cluster.\n * It computes a running estimate of the percentage of the\n * characters which have more than 1 sample in the cluster.\n * When this percentage exceeds MaxIllegal, true is returned.\n * Otherwise false is returned.  The CharID\n * fields must contain integers which identify the training\n * characters which were used to generate the sample.  One\n * integer is used for each sample.  The NumChar field in\n * the Clusterer must contain the number of characters in the\n * training set.  All CharID fields must be between 0 and\n * NumChar-1.  The main function of this routine is to help\n * identify clusters which need to be split further, i.e. 
if\n * numerous training characters have 2 or more features which are\n * contained in the same cluster, then the cluster should be\n * split.\n *\n * @param Clusterer data structure holding cluster tree\n * @param Cluster   cluster containing samples to be tested\n * @param MaxIllegal  max percentage of samples allowed to have\n *        more than 1 feature in the cluster\n * @return true if the cluster should be split, false otherwise.\n */\nstatic bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)\n#define ILLEGAL_CHAR 2\n{\n  static std::vector<uint8_t> CharFlags;\n  LIST SearchState;\n  SAMPLE *Sample;\n  int32_t CharID;\n  int32_t NumCharInCluster;\n  int32_t NumIllegalInCluster;\n  float PercentIllegal;\n\n  // initial estimate assumes that no illegal chars exist in the cluster\n  NumCharInCluster = Cluster->SampleCount;\n  NumIllegalInCluster = 0;\n\n  if (Clusterer->NumChar > CharFlags.size()) {\n    CharFlags.resize(Clusterer->NumChar);\n  }\n\n  for (auto &CharFlag : CharFlags) {\n    CharFlag = false;\n  }\n\n  // find each sample in the cluster and check if we have seen it before\n  InitSampleSearch(SearchState, Cluster);\n  while ((Sample = NextSample(&SearchState)) != nullptr) {\n    CharID = Sample->CharID;\n    if (CharFlags[CharID] == 0) {\n      CharFlags[CharID] = true;\n    } else {\n      if (CharFlags[CharID] == 1) {\n        NumIllegalInCluster++;\n        CharFlags[CharID] = ILLEGAL_CHAR;\n      }\n      NumCharInCluster--;\n      PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;\n      if (PercentIllegal > MaxIllegal) {\n        destroy(SearchState);\n        return true;\n      }\n    }\n  }\n  return false;\n\n} // MultipleCharSamples\n\n/**\n * Compute the inverse of a matrix using LU decomposition with partial pivoting.\n * The return value is the sum of norms of the off-diagonal terms of the\n * product of a and inv. 
(A measure of the error.)\n */\nstatic double InvertMatrix(const float *input, int size, float *inv) {\n  // Allocate memory for the 2D arrays.\n  GENERIC_2D_ARRAY<double> U(size, size, 0.0);\n  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);\n  GENERIC_2D_ARRAY<double> L(size, size, 0.0);\n\n  // Initialize the working matrices. U starts as input, L as I and U_inv as O.\n  int row;\n  int col;\n  for (row = 0; row < size; row++) {\n    for (col = 0; col < size; col++) {\n      U[row][col] = input[row * size + col];\n      L[row][col] = row == col ? 1.0 : 0.0;\n      U_inv[row][col] = 0.0;\n    }\n  }\n\n  // Compute forward matrix by inversion by LU decomposition of input.\n  for (col = 0; col < size; ++col) {\n    // Find best pivot\n    int best_row = 0;\n    double best_pivot = -1.0;\n    for (row = col; row < size; ++row) {\n      if (Abs(U[row][col]) > best_pivot) {\n        best_pivot = Abs(U[row][col]);\n        best_row = row;\n      }\n    }\n    // Exchange pivot rows.\n    if (best_row != col) {\n      for (int k = 0; k < size; ++k) {\n        double tmp = U[best_row][k];\n        U[best_row][k] = U[col][k];\n        U[col][k] = tmp;\n        tmp = L[best_row][k];\n        L[best_row][k] = L[col][k];\n        L[col][k] = tmp;\n      }\n    }\n    // Now do the pivot itself.\n    for (row = col + 1; row < size; ++row) {\n      double ratio = -U[row][col] / U[col][col];\n      for (int j = col; j < size; ++j) {\n        U[row][j] += U[col][j] * ratio;\n      }\n      for (int k = 0; k < size; ++k) {\n        L[row][k] += L[col][k] * ratio;\n      }\n    }\n  }\n  // Next invert U.\n  for (col = 0; col < size; ++col) {\n    U_inv[col][col] = 1.0 / U[col][col];\n    for (row = col - 1; row >= 0; --row) {\n      double total = 0.0;\n      for (int k = col; k > row; --k) {\n        total += U[row][k] * U_inv[k][col];\n      }\n      U_inv[row][col] = -total / U[row][row];\n    }\n  }\n  // Now the answer is U_inv.L.\n  for (row = 0; row < size; row++) {\n    
for (col = 0; col < size; col++) {\n      double sum = 0.0;\n      for (int k = row; k < size; ++k) {\n        sum += U_inv[row][k] * L[k][col];\n      }\n      inv[row * size + col] = sum;\n    }\n  }\n  // Check matrix product.\n  double error_sum = 0.0;\n  for (row = 0; row < size; row++) {\n    for (col = 0; col < size; col++) {\n      double sum = 0.0;\n      for (int k = 0; k < size; ++k) {\n        sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];\n      }\n      if (row != col) {\n        error_sum += Abs(sum);\n      }\n    }\n  }\n  return error_sum;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/cluster.h",
    "content": "/******************************************************************************\n ** Filename:   cluster.h\n ** Purpose:    Definition of feature space clustering routines\n ** Author:     Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#ifndef CLUSTER_H\n#define CLUSTER_H\n\n#include \"kdtree.h\"\n#include \"oldlist.h\"\n\nnamespace tesseract {\n\nstruct BUCKETS;\n\n#define MINBUCKETS 5\n#define MAXBUCKETS 39\n\n/*----------------------------------------------------------------------\n          Types\n----------------------------------------------------------------------*/\nstruct CLUSTER {\n  CLUSTER(size_t n) : Mean(n) {\n  }\n\n  ~CLUSTER() {\n    delete Left;\n    delete Right;\n  }\n\n  bool Clustered : 1;        // true if included in a higher cluster\n  bool Prototype : 1;        // true if cluster represented by a proto\n  unsigned SampleCount : 30; // number of samples in this cluster\n  CLUSTER *Left;       // ptr to left sub-cluster\n  CLUSTER *Right;      // ptr to right sub-cluster\n  int32_t CharID;            // identifier of char sample came from\n  std::vector<float> Mean;   // mean of cluster - SampleSize floats\n};\nusing SAMPLE = CLUSTER; // can refer to as either sample or cluster\n\ntypedef enum { spherical, elliptical, mixed, automatic } PROTOSTYLE;\n\nstruct 
CLUSTERCONFIG {   // parameters to control clustering\n  PROTOSTYLE ProtoStyle; // specifies types of protos to be made\n  float MinSamples;      // min # of samples per proto - % of total\n  float MaxIllegal;      // max percentage of samples in a cluster which\n                         // have more than 1 feature in that cluster\n  float Independence;    // desired independence between dimensions\n  double Confidence;     // desired confidence in prototypes created\n  int MagicSamples;      // Ideal number of samples in a cluster.\n};\n\ntypedef enum { normal, uniform, D_random, DISTRIBUTION_COUNT } DISTRIBUTION;\n\nunion FLOATUNION {\n  float Spherical;\n  float *Elliptical;\n};\n\nstruct PROTOTYPE {\n  bool Significant : 1;     // true if prototype is significant\n  bool Merged : 1;          // Merged after clustering so do not output\n                            // but kept for display purposes. If it has no\n                            // samples then it was actually merged.\n                            // Otherwise it matched an already significant\n                            // cluster.\n  unsigned Style : 2;       // spherical, elliptical, or mixed\n  unsigned NumSamples : 28; // number of samples in the cluster\n  CLUSTER *Cluster;         // ptr to cluster which made prototype\n  std::vector<DISTRIBUTION> Distrib; // different distribution for each dimension\n  std::vector<float> Mean;  // prototype mean\n  float TotalMagnitude;     // total magnitude over all dimensions\n  float LogMagnitude;       // log base e of TotalMagnitude\n  FLOATUNION Variance;      // prototype variance\n  FLOATUNION Magnitude;     // magnitude of density function\n  FLOATUNION Weight;        // weight of density function\n};\n\nstruct CLUSTERER {\n  int16_t SampleSize;      // number of parameters per sample\n  PARAM_DESC *ParamDesc;   // description of each parameter\n  int32_t NumberOfSamples; // total number of samples being clustered\n  KDTREE *KDTree;          // for 
optimal nearest neighbor searching\n  CLUSTER *Root;           // ptr to root cluster of cluster tree\n  LIST ProtoList;          // list of prototypes\n  uint32_t NumChar;        // # of characters represented by samples\n  // cache of reusable histograms by distribution type and number of buckets.\n  BUCKETS *bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];\n};\n\nstruct SAMPLELIST {\n  int32_t NumSamples;    // number of samples in list\n  int32_t MaxNumSamples; // maximum size of list\n  SAMPLE *Sample[1];     // array of ptrs to sample data structures\n};\n\n// low level cluster tree analysis routines.\n#define InitSampleSearch(S, C) (((C) == nullptr) ? (S = NIL_LIST) : (S = push(NIL_LIST, (C))))\n\n/*--------------------------------------------------------------------------\n        Public Function Prototypes\n--------------------------------------------------------------------------*/\nTESS_API\nCLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]);\n\nTESS_API\nSAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID);\n\nTESS_API\nLIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);\n\nTESS_API\nvoid FreeClusterer(CLUSTERER *Clusterer);\n\nTESS_API\nvoid FreeProtoList(LIST *ProtoList);\n\nvoid FreePrototype(void *arg); // PROTOTYPE *Prototype);\n\nCLUSTER *NextSample(LIST *SearchState);\n\nfloat Mean(PROTOTYPE *Proto, uint16_t Dimension);\n\nfloat StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension);\n\nTESS_API\nint32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],\n                      float m1[], float m2[]);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/clusttool.cpp",
    "content": "/******************************************************************************\n ** Filename: clusttool.cpp\n ** Purpose:  Misc. tools for use with the clustering routines\n ** Author:   Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#define _USE_MATH_DEFINES // for M_PI\n\n#include \"clusttool.h\"\n\n#include <cmath>   // for M_PI, std::isnan\n#include <locale>  // for std::locale::classic\n#include <sstream> // for std::stringstream\n\nnamespace tesseract {\n\n//---------------Global Data Definitions and Declarations--------------------\n#define TOKENSIZE 80 ///< max size of tokens read from an input file\n#define QUOTED_TOKENSIZE \"79\"\n#define MAXSAMPLESIZE 65535 ///< max num of dimensions in feature space\n\n/**\n * This routine reads N floats from the specified text file\n * and places them into Buffer.  If Buffer is nullptr, a buffer\n * is created and passed back to the caller.  
If EOF is\n * encountered before any floats can be read, nullptr is\n * returned.\n * @param fp open text file to read floats from\n * @param N number of floats to read\n * @param Buffer pointer to buffer to place floats into\n * @return Pointer to buffer holding floats or nullptr if EOF\n * @note Globals: None\n */\nstatic bool ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) {\n  const int kMaxLineSize = 1024;\n  char line[kMaxLineSize];\n  if (fp->FGets(line, kMaxLineSize) == nullptr) {\n    tprintf(\"Hit EOF in ReadNFloats!\\n\");\n    return false;\n  }\n\n  std::stringstream stream(line);\n  // Use \"C\" locale (needed for float values Buffer[i]).\n  stream.imbue(std::locale::classic());\n  for (uint16_t i = 0; i < N; i++) {\n    float f = NAN;\n    stream >> f;\n    if (std::isnan(f)) {\n      tprintf(\"Read of %u floats failed!\\n\", N);\n      return false;\n    }\n    Buffer[i] = f;\n  }\n  return true;\n}\n\n/**\n * This routine writes a text representation of N floats from\n * an array to a file.  All of the floats are placed on one line.\n * @param File open text file to write N floats to\n * @param N number of floats to write\n * @param Array array of floats to write\n */\nstatic void WriteNFloats(FILE *File, uint16_t N, float Array[]) {\n  for (int i = 0; i < N; i++) {\n    fprintf(File, \" %9.6f\", Array[i]);\n  }\n  fprintf(File, \"\\n\");\n}\n\n/**\n * This routine writes to the specified text file a word\n * which represents the ProtoStyle.  
It does not append\n * a carriage return to the end.\n * @param File open text file to write prototype style to\n * @param ProtoStyle prototype style to write\n */\nstatic void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {\n  switch (ProtoStyle) {\n    case spherical:\n      fprintf(File, \"spherical\");\n      break;\n    case elliptical:\n      fprintf(File, \"elliptical\");\n      break;\n    case mixed:\n      fprintf(File, \"mixed\");\n      break;\n    case automatic:\n      fprintf(File, \"automatic\");\n      break;\n  }\n}\n\n/**\n * This routine reads a single integer from the specified\n * file and checks to ensure that it is between 0 and\n * MAXSAMPLESIZE.\n * @param fp open text file to read sample size from\n * @return Sample size\n * @note Globals: None\n */\nuint16_t ReadSampleSize(TFile *fp) {\n  int SampleSize = 0;\n\n  const int kMaxLineSize = 100;\n  char line[kMaxLineSize];\n  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);\n  ASSERT_HOST(sscanf(line, \"%d\", &SampleSize) == 1);\n  ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE);\n  return SampleSize;\n}\n\n/**\n * This routine reads textual descriptions of sets of parameters\n * which describe the characteristics of feature dimensions.\n *\n * @param fp open text file to read N parameter descriptions from\n * @param N number of parameter descriptions to read\n * @return Pointer to an array of parameter descriptors.\n * @note Globals: None\n */\nPARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) {\n  auto ParamDesc = new PARAM_DESC[N];\n  for (int i = 0; i < N; i++) {\n    const int kMaxLineSize = TOKENSIZE * 4;\n    char line[kMaxLineSize];\n    ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);\n    std::istringstream stream(line);\n    // Use \"C\" locale (needed for float values Min, Max).\n    stream.imbue(std::locale::classic());\n    std::string linear_token;\n    stream >> linear_token;\n    std::string essential_token;\n    stream >> essential_token;\n    
stream >> ParamDesc[i].Min;\n    stream >> ParamDesc[i].Max;\n    ASSERT_HOST(!stream.fail());\n    ParamDesc[i].Circular = (linear_token[0] == 'c');\n    ParamDesc[i].NonEssential = (essential_token[0] != 'e');\n    ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;\n    ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;\n    ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;\n  }\n  return (ParamDesc);\n}\n\n/**\n * This routine reads a textual description of a prototype from\n * the specified file.\n *\n * @param fp open text file to read prototype from\n * @param N number of dimensions used in prototype\n * @return List of prototypes\n * @note Globals: None\n */\nPROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) {\n  char sig_token[TOKENSIZE], shape_token[TOKENSIZE];\n  int SampleCount;\n  int i;\n\n  const int kMaxLineSize = TOKENSIZE * 4;\n  char line[kMaxLineSize];\n  if (fp->FGets(line, kMaxLineSize) == nullptr ||\n      sscanf(line, \"%\" QUOTED_TOKENSIZE \"s %\" QUOTED_TOKENSIZE \"s %d\", sig_token, shape_token,\n             &SampleCount) != 3) {\n    tprintf(\"Invalid prototype: %s\\n\", line);\n    return nullptr;\n  }\n  auto Proto = new PROTOTYPE;\n  Proto->Cluster = nullptr;\n  Proto->Significant = (sig_token[0] == 's');\n\n  switch (shape_token[0]) {\n    case 's':\n      Proto->Style = spherical;\n      break;\n    case 'e':\n      Proto->Style = elliptical;\n      break;\n    case 'a':\n      Proto->Style = automatic;\n      break;\n    default:\n      tprintf(\"Invalid prototype style specification:%s\\n\", shape_token);\n      Proto->Style = elliptical;\n  }\n\n  ASSERT_HOST(SampleCount >= 0);\n  Proto->NumSamples = SampleCount;\n\n  Proto->Mean.resize(N);\n  ReadNFloats(fp, N, &Proto->Mean[0]);\n\n  switch (Proto->Style) {\n    case spherical:\n      ReadNFloats(fp, 1, &(Proto->Variance.Spherical));\n      Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);\n      Proto->TotalMagnitude = 
std::pow(Proto->Magnitude.Spherical, static_cast<float>(N));\n      Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));\n      Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;\n      Proto->Distrib.clear();\n      break;\n    case elliptical:\n      Proto->Variance.Elliptical = new float[N];\n      ReadNFloats(fp, N, Proto->Variance.Elliptical);\n      Proto->Magnitude.Elliptical = new float[N];\n      Proto->Weight.Elliptical = new float[N];\n      Proto->TotalMagnitude = 1.0;\n      for (i = 0; i < N; i++) {\n        Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);\n        Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];\n        Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];\n      }\n      Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));\n      Proto->Distrib.clear();\n      break;\n    default:\n      delete Proto;\n      tprintf(\"Invalid prototype style\\n\");\n      return nullptr;\n  }\n  return Proto;\n}\n\n/**\n * This routine writes an array of dimension descriptors to\n * the specified text file.\n * @param File open text file to write param descriptors to\n * @param N number of param descriptors to write\n * @param ParamDesc array of param descriptors to write\n */\nvoid WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) {\n  int i;\n\n  for (i = 0; i < N; i++) {\n    if (ParamDesc[i].Circular) {\n      fprintf(File, \"circular \");\n    } else {\n      fprintf(File, \"linear   \");\n    }\n\n    if (ParamDesc[i].NonEssential) {\n      fprintf(File, \"non-essential \");\n    } else {\n      fprintf(File, \"essential     \");\n    }\n\n    fprintf(File, \"%10.6f %10.6f\\n\", ParamDesc[i].Min, ParamDesc[i].Max);\n  }\n}\n\n/**\n * This routine writes a textual description of a prototype\n * to the specified text file.\n * @param File open text file to write prototype to\n * @param N number of dimensions in feature 
space\n * @param Proto prototype to write out\n */\nvoid WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) {\n  int i;\n\n  if (Proto->Significant) {\n    fprintf(File, \"significant   \");\n  } else {\n    fprintf(File, \"insignificant \");\n  }\n  WriteProtoStyle(File, static_cast<PROTOSTYLE>(Proto->Style));\n  fprintf(File, \"%6u\\n\\t\", Proto->NumSamples);\n  WriteNFloats(File, N, &Proto->Mean[0]);\n  fprintf(File, \"\\t\");\n\n  switch (Proto->Style) {\n    case spherical:\n      WriteNFloats(File, 1, &(Proto->Variance.Spherical));\n      break;\n    case elliptical:\n      WriteNFloats(File, N, Proto->Variance.Elliptical);\n      break;\n    case mixed:\n      for (i = 0; i < N; i++) {\n        switch (Proto->Distrib[i]) {\n          case normal:\n            fprintf(File, \" %9s\", \"normal\");\n            break;\n          case uniform:\n            fprintf(File, \" %9s\", \"uniform\");\n            break;\n          case D_random:\n            fprintf(File, \" %9s\", \"random\");\n            break;\n          case DISTRIBUTION_COUNT:\n            ASSERT_HOST(!\"Distribution count not allowed!\");\n        }\n      }\n      fprintf(File, \"\\n\\t\");\n      WriteNFloats(File, N, Proto->Variance.Elliptical);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/clusttool.h",
    "content": "/******************************************************************************\n ** Filename: clusttool.h\n ** Purpose:  Definition of clustering utility tools\n ** Author:   Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef TESSERACT_CLASSIFY_CLUSTTOOL_H_\n#define TESSERACT_CLASSIFY_CLUSTTOOL_H_\n\n#include \"cluster.h\"\n\n#include \"serialis.h\"\n\n#include <cstdio>\n\nnamespace tesseract {\n\nuint16_t ReadSampleSize(tesseract::TFile *fp);\n\nPARAM_DESC *ReadParamDesc(tesseract::TFile *fp, uint16_t N);\n\nPROTOTYPE *ReadPrototype(tesseract::TFile *fp, uint16_t N);\n\nTESS_API\nvoid WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]);\n\nTESS_API\nvoid WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto);\n\n} // namespace tesseract\n\n#endif // TESSERACT_CLASSIFY_CLUSTTOOL_H_\n"
  },
  {
    "path": "src/classify/cutoffs.cpp",
    "content": "/******************************************************************************\n ** Filename:    cutoffs.c\n ** Purpose:     Routines to manipulate an array of class cutoffs.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n/*----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------*/\n\n#include <cstdio>\n#include <sstream> // for std::istringstream\n#include <string>  // for std::string\n\n#include <tesseract/unichar.h>\n#include \"classify.h\"\n#include \"helpers.h\"\n#include \"serialis.h\"\n\n#define MAX_CUTOFF 1000\n\nnamespace tesseract {\n/**\n * Open file, read in all of the class-id/cutoff pairs\n * and insert them into the Cutoffs array.  Cutoffs are\n * indexed in the array by class id.  
Unused entries in the\n * array are set to an arbitrarily high cutoff value.\n * @param fp file containing cutoff definitions\n * @param Cutoffs array to put cutoffs into\n */\nvoid Classify::ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs) {\n  int Cutoff;\n\n  if (shape_table_ != nullptr) {\n    if (!fp->DeSerialize(shapetable_cutoffs_)) {\n      tprintf(\"Error during read of shapetable pffmtable!\\n\");\n    }\n  }\n  for (int i = 0; i < MAX_NUM_CLASSES; i++) {\n    Cutoffs[i] = MAX_CUTOFF;\n  }\n\n  const int kMaxLineSize = 100;\n  char line[kMaxLineSize];\n  while (fp->FGets(line, kMaxLineSize) != nullptr) {\n    std::string Class;\n    CLASS_ID ClassId;\n    std::istringstream stream(line);\n    stream.imbue(std::locale::classic());\n    stream >> Class >> Cutoff;\n    if (stream.fail()) {\n      break;\n    }\n    if (Class.compare(\"NULL\") == 0) {\n      ClassId = unicharset.unichar_to_id(\" \");\n    } else {\n      ClassId = unicharset.unichar_to_id(Class.c_str());\n    }\n    ASSERT_HOST(ClassId >= 0 && ClassId < MAX_NUM_CLASSES);\n    Cutoffs[ClassId] = Cutoff;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/featdefs.cpp",
    "content": "/******************************************************************************\n ** Filename:    featdefs.cpp\n ** Purpose:     Definitions of currently defined feature types.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"featdefs.h\"\n\n#include \"picofeat.h\" // for PicoFeatureLength\n#include \"scanutils.h\"\n\n#include <cstdio>\n#include <cstring>\n\nnamespace tesseract {\n\n#define PICO_FEATURE_LENGTH 0.05\n\n/*-----------------------------------------------------------------------------\n        Global Data Definitions and Declarations\n-----------------------------------------------------------------------------*/\nconst char *const kMicroFeatureType = \"mf\";\nconst char *const kCNFeatureType = \"cn\";\nconst char *const kIntFeatureType = \"if\";\nconst char *const kGeoFeatureType = \"tb\";\n\n// Define all of the parameters for the MicroFeature type.\nStartParamDesc(MicroFeatureParams) DefineParam(0, 0, -0.5, 0.5) DefineParam(0, 0, -0.25, 0.75)\n    DefineParam(0, 1, 0.0, 1.0) DefineParam(1, 0, 0.0, 1.0) DefineParam(0, 1, -0.5, 0.5)\n        DefineParam(0, 1, -0.5, 0.5) EndParamDesc\n    // Now define the feature type itself (see features.h for parameters).\n    DefineFeature(MicroFeatureDesc, 5, 1, kMicroFeatureType, MicroFeatureParams)\n\n    // 
Define all of the parameters for the NormFeat type.\n    StartParamDesc(CharNormParams) DefineParam(0, 0, -0.25, 0.75) DefineParam(0, 1, 0.0, 1.0)\n        DefineParam(0, 0, 0.0, 1.0) DefineParam(0, 0, 0.0, 1.0) EndParamDesc\n    // Now define the feature type itself (see features.h for parameters).\n    DefineFeature(CharNormDesc, 4, 0, kCNFeatureType, CharNormParams)\n\n    // Define all of the parameters for the IntFeature type\n    StartParamDesc(IntFeatParams) DefineParam(0, 0, 0.0, 255.0) DefineParam(0, 0, 0.0, 255.0)\n        DefineParam(1, 0, 0.0, 255.0) EndParamDesc\n    // Now define the feature type itself (see features.h for parameters).\n    DefineFeature(IntFeatDesc, 2, 1, kIntFeatureType, IntFeatParams)\n\n    // Define all of the parameters for the GeoFeature type\n    StartParamDesc(GeoFeatParams) DefineParam(0, 0, 0.0, 255.0) DefineParam(0, 0, 0.0, 255.0)\n        DefineParam(0, 0, 0.0, 255.0) EndParamDesc\n    // Now define the feature type itself (see features.h for parameters).\n    DefineFeature(GeoFeatDesc, 3, 0, kGeoFeatureType, GeoFeatParams)\n\n    // Other features used for training the adaptive classifier, but not used\n    // during normal training, therefore not in the DescDefs array.\n\n    // Define all of the parameters for the PicoFeature type\n    // define knob that can be used to adjust pico-feature length.\n    float PicoFeatureLength = PICO_FEATURE_LENGTH;\nStartParamDesc(PicoFeatParams) DefineParam(0, 0, -0.25, 0.75) DefineParam(1, 0, 0.0, 1.0)\n    DefineParam(0, 0, -0.5, 0.5) EndParamDesc\n    // Now define the feature type itself (see features.h for parameters).\n    DefineFeature(PicoFeatDesc, 2, 1, \"pf\", PicoFeatParams)\n\n    // Define all of the parameters for the OutlineFeature type.\n    StartParamDesc(OutlineFeatParams) DefineParam(0, 0, -0.5, 0.5) DefineParam(0, 0, -0.25, 0.75)\n        DefineParam(0, 0, 0.0, 1.0) DefineParam(1, 0, 0.0, 1.0) EndParamDesc\n    // Now define the feature type itself (see features.h 
for parameters).\n    DefineFeature(OutlineFeatDesc, 3, 1, \"of\", OutlineFeatParams)\n\n    // MUST be kept in-sync with ExtractorDefs in fxdefs.cpp.\n    static const FEATURE_DESC_STRUCT *DescDefs[NUM_FEATURE_TYPES] = {\n        &MicroFeatureDesc, &CharNormDesc, &IntFeatDesc, &GeoFeatDesc};\n\n/*-----------------------------------------------------------------------------\n              Public Code\n-----------------------------------------------------------------------------*/\nvoid InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs) {\n  featuredefs->NumFeatureTypes = NUM_FEATURE_TYPES;\n  for (int i = 0; i < NUM_FEATURE_TYPES; ++i) {\n    featuredefs->FeatureDesc[i] = DescDefs[i];\n  }\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * Appends a textual representation of CharDesc to str.\n * The format used is to write out the number of feature\n * sets which will be written followed by a representation of\n * each feature set.\n *\n * Each set starts with the short name for that feature followed\n * by a description of the feature set.  
Feature sets which are\n * not present are not written.\n *\n * @param FeatureDefs    definitions of feature types/extractors\n * @param str            string to append CharDesc to\n * @param CharDesc       character description to write to File\n */\nvoid WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC_STRUCT *CharDesc, std::string &str) {\n  int NumSetsToWrite = 0;\n\n  for (size_t Type = 0; Type < CharDesc->NumFeatureSets; Type++) {\n    if (CharDesc->FeatureSets[Type]) {\n      NumSetsToWrite++;\n    }\n  }\n\n  str += \" \" + std::to_string(NumSetsToWrite);\n  str += \"\\n\";\n  for (size_t Type = 0; Type < CharDesc->NumFeatureSets; Type++) {\n    if (CharDesc->FeatureSets[Type]) {\n      str += FeatureDefs.FeatureDesc[Type]->ShortName;\n      str += \" \";\n      WriteFeatureSet(CharDesc->FeatureSets[Type], str);\n    }\n  }\n} /* WriteCharDescription */\n\n// Return whether all of the fields of the given feature set\n// are well defined (not inf or nan).\nbool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC_STRUCT *CharDesc) {\n  bool anything_written = false;\n  bool well_formed = true;\n  for (size_t Type = 0; Type < CharDesc->NumFeatureSets; Type++) {\n    if (CharDesc->FeatureSets[Type]) {\n      for (int i = 0; i < CharDesc->FeatureSets[Type]->NumFeatures; i++) {\n        FEATURE feat = CharDesc->FeatureSets[Type]->Features[i];\n        for (int p = 0; p < feat->Type->NumParams; p++) {\n          if (std::isnan(feat->Params[p]) || std::isinf(feat->Params[p])) {\n            well_formed = false;\n          } else {\n            anything_written = true;\n          }\n        }\n      }\n    } else {\n      return false;\n    }\n  }\n  return anything_written && well_formed;\n} /* ValidCharDescription */\n\n/*---------------------------------------------------------------------------*/\n/**\n * Read a character description from File, and return\n * a data structure containing this information.  
The data\n * is formatted as follows:\n * @verbatim\n     NumberOfSets\n             ShortNameForSet1 Set1\n             ShortNameForSet2 Set2\n             ...\n   @endverbatim\n *\n * Globals:\n * - none\n *\n * @param FeatureDefs    definitions of feature types/extractors\n * @param File open text file to read character description from\n * @return Character description read from File.\n */\nCHAR_DESC_STRUCT *ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File) {\n  int NumSetsToRead;\n  char ShortName[FEAT_NAME_SIZE];\n  int Type;\n\n  ASSERT_HOST(tfscanf(File, \"%d\", &NumSetsToRead) == 1);\n  ASSERT_HOST(NumSetsToRead >= 0);\n  ASSERT_HOST(NumSetsToRead <= FeatureDefs.NumFeatureTypes);\n\n  auto CharDesc = new CHAR_DESC_STRUCT(FeatureDefs);\n  for (; NumSetsToRead > 0; NumSetsToRead--) {\n    tfscanf(File, \"%s\", ShortName);\n    Type = ShortNameToFeatureType(FeatureDefs, ShortName);\n    CharDesc->FeatureSets[Type] = ReadFeatureSet(File, FeatureDefs.FeatureDesc[Type]);\n  }\n  return CharDesc;\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * Search through all features currently defined and return\n * the feature type for the feature with the specified short\n * name.  Trap an error if the specified name is not found.\n *\n * Globals:\n * - none\n *\n * @param FeatureDefs    definitions of feature types/extractors\n * @param ShortName short name of a feature type\n * @return Feature type which corresponds to ShortName.\n */\nuint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName) {\n  for (int i = 0; i < FeatureDefs.NumFeatureTypes; i++) {\n    if (!strcmp((FeatureDefs.FeatureDesc[i]->ShortName), ShortName)) {\n      return static_cast<uint32_t>(i);\n    }\n  }\n  ASSERT_HOST(!\"Illegal short name for a feature\");\n  return 0;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/featdefs.h",
    "content": "/******************************************************************************\n ** Filename:    featdefs.h\n ** Purpose:     Definitions of currently defined feature types.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef FEATDEFS_H\n#define FEATDEFS_H\n\n#include \"ocrfeatures.h\"\n\n#include <array>  // for std::array\n#include <string> // for std::string\n\nnamespace tesseract {\n\n/* Enumerate the different types of features currently defined. */\n#define NUM_FEATURE_TYPES 4\nextern TESS_API const char *const kMicroFeatureType;\nextern TESS_API const char *const kCNFeatureType;\nextern TESS_API const char *const kIntFeatureType;\nextern TESS_API const char *const kGeoFeatureType;\n\n/* A character is described by multiple sets of extracted features.  Each\n  set contains a number of features of a particular type, for example, a\n  set of bays, or a set of closures, or a set of microfeatures.  Each\n  feature consists of a number of parameters.  
All features within a\n  feature set contain the same number of parameters.*/\n\nstruct FEATURE_DEFS_STRUCT {\n  int32_t NumFeatureTypes;\n  const FEATURE_DESC_STRUCT *FeatureDesc[NUM_FEATURE_TYPES];\n};\nusing FEATURE_DEFS = FEATURE_DEFS_STRUCT *;\n\nstruct CHAR_DESC_STRUCT {\n  /// Allocate a new character description, initialize its\n  /// feature sets to be empty, and return it.\n  CHAR_DESC_STRUCT(const FEATURE_DEFS_STRUCT &FeatureDefs) {\n    NumFeatureSets = FeatureDefs.NumFeatureTypes;\n  }\n\n  /// Release the memory consumed by the specified character\n  /// description and all of the features in that description.\n  ~CHAR_DESC_STRUCT() {\n    for (size_t i = 0; i < NumFeatureSets; i++) {\n      delete FeatureSets[i];\n    }\n  }\n\n  uint32_t NumFeatureSets;\n  std::array<FEATURE_SET_STRUCT *, NUM_FEATURE_TYPES> FeatureSets;\n};\n\n/*----------------------------------------------------------------------\n    Generic functions for manipulating character descriptions\n----------------------------------------------------------------------*/\nTESS_API\nvoid InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs);\n\nbool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC_STRUCT *CharDesc);\n\nvoid WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC_STRUCT *CharDesc, std::string &str);\n\nTESS_API\nCHAR_DESC_STRUCT *ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File);\n\nTESS_API\nuint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName);\n\n/**----------------------------------------------------------------------------\n        Global Data Definitions and Declarations\n----------------------------------------------------------------------------**/\nextern const FEATURE_DESC_STRUCT MicroFeatureDesc;\nextern TESS_API const FEATURE_DESC_STRUCT PicoFeatDesc;\nextern const FEATURE_DESC_STRUCT CharNormDesc;\nextern const FEATURE_DESC_STRUCT OutlineFeatDesc;\nextern const 
FEATURE_DESC_STRUCT IntFeatDesc;\nextern const FEATURE_DESC_STRUCT GeoFeatDesc;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/float2int.cpp",
    "content": "/******************************************************************************\n ** Filename:    float2int.cpp\n ** Purpose:     Routines for converting float features to int features\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"float2int.h\"\n\n#include \"classify.h\"\n#include \"mfoutline.h\"\n#include \"normmatch.h\"\n#include \"picofeat.h\"\n\n#include \"helpers.h\"\n\n#define MAX_INT_CHAR_NORM (INT_CHAR_NORM_RANGE - 1)\n\n/*---------------------------------------------------------------------------*/\nnamespace tesseract {\n\n/**\n * For each class in the unicharset, clears the corresponding\n * entry in char_norm_array.  char_norm_array is indexed by unichar_id.\n *\n * Globals:\n * - none\n *\n * @param char_norm_array array to be cleared\n */\nvoid Classify::ClearCharNormArray(uint8_t *char_norm_array) {\n  memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());\n} /* ClearCharNormArray */\n\n/*---------------------------------------------------------------------------*/\n/**\n * For each class in unicharset, computes the match between\n * norm_feature and the normalization protos for that class.\n * Converts this number to the range from 0 - 255 and stores it\n * into char_norm_array.  
CharNormArray is indexed by unichar_id.\n *\n * Globals:\n * - PreTrainedTemplates current set of built-in templates\n *\n * @param norm_feature character normalization feature\n * @param[out] char_norm_array place to put results of size unicharset.size()\n */\nvoid Classify::ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature,\n                                       uint8_t *char_norm_array) {\n  for (unsigned i = 0; i < unicharset.size(); i++) {\n    if (i < PreTrainedTemplates->NumClasses) {\n      int norm_adjust =\n          static_cast<int>(INT_CHAR_NORM_RANGE * ComputeNormMatch(i, norm_feature, false));\n      char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);\n    } else {\n      // Classes with no templates (eg. ambigs & ligatures) default\n      // to worst match.\n      char_norm_array[i] = MAX_INT_CHAR_NORM;\n    }\n  }\n} /* ComputeIntCharNormArray */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine converts each floating point pico-feature\n * in Features into integer format and saves it into\n * IntFeatures.\n *\n * Globals:\n * - none\n *\n * @param Features floating point pico-features to be converted\n * @param[out] IntFeatures array to put converted features into\n */\nvoid Classify::ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures) {\n  float YShift;\n\n  if (classify_norm_method == baseline) {\n    YShift = BASELINE_Y_SHIFT;\n  } else {\n    YShift = Y_SHIFT;\n  }\n\n  for (int Fid = 0; Fid < Features->NumFeatures; Fid++) {\n    FEATURE Feature = Features->Features[Fid];\n\n    IntFeatures[Fid].X = Bucket8For(Feature->Params[PicoFeatX], X_SHIFT, INT_FEAT_RANGE);\n    IntFeatures[Fid].Y = Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);\n    IntFeatures[Fid].Theta =\n        CircBucketFor(Feature->Params[PicoFeatDir], ANGLE_SHIFT, INT_FEAT_RANGE);\n    IntFeatures[Fid].CP_misses = 0;\n  }\n} /* ComputeIntFeatures */\n\n} // 
namespace tesseract\n"
  },
  {
    "path": "src/classify/float2int.h",
    "content": "/******************************************************************************\n ** Filename:    float2int.h\n ** Purpose:     Routines for converting float features to int features\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef FLOAT2INT_H\n#define FLOAT2INT_H\n\n/*-----------------------------------------------------------------------------\n          Include Files and Type Defines\n-----------------------------------------------------------------------------*/\n#include \"intmatcher.h\"\n#include \"ocrfeatures.h\"\n\n#define INT_FEAT_RANGE 256\n#define BASELINE_Y_SHIFT (0.25)\n\n#endif\n"
  },
  {
    "path": "src/classify/fpoint.cpp",
    "content": "/******************************************************************************\n ** Filename:    fpoint.cpp\n ** Purpose:     Abstract data type for a 2D point (floating point coords)\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n/*----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------*/\n#define _USE_MATH_DEFINES // for M_PI\n#include \"fpoint.h\"\n#include <cmath> // for M_PI\n#include <cstdio>\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n\nfloat DistanceBetween(FPOINT A, FPOINT B) {\n  const double xd = XDelta(A, B);\n  const double yd = YDelta(A, B);\n  return sqrt(static_cast<double>(xd * xd + yd * yd));\n}\n\n/**\n * Return the angle from Point1 to Point2 normalized to\n * lie in the range 0 to FullScale (where FullScale corresponds\n * to 2*pi or 360 degrees).\n * @param Point1 points to compute angle between\n * @param Point2 points to compute angle between\n * @param FullScale value to associate with 2*pi\n * @return angle\n */\nfloat NormalizedAngleFrom(FPOINT *Point1, FPOINT 
*Point2, float FullScale) {\n  float NumRadsInCircle = 2.0 * M_PI;\n\n  float Angle = AngleFrom(*Point1, *Point2);\n  if (Angle < 0.0) {\n    Angle += NumRadsInCircle;\n  }\n  Angle *= FullScale / NumRadsInCircle;\n  if (Angle < 0.0 || Angle >= FullScale) {\n    Angle = 0.0;\n  }\n  return (Angle);\n}\n"
  },
  {
    "path": "src/classify/fpoint.h",
    "content": "/******************************************************************************\n ** Filename:    fpoint.h\n ** Purpose:     Abstract data type for 2D points (floating point coords)\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef FPOINT_H\n#define FPOINT_H\n\n/**----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------**/\n#include <cmath>\n#include <cstdio>\n\n/* define data structure to hold 2D points or vectors using floating point */\nstruct FPOINT {\n  float x, y;\n};\nusing FVECTOR = FPOINT;\n\n/**----------------------------------------------------------------------------\n            Macros\n----------------------------------------------------------------------------**/\n/* macros for computing miscellaneous functions of 2 points */\n#define XDelta(A, B) ((B).x - (A).x)\n#define YDelta(A, B) ((B).y - (A).y)\n#define SlopeFrom(A, B) (YDelta(A, B) / XDelta(A, B))\n#define AngleFrom(A, B) (atan2((double)YDelta(A, B), (double)XDelta(A, B)))\n\n#define XIntersectionOf(A, B, X) (SlopeFrom(A, B) * ((X)-A.x) + 
A.y)\n\n/*-------------------------------------------------------------------------\n        Public Function Prototypes\n---------------------------------------------------------------------------*/\n\nfloat DistanceBetween(FPOINT A, FPOINT B);\n\nfloat NormalizedAngleFrom(FPOINT *Point1, FPOINT *Point2, float FullScale);\n\n#endif\n"
  },
  {
    "path": "src/classify/intfeaturespace.cpp",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        intfeaturespace.cpp\n// Description: Indexed feature space based on INT_FEATURE_STRUCT.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#define _USE_MATH_DEFINES // for M_PI\n#include \"intfeaturespace.h\"\n#include <cmath> // for M_PI\n#include \"intfx.h\"\n\nnamespace tesseract {\n\nIntFeatureSpace::IntFeatureSpace() : x_buckets_(0), y_buckets_(0), theta_buckets_(0) {}\n\nvoid IntFeatureSpace::Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets) {\n  x_buckets_ = xbuckets;\n  y_buckets_ = ybuckets;\n  theta_buckets_ = thetabuckets;\n}\n\n// Serializes the feature space definition to the given file.\n// Returns false on error.\nbool IntFeatureSpace::Serialize(FILE *fp) const {\n  if (fwrite(&x_buckets_, sizeof(x_buckets_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&y_buckets_, sizeof(y_buckets_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&theta_buckets_, sizeof(theta_buckets_), 1, fp) != 1) {\n    return false;\n  }\n  return true;\n}\n\n// Returns an INT_FEATURE_STRUCT corresponding to the given index.\n// This is the inverse of the Index member.\nINT_FEATURE_STRUCT IntFeatureSpace::PositionFromIndex(int index) const {\n  return PositionFromBuckets(index / 
(y_buckets_ * theta_buckets_),\n                             index / theta_buckets_ % y_buckets_, index % theta_buckets_);\n}\n\n// Bulk calls to Index. Maps the given array of features to a vector of\n// int32_t indices in the same order as the input.\nvoid IntFeatureSpace::IndexFeatures(const INT_FEATURE_STRUCT *features, int num_features,\n                                    std::vector<int> *mapped_features) const {\n  mapped_features->clear();\n  for (int f = 0; f < num_features; ++f) {\n    mapped_features->push_back(Index(features[f]));\n  }\n}\n\n// Bulk calls to Index. Maps the given array of features to a vector of\n// sorted int32_t indices.\nvoid IntFeatureSpace::IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features,\n                                           std::vector<int> *sorted_features) const {\n  sorted_features->clear();\n  for (int f = 0; f < num_features; ++f) {\n    sorted_features->push_back(Index(features[f]));\n  }\n  std::sort(sorted_features->begin(), sorted_features->end());\n}\n\n// Returns a feature space index for the given x,y position in a display\n// window, or -1 if the feature is a miss.\nint IntFeatureSpace::XYToFeatureIndex(int x, int y) const {\n  // Round the x,y position to a feature. 
Search for a valid theta.\n  INT_FEATURE_STRUCT feature(x, y, 0);\n  int index = -1;\n  for (int theta = 0; theta <= UINT8_MAX && index < 0; ++theta) {\n    feature.Theta = theta;\n    index = Index(feature);\n  }\n  if (index < 0) {\n    tprintf(\"(%d,%d) does not exist in feature space!\\n\", x, y);\n    return -1;\n  }\n  feature = PositionFromIndex(index);\n  tprintf(\"Click at (%d, %d) ->(%d, %d), ->(%d, %d)\\n\", x, y, feature.X, feature.Y, x - feature.X,\n          y - feature.Y);\n  // Get the relative position of x,y from the rounded feature.\n  x -= feature.X;\n  y -= feature.Y;\n  if (x != 0 || y != 0) {\n    double angle = atan2(static_cast<double>(y), static_cast<double>(x)) + M_PI;\n    angle *= kIntFeatureExtent / (2.0 * M_PI);\n    feature.Theta = static_cast<uint8_t>(angle + 0.5);\n    index = Index(feature);\n    if (index < 0) {\n      tprintf(\"Feature failed to map to a valid index:\");\n      feature.print();\n      return -1;\n    }\n    feature = PositionFromIndex(index);\n  }\n  feature.print();\n  return index;\n}\n\n// Returns an INT_FEATURE_STRUCT corresponding to the given bucket coords.\nINT_FEATURE_STRUCT IntFeatureSpace::PositionFromBuckets(int x, int y, int theta) const {\n  INT_FEATURE_STRUCT pos((x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_,\n                         (y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_,\n                         DivRounded(theta * kIntFeatureExtent, theta_buckets_));\n  return pos;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/classify/intfeaturespace.h",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        intfeaturespace.h\n// Description: Indexed feature space based on INT_FEATURE_STRUCT.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CLASSIFY_INTFEATURESPACE_H_\n#define TESSERACT_CLASSIFY_INTFEATURESPACE_H_\n\n#include \"intproto.h\"\n\n// Extent of x,y,theta in the input feature space. 
[0,255].\nconst int kIntFeatureExtent = 256;\n// Extent of x,y,theta dimensions in the quantized feature space.\nconst int kBoostXYBuckets = 16;\nconst int kBoostDirBuckets = 16;\n\nnamespace tesseract {\n\nclass IndexMap;\n\n// Down-sampling quantization of the INT_FEATURE_STRUCT feature space and\n// conversion to a single scalar index value, used as a binary feature space.\nclass TESS_API IntFeatureSpace {\npublic:\n  IntFeatureSpace();\n  // Default copy constructors and assignment OK!\n\n  // Setup the feature space with the given dimensions.\n  void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets);\n\n  // Serializes the feature space definition to the given file.\n  // Returns false on error.\n  bool Serialize(FILE *fp) const;\n\n  // Returns the total size of the feature space.\n  int Size() const {\n    return static_cast<int>(x_buckets_) * y_buckets_ * theta_buckets_;\n  }\n  // Returns an INT_FEATURE_STRUCT corresponding to the given index.\n  // This is the inverse of the Index member.\n  INT_FEATURE_STRUCT PositionFromIndex(int index) const;\n\n  // Returns a 1-dimensional index corresponding to the given feature value.\n  // Range is [0, Size()-1]. Inverse of PositionFromIndex member.\n  int Index(const INT_FEATURE_STRUCT &f) const {\n    return (XBucket(f.X) * y_buckets_ + YBucket(f.Y)) * theta_buckets_ + ThetaBucket(f.Theta);\n  }\n  // Bulk calls to Index. Maps the given array of features to a vector of\n  // int32_t indices in the same order as the input.\n  void IndexFeatures(const INT_FEATURE_STRUCT *features, int num_features,\n                     std::vector<int> *mapped_features) const;\n  // Bulk calls to Index. 
Maps the given array of features to a vector of\n  // sorted int32_t indices.\n  void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features,\n                            std::vector<int> *sorted_features) const;\n  // Returns a feature space index for the given x,y position in a display\n  // window, or -1 if the feature is a miss.\n  int XYToFeatureIndex(int x, int y) const;\n\nprotected:\n  // Converters to generate indices for individual feature dimensions.\n  int XBucket(int x) const {\n    int bucket = x * x_buckets_ / kIntFeatureExtent;\n    return ClipToRange(bucket, 0, static_cast<int>(x_buckets_) - 1);\n  }\n  int YBucket(int y) const {\n    int bucket = y * y_buckets_ / kIntFeatureExtent;\n    return ClipToRange(bucket, 0, static_cast<int>(y_buckets_) - 1);\n  }\n  // Use DivRounded for theta so that exactly vertical and horizontal are in\n  // the middle of a bucket. The Modulo takes care of the wrap-around.\n  int ThetaBucket(int theta) const {\n    int bucket = DivRounded(theta * theta_buckets_, kIntFeatureExtent);\n    return Modulo(bucket, theta_buckets_);\n  }\n  // Returns an INT_FEATURE_STRUCT corresponding to the given buckets.\n  INT_FEATURE_STRUCT PositionFromBuckets(int x, int y, int theta) const;\n\n  // Feature space definition - serialized.\n  uint8_t x_buckets_;\n  uint8_t y_buckets_;\n  uint8_t theta_buckets_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CLASSIFY_INTFEATURESPACE_H_\n"
  },
  {
    "path": "src/classify/intfx.cpp",
    "content": "/******************************************************************************\n ** Filename:    intfx.c\n ** Purpose:     Integer character normalization & feature extraction\n ** Author:      Robert Moss, rays@google.com (Ray Smith)\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n/**----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------**/\n\n#define _USE_MATH_DEFINES // for M_PI\n\n#include \"intfx.h\"\n\n#include \"classify.h\"\n#include \"intmatcher.h\"\n#include \"linlsq.h\"\n#include \"normalis.h\"\n#include \"statistc.h\"\n#include \"trainingsample.h\"\n\n#include \"helpers.h\"\n\n#include <allheaders.h>\n\n#include <cmath> // for M_PI\n#include <mutex> // for std::mutex\n\nnamespace tesseract {\n\n/**----------------------------------------------------------------------------\n        Global Data Definitions and Declarations\n----------------------------------------------------------------------------**/\n// Look up table for cos and sin to turn the intfx feature angle to a vector.\n// Protected by atan_table_mutex.\n// The entries are in binary degrees where a full circle is 256 binary degrees.\nstatic float cos_table[INT_CHAR_NORM_RANGE];\nstatic float 
sin_table[INT_CHAR_NORM_RANGE];\n\n/**----------------------------------------------------------------------------\n            Public Code\n----------------------------------------------------------------------------**/\n\nvoid InitIntegerFX() {\n  // Guards write access to AtanTable so we don't create it more than once.\n  static std::mutex atan_table_mutex;\n  static bool atan_table_init = false;\n  std::lock_guard<std::mutex> guard(atan_table_mutex);\n  if (!atan_table_init) {\n    for (int i = 0; i < INT_CHAR_NORM_RANGE; ++i) {\n      cos_table[i] = cos(i * 2 * M_PI / INT_CHAR_NORM_RANGE + M_PI);\n      sin_table[i] = sin(i * 2 * M_PI / INT_CHAR_NORM_RANGE + M_PI);\n    }\n    atan_table_init = true;\n  }\n}\n\n// Returns a vector representing the direction of a feature with the given\n// theta direction in an INT_FEATURE_STRUCT.\nFCOORD FeatureDirection(uint8_t theta) {\n  return FCOORD(cos_table[theta], sin_table[theta]);\n}\n\n// Generates a TrainingSample from a TBLOB. Extracts features and sets\n// the bounding box, so classifiers that operate on the image can work.\n// TODO(rays) Make BlobToTrainingSample a member of Classify now that\n// the FlexFx and FeatureDescription code have been removed and LearnBlob\n// is now a member of Classify.\nTrainingSample *BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm,\n                                     INT_FX_RESULT_STRUCT *fx_info,\n                                     std::vector<INT_FEATURE_STRUCT> *bl_features) {\n  std::vector<INT_FEATURE_STRUCT> cn_features;\n  Classify::ExtractFeatures(blob, nonlinear_norm, bl_features, &cn_features, fx_info, nullptr);\n  // TODO(rays) Use blob->PreciseBoundingBox() instead.\n  TBOX box = blob.bounding_box();\n  TrainingSample *sample = nullptr;\n  int num_features = fx_info->NumCN;\n  if (num_features > 0) {\n    sample = TrainingSample::CopyFromFeatures(*fx_info, box, &cn_features[0], num_features);\n  }\n  if (sample != nullptr) {\n    // Set the bounding box 
(in original image coordinates) in the sample.\n    TPOINT topleft, botright;\n    topleft.x = box.left();\n    topleft.y = box.top();\n    botright.x = box.right();\n    botright.y = box.bottom();\n    TPOINT original_topleft, original_botright;\n    blob.denorm().DenormTransform(nullptr, topleft, &original_topleft);\n    blob.denorm().DenormTransform(nullptr, botright, &original_botright);\n    sample->set_bounding_box(\n        TBOX(original_topleft.x, original_botright.y, original_botright.x, original_topleft.y));\n  }\n  return sample;\n}\n\n// Computes the DENORMS for bl(baseline) and cn(character) normalization\n// during feature extraction. The input denorm describes the current state\n// of the blob, which is usually a baseline-normalized word.\n// The Transforms setup are as follows:\n// Baseline Normalized (bl) Output:\n//   We center the grapheme by aligning the x-coordinate of its centroid with\n//   x=128 and leaving the already-baseline-normalized y as-is.\n//\n// Character Normalized (cn) Output:\n//   We align the grapheme's centroid at the origin and scale it\n//   asymmetrically in x and y so that the 2nd moments are a standard value\n//   (51.2) ie the result is vaguely square.\n// If classify_nonlinear_norm is true:\n//   A non-linear normalization is setup that attempts to evenly distribute\n//   edges across x and y.\n//\n// Some of the fields of fx_info are also setup:\n// Length: Total length of outline.\n// Rx:     Rounded y second moment. 
(Reversed by convention.)\n// Ry:     rounded x second moment.\n// Xmean:  Rounded x center of mass of the blob.\n// Ymean:  Rounded y center of mass of the blob.\nvoid Classify::SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm,\n                                DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info) {\n  // Compute 1st and 2nd moments of the original outline.\n  FCOORD center, second_moments;\n  int length = blob.ComputeMoments(&center, &second_moments);\n  if (fx_info != nullptr) {\n    fx_info->Length = length;\n    fx_info->Rx = IntCastRounded(second_moments.y());\n    fx_info->Ry = IntCastRounded(second_moments.x());\n\n    fx_info->Xmean = IntCastRounded(center.x());\n    fx_info->Ymean = IntCastRounded(center.y());\n  }\n  // Setup the denorm for Baseline normalization.\n  bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f, 1.0f, 1.0f,\n                                128.0f, 128.0f);\n  // Setup the denorm for character normalization.\n  if (nonlinear_norm) {\n    std::vector<std::vector<int>> x_coords;\n    std::vector<std::vector<int>> y_coords;\n    TBOX box;\n    blob.GetPreciseBoundingBox(&box);\n    box.pad(1, 1);\n    blob.GetEdgeCoords(box, x_coords, y_coords);\n    cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX, 0.0f, 0.0f, x_coords,\n                              y_coords);\n  } else {\n    cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), center.y(),\n                                  51.2f / second_moments.x(), 51.2f / second_moments.y(), 128.0f,\n                                  128.0f);\n  }\n}\n\n// Helper normalizes the direction, assuming that it is at the given\n// unnormed_pos, using the given denorm, starting at the root_denorm.\nstatic uint8_t NormalizeDirection(uint8_t dir, const FCOORD &unnormed_pos, const DENORM &denorm,\n                                  const DENORM *root_denorm) {\n  // Convert direction to a 
vector.\n  FCOORD unnormed_end;\n  unnormed_end.from_direction(dir);\n  unnormed_end += unnormed_pos;\n  FCOORD normed_pos, normed_end;\n  denorm.NormTransform(root_denorm, unnormed_pos, &normed_pos);\n  denorm.NormTransform(root_denorm, unnormed_end, &normed_end);\n  normed_end -= normed_pos;\n  return normed_end.to_direction();\n}\n\n// Helper returns the mean direction vector from the given stats. Use the\n// mean direction from dirs if there is information available, otherwise, use\n// the fit_vector from point_diffs.\nstatic FCOORD MeanDirectionVector(const LLSQ &point_diffs, const LLSQ &dirs, const FCOORD &start_pt,\n                                  const FCOORD &end_pt) {\n  FCOORD fit_vector;\n  if (dirs.count() > 0) {\n    // There were directions, so use them. To avoid wrap-around problems, we\n    // have 2 accumulators in dirs: x for normal directions and y for\n    // directions offset by 128. We will use the one with the least variance.\n    FCOORD mean_pt = dirs.mean_point();\n    double mean_dir = 0.0;\n    if (dirs.x_variance() <= dirs.y_variance()) {\n      mean_dir = mean_pt.x();\n    } else {\n      mean_dir = mean_pt.y() + 128;\n    }\n    fit_vector.from_direction(Modulo(IntCastRounded(mean_dir), 256));\n  } else {\n    // There were no directions, so we rely on the vector_fit to the points.\n    // Since the vector_fit is 180 degrees ambiguous, we align with the\n    // supplied feature_dir by making the scalar product non-negative.\n    FCOORD feature_dir(end_pt - start_pt);\n    fit_vector = point_diffs.vector_fit();\n    if (fit_vector.x() == 0.0f && fit_vector.y() == 0.0f) {\n      // There was only a single point. Use feature_dir directly.\n      fit_vector = feature_dir;\n    } else {\n      // Sometimes the least mean squares fit is wrong, due to the small sample\n      // of points and scaling. 
Use a 90 degree rotated vector if that matches\n      // feature_dir better.\n      FCOORD fit_vector2 = !fit_vector;\n      // The fit_vector is 180 degrees ambiguous, so resolve the ambiguity by\n      // insisting that the scalar product with the feature_dir should be +ve.\n      if (fit_vector % feature_dir < 0.0) {\n        fit_vector = -fit_vector;\n      }\n      if (fit_vector2 % feature_dir < 0.0) {\n        fit_vector2 = -fit_vector2;\n      }\n      // Even though fit_vector2 has a higher mean squared error, it might be\n      // a better fit, so use it if the dot product with feature_dir is bigger.\n      if (fit_vector2 % feature_dir > fit_vector % feature_dir) {\n        fit_vector = fit_vector2;\n      }\n    }\n  }\n  return fit_vector;\n}\n\n// Helper computes one or more features corresponding to the given points.\n// Emitted features are on the line defined by:\n// start_pt + lambda * (end_pt - start_pt) for scalar lambda.\n// Features are spaced at feature_length intervals.\nstatic int ComputeFeatures(const FCOORD &start_pt, const FCOORD &end_pt, double feature_length,\n                           std::vector<INT_FEATURE_STRUCT> *features) {\n  FCOORD feature_vector(end_pt - start_pt);\n  if (feature_vector.x() == 0.0f && feature_vector.y() == 0.0f) {\n    return 0;\n  }\n  // Compute theta for the feature based on its direction.\n  uint8_t theta = feature_vector.to_direction();\n  // Compute the number of features and lambda_step.\n  double target_length = feature_vector.length();\n  int num_features = IntCastRounded(target_length / feature_length);\n  if (num_features == 0) {\n    return 0;\n  }\n  // Divide the length evenly into num_features pieces.\n  double lambda_step = 1.0 / num_features;\n  double lambda = lambda_step / 2.0;\n  for (int f = 0; f < num_features; ++f, lambda += lambda_step) {\n    FCOORD feature_pt(start_pt);\n    feature_pt += feature_vector * lambda;\n    INT_FEATURE_STRUCT feature(feature_pt, theta);\n    
features->push_back(feature);\n  }\n  return num_features;\n}\n\n// Gathers outline points and their directions from start_index into dirs by\n// stepping along the outline and normalizing the coordinates until the\n// required feature_length has been collected or end_index is reached.\n// On input pos must point to the position corresponding to start_index and on\n// return pos is updated to the current raw position, and pos_normed is set to\n// the normed version of pos.\n// Since directions wrap-around, they need special treatment to get the mean.\n// Provided the cluster of directions doesn't straddle the wrap-around point,\n// the simple mean works. If they do, then, unless the directions are wildly\n// varying, the cluster rotated by 180 degrees will not straddle the wrap-\n// around point, so mean(dir + 180 degrees) - 180 degrees will work. Since\n// LLSQ conveniently stores the mean of 2 variables, we use it to store\n// dir and dir+128 (128 is 180 degrees) and then use the resulting mean\n// with the least variance.\nstatic int GatherPoints(const C_OUTLINE *outline, double feature_length, const DENORM &denorm,\n                        const DENORM *root_denorm, int start_index, int end_index, ICOORD *pos,\n                        FCOORD *pos_normed, LLSQ *points, LLSQ *dirs) {\n  int step_length = outline->pathlength();\n  ICOORD step = outline->step(start_index % step_length);\n  // Prev_normed is the start point of this collection and will be set on the\n  // first iteration, and on later iterations used to determine the length\n  // that has been collected.\n  FCOORD prev_normed;\n  points->clear();\n  dirs->clear();\n  int num_points = 0;\n  int index;\n  for (index = start_index; index <= end_index; ++index, *pos += step) {\n    step = outline->step(index % step_length);\n    int edge_weight = outline->edge_strength_at_index(index % step_length);\n    if (edge_weight == 0) {\n      // This point has conflicting gradient and step direction, so ignore 
it.\n      continue;\n    }\n    // Get the sub-pixel precise location and normalize.\n    FCOORD f_pos = outline->sub_pixel_pos_at_index(*pos, index % step_length);\n    denorm.NormTransform(root_denorm, f_pos, pos_normed);\n    if (num_points == 0) {\n      // The start of this segment.\n      prev_normed = *pos_normed;\n    } else {\n      FCOORD offset = *pos_normed - prev_normed;\n      float length = offset.length();\n      if (length > feature_length) {\n        // We have gone far enough from the start. We will use this point in\n        // the next set so return what we have so far.\n        return index;\n      }\n    }\n    points->add(pos_normed->x(), pos_normed->y(), edge_weight);\n    int direction = outline->direction_at_index(index % step_length);\n    if (direction >= 0) {\n      direction = NormalizeDirection(direction, f_pos, denorm, root_denorm);\n      // Use both the direction and direction +128 so we are not trying to\n      // take the mean of something straddling the wrap-around point.\n      dirs->add(direction, Modulo(direction + 128, 256));\n    }\n    ++num_points;\n  }\n  return index;\n}\n\n// Extracts Tesseract features and appends them to the features vector.\n// Startpt to lastpt, inclusive, MUST have the same src_outline member,\n// which may be nullptr. The vector from lastpt to its next is included in\n// the feature extraction. 
Hidden edges should be excluded by the caller.\n// If force_poly is true, the features will be extracted from the polygonal\n// approximation even if more accurate data is available.\nstatic void ExtractFeaturesFromRun(const EDGEPT *startpt, const EDGEPT *lastpt,\n                                   const DENORM &denorm, double feature_length, bool force_poly,\n                                   std::vector<INT_FEATURE_STRUCT> *features) {\n  const EDGEPT *endpt = lastpt->next;\n  const C_OUTLINE *outline = startpt->src_outline;\n  if (outline != nullptr && !force_poly) {\n    // Detailed information is available. We have to normalize only from\n    // the root_denorm to denorm.\n    const DENORM *root_denorm = denorm.RootDenorm();\n    int total_features = 0;\n    // Get the features from the outline.\n    int step_length = outline->pathlength();\n    int start_index = startpt->start_step;\n    // pos is the integer coordinates of the binary image steps.\n    ICOORD pos = outline->position_at_index(start_index);\n    // We use an end_index that allows us to use a positive increment, but that\n    // may be beyond the bounds of the outline steps due to wrap-around,\n    // so we use % step_length everywhere, except for start_index.\n    int end_index = lastpt->start_step + lastpt->step_count;\n    if (end_index <= start_index) {\n      end_index += step_length;\n    }\n    LLSQ prev_points;\n    LLSQ prev_dirs;\n    FCOORD prev_normed_pos = outline->sub_pixel_pos_at_index(pos, start_index);\n    denorm.NormTransform(root_denorm, prev_normed_pos, &prev_normed_pos);\n    LLSQ points;\n    LLSQ dirs;\n    FCOORD normed_pos(0.0f, 0.0f);\n    int index = GatherPoints(outline, feature_length, denorm, root_denorm, start_index, end_index,\n                             &pos, &normed_pos, &points, &dirs);\n    while (index <= end_index) {\n      // At each iteration we nominally have 3 accumulated sets of points and\n      // dirs: prev_points/dirs, points/dirs, 
next_points/dirs and sum them\n      // into sum_points/dirs, but we don't necessarily get any features out,\n      // so if that is the case, we keep accumulating instead of rotating the\n      // accumulators.\n      LLSQ next_points;\n      LLSQ next_dirs;\n      FCOORD next_normed_pos(0.0f, 0.0f);\n      index = GatherPoints(outline, feature_length, denorm, root_denorm, index, end_index, &pos,\n                           &next_normed_pos, &next_points, &next_dirs);\n      LLSQ sum_points(prev_points);\n      // TODO(rays) find out why it is better to use just dirs and next_dirs\n      // in sum_dirs, instead of using prev_dirs as well.\n      LLSQ sum_dirs(dirs);\n      sum_points.add(points);\n      sum_points.add(next_points);\n      sum_dirs.add(next_dirs);\n      bool made_features = false;\n      // If we have some points, we can try making some features.\n      if (sum_points.count() > 0) {\n        // We have gone far enough from the start. Make a feature and restart.\n        FCOORD fit_pt = sum_points.mean_point();\n        FCOORD fit_vector = MeanDirectionVector(sum_points, sum_dirs, prev_normed_pos, normed_pos);\n        // The segment to which we fit features is the line passing through\n        // fit_pt in direction of fit_vector that starts nearest to\n        // prev_normed_pos and ends nearest to normed_pos.\n        FCOORD start_pos = prev_normed_pos.nearest_pt_on_line(fit_pt, fit_vector);\n        FCOORD end_pos = normed_pos.nearest_pt_on_line(fit_pt, fit_vector);\n        // Possible correction to match the adjacent polygon segment.\n        if (total_features == 0 && startpt != endpt) {\n          FCOORD poly_pos(startpt->pos.x, startpt->pos.y);\n          denorm.LocalNormTransform(poly_pos, &start_pos);\n        }\n        if (index > end_index && startpt != endpt) {\n          FCOORD poly_pos(endpt->pos.x, endpt->pos.y);\n          denorm.LocalNormTransform(poly_pos, &end_pos);\n        }\n        int num_features = 
ComputeFeatures(start_pos, end_pos, feature_length, features);\n        if (num_features > 0) {\n          // We made some features so shuffle the accumulators.\n          prev_points = points;\n          prev_dirs = dirs;\n          prev_normed_pos = normed_pos;\n          points = next_points;\n          dirs = next_dirs;\n          made_features = true;\n          total_features += num_features;\n        }\n        // The end of the next set becomes the end next time around.\n        normed_pos = next_normed_pos;\n      }\n      if (!made_features) {\n        // We didn't make any features, so keep the prev accumulators and\n        // add the next ones into the current.\n        points.add(next_points);\n        dirs.add(next_dirs);\n      }\n    }\n  } else {\n    // There is no outline, so we are forced to use the polygonal approximation.\n    const EDGEPT *pt = startpt;\n    do {\n      FCOORD start_pos(pt->pos.x, pt->pos.y);\n      FCOORD end_pos(pt->next->pos.x, pt->next->pos.y);\n      denorm.LocalNormTransform(start_pos, &start_pos);\n      denorm.LocalNormTransform(end_pos, &end_pos);\n      ComputeFeatures(start_pos, end_pos, feature_length, features);\n    } while ((pt = pt->next) != endpt);\n  }\n}\n\n// Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as\n// (x,y) position and angle as measured counterclockwise from the vector\n// <-1, 0>, from blob using two normalizations defined by bl_denorm and\n// cn_denorm. 
See SetupBLCNDenorms for definitions.\n// If outline_cn_counts is not nullptr, on return it contains the cumulative\n// number of cn features generated for each outline in the blob (in order).\n// Thus after the first outline, there were (*outline_cn_counts)[0] features,\n// after the second outline, there were (*outline_cn_counts)[1] features etc.\nvoid Classify::ExtractFeatures(const TBLOB &blob, bool nonlinear_norm,\n                               std::vector<INT_FEATURE_STRUCT> *bl_features,\n                               std::vector<INT_FEATURE_STRUCT> *cn_features,\n                               INT_FX_RESULT_STRUCT *results,\n                               std::vector<int> *outline_cn_counts) {\n  DENORM bl_denorm, cn_denorm;\n  tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm, &bl_denorm, &cn_denorm, results);\n  if (outline_cn_counts != nullptr) {\n    outline_cn_counts->clear();\n  }\n  // Iterate the outlines.\n  for (TESSLINE *ol = blob.outlines; ol != nullptr; ol = ol->next) {\n    // Iterate the polygon.\n    EDGEPT *loop_pt = ol->FindBestStartPt();\n    EDGEPT *pt = loop_pt;\n    if (pt == nullptr) {\n      continue;\n    }\n    do {\n      if (pt->IsHidden()) {\n        continue;\n      }\n      // Find a run of equal src_outline.\n      EDGEPT *last_pt = pt;\n      do {\n        last_pt = last_pt->next;\n      } while (last_pt != loop_pt && !last_pt->IsHidden() &&\n               last_pt->src_outline == pt->src_outline);\n      last_pt = last_pt->prev;\n      // Until the adaptive classifier can be weaned off polygon segments,\n      // we have to force extraction from the polygon for the bl_features.\n      ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength, true, bl_features);\n      ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength, false, cn_features);\n      pt = last_pt;\n    } while ((pt = pt->next) != loop_pt);\n    if (outline_cn_counts != nullptr) {\n      
outline_cn_counts->push_back(cn_features->size());\n    }\n  }\n  results->NumBL = bl_features->size();\n  results->NumCN = cn_features->size();\n  results->YBottom = blob.bounding_box().bottom();\n  results->YTop = blob.bounding_box().top();\n  results->Width = blob.bounding_box().width();\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/intfx.h",
    "content": "/******************************************************************************\n **  Filename:    intfx.h\n **  Purpose:     Interface to high level integer feature extractor.\n **  Author:      Robert Moss\n **  History:     Tue May 21 15:51:57 MDT 1991, RWM, Created.\n **\n **  (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n#ifndef INTFX_H\n#define INTFX_H\n\n#include \"blobs.h\"\n#include \"intproto.h\"\n#include \"normalis.h\"\n\n#include <cmath>\n\nnamespace tesseract {\n\nclass DENORM;\n\nclass TrainingSample;\n\nstruct INT_FX_RESULT_STRUCT {\n  int32_t Length;       // total length of all outlines\n  int16_t Xmean, Ymean; // center of mass of all outlines\n  int16_t Rx, Ry;       // radius of gyration\n  int16_t NumBL, NumCN; // number of features extracted\n  int16_t Width;        // Width of blob in BLN coords.\n  uint8_t YBottom;      // Bottom of blob in BLN coords.\n  uint8_t YTop;         // Top of blob in BLN coords.\n};\n\n// The standard feature length\nconst double kStandardFeatureLength = 64.0 / 5;\n\n/**----------------------------------------------------------------------------\n          Public Function Prototypes\n----------------------------------------------------------------------------**/\nTESS_API\nvoid InitIntegerFX();\n\n// Returns a vector representing the direction of a feature with 
the given\n// theta direction in an INT_FEATURE_STRUCT.\nTESS_API\nFCOORD FeatureDirection(uint8_t theta);\n\n// Generates a TrainingSample from a TBLOB. Extracts features and sets\n// the bounding box, so classifiers that operate on the image can work.\n// TODO(rays) BlobToTrainingSample must remain a global function until\n// the FlexFx and FeatureDescription code can be removed and LearnBlob\n// made a member of Classify.\nTrainingSample *BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm,\n                                     INT_FX_RESULT_STRUCT *fx_info,\n                                     std::vector<INT_FEATURE_STRUCT> *bl_features);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/intmatcher.cpp",
    "content": "/******************************************************************************\n ** Filename:    intmatcher.cpp\n ** Purpose:     Generic high level classification routines.\n ** Author:      Robert Moss\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"intmatcher.h\"\n\n#include \"classify.h\"\n#include \"float2int.h\"\n#include \"fontinfo.h\"\n#include \"intproto.h\"\n#include \"scrollview.h\"\n#include \"shapetable.h\"\n\n#include \"helpers.h\"\n\n#include <cassert>\n#include <cmath>\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------------\n                    Global Data Definitions and Declarations\n----------------------------------------------------------------------------*/\n// Parameters of the sigmoid used to convert similarity to evidence in the\n// similarity_evidence_table_ that is used to convert distance metric to an\n// 8 bit evidence value in the secondary matcher. 
(See IntMatcher::Init).\nconst float IntegerMatcher::kSEExponentialMultiplier = 0.0f;\nconst float IntegerMatcher::kSimilarityCenter = 0.0075f;\n\nstatic const uint8_t offset_table[] = {\n    255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,\n    0,   1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0,\n    1,   0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1,\n    0,   3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0,\n    2,   0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,\n    0,   1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0,\n    1,   0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1,\n    0,   2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0,\n    3,   0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};\n\nstatic const uint8_t next_table[] = {\n    0,    0,    0,    0x2,  0,    0x4,  0x4,  0x6,  0,    0x8,  0x8,  0x0a, 0x08, 0x0c, 0x0c, 0x0e,\n    0,    0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,\n    0,    0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,\n    0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,\n    0,    0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,\n    0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,\n    0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,\n    0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,\n    0,    0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,\n    0x80, 0x90, 0x90, 0x92, 
0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,\n    0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,\n    0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,\n    0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,\n    0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,\n    0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,\n    0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe};\n\n// See http://b/19318793 (#6) for a complete discussion.\n\n/**\n * Sort Key array in ascending order using heap sort\n * algorithm.  Also sort Index array that is tied to\n * the key array.\n * @param n Number of elements to sort\n * @param ra     Key array [1..n]\n * @param rb     Index array [1..n]\n */\nstatic void HeapSort(int n, int ra[], int rb[]) {\n  int i, rra, rrb;\n  int l, j, ir;\n\n  l = (n >> 1) + 1;\n  ir = n;\n  for (;;) {\n    if (l > 1) {\n      rra = ra[--l];\n      rrb = rb[l];\n    } else {\n      rra = ra[ir];\n      rrb = rb[ir];\n      ra[ir] = ra[1];\n      rb[ir] = rb[1];\n      if (--ir == 1) {\n        ra[1] = rra;\n        rb[1] = rrb;\n        return;\n      }\n    }\n    i = l;\n    j = l << 1;\n    while (j <= ir) {\n      if (j < ir && ra[j] < ra[j + 1]) {\n        ++j;\n      }\n      if (rra < ra[j]) {\n        ra[i] = ra[j];\n        rb[i] = rb[j];\n        j += (i = j);\n      } else {\n        j = ir + 1;\n      }\n    }\n    ra[i] = rra;\n    rb[i] = rrb;\n  }\n}\n\n// Encapsulation of the intermediate data and computations made by the class\n// pruner. The class pruner implements a simple linear classifier on binary\n// features by heavily quantizing the feature space, and applying\n// NUM_BITS_PER_CLASS (2)-bit weights to the features. 
Lack of resolution in\n// weights is compensated by a non-constant bias that is dependent on the\n// number of features present.\nclass ClassPruner {\npublic:\n  ClassPruner(int max_classes) {\n    // The unrolled loop in ComputeScores means that the array sizes need to\n    // be rounded up so that the array is big enough to accommodate the extra\n    // entries accessed by the unrolling. Each pruner word is of sized\n    // BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are\n    // BITS_PER_WERD / NUM_BITS_PER_CLASS entries.\n    // See ComputeScores.\n    max_classes_ = max_classes;\n    rounded_classes_ =\n        RoundUp(max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);\n    class_count_ = new int[rounded_classes_];\n    norm_count_ = new int[rounded_classes_];\n    sort_key_ = new int[rounded_classes_ + 1];\n    sort_index_ = new int[rounded_classes_ + 1];\n    for (int i = 0; i < rounded_classes_; i++) {\n      class_count_[i] = 0;\n    }\n    pruning_threshold_ = 0;\n    num_features_ = 0;\n    num_classes_ = 0;\n  }\n\n  ~ClassPruner() {\n    delete[] class_count_;\n    delete[] norm_count_;\n    delete[] sort_key_;\n    delete[] sort_index_;\n  }\n\n  /// Computes the scores for every class in the character set, by summing the\n  /// weights for each feature and stores the sums internally in class_count_.\n  void ComputeScores(const INT_TEMPLATES_STRUCT *int_templates, int num_features,\n                     const INT_FEATURE_STRUCT *features) {\n    num_features_ = num_features;\n    auto num_pruners = int_templates->NumClassPruners;\n    for (int f = 0; f < num_features; ++f) {\n      const INT_FEATURE_STRUCT *feature = &features[f];\n      // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.\n      int x = feature->X * NUM_CP_BUCKETS >> 8;\n      int y = feature->Y * NUM_CP_BUCKETS >> 8;\n      int theta = feature->Theta * NUM_CP_BUCKETS >> 8;\n      int class_id = 0;\n      // Each 
CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so\n      // we need a collection of them, indexed by pruner_set.\n      for (unsigned pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {\n        // Look up quantized feature in a 3-D array, an array of weights for\n        // each class.\n        const uint32_t *pruner_word_ptr = int_templates->ClassPruners[pruner_set]->p[x][y][theta];\n        for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {\n          uint32_t pruner_word = *pruner_word_ptr++;\n          // This inner loop is unrolled to speed up the ClassPruner.\n          // Currently gcc would not unroll it unless it is set to O3\n          // level of optimization or -funroll-loops is specified.\n          /*\nuint32_t class_mask = (1 << NUM_BITS_PER_CLASS) - 1;\nfor (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {\n  class_count_[class_id++] += pruner_word & class_mask;\n  pruner_word >>= NUM_BITS_PER_CLASS;\n}\n*/\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n          pruner_word >>= NUM_BITS_PER_CLASS;\n          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;\n        }\n      }\n    }\n  }\n\n  /// Adjusts the scores according to the number of expected features. Used\n  /// in lieu of a constant bias, this penalizes classes that expect more\n  /// features than there are present. 
Thus an actual c will score higher for c\n  /// than e, even though almost all the features match e as well as c, because\n  /// e expects more features to be present.\n  void AdjustForExpectedNumFeatures(const uint16_t *expected_num_features, int cutoff_strength) {\n    for (int class_id = 0; class_id < max_classes_; ++class_id) {\n      if (num_features_ < expected_num_features[class_id]) {\n        int deficit = expected_num_features[class_id] - num_features_;\n        class_count_[class_id] -=\n            class_count_[class_id] * deficit / (num_features_ * cutoff_strength + deficit);\n      }\n    }\n  }\n\n  /// Zeros the scores for classes disabled in the unicharset.\n  /// Implements the black-list to recognize a subset of the character set.\n  void DisableDisabledClasses(const UNICHARSET &unicharset) {\n    for (int class_id = 0; class_id < max_classes_; ++class_id) {\n      if (!unicharset.get_enabled(class_id)) {\n        class_count_[class_id] = 0; // This char is disabled!\n      }\n    }\n  }\n\n  /** Zeros the scores of fragments. */\n  void DisableFragments(const UNICHARSET &unicharset) {\n    for (int class_id = 0; class_id < max_classes_; ++class_id) {\n      // Do not include character fragments in the class pruner\n      // results if disable_character_fragments is true.\n      if (unicharset.get_fragment(class_id)) {\n        class_count_[class_id] = 0;\n      }\n    }\n  }\n\n  /// Normalizes the counts for xheight, putting the normalized result in\n  /// norm_count_. 
Applies a simple subtractive penalty for incorrect vertical\n  /// position provided by the normalization_factors array, indexed by\n  /// character class, and scaled by the norm_multiplier.\n  void NormalizeForXheight(int norm_multiplier, const uint8_t *normalization_factors) {\n    for (int class_id = 0; class_id < max_classes_; class_id++) {\n      norm_count_[class_id] =\n          class_count_[class_id] - ((norm_multiplier * normalization_factors[class_id]) >> 8);\n    }\n  }\n\n  /** The nop normalization copies the class_count_ array to norm_count_. */\n  void NoNormalization() {\n    for (int class_id = 0; class_id < max_classes_; class_id++) {\n      norm_count_[class_id] = class_count_[class_id];\n    }\n  }\n\n  /// Prunes the classes using &lt;the maximum count> * pruning_factor/256 as a\n  /// threshold for keeping classes. If max_of_non_fragments, then ignore\n  /// fragments in computing the maximum count.\n  void PruneAndSort(int pruning_factor, int keep_this, bool max_of_non_fragments,\n                    const UNICHARSET &unicharset) {\n    int max_count = 0;\n    for (int c = 0; c < max_classes_; ++c) {\n      if (norm_count_[c] > max_count &&\n          // This additional check is added in order to ensure that\n          // the classifier will return at least one non-fragmented\n          // character match.\n          // TODO(daria): verify that this helps accuracy and does not\n          // hurt performance.\n          (!max_of_non_fragments || !unicharset.get_fragment(c))) {\n        max_count = norm_count_[c];\n      }\n    }\n    // Prune Classes.\n    pruning_threshold_ = (max_count * pruning_factor) >> 8;\n    // Select Classes.\n    if (pruning_threshold_ < 1) {\n      pruning_threshold_ = 1;\n    }\n    num_classes_ = 0;\n    for (int class_id = 0; class_id < max_classes_; class_id++) {\n      if (norm_count_[class_id] >= pruning_threshold_ || class_id == keep_this) {\n        ++num_classes_;\n        sort_index_[num_classes_] = 
class_id;\n        sort_key_[num_classes_] = norm_count_[class_id];\n      }\n    }\n\n    // Sort Classes using Heapsort Algorithm.\n    if (num_classes_ > 1) {\n      HeapSort(num_classes_, sort_key_, sort_index_);\n    }\n  }\n\n  /** Prints debug info on the class pruner matches for the pruned classes only.\n   */\n  void DebugMatch(const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates,\n                  const INT_FEATURE_STRUCT *features) const {\n    int num_pruners = int_templates->NumClassPruners;\n    int max_num_classes = int_templates->NumClasses;\n    for (int f = 0; f < num_features_; ++f) {\n      const INT_FEATURE_STRUCT *feature = &features[f];\n      tprintf(\"F=%3d(%d,%d,%d),\", f, feature->X, feature->Y, feature->Theta);\n      // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.\n      int x = feature->X * NUM_CP_BUCKETS >> 8;\n      int y = feature->Y * NUM_CP_BUCKETS >> 8;\n      int theta = feature->Theta * NUM_CP_BUCKETS >> 8;\n      int class_id = 0;\n      for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {\n        // Look up quantized feature in a 3-D array, an array of weights for\n        // each class.\n        const uint32_t *pruner_word_ptr = int_templates->ClassPruners[pruner_set]->p[x][y][theta];\n        for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {\n          uint32_t pruner_word = *pruner_word_ptr++;\n          for (int word_class = 0; word_class < 16 && class_id < max_num_classes;\n               ++word_class, ++class_id) {\n            if (norm_count_[class_id] >= pruning_threshold_) {\n              tprintf(\" %s=%d,\", classify.ClassIDToDebugStr(int_templates, class_id, 0).c_str(),\n                      pruner_word & CLASS_PRUNER_CLASS_MASK);\n            }\n            pruner_word >>= NUM_BITS_PER_CLASS;\n          }\n        }\n        tprintf(\"\\n\");\n      }\n    }\n  }\n\n  /** Prints a summary of the pruner result. 
*/\n  void SummarizeResult(const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates,\n                       const uint16_t *expected_num_features, int norm_multiplier,\n                       const uint8_t *normalization_factors) const {\n    tprintf(\"CP:%d classes, %d features:\\n\", num_classes_, num_features_);\n    for (int i = 0; i < num_classes_; ++i) {\n      int class_id = sort_index_[num_classes_ - i];\n      std::string class_string = classify.ClassIDToDebugStr(int_templates, class_id, 0);\n      tprintf(\n          \"%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\\n\", class_string.c_str(),\n          class_count_[class_id], expected_num_features[class_id],\n          (norm_multiplier * normalization_factors[class_id]) >> 8, sort_key_[num_classes_ - i],\n          100.0 - 100.0 * sort_key_[num_classes_ - i] / (CLASS_PRUNER_CLASS_MASK * num_features_));\n    }\n  }\n\n  /// Copies the pruned, sorted classes into the output results and returns\n  /// the number of classes.\n  int SetupResults(std::vector<CP_RESULT_STRUCT> *results) const {\n    results->clear();\n    results->resize(num_classes_);\n    for (int c = 0; c < num_classes_; ++c) {\n      (*results)[c].Class = sort_index_[num_classes_ - c];\n      (*results)[c].Rating =\n          1.0f - sort_key_[num_classes_ - c] /\n                     (static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_);\n    }\n    return num_classes_;\n  }\n\nprivate:\n  /** Array[rounded_classes_] of initial counts for each class. */\n  int *class_count_;\n  /// Array[rounded_classes_] of modified counts for each class after\n  /// normalizing for expected number of features, disabled classes, fragments,\n  /// and xheights.\n  int *norm_count_;\n  /** Array[rounded_classes_ +1] of pruned counts that gets sorted */\n  int *sort_key_;\n  /** Array[rounded_classes_ +1] of classes corresponding to sort_key_. */\n  int *sort_index_;\n  /** Number of classes in this class pruner. 
*/\n  int max_classes_;\n  /** Rounded up number of classes used for array sizes. */\n  int rounded_classes_;\n  /** Threshold count applied to prune classes. */\n  int pruning_threshold_;\n  /** The number of features used to compute the scores. */\n  int num_features_;\n  /** Final number of pruned classes. */\n  int num_classes_;\n};\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n/**\n * Runs the class pruner from int_templates on the given features, returning\n * the number of classes output in results.\n * @param int_templates          Class pruner tables\n * @param num_features           Number of features in blob\n * @param features               Array of features\n * @param normalization_factors  Array of fudge factors from blob\n *                               normalization process (by CLASS_INDEX)\n * @param expected_num_features  Array of expected number of features\n *                               for each class (by CLASS_INDEX)\n * @param results                Sorted Array of pruned classes. 
Must be an\n *                               array of size at least\n *                               int_templates->NumClasses.\n * @param keep_this\n */\nint Classify::PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features,\n                           int keep_this, const INT_FEATURE_STRUCT *features,\n                           const uint8_t *normalization_factors,\n                           const uint16_t *expected_num_features,\n                           std::vector<CP_RESULT_STRUCT> *results) {\n  ClassPruner pruner(int_templates->NumClasses);\n  // Compute initial match scores for all classes.\n  pruner.ComputeScores(int_templates, num_features, features);\n  // Adjust match scores for number of expected features.\n  pruner.AdjustForExpectedNumFeatures(expected_num_features, classify_cp_cutoff_strength);\n  // Apply disabled classes in unicharset - only works without a shape_table.\n  if (shape_table_ == nullptr) {\n    pruner.DisableDisabledClasses(unicharset);\n  }\n  // If fragments are disabled, remove them, also only without a shape table.\n  if (disable_character_fragments && shape_table_ == nullptr) {\n    pruner.DisableFragments(unicharset);\n  }\n\n  // If we have good x-heights, apply the given normalization factors.\n  if (normalization_factors != nullptr) {\n    pruner.NormalizeForXheight(classify_class_pruner_multiplier, normalization_factors);\n  } else {\n    pruner.NoNormalization();\n  }\n  // Do the actual pruning and sort the short-list.\n  pruner.PruneAndSort(classify_class_pruner_threshold, keep_this, shape_table_ == nullptr,\n                      unicharset);\n\n  if (classify_debug_level > 2) {\n    pruner.DebugMatch(*this, int_templates, features);\n  }\n  if (classify_debug_level > 1) {\n    pruner.SummarizeResult(*this, int_templates, expected_num_features,\n                           classify_class_pruner_multiplier, normalization_factors);\n  }\n  // Convert to the expected output format.\n  return 
pruner.SetupResults(results);\n}\n\n/**\n * IntegerMatcher returns the best configuration and rating\n * for a single class.  The class matched against is determined\n * by the uniqueness of the ClassTemplate parameter.  The\n * best rating and its associated configuration are returned.\n *\n * Globals:\n * - local_matcher_multiplier_ Normalization factor multiplier\n * param ClassTemplate Prototypes & tables for a class\n * param NumFeatures Number of features in blob\n * param Features Array of features\n * param NormalizationFactor Fudge factor from blob normalization process\n * param Result Class rating & configuration: (0.0 -> 1.0), 0=bad, 1=good\n * param Debug Debugger flag: 1=debugger on\n */\nvoid IntegerMatcher::Match(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask,\n                           int16_t NumFeatures, const INT_FEATURE_STRUCT *Features,\n                           UnicharRating *Result, int AdaptFeatureThreshold, int Debug,\n                           bool SeparateDebugWindows) {\n  auto *tables = new ScratchEvidence();\n  int Feature;\n\n  if (MatchDebuggingOn(Debug)) {\n    tprintf(\"Integer Matcher -------------------------------------------\\n\");\n  }\n\n  tables->Clear(ClassTemplate);\n  Result->feature_misses = 0;\n\n  for (Feature = 0; Feature < NumFeatures; Feature++) {\n    int csum = UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature,\n                                      &Features[Feature], tables, Debug);\n    // Count features that were missed over all configs.\n    if (csum == 0) {\n      ++Result->feature_misses;\n    }\n  }\n\n#ifndef GRAPHICS_DISABLED\n  if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug)) {\n    DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables, NumFeatures, Debug);\n  }\n\n  if (DisplayProtoMatchesOn(Debug)) {\n    DisplayProtoDebugInfo(ClassTemplate, ConfigMask, *tables, SeparateDebugWindows);\n  }\n\n  if 
(DisplayFeatureMatchesOn(Debug)) {\n    DisplayFeatureDebugInfo(ClassTemplate, ProtoMask, ConfigMask, NumFeatures, Features,\n                            AdaptFeatureThreshold, Debug, SeparateDebugWindows);\n  }\n#endif\n\n  tables->UpdateSumOfProtoEvidences(ClassTemplate, ConfigMask);\n  tables->NormalizeSums(ClassTemplate, NumFeatures);\n\n  FindBestMatch(ClassTemplate, *tables, Result);\n\n#ifndef GRAPHICS_DISABLED\n  if (PrintMatchSummaryOn(Debug)) {\n    Result->Print();\n  }\n\n  if (MatchDebuggingOn(Debug)) {\n    tprintf(\"Match Complete --------------------------------------------\\n\");\n  }\n#endif\n\n  delete tables;\n}\n\n/**\n * FindGoodProtos finds all protos whose normalized proto-evidence\n * exceed AdaptProtoThreshold.  The list is ordered by increasing\n * proto id number.\n *\n * Globals:\n * - local_matcher_multiplier_    Normalization factor multiplier\n * param ClassTemplate Prototypes & tables for a class\n * param ProtoMask AND Mask for proto word\n * param ConfigMask AND Mask for config word\n * param NumFeatures Number of features in blob\n * param Features Array of features\n * param ProtoArray Array of good protos\n * param AdaptProtoThreshold Threshold for good protos\n * param Debug Debugger flag: 1=debugger on\n * @return Number of good protos in ProtoArray.\n */\nint IntegerMatcher::FindGoodProtos(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask,\n                                   BIT_VECTOR ConfigMask, int16_t NumFeatures,\n                                   INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray,\n                                   int AdaptProtoThreshold, int Debug) {\n  auto *tables = new ScratchEvidence();\n  int NumGoodProtos = 0;\n\n  /* DEBUG opening heading */\n  if (MatchDebuggingOn(Debug)) {\n    tprintf(\"Find Good Protos -------------------------------------------\\n\");\n  }\n\n  tables->Clear(ClassTemplate);\n\n  for (int Feature = 0; Feature < NumFeatures; Feature++) {\n    
UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature, &(Features[Feature]),\n                           tables, Debug);\n  }\n\n#ifndef GRAPHICS_DISABLED\n  if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug)) {\n    DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables, NumFeatures, Debug);\n  }\n#endif\n\n  /* Average Proto Evidences & Find Good Protos */\n  for (int proto = 0; proto < ClassTemplate->NumProtos; proto++) {\n    /* Compute Average for Actual Proto */\n    int Temp = 0;\n    for (uint8_t i = 0; i < MAX_PROTO_INDEX && i < ClassTemplate->ProtoLengths[proto]; i++) {\n      Temp += tables->proto_evidence_[proto][i];\n    }\n\n    Temp /= ClassTemplate->ProtoLengths[proto];\n\n    /* Find Good Protos */\n    if (Temp >= AdaptProtoThreshold) {\n      *ProtoArray = proto;\n      ProtoArray++;\n      NumGoodProtos++;\n    }\n  }\n\n  if (MatchDebuggingOn(Debug)) {\n    tprintf(\"Match Complete --------------------------------------------\\n\");\n  }\n  delete tables;\n\n  return NumGoodProtos;\n}\n\n/**\n * FindBadFeatures finds all features with maximum feature-evidence <\n * AdaptFeatureThresh. 
The list is ordered by increasing feature number.\n * @param ClassTemplate Prototypes & tables for a class\n * @param ProtoMask AND Mask for proto word\n * @param ConfigMask AND Mask for config word\n * @param NumFeatures Number of features in blob\n * @param Features Array of features\n * @param FeatureArray Array of bad features\n * @param AdaptFeatureThreshold Threshold for bad features\n * @param Debug Debugger flag: 1=debugger on\n * @return Number of bad features in FeatureArray.\n */\nint IntegerMatcher::FindBadFeatures(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask,\n                                    BIT_VECTOR ConfigMask, int16_t NumFeatures,\n                                    INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray,\n                                    int AdaptFeatureThreshold, int Debug) {\n  auto *tables = new ScratchEvidence();\n  int NumBadFeatures = 0;\n\n  /* DEBUG opening heading */\n  if (MatchDebuggingOn(Debug)) {\n    tprintf(\"Find Bad Features -------------------------------------------\\n\");\n  }\n\n  tables->Clear(ClassTemplate);\n\n  for (int Feature = 0; Feature < NumFeatures; Feature++) {\n    UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature, &Features[Feature],\n                           tables, Debug);\n\n    /* Find Best Evidence for Current Feature */\n    int best = 0;\n    assert(ClassTemplate->NumConfigs < MAX_NUM_CONFIGS);\n    for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++) {\n      if (tables->feature_evidence_[i] > best) {\n        best = tables->feature_evidence_[i];\n      }\n    }\n\n    /* Find Bad Features */\n    if (best < AdaptFeatureThreshold) {\n      *FeatureArray = Feature;\n      FeatureArray++;\n      NumBadFeatures++;\n    }\n  }\n\n#ifndef GRAPHICS_DISABLED\n  if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug)) {\n    DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables, NumFeatures, Debug);\n  }\n#endif\n\n  if 
(MatchDebuggingOn(Debug)) {\n    tprintf(\"Match Complete --------------------------------------------\\n\");\n  }\n\n  delete tables;\n  return NumBadFeatures;\n}\n\nIntegerMatcher::IntegerMatcher(tesseract::IntParam *classify_debug_level)\n    : classify_debug_level_(classify_debug_level) {\n  /* Initialize table for evidence to similarity lookup */\n  for (int i = 0; i < SE_TABLE_SIZE; i++) {\n    uint32_t IntSimilarity = i << (27 - SE_TABLE_BITS);\n    double Similarity = (static_cast<double>(IntSimilarity)) / 65536.0 / 65536.0;\n    double evidence = Similarity / kSimilarityCenter;\n    evidence = 255.0 / (evidence * evidence + 1.0);\n\n    if (kSEExponentialMultiplier > 0.0) {\n      double scale =\n          1.0 - std::exp(-kSEExponentialMultiplier) *\n                    exp(kSEExponentialMultiplier * (static_cast<double>(i) / SE_TABLE_SIZE));\n      evidence *= ClipToRange(scale, 0.0, 1.0);\n    }\n\n    similarity_evidence_table_[i] = static_cast<uint8_t>(evidence + 0.5);\n  }\n\n  /* Initialize evidence computation variables */\n  evidence_table_mask_ = ((1 << kEvidenceTableBits) - 1) << (9 - kEvidenceTableBits);\n  mult_trunc_shift_bits_ = (14 - kIntEvidenceTruncBits);\n  table_trunc_shift_bits_ = (27 - SE_TABLE_BITS - (mult_trunc_shift_bits_ << 1));\n  evidence_mult_mask_ = ((1 << kIntEvidenceTruncBits) - 1);\n}\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\nvoid ScratchEvidence::Clear(const INT_CLASS_STRUCT *class_template) {\n  memset(sum_feature_evidence_, 0, class_template->NumConfigs * sizeof(sum_feature_evidence_[0]));\n  memset(proto_evidence_, 0, class_template->NumProtos * sizeof(proto_evidence_[0]));\n}\n\nvoid ScratchEvidence::ClearFeatureEvidence(const INT_CLASS_STRUCT *class_template) {\n  memset(feature_evidence_, 0, class_template->NumConfigs * sizeof(feature_evidence_[0]));\n}\n\n/**\n * Print 
debugging information for Configurations\n */\nstatic void IMDebugConfiguration(int FeatureNum, uint16_t ActualProtoNum, uint8_t Evidence,\n                                 uint32_t ConfigWord) {\n  tprintf(\"F = %3d, P = %3d, E = %3d, Configs = \", FeatureNum, static_cast<int>(ActualProtoNum),\n          static_cast<int>(Evidence));\n  while (ConfigWord) {\n    if (ConfigWord & 1) {\n      tprintf(\"1\");\n    } else {\n      tprintf(\"0\");\n    }\n    ConfigWord >>= 1;\n  }\n  tprintf(\"\\n\");\n}\n\n/**\n * Print debugging information for Configurations\n */\nstatic void IMDebugConfigurationSum(int FeatureNum, uint8_t *FeatureEvidence, int32_t ConfigCount) {\n  tprintf(\"F=%3d, C=\", FeatureNum);\n  for (int ConfigNum = 0; ConfigNum < ConfigCount; ConfigNum++) {\n    tprintf(\"%4d\", FeatureEvidence[ConfigNum]);\n  }\n  tprintf(\"\\n\");\n}\n\n/**\n * For the given feature: prune protos, compute evidence,\n * update Feature Evidence, Proto Evidence, and Sum of Feature\n * Evidence tables.\n * @param ClassTemplate Prototypes & tables for a class\n * @param FeatureNum Current feature number (for DEBUG only)\n * @param Feature Pointer to a feature struct\n * @param tables Evidence tables\n * @param Debug Debugger flag: 1=debugger on\n * @return sum of feature evidence tables\n */\nint IntegerMatcher::UpdateTablesForFeature(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask,\n                                           BIT_VECTOR ConfigMask, int FeatureNum,\n                                           const INT_FEATURE_STRUCT *Feature,\n                                           ScratchEvidence *tables, int Debug) {\n  uint32_t ConfigWord;\n  uint32_t ProtoWord;\n  uint32_t ProtoNum;\n  uint32_t ActualProtoNum;\n  uint8_t proto_byte;\n  int32_t proto_word_offset;\n  int32_t proto_offset;\n  PROTO_SET_STRUCT *ProtoSet;\n  uint32_t *ProtoPrunerPtr;\n  INT_PROTO_STRUCT *Proto;\n  int ProtoSetIndex;\n  uint8_t Evidence;\n  uint32_t XFeatureAddress;\n  uint32_t 
YFeatureAddress;\n  uint32_t ThetaFeatureAddress;\n\n  tables->ClearFeatureEvidence(ClassTemplate);\n\n  /* Precompute Feature Address offset for Proto Pruning */\n  XFeatureAddress = ((Feature->X >> 2) << 1);\n  YFeatureAddress = (NUM_PP_BUCKETS << 1) + ((Feature->Y >> 2) << 1);\n  ThetaFeatureAddress = (NUM_PP_BUCKETS << 2) + ((Feature->Theta >> 2) << 1);\n\n  for (ProtoSetIndex = 0, ActualProtoNum = 0; ProtoSetIndex < ClassTemplate->NumProtoSets;\n       ProtoSetIndex++) {\n    ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];\n    ProtoPrunerPtr = reinterpret_cast<uint32_t *>((*ProtoSet).ProtoPruner);\n    for (ProtoNum = 0; ProtoNum < PROTOS_PER_PROTO_SET; ProtoNum += (PROTOS_PER_PROTO_SET >> 1),\n        ActualProtoNum += (PROTOS_PER_PROTO_SET >> 1), ProtoMask++, ProtoPrunerPtr++) {\n      /* Prune Protos of current Proto Set */\n      ProtoWord = *(ProtoPrunerPtr + XFeatureAddress);\n      ProtoWord &= *(ProtoPrunerPtr + YFeatureAddress);\n      ProtoWord &= *(ProtoPrunerPtr + ThetaFeatureAddress);\n      ProtoWord &= *ProtoMask;\n\n      if (ProtoWord != 0) {\n        proto_byte = ProtoWord & 0xff;\n        ProtoWord >>= 8;\n        proto_word_offset = 0;\n        while (ProtoWord != 0 || proto_byte != 0) {\n          while (proto_byte == 0) {\n            proto_byte = ProtoWord & 0xff;\n            ProtoWord >>= 8;\n            proto_word_offset += 8;\n          }\n          proto_offset = offset_table[proto_byte] + proto_word_offset;\n          proto_byte = next_table[proto_byte];\n          Proto = &(ProtoSet->Protos[ProtoNum + proto_offset]);\n          ConfigWord = Proto->Configs[0];\n          int32_t A3 = (((Proto->A * (Feature->X - 128)) * 2) - (Proto->B * (Feature->Y - 128)) +\n                        (Proto->C * 512));\n          int32_t M3 = ((static_cast<int8_t>(Feature->Theta - Proto->Angle)) * kIntThetaFudge) * 2;\n\n          if (A3 < 0) {\n            A3 = ~A3;\n          }\n          if (M3 < 0) {\n            M3 = ~M3;\n          }\n     
     A3 >>= mult_trunc_shift_bits_;\n          M3 >>= mult_trunc_shift_bits_;\n          if (static_cast<uint32_t>(A3) > evidence_mult_mask_) {\n            A3 = evidence_mult_mask_;\n          }\n          if (static_cast<uint32_t>(M3) > evidence_mult_mask_) {\n            M3 = evidence_mult_mask_;\n          }\n\n          uint32_t A4 = (A3 * A3) + (M3 * M3);\n          A4 >>= table_trunc_shift_bits_;\n          if (A4 > evidence_table_mask_) {\n            Evidence = 0;\n          } else {\n            Evidence = similarity_evidence_table_[A4];\n          }\n\n          if (PrintFeatureMatchesOn(Debug)) {\n            IMDebugConfiguration(FeatureNum, ActualProtoNum + proto_offset, Evidence, ConfigWord);\n          }\n\n          ConfigWord &= *ConfigMask;\n\n          uint8_t feature_evidence_index = 0;\n          uint8_t config_byte = 0;\n          while (ConfigWord != 0 || config_byte != 0) {\n            while (config_byte == 0) {\n              config_byte = ConfigWord & 0xff;\n              ConfigWord >>= 8;\n              feature_evidence_index += 8;\n            }\n            const uint8_t config_offset = offset_table[config_byte] + feature_evidence_index - 8;\n            config_byte = next_table[config_byte];\n            if (Evidence > tables->feature_evidence_[config_offset]) {\n              tables->feature_evidence_[config_offset] = Evidence;\n            }\n          }\n\n          uint8_t ProtoIndex = ClassTemplate->ProtoLengths[ActualProtoNum + proto_offset];\n          if (ProtoIndex > MAX_PROTO_INDEX) {\n            // Avoid buffer overflow.\n            // TODO: A better fix is still open.\n            ProtoIndex = MAX_PROTO_INDEX;\n          }\n          uint8_t *UINT8Pointer = &(tables->proto_evidence_[ActualProtoNum + proto_offset][0]);\n          for (; Evidence > 0 && ProtoIndex > 0; ProtoIndex--, UINT8Pointer++) {\n            if (Evidence > *UINT8Pointer) {\n              uint8_t Temp = *UINT8Pointer;\n              *UINT8Pointer = 
Evidence;\n              Evidence = Temp;\n            }\n          }\n        }\n      }\n    }\n  }\n\n  if (PrintFeatureMatchesOn(Debug)) {\n    IMDebugConfigurationSum(FeatureNum, tables->feature_evidence_, ClassTemplate->NumConfigs);\n  }\n\n  int *IntPointer = tables->sum_feature_evidence_;\n  uint8_t *UINT8Pointer = tables->feature_evidence_;\n  int SumOverConfigs = 0;\n  for (int ConfigNum = ClassTemplate->NumConfigs; ConfigNum > 0; ConfigNum--) {\n    int evidence = *UINT8Pointer++;\n    SumOverConfigs += evidence;\n    *IntPointer++ += evidence;\n  }\n  return SumOverConfigs;\n}\n\n/**\n * Print debugging information for Configurations\n */\n#ifndef GRAPHICS_DISABLED\nvoid IntegerMatcher::DebugFeatureProtoError(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask,\n                                            BIT_VECTOR ConfigMask, const ScratchEvidence &tables,\n                                            int16_t NumFeatures, int Debug) {\n  float ProtoConfigs[MAX_NUM_CONFIGS];\n  int ConfigNum;\n  uint32_t ConfigWord;\n  int ProtoSetIndex;\n  uint16_t ProtoNum;\n  uint8_t ProtoWordNum;\n  PROTO_SET_STRUCT *ProtoSet;\n\n  if (PrintMatchSummaryOn(Debug)) {\n    tprintf(\"Configuration Mask:\\n\");\n    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {\n      tprintf(\"%1d\", (((*ConfigMask) >> ConfigNum) & 1));\n    }\n    tprintf(\"\\n\");\n\n    tprintf(\"Feature Error for Configurations:\\n\");\n    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {\n      tprintf(\" %5.1f\", 100.0 * (1.0 - static_cast<float>(tables.sum_feature_evidence_[ConfigNum]) /\n                                           NumFeatures / 256.0));\n    }\n    tprintf(\"\\n\\n\\n\");\n  }\n\n  if (PrintMatchSummaryOn(Debug)) {\n    tprintf(\"Proto Mask:\\n\");\n    for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) {\n      for (ProtoWordNum = 0; ProtoWordNum < 2; ProtoWordNum++, ProtoMask++) {\n        
uint16_t ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);\n        for (ProtoNum = 0; ((ProtoNum < (PROTOS_PER_PROTO_SET >> 1)) &&\n                            (ActualProtoNum < ClassTemplate->NumProtos));\n             ProtoNum++, ActualProtoNum++) {\n          tprintf(\"%1d\", (((*ProtoMask) >> ProtoNum) & 1));\n        }\n        tprintf(\"\\n\");\n      }\n    }\n    tprintf(\"\\n\");\n  }\n\n  for (int i = 0; i < ClassTemplate->NumConfigs; i++) {\n    ProtoConfigs[i] = 0;\n  }\n\n  if (PrintProtoMatchesOn(Debug)) {\n    tprintf(\"Proto Evidence:\\n\");\n    for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) {\n      ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];\n      uint16_t ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);\n      for (ProtoNum = 0;\n           ((ProtoNum < PROTOS_PER_PROTO_SET) && (ActualProtoNum < ClassTemplate->NumProtos));\n           ProtoNum++, ActualProtoNum++) {\n        tprintf(\"P %3d =\", ActualProtoNum);\n        int temp = 0;\n        for (uint8_t j = 0; j < ClassTemplate->ProtoLengths[ActualProtoNum]; j++) {\n          uint8_t data = tables.proto_evidence_[ActualProtoNum][j];\n          tprintf(\" %d\", data);\n          temp += data;\n        }\n\n        tprintf(\" = %6.4f%%\\n\", temp / 256.0 / ClassTemplate->ProtoLengths[ActualProtoNum]);\n\n        ConfigWord = ProtoSet->Protos[ProtoNum].Configs[0];\n        ConfigNum = 0;\n        while (ConfigWord) {\n          tprintf(\"%5d\", ConfigWord & 1 ? 
temp : 0);\n          if (ConfigWord & 1) {\n            ProtoConfigs[ConfigNum] += temp;\n          }\n          ConfigNum++;\n          ConfigWord >>= 1;\n        }\n        tprintf(\"\\n\");\n      }\n    }\n  }\n\n  if (PrintMatchSummaryOn(Debug)) {\n    tprintf(\"Proto Error for Configurations:\\n\");\n    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {\n      tprintf(\" %5.1f\", 100.0 * (1.0 - ProtoConfigs[ConfigNum] /\n                                           ClassTemplate->ConfigLengths[ConfigNum] / 256.0));\n    }\n    tprintf(\"\\n\\n\");\n  }\n\n  if (PrintProtoMatchesOn(Debug)) {\n    tprintf(\"Proto Sum for Configurations:\\n\");\n    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {\n      tprintf(\" %4.1f\", ProtoConfigs[ConfigNum] / 256.0);\n    }\n    tprintf(\"\\n\\n\");\n\n    tprintf(\"Proto Length for Configurations:\\n\");\n    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {\n      tprintf(\" %4.1f\", static_cast<float>(ClassTemplate->ConfigLengths[ConfigNum]));\n    }\n    tprintf(\"\\n\\n\");\n  }\n}\n\nvoid IntegerMatcher::DisplayProtoDebugInfo(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ConfigMask,\n                                           const ScratchEvidence &tables,\n                                           bool SeparateDebugWindows) {\n  uint16_t ProtoNum;\n  PROTO_SET_STRUCT *ProtoSet;\n  int ProtoSetIndex;\n\n  InitIntMatchWindowIfReqd();\n  if (SeparateDebugWindows) {\n    InitFeatureDisplayWindowIfReqd();\n    InitProtoDisplayWindowIfReqd();\n  }\n\n  for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) {\n    ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];\n    uint16_t ActualProtoNum = ProtoSetIndex * PROTOS_PER_PROTO_SET;\n    for (ProtoNum = 0;\n         ((ProtoNum < PROTOS_PER_PROTO_SET) && (ActualProtoNum < ClassTemplate->NumProtos));\n         ProtoNum++, ActualProtoNum++) {\n      /* Compute Average for 
Actual Proto */\n      int temp = 0;\n      for (uint8_t i = 0; i < ClassTemplate->ProtoLengths[ActualProtoNum]; i++) {\n        temp += tables.proto_evidence_[ActualProtoNum][i];\n      }\n\n      temp /= ClassTemplate->ProtoLengths[ActualProtoNum];\n\n      if ((ProtoSet->Protos[ProtoNum]).Configs[0] & (*ConfigMask)) {\n        DisplayIntProto(ClassTemplate, ActualProtoNum, temp / 255.0);\n      }\n    }\n  }\n}\n\nvoid IntegerMatcher::DisplayFeatureDebugInfo(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask,\n                                             BIT_VECTOR ConfigMask, int16_t NumFeatures,\n                                             const INT_FEATURE_STRUCT *Features,\n                                             int AdaptFeatureThreshold, int Debug,\n                                             bool SeparateDebugWindows) {\n  auto *tables = new ScratchEvidence();\n\n  tables->Clear(ClassTemplate);\n\n  InitIntMatchWindowIfReqd();\n  if (SeparateDebugWindows) {\n    InitFeatureDisplayWindowIfReqd();\n    InitProtoDisplayWindowIfReqd();\n  }\n\n  for (int Feature = 0; Feature < NumFeatures; Feature++) {\n    UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature, &Features[Feature],\n                           tables, 0);\n\n    /* Find Best Evidence for Current Feature */\n    int best = 0;\n    assert(ClassTemplate->NumConfigs < MAX_NUM_CONFIGS);\n    for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++) {\n      if (tables->feature_evidence_[i] > best) {\n        best = tables->feature_evidence_[i];\n      }\n    }\n\n    /* Update display for current feature */\n    if (ClipMatchEvidenceOn(Debug)) {\n      if (best < AdaptFeatureThreshold) {\n        DisplayIntFeature(&Features[Feature], 0.0);\n      } else {\n        DisplayIntFeature(&Features[Feature], 1.0);\n      }\n    } else {\n      DisplayIntFeature(&Features[Feature], best / 255.0);\n    }\n  }\n\n  delete tables;\n}\n#endif\n\n/**\n * Add sum of Proto 
Evidences into Sum Of Feature Evidence Array\n */\nvoid ScratchEvidence::UpdateSumOfProtoEvidences(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ConfigMask) {\n  int *IntPointer;\n  uint32_t ConfigWord;\n  int ProtoSetIndex;\n  uint16_t ProtoNum;\n  PROTO_SET_STRUCT *ProtoSet;\n  int NumProtos;\n\n  NumProtos = ClassTemplate->NumProtos;\n\n  for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) {\n    ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];\n    uint16_t ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);\n    for (ProtoNum = 0; ((ProtoNum < PROTOS_PER_PROTO_SET) && (ActualProtoNum < NumProtos));\n         ProtoNum++, ActualProtoNum++) {\n      int temp = 0;\n      for (uint8_t i = 0; i < MAX_PROTO_INDEX && i < ClassTemplate->ProtoLengths[ActualProtoNum];\n           i++) {\n        temp += proto_evidence_[ActualProtoNum][i];\n      }\n\n      ConfigWord = ProtoSet->Protos[ProtoNum].Configs[0];\n      ConfigWord &= *ConfigMask;\n      IntPointer = sum_feature_evidence_;\n      while (ConfigWord) {\n        if (ConfigWord & 1) {\n          *IntPointer += temp;\n        }\n        IntPointer++;\n        ConfigWord >>= 1;\n      }\n    }\n  }\n}\n\n/**\n * Normalize Sum of Proto and Feature Evidence by dividing by the sum of\n * the Feature Lengths and the Proto Lengths for each configuration.\n */\nvoid ScratchEvidence::NormalizeSums(INT_CLASS_STRUCT *ClassTemplate, int16_t NumFeatures) {\n  // ClassTemplate->NumConfigs can become larger than MAX_NUM_CONFIGS.\n  for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++) {\n    sum_feature_evidence_[i] =\n        (sum_feature_evidence_[i] << 8) / (NumFeatures + ClassTemplate->ConfigLengths[i]);\n  }\n}\n\n/**\n * Find the best match for the current class and update the Result\n * with the configuration and match rating.\n * @return The best normalized sum of evidences\n */\nint IntegerMatcher::FindBestMatch(INT_CLASS_STRUCT *class_template, const 
ScratchEvidence &tables,\n                                  UnicharRating *result) {\n  int best_match = 0;\n  result->config = 0;\n  result->fonts.clear();\n  result->fonts.reserve(class_template->NumConfigs);\n\n  // Find best match.\n  // ClassTemplate->NumConfigs can become larger than MAX_NUM_CONFIGS.\n  for (int c = 0; c < MAX_NUM_CONFIGS && c < class_template->NumConfigs; ++c) {\n    int rating = tables.sum_feature_evidence_[c];\n    if (*classify_debug_level_ > 2) {\n      tprintf(\"Config %d, rating=%d\\n\", c, rating);\n    }\n    if (rating > best_match) {\n      result->config = c;\n      best_match = rating;\n    }\n    result->fonts.emplace_back(c, rating);\n  }\n\n  // Compute confidence on a Probability scale.\n  result->rating = best_match / 65536.0f;\n\n  return best_match;\n}\n\n/**\n * Applies the CN normalization factor to the given rating and returns\n * the modified rating.\n */\nfloat IntegerMatcher::ApplyCNCorrection(float rating, int blob_length, int normalization_factor,\n                                        int matcher_multiplier) {\n  int divisor = blob_length + matcher_multiplier;\n  return divisor == 0\n             ? 1.0f\n             : (rating * blob_length + matcher_multiplier * normalization_factor / 256.0f) /\n                   divisor;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/intmatcher.h",
    "content": "/******************************************************************************\n ** Filename:    intmatcher.h\n ** Purpose:     Interface to high level generic classifier routines.\n ** Author:      Robert Moss\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n#ifndef INTMATCHER_H\n#define INTMATCHER_H\n\n#include \"intproto.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\n// Character fragments could be present in the trained templates\n// but turned on/off on the language-by-language basis or depending\n// on particular properties of the corpus (e.g. 
when we expect the\n// images to have low exposure).\nextern BOOL_VAR_H(disable_character_fragments);\n\nextern INT_VAR_H(classify_integer_matcher_multiplier);\n\nstruct UnicharRating;\n\nstruct CP_RESULT_STRUCT {\n  CP_RESULT_STRUCT() : Rating(0.0f), Class(0) {}\n\n  float Rating;\n  CLASS_ID Class;\n};\n\n/**----------------------------------------------------------------------------\n          Public Function Prototypes\n----------------------------------------------------------------------------**/\n\n#define SE_TABLE_BITS 9\n#define SE_TABLE_SIZE 512\n\nstruct ScratchEvidence {\n  uint8_t feature_evidence_[MAX_NUM_CONFIGS];\n  int sum_feature_evidence_[MAX_NUM_CONFIGS];\n  uint8_t proto_evidence_[MAX_NUM_PROTOS][MAX_PROTO_INDEX];\n\n  void Clear(const INT_CLASS_STRUCT *class_template);\n  void ClearFeatureEvidence(const INT_CLASS_STRUCT *class_template);\n  void NormalizeSums(INT_CLASS_STRUCT *ClassTemplate, int16_t NumFeatures);\n  void UpdateSumOfProtoEvidences(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ConfigMask);\n};\n\nclass IntegerMatcher {\npublic:\n  // Integer Matcher Theta Fudge (0-255).\n  static const int kIntThetaFudge = 128;\n  // Bits in Similarity to Evidence Lookup (8-9).\n  static const int kEvidenceTableBits = 9;\n  // Integer Evidence Truncation Bits (8-14).\n  static const int kIntEvidenceTruncBits = 14;\n  // Similarity to Evidence Table Exponential Multiplier.\n  static const float kSEExponentialMultiplier;\n  // Center of Similarity Curve.\n  static const float kSimilarityCenter;\n\n  IntegerMatcher(tesseract::IntParam *classify_debug_level);\n\n  void Match(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask,\n             int16_t NumFeatures, const INT_FEATURE_STRUCT *Features,\n             tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug,\n             bool SeparateDebugWindows);\n\n  // Applies the CN normalization factor to the given rating and returns\n  // the modified rating.\n  
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor,\n                          int matcher_multiplier);\n\n  int FindGoodProtos(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask,\n                     int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray,\n                     int AdaptProtoThreshold, int Debug);\n\n  int FindBadFeatures(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask,\n                      int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray,\n                      int AdaptFeatureThreshold, int Debug);\n\nprivate:\n  int UpdateTablesForFeature(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask,\n                             int FeatureNum, const INT_FEATURE_STRUCT *Feature,\n                             ScratchEvidence *evidence, int Debug);\n\n  int FindBestMatch(INT_CLASS_STRUCT *ClassTemplate, const ScratchEvidence &tables,\n                    tesseract::UnicharRating *Result);\n\n#ifndef GRAPHICS_DISABLED\n  void DebugFeatureProtoError(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask,\n                              const ScratchEvidence &tables, int16_t NumFeatures, int Debug);\n\n  void DisplayProtoDebugInfo(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ConfigMask,\n                             const ScratchEvidence &tables, bool SeparateDebugWindows);\n\n  void DisplayFeatureDebugInfo(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask,\n                               int16_t NumFeatures, const INT_FEATURE_STRUCT *Features,\n                               int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows);\n#endif\n\nprivate:\n  tesseract::IntParam *classify_debug_level_;\n  uint8_t similarity_evidence_table_[SE_TABLE_SIZE];\n  uint32_t evidence_table_mask_;\n  uint32_t mult_trunc_shift_bits_;\n  uint32_t 
table_trunc_shift_bits_;\n  uint32_t evidence_mult_mask_;\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/intproto.cpp",
    "content": "/******************************************************************************\n ** Filename:    intproto.c\n ** Purpose:     Definition of data structures for integer protos.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n/*-----------------------------------------------------------------------------\n          Include Files and Type Defines\n-----------------------------------------------------------------------------*/\n\n#define _USE_MATH_DEFINES // for M_PI\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"intproto.h\"\n\n#include \"classify.h\"\n#include \"fontinfo.h\"\n#include \"mfoutline.h\"\n#include \"picofeat.h\"\n#include \"points.h\"\n#include \"shapetable.h\"\n#ifndef GRAPHICS_DISABLED\n#include \"svmnode.h\"\n#endif\n\n#include \"helpers.h\"\n\n#include <algorithm>\n#include <cassert>\n#include <cmath> // for M_PI, std::floor\n#include <cstdio>\n\nnamespace tesseract {\n\n/* match debug display constants*/\n#define PROTO_PRUNER_SCALE (4.0)\n\n#define INT_DESCENDER (0.0 * INT_CHAR_NORM_RANGE)\n#define INT_BASELINE (0.25 * INT_CHAR_NORM_RANGE)\n#define INT_XHEIGHT (0.75 * INT_CHAR_NORM_RANGE)\n#define INT_CAPHEIGHT (1.0 * INT_CHAR_NORM_RANGE)\n\n#define 
INT_XCENTER (0.5 * INT_CHAR_NORM_RANGE)\n#define INT_YCENTER (0.5 * INT_CHAR_NORM_RANGE)\n#define INT_XRADIUS (0.2 * INT_CHAR_NORM_RANGE)\n#define INT_YRADIUS (0.2 * INT_CHAR_NORM_RANGE)\n#define INT_MIN_X 0\n#define INT_MIN_Y 0\n#define INT_MAX_X INT_CHAR_NORM_RANGE\n#define INT_MAX_Y INT_CHAR_NORM_RANGE\n\n/** define pad used to snap near horiz/vertical protos to horiz/vertical */\n#define HV_TOLERANCE (0.0025) /* approx 0.9 degrees */\n\ntypedef enum { StartSwitch, EndSwitch, LastSwitch } SWITCH_TYPE;\n#define MAX_NUM_SWITCHES 3\n\nstruct FILL_SWITCH {\n  SWITCH_TYPE Type;\n  int8_t X, Y;\n  int16_t YInit;\n  int16_t Delta;\n};\n\nstruct TABLE_FILLER {\n  uint8_t NextSwitch;\n  uint8_t AngleStart, AngleEnd;\n  int8_t X;\n  int16_t YStart, YEnd;\n  int16_t StartDelta, EndDelta;\n  FILL_SWITCH Switch[MAX_NUM_SWITCHES];\n};\n\nstruct FILL_SPEC {\n  int8_t X;\n  int8_t YStart, YEnd;\n  uint8_t AngleStart, AngleEnd;\n};\n\n/* constants for conversion from old inttemp format */\n#define OLD_MAX_NUM_CONFIGS 32\n#define OLD_WERDS_PER_CONFIG_VEC ((OLD_MAX_NUM_CONFIGS + BITS_PER_WERD - 1) / BITS_PER_WERD)\n\n/*-----------------------------------------------------------------------------\n            Macros\n-----------------------------------------------------------------------------*/\n/** macro for performing circular increments of bucket indices */\n#define CircularIncrement(i, r) (((i) < (r)-1) ? 
((i)++) : ((i) = 0))\n\n/** macro for mapping floats to ints without bounds checking */\n#define MapParam(P, O, N) (std::floor(((P) + (O)) * (N)))\n\n/*---------------------------------------------------------------------------\n            Private Function Prototypes\n----------------------------------------------------------------------------*/\nfloat BucketStart(int Bucket, float Offset, int NumBuckets);\n\nfloat BucketEnd(int Bucket, float Offset, int NumBuckets);\n\nvoid DoFill(FILL_SPEC *FillSpec, CLASS_PRUNER_STRUCT *Pruner, uint32_t ClassMask,\n            uint32_t ClassCount, uint32_t WordIndex);\n\nbool FillerDone(TABLE_FILLER *Filler);\n\nvoid FillPPCircularBits(uint32_t ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR], int Bit,\n                        float Center, float Spread, bool debug);\n\nvoid FillPPLinearBits(uint32_t ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR], int Bit,\n                      float Center, float Spread, bool debug);\n\nvoid GetCPPadsForLevel(int Level, float *EndPad, float *SidePad, float *AnglePad);\n\nScrollView::Color GetMatchColorFor(float Evidence);\n\nvoid GetNextFill(TABLE_FILLER *Filler, FILL_SPEC *Fill);\n\nvoid InitTableFiller(float EndPad, float SidePad, float AnglePad, PROTO_STRUCT *Proto,\n                     TABLE_FILLER *Filler);\n\n#ifndef GRAPHICS_DISABLED\nvoid RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature,\n                      ScrollView::Color color);\n\nvoid RenderIntProto(ScrollView *window, INT_CLASS_STRUCT *Class, PROTO_ID ProtoId, ScrollView::Color color);\n#endif // !GRAPHICS_DISABLED\n\n/*-----------------------------------------------------------------------------\n        Global Data Definitions and Declarations\n-----------------------------------------------------------------------------*/\n\n#ifndef GRAPHICS_DISABLED\n/* global display lists used to display proto and feature match information*/\nstatic ScrollView *IntMatchWindow = nullptr;\nstatic ScrollView 
*FeatureDisplayWindow = nullptr;\nstatic ScrollView *ProtoDisplayWindow = nullptr;\n#endif\n\n/*-----------------------------------------------------------------------------\n        Variables\n-----------------------------------------------------------------------------*/\n\n/* control knobs */\nstatic INT_VAR(classify_num_cp_levels, 3, \"Number of Class Pruner Levels\");\nstatic double_VAR(classify_cp_angle_pad_loose, 45.0, \"Class Pruner Angle Pad Loose\");\nstatic double_VAR(classify_cp_angle_pad_medium, 20.0, \"Class Pruner Angle Pad Medium\");\nstatic double_VAR(classify_cp_angle_pad_tight, 10.0, \"Class Pruner Angle Pad Tight\");\nstatic double_VAR(classify_cp_end_pad_loose, 0.5, \"Class Pruner End Pad Loose\");\nstatic double_VAR(classify_cp_end_pad_medium, 0.5, \"Class Pruner End Pad Medium\");\nstatic double_VAR(classify_cp_end_pad_tight, 0.5, \"Class Pruner End Pad Tight\");\nstatic double_VAR(classify_cp_side_pad_loose, 2.5, \"Class Pruner Side Pad Loose\");\nstatic double_VAR(classify_cp_side_pad_medium, 1.2, \"Class Pruner Side Pad Medium\");\nstatic double_VAR(classify_cp_side_pad_tight, 0.6, \"Class Pruner Side Pad Tight\");\nstatic double_VAR(classify_pp_angle_pad, 45.0, \"Proto Pruner Angle Pad\");\nstatic double_VAR(classify_pp_end_pad, 0.5, \"Proto Pruner End Pad\");\nstatic double_VAR(classify_pp_side_pad, 2.5, \"Proto Pruner Side Pad\");\n\n/**\n * This routine truncates Param to lie within the range\n * of Min-Max inclusive.\n *\n * @param Param   parameter value to be truncated\n * @param Min, Max  parameter limits (inclusive)\n *\n * @return Truncated parameter.\n */\nstatic int TruncateParam(float Param, int Min, int Max) {\n  int result;\n  if (Param < Min) {\n    result = Min;\n  } else if (Param > Max) {\n    result = Max;\n  } else {\n    result = static_cast<int>(std::floor(Param));\n  }\n  return result;\n}\n\n/*-----------------------------------------------------------------------------\n              Public 
Code\n-----------------------------------------------------------------------------*/\n/// Builds a feature from an FCOORD for position with all the necessary\n/// clipping and rounding.\nINT_FEATURE_STRUCT::INT_FEATURE_STRUCT(const FCOORD &pos, uint8_t theta)\n    : X(ClipToRange<int16_t>(static_cast<int16_t>(pos.x() + 0.5), 0, 255))\n    , Y(ClipToRange<int16_t>(static_cast<int16_t>(pos.y() + 0.5), 0, 255))\n    , Theta(theta)\n    , CP_misses(0) {}\n/** Builds a feature from ints with all the necessary clipping and casting. */\nINT_FEATURE_STRUCT::INT_FEATURE_STRUCT(int x, int y, int theta)\n    : X(static_cast<uint8_t>(ClipToRange<int>(x, 0, UINT8_MAX)))\n    , Y(static_cast<uint8_t>(ClipToRange<int>(y, 0, UINT8_MAX)))\n    , Theta(static_cast<uint8_t>(ClipToRange<int>(theta, 0, UINT8_MAX)))\n    , CP_misses(0) {}\n\n/**\n * This routine adds a new class structure to a set of\n * templates. Classes have to be added to Templates in\n * the order of increasing ClassIds.\n *\n * @param Templates templates to add new class to\n * @param ClassId   class id to associate new class with\n * @param Class   class data structure to add to templates\n *\n * Globals: none\n */\nvoid AddIntClass(INT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, INT_CLASS_STRUCT *Class) {\n  int Pruner;\n\n  assert(LegalClassId(ClassId));\n  if (static_cast<unsigned>(ClassId) != Templates->NumClasses) {\n    fprintf(stderr,\n            \"Please make sure that classes are added to templates\"\n            \" in increasing order of ClassIds\\n\");\n    exit(1);\n  }\n  ClassForClassId(Templates, ClassId) = Class;\n  Templates->NumClasses++;\n\n  if (Templates->NumClasses > MaxNumClassesIn(Templates)) {\n    Pruner = Templates->NumClassPruners++;\n    Templates->ClassPruners[Pruner] = new CLASS_PRUNER_STRUCT;\n    memset(Templates->ClassPruners[Pruner], 0, sizeof(CLASS_PRUNER_STRUCT));\n  }\n} /* AddIntClass */\n\n/**\n * This routine returns the index of the next free config\n * in Class.\n 
*\n * @param Class class to add new configuration to\n *\n * Globals: none\n *\n * @return Index of next free config.\n */\nint AddIntConfig(INT_CLASS_STRUCT *Class) {\n  int Index;\n\n  assert(Class->NumConfigs < MAX_NUM_CONFIGS);\n\n  Index = Class->NumConfigs++;\n  Class->ConfigLengths[Index] = 0;\n  return Index;\n} /* AddIntConfig */\n\n/**\n * This routine allocates the next free proto in Class and\n * returns its index.\n *\n * @param Class class to add new proto to\n *\n * Globals: none\n *\n * @return Proto index of new proto.\n */\nint AddIntProto(INT_CLASS_STRUCT *Class) {\n  if (Class->NumProtos >= MAX_NUM_PROTOS) {\n    return (NO_PROTO);\n  }\n\n  int Index = Class->NumProtos++;\n\n  if (Class->NumProtos > MaxNumIntProtosIn(Class)) {\n    int ProtoSetId = Class->NumProtoSets++;\n    auto ProtoSet = new PROTO_SET_STRUCT;\n    Class->ProtoSets[ProtoSetId] = ProtoSet;\n    memset(ProtoSet, 0, sizeof(*ProtoSet));\n\n    /* reallocate space for the proto lengths and install in class */\n    Class->ProtoLengths.resize(MaxNumIntProtosIn(Class));\n  }\n\n  /* initialize proto so its length is zero and it isn't in any configs */\n  Class->ProtoLengths[Index] = 0;\n  auto Proto = ProtoForProtoId(Class, Index);\n  for (uint32_t *Word = Proto->Configs; Word < Proto->Configs + WERDS_PER_CONFIG_VEC; *Word++ = 0) {\n  }\n\n  return (Index);\n}\n\n/**\n * This routine adds Proto to the class pruning tables\n * for the specified class in Templates.\n *\n * Globals:\n *  - classify_num_cp_levels number of levels used in the class pruner\n * @param Proto   floating-pt proto to add to class pruner\n * @param ClassId   class id corresponding to Proto\n * @param Templates set of templates containing class pruner\n */\nvoid AddProtoToClassPruner(PROTO_STRUCT *Proto, CLASS_ID ClassId, INT_TEMPLATES_STRUCT *Templates)\n#define MAX_LEVEL 2\n{\n  CLASS_PRUNER_STRUCT *Pruner;\n  uint32_t ClassMask;\n  uint32_t ClassCount;\n  uint32_t WordIndex;\n  int Level;\n  float EndPad, 
SidePad, AnglePad;\n  TABLE_FILLER TableFiller;\n  FILL_SPEC FillSpec;\n\n  Pruner = CPrunerFor(Templates, ClassId);\n  WordIndex = CPrunerWordIndexFor(ClassId);\n  ClassMask = CPrunerMaskFor(MAX_LEVEL, ClassId);\n\n  for (Level = classify_num_cp_levels - 1; Level >= 0; Level--) {\n    GetCPPadsForLevel(Level, &EndPad, &SidePad, &AnglePad);\n    ClassCount = CPrunerMaskFor(Level, ClassId);\n    InitTableFiller(EndPad, SidePad, AnglePad, Proto, &TableFiller);\n\n    while (!FillerDone(&TableFiller)) {\n      GetNextFill(&TableFiller, &FillSpec);\n      DoFill(&FillSpec, Pruner, ClassMask, ClassCount, WordIndex);\n    }\n  }\n} /* AddProtoToClassPruner */\n\n/**\n * This routine updates the proto pruner lookup tables\n * for Class to include a new proto identified by ProtoId\n * and described by Proto.\n * @param Proto floating-pt proto to be added to proto pruner\n * @param ProtoId id of proto\n * @param Class integer class that contains desired proto pruner\n * @param debug debug flag\n * @note Globals: none\n */\nvoid AddProtoToProtoPruner(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class, bool debug) {\n  float X, Y, Length;\n  float Pad;\n\n  if (ProtoId >= Class->NumProtos) {\n    tprintf(\"AddProtoToProtoPruner:assert failed: %d < %d\", ProtoId, Class->NumProtos);\n  }\n  assert(ProtoId < Class->NumProtos);\n\n  int Index = IndexForProto(ProtoId);\n  auto ProtoSet = Class->ProtoSets[SetForProto(ProtoId)];\n\n  float Angle = Proto->Angle;\n#ifndef _WIN32\n  assert(!std::isnan(Angle));\n#endif\n\n  FillPPCircularBits(ProtoSet->ProtoPruner[PRUNER_ANGLE], Index, Angle + ANGLE_SHIFT,\n                     classify_pp_angle_pad / 360.0, debug);\n\n  Angle *= 2.0 * M_PI;\n  Length = Proto->Length;\n\n  X = Proto->X + X_SHIFT;\n  Pad = std::max(fabs(std::cos(Angle)) * (Length / 2.0 + classify_pp_end_pad * GetPicoFeatureLength()),\n                 fabs(std::sin(Angle)) * (classify_pp_side_pad * GetPicoFeatureLength()));\n\n  
FillPPLinearBits(ProtoSet->ProtoPruner[PRUNER_X], Index, X, Pad, debug);\n\n  Y = Proto->Y + Y_SHIFT;\n  Pad = std::max(fabs(std::sin(Angle)) * (Length / 2.0 + classify_pp_end_pad * GetPicoFeatureLength()),\n                 fabs(std::cos(Angle)) * (classify_pp_side_pad * GetPicoFeatureLength()));\n\n  FillPPLinearBits(ProtoSet->ProtoPruner[PRUNER_Y], Index, Y, Pad, debug);\n} /* AddProtoToProtoPruner */\n\n/**\n * Returns a quantized bucket for the given param shifted by offset,\n * notionally (param + offset) * num_buckets, but clipped and casted to the\n * appropriate type.\n */\nuint8_t Bucket8For(float param, float offset, int num_buckets) {\n  int bucket = IntCastRounded(MapParam(param, offset, num_buckets));\n  return static_cast<uint8_t>(ClipToRange<int>(bucket, 0, num_buckets - 1));\n}\nuint16_t Bucket16For(float param, float offset, int num_buckets) {\n  int bucket = IntCastRounded(MapParam(param, offset, num_buckets));\n  return static_cast<uint16_t>(ClipToRange<int>(bucket, 0, num_buckets - 1));\n}\n\n/**\n * Returns a quantized bucket for the given circular param shifted by offset,\n * notionally (param + offset) * num_buckets, but modded and casted to the\n * appropriate type.\n */\nuint8_t CircBucketFor(float param, float offset, int num_buckets) {\n  int bucket = IntCastRounded(MapParam(param, offset, num_buckets));\n  return static_cast<uint8_t>(Modulo(bucket, num_buckets));\n} /* CircBucketFor */\n\n#ifndef GRAPHICS_DISABLED\n/**\n * This routine clears the global feature and proto\n * display lists.\n *\n * Globals:\n * - FeatureShapes display list for features\n * - ProtoShapes display list for protos\n */\nvoid UpdateMatchDisplay() {\n  if (IntMatchWindow != nullptr) {\n    IntMatchWindow->Update();\n  }\n} /* UpdateMatchDisplay */\n#endif\n\n/**\n * This operation updates the config vectors of all protos\n * in Class to indicate that the protos with 1's in Config\n * belong to a new configuration identified by ConfigId.\n * It is assumed that 
the length of the Config bit vector is\n * equal to the number of protos in Class.\n * @param Config    config to be added to class\n * @param ConfigId  id to be used for new config\n * @param Class   class to add new config to\n */\nvoid ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS_STRUCT *Class) {\n  int ProtoId;\n  INT_PROTO_STRUCT *Proto;\n  int TotalLength;\n\n  for (ProtoId = 0, TotalLength = 0; ProtoId < Class->NumProtos; ProtoId++) {\n    if (test_bit(Config, ProtoId)) {\n      Proto = ProtoForProtoId(Class, ProtoId);\n      SET_BIT(Proto->Configs, ConfigId);\n      TotalLength += Class->ProtoLengths[ProtoId];\n    }\n  }\n  Class->ConfigLengths[ConfigId] = TotalLength;\n} /* ConvertConfig */\n\n/**\n * This routine converts Proto to integer format and\n * installs it as ProtoId in Class.\n * @param Proto floating-pt proto to be converted to integer format\n * @param ProtoId id of proto\n * @param Class integer class to add converted proto to\n */\nvoid Classify::ConvertProto(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class) {\n  assert(ProtoId < Class->NumProtos);\n\n  INT_PROTO_STRUCT *P = ProtoForProtoId(Class, ProtoId);\n\n  float Param = Proto->A * 128;\n  P->A = TruncateParam(Param, -128, 127);\n\n  Param = -Proto->B * 256;\n  P->B = TruncateParam(Param, 0, 255);\n\n  Param = Proto->C * 128;\n  P->C = TruncateParam(Param, -128, 127);\n\n  Param = Proto->Angle * 256;\n  if (Param < 0 || Param >= 256) {\n    P->Angle = 0;\n  } else {\n    P->Angle = static_cast<uint8_t>(Param);\n  }\n\n  /* round proto length to nearest integer number of pico-features */\n  Param = (Proto->Length / GetPicoFeatureLength()) + 0.5;\n  Class->ProtoLengths[ProtoId] = TruncateParam(Param, 1, 255);\n  if (classify_learning_debug_level >= 2) {\n    tprintf(\"Converted ffeat to (A=%d,B=%d,C=%d,L=%d)\", P->A, P->B, P->C,\n            Class->ProtoLengths[ProtoId]);\n  }\n} /* ConvertProto */\n\n/**\n * This routine converts from the old floating point format\n 
* to the new integer format.\n * @param FloatProtos prototypes in old floating pt format\n * @param target_unicharset the UNICHARSET to use\n * @return New set of training templates in integer format.\n * @note Globals: none\n */\nINT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos,\n                                           const UNICHARSET &target_unicharset) {\n  CLASS_TYPE FClass;\n  INT_CLASS_STRUCT *IClass;\n  int ProtoId;\n  int ConfigId;\n\n  auto IntTemplates = new INT_TEMPLATES_STRUCT;\n\n  for (unsigned ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {\n    FClass = &(FloatProtos[ClassId]);\n    if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&\n        strcmp(target_unicharset.id_to_unichar(ClassId), \" \") != 0) {\n      tprintf(\"Warning: no protos/configs for %s in CreateIntTemplates()\\n\",\n              target_unicharset.id_to_unichar(ClassId));\n    }\n    assert(UnusedClassIdIn(IntTemplates, ClassId));\n    IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs);\n    unsigned fs_size = FClass->font_set.size();\n    FontSet fs;\n    fs.reserve(fs_size);\n    for (unsigned i = 0; i < fs_size; ++i) {\n      fs.push_back(FClass->font_set[i]);\n    }\n    IClass->font_set_id = this->fontset_table_.push_back(std::move(fs));\n    AddIntClass(IntTemplates, ClassId, IClass);\n\n    for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {\n      AddIntProto(IClass);\n      ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);\n      AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,\n                            classify_learning_debug_level >= 2);\n      AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);\n    }\n\n    for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {\n      AddIntConfig(IClass);\n      ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);\n    }\n  }\n  return (IntTemplates);\n} /* CreateIntTemplates */\n\n#ifndef 
GRAPHICS_DISABLED\n/**\n * This routine renders the specified feature into a\n * global display list.\n *\n * Globals:\n * - FeatureShapes global display list for features\n * @param Feature   pico-feature to be displayed\n * @param Evidence  best evidence for this feature (0-1)\n */\nvoid DisplayIntFeature(const INT_FEATURE_STRUCT *Feature, float Evidence) {\n  ScrollView::Color color = GetMatchColorFor(Evidence);\n  RenderIntFeature(IntMatchWindow, Feature, color);\n  if (FeatureDisplayWindow) {\n    RenderIntFeature(FeatureDisplayWindow, Feature, color);\n  }\n} /* DisplayIntFeature */\n\n/**\n * This routine renders the specified proto into a\n * global display list.\n *\n * Globals:\n * - ProtoShapes global display list for protos\n * @param Class   class to take proto from\n * @param ProtoId   id of proto in Class to be displayed\n * @param Evidence  total evidence for proto (0-1)\n */\nvoid DisplayIntProto(INT_CLASS_STRUCT *Class, PROTO_ID ProtoId, float Evidence) {\n  ScrollView::Color color = GetMatchColorFor(Evidence);\n  RenderIntProto(IntMatchWindow, Class, ProtoId, color);\n  if (ProtoDisplayWindow) {\n    RenderIntProto(ProtoDisplayWindow, Class, ProtoId, color);\n  }\n} /* DisplayIntProto */\n#endif\n\n/// This constructor creates a new integer class data structure\n/// and returns it.  
Sufficient space is allocated\n/// to handle the specified number of protos and configs.\n/// @param MaxNumProtos  number of protos to allocate space for\n/// @param MaxNumConfigs number of configs to allocate space for\nINT_CLASS_STRUCT::INT_CLASS_STRUCT(int MaxNumProtos, int MaxNumConfigs) :\n  NumProtos(0),\n  NumProtoSets((MaxNumProtos + PROTOS_PER_PROTO_SET - 1) / PROTOS_PER_PROTO_SET),\n  NumConfigs(0),\n  ProtoLengths(MaxNumIntProtosIn(this))\n{\n  assert(MaxNumConfigs <= MAX_NUM_CONFIGS);\n  assert(NumProtoSets <= MAX_NUM_PROTO_SETS);\n\n  for (int i = 0; i < NumProtoSets; i++) {\n    /* allocate space for a proto set, install in class, and initialize */\n    auto ProtoSet = new PROTO_SET_STRUCT;\n    memset(ProtoSet, 0, sizeof(*ProtoSet));\n    ProtoSets[i] = ProtoSet;\n\n    /* allocate space for the proto lengths and install in class */\n  }\n  memset(ConfigLengths, 0, sizeof(ConfigLengths));\n}\n\nINT_CLASS_STRUCT::~INT_CLASS_STRUCT() {\n  for (int i = 0; i < NumProtoSets; i++) {\n    delete ProtoSets[i];\n  }\n}\n\n/// This constructor allocates a new set of integer templates\n/// initialized to hold 0 classes.\nINT_TEMPLATES_STRUCT::INT_TEMPLATES_STRUCT() {\n  NumClasses = 0;\n  NumClassPruners = 0;\n\n  for (int i = 0; i < MAX_NUM_CLASSES; i++) {\n    ClassForClassId(this, i) = nullptr;\n  }\n}\n\nINT_TEMPLATES_STRUCT::~INT_TEMPLATES_STRUCT() {\n  for (unsigned i = 0; i < NumClasses; i++) {\n    delete Class[i];\n  }\n  for (unsigned i = 0; i < NumClassPruners; i++) {\n    delete ClassPruners[i];\n  }\n}\n\n/**\n * This routine reads a set of integer templates from\n * File.  
File must already be open and must be in the\n * correct binary format.\n * @param  fp open file to read templates from\n * @return Pointer to integer templates read from File.\n * @note Globals: none\n */\nINT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {\n  int j, w, x, y, z;\n  INT_TEMPLATES_STRUCT *Templates;\n  CLASS_PRUNER_STRUCT *Pruner;\n  INT_CLASS_STRUCT *Class;\n\n  /* variables for conversion from older inttemp formats */\n  int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;\n  CLASS_ID class_id, max_class_id;\n  std::vector<CLASS_ID> ClassIdFor(MAX_NUM_CLASSES);\n  std::vector<CLASS_PRUNER_STRUCT *> TempClassPruner(MAX_NUM_CLASS_PRUNERS);\n  uint32_t SetBitsForMask =          // word with NUM_BITS_PER_CLASS\n      (1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0\n  uint32_t Mask, NewMask, ClassBits;\n  unsigned MaxNumConfigs = MAX_NUM_CONFIGS;\n  unsigned WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;\n\n  /* first read the high level template struct */\n  Templates = new INT_TEMPLATES_STRUCT;\n  // Read Templates in parts for 64 bit compatibility.\n  uint32_t unicharset_size;\n  if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1) {\n    tprintf(\"Bad read of inttemp!\\n\");\n  }\n  int32_t version_id = 0;\n  if (fp->FReadEndian(&version_id, sizeof(version_id), 1) != 1 ||\n      fp->FReadEndian(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners), 1) != 1) {\n    tprintf(\"Bad read of inttemp!\\n\");\n  }\n  if (version_id < 0) {\n    // This file has a version id!\n    version_id = -version_id;\n    if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1) != 1) {\n      tprintf(\"Bad read of inttemp!\\n\");\n    }\n  } else {\n    Templates->NumClasses = version_id;\n  }\n\n  if (version_id < 3) {\n    MaxNumConfigs = OLD_MAX_NUM_CONFIGS;\n    WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;\n  }\n\n  if (version_id < 2) {\n    std::vector<int16_t> IndexFor(MAX_NUM_CLASSES);\n    
if (fp->FReadEndian(&IndexFor[0], sizeof(IndexFor[0]), unicharset_size) != unicharset_size) {\n      tprintf(\"Bad read of inttemp!\\n\");\n    }\n    if (fp->FReadEndian(&ClassIdFor[0], sizeof(ClassIdFor[0]), Templates->NumClasses) !=\n        Templates->NumClasses) {\n      tprintf(\"Bad read of inttemp!\\n\");\n    }\n  }\n\n  /* then read in the class pruners */\n  const unsigned kNumBuckets = NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR;\n  for (unsigned i = 0; i < Templates->NumClassPruners; i++) {\n    Pruner = new CLASS_PRUNER_STRUCT;\n    if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) != kNumBuckets) {\n      tprintf(\"Bad read of inttemp!\\n\");\n    }\n    if (version_id < 2) {\n      TempClassPruner[i] = Pruner;\n    } else {\n      Templates->ClassPruners[i] = Pruner;\n    }\n  }\n\n  /* fix class pruners if they came from an old version of inttemp */\n  if (version_id < 2) {\n    // Allocate enough class pruners to cover all the class ids.\n    max_class_id = 0;\n    for (unsigned i = 0; i < Templates->NumClasses; i++) {\n      if (ClassIdFor[i] > max_class_id) {\n        max_class_id = ClassIdFor[i];\n      }\n    }\n    for (int i = 0; i <= CPrunerIdFor(max_class_id); i++) {\n      Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;\n      memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));\n    }\n    // Convert class pruners from the old format (indexed by class index)\n    // to the new format (indexed by class id).\n    last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;\n    for (unsigned i = 0; i < Templates->NumClassPruners; i++) {\n      for (x = 0; x < NUM_CP_BUCKETS; x++) {\n        for (y = 0; y < NUM_CP_BUCKETS; y++) {\n          for (z = 0; z < NUM_CP_BUCKETS; z++) {\n            for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {\n              if (TempClassPruner[i]->p[x][y][z][w] == 0) {\n                continue;\n              }\n              for (b = 0; 
b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {\n                bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;\n                if (bit_number > last_cp_bit_number) {\n                  break; // the rest of the bits in this word are not used\n                }\n                class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];\n                // Single out NUM_BITS_PER_CLASS bits relating to class_id.\n                Mask = SetBitsForMask << b;\n                ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;\n                // Move these bits to the new position in which they should\n                // appear (indexed corresponding to the class_id).\n                new_i = CPrunerIdFor(class_id);\n                new_w = CPrunerWordIndexFor(class_id);\n                new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;\n                if (new_b > b) {\n                  ClassBits <<= (new_b - b);\n                } else {\n                  ClassBits >>= (b - new_b);\n                }\n                // Copy bits relating to class_id to the correct position\n                // in Templates->ClassPruner.\n                NewMask = SetBitsForMask << new_b;\n                Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;\n                Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;\n              }\n            }\n          }\n        }\n      }\n    }\n    for (unsigned i = 0; i < Templates->NumClassPruners; i++) {\n      delete TempClassPruner[i];\n    }\n  }\n\n  /* then read in each class */\n  for (unsigned i = 0; i < Templates->NumClasses; i++) {\n    /* first read in the high level struct for the class */\n    Class = new INT_CLASS_STRUCT;\n    if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||\n        fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||\n        fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1) {\n      
tprintf(\"Bad read of inttemp!\\n\");\n    }\n    if (version_id == 0) {\n      // Only version 0 writes 5 pointless pointers to the file.\n      for (j = 0; j < 5; ++j) {\n        int32_t junk;\n        if (fp->FRead(&junk, sizeof(junk), 1) != 1) {\n          tprintf(\"Bad read of inttemp!\\n\");\n        }\n      }\n    }\n    unsigned num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;\n    ASSERT_HOST(num_configs <= MaxNumConfigs);\n    if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) != num_configs) {\n      tprintf(\"Bad read of inttemp!\\n\");\n    }\n    if (version_id < 2) {\n      ClassForClassId(Templates, ClassIdFor[i]) = Class;\n    } else {\n      ClassForClassId(Templates, i) = Class;\n    }\n\n    /* then read in the proto lengths */\n    Class->ProtoLengths.clear();\n    if (MaxNumIntProtosIn(Class) > 0) {\n      Class->ProtoLengths.resize(MaxNumIntProtosIn(Class));\n      if (fp->FRead(&Class->ProtoLengths[0], sizeof(uint8_t), MaxNumIntProtosIn(Class)) !=\n          MaxNumIntProtosIn(Class)) {\n        tprintf(\"Bad read of inttemp!\\n\");\n      }\n    }\n\n    /* then read in the proto sets */\n    for (j = 0; j < Class->NumProtoSets; j++) {\n      auto ProtoSet = new PROTO_SET_STRUCT;\n      unsigned num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;\n      if (fp->FReadEndian(&ProtoSet->ProtoPruner, sizeof(ProtoSet->ProtoPruner[0][0][0]),\n                          num_buckets) != num_buckets) {\n        tprintf(\"Bad read of inttemp!\\n\");\n      }\n      for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {\n        if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A), 1) != 1 ||\n            fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B), 1) != 1 ||\n            fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C), 1) != 1 ||\n            fp->FRead(&ProtoSet->Protos[x].Angle, sizeof(ProtoSet->Protos[x].Angle), 1) != 1) {\n          tprintf(\"Bad read 
of inttemp!\\n\");\n        }\n        if (fp->FReadEndian(&ProtoSet->Protos[x].Configs, sizeof(ProtoSet->Protos[x].Configs[0]),\n                            WerdsPerConfigVec) != WerdsPerConfigVec) {\n          tprintf(\"Bad read of inttemp!\\n\");\n        }\n      }\n      Class->ProtoSets[j] = ProtoSet;\n    }\n    if (version_id < 4) {\n      Class->font_set_id = -1;\n    } else {\n      fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1);\n    }\n  }\n\n  if (version_id < 2) {\n    /* add an empty nullptr class with class id 0 */\n    assert(UnusedClassIdIn(Templates, 0));\n    ClassForClassId(Templates, 0) = new INT_CLASS_STRUCT(1, 1);\n    ClassForClassId(Templates, 0)->font_set_id = -1;\n    Templates->NumClasses++;\n    /* make sure the classes are contiguous */\n    for (unsigned i = 0; i < MAX_NUM_CLASSES; i++) {\n      if (i < Templates->NumClasses) {\n        if (ClassForClassId(Templates, i) == nullptr) {\n          fprintf(stderr, \"Non-contiguous class ids in inttemp\\n\");\n          exit(1);\n        }\n      } else {\n        if (ClassForClassId(Templates, i) != nullptr) {\n          fprintf(stderr, \"Class id %u exceeds NumClassesIn (Templates) %u\\n\", i,\n                  Templates->NumClasses);\n          exit(1);\n        }\n      }\n    }\n  }\n  if (version_id >= 4) {\n    using namespace std::placeholders; // for _1, _2\n    this->fontinfo_table_.read(fp, std::bind(read_info, _1, _2));\n    if (version_id >= 5) {\n      this->fontinfo_table_.read(fp, std::bind(read_spacing_info, _1, _2));\n    }\n    this->fontset_table_.read(fp, [](auto *f, auto *fs) { return f->DeSerialize(*fs); } );\n  }\n\n  return (Templates);\n} /* ReadIntTemplates */\n\n#ifndef GRAPHICS_DISABLED\n/**\n * This routine sends the shapes in the global display\n * lists to the match debugger window.\n *\n * Globals:\n * - FeatureShapes display list containing feature matches\n * - ProtoShapes display list containing proto matches\n */\nvoid 
Classify::ShowMatchDisplay() {\n  InitIntMatchWindowIfReqd();\n  if (ProtoDisplayWindow) {\n    ProtoDisplayWindow->Clear();\n  }\n  if (FeatureDisplayWindow) {\n    FeatureDisplayWindow->Clear();\n  }\n  ClearFeatureSpaceWindow(static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),\n                          IntMatchWindow);\n  IntMatchWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y, INT_MAX_X, INT_MAX_Y);\n  if (ProtoDisplayWindow) {\n    ProtoDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y, INT_MAX_X, INT_MAX_Y);\n  }\n  if (FeatureDisplayWindow) {\n    FeatureDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y, INT_MAX_X, INT_MAX_Y);\n  }\n} /* ShowMatchDisplay */\n\n/// Clears the given window and draws the featurespace guides for the\n/// appropriate normalization method.\nvoid ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window) {\n  window->Clear();\n\n  window->Pen(ScrollView::GREY);\n  // Draw the feature space limit rectangle.\n  window->Rectangle(0, 0, INT_MAX_X, INT_MAX_Y);\n  if (norm_method == baseline) {\n    window->SetCursor(0, INT_DESCENDER);\n    window->DrawTo(INT_MAX_X, INT_DESCENDER);\n    window->SetCursor(0, INT_BASELINE);\n    window->DrawTo(INT_MAX_X, INT_BASELINE);\n    window->SetCursor(0, INT_XHEIGHT);\n    window->DrawTo(INT_MAX_X, INT_XHEIGHT);\n    window->SetCursor(0, INT_CAPHEIGHT);\n    window->DrawTo(INT_MAX_X, INT_CAPHEIGHT);\n  } else {\n    window->Rectangle(INT_XCENTER - INT_XRADIUS, INT_YCENTER - INT_YRADIUS,\n                      INT_XCENTER + INT_XRADIUS, INT_YCENTER + INT_YRADIUS);\n  }\n}\n#endif\n\n/**\n * This routine writes Templates to File.  The format\n * is an efficient binary format.  
File must already be open\n * for writing.\n * @param File open file to write templates to\n * @param Templates templates to save into File\n * @param target_unicharset the UNICHARSET to use\n */\nvoid Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates,\n                                 const UNICHARSET &target_unicharset) {\n  INT_CLASS_STRUCT *Class;\n  uint32_t unicharset_size = target_unicharset.size();\n  int version_id = -5; // When negated by the reader -1 becomes +1 etc.\n\n  if (Templates->NumClasses != unicharset_size) {\n    tprintf(\n        \"Warning: executing WriteIntTemplates() with %d classes in\"\n        \" Templates, while target_unicharset size is %\" PRIu32 \"\\n\",\n        Templates->NumClasses, unicharset_size);\n  }\n\n  /* first write the high level template struct */\n  fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);\n  fwrite(&version_id, sizeof(version_id), 1, File);\n  fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners), 1, File);\n  fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);\n\n  /* then write out the class pruners */\n  for (unsigned i = 0; i < Templates->NumClassPruners; i++) {\n    fwrite(Templates->ClassPruners[i], sizeof(CLASS_PRUNER_STRUCT), 1, File);\n  }\n\n  /* then write out each class */\n  for (unsigned i = 0; i < Templates->NumClasses; i++) {\n    Class = Templates->Class[i];\n\n    /* first write out the high level struct for the class */\n    fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);\n    fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);\n    ASSERT_HOST(Class->NumConfigs == this->fontset_table_.at(Class->font_set_id).size());\n    fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);\n    for (int j = 0; j < Class->NumConfigs; ++j) {\n      fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);\n    }\n\n    /* then write out the proto lengths */\n    if (MaxNumIntProtosIn(Class) > 0) 
{\n      fwrite(&Class->ProtoLengths[0], sizeof(uint8_t), MaxNumIntProtosIn(Class), File);\n    }\n\n    /* then write out the proto sets */\n    for (int j = 0; j < Class->NumProtoSets; j++) {\n      fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);\n    }\n\n    /* then write the fonts info */\n    fwrite(&Class->font_set_id, sizeof(int), 1, File);\n  }\n\n  /* Write the fonts info tables */\n  using namespace std::placeholders; // for _1, _2\n  this->fontinfo_table_.write(File, std::bind(write_info, _1, _2));\n  this->fontinfo_table_.write(File, std::bind(write_spacing_info, _1, _2));\n  this->fontset_table_.write(File, std::bind(write_set, _1, _2));\n} /* WriteIntTemplates */\n\n/*-----------------------------------------------------------------------------\n              Private Code\n-----------------------------------------------------------------------------*/\n/**\n * This routine returns the parameter value which\n * corresponds to the beginning of the specified bucket.\n * The bucket number should have been generated using the\n * BucketFor() function with parameters Offset and NumBuckets.\n * @param Bucket    bucket whose start is to be computed\n * @param Offset    offset used to map params to buckets\n * @param NumBuckets  total number of buckets\n * @return Param value corresponding to start position of Bucket.\n * @note Globals: none\n */\nfloat BucketStart(int Bucket, float Offset, int NumBuckets) {\n  return static_cast<float>(Bucket) / NumBuckets - Offset;\n\n} /* BucketStart */\n\n/**\n * This routine returns the parameter value which\n * corresponds to the end of the specified bucket.\n * The bucket number should have been generated using the\n * BucketFor() function with parameters Offset and NumBuckets.\n * @param Bucket    bucket whose end is to be computed\n * @param Offset    offset used to map params to buckets\n * @param NumBuckets  total number of buckets\n * @return Param value corresponding to end position of Bucket.\n * 
@note Globals: none\n */\nfloat BucketEnd(int Bucket, float Offset, int NumBuckets) {\n  return static_cast<float>(Bucket + 1) / NumBuckets - Offset;\n} /* BucketEnd */\n\n/**\n * This routine fills in the section of a class pruner\n * corresponding to a single x value for a single proto of\n * a class.\n * @param FillSpec  specifies which bits to fill in pruner\n * @param Pruner    class pruner to be filled\n * @param ClassMask indicates which bits to change in each word\n * @param ClassCount  indicates what to change bits to\n * @param WordIndex indicates which word to change\n */\nvoid DoFill(FILL_SPEC *FillSpec, CLASS_PRUNER_STRUCT *Pruner, uint32_t ClassMask,\n            uint32_t ClassCount, uint32_t WordIndex) {\n  int X, Y, Angle;\n  uint32_t OldWord;\n\n  X = FillSpec->X;\n  if (X < 0) {\n    X = 0;\n  }\n  if (X >= NUM_CP_BUCKETS) {\n    X = NUM_CP_BUCKETS - 1;\n  }\n\n  if (FillSpec->YStart < 0) {\n    FillSpec->YStart = 0;\n  }\n  if (FillSpec->YEnd >= NUM_CP_BUCKETS) {\n    FillSpec->YEnd = NUM_CP_BUCKETS - 1;\n  }\n\n  for (Y = FillSpec->YStart; Y <= FillSpec->YEnd; Y++) {\n    for (Angle = FillSpec->AngleStart;; CircularIncrement(Angle, NUM_CP_BUCKETS)) {\n      OldWord = Pruner->p[X][Y][Angle][WordIndex];\n      if (ClassCount > (OldWord & ClassMask)) {\n        OldWord &= ~ClassMask;\n        OldWord |= ClassCount;\n        Pruner->p[X][Y][Angle][WordIndex] = OldWord;\n      }\n      if (Angle == FillSpec->AngleEnd) {\n        break;\n      }\n    }\n  }\n} /* DoFill */\n\n/**\n * Return true if the specified table filler is done, i.e.\n * if it has no more lines to fill.\n * @param Filler    table filler to check if done\n * @return true if no more lines to fill, false otherwise.\n * @note Globals: none\n */\nbool FillerDone(TABLE_FILLER *Filler) {\n  FILL_SWITCH *Next;\n\n  Next = &(Filler->Switch[Filler->NextSwitch]);\n\n  return Filler->X > Next->X && Next->Type == LastSwitch;\n\n} /* FillerDone */\n\n/**\n * This routine sets Bit in each bit 
vector whose\n * bucket lies within the range Center +- Spread.  The fill\n * is done for a circular dimension, i.e. bucket 0 is adjacent\n * to the last bucket.  It is assumed that Center and Spread\n * are expressed in a circular coordinate system whose range\n * is 0 to 1.\n * @param ParamTable  table of bit vectors, one per param bucket\n * @param Bit bit position in vectors to be filled\n * @param Center center of filled area\n * @param Spread spread of filled area\n * @param debug debug flag\n */\nvoid FillPPCircularBits(uint32_t ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR], int Bit,\n                        float Center, float Spread, bool debug) {\n  int i, FirstBucket, LastBucket;\n\n  if (Spread > 0.5) {\n    Spread = 0.5;\n  }\n\n  FirstBucket = static_cast<int>(std::floor((Center - Spread) * NUM_PP_BUCKETS));\n  if (FirstBucket < 0) {\n    FirstBucket += NUM_PP_BUCKETS;\n  }\n\n  LastBucket = static_cast<int>(std::floor((Center + Spread) * NUM_PP_BUCKETS));\n  if (LastBucket >= NUM_PP_BUCKETS) {\n    LastBucket -= NUM_PP_BUCKETS;\n  }\n  if (debug) {\n    tprintf(\"Circular fill from %d to %d\", FirstBucket, LastBucket);\n  }\n  for (i = FirstBucket; true; CircularIncrement(i, NUM_PP_BUCKETS)) {\n    SET_BIT(ParamTable[i], Bit);\n\n    /* exit loop after we have set the bit for the last bucket */\n    if (i == LastBucket) {\n      break;\n    }\n  }\n\n} /* FillPPCircularBits */\n\n/**\n * This routine sets Bit in each bit vector whose\n * bucket lies within the range Center +- Spread.  The fill\n * is done for a linear dimension, i.e. there is no wrap-around\n * for this dimension.  It is assumed that Center and Spread\n * are expressed in a linear coordinate system whose range\n * is approximately 0 to 1.  
Values outside this range will\n * be clipped.\n * @param ParamTable table of bit vectors, one per param bucket\n * @param Bit bit number being filled\n * @param Center center of filled area\n * @param Spread spread of filled area\n * @param debug debug flag\n */\nvoid FillPPLinearBits(uint32_t ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR], int Bit,\n                      float Center, float Spread, bool debug) {\n  int i, FirstBucket, LastBucket;\n\n  FirstBucket = static_cast<int>(std::floor((Center - Spread) * NUM_PP_BUCKETS));\n  if (FirstBucket < 0) {\n    FirstBucket = 0;\n  }\n\n  LastBucket = static_cast<int>(std::floor((Center + Spread) * NUM_PP_BUCKETS));\n  if (LastBucket >= NUM_PP_BUCKETS) {\n    LastBucket = NUM_PP_BUCKETS - 1;\n  }\n\n  if (debug) {\n    tprintf(\"Linear fill from %d to %d\", FirstBucket, LastBucket);\n  }\n  for (i = FirstBucket; i <= LastBucket; i++) {\n    SET_BIT(ParamTable[i], Bit);\n  }\n\n} /* FillPPLinearBits */\n\n/*---------------------------------------------------------------------------*/\n#ifndef GRAPHICS_DISABLED\n/**\n * This routine prompts the user with Prompt and waits\n * for the user to enter something in the debug window.\n * @param Prompt prompt to print while waiting for input from window\n * @param adaptive_on\n * @param pretrained_on\n * @param shape_id\n * @return Character entered in the debug window.\n * @note Globals: none\n */\nCLASS_ID Classify::GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on,\n                                   int *shape_id) {\n  tprintf(\"%s\\n\", Prompt);\n  SVEventType ev_type;\n  int unichar_id = INVALID_UNICHAR_ID;\n  // Wait until a click or popup event.\n  do {\n    auto ev = IntMatchWindow->AwaitEvent(SVET_ANY);\n    ev_type = ev->type;\n    if (ev_type == SVET_POPUP) {\n      if (ev->command_id == IDA_SHAPE_INDEX) {\n        if (shape_table_ != nullptr) {\n          *shape_id = atoi(ev->parameter);\n          *adaptive_on = false;\n          
*pretrained_on = true;\n          if (*shape_id >= 0 && static_cast<unsigned>(*shape_id) < shape_table_->NumShapes()) {\n            int font_id;\n            shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id, &font_id);\n            tprintf(\"Shape %d, first unichar=%d, font=%d\\n\", *shape_id, unichar_id, font_id);\n            return unichar_id;\n          }\n          tprintf(\"Shape index '%s' not found in shape table\\n\", ev->parameter);\n        } else {\n          tprintf(\"No shape table loaded!\\n\");\n        }\n      } else {\n        if (unicharset.contains_unichar(ev->parameter)) {\n          unichar_id = unicharset.unichar_to_id(ev->parameter);\n          if (ev->command_id == IDA_ADAPTIVE) {\n            *adaptive_on = true;\n            *pretrained_on = false;\n            *shape_id = -1;\n          } else if (ev->command_id == IDA_STATIC) {\n            *adaptive_on = false;\n            *pretrained_on = true;\n          } else {\n            *adaptive_on = true;\n            *pretrained_on = true;\n          }\n          if (ev->command_id == IDA_ADAPTIVE || shape_table_ == nullptr) {\n            *shape_id = -1;\n            return unichar_id;\n          }\n          for (unsigned s = 0; s < shape_table_->NumShapes(); ++s) {\n            if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {\n              tprintf(\"%s\\n\", shape_table_->DebugStr(s).c_str());\n            }\n          }\n        } else {\n          tprintf(\"Char class '%s' not found in unicharset\", ev->parameter);\n        }\n      }\n    }\n  } while (ev_type != SVET_CLICK);\n  return 0;\n} /* GetClassToDebug */\n\n#endif\n\n/**\n * This routine copies the appropriate global pad variables\n * into EndPad, SidePad, and AnglePad.  This is a kludge used\n * to get around the fact that global control variables cannot\n * be arrays.  
If the specified level is illegal, the tightest\n * possible pads are returned.\n * @param Level   \"tightness\" level to return pads for\n * @param EndPad    place to put end pad for Level\n * @param SidePad   place to put side pad for Level\n * @param AnglePad  place to put angle pad for Level\n */\nvoid GetCPPadsForLevel(int Level, float *EndPad, float *SidePad, float *AnglePad) {\n  switch (Level) {\n    case 0:\n      *EndPad = classify_cp_end_pad_loose * GetPicoFeatureLength();\n      *SidePad = classify_cp_side_pad_loose * GetPicoFeatureLength();\n      *AnglePad = classify_cp_angle_pad_loose / 360.0;\n      break;\n\n    case 1:\n      *EndPad = classify_cp_end_pad_medium * GetPicoFeatureLength();\n      *SidePad = classify_cp_side_pad_medium * GetPicoFeatureLength();\n      *AnglePad = classify_cp_angle_pad_medium / 360.0;\n      break;\n\n    case 2:\n      *EndPad = classify_cp_end_pad_tight * GetPicoFeatureLength();\n      *SidePad = classify_cp_side_pad_tight * GetPicoFeatureLength();\n      *AnglePad = classify_cp_angle_pad_tight / 360.0;\n      break;\n\n    default:\n      *EndPad = classify_cp_end_pad_tight * GetPicoFeatureLength();\n      *SidePad = classify_cp_side_pad_tight * GetPicoFeatureLength();\n      *AnglePad = classify_cp_angle_pad_tight / 360.0;\n      break;\n  }\n  if (*AnglePad > 0.5) {\n    *AnglePad = 0.5;\n  }\n\n} /* GetCPPadsForLevel */\n\n/**\n * @param Evidence  evidence value to return color for\n * @return Color which corresponds to specified Evidence value.\n * @note Globals: none\n */\nScrollView::Color GetMatchColorFor(float Evidence) {\n  assert(Evidence >= 0.0);\n  assert(Evidence <= 1.0);\n\n  if (Evidence >= 0.90) {\n    return ScrollView::WHITE;\n  } else if (Evidence >= 0.75) {\n    return ScrollView::GREEN;\n  } else if (Evidence >= 0.50) {\n    return ScrollView::RED;\n  } else {\n    return ScrollView::BLUE;\n  }\n} /* GetMatchColorFor */\n\n/**\n * This routine returns (in Fill) the specification of\n * the next 
line to be filled from Filler.  FillerDone() should\n * always be called before GetNextFill() to ensure that we\n * do not run past the end of the fill table.\n * @param Filler    filler to get next fill spec from\n * @param Fill    place to put spec for next fill\n */\nvoid GetNextFill(TABLE_FILLER *Filler, FILL_SPEC *Fill) {\n  FILL_SWITCH *Next;\n\n  /* compute the fill assuming no switches will be encountered */\n  Fill->AngleStart = Filler->AngleStart;\n  Fill->AngleEnd = Filler->AngleEnd;\n  Fill->X = Filler->X;\n  Fill->YStart = Filler->YStart >> 8;\n  Fill->YEnd = Filler->YEnd >> 8;\n\n  /* update the fill info and the filler for ALL switches at this X value */\n  Next = &(Filler->Switch[Filler->NextSwitch]);\n  while (Filler->X >= Next->X) {\n    Fill->X = Filler->X = Next->X;\n    if (Next->Type == StartSwitch) {\n      Fill->YStart = Next->Y;\n      Filler->StartDelta = Next->Delta;\n      Filler->YStart = Next->YInit;\n    } else if (Next->Type == EndSwitch) {\n      Fill->YEnd = Next->Y;\n      Filler->EndDelta = Next->Delta;\n      Filler->YEnd = Next->YInit;\n    } else { /* Type must be LastSwitch */\n      break;\n    }\n    Filler->NextSwitch++;\n    Next = &(Filler->Switch[Filler->NextSwitch]);\n  }\n\n  /* prepare the filler for the next call to this routine */\n  Filler->X++;\n  Filler->YStart += Filler->StartDelta;\n  Filler->YEnd += Filler->EndDelta;\n\n} /* GetNextFill */\n\n/**\n * This routine computes a data structure (Filler)\n * which can be used to fill in a rectangle surrounding\n * the specified Proto. 
Results are returned in Filler.\n *\n * @param EndPad, SidePad, AnglePad padding to add to proto\n * @param Proto       proto to create a filler for\n * @param Filler        place to put table filler\n */\nvoid InitTableFiller(float EndPad, float SidePad, float AnglePad, PROTO_STRUCT *Proto, TABLE_FILLER *Filler)\n#define XS X_SHIFT\n#define YS Y_SHIFT\n#define AS ANGLE_SHIFT\n#define NB NUM_CP_BUCKETS\n{\n  float Angle;\n  float X, Y, HalfLength;\n  float Cos, Sin;\n  float XAdjust, YAdjust;\n  FPOINT Start, Switch1, Switch2, End;\n  int S1 = 0;\n  int S2 = 1;\n\n  Angle = Proto->Angle;\n  X = Proto->X;\n  Y = Proto->Y;\n  HalfLength = Proto->Length / 2.0;\n\n  Filler->AngleStart = CircBucketFor(Angle - AnglePad, AS, NB);\n  Filler->AngleEnd = CircBucketFor(Angle + AnglePad, AS, NB);\n  Filler->NextSwitch = 0;\n\n  if (fabs(Angle - 0.0) < HV_TOLERANCE || fabs(Angle - 0.5) < HV_TOLERANCE) {\n    /* horizontal proto - handle as special case */\n    Filler->X = Bucket8For(X - HalfLength - EndPad, XS, NB);\n    Filler->YStart = Bucket16For(Y - SidePad, YS, NB * 256);\n    Filler->YEnd = Bucket16For(Y + SidePad, YS, NB * 256);\n    Filler->StartDelta = 0;\n    Filler->EndDelta = 0;\n    Filler->Switch[0].Type = LastSwitch;\n    Filler->Switch[0].X = Bucket8For(X + HalfLength + EndPad, XS, NB);\n  } else if (fabs(Angle - 0.25) < HV_TOLERANCE || fabs(Angle - 0.75) < HV_TOLERANCE) {\n    /* vertical proto - handle as special case */\n    Filler->X = Bucket8For(X - SidePad, XS, NB);\n    Filler->YStart = Bucket16For(Y - HalfLength - EndPad, YS, NB * 256);\n    Filler->YEnd = Bucket16For(Y + HalfLength + EndPad, YS, NB * 256);\n    Filler->StartDelta = 0;\n    Filler->EndDelta = 0;\n    Filler->Switch[0].Type = LastSwitch;\n    Filler->Switch[0].X = Bucket8For(X + SidePad, XS, NB);\n  } else {\n    /* diagonal proto */\n\n    if ((Angle > 0.0 && Angle < 0.25) || (Angle > 0.5 && Angle < 0.75)) {\n      /* rising diagonal proto */\n      Angle *= 2.0 * M_PI;\n      Cos = 
fabs(std::cos(Angle));\n      Sin = fabs(std::sin(Angle));\n\n      /* compute the positions of the corners of the acceptance region */\n      Start.x = X - (HalfLength + EndPad) * Cos - SidePad * Sin;\n      Start.y = Y - (HalfLength + EndPad) * Sin + SidePad * Cos;\n      End.x = 2.0 * X - Start.x;\n      End.y = 2.0 * Y - Start.y;\n      Switch1.x = X - (HalfLength + EndPad) * Cos + SidePad * Sin;\n      Switch1.y = Y - (HalfLength + EndPad) * Sin - SidePad * Cos;\n      Switch2.x = 2.0 * X - Switch1.x;\n      Switch2.y = 2.0 * Y - Switch1.y;\n\n      if (Switch1.x > Switch2.x) {\n        S1 = 1;\n        S2 = 0;\n      }\n\n      /* translate into bucket positions and deltas */\n      Filler->X = Bucket8For(Start.x, XS, NB);\n      Filler->StartDelta = -static_cast<int16_t>((Cos / Sin) * 256);\n      Filler->EndDelta = static_cast<int16_t>((Sin / Cos) * 256);\n\n      XAdjust = BucketEnd(Filler->X, XS, NB) - Start.x;\n      YAdjust = XAdjust * Cos / Sin;\n      Filler->YStart = Bucket16For(Start.y - YAdjust, YS, NB * 256);\n      YAdjust = XAdjust * Sin / Cos;\n      Filler->YEnd = Bucket16For(Start.y + YAdjust, YS, NB * 256);\n\n      Filler->Switch[S1].Type = StartSwitch;\n      Filler->Switch[S1].X = Bucket8For(Switch1.x, XS, NB);\n      Filler->Switch[S1].Y = Bucket8For(Switch1.y, YS, NB);\n      XAdjust = Switch1.x - BucketStart(Filler->Switch[S1].X, XS, NB);\n      YAdjust = XAdjust * Sin / Cos;\n      Filler->Switch[S1].YInit = Bucket16For(Switch1.y - YAdjust, YS, NB * 256);\n      Filler->Switch[S1].Delta = Filler->EndDelta;\n\n      Filler->Switch[S2].Type = EndSwitch;\n      Filler->Switch[S2].X = Bucket8For(Switch2.x, XS, NB);\n      Filler->Switch[S2].Y = Bucket8For(Switch2.y, YS, NB);\n      XAdjust = Switch2.x - BucketStart(Filler->Switch[S2].X, XS, NB);\n      YAdjust = XAdjust * Cos / Sin;\n      Filler->Switch[S2].YInit = Bucket16For(Switch2.y + YAdjust, YS, NB * 256);\n      Filler->Switch[S2].Delta = Filler->StartDelta;\n\n      
Filler->Switch[2].Type = LastSwitch;\n      Filler->Switch[2].X = Bucket8For(End.x, XS, NB);\n    } else {\n      /* falling diagonal proto */\n      Angle *= 2.0 * M_PI;\n      Cos = fabs(std::cos(Angle));\n      Sin = fabs(std::sin(Angle));\n\n      /* compute the positions of the corners of the acceptance region */\n      Start.x = X - (HalfLength + EndPad) * Cos - SidePad * Sin;\n      Start.y = Y + (HalfLength + EndPad) * Sin - SidePad * Cos;\n      End.x = 2.0 * X - Start.x;\n      End.y = 2.0 * Y - Start.y;\n      Switch1.x = X - (HalfLength + EndPad) * Cos + SidePad * Sin;\n      Switch1.y = Y + (HalfLength + EndPad) * Sin + SidePad * Cos;\n      Switch2.x = 2.0 * X - Switch1.x;\n      Switch2.y = 2.0 * Y - Switch1.y;\n\n      if (Switch1.x > Switch2.x) {\n        S1 = 1;\n        S2 = 0;\n      }\n\n      /* translate into bucket positions and deltas */\n      Filler->X = Bucket8For(Start.x, XS, NB);\n      Filler->StartDelta = static_cast<int16_t>(\n          ClipToRange<int>(-IntCastRounded((Sin / Cos) * 256), INT16_MIN, INT16_MAX));\n      Filler->EndDelta = static_cast<int16_t>(\n          ClipToRange<int>(IntCastRounded((Cos / Sin) * 256), INT16_MIN, INT16_MAX));\n\n      XAdjust = BucketEnd(Filler->X, XS, NB) - Start.x;\n      YAdjust = XAdjust * Sin / Cos;\n      Filler->YStart = Bucket16For(Start.y - YAdjust, YS, NB * 256);\n      YAdjust = XAdjust * Cos / Sin;\n      Filler->YEnd = Bucket16For(Start.y + YAdjust, YS, NB * 256);\n\n      Filler->Switch[S1].Type = EndSwitch;\n      Filler->Switch[S1].X = Bucket8For(Switch1.x, XS, NB);\n      Filler->Switch[S1].Y = Bucket8For(Switch1.y, YS, NB);\n      XAdjust = Switch1.x - BucketStart(Filler->Switch[S1].X, XS, NB);\n      YAdjust = XAdjust * Sin / Cos;\n      Filler->Switch[S1].YInit = Bucket16For(Switch1.y + YAdjust, YS, NB * 256);\n      Filler->Switch[S1].Delta = Filler->StartDelta;\n\n      Filler->Switch[S2].Type = StartSwitch;\n      Filler->Switch[S2].X = Bucket8For(Switch2.x, XS, NB);\n      
Filler->Switch[S2].Y = Bucket8For(Switch2.y, YS, NB);\n      XAdjust = Switch2.x - BucketStart(Filler->Switch[S2].X, XS, NB);\n      YAdjust = XAdjust * Cos / Sin;\n      Filler->Switch[S2].YInit = Bucket16For(Switch2.y - YAdjust, YS, NB * 256);\n      Filler->Switch[S2].Delta = Filler->EndDelta;\n\n      Filler->Switch[2].Type = LastSwitch;\n      Filler->Switch[2].X = Bucket8For(End.x, XS, NB);\n    }\n  }\n} /* InitTableFiller */\n\n/*---------------------------------------------------------------------------*/\n#ifndef GRAPHICS_DISABLED\n/**\n * This routine renders the specified feature into ShapeList.\n * @param window to add feature rendering to\n * @param Feature feature to be rendered\n * @param color color to use for feature rendering\n * @return New shape list with rendering of Feature added.\n * @note Globals: none\n */\nvoid RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature,\n                      ScrollView::Color color) {\n  float X, Y, Dx, Dy, Length;\n\n  window->Pen(color);\n  assert(Feature != nullptr);\n  assert(color != 0);\n\n  X = Feature->X;\n  Y = Feature->Y;\n  Length = GetPicoFeatureLength() * 0.7 * INT_CHAR_NORM_RANGE;\n  // The -PI has no significant effect here, but the value of Theta is computed\n  // using BinaryAnglePlusPi in intfx.cpp.\n  Dx = (Length / 2.0) * cos((Feature->Theta / 256.0) * 2.0 * M_PI - M_PI);\n  Dy = (Length / 2.0) * sin((Feature->Theta / 256.0) * 2.0 * M_PI - M_PI);\n\n  window->SetCursor(X, Y);\n  window->DrawTo(X + Dx, Y + Dy);\n} /* RenderIntFeature */\n\n/**\n * This routine extracts the parameters of the specified\n * proto from the class description and adds a rendering of\n * the proto onto the ShapeList.\n *\n * @param window ScrollView instance\n * @param Class class that proto is contained in\n * @param ProtoId id of proto to be rendered\n * @param color color to render proto in\n *\n * Globals: none\n *\n * @return New shape list with a rendering of one proto added.\n */\nvoid 
RenderIntProto(ScrollView *window, INT_CLASS_STRUCT *Class, PROTO_ID ProtoId,\n                    ScrollView::Color color) {\n  INT_PROTO_STRUCT *Proto;\n  int ProtoSetIndex;\n  int ProtoWordIndex;\n  float Length;\n  int Xmin, Xmax, Ymin, Ymax;\n  float X, Y, Dx, Dy;\n  uint32_t ProtoMask;\n  int Bucket;\n\n  assert(ProtoId >= 0);\n  assert(Class != nullptr);\n  assert(ProtoId < Class->NumProtos);\n  assert(color != 0);\n  window->Pen(color);\n\n  auto ProtoSet = Class->ProtoSets[SetForProto(ProtoId)];\n  ProtoSetIndex = IndexForProto(ProtoId);\n  Proto = &(ProtoSet->Protos[ProtoSetIndex]);\n  Length = (Class->ProtoLengths[ProtoId] * GetPicoFeatureLength() * INT_CHAR_NORM_RANGE);\n  ProtoMask = PPrunerMaskFor(ProtoId);\n  ProtoWordIndex = PPrunerWordIndexFor(ProtoId);\n\n  // find the x and y extent of the proto from the proto pruning table\n  Xmin = Ymin = NUM_PP_BUCKETS;\n  Xmax = Ymax = 0;\n  for (Bucket = 0; Bucket < NUM_PP_BUCKETS; Bucket++) {\n    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex]) {\n      UpdateRange(Bucket, &Xmin, &Xmax);\n    }\n\n    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex]) {\n      UpdateRange(Bucket, &Ymin, &Ymax);\n    }\n  }\n  X = (Xmin + Xmax + 1) / 2.0 * PROTO_PRUNER_SCALE;\n  Y = (Ymin + Ymax + 1) / 2.0 * PROTO_PRUNER_SCALE;\n  // The -PI has no significant effect here, but the value of Theta is computed\n  // using BinaryAnglePlusPi in intfx.cpp.\n  Dx = (Length / 2.0) * cos((Proto->Angle / 256.0) * 2.0 * M_PI - M_PI);\n  Dy = (Length / 2.0) * sin((Proto->Angle / 256.0) * 2.0 * M_PI - M_PI);\n\n  window->SetCursor(X - Dx, Y - Dy);\n  window->DrawTo(X + Dx, Y + Dy);\n} /* RenderIntProto */\n#endif\n\n#ifndef GRAPHICS_DISABLED\n/**\n * Initializes the int matcher window if it is not already\n * initialized.\n */\nvoid InitIntMatchWindowIfReqd() {\n  if (IntMatchWindow == nullptr) {\n    IntMatchWindow = CreateFeatureSpaceWindow(\"IntMatchWindow\", 50, 200);\n    auto 
*popup_menu = new SVMenuNode();\n\n    popup_menu->AddChild(\"Debug Adapted classes\", IDA_ADAPTIVE, \"x\", \"Class to debug\");\n    popup_menu->AddChild(\"Debug Static classes\", IDA_STATIC, \"x\", \"Class to debug\");\n    popup_menu->AddChild(\"Debug Both\", IDA_BOTH, \"x\", \"Class to debug\");\n    popup_menu->AddChild(\"Debug Shape Index\", IDA_SHAPE_INDEX, \"0\", \"Index to debug\");\n    popup_menu->BuildMenu(IntMatchWindow, false);\n  }\n}\n\n/**\n * Initializes the proto display window if it is not already\n * initialized.\n */\nvoid InitProtoDisplayWindowIfReqd() {\n  if (ProtoDisplayWindow == nullptr) {\n    ProtoDisplayWindow = CreateFeatureSpaceWindow(\"ProtoDisplayWindow\", 550, 200);\n  }\n}\n\n/**\n * Initializes the feature display window if it is not already\n * initialized.\n */\nvoid InitFeatureDisplayWindowIfReqd() {\n  if (FeatureDisplayWindow == nullptr) {\n    FeatureDisplayWindow = CreateFeatureSpaceWindow(\"FeatureDisplayWindow\", 50, 700);\n  }\n}\n\n/// Creates a window of the appropriate size for displaying elements\n/// in feature space.\nScrollView *CreateFeatureSpaceWindow(const char *name, int xpos, int ypos) {\n  return new ScrollView(name, xpos, ypos, 520, 520, 260, 260, true);\n}\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/intproto.h",
    "content": "/******************************************************************************\n ** Filename:    intproto.h\n ** Purpose:     Definition of data structures for integer protos.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#ifndef INTPROTO_H\n#define INTPROTO_H\n\n/**----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------**/\n#include \"matchdefs.h\"\n#include \"mfoutline.h\"\n#include \"protos.h\"\n#include \"scrollview.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\nclass FCOORD;\n\n/* define order of params in pruners */\n#define PRUNER_X 0\n#define PRUNER_Y 1\n#define PRUNER_ANGLE 2\n\n/* definition of coordinate system offsets for each table parameter */\n#define ANGLE_SHIFT (0.0)\n#define X_SHIFT (0.5)\n#define Y_SHIFT (0.5)\n\n#define MAX_PROTO_INDEX 24\n#define BITS_PER_WERD static_cast<int>(8 * sizeof(uint32_t))\n/* Script detection: increase this number to 128 */\n#define MAX_NUM_CONFIGS 64\n#define MAX_NUM_PROTOS 512\n#define PROTOS_PER_PROTO_SET 64\n#define MAX_NUM_PROTO_SETS (MAX_NUM_PROTOS / PROTOS_PER_PROTO_SET)\n#define NUM_PP_PARAMS 3\n#define NUM_PP_BUCKETS 64\n#define NUM_CP_BUCKETS 24\n#define 
CLASSES_PER_CP 32\n#define NUM_BITS_PER_CLASS 2\n#define CLASS_PRUNER_CLASS_MASK (~(~0u << NUM_BITS_PER_CLASS))\n#define CLASSES_PER_CP_WERD (CLASSES_PER_CP / NUM_BITS_PER_CLASS)\n#define PROTOS_PER_PP_WERD BITS_PER_WERD\n#define BITS_PER_CP_VECTOR (CLASSES_PER_CP * NUM_BITS_PER_CLASS)\n#define MAX_NUM_CLASS_PRUNERS ((MAX_NUM_CLASSES + CLASSES_PER_CP - 1) / CLASSES_PER_CP)\n#define WERDS_PER_CP_VECTOR (BITS_PER_CP_VECTOR / BITS_PER_WERD)\n#define WERDS_PER_PP_VECTOR ((PROTOS_PER_PROTO_SET + BITS_PER_WERD - 1) / BITS_PER_WERD)\n#define WERDS_PER_PP (NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR)\n#define WERDS_PER_CP (NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR)\n#define WERDS_PER_CONFIG_VEC ((MAX_NUM_CONFIGS + BITS_PER_WERD - 1) / BITS_PER_WERD)\n\n/* The first 3 dimensions of the CLASS_PRUNER_STRUCT are the\n * 3 axes of the quantized feature space.\n * The position of the bits recorded for each class in the\n * 4th dimension is determined by using CPrunerWordIndexFor(c),\n * where c is the corresponding class id. 
*/\nstruct CLASS_PRUNER_STRUCT {\n  uint32_t p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR];\n};\n\nstruct INT_PROTO_STRUCT {\n  int8_t A;\n  uint8_t B;\n  int8_t C;\n  uint8_t Angle;\n  uint32_t Configs[WERDS_PER_CONFIG_VEC];\n};\n\ntypedef uint32_t PROTO_PRUNER[NUM_PP_PARAMS][NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR];\n\nstruct PROTO_SET_STRUCT {\n  PROTO_PRUNER ProtoPruner;\n  INT_PROTO_STRUCT Protos[PROTOS_PER_PROTO_SET];\n};\n\ntypedef uint32_t CONFIG_PRUNER[NUM_PP_PARAMS][NUM_PP_BUCKETS][4];\n\nstruct INT_CLASS_STRUCT {\n  INT_CLASS_STRUCT() = default;\n  INT_CLASS_STRUCT(int MaxNumProtos, int MaxNumConfigs);\n  ~INT_CLASS_STRUCT();\n  uint16_t NumProtos = 0;\n  uint8_t NumProtoSets = 0;\n  uint8_t NumConfigs = 0;\n  PROTO_SET_STRUCT *ProtoSets[MAX_NUM_PROTO_SETS];\n  std::vector<uint8_t> ProtoLengths;\n  uint16_t ConfigLengths[MAX_NUM_CONFIGS];\n  int font_set_id = 0; // FontSet id, see above\n};\n\nstruct TESS_API INT_TEMPLATES_STRUCT {\n  INT_TEMPLATES_STRUCT();\n  ~INT_TEMPLATES_STRUCT();\n  unsigned NumClasses;\n  unsigned NumClassPruners;\n  INT_CLASS_STRUCT *Class[MAX_NUM_CLASSES];\n  CLASS_PRUNER_STRUCT *ClassPruners[MAX_NUM_CLASS_PRUNERS];\n};\n\n/* definitions of integer features*/\n#define MAX_NUM_INT_FEATURES 512\n#define INT_CHAR_NORM_RANGE 256\n\nstruct INT_FEATURE_STRUCT {\n  INT_FEATURE_STRUCT() : X(0), Y(0), Theta(0), CP_misses(0) {}\n  // Builds a feature from an FCOORD for position with all the necessary\n  // clipping and rounding.\n  INT_FEATURE_STRUCT(const FCOORD &pos, uint8_t theta);\n  // Builds a feature from ints with all the necessary clipping and casting.\n  INT_FEATURE_STRUCT(int x, int y, int theta);\n\n  uint8_t X;\n  uint8_t Y;\n  uint8_t Theta;\n  int8_t CP_misses;\n\n  void print() const {\n    tprintf(\"(%d,%d):%d\\n\", X, Y, Theta);\n  }\n};\n\ntypedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES];\n\nenum IntmatcherDebugAction { IDA_ADAPTIVE, IDA_STATIC, IDA_SHAPE_INDEX, IDA_BOTH 
};\n\n/**----------------------------------------------------------------------------\n            Macros\n----------------------------------------------------------------------------**/\n\n#define MaxNumIntProtosIn(C) (C->NumProtoSets * PROTOS_PER_PROTO_SET)\n#define SetForProto(P) (P / PROTOS_PER_PROTO_SET)\n#define IndexForProto(P) (P % PROTOS_PER_PROTO_SET)\n#define ProtoForProtoId(C, P) (&((C->ProtoSets[SetForProto(P)])->Protos[IndexForProto(P)]))\n#define PPrunerWordIndexFor(I) (((I) % PROTOS_PER_PROTO_SET) / PROTOS_PER_PP_WERD)\n#define PPrunerBitIndexFor(I) ((I) % PROTOS_PER_PP_WERD)\n#define PPrunerMaskFor(I) (1 << PPrunerBitIndexFor(I))\n\n#define MaxNumClassesIn(T) (T->NumClassPruners * CLASSES_PER_CP)\n#define LegalClassId(c) ((c) >= 0 && (c) < MAX_NUM_CLASSES)\n#define UnusedClassIdIn(T, c) ((T)->Class[c] == nullptr)\n#define ClassForClassId(T, c) ((T)->Class[c])\n#define ClassPrunersFor(T) ((T)->ClassPruner)\n#define CPrunerIdFor(c) ((c) / CLASSES_PER_CP)\n#define CPrunerFor(T, c) ((T)->ClassPruners[CPrunerIdFor(c)])\n#define CPrunerWordIndexFor(c) (((c) % CLASSES_PER_CP) / CLASSES_PER_CP_WERD)\n#define CPrunerBitIndexFor(c) (((c) % CLASSES_PER_CP) % CLASSES_PER_CP_WERD)\n#define CPrunerMaskFor(L, c) (((L) + 1) << CPrunerBitIndexFor(c) * NUM_BITS_PER_CLASS)\n\n/* DEBUG macros*/\n#define PRINT_MATCH_SUMMARY 0x001\n#define DISPLAY_FEATURE_MATCHES 0x002\n#define DISPLAY_PROTO_MATCHES 0x004\n#define PRINT_FEATURE_MATCHES 0x008\n#define PRINT_PROTO_MATCHES 0x010\n#define CLIP_MATCH_EVIDENCE 0x020\n\n#define MatchDebuggingOn(D) (D)\n#define PrintMatchSummaryOn(D) ((D)&PRINT_MATCH_SUMMARY)\n#define DisplayFeatureMatchesOn(D) ((D)&DISPLAY_FEATURE_MATCHES)\n#define DisplayProtoMatchesOn(D) ((D)&DISPLAY_PROTO_MATCHES)\n#define PrintFeatureMatchesOn(D) ((D)&PRINT_FEATURE_MATCHES)\n#define PrintProtoMatchesOn(D) ((D)&PRINT_PROTO_MATCHES)\n#define ClipMatchEvidenceOn(D) 
((D)&CLIP_MATCH_EVIDENCE)\n\n/**----------------------------------------------------------------------------\n          Public Function Prototypes\n----------------------------------------------------------------------------**/\nvoid AddIntClass(INT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, INT_CLASS_STRUCT *Class);\n\nint AddIntConfig(INT_CLASS_STRUCT *Class);\n\nint AddIntProto(INT_CLASS_STRUCT *Class);\n\nvoid AddProtoToClassPruner(PROTO_STRUCT *Proto, CLASS_ID ClassId, INT_TEMPLATES_STRUCT *Templates);\n\nvoid AddProtoToProtoPruner(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class, bool debug);\n\nuint8_t Bucket8For(float param, float offset, int num_buckets);\nuint16_t Bucket16For(float param, float offset, int num_buckets);\n\nuint8_t CircBucketFor(float param, float offset, int num_buckets);\n\nvoid UpdateMatchDisplay();\n\nvoid ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS_STRUCT *Class);\n\nvoid DisplayIntFeature(const INT_FEATURE_STRUCT *Feature, float Evidence);\n\nvoid DisplayIntProto(INT_CLASS_STRUCT *Class, PROTO_ID ProtoId, float Evidence);\n\nvoid ShowMatchDisplay();\n\n#ifndef GRAPHICS_DISABLED\n// Clears the given window and draws the featurespace guides for the\n// appropriate normalization method.\nTESS_API\nvoid ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window);\n#endif // !GRAPHICS_DISABLED\n\n/*----------------------------------------------------------------------------*/\n#ifndef GRAPHICS_DISABLED\nTESS_API\nvoid RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature,\n                      ScrollView::Color color);\n\nvoid InitIntMatchWindowIfReqd();\n\nvoid InitProtoDisplayWindowIfReqd();\n\nvoid InitFeatureDisplayWindowIfReqd();\n\n// Creates a window of the appropriate size for displaying elements\n// in feature space.\nTESS_API\nScrollView *CreateFeatureSpaceWindow(const char *name, int xpos, int ypos);\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/kdtree.cpp",
    "content": "/******************************************************************************\n **  Filename:  kdtree.cpp\n **  Purpose:   Routines for managing K-D search trees\n **  Author:    Dan Johnson\n **\n **  (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n/*-----------------------------------------------------------------------------\n          Include Files and Type Defines\n-----------------------------------------------------------------------------*/\n#include \"kdtree.h\"\n\n#include <algorithm>\n#include <cfloat> // for FLT_MAX\n#include <cmath>\n#include <cstdio>\n\nnamespace tesseract {\n\n#define Magnitude(X) ((X) < 0 ? 
-(X) : (X))\n#define NodeFound(N, K, D) (((N)->Key == (K)) && ((N)->Data == (D)))\n\n/*-----------------------------------------------------------------------------\n        Global Data Definitions and Declarations\n-----------------------------------------------------------------------------*/\n#define MINSEARCH (-FLT_MAX)\n#define MAXSEARCH FLT_MAX\n\n// Helper function to find the next essential dimension in a cycle.\nstatic int NextLevel(KDTREE *tree, int level) {\n  do {\n    ++level;\n    if (level >= tree->KeySize) {\n      level = 0;\n    }\n  } while (tree->KeyDesc[level].NonEssential);\n  return level;\n}\n\n//-----------------------------------------------------------------------------\n/**  Store the k smallest-keyed key-value pairs. */\ntemplate <typename Key, typename Value>\nclass MinK {\npublic:\n  MinK(Key max_key, int k);\n  ~MinK();\n\n  struct Element {\n    Element() = default;\n    Element(const Key &k, const Value &v) : key(k), value(v) {}\n\n    Key key;\n    Value value;\n  };\n\n  bool insert(Key k, Value v);\n  const Key &max_insertable_key();\n\n  int elements_count() {\n    return elements_count_;\n  }\n  const Element *elements() {\n    return elements_;\n  }\n\nprivate:\n  const Key max_key_;  ///< the maximum possible Key\n  Element *elements_;  ///< unsorted array of elements\n  int elements_count_; ///< the number of results collected so far\n  int k_;              ///< the number of results we want from the search\n  int max_index_;      ///< the index of the result with the largest key\n};\n\ntemplate <typename Key, typename Value>\nMinK<Key, Value>::MinK(Key max_key, int k)\n    : max_key_(max_key), elements_count_(0), k_(k < 1 ? 
1 : k), max_index_(0) {\n  elements_ = new Element[k_];\n}\n\ntemplate <typename Key, typename Value>\nMinK<Key, Value>::~MinK() {\n  delete[] elements_;\n}\n\ntemplate <typename Key, typename Value>\nconst Key &MinK<Key, Value>::max_insertable_key() {\n  if (elements_count_ < k_) {\n    return max_key_;\n  }\n  return elements_[max_index_].key;\n}\n\ntemplate <typename Key, typename Value>\nbool MinK<Key, Value>::insert(Key key, Value value) {\n  if (elements_count_ < k_) {\n    elements_[elements_count_++] = Element(key, value);\n    if (key > elements_[max_index_].key) {\n      max_index_ = elements_count_ - 1;\n    }\n    return true;\n  } else if (key < elements_[max_index_].key) {\n    // evict the largest element.\n    elements_[max_index_] = Element(key, value);\n    // recompute max_index_\n    for (int i = 0; i < elements_count_; i++) {\n      if (elements_[i].key > elements_[max_index_].key) {\n        max_index_ = i;\n      }\n    }\n    return true;\n  }\n  return false;\n}\n\n//-----------------------------------------------------------------------------\n/** Helper class for searching for the k closest points to query_point in tree.\n */\nclass KDTreeSearch {\npublic:\n  KDTreeSearch(KDTREE *tree, float *query_point, int k_closest);\n  ~KDTreeSearch();\n\n  /** Return the k nearest points' data. 
*/\n  void Search(int *result_count, float *distances, void **results);\n\nprivate:\n  void SearchRec(int Level, KDNODE *SubTree);\n  bool BoxIntersectsSearch(float *lower, float *upper);\n\n  KDTREE *tree_;\n  float *query_point_;\n  float *sb_min_; ///< search box minimum\n  float *sb_max_; ///< search box maximum\n  MinK<float, void *> results_;\n};\n\nKDTreeSearch::KDTreeSearch(KDTREE *tree, float *query_point, int k_closest)\n    : tree_(tree), query_point_(query_point), results_(MAXSEARCH, k_closest) {\n  sb_min_ = new float[tree->KeySize];\n  sb_max_ = new float[tree->KeySize];\n}\n\nKDTreeSearch::~KDTreeSearch() {\n  delete[] sb_min_;\n  delete[] sb_max_;\n}\n\n/// Locate the k_closest points to query_point_, and return their distances and\n/// data into the given buffers.\nvoid KDTreeSearch::Search(int *result_count, float *distances, void **results) {\n  if (tree_->Root.Left == nullptr) {\n    *result_count = 0;\n  } else {\n    for (int i = 0; i < tree_->KeySize; i++) {\n      sb_min_[i] = tree_->KeyDesc[i].Min;\n      sb_max_[i] = tree_->KeyDesc[i].Max;\n    }\n    SearchRec(0, tree_->Root.Left);\n    int count = results_.elements_count();\n    *result_count = count;\n    for (int j = 0; j < count; j++) {\n      // Pre-cast to float64 as key is a template type and we have no control\n      // over its actual type.\n      distances[j] = static_cast<float>(sqrt(static_cast<double>(results_.elements()[j].key)));\n      results[j] = results_.elements()[j].value;\n    }\n  }\n}\n\n/*-----------------------------------------------------------------------------\n              Public Code\n-----------------------------------------------------------------------------*/\n/// @return a new KDTREE based on the specified parameters.\n/// @param KeySize  # of dimensions in the K-D tree\n/// @param KeyDesc  array of params to describe key dimensions\nKDTREE *MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[]) {\n  auto *KDTree = new KDTREE(KeySize);\n  for (int i 
= 0; i < KeySize; i++) {\n    KDTree->KeyDesc[i].NonEssential = KeyDesc[i].NonEssential;\n    KDTree->KeyDesc[i].Circular = KeyDesc[i].Circular;\n    if (KeyDesc[i].Circular) {\n      KDTree->KeyDesc[i].Min = KeyDesc[i].Min;\n      KDTree->KeyDesc[i].Max = KeyDesc[i].Max;\n      KDTree->KeyDesc[i].Range = KeyDesc[i].Max - KeyDesc[i].Min;\n      KDTree->KeyDesc[i].HalfRange = KDTree->KeyDesc[i].Range / 2;\n      KDTree->KeyDesc[i].MidRange = (KeyDesc[i].Max + KeyDesc[i].Min) / 2;\n    } else {\n      KDTree->KeyDesc[i].Min = MINSEARCH;\n      KDTree->KeyDesc[i].Max = MAXSEARCH;\n    }\n  }\n  KDTree->Root.Left = nullptr;\n  KDTree->Root.Right = nullptr;\n  return KDTree;\n}\n\n/**\n * This routine stores Data in the K-D tree specified by Tree\n * using Key as an access key.\n *\n * @param Tree    K-D tree in which data is to be stored\n * @param Key    ptr to key by which data can be retrieved\n * @param Data    ptr to data to be stored in the tree\n */\nvoid KDStore(KDTREE *Tree, float *Key, CLUSTER *Data) {\n  auto PtrToNode = &(Tree->Root.Left);\n  auto Node = *PtrToNode;\n  auto Level = NextLevel(Tree, -1);\n  while (Node != nullptr) {\n    if (Key[Level] < Node->BranchPoint) {\n      PtrToNode = &(Node->Left);\n      if (Key[Level] > Node->LeftBranch) {\n        Node->LeftBranch = Key[Level];\n      }\n    } else {\n      PtrToNode = &(Node->Right);\n      if (Key[Level] < Node->RightBranch) {\n        Node->RightBranch = Key[Level];\n      }\n    }\n    Level = NextLevel(Tree, Level);\n    Node = *PtrToNode;\n  }\n\n  *PtrToNode = new KDNODE(Tree, Key, Data, Level);\n} /* KDStore */\n\n/**\n * This routine deletes a node from Tree.  The node to be\n * deleted is specified by the Key for the node and the Data\n * contents of the node.  These two pointers must be identical\n * to the pointers that were used for the node when it was\n * originally stored in the tree.  
A node will be deleted from\n * the tree only if its key and data pointers are identical\n * to Key and Data respectively.  The tree is re-formed by removing\n * the affected subtree and inserting all elements but the root.\n *\n * @param Tree K-D tree to delete node from\n * @param Key key of node to be deleted\n * @param Data data contents of node to be deleted\n */\nvoid KDDelete(KDTREE *Tree, float Key[], void *Data) {\n  int Level;\n  KDNODE *Current;\n  KDNODE *Father;\n\n  /* initialize search at root of tree */\n  Father = &(Tree->Root);\n  Current = Father->Left;\n  Level = NextLevel(Tree, -1);\n\n  /* search tree for node to be deleted */\n  while ((Current != nullptr) && (!NodeFound(Current, Key, Data))) {\n    Father = Current;\n    if (Key[Level] < Current->BranchPoint) {\n      Current = Current->Left;\n    } else {\n      Current = Current->Right;\n    }\n\n    Level = NextLevel(Tree, Level);\n  }\n\n  if (Current != nullptr) { /* if node to be deleted was found */\n    if (Current == Father->Left) {\n      Father->Left = nullptr;\n      Father->LeftBranch = Tree->KeyDesc[Level].Min;\n    } else {\n      Father->Right = nullptr;\n      Father->RightBranch = Tree->KeyDesc[Level].Max;\n    }\n\n    InsertNodes(Tree, Current->Left);\n    InsertNodes(Tree, Current->Right);\n    delete Current;\n  }\n} /* KDDelete */\n\n/**\n * This routine searches the K-D tree specified by Tree and\n * finds the QuerySize nearest neighbors of Query.  All neighbors\n * must be within MaxDistance of Query.  
The data contents of\n * the nearest neighbors\n * are placed in NBuffer and their distances from Query are\n * placed in DBuffer.\n * @param Tree    ptr to K-D tree to be searched\n * @param Query    ptr to query key (point in D-space)\n * @param QuerySize  number of nearest neighbors to be found\n * @param MaxDistance  all neighbors must be within this distance\n * @param NBuffer ptr to QuerySize buffer to hold nearest neighbors\n * @param DBuffer ptr to QuerySize buffer to hold distances\n *          from nearest neighbor to query point\n * @param NumberOfResults [out] Number of nearest neighbors actually found\n */\nvoid KDNearestNeighborSearch(KDTREE *Tree, float Query[], int QuerySize, float MaxDistance,\n                             int *NumberOfResults, void **NBuffer, float DBuffer[]) {\n  KDTreeSearch search(Tree, Query, QuerySize);\n  search.Search(NumberOfResults, DBuffer, NBuffer);\n}\n\n/*---------------------------------------------------------------------------*/\n/** Walk a given Tree with action. 
*/\nvoid KDWalk(KDTREE *Tree, kdwalk_proc action, ClusteringContext *context) {\n  if (Tree->Root.Left != nullptr) {\n    Walk(Tree, action, context, Tree->Root.Left, NextLevel(Tree, -1));\n  }\n}\n\n/*-----------------------------------------------------------------------------\n              Private Code\n-----------------------------------------------------------------------------*/\n\n/*---------------------------------------------------------------------------*/\n/**\n * Recursively accumulate the k_closest points to query_point_ into results_.\n * @param Level  level in tree of sub-tree to be searched\n * @param SubTree  sub-tree to be searched\n */\nvoid KDTreeSearch::SearchRec(int level, KDNODE *sub_tree) {\n  if (level >= tree_->KeySize) {\n    level = 0;\n  }\n\n  if (!BoxIntersectsSearch(sb_min_, sb_max_)) {\n    return;\n  }\n\n  results_.insert(DistanceSquared(tree_->KeySize, &tree_->KeyDesc[0], query_point_, sub_tree->Key),\n                  sub_tree->Data);\n\n  if (query_point_[level] < sub_tree->BranchPoint) {\n    if (sub_tree->Left != nullptr) {\n      float tmp = sb_max_[level];\n      sb_max_[level] = sub_tree->LeftBranch;\n      SearchRec(NextLevel(tree_, level), sub_tree->Left);\n      sb_max_[level] = tmp;\n    }\n    if (sub_tree->Right != nullptr) {\n      float tmp = sb_min_[level];\n      sb_min_[level] = sub_tree->RightBranch;\n      SearchRec(NextLevel(tree_, level), sub_tree->Right);\n      sb_min_[level] = tmp;\n    }\n  } else {\n    if (sub_tree->Right != nullptr) {\n      float tmp = sb_min_[level];\n      sb_min_[level] = sub_tree->RightBranch;\n      SearchRec(NextLevel(tree_, level), sub_tree->Right);\n      sb_min_[level] = tmp;\n    }\n    if (sub_tree->Left != nullptr) {\n      float tmp = sb_max_[level];\n      sb_max_[level] = sub_tree->LeftBranch;\n      SearchRec(NextLevel(tree_, level), sub_tree->Left);\n      sb_max_[level] = tmp;\n    }\n  
}\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n *Returns the Euclidean distance squared between p1 and p2 for all essential\n * dimensions.\n * @param k      keys are in k-space\n * @param dim    dimension descriptions (essential, circular, etc)\n * @param p1,p2  two different points in K-D space\n */\nfloat DistanceSquared(int k, PARAM_DESC *dim, float p1[], float p2[]) {\n  float total_distance = 0;\n\n  for (; k > 0; k--, p1++, p2++, dim++) {\n    if (dim->NonEssential) {\n      continue;\n    }\n\n    float dimension_distance = *p1 - *p2;\n\n    /* if this dimension is circular - check wraparound distance */\n    if (dim->Circular) {\n      dimension_distance = Magnitude(dimension_distance);\n      float wrap_distance = dim->Max - dim->Min - dimension_distance;\n      dimension_distance = std::min(dimension_distance, wrap_distance);\n    }\n\n    total_distance += dimension_distance * dimension_distance;\n  }\n  return total_distance;\n}\n\nfloat ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[]) {\n  return std::sqrt(DistanceSquared(k, dim, p1, p2));\n}\n\n/*---------------------------------------------------------------------------*/\n/// Return whether the query region (the smallest known circle about\n/// query_point_ containing results->k_ points) intersects the box specified\n/// between lower and upper.  
For circular dimensions, we also check the point\n/// one wrap distance away from the query.\nbool KDTreeSearch::BoxIntersectsSearch(float *lower, float *upper) {\n  float *query = query_point_;\n  // Compute the sum in higher precision.\n  double total_distance = 0.0;\n  double radius_squared =\n      static_cast<double>(results_.max_insertable_key()) * results_.max_insertable_key();\n  PARAM_DESC *dim = &tree_->KeyDesc[0];\n\n  for (int i = tree_->KeySize; i > 0; i--, dim++, query++, lower++, upper++) {\n    if (dim->NonEssential) {\n      continue;\n    }\n\n    float dimension_distance;\n    if (*query < *lower) {\n      dimension_distance = *lower - *query;\n    } else if (*query > *upper) {\n      dimension_distance = *query - *upper;\n    } else {\n      dimension_distance = 0;\n    }\n\n    /* if this dimension is circular - check wraparound distance */\n    if (dim->Circular) {\n      float wrap_distance = FLT_MAX;\n      if (*query < *lower) {\n        wrap_distance = *query + dim->Max - dim->Min - *upper;\n      } else if (*query > *upper) {\n        wrap_distance = *lower - (*query - (dim->Max - dim->Min));\n      }\n      dimension_distance = std::min(dimension_distance, wrap_distance);\n    }\n\n    total_distance += static_cast<double>(dimension_distance) * dimension_distance;\n    if (total_distance >= radius_squared) {\n      return false;\n    }\n  }\n  return true;\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * Walk a tree, calling action once on each node.\n *\n * Operation:\n *   This routine walks through the specified sub_tree and invokes action\n *   action at each node as follows:\n *       action(context, data, level)\n *   data  the data contents of the node being visited,\n *   level is the level of the node in the tree with the root being level 0.\n * @param tree  root of the tree being walked.\n * @param action  action to be performed at every node\n * @param context  action's context\n * 
@param sub_tree  ptr to root of subtree to be walked\n * @param level  current level in the tree for this node\n */\nvoid Walk(KDTREE *tree, kdwalk_proc action, ClusteringContext *context, KDNODE *sub_tree, int32_t level) {\n  (*action)(context, sub_tree->Data, level);\n  if (sub_tree->Left != nullptr) {\n    Walk(tree, action, context, sub_tree->Left, NextLevel(tree, level));\n  }\n  if (sub_tree->Right != nullptr) {\n    Walk(tree, action, context, sub_tree->Right, NextLevel(tree, level));\n  }\n}\n\n/** Given a subtree nodes, insert all of its elements into tree. */\nvoid InsertNodes(KDTREE *tree, KDNODE *nodes) {\n  if (nodes == nullptr) {\n    return;\n  }\n\n  KDStore(tree, nodes->Key, nodes->Data);\n  InsertNodes(tree, nodes->Left);\n  InsertNodes(tree, nodes->Right);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/kdtree.h",
    "content": "/******************************************************************************\n ** Filename:   kdtree.h\n ** Purpose:    Definition of K-D tree access routines.\n ** Author:     Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#ifndef KDTREE_H\n#define KDTREE_H\n\n#include \"ocrfeatures.h\"\n\nnamespace tesseract {\n\n/**\nNOTE:  All circular parameters of all keys must be in the range\n\nMin <= Param < Max\n\nwhere Min and Max are specified in the KeyDesc parameter passed to\nMakeKDTree.  All KD routines assume that this is true and will not operate\ncorrectly if circular parameters outside the specified range are used.\n*/\n\nstruct ClusteringContext;\nstruct CLUSTER;\nstruct KDTREE;\n\nusing kdwalk_proc = void (*)(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);\n\nstruct KDNODE {\n  /// This routine allocates memory for a new K-D tree node\n  /// and places the specified Key and Data into it.  
The\n  /// left and right subtree pointers for the node are\n  /// initialized to empty subtrees.\n  /// @param tree  The tree to create the node for\n  /// @param Key  Access key for new node in KD tree\n  /// @param Data  ptr to data to be stored in new node\n  /// @param Index  index of Key to branch on\n  KDNODE() = default;\n  KDNODE(KDTREE *tree, float key[], CLUSTER *data, int Index);\n  ~KDNODE() {\n    delete Left;\n    delete Right;\n  }\n\n  float *Key;          /**< search key */\n  CLUSTER *Data;       /**< data that corresponds to key */\n  float BranchPoint;   /**< needed to make deletes work efficiently */\n  float LeftBranch;    /**< used to optimize search pruning */\n  float RightBranch;   /**< used to optimize search pruning */\n  KDNODE *Left;        /**< ptrs for KD tree structure */\n  KDNODE *Right;\n};\n\nstruct KDTREE {\n  KDTREE(size_t n) : KeySize(n), KeyDesc(n) {\n  }\n\n  // The destructor frees all memory which is allocated to the\n  // specified KD-tree.  This includes the data structure for\n  // the kd-tree itself plus the data structures for each node\n  // in the tree.  It does not include the Key and Data items\n  // which are pointed to by the nodes.  
This memory is left\n  // untouched.\n  ~KDTREE() {\n  }\n\n  // TODO: KeySize might be replaced by KeyDesc.size().\n  int16_t KeySize = 0;   // number of dimensions in the tree\n  KDNODE Root;           // Root.Left points to actual root node\n  std::vector<PARAM_DESC> KeyDesc; // description of each dimension\n};\n\ninline KDNODE::KDNODE(KDTREE *tree, float key[], CLUSTER *data, int Index) {\n  Key = key;\n  Data = data;\n  BranchPoint = Key[Index];\n  LeftBranch = tree->KeyDesc[Index].Min;\n  RightBranch = tree->KeyDesc[Index].Max;\n  Left = nullptr;\n  Right = nullptr;\n}\n\n/*----------------------------------------------------------------------------\n            Macros\n-----------------------------------------------------------------------------*/\n#define RootOf(T) ((T)->Root.Left->Data)\n\n/*-----------------------------------------------------------------------------\n          Public Function Prototypes\n-----------------------------------------------------------------------------*/\nKDTREE *MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[]);\n\nvoid KDStore(KDTREE *Tree, float *Key, CLUSTER *Data);\n\nvoid KDDelete(KDTREE *Tree, float Key[], void *Data);\n\nvoid KDNearestNeighborSearch(KDTREE *Tree, float Query[], int QuerySize, float MaxDistance,\n                             int *NumberOfResults, void **NBuffer, float DBuffer[]);\n\nvoid KDWalk(KDTREE *Tree, kdwalk_proc Action, ClusteringContext *context);\n\n/*-----------------------------------------------------------------------------\n          Private Function Prototypes\n-----------------------------------------------------------------------------*/\n\nfloat DistanceSquared(int k, PARAM_DESC *dim, float p1[], float p2[]);\n\nTESS_API\nfloat ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[]);\n\nint QueryInSearch(KDTREE *tree);\n\nvoid Walk(KDTREE *tree, kdwalk_proc action, ClusteringContext *context, KDNODE *SubTree, int32_t Level);\n\nvoid InsertNodes(KDTREE *tree, KDNODE 
*nodes);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/mf.cpp",
    "content": "/******************************************************************************\n ** Filename:    mf.c\n ** Purpose:     Micro-feature interface to flexible feature extractor.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n/*----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------*/\n#include \"mf.h\"\n\n#include \"featdefs.h\"\n#include \"mfdefs.h\"\n#include \"mfx.h\"\n\n#include <cmath>\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\n/**\n * Call the old micro-feature extractor and then copy\n * the features into the new format.  
Then deallocate the\n * old micro-features.\n * @param Blob  blob to extract micro-features from\n * @param cn_denorm  control parameter to feature extractor.\n * @return Micro-features for Blob.\n */\nFEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm) {\n  auto features = BlobMicroFeatures(Blob, cn_denorm);\n  if (features.empty()) {\n    return nullptr;\n  }\n  int n = 0;\n  for ([[maybe_unused]] auto &f: features) {\n    ++n;\n  }\n  auto FeatureSet = new FEATURE_SET_STRUCT(n);\n\n  for (auto &f : features) {\n    auto Feature = new FEATURE_STRUCT(&MicroFeatureDesc);\n    for (int i = 0; i < (int)MicroFeatureParameter::MFCount; ++i)\n      Feature->Params[i] = f[i];\n    // Bulge features are deprecated and should not be used. Set to 0.\n    Feature->Params[(int)MicroFeatureParameter::MFBulge1] = 0.0f;\n    Feature->Params[(int)MicroFeatureParameter::MFBulge2] = 0.0f;\n\n#ifndef _WIN32\n    // Assert that feature parameters are well defined.\n    for (int i = 0; i < Feature->Type->NumParams; i++) {\n      ASSERT_HOST(!std::isnan(Feature->Params[i]));\n    }\n#endif\n\n    AddFeature(FeatureSet, Feature);\n  }\n  return FeatureSet;\n} /* ExtractMicros */\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/mf.h",
    "content": "/******************************************************************************\n ** Filename:    mf.h\n ** Purpose:     Micro-feature interface to flexible feature extractor.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef MF_H\n#define MF_H\n\n#include \"blobs.h\"\n#include \"ocrfeatures.h\"\n\nnamespace tesseract {\n\nFEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/mfdefs.h",
    "content": "/******************************************************************************\n ** Filename:    mfdefs.h\n ** Purpose:     Definition of micro-features\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n#ifndef MFDEFS_H\n#define MFDEFS_H\n\n#include <array>\n#include <forward_list>\n\nnamespace tesseract {\n\nenum class MicroFeatureParameter {\n  MFXPosition,\n  MFYPosition,\n  MFLength,\n  MFDirection,\n  MFBulge1,\n  MFBulge2,\n\n  MFCount // For array sizes.\n};\n\nusing MicroFeature = std::array<float, (int)MicroFeatureParameter::MFCount>;\nusing MICROFEATURES = std::forward_list<MicroFeature>;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/mfoutline.cpp",
    "content": "/******************************************************************************\n ** Filename:    mfoutline.c\n ** Purpose:     Interface to outline struct used for extracting features\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"mfoutline.h\"\n\n#include \"blobs.h\"\n#include \"classify.h\"\n#include \"clusttool.h\" //If remove you get caught in a loop somewhere\n#include \"mfx.h\"\n#include \"params.h\"\n\n#include <cmath>\n#include <cstdio>\n\nnamespace tesseract {\n\n/*---------------------------------------------------------------------------*/\n/** Convert a blob into a list of MFOUTLINEs (float-based microfeature format).\n */\nLIST ConvertBlob(TBLOB *blob) {\n  LIST outlines = NIL_LIST;\n  return (blob == nullptr) ? NIL_LIST : ConvertOutlines(blob->outlines, outlines, outer);\n}\n\n/*---------------------------------------------------------------------------*/\n/** Convert a TESSLINE into the float-based MFOUTLINE micro-feature format. */\nMFOUTLINE ConvertOutline(TESSLINE *outline) {\n  auto MFOutline = NIL_LIST;\n\n  if (outline == nullptr || outline->loop == nullptr) {\n    return MFOutline;\n  }\n\n  auto StartPoint = outline->loop;\n  auto EdgePoint = StartPoint;\n  do {\n    auto NextPoint = EdgePoint->next;\n\n    /* filter out duplicate points */\n    if (EdgePoint->pos.x != NextPoint->pos.x || EdgePoint->pos.y != NextPoint->pos.y) {\n      auto NewPoint = new MFEDGEPT;\n      NewPoint->ClearMark();\n      NewPoint->Hidden = EdgePoint->IsHidden();\n      NewPoint->Point.x = EdgePoint->pos.x;\n      NewPoint->Point.y = EdgePoint->pos.y;\n      MFOutline = push(MFOutline, NewPoint);\n    }\n    EdgePoint = NextPoint;\n  } while (EdgePoint != StartPoint);\n\n  if (MFOutline != nullptr) {\n    MakeOutlineCircular(MFOutline);\n  }\n  return MFOutline;\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * Convert a tree of outlines to a list of MFOUTLINEs (lists of MFEDGEPTs).\n *\n * @param outline      first outline to be converted\n * @param mf_outlines  list to add converted outlines to\n * @param outline_type  are the outlines outer or holes?\n */\nLIST ConvertOutlines(TESSLINE *outline, LIST mf_outlines, OUTLINETYPE outline_type) {\n  MFOUTLINE mf_outline;\n\n  while (outline != nullptr) {\n    mf_outline = ConvertOutline(outline);\n    if (mf_outline != nullptr) {\n      mf_outlines = push(mf_outlines, mf_outline);\n    }\n    outline = outline->next;\n  }\n  return mf_outlines;\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine searches through the specified outline, computes\n * a slope for each vector in the outline, and marks each\n * vector as having one of the following directions:\n *   N, S, E, W, NE, NW, SE, SW\n * This information is then stored in the outline and the\n * outline is returned.\n * @param Outline   micro-feature outline to analyze\n * @param MinSlope  controls \"snapping\" of segments to horizontal\n * @param MaxSlope  controls \"snapping\" of segments to vertical\n */\nvoid FindDirectionChanges(MFOUTLINE Outline, float MinSlope, float MaxSlope) {\n  MFEDGEPT *Current;\n  MFEDGEPT *Last;\n  MFOUTLINE EdgePoint;\n\n  if (DegenerateOutline(Outline)) {\n    return;\n  }\n\n  Last = PointAt(Outline);\n  Outline = NextPointAfter(Outline);\n  EdgePoint = Outline;\n  do {\n    Current = PointAt(EdgePoint);\n    ComputeDirection(Last, Current, MinSlope, MaxSlope);\n\n    Last = Current;\n    EdgePoint = NextPointAfter(EdgePoint);\n  } while (EdgePoint != Outline);\n\n} /* FindDirectionChanges */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine deallocates all of the memory consumed by\n * a micro-feature outline.\n * @param arg   micro-feature outline to be freed\n */\nvoid FreeMFOutline(void *arg) { // MFOUTLINE Outline)\n  auto Outline = static_cast<MFOUTLINE>(arg);\n\n  /* break the circular outline so we can use std. techniques to deallocate */\n  MFOUTLINE Start = Outline->list_rest();\n  set_rest(Outline, NIL_LIST);\n  while (Start != nullptr) {\n    delete reinterpret_cast<MFEDGEPT *>(Start->first_node());\n    Start = pop(Start);\n  }\n\n} /* FreeMFOutline */\n\n/*---------------------------------------------------------------------------*/\n/**\n * Release all memory consumed by the specified list\n * of outlines.\n * @param Outlines  list of mf-outlines to be freed\n */\nvoid FreeOutlines(LIST Outlines) {\n  destroy_nodes(Outlines, FreeMFOutline);\n} /* FreeOutlines */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine searches through the specified outline and finds\n * the points at which the outline changes direction.  These\n * points are then marked as \"extremities\".  This routine is\n * used as an alternative to FindExtremities().  It forces the\n * endpoints of the microfeatures to be at the direction\n * changes rather than at the midpoint between direction\n * changes.\n * @param Outline   micro-feature outline to analyze\n */\nvoid MarkDirectionChanges(MFOUTLINE Outline) {\n  MFOUTLINE Current;\n  MFOUTLINE Last;\n  MFOUTLINE First;\n\n  if (DegenerateOutline(Outline)) {\n    return;\n  }\n\n  First = NextDirectionChange(Outline);\n  Last = First;\n  do {\n    Current = NextDirectionChange(Last);\n    PointAt(Current)->MarkPoint();\n    Last = Current;\n  } while (Last != First);\n\n} /* MarkDirectionChanges */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine returns the next point in the micro-feature\n * outline that is an extremity.  The search starts after\n * EdgePoint.  The routine assumes that the outline being\n * searched is not a degenerate outline (i.e. it must have\n * 2 or more edge points).\n * @param EdgePoint start search from this point\n * @return Next extremity in the outline after EdgePoint.\n * @note Globals: none\n */\nMFOUTLINE NextExtremity(MFOUTLINE EdgePoint) {\n  EdgePoint = NextPointAfter(EdgePoint);\n  while (!PointAt(EdgePoint)->ExtremityMark) {\n    EdgePoint = NextPointAfter(EdgePoint);\n  }\n\n  return (EdgePoint);\n\n} /* NextExtremity */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine normalizes the coordinates of the specified\n * outline so that the outline is deskewed down to the\n * baseline, translated so that x=0 is at XOrigin, and scaled\n * so that the height of a character cell from descender to\n * ascender is 1.  Of this height, 0.25 is for the descender,\n * 0.25 for the ascender, and 0.5 for the x-height.  The\n * y coordinate of the baseline is 0.\n * @param Outline   outline to be normalized\n * @param XOrigin   x-origin of text\n */\nvoid NormalizeOutline(MFOUTLINE Outline, float XOrigin) {\n  if (Outline == NIL_LIST) {\n    return;\n  }\n\n  MFOUTLINE EdgePoint = Outline;\n  do {\n    MFEDGEPT *Current = PointAt(EdgePoint);\n    Current->Point.y = MF_SCALE_FACTOR * (Current->Point.y - kBlnBaselineOffset);\n    Current->Point.x = MF_SCALE_FACTOR * (Current->Point.x - XOrigin);\n    EdgePoint = NextPointAfter(EdgePoint);\n  } while (EdgePoint != Outline);\n} /* NormalizeOutline */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine normalizes every outline in Outlines\n * according to the currently selected normalization method.\n * It also returns the scale factors that it used to do this\n * scaling.  The scale factors returned represent the x and\n * y sizes in the normalized coordinate system that correspond\n * to 1 pixel in the original coordinate system.\n * Outlines are changed and XScale and YScale are updated.\n *\n * Globals:\n * - classify_norm_method  method being used for normalization\n * - classify_char_norm_range map radius of gyration to this value\n * @param Outlines  list of outlines to be normalized\n * @param XScale    x-direction scale factor used by routine\n * @param YScale    y-direction scale factor used by routine\n */\nvoid Classify::NormalizeOutlines(LIST Outlines, float *XScale, float *YScale) {\n  MFOUTLINE Outline;\n\n  switch (classify_norm_method) {\n    case character:\n      ASSERT_HOST(!\"How did NormalizeOutlines get called in character mode?\");\n      break;\n\n    case baseline:\n      iterate(Outlines) {\n        Outline = static_cast<MFOUTLINE>(Outlines->first_node());\n        NormalizeOutline(Outline, 0.0);\n      }\n      *XScale = *YScale = MF_SCALE_FACTOR;\n      break;\n  }\n} /* NormalizeOutlines */\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\n/**\n * Change the direction of every vector in the specified\n * outline segment to Direction.  The segment to be changed\n * starts at Start and ends at End.  Note that the previous\n * direction of End must also be changed to reflect the\n * change in direction of the point before it.\n * @param Start defines start of segment of outline to be modified\n * @param End defines end of segment of outline to be modified\n * @param Direction new direction to assign to segment\n */\nvoid ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction) {\n  MFOUTLINE Current;\n\n  for (Current = Start; Current != End; Current = NextPointAfter(Current)) {\n    PointAt(Current)->Direction = Direction;\n  }\n\n  PointAt(End)->PreviousDirection = Direction;\n\n} /* ChangeDirection */\n\n/**\n * This routine normalizes each point in Outline by\n * translating it to the specified center and scaling it\n * anisotropically according to the given scale factors.\n * @param Outline     outline to be character normalized\n * @param cn_denorm\n */\nvoid CharNormalizeOutline(MFOUTLINE Outline, const DENORM &cn_denorm) {\n  MFOUTLINE First, Current;\n  MFEDGEPT *CurrentPoint;\n\n  if (Outline == NIL_LIST) {\n    return;\n  }\n\n  First = Outline;\n  Current = First;\n  do {\n    CurrentPoint = PointAt(Current);\n    FCOORD pos(CurrentPoint->Point.x, CurrentPoint->Point.y);\n    cn_denorm.LocalNormTransform(pos, &pos);\n    CurrentPoint->Point.x = (pos.x() - UINT8_MAX / 2) * MF_SCALE_FACTOR;\n    CurrentPoint->Point.y = (pos.y() - UINT8_MAX / 2) * MF_SCALE_FACTOR;\n\n    Current = NextPointAfter(Current);\n  } while (Current != First);\n\n} /* CharNormalizeOutline */\n\n/**\n * This routine computes the slope from Start to Finish and\n * and then computes the approximate direction of the line\n * segment from Start to Finish.  The direction is quantized\n * into 8 buckets:\n *  N, S, E, W, NE, NW, SE, SW\n * Both the slope and the direction are then stored into\n * the appropriate fields of the Start edge point.  The\n * direction is also stored into the PreviousDirection field\n * of the Finish edge point.\n * @param Start   starting point to compute direction from\n * @param Finish    finishing point to compute direction to\n * @param MinSlope  slope below which lines are horizontal\n * @param MaxSlope  slope above which lines are vertical\n */\nvoid ComputeDirection(MFEDGEPT *Start, MFEDGEPT *Finish, float MinSlope, float MaxSlope) {\n  FVECTOR Delta;\n\n  Delta.x = Finish->Point.x - Start->Point.x;\n  Delta.y = Finish->Point.y - Start->Point.y;\n  if (Delta.x == 0) {\n    if (Delta.y < 0) {\n      Start->Slope = -FLT_MAX;\n      Start->Direction = south;\n    } else {\n      Start->Slope = FLT_MAX;\n      Start->Direction = north;\n    }\n  } else {\n    Start->Slope = Delta.y / Delta.x;\n    if (Delta.x > 0) {\n      if (Delta.y > 0) {\n        if (Start->Slope > MinSlope) {\n          if (Start->Slope < MaxSlope) {\n            Start->Direction = northeast;\n          } else {\n            Start->Direction = north;\n          }\n        } else {\n          Start->Direction = east;\n        }\n      } else if (Start->Slope < -MinSlope) {\n        if (Start->Slope > -MaxSlope) {\n          Start->Direction = southeast;\n        } else {\n          Start->Direction = south;\n        }\n      } else {\n        Start->Direction = east;\n      }\n    } else if (Delta.y > 0) {\n      if (Start->Slope < -MinSlope) {\n        if (Start->Slope > -MaxSlope) {\n          Start->Direction = northwest;\n        } else {\n          Start->Direction = north;\n        }\n      } else {\n        Start->Direction = west;\n      }\n    } else if (Start->Slope > MinSlope) {\n      if (Start->Slope < MaxSlope) {\n        Start->Direction = southwest;\n      } else {\n        Start->Direction = south;\n      }\n    } else {\n      Start->Direction = west;\n    }\n  }\n  Finish->PreviousDirection = Start->Direction;\n}\n\n/**\n * This routine returns the next point in the micro-feature\n * outline that has a direction different than EdgePoint.  The\n * routine assumes that the outline being searched is not a\n * degenerate outline (i.e. it must have 2 or more edge points).\n * @param EdgePoint start search from this point\n * @return Point of next direction change in micro-feature outline.\n * @note Globals: none\n */\nMFOUTLINE NextDirectionChange(MFOUTLINE EdgePoint) {\n  DIRECTION InitialDirection;\n\n  InitialDirection = PointAt(EdgePoint)->Direction;\n\n  MFOUTLINE next_pt = nullptr;\n  do {\n    EdgePoint = NextPointAfter(EdgePoint);\n    next_pt = NextPointAfter(EdgePoint);\n  } while (PointAt(EdgePoint)->Direction == InitialDirection && !PointAt(EdgePoint)->Hidden &&\n           next_pt != nullptr && !PointAt(next_pt)->Hidden);\n\n  return (EdgePoint);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/mfoutline.h",
    "content": "/******************************************************************************\n ** Filename:    mfoutline.h\n ** Purpose:     Interface spec for fx outline structures\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef MFOUTLINE_H\n#define MFOUTLINE_H\n\n#include \"blobs.h\"\n#include \"fpoint.h\"\n#include \"oldlist.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\nusing MFOUTLINE = LIST;\n\nenum DIRECTION : uint8_t { north, south, east, west, northeast, northwest, southeast, southwest };\n\nstruct MFEDGEPT {\n  // Inline functions for manipulating micro-feature outline edge points.\n\n  void ClearMark() {\n    ExtremityMark = false;\n  }\n\n  void MarkPoint() {\n    ExtremityMark = true;\n  }\n\n  FPOINT Point;\n  float Slope;\n  bool Hidden;\n  bool ExtremityMark;\n  DIRECTION Direction;\n  DIRECTION PreviousDirection;\n};\n\nenum OUTLINETYPE { outer, hole };\n\nenum NORM_METHOD { baseline, character };\n\n/**----------------------------------------------------------------------------\n          Macros\n----------------------------------------------------------------------------**/\n#define AverageOf(A, B) (((A) + (B)) / 2)\n\n// Constant for computing the scale factor to use to normalize characters.\nconst float MF_SCALE_FACTOR = 0.5f / kBlnXHeight;\n\n// Inline functions for manipulating micro-feature outlines.\n\nstatic inline bool DegenerateOutline(MFOUTLINE Outline) {\n  return (Outline == NIL_LIST) || (Outline == Outline->list_rest());\n}\n\nstatic inline MFEDGEPT *PointAt(MFOUTLINE Outline) {\n  return reinterpret_cast<MFEDGEPT *>(Outline->first_node());\n}\n\nstatic inline MFOUTLINE NextPointAfter(MFOUTLINE Outline) {\n  return Outline->list_rest();\n}\n\nstatic inline void MakeOutlineCircular(MFOUTLINE Outline) {\n  set_rest(last(Outline), Outline);\n}\n\n/**----------------------------------------------------------------------------\n          Public Function Prototypes\n----------------------------------------------------------------------------**/\nvoid ComputeBlobCenter(TBLOB *Blob, TPOINT *BlobCenter);\n\nLIST ConvertBlob(TBLOB *Blob);\n\nMFOUTLINE ConvertOutline(TESSLINE *Outline);\n\nLIST ConvertOutlines(TESSLINE *Outline, LIST ConvertedOutlines, OUTLINETYPE OutlineType);\n\nvoid FilterEdgeNoise(MFOUTLINE Outline, float NoiseSegmentLength);\n\nvoid FindDirectionChanges(MFOUTLINE Outline, float MinSlope, float MaxSlope);\n\nvoid FreeMFOutline(void *arg); // MFOUTLINE Outline);\n\nvoid FreeOutlines(LIST Outlines);\n\nvoid MarkDirectionChanges(MFOUTLINE Outline);\n\nMFOUTLINE NextExtremity(MFOUTLINE EdgePoint);\n\nvoid NormalizeOutline(MFOUTLINE Outline, float XOrigin);\n\n/*----------------------------------------------------------------------------\n          Private Function Prototypes\n-----------------------------------------------------------------------------*/\nvoid ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction);\n\n// Normalizes the Outline in-place using cn_denorm's local transformation,\n// then converts from the integer feature range [0,255] to the clusterer\n// feature range of [-0.5, 0.5].\nvoid CharNormalizeOutline(MFOUTLINE Outline, const DENORM &cn_denorm);\n\nvoid ComputeDirection(MFEDGEPT *Start, MFEDGEPT *Finish, float MinSlope, float MaxSlope);\n\nMFOUTLINE NextDirectionChange(MFOUTLINE EdgePoint);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/mfx.cpp",
    "content": "/******************************************************************************\n ** Filename:       mfx.c\n ** Purpose:        Micro feature extraction routines\n ** Author:         Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#include \"mfx.h\"\n\n#include \"clusttool.h\" //NEEDED\n#include \"intfx.h\"\n#include \"mfdefs.h\"\n#include \"mfoutline.h\"\n#include \"normalis.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\n/* old numbers corresponded to 10.0 degrees and 80.0 degrees */\ndouble_VAR(classify_min_slope, 0.414213562, \"Slope below which lines are called horizontal\");\ndouble_VAR(classify_max_slope, 2.414213562, \"Slope above which lines are called vertical\");\n\n/*----------------------------------------------------------------------------\n          Private Function Prototypes\n-----------------------------------------------------------------------------*/\n\nMICROFEATURES ConvertToMicroFeatures(MFOUTLINE Outline, MICROFEATURES MicroFeatures);\n\nMicroFeature ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End);\n\n/*----------------------------------------------------------------------------\n            Public Code\n----------------------------------------------------------------------------*/\n\n/**\n * This routine extracts micro-features from the specified\n * blob and returns a list of the micro-features.  All\n * micro-features are normalized according to the specified\n * line statistics.\n * @param Blob blob to extract micro-features from\n * @param cn_denorm control parameter to feature extractor\n * @return List of micro-features extracted from the blob.\n */\nMICROFEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM &cn_denorm) {\n  MICROFEATURES MicroFeatures;\n  LIST Outlines;\n  LIST RemainingOutlines;\n\n  if (Blob != nullptr) {\n    Outlines = ConvertBlob(Blob);\n\n    RemainingOutlines = Outlines;\n    iterate(RemainingOutlines) {\n      auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node());\n      CharNormalizeOutline(Outline, cn_denorm);\n    }\n\n    RemainingOutlines = Outlines;\n    iterate(RemainingOutlines) {\n      auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node());\n      FindDirectionChanges(Outline, classify_min_slope, classify_max_slope);\n      MarkDirectionChanges(Outline);\n      MicroFeatures = ConvertToMicroFeatures(Outline, MicroFeatures);\n    }\n    FreeOutlines(Outlines);\n  }\n  return MicroFeatures;\n} /* BlobMicroFeatures */\n\n/*---------------------------------------------------------------------------\n            Private Code\n---------------------------------------------------------------------------*/\n\n/**\n * Convert Outline to MicroFeatures\n * @param Outline         outline to extract micro-features from\n * @param MicroFeatures   list of micro-features to add to\n * @return List of micro-features with new features added to front.\n * @note Globals: none\n */\nMICROFEATURES ConvertToMicroFeatures(MFOUTLINE Outline, MICROFEATURES MicroFeatures) {\n  MFOUTLINE Current;\n  MFOUTLINE Last;\n  MFOUTLINE First;\n\n  if (DegenerateOutline(Outline)) {\n    return (MicroFeatures);\n  }\n\n  First = NextExtremity(Outline);\n  Last = First;\n  do {\n    Current = NextExtremity(Last);\n    if (!PointAt(Current)->Hidden) {\n      auto NewFeature = ExtractMicroFeature(Last, Current);\n      MicroFeatures.push_front(NewFeature);\n    }\n    Last = Current;\n  } while (Last != First);\n\n  return MicroFeatures;\n} /* ConvertToMicroFeatures */\n\n/**\n * This routine computes the feature parameters which describe\n * the micro-feature that starts and Start and ends at End.\n * A new micro-feature is allocated, filled with the feature\n * parameters, and returned.  The routine assumes that\n * Start and End are not the same point.  If they are the\n * same point, nullptr is returned, a warning message is\n * printed, and the current outline is dumped to stdout.\n * @param Start starting point of micro-feature\n * @param End ending point of micro-feature\n * @return New micro-feature or nullptr if the feature was rejected.\n * @note Globals: none\n */\nMicroFeature ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End) {\n  MFEDGEPT *P1, *P2;\n\n  P1 = PointAt(Start);\n  P2 = PointAt(End);\n\n  MicroFeature NewFeature;\n  NewFeature[(int)MicroFeatureParameter::MFXPosition] = AverageOf(P1->Point.x, P2->Point.x);\n  NewFeature[(int)MicroFeatureParameter::MFYPosition] = AverageOf(P1->Point.y, P2->Point.y);\n  NewFeature[(int)MicroFeatureParameter::MFLength] = DistanceBetween(P1->Point, P2->Point);\n  NewFeature[(int)MicroFeatureParameter::MFDirection] = NormalizedAngleFrom(&P1->Point, &P2->Point, 1.0);\n  NewFeature[(int)MicroFeatureParameter::MFBulge1] = 0.0f; // deprecated\n  NewFeature[(int)MicroFeatureParameter::MFBulge2] = 0.0f; // deprecated\n\n  return NewFeature;\n} /* ExtractMicroFeature */\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/mfx.h",
    "content": "/******************************************************************************\n ** Filename: mfx.h\n ** Purpose:  Definition of micro-feature extraction routines\n ** Author:   Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef MFX_H\n#define MFX_H\n\n#include \"mfdefs.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\nclass DENORM;\nstruct TBLOB;\n\n/*----------------------------------------------------------------------------\n          Variables\n----------------------------------------------------------------------------**/\n\n/* old numbers corresponded to 10.0 degrees and 80.0 degrees */\nextern double_VAR_H(classify_min_slope);\nextern double_VAR_H(classify_max_slope);\n\n/*----------------------------------------------------------------------------\n          Public Function Prototypes\n----------------------------------------------------------------------------**/\nMICROFEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM &cn_denorm);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/normfeat.cpp",
    "content": "/******************************************************************************\n ** Filename:    normfeat.c\n ** Purpose:     Definition of char normalization features.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"normfeat.h\"\n\n#include \"featdefs.h\"\n#include \"intfx.h\"\n#include \"mfoutline.h\"\n\nnamespace tesseract {\n\n/** Return the length of the outline in baseline normalized form. */\nfloat ActualOutlineLength(FEATURE Feature) {\n  return (Feature->Params[CharNormLength] * LENGTH_COMPRESSION);\n}\n\n/**\n * Return the character normalization feature for a blob.\n *\n * The features returned are in a scale where the x-height has been\n * normalized to live in the region y = [-0.25 .. 0.25].  Example ranges\n * for English below are based on the Linux font collection on 2009-12-04:\n *\n *  - Params[CharNormY]\n *     - The y coordinate of the grapheme's centroid.\n *     - English: [-0.27, 0.71]\n *\n *  - Params[CharNormLength]\n *     - The length of the grapheme's outline (tiny segments discarded),\n *     divided by 10.0=LENGTH_COMPRESSION.\n *     - English: [0.16, 0.85]\n *\n *  - Params[CharNormRx]\n *     - The radius of gyration about the x axis, as measured from CharNormY.\n *     - English: [0.011, 0.34]\n *\n *  - Params[CharNormRy]\n *     - The radius of gyration about the y axis, as measured from\n *     the x center of the grapheme's bounding box.\n *     - English: [0.011, 0.31]\n */\nFEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info) {\n  auto feature_set = new FEATURE_SET_STRUCT(1);\n  auto feature = new FEATURE_STRUCT(&CharNormDesc);\n\n  feature->Params[CharNormY] = MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);\n  feature->Params[CharNormLength] = MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;\n  feature->Params[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;\n  feature->Params[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;\n\n  AddFeature(feature_set, feature);\n\n  return feature_set;\n} /* ExtractCharNormFeatures */\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/normfeat.h",
    "content": "/******************************************************************************\n ** Filename:    normfeat.h\n ** Purpose:     Definition of character normalization features.\n ** Author:      Dan Johnson\n ** History:     12/14/90, DSJ, Created.\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#ifndef NORMFEAT_H\n#define NORMFEAT_H\n\n#include \"ocrfeatures.h\"\n\nnamespace tesseract {\n\n#define LENGTH_COMPRESSION (10.0)\n\nstruct INT_FX_RESULT_STRUCT;\n\ntypedef enum { CharNormY, CharNormLength, CharNormRx, CharNormRy } NORM_PARAM_NAME;\n\nfloat ActualOutlineLength(FEATURE Feature);\n\nFEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/normmatch.cpp",
    "content": "/******************************************************************************\n ** Filename:    normmatch.c\n ** Purpose:     Simple matcher based on character normalization features.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n/*----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------*/\n#include \"normmatch.h\"\n\n#include \"classify.h\"\n#include \"clusttool.h\"\n#include \"helpers.h\"\n#include \"normfeat.h\"\n#include \"params.h\"\n#include \"unicharset.h\"\n\n#include <cmath>\n#include <cstdio>\n#include <sstream> // for std::istringstream\n\nnamespace tesseract {\n\nstruct NORM_PROTOS {\n  NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) {\n  }\n  int NumParams = 0;\n  int NumProtos;\n  PARAM_DESC *ParamDesc = nullptr;\n  std::vector<LIST> Protos;\n};\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\n\n/**\n * @name NormEvidenceOf\n *\n * Return the new type of evidence number corresponding to this\n * normalization adjustment.  The equation that represents the transform is:\n *       1 / (1 + (NormAdj / midpoint) ^ curl)\n */\nstatic float NormEvidenceOf(float NormAdj) {\n  NormAdj /= static_cast<float>(classify_norm_adj_midpoint);\n\n  if (classify_norm_adj_curl == 3) {\n    NormAdj = NormAdj * NormAdj * NormAdj;\n  } else if (classify_norm_adj_curl == 2) {\n    NormAdj = NormAdj * NormAdj;\n  } else {\n    NormAdj = std::pow(NormAdj, static_cast<float>(classify_norm_adj_curl));\n  }\n  return (1 / (1 + NormAdj));\n}\n\n/*----------------------------------------------------------------------------\n        Variables\n----------------------------------------------------------------------------*/\n\n/** control knobs used to control the normalization adjustment process */\ndouble_VAR(classify_norm_adj_midpoint, 32.0, \"Norm adjust midpoint ...\");\ndouble_VAR(classify_norm_adj_curl, 2.0, \"Norm adjust curl ...\");\n/** Weight of width variance against height and vertical position. */\nconst float kWidthErrorWeighting = 0.125f;\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n/**\n * This routine compares Features against each character\n * normalization proto for ClassId and returns the match\n * rating of the best match.\n * @param ClassId id of class to match against\n * @param feature character normalization feature\n * @param DebugMatch controls dump of debug info\n *\n * Globals:\n * #NormProtos character normalization prototypes\n *\n * @return Best match rating for Feature against protos of ClassId.\n */\nfloat Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) {\n  if (ClassId >= NormProtos->NumProtos) {\n    ClassId = NO_CLASS;\n  }\n\n  /* handle requests for classification as noise */\n  if (ClassId == NO_CLASS) {\n    /* kludge - clean up constants and make into control knobs later */\n    float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f +\n                   feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f +\n                   feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f);\n    return (1 - NormEvidenceOf(Match));\n  }\n\n  if (DebugMatch) {\n    tprintf(\"\\nChar norm for class %s\\n\", unicharset.id_to_unichar(ClassId));\n  }\n\n  LIST Protos = NormProtos->Protos[ClassId];\n  if (Protos == nullptr) {\n     // Avoid FP overflow in NormEvidenceOf.\n     return 1.0f;\n  }\n\n  float BestMatch = FLT_MAX;\n  iterate(Protos) {\n    auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node());\n    float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];\n    float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];\n    if (DebugMatch) {\n      tprintf(\"YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\\n\", Proto->Mean[CharNormY], Delta,\n              Proto->Weight.Elliptical[CharNormY], Match);\n    }\n    Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];\n    Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];\n    if (DebugMatch) {\n      tprintf(\"Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\\n\", Proto->Mean[CharNormRx], Delta,\n              Proto->Weight.Elliptical[CharNormRx], Match);\n    }\n    // Ry is width! See intfx.cpp.\n    Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];\n    if (DebugMatch) {\n      tprintf(\"Width: Proto=%g, Delta=%g, Var=%g\\n\", Proto->Mean[CharNormRy], Delta,\n              Proto->Weight.Elliptical[CharNormRy]);\n    }\n    Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];\n    Delta *= kWidthErrorWeighting;\n    Match += Delta;\n    if (DebugMatch) {\n      tprintf(\"Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\\n\", Match,\n              Match / classify_norm_adj_midpoint, NormEvidenceOf(Match),\n              256 * (1 - NormEvidenceOf(Match)));\n    }\n\n    if (Match < BestMatch) {\n      BestMatch = Match;\n    }\n  }\n  return 1 - NormEvidenceOf(BestMatch);\n} /* ComputeNormMatch */\n\nvoid Classify::FreeNormProtos() {\n  if (NormProtos != nullptr) {\n    for (int i = 0; i < NormProtos->NumProtos; i++) {\n      FreeProtoList(&NormProtos->Protos[i]);\n    }\n    delete[] NormProtos->ParamDesc;\n    delete NormProtos;\n    NormProtos = nullptr;\n  }\n}\n\n/**\n * This routine allocates a new data structure to hold\n * a set of character normalization protos.  It then fills in\n * the data structure by reading from the specified File.\n * @param fp open text file to read normalization protos from\n * Globals: none\n * @return Character normalization protos.\n */\nNORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {\n  char unichar[2 * UNICHAR_LEN + 1];\n  UNICHAR_ID unichar_id;\n  LIST Protos;\n  int NumProtos;\n\n  /* allocate and initialization data structure */\n  auto NormProtos = new NORM_PROTOS(unicharset.size());\n\n  /* read file header and save in data structure */\n  NormProtos->NumParams = ReadSampleSize(fp);\n  NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);\n\n  /* read protos for each class into a separate list */\n  const int kMaxLineSize = 100;\n  char line[kMaxLineSize];\n  while (fp->FGets(line, kMaxLineSize) != nullptr) {\n    std::istringstream stream(line);\n    stream.imbue(std::locale::classic());\n    stream >> unichar >> NumProtos;\n    if (stream.fail()) {\n      continue;\n    }\n    if (unicharset.contains_unichar(unichar)) {\n      unichar_id = unicharset.unichar_to_id(unichar);\n      Protos = NormProtos->Protos[unichar_id];\n      for (int i = 0; i < NumProtos; i++) {\n        Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));\n      }\n      NormProtos->Protos[unichar_id] = Protos;\n    } else {\n      tprintf(\"Error: unichar %s in normproto file is not in unichar set.\\n\", unichar);\n      for (int i = 0; i < NumProtos; i++) {\n        FreePrototype(ReadPrototype(fp, NormProtos->NumParams));\n      }\n    }\n  }\n  return NormProtos;\n} /* ReadNormProtos */\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/normmatch.h",
    "content": "/******************************************************************************\n ** Filename:    normmatch.h\n ** Purpose:     Simple matcher based on character normalization features.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef NORMMATCH_H\n#define NORMMATCH_H\n\n#include \"matchdefs.h\"\n#include \"ocrfeatures.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\n/* control knobs used to control the normalization adjustment process */\nextern double_VAR_H(classify_norm_adj_midpoint);\nextern double_VAR_H(classify_norm_adj_curl);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/ocrfeatures.cpp",
    "content": "/******************************************************************************\n ** Filename:    ocrfeatures.cpp\n ** Purpose:     Generic definition of a feature.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"ocrfeatures.h\"\n\n#include \"scanutils.h\"\n\n#include <cassert>\n#include <cmath>\n#include <sstream> // for std::stringstream\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n/**\n * Add a feature to a feature set.  If the feature set is\n * already full, false is returned to indicate that the\n * feature could not be added to the set; otherwise, true is\n * returned.\n * @param FeatureSet set of features to add Feature to\n * @param Feature feature to be added to FeatureSet\n * @return  true if feature added to set, false if set is already full.\n */\nbool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature) {\n  if (FeatureSet->NumFeatures >= FeatureSet->MaxNumFeatures) {\n    delete Feature;\n    return false;\n  }\n\n  FeatureSet->Features[FeatureSet->NumFeatures++] = Feature;\n  return true;\n} /* AddFeature */\n\n/**\n * Create a new feature of the specified type and read in\n * the value of its parameters from File.  The extra penalty\n * for the feature is also computed by calling the appropriate\n * function for the specified feature type.  The correct text\n * representation for a feature is a list of N floats where\n * N is the number of parameters in the feature.\n * @param File open text file to read feature from\n * @param FeatureDesc specifies type of feature to read from File\n * @return New #FEATURE read from File.\n */\nstatic FEATURE ReadFeature(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc) {\n  auto Feature = new FEATURE_STRUCT(FeatureDesc);\n  for (int i = 0; i < Feature->Type->NumParams; i++) {\n    ASSERT_HOST(tfscanf(File, \"%f\", &(Feature->Params[i])) == 1);\n#ifndef _WIN32\n    assert(!std::isnan(Feature->Params[i]));\n#endif\n  }\n  return Feature;\n}\n\n/**\n * Create a new feature set of the specified type and read in\n * the features from File.  The correct text representation\n * for a feature set is an integer which specifies the number (N)\n * of features in a set followed by a list of N feature\n * descriptions.\n * @param File open text file to read new feature set from\n * @param FeatureDesc specifies type of feature to read from File\n * @return New feature set read from File.\n */\nFEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc) {\n  int NumFeatures;\n  ASSERT_HOST(tfscanf(File, \"%d\", &NumFeatures) == 1);\n  ASSERT_HOST(NumFeatures >= 0);\n\n  auto FeatureSet = new FEATURE_SET_STRUCT(NumFeatures);\n  for (int i = 0; i < NumFeatures; i++) {\n    AddFeature(FeatureSet, ReadFeature(File, FeatureDesc));\n  }\n\n  return FeatureSet;\n}\n\n/**\n * Appends a textual representation of Feature to str.\n * This representation is simply a list of the N parameters\n * of the feature, terminated with a newline.  It is assumed\n * that the ExtraPenalty field can be reconstructed from the\n * parameters of the feature.  It is also assumed that the\n * feature type information is specified or assumed elsewhere.\n * @param Feature feature to write out to str\n * @param str string to write Feature to\n */\nstatic void WriteFeature(FEATURE Feature, std::string &str) {\n  for (int i = 0; i < Feature->Type->NumParams; i++) {\n#ifndef WIN32\n    assert(!std::isnan(Feature->Params[i]));\n#endif\n    std::stringstream stream;\n    // Use \"C\" locale (needed for double value).\n    stream.imbue(std::locale::classic());\n    // Use 8 digits for double value.\n    stream.precision(8);\n    stream << Feature->Params[i];\n    str += \" \" + stream.str();\n  }\n  str += \"\\n\";\n} /* WriteFeature */\n\n/**\n * Write a textual representation of FeatureSet to File.\n * This representation is an integer specifying the number of\n * features in the set, followed by a newline, followed by\n * text representations for each feature in the set.\n * @param FeatureSet feature set to write to File\n * @param str string to write Feature to\n */\nvoid WriteFeatureSet(FEATURE_SET FeatureSet, std::string &str) {\n  if (FeatureSet) {\n    str += \"\" + std::to_string(FeatureSet->NumFeatures);\n    str += \"\\n\";\n    for (int i = 0; i < FeatureSet->NumFeatures; i++) {\n      WriteFeature(FeatureSet->Features[i], str);\n    }\n  }\n} /* WriteFeatureSet */\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/ocrfeatures.h",
    "content": "/******************************************************************************\n ** Filename:    features.h\n ** Purpose:     Generic definition of a feature.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef FEATURES_H\n#define FEATURES_H\n\n#include \"blobs.h\"\n\n#include <cstdio>\n#include <string> // for std::string\n\nnamespace tesseract {\n\nclass DENORM;\n\n#undef Min\n#undef Max\n#define FEAT_NAME_SIZE 80\n\n// A character is described by multiple sets of extracted features.  Each\n// set contains a number of features of a particular type, for example, a\n// set of bays, or a set of closures, or a set of microfeatures.  Each\n// feature consists of a number of parameters.  All features within a\n// feature set contain the same number of parameters.  All circular\n// parameters are required to be the first parameters in the feature.\n\nstruct PARAM_DESC {\n  bool Circular;     // true if dimension wraps around\n  bool NonEssential; // true if dimension not used in searches\n  float Min;         // low end of range for circular dimensions\n  float Max;         // high end of range for circular dimensions\n  float Range;       // Max - Min\n  float HalfRange;   // (Max - Min)/2\n  float MidRange;    // (Max + Min)/2\n};\n\nstruct FEATURE_DESC_STRUCT {\n  uint16_t NumParams;          // total # of params\n  const char *ShortName;       // short name for feature\n  const PARAM_DESC *ParamDesc; // array - one per param\n};\nusing FEATURE_DESC = FEATURE_DESC_STRUCT *;\n\nstruct FEATURE_STRUCT {\n  /// Constructor for a new feature of the specified type.\n  /// @param FeatureDesc description of feature to be created.\n  FEATURE_STRUCT(const FEATURE_DESC_STRUCT *FeatureDesc) : Type(FeatureDesc), Params(FeatureDesc->NumParams) {\n  }\n  ~FEATURE_STRUCT() {\n  }\n  const FEATURE_DESC_STRUCT *Type; // points to description of feature type\n  std::vector<float> Params;       // variable size array - params for feature\n};\nusing FEATURE = FEATURE_STRUCT *;\n\nstruct FEATURE_SET_STRUCT {\n  /// Creator for a new feature set large enough to\n  /// hold the specified number of features.\n  /// @param NumFeatures maximum # of features to be put in feature set\n  FEATURE_SET_STRUCT(int numFeatures) : NumFeatures(0), MaxNumFeatures(numFeatures), Features(numFeatures) {\n  }\n\n  ~FEATURE_SET_STRUCT() {\n    for (uint16_t i = 0; i < NumFeatures; i++) {\n      delete Features[i];\n    }\n  }\n\n  uint16_t NumFeatures;    // number of features in set\n  uint16_t MaxNumFeatures; // maximum size of feature set\n  std::vector<FEATURE_STRUCT *> Features; // variable size array of features\n};\nusing FEATURE_SET = FEATURE_SET_STRUCT *;\n\n// A generic character description as a char pointer. In reality, it will be\n// a pointer to some data structure. Paired feature extractors/matchers need\n// to agree on the data structure to be used, however, the high level\n// classifier does not need to know the details of this data structure.\nusing CHAR_FEATURES = char *;\n\n/*----------------------------------------------------------------------\n    Macros for defining the parameters of a new features\n----------------------------------------------------------------------*/\n#define StartParamDesc(Name) const PARAM_DESC Name[] = {\n#define DefineParam(Circular, NonEssential, Min, Max) \\\n  {Circular,                                          \\\n   NonEssential,                                      \\\n   Min,                                               \\\n   Max,                                               \\\n   (Max) - (Min),                                     \\\n   (((Max) - (Min)) / 2.0),                           \\\n   (((Max) + (Min)) / 2.0)},\n\n#define EndParamDesc \\\n  }                  \\\n  ;\n\n/*----------------------------------------------------------------------\nMacro for describing a new feature.  The parameters of the macro\nare as follows:\n\nDefineFeature (Name, NumLinear, NumCircular, ShortName, ParamName)\n----------------------------------------------------------------------*/\n#define DefineFeature(Name, NL, NC, SN, PN) \\\n  const FEATURE_DESC_STRUCT Name = {((NL) + (NC)), SN, PN};\n\n/*----------------------------------------------------------------------\n        Generic routines that work for all feature types\n----------------------------------------------------------------------*/\nbool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature);\n\nFEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc);\n\nvoid WriteFeatureSet(FEATURE_SET FeatureSet, std::string &str);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/outfeat.cpp",
    "content": "/******************************************************************************\n ** Filename:    outfeat.c\n ** Purpose:     Definition of outline-features.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"outfeat.h\"\n\n#include \"classify.h\"\n#include \"featdefs.h\"\n#include \"mfoutline.h\"\n#include \"ocrfeatures.h\"\n\n#include <cstdio>\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n\n/**\n * Convert each segment in the outline to a feature\n * and return the features.\n * @param Blob blob to extract pico-features from\n * @return Outline-features for Blob.\n * @note Globals: none\n */\nFEATURE_SET Classify::ExtractOutlineFeatures(TBLOB *Blob) {\n  auto FeatureSet = new FEATURE_SET_STRUCT(MAX_OUTLINE_FEATURES);\n  if (Blob == nullptr) {\n    return (FeatureSet);\n  }\n\n  auto Outlines = ConvertBlob(Blob);\n\n  float XScale, YScale;\n  NormalizeOutlines(Outlines, &XScale, &YScale);\n  auto RemainingOutlines = Outlines;\n  iterate(RemainingOutlines) {\n    auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node());\n    ConvertToOutlineFeatures(Outline, FeatureSet);\n  }\n  if (classify_norm_method == baseline) {\n    NormalizeOutlineX(FeatureSet);\n  }\n  FreeOutlines(Outlines);\n  return (FeatureSet);\n} /* ExtractOutlineFeatures */\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\n/*---------------------------------------------------------------------------*/\n/**\n * This routine computes the midpoint between Start and\n * End to obtain the x,y position of the outline-feature.  It\n * also computes the direction from Start to End as the\n * direction of the outline-feature and the distance from\n * Start to End as the length of the outline-feature.\n * This feature is then\n * inserted into the next feature slot in FeatureSet.\n * @param Start starting point of outline-feature\n * @param End ending point of outline-feature\n * @param FeatureSet set to add outline-feature to\n */\nvoid AddOutlineFeatureToSet(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet) {\n  auto Feature = new FEATURE_STRUCT(&OutlineFeatDesc);\n  Feature->Params[OutlineFeatDir] = NormalizedAngleFrom(Start, End, 1.0);\n  Feature->Params[OutlineFeatX] = AverageOf(Start->x, End->x);\n  Feature->Params[OutlineFeatY] = AverageOf(Start->y, End->y);\n  Feature->Params[OutlineFeatLength] = DistanceBetween(*Start, *End);\n  AddFeature(FeatureSet, Feature);\n\n} /* AddOutlineFeatureToSet */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine steps converts each section in the specified\n * outline to a feature described by its x,y position, length\n * and angle.\n * Results are returned in FeatureSet.\n * @param Outline outline to extract outline-features from\n * @param FeatureSet set of features to add outline-features to\n */\nvoid ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet) {\n  MFOUTLINE Next;\n  MFOUTLINE First;\n  FPOINT FeatureStart;\n  FPOINT FeatureEnd;\n\n  if (DegenerateOutline(Outline)) {\n    return;\n  }\n\n  First = Outline;\n  Next = First;\n  do {\n    FeatureStart = PointAt(Next)->Point;\n    Next = NextPointAfter(Next);\n\n    /* note that an edge is hidden if the ending point of the edge is\n   marked as hidden.  This situation happens because the order of\n   the outlines is reversed when they are converted from the old\n   format.  In the old format, a hidden edge is marked by the\n   starting point for that edge. */\n    if (!PointAt(Next)->Hidden) {\n      FeatureEnd = PointAt(Next)->Point;\n      AddOutlineFeatureToSet(&FeatureStart, &FeatureEnd, FeatureSet);\n    }\n  } while (Next != First);\n} /* ConvertToOutlineFeatures */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine computes the weighted average x position\n * over all of the outline-features in FeatureSet and then\n * renormalizes the outline-features to force this average\n * to be the x origin (i.e. x=0).\n * FeatureSet is changed.\n * @param FeatureSet outline-features to be normalized\n */\nvoid NormalizeOutlineX(FEATURE_SET FeatureSet) {\n  int i;\n  FEATURE Feature;\n  float Length;\n  float TotalX = 0.0;\n  float TotalWeight = 0.0;\n  float Origin;\n\n  if (FeatureSet->NumFeatures <= 0) {\n    return;\n  }\n\n  for (i = 0; i < FeatureSet->NumFeatures; i++) {\n    Feature = FeatureSet->Features[i];\n    Length = Feature->Params[OutlineFeatLength];\n    TotalX += Feature->Params[OutlineFeatX] * Length;\n    TotalWeight += Length;\n  }\n  Origin = TotalX / TotalWeight;\n\n  for (i = 0; i < FeatureSet->NumFeatures; i++) {\n    Feature = FeatureSet->Features[i];\n    Feature->Params[OutlineFeatX] -= Origin;\n  }\n} /* NormalizeOutlineX */\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/outfeat.h",
    "content": "/******************************************************************************\n ** Filename:    outfeat.h\n ** Purpose:     Definition of outline features.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef OUTFEAT_H\n#define OUTFEAT_H\n\n#include \"fpoint.h\"\n#include \"mfoutline.h\"\n#include \"ocrfeatures.h\"\n\nnamespace tesseract {\n\ntypedef enum {\n  OutlineFeatX,\n  OutlineFeatY,\n  OutlineFeatLength,\n  OutlineFeatDir\n} OUTLINE_FEAT_PARAM_NAME;\n\n#define MAX_OUTLINE_FEATURES (100)\n\n/*---------------------------------------------------------------------------\n          Privat Function Prototypes\n----------------------------------------------------------------------------*/\nvoid AddOutlineFeatureToSet(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet);\n\nvoid ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet);\n\nvoid NormalizeOutlineX(FEATURE_SET FeatureSet);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/picofeat.cpp",
    "content": "/******************************************************************************\n ** Filename:    picofeat.c\n ** Purpose:     Definition of pico-features.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include \"picofeat.h\"\n\n#include \"classify.h\"\n#include \"featdefs.h\"\n#include \"fpoint.h\"\n#include \"mfoutline.h\"\n#include \"ocrfeatures.h\"\n#include \"params.h\"\n#include \"trainingsample.h\"\n\n#include <cmath>\n#include <cstdio>\n\nnamespace tesseract {\n\n/*---------------------------------------------------------------------------\n          Variables\n----------------------------------------------------------------------------*/\n\ndouble_VAR(classify_pico_feature_length, 0.05, \"Pico Feature Length\");\n\n/*---------------------------------------------------------------------------\n          Private Function Prototypes\n----------------------------------------------------------------------------*/\nvoid ConvertSegmentToPicoFeat(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet);\n\nvoid ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet);\n\nvoid NormalizePicoX(FEATURE_SET FeatureSet);\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n/*---------------------------------------------------------------------------*/\n/**\n * Operation: Dummy for now.\n *\n * Globals:\n * - classify_norm_method normalization method currently specified\n * @param Blob blob to extract pico-features from\n * @return Pico-features for Blob.\n */\nFEATURE_SET Classify::ExtractPicoFeatures(TBLOB *Blob) {\n  auto FeatureSet = new FEATURE_SET_STRUCT(MAX_PICO_FEATURES);\n  auto Outlines = ConvertBlob(Blob);\n  float XScale, YScale;\n  NormalizeOutlines(Outlines, &XScale, &YScale);\n  auto RemainingOutlines = Outlines;\n  iterate(RemainingOutlines) {\n    auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node());\n    ConvertToPicoFeatures2(Outline, FeatureSet);\n  }\n  if (classify_norm_method == baseline) {\n    NormalizePicoX(FeatureSet);\n  }\n  FreeOutlines(Outlines);\n  return (FeatureSet);\n\n} /* ExtractPicoFeatures */\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\n/*---------------------------------------------------------------------------*/\n/**\n * This routine converts an entire segment of an outline\n * into a set of pico features which are added to\n * FeatureSet.  The length of the segment is rounded to the\n * nearest whole number of pico-features.  The pico-features\n * are spaced evenly over the entire segment.\n * Results are placed in FeatureSet.\n * Globals:\n * - classify_pico_feature_length length of a single pico-feature\n * @param Start starting point of pico-feature\n * @param End ending point of pico-feature\n * @param FeatureSet set to add pico-feature to\n */\nvoid ConvertSegmentToPicoFeat(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet) {\n  float Angle;\n  float Length;\n  int NumFeatures;\n  FPOINT Center;\n  FPOINT Delta;\n  int i;\n\n  Angle = NormalizedAngleFrom(Start, End, 1.0);\n  Length = DistanceBetween(*Start, *End);\n  NumFeatures = static_cast<int>(floor(Length / classify_pico_feature_length + 0.5));\n  if (NumFeatures < 1) {\n    NumFeatures = 1;\n  }\n\n  /* compute vector for one pico feature */\n  Delta.x = XDelta(*Start, *End) / NumFeatures;\n  Delta.y = YDelta(*Start, *End) / NumFeatures;\n\n  /* compute position of first pico feature */\n  Center.x = Start->x + Delta.x / 2.0;\n  Center.y = Start->y + Delta.y / 2.0;\n\n  /* compute each pico feature in segment and add to feature set */\n  for (i = 0; i < NumFeatures; i++) {\n    auto Feature = new FEATURE_STRUCT(&PicoFeatDesc);\n    Feature->Params[PicoFeatDir] = Angle;\n    Feature->Params[PicoFeatX] = Center.x;\n    Feature->Params[PicoFeatY] = Center.y;\n    AddFeature(FeatureSet, Feature);\n\n    Center.x += Delta.x;\n    Center.y += Delta.y;\n  }\n} /* ConvertSegmentToPicoFeat */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine steps through the specified outline and cuts it\n * up into pieces of equal length.  These pieces become the\n * desired pico-features.  Each segment in the outline\n * is converted into an integral number of pico-features.\n * Results are returned in FeatureSet.\n *\n * Globals:\n * - classify_pico_feature_length length of features to be extracted\n * @param Outline outline to extract micro-features from\n * @param FeatureSet set of features to add pico-features to\n */\nvoid ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet) {\n  MFOUTLINE Next;\n  MFOUTLINE First;\n  MFOUTLINE Current;\n\n  if (DegenerateOutline(Outline)) {\n    return;\n  }\n\n  First = Outline;\n  Current = First;\n  Next = NextPointAfter(Current);\n  do {\n    /* note that an edge is hidden if the ending point of the edge is\n   marked as hidden.  This situation happens because the order of\n   the outlines is reversed when they are converted from the old\n   format.  In the old format, a hidden edge is marked by the\n   starting point for that edge. */\n    if (!(PointAt(Next)->Hidden)) {\n      ConvertSegmentToPicoFeat(&(PointAt(Current)->Point), &(PointAt(Next)->Point), FeatureSet);\n    }\n\n    Current = Next;\n    Next = NextPointAfter(Current);\n  } while (Current != First);\n\n} /* ConvertToPicoFeatures2 */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine computes the average x position over all\n * of the pico-features in FeatureSet and then renormalizes\n * the pico-features to force this average to be the x origin\n * (i.e. x=0).\n * FeatureSet is changed.\n * @param FeatureSet pico-features to be normalized\n */\nvoid NormalizePicoX(FEATURE_SET FeatureSet) {\n  int i;\n  FEATURE Feature;\n  float Origin = 0.0;\n\n  for (i = 0; i < FeatureSet->NumFeatures; i++) {\n    Feature = FeatureSet->Features[i];\n    Origin += Feature->Params[PicoFeatX];\n  }\n  Origin /= FeatureSet->NumFeatures;\n\n  for (i = 0; i < FeatureSet->NumFeatures; i++) {\n    Feature = FeatureSet->Features[i];\n    Feature->Params[PicoFeatX] -= Origin;\n  }\n} /* NormalizePicoX */\n\n/*---------------------------------------------------------------------------*/\n/**\n * @param blob blob to extract features from\n * @param fx_info\n * @return Integer character-normalized features for blob.\n */\nFEATURE_SET Classify::ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) {\n  INT_FX_RESULT_STRUCT local_fx_info(fx_info);\n  std::vector<INT_FEATURE_STRUCT> bl_features;\n  tesseract::TrainingSample *sample =\n      tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features);\n  if (sample == nullptr) {\n    return nullptr;\n  }\n\n  uint32_t num_features = sample->num_features();\n  const INT_FEATURE_STRUCT *features = sample->features();\n  auto feature_set = new FEATURE_SET_STRUCT(num_features);\n  for (uint32_t f = 0; f < num_features; ++f) {\n    auto feature = new FEATURE_STRUCT(&IntFeatDesc);\n    feature->Params[IntX] = features[f].X;\n    feature->Params[IntY] = features[f].Y;\n    feature->Params[IntDir] = features[f].Theta;\n    AddFeature(feature_set, feature);\n  }\n  delete sample;\n\n  return feature_set;\n} /* ExtractIntCNFeatures */\n\n/*---------------------------------------------------------------------------*/\n/**\n * @param blob blob to extract features from\n * @param fx_info\n * @return Geometric (top/bottom/width) features for blob.\n */\nFEATURE_SET Classify::ExtractIntGeoFeatures(const TBLOB &blob,\n                                            const INT_FX_RESULT_STRUCT &fx_info) {\n  INT_FX_RESULT_STRUCT local_fx_info(fx_info);\n  std::vector<INT_FEATURE_STRUCT> bl_features;\n  tesseract::TrainingSample *sample =\n      tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features);\n  if (sample == nullptr) {\n    return nullptr;\n  }\n\n  auto feature_set = new FEATURE_SET_STRUCT(1);\n  auto feature = new FEATURE_STRUCT(&IntFeatDesc);\n\n  feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);\n  feature->Params[GeoTop] = sample->geo_feature(GeoTop);\n  feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);\n  AddFeature(feature_set, feature);\n  delete sample;\n\n  return feature_set;\n} /* ExtractIntGeoFeatures */\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/classify/picofeat.h",
    "content": "/******************************************************************************\n ** Filename:    picofeat.h\n ** Purpose:     Definition of pico features.\n ** Author:      Dan Johnson\n ** History:     9/4/90, DSJ, Created.\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef PICOFEAT_H\n#define PICOFEAT_H\n\n#include \"ocrfeatures.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\n// Enum for the order/type of params in IntFeatDesc.\nenum IntParams {\n  IntX,  // x-position (0-255).\n  IntY,  // y-position (0-255).\n  IntDir // Direction (0-255, circular).\n};\n\n// Enum for the order/type of params in GeoFeatDesc.\nenum GeoParams {\n  GeoBottom, // Bounding box bottom in baseline space (0-255).\n  GeoTop,    // Bounding box top in baseline space (0-255).\n  GeoWidth,  // Bounding box width in baseline space (0-255).\n\n  GeoCount // Number of geo features.\n};\n\ntypedef enum { PicoFeatY, PicoFeatDir, PicoFeatX } PICO_FEAT_PARAM_NAME;\n\n#define MAX_PICO_FEATURES (1000)\n\n/*---------------------------------------------------------------------------\n          Variables\n----------------------------------------------------------------------------*/\n\nextern double_VAR_H(classify_pico_feature_length);\n\n/**----------------------------------------------------------------------------\n         
 Public Function Prototypes\n----------------------------------------------------------------------------**/\n#define GetPicoFeatureLength() (PicoFeatureLength)\n\n/**----------------------------------------------------------------------------\n        Global Data Definitions and Declarations\n----------------------------------------------------------------------------**/\nextern TESS_API float PicoFeatureLength;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/protos.cpp",
    "content": "/******************************************************************************\n *\n * File:        protos.cpp  (Formerly protos.c)\n * Author:      Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n#define _USE_MATH_DEFINES // for M_PI\n\n#include \"protos.h\"\n\n#include \"classify.h\"\n#include \"intproto.h\"\n#include \"params.h\"\n\n#include <cmath> // for M_PI\n#include <cstdio>\n\nnamespace tesseract {\n\n#define PROTO_INCREMENT 32\n#define CONFIG_INCREMENT 16\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n/**\n * @name AddConfigToClass\n *\n * Add a new config to this class.  Malloc new space and copy the\n * old configs if necessary.  
Return the config id for the new config.\n *\n * @param Class The class to add to\n */\nint AddConfigToClass(CLASS_TYPE Class) {\n  int NewNumConfigs;\n  int NewConfig;\n  int MaxNumProtos;\n  BIT_VECTOR Config;\n\n  MaxNumProtos = Class->MaxNumProtos;\n  ASSERT_HOST(MaxNumProtos <= MAX_NUM_PROTOS);\n\n  if (Class->NumConfigs >= Class->MaxNumConfigs) {\n    /* add configs in CONFIG_INCREMENT chunks at a time */\n    NewNumConfigs =\n        (((Class->MaxNumConfigs + CONFIG_INCREMENT) / CONFIG_INCREMENT) * CONFIG_INCREMENT);\n\n    Class->Configurations.resize(NewNumConfigs);\n    Class->MaxNumConfigs = NewNumConfigs;\n  }\n  NewConfig = Class->NumConfigs++;\n  Config = NewBitVector(MAX_NUM_PROTOS);\n  Class->Configurations[NewConfig] = Config;\n  zero_all_bits(Config, WordsInVectorOfSize(MAX_NUM_PROTOS));\n\n  return (NewConfig);\n}\n\n/**\n * @name AddProtoToClass\n *\n * Add a new proto to this class.  Malloc new space and copy the\n * old protos if necessary.  Return the proto id for the new proto.\n *\n * @param Class The class to add to\n */\nint AddProtoToClass(CLASS_TYPE Class) {\n  if (Class->NumProtos >= Class->MaxNumProtos) {\n    /* add protos in PROTO_INCREMENT chunks at a time */\n    int NewNumProtos =\n        (((Class->MaxNumProtos + PROTO_INCREMENT) / PROTO_INCREMENT) * PROTO_INCREMENT);\n\n    Class->Prototypes.resize(NewNumProtos);\n\n    Class->MaxNumProtos = NewNumProtos;\n    ASSERT_HOST(NewNumProtos <= MAX_NUM_PROTOS);\n  }\n  int NewProto = Class->NumProtos++;\n  ASSERT_HOST(Class->NumProtos <= MAX_NUM_PROTOS);\n  return (NewProto);\n}\n\n/**********************************************************************\n * FillABC\n *\n * Fill in Protos A, B, C fields based on the X, Y, Angle fields.\n **********************************************************************/\nvoid FillABC(PROTO_STRUCT *Proto) {\n  float Slope, Intercept, Normalizer;\n\n  Slope = tan(Proto->Angle * 2.0 * M_PI);\n  Intercept = Proto->Y - Slope * Proto->X;\n  Normalizer = 
1.0 / sqrt(Slope * Slope + 1.0);\n  Proto->A = Slope * Normalizer;\n  Proto->B = -Normalizer;\n  Proto->C = Intercept * Normalizer;\n}\n\n/**********************************************************************\n * FreeClass\n *\n * Deallocate the memory consumed by the specified class.\n **********************************************************************/\nvoid FreeClass(CLASS_TYPE Class) {\n  if (Class) {\n    FreeClassFields(Class);\n    delete Class;\n  }\n}\n\n/**********************************************************************\n * FreeClassFields\n *\n * Deallocate the memory consumed by subfields of the specified class.\n **********************************************************************/\nvoid FreeClassFields(CLASS_TYPE Class) {\n  if (Class) {\n    for (int i = 0; i < Class->NumConfigs; i++) {\n      FreeBitVector(Class->Configurations[i]);\n    }\n  }\n}\n\n/**********************************************************************\n * NewClass\n *\n * Allocate a new class with enough memory to hold the specified number\n * of prototypes and configurations.\n **********************************************************************/\nCLASS_TYPE NewClass(int NumProtos, int NumConfigs) {\n  CLASS_TYPE Class;\n\n  Class = new CLASS_STRUCT;\n\n  Class->Prototypes.resize(NumProtos);\n  Class->Configurations.resize(NumConfigs);\n  Class->MaxNumProtos = NumProtos;\n  Class->MaxNumConfigs = NumConfigs;\n  Class->NumProtos = 0;\n  Class->NumConfigs = 0;\n  return (Class);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/protos.h",
    "content": "/******************************************************************************\n *\n * File:         protos.h\n * Author:       Mark Seaman, SW Productivity\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef PROTOS_H\n#define PROTOS_H\n\n#include \"bitvec.h\"\n#include \"params.h\"\n#include \"unicity_table.h\"\n\n#include <tesseract/unichar.h>\n\nnamespace tesseract {\n\nstruct PROTO_STRUCT {\n  float A;\n  float B;\n  float C;\n  float X;\n  float Y;\n  float Angle;\n  float Length;\n};\n\nstruct CLASS_STRUCT {\n  int16_t NumProtos = 0;\n  int16_t MaxNumProtos = 0;\n  int16_t NumConfigs = 0;\n  int16_t MaxNumConfigs = 0;\n  std::vector<PROTO_STRUCT> Prototypes;\n  std::vector<BIT_VECTOR> Configurations;\n  UnicityTable<int> font_set;\n};\nusing CLASS_TYPE = CLASS_STRUCT *;\nusing CLASSES = CLASS_STRUCT *;\n\n/*----------------------------------------------------------------------\n              M a c r o s\n----------------------------------------------------------------------*/\n/**\n * AddProtoToConfig\n *\n * Set a single proto bit in the specified configuration.\n */\n\n#define AddProtoToConfig(Pid, Config) (SET_BIT(Config, Pid))\n\n/**\n * ProtoIn\n *\n * Choose the selected prototype in this class record.  
Return the\n * pointer to it (PROTO_STRUCT *).\n */\n\n#define ProtoIn(Class, Pid) (&(Class)->Prototypes[Pid])\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\nTESS_API\nint AddConfigToClass(CLASS_TYPE Class);\n\nTESS_API\nint AddProtoToClass(CLASS_TYPE Class);\n\nTESS_API\nvoid FillABC(PROTO_STRUCT *Proto);\n\nTESS_API\nvoid FreeClass(CLASS_TYPE Class);\n\nTESS_API\nvoid FreeClassFields(CLASS_TYPE Class);\n\nvoid InitPrototypes();\n\nTESS_API\nCLASS_TYPE NewClass(int NumProtos, int NumConfigs);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/classify/shapeclassifier.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        shapeclassifier.cpp\n// Description: Base interface class for classifiers that return a\n//              shape index.\n// Author:      Ray Smith\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"shapeclassifier.h\"\n\n#include \"scrollview.h\"\n#include \"shapetable.h\"\n#ifndef GRAPHICS_DISABLED\n#include \"svmnode.h\"\n#endif\n#include \"tprintf.h\"\n#include \"trainingsample.h\"\n\nnamespace tesseract {\n\n// Classifies the given [training] sample, writing to results.\n// See shapeclassifier.h for a full description.\n// Default implementation calls the ShapeRating version.\nint ShapeClassifier::UnicharClassifySample(const TrainingSample &sample, Image page_pix, int debug,\n                                           UNICHAR_ID keep_this,\n                                           std::vector<UnicharRating> *results) {\n  results->clear();\n  std::vector<ShapeRating> shape_results;\n  int num_shape_results = ClassifySample(sample, page_pix, debug, keep_this, &shape_results);\n  const ShapeTable *shapes = GetShapeTable();\n  std::vector<int> unichar_map(shapes->unicharset().size(), 
-1);\n  for (int r = 0; r < num_shape_results; ++r) {\n    shapes->AddShapeToResults(shape_results[r], &unichar_map, results);\n  }\n  return results->size();\n}\n\n// Classifies the given [training] sample, writing to results.\n// See shapeclassifier.h for a full description.\n// Default implementation aborts.\nint ShapeClassifier::ClassifySample(const TrainingSample &sample, Image page_pix, int debug,\n                                    int keep_this, std::vector<ShapeRating> *results) {\n  ASSERT_HOST(\"Must implement ClassifySample!\" == nullptr);\n  return 0;\n}\n\n// Returns the shape that contains unichar_id that has the best result.\n// If result is not nullptr, it is set with the shape_id and rating.\n// Does not need to be overridden if ClassifySample respects the keep_this\n// rule.\nint ShapeClassifier::BestShapeForUnichar(const TrainingSample &sample, Image page_pix,\n                                         UNICHAR_ID unichar_id, ShapeRating *result) {\n  std::vector<ShapeRating> results;\n  const ShapeTable *shapes = GetShapeTable();\n  int num_results = ClassifySample(sample, page_pix, 0, unichar_id, &results);\n  for (int r = 0; r < num_results; ++r) {\n    if (shapes->GetShape(results[r].shape_id).ContainsUnichar(unichar_id)) {\n      if (result != nullptr) {\n        *result = results[r];\n      }\n      return results[r].shape_id;\n    }\n  }\n  return -1;\n}\n\n// Provides access to the UNICHARSET that this classifier works with.\n// Only needs to be overridden if GetShapeTable() can return nullptr.\nconst UNICHARSET &ShapeClassifier::GetUnicharset() const {\n  return GetShapeTable()->unicharset();\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Visual debugger classifies the given sample, displays the results and\n// solicits user input to display other classifications. 
Returns when\n// the user has finished with debugging the sample.\n// Probably doesn't need to be overridden if the subclass provides\n// DisplayClassifyAs.\nvoid ShapeClassifier::DebugDisplay(const TrainingSample &sample, Image page_pix,\n                                   UNICHAR_ID unichar_id) {\n  static ScrollView *terminator = nullptr;\n  if (terminator == nullptr) {\n    terminator = new ScrollView(\"XIT\", 0, 0, 50, 50, 50, 50, true);\n  }\n  ScrollView *debug_win = CreateFeatureSpaceWindow(\"ClassifierDebug\", 0, 0);\n  // Provide a right-click menu to choose the class.\n  auto *popup_menu = new SVMenuNode();\n  popup_menu->AddChild(\"Choose class to debug\", 0, \"x\", \"Class to debug\");\n  popup_menu->BuildMenu(debug_win, false);\n  // Display the features in green.\n  const INT_FEATURE_STRUCT *features = sample.features();\n  uint32_t num_features = sample.num_features();\n  for (uint32_t f = 0; f < num_features; ++f) {\n    RenderIntFeature(debug_win, &features[f], ScrollView::GREEN);\n  }\n  debug_win->Update();\n  std::vector<UnicharRating> results;\n  // Debug classification until the user quits.\n  const UNICHARSET &unicharset = GetUnicharset();\n  SVEventType ev_type;\n  do {\n    std::vector<ScrollView *> windows;\n    if (unichar_id >= 0) {\n      tprintf(\"Debugging class %d = %s\\n\", unichar_id, unicharset.id_to_unichar(unichar_id));\n      UnicharClassifySample(sample, page_pix, 1, unichar_id, &results);\n      DisplayClassifyAs(sample, page_pix, unichar_id, 1, windows);\n    } else {\n      tprintf(\"Invalid unichar_id: %d\\n\", unichar_id);\n      UnicharClassifySample(sample, page_pix, 1, -1, &results);\n    }\n    if (unichar_id >= 0) {\n      tprintf(\"Debugged class %d = %s\\n\", unichar_id, unicharset.id_to_unichar(unichar_id));\n    }\n    tprintf(\"Right-click in ClassifierDebug window to choose debug class,\");\n    tprintf(\" Left-click or close window to quit...\\n\");\n    UNICHAR_ID old_unichar_id;\n    do {\n      
old_unichar_id = unichar_id;\n      auto ev = debug_win->AwaitEvent(SVET_ANY);\n      ev_type = ev->type;\n      if (ev_type == SVET_POPUP) {\n        if (unicharset.contains_unichar(ev->parameter)) {\n          unichar_id = unicharset.unichar_to_id(ev->parameter);\n        } else {\n          tprintf(\"Char class '%s' not found in unicharset\", ev->parameter);\n        }\n      }\n    } while (unichar_id == old_unichar_id && ev_type != SVET_CLICK && ev_type != SVET_DESTROY);\n    for (auto window : windows) {\n      delete window;\n    }\n  } while (ev_type != SVET_CLICK && ev_type != SVET_DESTROY);\n  delete debug_win;\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Displays classification as the given shape_id. Creates as many windows\n// as it feels fit, using index as a guide for placement. Adds any created\n// windows to the windows output and returns a new index that may be used\n// by any subsequent classifiers. Caller waits for the user to view and\n// then destroys the windows by clearing the vector.\nint ShapeClassifier::DisplayClassifyAs(const TrainingSample &sample, Image page_pix,\n                                       UNICHAR_ID unichar_id, int index,\n                                       std::vector<ScrollView *> &windows) {\n  // Does nothing in the default implementation.\n  return index;\n}\n\n// Prints debug information on the results.\nvoid ShapeClassifier::UnicharPrintResults(const char *context,\n                                          const std::vector<UnicharRating> &results) const {\n  tprintf(\"%s\\n\", context);\n  for (const auto &result : results) {\n    tprintf(\"%g: c_id=%d=%s\", result.rating, result.unichar_id,\n            GetUnicharset().id_to_unichar(result.unichar_id));\n    if (!result.fonts.empty()) {\n      tprintf(\" Font Vector:\");\n      for (auto &&font : result.fonts) {\n        tprintf(\" %d\", font.fontinfo_id);\n      }\n    }\n    tprintf(\"\\n\");\n  }\n}\nvoid ShapeClassifier::PrintResults(const char *context,\n    
                               const std::vector<ShapeRating> &results) const {\n  tprintf(\"%s\\n\", context);\n  for (const auto &result : results) {\n    tprintf(\"%g:\", result.rating);\n    if (result.joined) {\n      tprintf(\"[J]\");\n    }\n    if (result.broken) {\n      tprintf(\"[B]\");\n    }\n    tprintf(\" %s\\n\", GetShapeTable()->DebugStr(result.shape_id).c_str());\n  }\n}\n\n// Removes any result that has all its unichars covered by a better choice,\n// regardless of font.\nvoid ShapeClassifier::FilterDuplicateUnichars(std::vector<ShapeRating> *results) const {\n  std::vector<ShapeRating> filtered_results;\n  // Copy results to filtered results and knock out duplicate unichars.\n  const ShapeTable *shapes = GetShapeTable();\n  for (unsigned r = 0; r < results->size(); ++r) {\n    if (r > 0) {\n      const Shape &shape_r = shapes->GetShape((*results)[r].shape_id);\n      int c;\n      for (c = 0; c < shape_r.size(); ++c) {\n        int unichar_id = shape_r[c].unichar_id;\n        unsigned s;\n        for (s = 0; s < r; ++s) {\n          const Shape &shape_s = shapes->GetShape((*results)[s].shape_id);\n          if (shape_s.ContainsUnichar(unichar_id)) {\n            break; // We found unichar_id.\n          }\n        }\n        if (s == r) {\n          break; // We didn't find unichar_id.\n        }\n      }\n      if (c == shape_r.size()) {\n        continue; // We found all the unichar ids in previous answers.\n      }\n    }\n    filtered_results.push_back((*results)[r]);\n  }\n  *results = std::move(filtered_results);\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/classify/shapeclassifier.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        shapeclassifier.h\n// Description: Base interface class for classifiers that return a\n//              shape index.\n// Author:      Ray Smith\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_\n#define TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_\n\n#include \"image.h\"\n\n#include <tesseract/unichar.h>\n\n#include <vector>\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass ScrollView;\nclass UNICHARSET;\n\nstruct ShapeRating;\nclass ShapeTable;\nclass TrainingSample;\nclass TrainingSampleSet;\nstruct UnicharRating;\n\n// Interface base class for classifiers that produce ShapeRating results.\nclass TESS_API ShapeClassifier {\npublic:\n  virtual ~ShapeClassifier() = default;\n\n  // Classifies the given [training] sample, writing to results.\n  // If page_pix is not nullptr, the overriding function may call\n  // sample.GetSamplePix(padding, page_pix) to get an image of the sample\n  // padded (with real image data) by the given padding to extract features\n  // from the image of the character. 
Other members of TrainingSample:\n  // features(), micro_features(), cn_feature(), geo_feature() may be used\n  // to get the appropriate tesseract features.\n  // If debug is non-zero, then various degrees of classifier dependent debug\n  // information is provided.\n  // If keep_this (a UNICHAR_ID) is >= 0, then the results should always\n  // contain keep_this, and (if possible) anything of intermediate confidence.\n  // (Used for answering \"Why didn't it get that right?\" questions.) It must\n  // be a UNICHAR_ID as the callers have no clue how to choose the best shape\n  // that may contain a desired answer.\n  // The return value is the number of classes saved in results.\n  // NOTE that overriding functions MUST clear and sort the results by\n  // descending rating unless the classifier is working with a team of such\n  // classifiers.\n  // NOTE: Neither overload of ClassifySample is pure, but at least one must\n  // be overridden by a classifier in order for it to do anything.\n  virtual int UnicharClassifySample(const TrainingSample &sample, Image page_pix, int debug,\n                                    UNICHAR_ID keep_this, std::vector<UnicharRating> *results);\n\nprotected:\n  virtual int ClassifySample(const TrainingSample &sample, Image page_pix, int debug,\n                             UNICHAR_ID keep_this, std::vector<ShapeRating> *results);\n\npublic:\n  // Returns the shape that contains unichar_id that has the best result.\n  // If result is not nullptr, it is set with the shape_id and rating.\n  // Returns -1 if ClassifySample fails to provide any result containing\n  // unichar_id. 
BestShapeForUnichar does not need to be overridden if\n  // ClassifySample respects the keep_this rule.\n  virtual int BestShapeForUnichar(const TrainingSample &sample, Image page_pix,\n                                  UNICHAR_ID unichar_id, ShapeRating *result);\n\n  // Provides access to the ShapeTable that this classifier works with.\n  virtual const ShapeTable *GetShapeTable() const = 0;\n  // Provides access to the UNICHARSET that this classifier works with.\n  // Must be overridden IFF GetShapeTable() returns nullptr.\n  virtual const UNICHARSET &GetUnicharset() const;\n\n  // Visual debugger classifies the given sample, displays the results and\n  // solicits user input to display other classifications. Returns when\n  // the user has finished with debugging the sample.\n  // Probably doesn't need to be overridden if the subclass provides\n  // DisplayClassifyAs.\n  void DebugDisplay(const TrainingSample &sample, Image page_pix, UNICHAR_ID unichar_id);\n\n  // Displays classification as the given unichar_id. Creates as many windows\n  // as it feels fit, using index as a guide for placement. Adds any created\n  // windows to the windows output and returns a new index that may be used\n  // by any subsequent classifiers. Caller waits for the user to view and\n  // then destroys the windows by clearing the vector.\n  virtual int DisplayClassifyAs(const TrainingSample &sample, Image page_pix, UNICHAR_ID unichar_id,\n                                int index, std::vector<ScrollView *> &windows);\n\n  // Prints debug information on the results. 
context is some introductory/title\n  // message.\n  virtual void UnicharPrintResults(const char *context,\n                                   const std::vector<UnicharRating> &results) const;\n  virtual void PrintResults(const char *context, const std::vector<ShapeRating> &results) const;\n\nprotected:\n  // Removes any result that has all its unichars covered by a better choice,\n  // regardless of font.\n  void FilterDuplicateUnichars(std::vector<ShapeRating> *results) const;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_\n"
  },
  {
    "path": "src/classify/shapetable.cpp",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        shapetable.cpp\n// Description: Class to map a classifier shape index to unicharset\n//              indices and font indices.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"shapetable.h\"\n\n#include \"bitvector.h\"\n#include \"fontinfo.h\"\n#include \"intfeaturespace.h\"\n#include \"unicharset.h\"\n#include \"unicity_table.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// Helper function to get the index of the first result with the required\n// unichar_id. 
If the results are sorted by rating, this will also be the\n// best result with the required unichar_id.\n// Returns -1 if the unichar_id is not found\nint ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results,\n                                        const ShapeTable &shape_table, UNICHAR_ID unichar_id) {\n  size_t r = 0;\n  for (const auto &result : results) {\n    const auto shape_id = result.shape_id;\n    const Shape &shape = shape_table.GetShape(shape_id);\n    if (shape.ContainsUnichar(unichar_id)) {\n      return r;\n    }\n    ++r;\n  }\n  return -1;\n}\n\n// Helper function to get the index of the first result with the required\n// unichar_id. If the results are sorted by rating, this will also be the\n// best result with the required unichar_id.\n// Returns -1 if the unichar_id is not found\nint UnicharRating::FirstResultWithUnichar(const std::vector<UnicharRating> &results,\n                                          UNICHAR_ID unichar_id) {\n  size_t r = 0;\n  for (const auto &result : results) {\n    if (result.unichar_id == unichar_id) {\n      return r;\n    }\n    ++r;\n  }\n  return -1;\n}\n\n// Writes to the given file. Returns false in case of error.\nbool UnicharAndFonts::Serialize(FILE *fp) const {\n  return tesseract::Serialize(fp, &unichar_id) && tesseract::Serialize(fp, font_ids);\n}\n\n// Reads from the given file. 
Returns false in case of error.\nbool UnicharAndFonts::DeSerialize(TFile *fp) {\n  return fp->DeSerialize(&unichar_id) && fp->DeSerialize(font_ids);\n}\n\n// Sort function to sort a pair of UnicharAndFonts by unichar_id.\nint UnicharAndFonts::SortByUnicharId(const void *v1, const void *v2) {\n  const auto *p1 = static_cast<const UnicharAndFonts *>(v1);\n  const auto *p2 = static_cast<const UnicharAndFonts *>(v2);\n  return p1->unichar_id - p2->unichar_id;\n}\n\nbool UnicharAndFonts::StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2) {\n  return v1.unichar_id < v2.unichar_id;\n}\n\n// Writes to the given file. Returns false in case of error.\nbool Shape::Serialize(FILE *fp) const {\n  uint8_t sorted = unichars_sorted_;\n  return tesseract::Serialize(fp, &sorted) && tesseract::Serialize(fp, unichars_);\n}\n// Reads from the given file. Returns false in case of error.\n\nbool Shape::DeSerialize(TFile *fp) {\n  uint8_t sorted;\n  if (!fp->DeSerialize(&sorted)) {\n    return false;\n  }\n  unichars_sorted_ = sorted != 0;\n  return fp->DeSerialize(unichars_);\n}\n\n// Adds a font_id for the given unichar_id. 
If the unichar_id is not\n// in the shape, it is added.\nvoid Shape::AddToShape(int unichar_id, int font_id) {\n  for (auto &unichar : unichars_) {\n    if (unichar.unichar_id == unichar_id) {\n      // Found the unichar in the shape table.\n      std::vector<int> &font_list = unichar.font_ids;\n      for (int f : font_list) {\n        if (f == font_id) {\n          return; // Font is already there.\n        }\n      }\n      font_list.push_back(font_id);\n      return;\n    }\n  }\n  // Unichar_id is not in shape, so add it to shape.\n  unichars_.emplace_back(unichar_id, font_id);\n  unichars_sorted_ = unichars_.size() <= 1;\n}\n\n// Adds everything in other to this.\nvoid Shape::AddShape(const Shape &other) {\n  for (const auto &unichar : other.unichars_) {\n    for (auto font_id : unichar.font_ids) {\n      AddToShape(unichar.unichar_id, font_id);\n    }\n  }\n  unichars_sorted_ = unichars_.size() <= 1;\n}\n\n// Returns true if the shape contains the given unichar_id, font_id pair.\nbool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {\n  for (const auto &unichar : unichars_) {\n    if (unichar.unichar_id == unichar_id) {\n      // Found the unichar, so look for the font.\n      auto &font_list = unichar.font_ids;\n      for (int f : font_list) {\n        if (f == font_id) {\n          return true;\n        }\n      }\n      return false;\n    }\n  }\n  return false;\n}\n\n// Returns true if the shape contains the given unichar_id, ignoring font.\nbool Shape::ContainsUnichar(int unichar_id) const {\n  for (const auto &unichar : unichars_) {\n    if (unichar.unichar_id == unichar_id) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Returns true if the shape contains the given font, ignoring unichar_id.\nbool Shape::ContainsFont(int font_id) const {\n  for (const auto &unichar : unichars_) {\n    auto &font_list = unichar.font_ids;\n    for (int f : font_list) {\n      if (f == font_id) {\n        return true;\n      }\n    }\n  }\n  
return false;\n}\n// Returns true if the shape contains the given font properties, ignoring\n// unichar_id.\nbool Shape::ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const {\n  for (const auto &unichar : unichars_) {\n    auto &font_list = unichar.font_ids;\n    for (int f : font_list) {\n      if (font_table.at(f).properties == properties) {\n        return true;\n      }\n    }\n  }\n  return false;\n}\n// Returns true if the shape contains multiple different font properties,\n// ignoring unichar_id.\nbool Shape::ContainsMultipleFontProperties(const FontInfoTable &font_table) const {\n  uint32_t properties = font_table.at(unichars_[0].font_ids[0]).properties;\n  for (const auto &unichar : unichars_) {\n    auto &font_list = unichar.font_ids;\n    for (int f : font_list) {\n      if (font_table.at(f).properties != properties) {\n        return true;\n      }\n    }\n  }\n  return false;\n}\n\n// Returns true if this shape is equal to other (ignoring order of unichars\n// and fonts).\nbool Shape::operator==(const Shape &other) const {\n  return IsSubsetOf(other) && other.IsSubsetOf(*this);\n}\n\n// Returns true if this is a subset (including equal) of other.\nbool Shape::IsSubsetOf(const Shape &other) const {\n  for (const auto &unichar : unichars_) {\n    int unichar_id = unichar.unichar_id;\n    const std::vector<int> &font_list = unichar.font_ids;\n    for (int f : font_list) {\n      if (!other.ContainsUnicharAndFont(unichar_id, f)) {\n        return false;\n      }\n    }\n  }\n  return true;\n}\n\n// Returns true if the lists of unichar ids are the same in this and other,\n// ignoring fonts.\n// NOT const, as it will sort the unichars on demand.\nbool Shape::IsEqualUnichars(Shape *other) {\n  if (unichars_.size() != other->unichars_.size()) {\n    return false;\n  }\n  if (!unichars_sorted_) {\n    SortUnichars();\n  }\n  if (!other->unichars_sorted_) {\n    other->SortUnichars();\n  }\n  for (unsigned c = 0; c < 
unichars_.size(); ++c) {\n    if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Sorts the unichars_ vector by unichar.\nvoid Shape::SortUnichars() {\n  std::sort(unichars_.begin(), unichars_.end(), UnicharAndFonts::StdSortByUnicharId);\n  unichars_sorted_ = true;\n}\n\nShapeTable::ShapeTable() : unicharset_(nullptr), num_fonts_(0) {}\nShapeTable::ShapeTable(const UNICHARSET &unicharset) : unicharset_(&unicharset), num_fonts_(0) {}\n\n// Writes to the given file. Returns false in case of error.\nbool ShapeTable::Serialize(FILE *fp) const {\n  return tesseract::Serialize(fp, shape_table_);\n}\n// Reads from the given file. Returns false in case of error.\n\nbool ShapeTable::DeSerialize(TFile *fp) {\n  if (!fp->DeSerialize(shape_table_)) {\n    return false;\n  }\n  num_fonts_ = 0;\n  return true;\n}\n\n// Returns the number of fonts used in this ShapeTable, computing it if\n// necessary.\nint ShapeTable::NumFonts() const {\n  if (num_fonts_ <= 0) {\n    for (auto shape_id : shape_table_) {\n      const Shape &shape = *shape_id;\n      for (int c = 0; c < shape.size(); ++c) {\n        for (auto font_id : shape[c].font_ids) {\n          if (font_id >= num_fonts_) {\n            num_fonts_ = font_id + 1;\n          }\n        }\n      }\n    }\n  }\n  return num_fonts_;\n}\n\n// Re-indexes the class_ids in the shapetable according to the given map.\n// Useful in conjunction with set_unicharset.\nvoid ShapeTable::ReMapClassIds(const std::vector<int> &unicharset_map) {\n  for (auto shape : shape_table_) {\n    for (int c = 0; c < shape->size(); ++c) {\n      shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);\n    }\n  }\n}\n\n// Returns a string listing the classes/fonts in a shape.\nstd::string ShapeTable::DebugStr(unsigned shape_id) const {\n  if (shape_id >= shape_table_.size()) {\n    return \"INVALID_UNICHAR_ID\";\n  }\n  const Shape &shape = GetShape(shape_id);\n  std::string 
result;\n  result += \"Shape\" + std::to_string(shape_id);\n  if (shape.size() > 100) {\n    result += \" Num unichars=\" + std::to_string(shape.size());\n    return result;\n  }\n  for (int c = 0; c < shape.size(); ++c) {\n    result += \" c_id=\" + std::to_string(shape[c].unichar_id);\n    result += \"=\";\n    result += unicharset_->id_to_unichar(shape[c].unichar_id);\n    if (shape.size() < 10) {\n      result += \", \" + std::to_string(shape[c].font_ids.size());\n      result += \" fonts =\";\n      int num_fonts = shape[c].font_ids.size();\n      if (num_fonts > 10) {\n        result += \" \" + std::to_string(shape[c].font_ids[0]);\n        result += \" ... \" + std::to_string(shape[c].font_ids[num_fonts - 1]);\n      } else {\n        for (int f = 0; f < num_fonts; ++f) {\n          result += \" \" + std::to_string(shape[c].font_ids[f]);\n        }\n      }\n    }\n  }\n  return result;\n}\n\n// Returns a debug string summarizing the table.\nstd::string ShapeTable::SummaryStr() const {\n  int max_unichars = 0;\n  int num_multi_shapes = 0;\n  int num_master_shapes = 0;\n  for (unsigned s = 0; s < shape_table_.size(); ++s) {\n    if (MasterDestinationIndex(s) != s) {\n      continue;\n    }\n    ++num_master_shapes;\n    int shape_size = GetShape(s).size();\n    if (shape_size > 1) {\n      ++num_multi_shapes;\n    }\n    if (shape_size > max_unichars) {\n      max_unichars = shape_size;\n    }\n  }\n  std::string result;\n  result += \"Number of shapes = \" + std::to_string(num_master_shapes);\n  result += \" max unichars = \" + std::to_string(max_unichars);\n  result += \" number with multiple unichars = \" + std::to_string(num_multi_shapes);\n  return result;\n}\n\n// Adds a new shape starting with the given unichar_id and font_id.\n// Returns the assigned index.\nunsigned ShapeTable::AddShape(int unichar_id, int font_id) {\n  auto index = shape_table_.size();\n  auto *shape = new Shape;\n  shape->AddToShape(unichar_id, font_id);\n  
shape_table_.push_back(shape);\n  num_fonts_ = std::max(num_fonts_, font_id + 1);\n  return index;\n}\n\n// Adds a copy of the given shape unless it is already present.\n// Returns the assigned index or index of existing shape if already present.\nunsigned ShapeTable::AddShape(const Shape &other) {\n  unsigned index;\n  for (index = 0; index < shape_table_.size() && !(other == *shape_table_[index]); ++index) {\n    continue;\n  }\n  if (index == shape_table_.size()) {\n    auto *shape = new Shape(other);\n    shape_table_.push_back(shape);\n  }\n  num_fonts_ = 0;\n  return index;\n}\n\n// Removes the shape given by the shape index.\nvoid ShapeTable::DeleteShape(unsigned shape_id) {\n  delete shape_table_[shape_id];\n  shape_table_.erase(shape_table_.begin() + shape_id);\n}\n\n// Adds a font_id to the given existing shape index for the given\n// unichar_id. If the unichar_id is not in the shape, it is added.\nvoid ShapeTable::AddToShape(unsigned shape_id, int unichar_id, int font_id) {\n  Shape &shape = *shape_table_[shape_id];\n  shape.AddToShape(unichar_id, font_id);\n  num_fonts_ = std::max(num_fonts_, font_id + 1);\n}\n\n// Adds the given shape to the existing shape with the given index.\nvoid ShapeTable::AddShapeToShape(unsigned shape_id, const Shape &other) {\n  Shape &shape = *shape_table_[shape_id];\n  shape.AddShape(other);\n  num_fonts_ = 0;\n}\n\n// Returns the id of the shape that contains the given unichar and font.\n// If not found, returns -1.\n// If font_id < 0, the font_id is ignored and the first shape that matches\n// the unichar_id is returned.\nint ShapeTable::FindShape(int unichar_id, int font_id) const {\n  for (unsigned s = 0; s < shape_table_.size(); ++s) {\n    const Shape &shape = GetShape(s);\n    for (int c = 0; c < shape.size(); ++c) {\n      if (shape[c].unichar_id == unichar_id) {\n        if (font_id < 0) {\n          return s; // We don't care about the font.\n        }\n        for (auto f : shape[c].font_ids) {\n          if (f == 
font_id) {\n            return s;\n          }\n        }\n      }\n    }\n  }\n  return -1;\n}\n\n// Returns the first unichar_id and font_id in the given shape.\nvoid ShapeTable::GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const {\n  const UnicharAndFonts &unichar_and_fonts = (*shape_table_[shape_id])[0];\n  *unichar_id = unichar_and_fonts.unichar_id;\n  *font_id = unichar_and_fonts.font_ids[0];\n}\n\n// Expands all the classes/fonts in the shape individually to build\n// a ShapeTable.\nint ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shapes) {\n  BitVector shape_map(master_shapes.NumShapes());\n  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {\n    for (auto font_id : shape[u_ind].font_ids) {\n      int c = shape[u_ind].unichar_id;\n      int master_id = master_shapes.FindShape(c, font_id);\n      if (master_id >= 0) {\n        shape_map.SetBit(master_id);\n      } else if (FindShape(c, font_id) < 0) {\n        AddShape(c, font_id);\n      }\n    }\n  }\n  int num_masters = 0;\n  for (unsigned s = 0; s < master_shapes.NumShapes(); ++s) {\n    if (shape_map[s]) {\n      AddShape(master_shapes.GetShape(s));\n      ++num_masters;\n    }\n  }\n  return num_masters;\n}\n\n// Returns true if the shapes are already merged.\nbool ShapeTable::AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const {\n  return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);\n}\n\n// Returns true if any shape contains multiple unichars.\nbool ShapeTable::AnyMultipleUnichars() const {\n  auto num_shapes = NumShapes();\n  for (unsigned s1 = 0; s1 < num_shapes; ++s1) {\n    if (MasterDestinationIndex(s1) != s1) {\n      continue;\n    }\n    if (GetShape(s1).size() > 1) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Returns the maximum number of unichars over all shapes.\nint ShapeTable::MaxNumUnichars() const {\n  int max_num_unichars = 0;\n  int num_shapes = NumShapes();\n  for (int s = 0; 
s < num_shapes; ++s) {\n    if (GetShape(s).size() > max_num_unichars) {\n      max_num_unichars = GetShape(s).size();\n    }\n  }\n  return max_num_unichars;\n}\n\n// Merges shapes with a common unichar over the [start, end) interval.\n// Assumes single unichar per shape.\nvoid ShapeTable::ForceFontMerges(unsigned start, unsigned end) {\n  for (unsigned s1 = start; s1 < end; ++s1) {\n    if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {\n      int unichar_id = GetShape(s1)[0].unichar_id;\n      for (auto s2 = s1 + 1; s2 < end; ++s2) {\n        if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&\n            unichar_id == GetShape(s2)[0].unichar_id) {\n          MergeShapes(s1, s2);\n        }\n      }\n    }\n  }\n  ShapeTable compacted(*unicharset_);\n  compacted.AppendMasterShapes(*this, nullptr);\n  *this = compacted;\n}\n\n// Returns the number of unichars in the master shape.\nunsigned ShapeTable::MasterUnicharCount(unsigned shape_id) const {\n  int master_id = MasterDestinationIndex(shape_id);\n  return GetShape(master_id).size();\n}\n\n// Returns the sum of the font counts in the master shape.\nint ShapeTable::MasterFontCount(unsigned shape_id) const {\n  int master_id = MasterDestinationIndex(shape_id);\n  const Shape &shape = GetShape(master_id);\n  int font_count = 0;\n  for (int c = 0; c < shape.size(); ++c) {\n    font_count += shape[c].font_ids.size();\n  }\n  return font_count;\n}\n\n// Returns the number of unichars that would result from merging the shapes.\nint ShapeTable::MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const {\n  // Do it the easy way for now.\n  int master_id1 = MasterDestinationIndex(shape_id1);\n  int master_id2 = MasterDestinationIndex(shape_id2);\n  Shape combined_shape(*shape_table_[master_id1]);\n  combined_shape.AddShape(*shape_table_[master_id2]);\n  return combined_shape.size();\n}\n\n// Merges two shape_ids, leaving shape_id2 marked as merged.\nvoid 
ShapeTable::MergeShapes(unsigned shape_id1, unsigned shape_id2) {\n  auto master_id1 = MasterDestinationIndex(shape_id1);\n  auto master_id2 = MasterDestinationIndex(shape_id2);\n  // Point master_id2 (and all merged shapes) to master_id1.\n  shape_table_[master_id2]->set_destination_index(master_id1);\n  // Add all the shapes of master_id2 to master_id1.\n  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);\n}\n\n// Swaps two shape_ids.\nvoid ShapeTable::SwapShapes(unsigned shape_id1, unsigned shape_id2) {\n  Shape *tmp = shape_table_[shape_id1];\n  shape_table_[shape_id1] = shape_table_[shape_id2];\n  shape_table_[shape_id2] = tmp;\n}\n\n// Returns the destination of this shape, (if merged), taking into account\n// the fact that the destination may itself have been merged.\nunsigned ShapeTable::MasterDestinationIndex(unsigned shape_id) const {\n  auto dest_id = shape_table_[shape_id]->destination_index();\n  if (static_cast<unsigned>(dest_id) == shape_id || dest_id < 0) {\n    return shape_id; // Is master already.\n  }\n  auto master_id = shape_table_[dest_id]->destination_index();\n  if (master_id == dest_id || master_id < 0) {\n    return dest_id; // Dest is the master and shape_id points to it.\n  }\n  master_id = MasterDestinationIndex(master_id);\n  return master_id;\n}\n\n// Returns false if the unichars in neither shape is a subset of the other.\nbool ShapeTable::SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const {\n  const Shape &shape1 = GetShape(shape_id1);\n  const Shape &shape2 = GetShape(shape_id2);\n  int c1, c2;\n  for (c1 = 0; c1 < shape1.size(); ++c1) {\n    int unichar_id1 = shape1[c1].unichar_id;\n    if (!shape2.ContainsUnichar(unichar_id1)) {\n      break;\n    }\n  }\n  for (c2 = 0; c2 < shape2.size(); ++c2) {\n    int unichar_id2 = shape2[c2].unichar_id;\n    if (!shape1.ContainsUnichar(unichar_id2)) {\n      break;\n    }\n  }\n  return c1 == shape1.size() || c2 == shape2.size();\n}\n\n// Returns false if the unichars 
in neither shape is a subset of the other.\nbool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const {\n  const Shape &merge1 = GetShape(merge_id1);\n  const Shape &merge2 = GetShape(merge_id2);\n  const Shape &shape = GetShape(shape_id);\n  int cm1, cm2, cs;\n  for (cs = 0; cs < shape.size(); ++cs) {\n    int unichar_id = shape[cs].unichar_id;\n    if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {\n      break; // Shape is not a subset of the merge.\n    }\n  }\n  for (cm1 = 0; cm1 < merge1.size(); ++cm1) {\n    int unichar_id1 = merge1[cm1].unichar_id;\n    if (!shape.ContainsUnichar(unichar_id1)) {\n      break; // Merge is not a subset of shape\n    }\n  }\n  for (cm2 = 0; cm2 < merge2.size(); ++cm2) {\n    int unichar_id2 = merge2[cm2].unichar_id;\n    if (!shape.ContainsUnichar(unichar_id2)) {\n      break; // Merge is not a subset of shape\n    }\n  }\n  return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());\n}\n\n// Returns true if the unichar sets are equal between the shapes.\nbool ShapeTable::EqualUnichars(unsigned shape_id1, unsigned shape_id2) const {\n  const Shape &shape1 = GetShape(shape_id1);\n  const Shape &shape2 = GetShape(shape_id2);\n  for (int c1 = 0; c1 < shape1.size(); ++c1) {\n    int unichar_id1 = shape1[c1].unichar_id;\n    if (!shape2.ContainsUnichar(unichar_id1)) {\n      return false;\n    }\n  }\n  for (int c2 = 0; c2 < shape2.size(); ++c2) {\n    int unichar_id2 = shape2[c2].unichar_id;\n    if (!shape1.ContainsUnichar(unichar_id2)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Returns true if the unichar sets are equal between the shapes.\nbool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const {\n  const Shape &merge1 = GetShape(merge_id1);\n  const Shape &merge2 = GetShape(merge_id2);\n  const Shape &shape = GetShape(shape_id);\n  for (int cs = 0; cs < shape.size(); ++cs) {\n    auto unichar_id 
= shape[cs].unichar_id;\n    if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {\n      return false; // Shape has a unichar that appears in neither merge.\n    }\n  }\n  for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {\n    auto unichar_id1 = merge1[cm1].unichar_id;\n    if (!shape.ContainsUnichar(unichar_id1)) {\n      return false; // Merge has a unichar that is not in shape.\n    }\n  }\n  for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {\n    auto unichar_id2 = merge2[cm2].unichar_id;\n    if (!shape.ContainsUnichar(unichar_id2)) {\n      return false; // Merge has a unichar that is not in shape.\n    }\n  }\n  return true;\n}\n\n// Returns true if there is a common unichar between the shapes.\nbool ShapeTable::CommonUnichars(unsigned shape_id1, unsigned shape_id2) const {\n  const Shape &shape1 = GetShape(shape_id1);\n  const Shape &shape2 = GetShape(shape_id2);\n  for (int c1 = 0; c1 < shape1.size(); ++c1) {\n    auto unichar_id1 = shape1[c1].unichar_id;\n    if (shape2.ContainsUnichar(unichar_id1)) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Returns true if there is a common font id between the shapes.\nbool ShapeTable::CommonFont(unsigned shape_id1, unsigned shape_id2) const {\n  const Shape &shape1 = GetShape(shape_id1);\n  const Shape &shape2 = GetShape(shape_id2);\n  for (int c1 = 0; c1 < shape1.size(); ++c1) {\n    const std::vector<int> &font_list1 = shape1[c1].font_ids;\n    for (int f : font_list1) {\n      if (shape2.ContainsFont(f)) {\n        return true;\n      }\n    }\n  }\n  return false;\n}\n\n// Appends the master shapes from other to this.\n// If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.\nvoid ShapeTable::AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map) {\n  if (shape_map != nullptr) {\n    shape_map->clear();\n    shape_map->resize(other.NumShapes(), -1);\n  }\n  for (unsigned s = 0; s < other.shape_table_.size(); ++s) {\n    if 
(other.shape_table_[s]->destination_index() < 0) {\n      int index = AddShape(*other.shape_table_[s]);\n      if (shape_map != nullptr) {\n        (*shape_map)[s] = index;\n      }\n    }\n  }\n}\n\n// Returns the number of master shapes remaining after merging.\nint ShapeTable::NumMasterShapes() const {\n  int num_shapes = 0;\n  for (auto s : shape_table_) {\n    if (s->destination_index() < 0) {\n      ++num_shapes;\n    }\n  }\n  return num_shapes;\n}\n\n// Adds the unichars of the given shape_id to the vector of results. Any\n// unichar_id that is already present just has the fonts added to the\n// font set for that result without adding a new entry in the vector.\n// NOTE: it is assumed that the results are given to this function in order\n// of decreasing rating.\n// The unichar_map vector indicates the index of the results entry containing\n// each unichar, or -1 if the unichar is not yet included in results.\nvoid ShapeTable::AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,\n                                   std::vector<UnicharRating> *results) const {\n  if (shape_rating.joined) {\n    AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, results);\n  }\n  if (shape_rating.broken) {\n    AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, results);\n  }\n  const Shape &shape = GetShape(shape_rating.shape_id);\n  for (int u = 0; u < shape.size(); ++u) {\n    int result_index =\n        AddUnicharToResults(shape[u].unichar_id, shape_rating.rating, unichar_map, results);\n    for (auto font_id : shape[u].font_ids) {\n      (*results)[result_index].fonts.emplace_back(font_id,\n                                                  IntCastRounded(shape_rating.rating * INT16_MAX));\n    }\n  }\n}\n\n// Adds the given unichar_id to the results if needed, updating unichar_map\n// and returning the index of unichar in results.\nint ShapeTable::AddUnicharToResults(int unichar_id, float rating, 
std::vector<int> *unichar_map,\n                                    std::vector<UnicharRating> *results) const {\n  int result_index = unichar_map->at(unichar_id);\n  if (result_index < 0) {\n    UnicharRating result(unichar_id, rating);\n    result_index = results->size();\n    results->push_back(result);\n    (*unichar_map)[unichar_id] = result_index;\n  }\n  return result_index;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/shapetable.h",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        shapetable.h\n// Description: Class to map a classifier shape index to unicharset\n//              indices and font indices.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_\n#define TESSERACT_CLASSIFY_SHAPETABLE_H_\n\n#include \"bitvector.h\"\n#include \"fontinfo.h\"\n#include \"genericheap.h\"\n#include \"intmatcher.h\"\n#include \"tesserrstream.h\"  // for tesserr\n\nnamespace tesseract {\n\nclass UNICHARSET;\nclass ShapeTable;\n\n// Simple struct to hold a single classifier unichar selection, a corresponding\n// rating, and a list of appropriate fonts.\nstruct UnicharRating {\n  UnicharRating() : unichar_id(0), rating(0.0f), adapted(false), config(0), feature_misses(0) {}\n  UnicharRating(int u, float r)\n      : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}\n\n  // Print debug info.\n  void Print() const {\n    tesserr << \"Unichar-id=\" << unichar_id << \", rating=\" << rating\n            << \", adapted=\" << adapted << \", config=\" << config\n            << \", misses=\" << feature_misses << \", \"\n            << fonts.size() << \" fonts\\n\";\n  
}\n\n  // Helper function to get the index of the first result with the required\n  // unichar_id. If the results are sorted by rating, this will also be the\n  // best result with the required unichar_id.\n  // Returns -1 if the unichar_id is not found\n  static int FirstResultWithUnichar(const std::vector<UnicharRating> &results,\n                                    UNICHAR_ID unichar_id);\n\n  // Index into some UNICHARSET table indicates the class of the answer.\n  UNICHAR_ID unichar_id;\n  // Rating from classifier with 1.0 perfect and 0.0 impossible.\n  // Call it a probability if you must.\n  float rating;\n  // True if this result is from the adaptive classifier.\n  bool adapted;\n  // Index of best matching font configuration of result.\n  uint8_t config;\n  // Number of features that were total misses - were liked by no classes.\n  uint16_t feature_misses;\n  // Unsorted collection of fontinfo ids and scores. Note that a raw result\n  // from the IntegerMatch will contain config ids, that require transforming\n  // to fontinfo ids via fontsets and (possibly) shapetable.\n  std::vector<ScoredFont> fonts;\n};\n\n// Classifier result from a low-level classification is an index into some\n// ShapeTable and a rating.\nstruct ShapeRating {\n  ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), joined(false), broken(false) {}\n  ShapeRating(int s, float r)\n      : shape_id(s), rating(r), raw(1.0f), font(0.0f), joined(false), broken(false) {}\n\n  // Helper function to get the index of the first result with the required\n  // unichar_id. 
If the results are sorted by rating, this will also be the\n  // best result with the required unichar_id.\n  // Returns -1 if the unichar_id is not found\n  static int FirstResultWithUnichar(const std::vector<ShapeRating> &results,\n                                    const ShapeTable &shape_table, UNICHAR_ID unichar_id);\n\n  // Index into some shape table indicates the class of the answer.\n  int shape_id;\n  // Rating from classifier with 1.0 perfect and 0.0 impossible.\n  // Call it a probability if you must.\n  float rating;\n  // Subsidiary rating that a classifier may use internally.\n  float raw;\n  // Subsidiary rating that a classifier may use internally.\n  float font;\n  // Flag indicating that the input may be joined.\n  bool joined;\n  // Flag indicating that the input may be broken (a fragment).\n  bool broken;\n};\n\n// Simple struct to hold an entry for a heap-based priority queue of\n// ShapeRating.\nstruct ShapeQueueEntry {\n  ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {}\n  ShapeQueueEntry(const ShapeRating &rating, int level0) : result(rating), level(level0) {}\n\n  // Sort by decreasing rating and decreasing level for equal rating.\n  bool operator<(const ShapeQueueEntry &other) const {\n    if (result.rating > other.result.rating) {\n      return true;\n    }\n    if (result.rating == other.result.rating) {\n      return level > other.level;\n    }\n    return false;\n  }\n\n  // Output from classifier.\n  ShapeRating result;\n  // Which level in the tree did this come from?\n  int level;\n};\nusing ShapeQueue = GenericHeap<ShapeQueueEntry>;\n\n// Simple struct to hold a set of fonts associated with a single unichar-id.\n// A vector of UnicharAndFonts makes a shape.\nstruct UnicharAndFonts {\n  UnicharAndFonts() : unichar_id(0) {}\n  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {\n    font_ids.push_back(font_id);\n  }\n\n  // Writes to the given file. 
Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp);\n\n  // Sort function to sort a pair of UnicharAndFonts by unichar_id.\n  static int SortByUnicharId(const void *v1, const void *v2);\n  static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2);\n\n  std::vector<int32_t> font_ids;\n  int32_t unichar_id;\n};\n\n// A Shape is a collection of unichar-ids and a list of fonts associated with\n// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is\n// a classifiable unit, and represents a group of characters or parts of\n// characters that have a similar or identical shape. Shapes/ShapeTables may\n// be organized hierarchically from identical shapes at the leaves to vaguely\n// similar shapes near the root.\nclass TESS_API Shape {\npublic:\n  Shape() : destination_index_(-1) {}\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp);\n\n  int destination_index() const {\n    return destination_index_;\n  }\n  void set_destination_index(int index) {\n    destination_index_ = index;\n  }\n  int size() const {\n    return unichars_.size();\n  }\n  // Returns a UnicharAndFonts entry for the given index, which must be\n  // in the range [0, size()).\n  const UnicharAndFonts &operator[](int index) const {\n    return unichars_[index];\n  }\n  // Sets the unichar_id of the given index to the new unichar_id.\n  void SetUnicharId(int index, int unichar_id) {\n    unichars_[index].unichar_id = unichar_id;\n  }\n  // Adds a font_id for the given unichar_id. 
If the unichar_id is not\n  // in the shape, it is added.\n  void AddToShape(int unichar_id, int font_id);\n  // Adds everything in other to this.\n  void AddShape(const Shape &other);\n  // Returns true if the shape contains the given unichar_id, font_id pair.\n  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;\n  // Returns true if the shape contains the given unichar_id, ignoring font.\n  bool ContainsUnichar(int unichar_id) const;\n  // Returns true if the shape contains the given font, ignoring unichar_id.\n  bool ContainsFont(int font_id) const;\n  // Returns true if the shape contains the given font properties, ignoring\n  // unichar_id.\n  bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const;\n  // Returns true if the shape contains multiple different font properties,\n  // ignoring unichar_id.\n  bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const;\n  // Returns true if this shape is equal to other (ignoring order of unichars\n  // and fonts).\n  bool operator==(const Shape &other) const;\n  // Returns true if this is a subset (including equal) of other.\n  bool IsSubsetOf(const Shape &other) const;\n  // Returns true if the lists of unichar ids are the same in this and other,\n  // ignoring fonts.\n  // NOT const, as it will sort the unichars on demand.\n  bool IsEqualUnichars(Shape *other);\n\nprivate:\n  // Sorts the unichars_ vector by unichar.\n  void SortUnichars();\n\n  // Flag indicates that the unichars are sorted, allowing faster set\n  // operations with another shape.\n  bool unichars_sorted_ = false;\n  // If this Shape is part of a ShapeTable the destination_index_ is the index\n  // of some other shape in the ShapeTable with which this shape is merged.\n  int destination_index_ = 0;\n  // Array of unichars, each with a set of fonts. 
Each unichar has at most\n  // one entry in the vector.\n  std::vector<UnicharAndFonts> unichars_;\n};\n\n// ShapeTable is a class to encapsulate the triple indirection that is\n// used here.\n// ShapeTable is a vector of shapes.\n// Each shape is a vector of UnicharAndFonts representing the set of unichars\n// that the shape represents.\n// Each UnicharAndFonts also lists the fonts of the unichar_id that were\n// mapped to the shape during training.\nclass TESS_API ShapeTable {\npublic:\n  ShapeTable();\n  // The UNICHARSET reference supplied here, or in set_unicharset below must\n  // exist for the entire life of the ShapeTable. It is used only by DebugStr.\n  explicit ShapeTable(const UNICHARSET &unicharset);\n  ~ShapeTable() {\n    for (auto data : shape_table_) {\n      delete data;\n    }\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp);\n\n  // Accessors.\n  unsigned NumShapes() const {\n    return shape_table_.size();\n  }\n  const UNICHARSET &unicharset() const {\n    return *unicharset_;\n  }\n  // Returns the number of fonts used in this ShapeTable, computing it if\n  // necessary.\n  int NumFonts() const;\n  // Shapetable takes a pointer to the UNICHARSET, so it must persist for the\n  // entire life of the ShapeTable.\n  void set_unicharset(const UNICHARSET &unicharset) {\n    unicharset_ = &unicharset;\n  }\n  // Re-indexes the class_ids in the shapetable according to the given map.\n  // Useful in conjunction with set_unicharset.\n  void ReMapClassIds(const std::vector<int> &unicharset_map);\n  // Returns a string listing the classes/fonts in a shape.\n  std::string DebugStr(unsigned shape_id) const;\n  // Returns a debug string summarizing the table.\n  std::string SummaryStr() const;\n\n  // Adds a new shape starting with the given unichar_id and font_id.\n  // Returns the assigned index.\n  
unsigned AddShape(int unichar_id, int font_id);\n  // Adds a copy of the given shape unless it is already present.\n  // Returns the assigned index or index of existing shape if already present.\n  unsigned AddShape(const Shape &other);\n  // Removes the shape given by the shape index. All indices above are changed!\n  void DeleteShape(unsigned shape_id);\n  // Adds a font_id to the given existing shape index for the given\n  // unichar_id. If the unichar_id is not in the shape, it is added.\n  void AddToShape(unsigned shape_id, int unichar_id, int font_id);\n  // Adds the given shape to the existing shape with the given index.\n  void AddShapeToShape(unsigned shape_id, const Shape &other);\n  // Returns the id of the shape that contains the given unichar and font.\n  // If not found, returns -1.\n  // If font_id < 0, the font_id is ignored and the first shape that matches\n  // the unichar_id is returned.\n  int FindShape(int unichar_id, int font_id) const;\n  // Returns the first unichar_id and font_id in the given shape.\n  void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const;\n\n  // Accessors for the Shape with the given shape_id.\n  const Shape &GetShape(unsigned shape_id) const {\n    return *shape_table_[shape_id];\n  }\n  Shape *MutableShape(unsigned shape_id) {\n    return shape_table_[shape_id];\n  }\n\n  // Expands all the classes/fonts in the shape individually to build\n  // a ShapeTable.\n  int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes);\n\n  // Returns true if the shapes are already merged.\n  bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const;\n  // Returns true if any shape contains multiple unichars.\n  bool AnyMultipleUnichars() const;\n  // Returns the maximum number of unichars over all shapes.\n  int MaxNumUnichars() const;\n  // Merges shapes with a common unichar over the [start, end) interval.\n  // Assumes single unichar per shape.\n  void ForceFontMerges(unsigned 
start, unsigned end);\n  // Returns the number of unichars in the master shape.\n  unsigned MasterUnicharCount(unsigned shape_id) const;\n  // Returns the sum of the font counts in the master shape.\n  int MasterFontCount(unsigned shape_id) const;\n  // Returns the number of unichars that would result from merging the shapes.\n  int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const;\n  // Merges two shape_ids, leaving shape_id2 marked as merged.\n  void MergeShapes(unsigned shape_id1, unsigned shape_id2);\n  // Swaps two shape_ids.\n  void SwapShapes(unsigned shape_id1, unsigned shape_id2);\n  // Appends the master shapes from other to this.\n  // Used to create a clean ShapeTable from a merged one, or to create a\n  // copy of a ShapeTable.\n  // If not nullptr, shape_map is set to map other shape_ids to this's\n  // shape_ids.\n  void AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map);\n  // Returns the number of master shapes remaining after merging.\n  int NumMasterShapes() const;\n  // Returns the destination of this shape, (if merged), taking into account\n  // the fact that the destination may itself have been merged.\n  // For a non-merged shape, returns the input shape_id.\n  unsigned MasterDestinationIndex(unsigned shape_id) const;\n\n  // Returns false if the unichars in neither shape is a subset of the other.\n  bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const;\n  // Returns false if the unichars in neither shape is a subset of the other.\n  bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const;\n  // Returns true if the unichar sets are equal between the shapes.\n  bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const;\n  bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const;\n  // Returns true if there is a common unichar between the shapes.\n  bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const;\n  // Returns true if there 
is a common font id between the shapes.\n  bool CommonFont(unsigned shape_id1, unsigned shape_id2) const;\n\n  // Adds the unichars of the given shape_id to the vector of results. Any\n  // unichar_id that is already present just has the fonts added to the\n  // font set for that result without adding a new entry in the vector.\n  // NOTE: it is assumed that the results are given to this function in order\n  // of decreasing rating.\n  // The unichar_map vector indicates the index of the results entry containing\n  // each unichar, or -1 if the unichar is not yet included in results.\n  void AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,\n                         std::vector<UnicharRating> *results) const;\n\nprivate:\n  // Adds the given unichar_id to the results if needed, updating unichar_map\n  // and returning the index of unichar in results.\n  int AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map,\n                          std::vector<UnicharRating> *results) const;\n\n  // Pointer to a provided unicharset used only by the DebugStr member.\n  const UNICHARSET *unicharset_;\n  // Vector of pointers to the Shapes in this ShapeTable.\n  std::vector<Shape *> shape_table_;\n\n  // Cached data calculated on demand.\n  mutable int num_fonts_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_\n"
  },
  {
    "path": "src/classify/tessclassifier.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tessclassifier.cpp\n// Description: Tesseract implementation of a ShapeClassifier.\n// Author:      Ray Smith\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"tessclassifier.h\"\n\n#include \"classify.h\"\n#include \"trainingsample.h\"\n\nnamespace tesseract {\n\n// Classifies the given [training] sample, writing to results.\n// See ShapeClassifier for a full description.\nint TessClassifier::UnicharClassifySample(const TrainingSample &sample, Image page_pix, int debug,\n                                          UNICHAR_ID keep_this,\n                                          std::vector<UnicharRating> *results) {\n  const int old_matcher_level = classify_->matcher_debug_level;\n  const int old_matcher_flags = classify_->matcher_debug_flags;\n  const int old_classify_level = classify_->classify_debug_level;\n  if (debug) {\n    // Explicitly set values of various control parameters to generate debug\n    // output if required, restoring the old values after classifying.\n    classify_->matcher_debug_level.set_value(2);\n    classify_->matcher_debug_flags.set_value(25);\n    classify_->classify_debug_level.set_value(3);\n  }\n  classify_->CharNormTrainingSample(pruner_only_, keep_this, sample, results);\n  if (debug) {\n    
classify_->matcher_debug_level.set_value(old_matcher_level);\n    classify_->matcher_debug_flags.set_value(old_matcher_flags);\n    classify_->classify_debug_level.set_value(old_classify_level);\n  }\n  return results->size();\n}\n\n// Provides access to the ShapeTable that this classifier works with.\nconst ShapeTable *TessClassifier::GetShapeTable() const {\n  return classify_->shape_table();\n}\n// Provides access to the UNICHARSET that this classifier works with.\n// Only needs to be overridden if GetShapeTable() can return nullptr.\nconst UNICHARSET &TessClassifier::GetUnicharset() const {\n  return classify_->unicharset;\n}\n\n// Displays classification as the given shape_id. Creates as many windows\n// as it feels fit, using index as a guide for placement. Adds any created\n// windows to the windows output and returns a new index that may be used\n// by any subsequent classifiers. Caller waits for the user to view and\n// then destroys the windows by clearing the vector.\nint TessClassifier::DisplayClassifyAs(const TrainingSample &sample, Image page_pix, int unichar_id,\n                                      int index, std::vector<ScrollView *> &windows) {\n  int shape_id = unichar_id;\n  // TODO(rays) Fix this so it works with both flat and real shapetables.\n  //  if (GetShapeTable() != nullptr)\n  //  shape_id = BestShapeForUnichar(sample, page_pix, unichar_id, nullptr);\n  if (shape_id < 0) {\n    return index;\n  }\n  if (UnusedClassIdIn(classify_->PreTrainedTemplates, shape_id)) {\n    tprintf(\"No built-in templates for class/shape %d\\n\", shape_id);\n    return index;\n  }\n#ifndef GRAPHICS_DISABLED\n  classify_->ShowBestMatchFor(shape_id, sample.features(), sample.num_features());\n#endif\n  return index;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/tessclassifier.h",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        tessclassifier.h\n// Description: Tesseract implementation of a ShapeClassifier.\n// Author:      Ray Smith\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_\n#define THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_\n\n#include \"shapeclassifier.h\"\n\nnamespace tesseract {\n\nclass Classify;\nclass TrainingSample;\n\n// Tesseract implementation of a ShapeClassifier.\n// Due to limitations in the content of TrainingSample, this currently\n// only works for the static classifier and only works if the ShapeTable\n// in classify is not nullptr.\nclass TESS_API TessClassifier : public ShapeClassifier {\npublic:\n  TessClassifier(bool pruner_only, tesseract::Classify *classify)\n      : pruner_only_(pruner_only), classify_(classify) {}\n  ~TessClassifier() override = default;\n\n  // Classifies the given [training] sample, writing to results.\n  // See ShapeClassifier for a full description.\n  int UnicharClassifySample(const TrainingSample &sample, Image page_pix, int debug,\n                            UNICHAR_ID keep_this, std::vector<UnicharRating> *results) override;\n  // Provides access to the 
ShapeTable that this classifier works with.\n  const ShapeTable *GetShapeTable() const override;\n  // Provides access to the UNICHARSET that this classifier works with.\n  // Only needs to be overridden if GetShapeTable() can return nullptr.\n  const UNICHARSET &GetUnicharset() const override;\n\n  // Displays classification as the given shape_id. Creates as many windows\n  // as it feels fit, using index as a guide for placement. Adds any created\n  // windows to the windows output and returns a new index that may be used\n  // by any subsequent classifiers. Caller waits for the user to view and\n  // then destroys the windows by clearing the vector.\n  int DisplayClassifyAs(const TrainingSample &sample, Image page_pix, int unichar_id, int index,\n                        std::vector<ScrollView *> &windows) override;\n\nprivate:\n  // Indicates that this classifier is to use just the ClassPruner, or the\n  // full classifier if false.\n  bool pruner_only_;\n  // Borrowed pointer to the actual Tesseract classifier.\n  tesseract::Classify *classify_;\n};\n\n} // namespace tesseract\n\n#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_ */\n"
  },
  {
    "path": "src/classify/trainingsample.cpp",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#define _USE_MATH_DEFINES // for M_PI\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"trainingsample.h\"\n\n#include \"helpers.h\"\n#include \"intfeaturespace.h\"\n#include \"normfeat.h\"\n#include \"shapetable.h\"\n\n#include <allheaders.h>\n\n#include <cmath> // for M_PI\n\nnamespace tesseract {\n\n// Center of randomizing operations.\nconst int kRandomizingCenter = 128;\n\n// Randomizing factors.\nconst int TrainingSample::kYShiftValues[kSampleYShiftSize] = {6, 3, -3, -6, 0};\nconst double TrainingSample::kScaleValues[kSampleScaleSize] = {1.0625, 0.9375, 1.0};\n\nTrainingSample::~TrainingSample() {\n  delete[] features_;\n  delete[] micro_features_;\n}\n\n// WARNING! Serialize/DeSerialize do not save/restore the \"cache\" data\n// members, which is mostly the mapped features, and the weight.\n// It is assumed these can all be reconstructed from what is saved.\n// Writes to the given file. 
Returns false in case of error.\nbool TrainingSample::Serialize(FILE *fp) const {\n  if (fwrite(&class_id_, sizeof(class_id_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&font_id_, sizeof(font_id_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&page_num_, sizeof(page_num_), 1, fp) != 1) {\n    return false;\n  }\n  if (!bounding_box_.Serialize(fp)) {\n    return false;\n  }\n  if (fwrite(&num_features_, sizeof(num_features_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&outline_length_, sizeof(outline_length_), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_) {\n    return false;\n  }\n  if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_, fp) !=\n      num_micro_features_) {\n    return false;\n  }\n  if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) {\n    return false;\n  }\n  if (fwrite(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount) {\n    return false;\n  }\n  return true;\n}\n\n// Creates from the given file. Returns nullptr in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nTrainingSample *TrainingSample::DeSerializeCreate(bool swap, FILE *fp) {\n  auto *sample = new TrainingSample;\n  if (sample->DeSerialize(swap, fp)) {\n    return sample;\n  }\n  delete sample;\n  return nullptr;\n}\n\n// Reads from the given file. 
Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool TrainingSample::DeSerialize(bool swap, FILE *fp) {\n  if (fread(&class_id_, sizeof(class_id_), 1, fp) != 1) {\n    return false;\n  }\n  if (fread(&font_id_, sizeof(font_id_), 1, fp) != 1) {\n    return false;\n  }\n  if (fread(&page_num_, sizeof(page_num_), 1, fp) != 1) {\n    return false;\n  }\n  if (!bounding_box_.DeSerialize(swap, fp)) {\n    return false;\n  }\n  if (fread(&num_features_, sizeof(num_features_), 1, fp) != 1) {\n    return false;\n  }\n  if (fread(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1) {\n    return false;\n  }\n  if (fread(&outline_length_, sizeof(outline_length_), 1, fp) != 1) {\n    return false;\n  }\n  if (swap) {\n    ReverseN(&class_id_, sizeof(class_id_));\n    ReverseN(&num_features_, sizeof(num_features_));\n    ReverseN(&num_micro_features_, sizeof(num_micro_features_));\n    ReverseN(&outline_length_, sizeof(outline_length_));\n  }\n  // Arbitrarily limit the number of elements to protect against bad data.\n  if (num_features_ > UINT16_MAX) {\n    return false;\n  }\n  if (num_micro_features_ > UINT16_MAX) {\n    return false;\n  }\n  delete[] features_;\n  features_ = new INT_FEATURE_STRUCT[num_features_];\n  if (fread(features_, sizeof(*features_), num_features_, fp) != num_features_) {\n    return false;\n  }\n  delete[] micro_features_;\n  micro_features_ = new MicroFeature[num_micro_features_];\n  if (fread(micro_features_, sizeof(*micro_features_), num_micro_features_, fp) !=\n      num_micro_features_) {\n    return false;\n  }\n  if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) {\n    return false;\n  }\n  if (fread(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount) {\n    return false;\n  }\n  return true;\n}\n\n// Saves the given features into a TrainingSample.\nTrainingSample *TrainingSample::CopyFromFeatures(const INT_FX_RESULT_STRUCT &fx_info,\n       
                                          const TBOX &bounding_box,\n                                                 const INT_FEATURE_STRUCT *features,\n                                                 int num_features) {\n  auto *sample = new TrainingSample;\n  sample->num_features_ = num_features;\n  sample->features_ = new INT_FEATURE_STRUCT[num_features];\n  sample->outline_length_ = fx_info.Length;\n  memcpy(sample->features_, features, num_features * sizeof(features[0]));\n  sample->geo_feature_[GeoBottom] = bounding_box.bottom();\n  sample->geo_feature_[GeoTop] = bounding_box.top();\n  sample->geo_feature_[GeoWidth] = bounding_box.width();\n\n  // Generate the cn_feature_ from the fx_info.\n  sample->cn_feature_[CharNormY] = MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);\n  sample->cn_feature_[CharNormLength] = MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;\n  sample->cn_feature_[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;\n  sample->cn_feature_[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;\n\n  sample->features_are_indexed_ = false;\n  sample->features_are_mapped_ = false;\n  return sample;\n}\n\n// Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining.\nFEATURE_STRUCT *TrainingSample::GetCNFeature() const {\n  auto feature = new FEATURE_STRUCT(&CharNormDesc);\n  for (int i = 0; i < kNumCNParams; ++i) {\n    feature->Params[i] = cn_feature_[i];\n  }\n  return feature;\n}\n\n// Constructs and returns a copy randomized by the method given by\n// the randomizer index. 
If index is out of [0, kSampleRandomSize) then\n// an exact copy is returned.\nTrainingSample *TrainingSample::RandomizedCopy(int index) const {\n  TrainingSample *sample = Copy();\n  if (index >= 0 && index < kSampleRandomSize) {\n    ++index; // Remove the first combination.\n    const int yshift = kYShiftValues[index / kSampleScaleSize];\n    double scaling = kScaleValues[index % kSampleScaleSize];\n    for (uint32_t i = 0; i < num_features_; ++i) {\n      double result = (features_[i].X - kRandomizingCenter) * scaling;\n      result += kRandomizingCenter;\n      sample->features_[i].X = ClipToRange<int>(result + 0.5, 0, UINT8_MAX);\n      result = (features_[i].Y - kRandomizingCenter) * scaling;\n      result += kRandomizingCenter + yshift;\n      sample->features_[i].Y = ClipToRange<int>(result + 0.5, 0, UINT8_MAX);\n    }\n  }\n  return sample;\n}\n\n// Constructs and returns an exact copy.\nTrainingSample *TrainingSample::Copy() const {\n  auto *sample = new TrainingSample;\n  sample->class_id_ = class_id_;\n  sample->font_id_ = font_id_;\n  sample->weight_ = weight_;\n  sample->sample_index_ = sample_index_;\n  sample->num_features_ = num_features_;\n  if (num_features_ > 0) {\n    sample->features_ = new INT_FEATURE_STRUCT[num_features_];\n    memcpy(sample->features_, features_, num_features_ * sizeof(features_[0]));\n  }\n  sample->num_micro_features_ = num_micro_features_;\n  if (num_micro_features_ > 0) {\n    sample->micro_features_ = new MicroFeature[num_micro_features_];\n    memcpy(sample->micro_features_, micro_features_,\n           num_micro_features_ * sizeof(micro_features_[0]));\n  }\n  memcpy(sample->cn_feature_, cn_feature_, sizeof(*cn_feature_) * kNumCNParams);\n  memcpy(sample->geo_feature_, geo_feature_, sizeof(*geo_feature_) * GeoCount);\n  return sample;\n}\n\n// Extracts the needed information from the CHAR_DESC_STRUCT.\nvoid TrainingSample::ExtractCharDesc(int int_feature_type, int micro_type, int cn_type,\n                           
          int geo_type, CHAR_DESC_STRUCT *char_desc) {\n  // Extract the INT features.\n  delete[] features_;\n  FEATURE_SET_STRUCT *char_features = char_desc->FeatureSets[int_feature_type];\n  if (char_features == nullptr) {\n    tprintf(\"Error: no features to train on of type %s\\n\", kIntFeatureType);\n    num_features_ = 0;\n    features_ = nullptr;\n  } else {\n    num_features_ = char_features->NumFeatures;\n    features_ = new INT_FEATURE_STRUCT[num_features_];\n    for (uint32_t f = 0; f < num_features_; ++f) {\n      features_[f].X = static_cast<uint8_t>(char_features->Features[f]->Params[IntX]);\n      features_[f].Y = static_cast<uint8_t>(char_features->Features[f]->Params[IntY]);\n      features_[f].Theta = static_cast<uint8_t>(char_features->Features[f]->Params[IntDir]);\n      features_[f].CP_misses = 0;\n    }\n  }\n  // Extract the Micro features.\n  delete[] micro_features_;\n  char_features = char_desc->FeatureSets[micro_type];\n  if (char_features == nullptr) {\n    tprintf(\"Error: no features to train on of type %s\\n\", kMicroFeatureType);\n    num_micro_features_ = 0;\n    micro_features_ = nullptr;\n  } else {\n    num_micro_features_ = char_features->NumFeatures;\n    micro_features_ = new MicroFeature[num_micro_features_];\n    for (uint32_t f = 0; f < num_micro_features_; ++f) {\n      for (int d = 0; d < (int)MicroFeatureParameter::MFCount; ++d) {\n        micro_features_[f][d] = char_features->Features[f]->Params[d];\n      }\n    }\n  }\n  // Extract the CN feature.\n  char_features = char_desc->FeatureSets[cn_type];\n  if (char_features == nullptr) {\n    tprintf(\"Error: no CN feature to train on.\\n\");\n  } else {\n    ASSERT_HOST(char_features->NumFeatures == 1);\n    cn_feature_[CharNormY] = char_features->Features[0]->Params[CharNormY];\n    cn_feature_[CharNormLength] = char_features->Features[0]->Params[CharNormLength];\n    cn_feature_[CharNormRx] = char_features->Features[0]->Params[CharNormRx];\n    cn_feature_[CharNormRy] 
= char_features->Features[0]->Params[CharNormRy];\n  }\n  // Extract the Geo feature.\n  char_features = char_desc->FeatureSets[geo_type];\n  if (char_features == nullptr) {\n    tprintf(\"Error: no Geo feature to train on.\\n\");\n  } else {\n    ASSERT_HOST(char_features->NumFeatures == 1);\n    geo_feature_[GeoBottom] = char_features->Features[0]->Params[GeoBottom];\n    geo_feature_[GeoTop] = char_features->Features[0]->Params[GeoTop];\n    geo_feature_[GeoWidth] = char_features->Features[0]->Params[GeoWidth];\n  }\n  features_are_indexed_ = false;\n  features_are_mapped_ = false;\n}\n\n// Sets the mapped_features_ from the features_ using the provided\n// feature_space to the indexed versions of the features.\nvoid TrainingSample::IndexFeatures(const IntFeatureSpace &feature_space) {\n  std::vector<int> indexed_features;\n  feature_space.IndexAndSortFeatures(features_, num_features_, &mapped_features_);\n  features_are_indexed_ = true;\n  features_are_mapped_ = false;\n}\n\n// Returns a pix representing the sample. 
(Int features only.)\nImage TrainingSample::RenderToPix(const UNICHARSET *unicharset) const {\n  Image pix = pixCreate(kIntFeatureExtent, kIntFeatureExtent, 1);\n  for (uint32_t f = 0; f < num_features_; ++f) {\n    int start_x = features_[f].X;\n    int start_y = kIntFeatureExtent - features_[f].Y;\n    double dx = cos((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI);\n    double dy = -sin((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI);\n    for (int i = 0; i <= 5; ++i) {\n      int x = static_cast<int>(start_x + dx * i);\n      int y = static_cast<int>(start_y + dy * i);\n      if (x >= 0 && x < 256 && y >= 0 && y < 256) {\n        pixSetPixel(pix, x, y, 1);\n      }\n    }\n  }\n  if (unicharset != nullptr) {\n    pixSetText(pix, unicharset->id_to_unichar(class_id_));\n  }\n  return pix;\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Displays the features in the given window with the given color.\nvoid TrainingSample::DisplayFeatures(ScrollView::Color color, ScrollView *window) const {\n  for (uint32_t f = 0; f < num_features_; ++f) {\n    RenderIntFeature(window, &features_[f], color);\n  }\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Returns a pix of the original sample image. 
The pix is padded all round\n// by padding wherever possible.\n// The returned Pix must be pixDestroyed after use.\n// If the input page_pix is nullptr, nullptr is returned.\nImage TrainingSample::GetSamplePix(int padding, Image page_pix) const {\n  if (page_pix == nullptr) {\n    return nullptr;\n  }\n  int page_width = pixGetWidth(page_pix);\n  int page_height = pixGetHeight(page_pix);\n  TBOX padded_box = bounding_box();\n  padded_box.pad(padding, padding);\n  // Clip the padded_box to the limits of the page\n  TBOX page_box(0, 0, page_width, page_height);\n  padded_box &= page_box;\n  Box *box =\n      boxCreate(page_box.left(), page_height - page_box.top(), page_box.width(), page_box.height());\n  Image sample_pix = pixClipRectangle(page_pix, box, nullptr);\n  boxDestroy(&box);\n  return sample_pix;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/classify/trainingsample.h",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TRAINING_TRAININGSAMPLE_H_\n#define TESSERACT_TRAINING_TRAININGSAMPLE_H_\n\n#include \"elst.h\"\n#include \"featdefs.h\"\n#include \"intfx.h\"\n#include \"intmatcher.h\"\n#include \"matrix.h\"\n#include \"mf.h\"\n#include \"mfdefs.h\"\n#include \"picofeat.h\"\n#include \"shapetable.h\"\n#include \"unicharset.h\"\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass IntFeatureMap;\nclass IntFeatureSpace;\nclass ShapeTable;\n\n// Number of elements of cn_feature_.\nstatic const int kNumCNParams = 4;\n// Number of ways to shift the features when randomizing.\nstatic const int kSampleYShiftSize = 5;\n// Number of ways to scale the features when randomizing.\nstatic const int kSampleScaleSize = 3;\n// Total number of different ways to manipulate the features when randomizing.\n// The first and last combinations are removed to avoid an excessive\n// top movement (first) and an identity transformation (last).\n// WARNING: To avoid patterned duplication of samples, be sure to keep\n// kSampleRandomSize prime!\n// Eg with current values (kSampleYShiftSize = 5 and TkSampleScaleSize = 3)\n// kSampleRandomSize is 13, which is prime.\nstatic const int kSampleRandomSize = kSampleYShiftSize * kSampleScaleSize - 2;\n// 
ASSERT_IS_PRIME(kSampleRandomSize) !!\n\nclass TESS_API TrainingSample : public ELIST<TrainingSample>::LINK {\npublic:\n  TrainingSample()\n      : class_id_(INVALID_UNICHAR_ID)\n      , font_id_(0)\n      , page_num_(0)\n      , num_features_(0)\n      , num_micro_features_(0)\n      , outline_length_(0)\n      , features_(nullptr)\n      , micro_features_(nullptr)\n      , weight_(1.0)\n      , max_dist_(0.0)\n      , sample_index_(0)\n      , features_are_indexed_(false)\n      , features_are_mapped_(false)\n      , is_error_(false) {}\n  ~TrainingSample();\n\n  // Saves the given features into a TrainingSample. The features are copied,\n  // so may be deleted afterwards. Delete the return value after use.\n  static TrainingSample *CopyFromFeatures(const INT_FX_RESULT_STRUCT &fx_info,\n                                          const TBOX &bounding_box,\n                                          const INT_FEATURE_STRUCT *features, int num_features);\n  // Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining.\n  FEATURE_STRUCT *GetCNFeature() const;\n  // Constructs and returns a copy \"randomized\" by the method given by\n  // the randomizer index. If index is out of [0, kSampleRandomSize) then\n  // an exact copy is returned.\n  TrainingSample *RandomizedCopy(int index) const;\n  // Constructs and returns an exact copy.\n  TrainingSample *Copy() const;\n\n  // WARNING! Serialize/DeSerialize do not save/restore the \"cache\" data\n  // members, which is mostly the mapped features, and the weight.\n  // It is assumed these can all be reconstructed from what is saved.\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Creates from the given file. Returns nullptr in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  static TrainingSample *DeSerializeCreate(bool swap, FILE *fp);\n  // Reads from the given file. 
Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp);\n\n  // Extracts the needed information from the CHAR_DESC_STRUCT.\n  void ExtractCharDesc(int feature_type, int micro_type, int cn_type, int geo_type,\n                       CHAR_DESC_STRUCT *char_desc);\n\n  // Sets the mapped_features_ from the features_ using the provided\n  // feature_space to the indexed versions of the features.\n  void IndexFeatures(const IntFeatureSpace &feature_space);\n\n  // Returns a pix representing the sample. (Int features only.)\n  Image RenderToPix(const UNICHARSET *unicharset) const;\n  // Displays the features in the given window with the given color.\n  void DisplayFeatures(ScrollView::Color color, ScrollView *window) const;\n\n  // Returns a pix of the original sample image. The pix is padded all round\n  // by padding wherever possible.\n  // The returned Pix must be pixDestroyed after use.\n  // If the input page_pix is nullptr, nullptr is returned.\n  Image GetSamplePix(int padding, Image page_pix) const;\n\n  // Accessors.\n  UNICHAR_ID class_id() const {\n    return class_id_;\n  }\n  void set_class_id(int id) {\n    class_id_ = id;\n  }\n  int font_id() const {\n    return font_id_;\n  }\n  void set_font_id(int id) {\n    font_id_ = id;\n  }\n  int page_num() const {\n    return page_num_;\n  }\n  void set_page_num(int page) {\n    page_num_ = page;\n  }\n  const TBOX &bounding_box() const {\n    return bounding_box_;\n  }\n  void set_bounding_box(const TBOX &box) {\n    bounding_box_ = box;\n  }\n  uint32_t num_features() const {\n    return num_features_;\n  }\n  const INT_FEATURE_STRUCT *features() const {\n    return features_;\n  }\n  uint32_t num_micro_features() const {\n    return num_micro_features_;\n  }\n  const MicroFeature *micro_features() const {\n    return micro_features_;\n  }\n  int outline_length() const {\n    return outline_length_;\n  }\n  float 
cn_feature(int index) const {\n    return cn_feature_[index];\n  }\n  int geo_feature(int index) const {\n    return geo_feature_[index];\n  }\n  double weight() const {\n    return weight_;\n  }\n  void set_weight(double value) {\n    weight_ = value;\n  }\n  double max_dist() const {\n    return max_dist_;\n  }\n  void set_max_dist(double value) {\n    max_dist_ = value;\n  }\n  int sample_index() const {\n    return sample_index_;\n  }\n  void set_sample_index(int value) {\n    sample_index_ = value;\n  }\n  bool features_are_mapped() const {\n    return features_are_mapped_;\n  }\n  const std::vector<int> &mapped_features() const {\n    ASSERT_HOST(features_are_mapped_);\n    return mapped_features_;\n  }\n  const std::vector<int> &indexed_features() const {\n    ASSERT_HOST(features_are_indexed_);\n    return mapped_features_;\n  }\n  bool is_error() const {\n    return is_error_;\n  }\n  void set_is_error(bool value) {\n    is_error_ = value;\n  }\n\nprivate:\n  // Unichar id that this sample represents. There obviously must be a\n  // reference UNICHARSET somewhere. Usually in TrainingSampleSet.\n  UNICHAR_ID class_id_;\n  // Font id in which this sample was printed. Refers to a fontinfo_table_ in\n  // MasterTrainer.\n  int font_id_;\n  // Number of page that the sample came from.\n  int page_num_;\n  // Bounding box of sample in original image.\n  TBOX bounding_box_;\n  // Number of INT_FEATURE_STRUCT in features_ array.\n  uint32_t num_features_;\n  // Number of MicroFeature in micro_features_ array.\n  uint32_t num_micro_features_;\n  // Total length of outline in the baseline normalized coordinate space.\n  // See comment in WERD_RES class definition for a discussion of coordinate\n  // spaces.\n  int outline_length_;\n  // Array of features.\n  INT_FEATURE_STRUCT *features_;\n  // Array of features.\n  MicroFeature *micro_features_;\n  // The one and only CN feature. 
Indexed by NORM_PARAM_NAME enum.\n  float cn_feature_[kNumCNParams];\n  // The one and only geometric feature. (Aims at replacing cn_feature_).\n  // Indexed by GeoParams enum in picofeat.h\n  int geo_feature_[GeoCount];\n\n  // Non-serialized cache data.\n  // Weight used for boosting training.\n  double weight_;\n  // Maximum distance to other samples of same class/font used in computing\n  // the canonical sample.\n  double max_dist_;\n  // Global index of this sample.\n  int sample_index_;\n\npublic:\n  // both are used in training tools\n  // hide after refactoring\n\n  // Indexed/mapped features, as indicated by the bools below.\n  std::vector<int> mapped_features_;\n  bool features_are_indexed_;\n  bool features_are_mapped_;\n\nprivate:\n  // True if the last classification was an error by the current definition.\n  bool is_error_;\n\n  // Randomizing factors.\n  static const int kYShiftValues[kSampleYShiftSize];\n  static const double kScaleValues[kSampleScaleSize];\n};\n\nELISTIZEH(TrainingSample)\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_TRAININGSAMPLE_H_\n"
  },
  {
    "path": "src/cutil/bitvec.h",
    "content": "/******************************************************************************\n **    Filename:    bitvec.h\n **    Purpose:     Routines for manipulating bit vectors\n **    Author:      Dan Johnson\n **\n **    (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef BITVEC_H\n#define BITVEC_H\n\n#include <cstddef> // for size_t\n#include <cstdint> // for uint32_t\n\n/*-----------------------------------------------------------------------------\n          Include Files and Type Defines\n-----------------------------------------------------------------------------*/\n\nusing BIT_VECTOR = uint32_t *;\n\n//< no of bits in a BIT_VECTOR element\nconst size_t BITSINLONG = 8 * sizeof(uint32_t);\n\n/*-----------------------------------------------------------------------------\n          Public Function Prototypes\n-----------------------------------------------------------------------------*/\n\nstatic inline void zero_all_bits(BIT_VECTOR array, size_t length) {\n  for (size_t index = 0; index < length; index++) {\n    array[index] = 0;\n  }\n}\n\nstatic inline void set_all_bits(BIT_VECTOR array, size_t length) {\n  for (size_t index = 0; index < length; index++) {\n    array[index] = ~0;\n  }\n}\n\nstatic inline void copy_all_bits(BIT_VECTOR source, BIT_VECTOR dest, size_t length) {\n  for (size_t index = 0; index 
< length; index++) {\n    dest[index] = source[index];\n  }\n}\n\n#define SET_BIT(array, bit) (array[bit / BITSINLONG] |= 1 << (bit & (BITSINLONG - 1)))\n\n#define reset_bit(array, bit) (array[bit / BITSINLONG] &= ~(1 << (bit & (BITSINLONG - 1))))\n\n#define test_bit(array, bit) (array[bit / BITSINLONG] & (1 << (bit & (BITSINLONG - 1))))\n\nstatic inline size_t WordsInVectorOfSize(size_t NumBits) {\n  return (NumBits + BITSINLONG - 1) / BITSINLONG;\n}\n\n/**\n * This routine frees a bit vector.\n *\n * @param BitVector bit vector to be freed\n *\n */\nstatic inline void FreeBitVector(BIT_VECTOR BitVector) {\n  delete[] BitVector;\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * Allocate and return a new bit vector large enough to\n * hold the specified number of bits.\n *\n * @param NumBits number of bits in new bit vector\n *\n * @return New bit vector.\n */\nstatic inline BIT_VECTOR NewBitVector(size_t NumBits) {\n  return new uint32_t[WordsInVectorOfSize(NumBits)];\n}\n\n#endif\n"
  },
  {
    "path": "src/cutil/oldlist.cpp",
    "content": "/******************************************************************************\n#\n# File:         oldlist.cpp\n# Description:  List processing procedures.\n# Author:       Mark Seaman, Software Productivity\n#\n# (c) Copyright 1987, Hewlett-Packard Company.\n** Licensed under the Apache License, Version 2.0 (the \"License\");\n** you may not use this file except in compliance with the License.\n** You may obtain a copy of the License at\n** http://www.apache.org/licenses/LICENSE-2.0\n** Unless required by applicable law or agreed to in writing, software\n** distributed under the License is distributed on an \"AS IS\" BASIS,\n** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n** See the License for the specific language governing permissions and\n** limitations under the License.\n#\n###############################################################################\n\n  This file contains a set of general purpose list manipulation routines.\n  These routines can be used in a wide variety of ways to provide several\n  different popular data structures. A new list can be created by declaring\n  a variable of type 'LIST', and can be initialized with the value 'NIL_LIST'.\n  All of these routines check for the NIL_LIST condition before dereferencing\n  pointers.  
NOTE:  There is a users' manual available in printed form from\n  Mark Seaman at (303) 350-4492 at Greeley Hard Copy.\n\n  To implement a STACK use:\n\n  push         to add to the Stack             l = push(l, (LIST)\"jim\");\n  pop          to remove items from the Stack  l = pop(l);\n  first_node   to access the head              name = (char *)first_node(l);\n\n  To implement a QUEUE use:\n\n  push_last    to add to the Queue              l = push_last(l, (LIST)\"x\");\n  pop          remove items from the Queue      l = pop(l);\n  first_node   to access the head               name = (char *)first_node (l);\n\n  To implement LISP like functions use:\n\n  first_node   CAR                              x = (int)first_node(l);\n  rest         CDR                              l = list_rest (l);\n  push         CONS                             l = push(l, (LIST)this);\n  last         LAST                             x = last(l);\n  concat       APPEND                           l = concat(r, s);\n  count        LENGTH                           x = count(l);\n  search       MEMBER                           if (search(l, x, nullptr))\n\n  The following rules of closure exist for the functions provided.\n  a = first_node (push (a, b))\n  b = list_rest (push (a, b))\n  a = push (pop (a), a))        For all a <> NIL_LIST\n  a = reverse (reverse (a))\n\n******************************************************************************/\n#include \"oldlist.h\"\n\n#include \"errcode.h\" // for ASSERT_HOST\n\n#include <cstdio>\n#include <cstring> // for strcmp\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n\n/**********************************************************************\n *  i s   s a m e\n *\n *  Compare the list node with the key value return true (non-zero)\n *  if they are equivalent strings.  
(Return false if not)\n **********************************************************************/\nstatic int is_same(void *item1, void *item2) {\n  return strcmp(static_cast<char *>(item1), static_cast<char *>(item2)) == 0;\n}\n\n/**********************************************************************\n *  d e l e t e    d\n *\n *  Delete all the elements out of the current list that match the key.\n *  This operation destroys the original list.  The caller will supply a\n *  routine that will compare each node to the\n *  key, and return a non-zero value when they match.\n **********************************************************************/\nLIST delete_d(LIST list, void *key, int_compare is_equal) {\n  LIST result = NIL_LIST;\n  LIST last_one = NIL_LIST;\n\n  if (is_equal == nullptr) {\n    is_equal = is_same;\n  }\n\n  while (list != NIL_LIST) {\n    if (!(*is_equal)(list->first_node(), key)) {\n      if (last_one == NIL_LIST) {\n        last_one = list;\n        list = list->list_rest();\n        result = last_one;\n        set_rest(last_one, NIL_LIST);\n      } else {\n        set_rest(last_one, list);\n        last_one = list;\n        list = list->list_rest();\n        set_rest(last_one, NIL_LIST);\n      }\n    } else {\n      list = pop(list);\n    }\n  }\n  return (result);\n}\n\n/**********************************************************************\n *  d e s t r o y\n *\n *  Return the space taken by a list to the heap.\n **********************************************************************/\nLIST destroy(LIST list) {\n  LIST next;\n\n  while (list != NIL_LIST) {\n    next = list->list_rest();\n    delete list;\n    list = next;\n  }\n  return (NIL_LIST);\n}\n\n/**********************************************************************\n *  d e s t r o y   n o d e s\n *\n *  Return the space taken by the LISTs of a list to the heap.\n **********************************************************************/\nvoid destroy_nodes(LIST list, void_dest 
destructor) {\n  ASSERT_HOST(destructor != nullptr);\n\n  while (list != NIL_LIST) {\n    if (list->first_node() != nullptr) {\n      (*destructor)(list->first_node());\n    }\n    list = pop(list);\n  }\n}\n\n/**********************************************************************\n *  l a s t\n *\n *  Return the last list item (this is list type).\n **********************************************************************/\nLIST last(LIST var_list) {\n  while (var_list->list_rest() != NIL_LIST) {\n    var_list = var_list->list_rest();\n  }\n  return var_list;\n}\n\n/**********************************************************************\n *  p o p\n *\n *  Return the list with the first element removed.  Destroy the space\n *  that it occupied in the list.\n **********************************************************************/\nLIST pop(LIST list) {\n  LIST temp = list->list_rest();\n  delete list;\n  return temp;\n}\n\n/**********************************************************************\n *  p u s h\n *\n *  Create a list element.  Push the second parameter (the node) onto\n *  the first parameter (the list). Return the new list to the caller.\n **********************************************************************/\nLIST push(LIST list, void *element) {\n  LIST t;\n\n  t = new list_rec;\n  t->node = static_cast<LIST>(element);\n  set_rest(t, list);\n  return (t);\n}\n\n/**********************************************************************\n *  p u s h   l a s t\n *\n *  Create a list element. Add the element onto the end of the list.\n **********************************************************************/\nLIST push_last(LIST list, void *item) {\n  LIST t;\n\n  if (list != NIL_LIST) {\n    t = last(list);\n    t->next = push(NIL_LIST, item);\n    return (list);\n  } else {\n    return (push(NIL_LIST, item));\n  }\n}\n\n/**********************************************************************\n *   s e a r c h\n *\n *  Search list, return NIL_LIST if not found. 
Return the list starting from\n *  the item if found.  The compare routine \"is_equal\" is passed in as\n *  the third parameter to this routine.\n **********************************************************************/\nLIST search(LIST list, void *key, int_compare is_equal) {\n  if (is_equal == nullptr) {\n    is_equal = is_same;\n  }\n\n  iterate(list) if ((*is_equal)(list->first_node(), key)) return list;\n  return (NIL_LIST);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/cutil/oldlist.h",
    "content": "/******************************************************************************\n *\n * File:         oldlist.h  (Formerly list.h)\n * Description:  List processing procedures declarations.\n * Author:       Mark Seaman, SW Productivity\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n ******************************************************************************\n *\n * This file contains the interface for a set of general purpose list\n * manipulation routines.  
For the implementation of these routines see\n * the file \"list.c\".\n *\n ******************************************************************************\n *\n *                            INDEX\n *                           =======\n *\n * BASICS:\n * -------\n * first_node        - Macro to return the first list node (not the cell).\n * list_rest         - Macro the return the second list cell\n * pop               - Destroy one list cell\n * push              - Create one list cell and set the node and next fields\n *\n * ITERATION:\n * -----------------\n * iterate           - Macro to create a for loop to visit each cell.\n *\n * LIST CELL COUNTS:\n * -----------------\n * count             - Returns the number of list cells in the list.\n * last              - Returns the last list cell.\n *\n * TRANSFORMS:             (Note: These functions all modify the input list.)\n * ----------\n * delete_d          - Removes the requested elements from the list.\n * push_last         - Add a new element onto the end of a list.\n *\n * SETS:\n * -----\n * search            - Return the pointer to the list cell whose node matches.\n *\n * CELL OPERATIONS:\n * -----------------\n * destroy           - Return all list cells in a list.\n * destroy_nodes     - Apply a function to each list cell and destroy the list.\n * set_rest          - Assign the next field in a list cell.\n *\n ***********************************************************************/\n\n#ifndef LIST_H\n#define LIST_H\n\n#include <tesseract/export.h>\n\n#include <cstddef> // for size_t\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------\n                  T y p e s\n----------------------------------------------------------------------*/\n\n#define NIL_LIST static_cast<LIST>(nullptr)\n\nusing int_compare = int (*)(void *, void *);\nusing void_dest = void (*)(void *);\n\n/*----------------------------------------------------------------------\n              
    M a c r o s\n----------------------------------------------------------------------*/\n\n/**********************************************************************\n *  i t e r a t e\n *\n *  Visit each node in the list.  Replace the old list with the list\n *  minus the head.  Continue until the list is NIL_LIST.\n **********************************************************************/\n\n#define iterate(l) for (; (l) != nullptr; (l) = (l)->list_rest())\n\n/**********************************************************************\n *  s e t   r e s t\n *\n *  Change the \"next\" field of a list element to point to a desired place.\n *\n *  #define set_rest(l,node)        l->next = node;\n **********************************************************************/\n\n#define set_rest(l, cell) ((l)->next = (cell))\n\nstruct list_rec {\n  list_rec *node;\n  list_rec *next;\n\n  list_rec *first_node() {\n    return node;\n  }\n\n  list_rec *list_rest() {\n    return next;\n  }\n\n  //********************************************************************\n  // Recursively count the elements in  a list.  Return the count.\n  //********************************************************************\n  size_t size() {\n    auto var_list = this;\n    size_t n = 0;\n    iterate(var_list) n++;\n    return n;\n  }\n};\nusing LIST = list_rec *;\n\n/*----------------------------------------------------------------------\n          Public Function Prototypes\n----------------------------------------------------------------------*/\n\nLIST delete_d(LIST list, void *key, int_compare is_equal);\n\nTESS_API\nLIST destroy(LIST list);\n\nvoid destroy_nodes(LIST list, void_dest destructor);\n\nLIST last(LIST var_list);\n\nLIST pop(LIST list);\n\nTESS_API\nLIST push(LIST list, void *element);\n\nTESS_API\nLIST push_last(LIST list, void *item);\n\nLIST search(LIST list, void *key, int_compare is_equal);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/dict/context.cpp",
    "content": "/******************************************************************************\n *\n * File:         context.cpp  (Formerly context.c)\n * Description:  Context checking functions\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1990, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#include \"dict.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\nstatic const int kMinAbsoluteGarbageWordLength = 10;\nstatic const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;\n\nconst int case_state_table[6][4] = {\n    {/*  0. Beginning of word       */\n     /*    P   U   L   D                                          */\n     /* -1. Error on case           */\n     0, 1, 5, 4},\n    {/*  1. After initial capital    */\n     0, 3, 2, 4},\n    {/*  2. After lower case         */\n     0, -1, 2, -1},\n    {/*  3. After upper case         */\n     0, 3, -1, 4},\n    {/*  4. After a digit            */\n     0, -1, -1, 4},\n    {/*  5. 
After initial lower case */\n     5, -1, 2, -1},\n};\n\nint Dict::case_ok(const WERD_CHOICE &word) const {\n  int state = 0;\n  const UNICHARSET *unicharset = word.unicharset();\n  for (unsigned x = 0; x < word.length(); ++x) {\n    UNICHAR_ID ch_id = word.unichar_id(x);\n    if (unicharset->get_isupper(ch_id)) {\n      state = case_state_table[state][1];\n    } else if (unicharset->get_islower(ch_id)) {\n      state = case_state_table[state][2];\n    } else if (unicharset->get_isdigit(ch_id)) {\n      state = case_state_table[state][3];\n    } else {\n      state = case_state_table[state][0];\n    }\n    if (state == -1) {\n      return false;\n    }\n  }\n  return state != 5; // single lower is bad\n}\n\nbool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) {\n  if (word.length() < kMinAbsoluteGarbageWordLength) {\n    return false;\n  }\n  int num_alphanum = 0;\n  for (unsigned x = 0; x < word.length(); ++x) {\n    num_alphanum +=\n        (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));\n  }\n  return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <\n          kMinAbsoluteGarbageAlphanumFrac);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/dict/dawg.cpp",
    "content": "/********************************************************************************\n *\n * File:         dawg.cpp  (Formerly dawg.c)\n * Description:  Use a Directed Acyclic Word Graph\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *********************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n\n#include \"dawg.h\"\n\n#include \"dict.h\"\n#include \"helpers.h\"\n#include \"tprintf.h\"\n\n#include <memory>\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s   f o r   D a w g\n----------------------------------------------------------------------*/\nnamespace tesseract {\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nDawg::~Dawg() = default;\n\nbool Dawg::prefix_in_dawg(const WERD_CHOICE &word,\n                          bool requires_complete) const {\n  if (word.empty()) {\n    return !requires_complete;\n  }\n  NODE_REF node = 0;\n  int end_index = word.length() - 1;\n  for (int i = 0; i < end_index; i++) {\n    EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);\n    if (edge == 
NO_EDGE) {\n      return false;\n    }\n    if ((node = next_node(edge)) == 0) {\n      // This only happens if all words following this edge terminate --\n      // there are no larger words.  See Trie::add_word_to_dawg()\n      return false;\n    }\n  }\n  // Now check the last character.\n  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=\n         NO_EDGE;\n}\n\nbool Dawg::word_in_dawg(const WERD_CHOICE &word) const {\n  return prefix_in_dawg(word, true);\n}\n\nint Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset,\n                          bool enable_wildcard) const {\n  if (filename == nullptr) {\n    return 0;\n  }\n\n  FILE *word_file;\n  char string[CHARS_PER_LINE];\n  int misses = 0;\n  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);\n\n  word_file = fopen(filename, \"r\");\n  if (word_file == nullptr) {\n    tprintf(\"Error: Could not open file %s\\n\", filename);\n    ASSERT_HOST(word_file);\n  }\n\n  while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) {\n    chomp_string(string); // remove newline\n    WERD_CHOICE word(string, unicharset);\n    if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {\n      if (!match_words(&word, 0, 0,\n                       enable_wildcard ? 
wildcard : INVALID_UNICHAR_ID)) {\n        tprintf(\"Missing word: %s\\n\", string);\n        ++misses;\n      }\n    } else {\n      tprintf(\"Failed to create a valid word from %s\\n\", string);\n    }\n  }\n  fclose(word_file);\n  // Make sure the user sees this with fprintf instead of tprintf.\n  if (debug_level_) {\n    tprintf(\"Number of lost words=%d\\n\", misses);\n  }\n  return misses;\n}\n\nvoid Dawg::iterate_words(const UNICHARSET &unicharset,\n                         std::function<void(const WERD_CHOICE *)> cb) const {\n  WERD_CHOICE word(&unicharset);\n  iterate_words_rec(word, 0, cb);\n}\n\nstatic void CallWithUTF8(const std::function<void(const char *)> &cb,\n                         const WERD_CHOICE *wc) {\n  std::string s;\n  wc->string_and_lengths(&s, nullptr);\n  cb(s.c_str());\n}\n\nvoid Dawg::iterate_words(const UNICHARSET &unicharset,\n                         const std::function<void(const char *)> &cb) const {\n  using namespace std::placeholders; // for _1\n  std::function<void(const WERD_CHOICE *)> shim(\n      std::bind(CallWithUTF8, cb, _1));\n  WERD_CHOICE word(&unicharset);\n  iterate_words_rec(word, 0, shim);\n}\n\nvoid Dawg::iterate_words_rec(\n    const WERD_CHOICE &word_so_far, NODE_REF to_explore,\n    const std::function<void(const WERD_CHOICE *)> &cb) const {\n  NodeChildVector children;\n  this->unichar_ids_of(to_explore, &children, false);\n  for (auto &i : children) {\n    WERD_CHOICE next_word(word_so_far);\n    next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0);\n    if (this->end_of_word(i.edge_ref)) {\n      cb(&next_word);\n    }\n    NODE_REF next = next_node(i.edge_ref);\n    if (next != 0) {\n      iterate_words_rec(next_word, next, cb);\n    }\n  }\n}\n\nbool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,\n                       UNICHAR_ID wildcard) const {\n  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {\n    bool any_matched = false;\n    NodeChildVector 
vec;\n    this->unichar_ids_of(node, &vec, false);\n    for (auto &i : vec) {\n      word->set_unichar_id(i.unichar_id, index);\n      if (match_words(word, index, node, wildcard)) {\n        any_matched = true;\n      }\n    }\n    word->set_unichar_id(wildcard, index);\n    return any_matched;\n  } else {\n    auto word_end = index == word->length() - 1;\n    auto edge = edge_char_of(node, word->unichar_id(index), word_end);\n    if (edge != NO_EDGE) { // normal edge in DAWG\n      node = next_node(edge);\n      if (word_end) {\n        if (debug_level_ > 1) {\n          word->print(\"match_words() found: \");\n        }\n        return true;\n      } else if (node != 0) {\n        return match_words(word, index + 1, node, wildcard);\n      }\n    }\n  }\n  return false;\n}\n\nvoid Dawg::init(int unicharset_size) {\n  ASSERT_HOST(unicharset_size > 0);\n  unicharset_size_ = unicharset_size;\n  // Set bit masks. We will use the value unicharset_size_ as a null char, so\n  // the actual number of unichars is unicharset_size_ + 1.\n  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));\n  next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;\n  letter_mask_ = ~(~0ull << flag_start_bit_);\n  next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);\n  flags_mask_ = ~(letter_mask_ | next_node_mask_);\n}\n\n/*----------------------------------------------------------------------\n         F u n c t i o n s   f o r   S q u i s h e d    D a w g\n----------------------------------------------------------------------*/\n\nSquishedDawg::~SquishedDawg() {\n  delete[] edges_;\n}\n\nEDGE_REF SquishedDawg::edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,\n                                    bool word_end) const {\n  EDGE_REF edge = node;\n  if (node == 0) { // binary search\n    EDGE_REF start = 0;\n    EDGE_REF end = num_forward_edges_in_node0 - 1;\n    int compare;\n    while (start <= end) {\n      edge = (start + end) >> 1; // (start + end) / 2\n      
compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id,\n                                            edges_[edge]);\n      if (compare == 0) { // given == vec[k]\n        return edge;\n      } else if (compare == 1) { // given > vec[k]\n        start = edge + 1;\n      } else { // given < vec[k]\n        end = edge - 1;\n      }\n    }\n  } else { // linear search\n    if (edge != NO_EDGE && edge_occupied(edge)) {\n      do {\n        if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&\n            (!word_end || end_of_word_from_edge_rec(edges_[edge]))) {\n          return (edge);\n        }\n      } while (!last_edge(edge++));\n    }\n  }\n  return (NO_EDGE); // not found\n}\n\nint32_t SquishedDawg::num_forward_edges(NODE_REF node) const {\n  EDGE_REF edge = node;\n  int32_t num = 0;\n\n  if (forward_edge(edge)) {\n    do {\n      num++;\n    } while (!last_edge(edge++));\n  }\n\n  return (num);\n}\n\nvoid SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {\n  if (node == NO_EDGE) {\n    return; // nothing to print\n  }\n\n  EDGE_REF edge = node;\n  const char *forward_string = \"FORWARD\";\n  const char *backward_string = \"       \";\n\n  const char *last_string = \"LAST\";\n  const char *not_last_string = \"    \";\n\n  const char *eow_string = \"EOW\";\n  const char *not_eow_string = \"   \";\n\n  const char *direction;\n  const char *is_last;\n  const char *eow;\n\n  UNICHAR_ID unichar_id;\n\n  if (edge_occupied(edge)) {\n    do {\n      direction = forward_edge(edge) ? forward_string : backward_string;\n      is_last = last_edge(edge) ? last_string : not_last_string;\n      eow = end_of_word(edge) ? 
eow_string : not_eow_string;\n\n      unichar_id = edge_letter(edge);\n      tprintf(REFFORMAT \" : next = \" REFFORMAT \", unichar_id = %d, %s %s %s\\n\",\n              edge, next_node(edge), unichar_id, direction, is_last, eow);\n\n      if (edge - node > max_num_edges) {\n        return;\n      }\n    } while (!last_edge(edge++));\n\n    if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {\n      do {\n        direction = forward_edge(edge) ? forward_string : backward_string;\n        is_last = last_edge(edge) ? last_string : not_last_string;\n        eow = end_of_word(edge) ? eow_string : not_eow_string;\n\n        unichar_id = edge_letter(edge);\n        tprintf(REFFORMAT \" : next = \" REFFORMAT\n                          \", unichar_id = %d, %s %s %s\\n\",\n                edge, next_node(edge), unichar_id, direction, is_last, eow);\n\n        if (edge - node > MAX_NODE_EDGES_DISPLAY) {\n          return;\n        }\n      } while (!last_edge(edge++));\n    }\n  } else {\n    tprintf(REFFORMAT \" : no edges in this node\\n\", node);\n  }\n  tprintf(\"\\n\");\n}\n\nvoid SquishedDawg::print_edge(EDGE_REF edge) const {\n  if (edge == NO_EDGE) {\n    tprintf(\"NO_EDGE\\n\");\n  } else {\n    tprintf(REFFORMAT \" : next = \" REFFORMAT \", unichar_id = '%d', %s %s %s\\n\",\n            edge, next_node(edge), edge_letter(edge),\n            (forward_edge(edge) ? \"FORWARD\" : \"       \"),\n            (last_edge(edge) ? \"LAST\" : \"    \"),\n            (end_of_word(edge) ? 
\"EOW\" : \"\"));\n  }\n}\n\nbool SquishedDawg::read_squished_dawg(TFile *file) {\n  if (debug_level_) {\n    tprintf(\"Reading squished dawg\\n\");\n  }\n\n  // Read the magic number and check that it matches kDawgMagicNumber, as\n  // auto-endian fixing should make sure it is always correct.\n  int16_t magic;\n  if (!file->DeSerialize(&magic)) {\n    return false;\n  }\n  if (magic != kDawgMagicNumber) {\n    tprintf(\"Bad magic number on dawg: %d vs %d\\n\", magic, kDawgMagicNumber);\n    return false;\n  }\n\n  int32_t unicharset_size;\n  if (!file->DeSerialize(&unicharset_size)) {\n    return false;\n  }\n  if (!file->DeSerialize(&num_edges_)) {\n    return false;\n  }\n  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty\n  Dawg::init(unicharset_size);\n\n  edges_ = new EDGE_RECORD[num_edges_];\n  if (!file->DeSerialize(&edges_[0], num_edges_)) {\n    return false;\n  }\n  if (debug_level_ > 2) {\n    tprintf(\"type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\\n\",\n            type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);\n    for (EDGE_REF edge = 0; edge < num_edges_; ++edge) {\n      print_edge(edge);\n    }\n  }\n  return true;\n}\n\nstd::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(\n    int32_t *num_nodes) const {\n  EDGE_REF edge;\n  std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);\n  int32_t node_counter;\n  int32_t num_edges;\n\n  for (edge = 0; edge < num_edges_; edge++) { // init all slots\n    node_map[edge] = -1;\n  }\n\n  node_counter = num_forward_edges(0);\n\n  *num_nodes = 0;\n  for (edge = 0; edge < num_edges_; edge++) { // search all slots\n\n    if (forward_edge(edge)) {\n      (*num_nodes)++; // count nodes links\n      node_map[edge] = (edge ? 
node_counter : 0);\n      num_edges = num_forward_edges(edge);\n      if (edge != 0) {\n        node_counter += num_edges;\n      }\n      edge += num_edges;\n      if (edge >= num_edges_) {\n        break;\n      }\n      if (backward_edge(edge)) {\n        while (!last_edge(edge++)) {\n          ;\n        }\n      }\n      edge--;\n    }\n  }\n  return node_map;\n}\n\nbool SquishedDawg::write_squished_dawg(TFile *file) {\n  EDGE_REF edge;\n  int32_t num_edges;\n  int32_t node_count = 0;\n  EDGE_REF old_index;\n  EDGE_RECORD temp_record;\n\n  if (debug_level_) {\n    tprintf(\"write_squished_dawg\\n\");\n  }\n\n  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));\n\n  // Write the magic number to help detecting a change in endianness.\n  int16_t magic = kDawgMagicNumber;\n  if (!file->Serialize(&magic)) {\n    return false;\n  }\n  if (!file->Serialize(&unicharset_size_)) {\n    return false;\n  }\n\n  // Count the number of edges in this Dawg.\n  num_edges = 0;\n  for (edge = 0; edge < num_edges_; edge++) {\n    if (forward_edge(edge)) {\n      num_edges++;\n    }\n  }\n\n  // Write edge count to file.\n  if (!file->Serialize(&num_edges)) {\n    return false;\n  }\n\n  if (debug_level_) {\n    tprintf(\"%d nodes in DAWG\\n\", node_count);\n    tprintf(\"%d edges in DAWG\\n\", num_edges);\n  }\n\n  for (edge = 0; edge < num_edges_; edge++) {\n    if (forward_edge(edge)) { // write forward edges\n      do {\n        old_index = next_node_from_edge_rec(edges_[edge]);\n        set_next_node(edge, node_map[old_index]);\n        temp_record = edges_[edge];\n        if (!file->Serialize(&temp_record)) {\n          return false;\n        }\n        set_next_node(edge, old_index);\n      } while (!last_edge(edge++));\n\n      if (edge >= num_edges_) {\n        break;\n      }\n      if (backward_edge(edge)) { // skip back links\n        while (!last_edge(edge++)) {\n          ;\n        }\n      }\n\n      edge--;\n    }\n  }\n  return true;\n}\n\n} // 
namespace tesseract\n"
  },
  {
    "path": "src/dict/dawg.h",
    "content": "/******************************************************************************\n *\n * File:         dawg.h\n * Description:  Definition of a class that represents Directed Acyclic Word\n *               Graph (DAWG), functions to build and manipulate the DAWG.\n * Author:       Mark Seaman, SW Productivity\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef DICT_DAWG_H_\n#define DICT_DAWG_H_\n\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n\n#include <cinttypes>  // for PRId64\n#include <functional> // for std::function\n#include <memory>\n#include \"elst.h\"\n#include \"params.h\"\n#include \"ratngs.h\"\n\n#ifndef __GNUC__\n#  ifdef _WIN32\n#    define NO_EDGE static_cast<int64_t>(0xffffffffffffffffi64)\n#  endif /*_WIN32*/\n#else\n#  define NO_EDGE static_cast<int64_t>(0xffffffffffffffffll)\n#endif /*__GNUC__*/\n\nnamespace tesseract {\n\nclass UNICHARSET;\n\nusing EDGE_RECORD = uint64_t;\nusing EDGE_ARRAY = EDGE_RECORD *;\nusing EDGE_REF = int64_t;\nusing NODE_REF = int64_t;\nusing NODE_MAP = EDGE_REF *;\n\nstruct NodeChild {\n  UNICHAR_ID unichar_id;\n  EDGE_REF edge_ref;\n  NodeChild(UNICHAR_ID id, EDGE_REF ref) : unichar_id(id), edge_ref(ref) {}\n  NodeChild() 
: unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}\n};\n\nusing NodeChildVector = std::vector<NodeChild>;\nusing SuccessorList = std::vector<int>;\nusing SuccessorListsVector = std::vector<SuccessorList *>;\n\nenum DawgType {\n  DAWG_TYPE_PUNCTUATION,\n  DAWG_TYPE_WORD,\n  DAWG_TYPE_NUMBER,\n  DAWG_TYPE_PATTERN,\n\n  DAWG_TYPE_COUNT // number of enum entries\n};\n\n/*----------------------------------------------------------------------\n              C o n s t a n t s\n----------------------------------------------------------------------*/\n\n#define FORWARD_EDGE static_cast<int32_t>(0)\n#define BACKWARD_EDGE static_cast<int32_t>(1)\n#define MAX_NODE_EDGES_DISPLAY static_cast<int64_t>(100)\n#define MARKER_FLAG static_cast<int64_t>(1)\n#define DIRECTION_FLAG static_cast<int64_t>(2)\n#define WERD_END_FLAG static_cast<int64_t>(4)\n#define LETTER_START_BIT 0\n#define NUM_FLAG_BITS 3\n#define REFFORMAT \"%\" PRId64\n\nstatic const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {\n    {false, true, true, false},   // for DAWG_TYPE_PUNCTUATION\n    {true, false, false, false},  // for DAWG_TYPE_WORD\n    {true, false, false, false},  // for DAWG_TYPE_NUMBER\n    {false, false, false, false}, // for DAWG_TYPE_PATTERN\n};\n\nstatic const char kWildcard[] = \"*\";\n\n/*----------------------------------------------------------------------\n              C l a s s e s   a n d   S t r u c t s\n----------------------------------------------------------------------*/\n//\n/// Abstract class (an interface) that declares methods needed by the\n/// various tesseract classes to operate on SquishedDawg and Trie objects.\n///\n/// This class initializes all the edge masks (since their usage by\n/// SquishedDawg and Trie is identical) and implements simple accessors\n/// for each of the fields encoded in an EDGE_RECORD.\n/// This class also implements word_in_dawg() and check_for_words()\n/// (since they use only the public methods of SquishedDawg and Trie\n/// classes 
that are inherited from the Dawg base class).\n//\nclass TESS_API Dawg {\npublic:\n  /// Magic number to determine endianness when reading the Dawg from file.\n  static constexpr int16_t kDawgMagicNumber = 42;\n  /// A special unichar id that indicates that any appropriate pattern\n  /// (e.g.dictionary word, 0-9 digit, etc) can be inserted instead\n  /// Used for expressing patterns in punctuation and number Dawgs.\n  static const UNICHAR_ID kPatternUnicharID = 0;\n\n  inline DawgType type() const {\n    return type_;\n  }\n  inline const std::string &lang() const {\n    return lang_;\n  }\n  inline PermuterType permuter() const {\n    return perm_;\n  }\n\n  virtual ~Dawg();\n\n  /// Returns true if the given word is in the Dawg.\n  bool word_in_dawg(const WERD_CHOICE &word) const;\n\n  // Returns true if the given word prefix is not contraindicated by the dawg.\n  // If requires_complete is true, then the exact complete word must be present.\n  bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;\n\n  /// Checks the Dawg for the words that are listed in the requested file.\n  /// Returns the number of words in the given file missing from the Dawg.\n  int check_for_words(const char *filename, const UNICHARSET &unicharset,\n                      bool enable_wildcard) const;\n\n  // For each word in the Dawg, call the given (permanent) callback with the\n  // text (UTF-8) version of the word.\n  void iterate_words(const UNICHARSET &unicharset,\n                     std::function<void(const WERD_CHOICE *)> cb) const;\n\n  // For each word in the Dawg, call the given (permanent) callback with the\n  // text (UTF-8) version of the word.\n  void iterate_words(const UNICHARSET &unicharset,\n                     const std::function<void(const char *)> &cb) const;\n\n  // Pure virtual function that should be implemented by the derived classes.\n\n  /// Returns the edge that corresponds to the letter out of this node.\n  virtual EDGE_REF 
edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,\n                                bool word_end) const = 0;\n\n  /// Fills the given NodeChildVector with all the unichar ids (and the\n  /// corresponding EDGE_REFs) for which there is an edge out of this node.\n  virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,\n                              bool word_end) const = 0;\n\n  /// Returns the next node visited by following the edge\n  /// indicated by the given EDGE_REF.\n  virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;\n\n  /// Returns true if the edge indicated by the given EDGE_REF\n  /// marks the end of a word.\n  virtual bool end_of_word(EDGE_REF edge_ref) const = 0;\n\n  /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.\n  virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;\n\n  /// Prints the contents of the node indicated by the given NODE_REF.\n  /// At most max_num_edges will be printed.\n  virtual void print_node(NODE_REF node, int max_num_edges) const = 0;\n\n  /// Fills vec with unichar ids that represent the character classes\n  /// of the given unichar_id.\n  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,\n                                      const UNICHARSET &unicharset,\n                                      std::vector<UNICHAR_ID> *vec) const {\n    (void)unichar_id;\n    (void)unicharset;\n    (void)vec;\n  }\n\n  /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has\n  /// a self loop and the given unichar_id matches the unichar_id stored in the\n  /// EDGE_RECORD, returns NO_EDGE otherwise.\n  virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id,\n                                     bool word_end) const {\n    (void)edge_ref;\n    (void)unichar_id;\n    (void)word_end;\n    return false;\n  }\n\nprotected:\n  Dawg(DawgType type, const std::string &lang, PermuterType perm,\n       int debug_level)\n      : lang_(lang),\n        
type_(type),\n        perm_(perm),\n        unicharset_size_(0),\n        debug_level_(debug_level) {}\n\n  /// Returns the next node visited by following this edge.\n  inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {\n    return ((edge_rec & next_node_mask_) >> next_node_start_bit_);\n  }\n  /// Returns the marker flag of this edge.\n  inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {\n    return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;\n  }\n  /// Returns the direction flag of this edge.\n  inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {\n    return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? BACKWARD_EDGE\n                                                              : FORWARD_EDGE;\n  }\n  /// Returns true if this edge marks the end of a word.\n  inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {\n    return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;\n  }\n  /// Returns UNICHAR_ID recorded in this edge.\n  inline UNICHAR_ID unichar_id_from_edge_rec(\n      const EDGE_RECORD &edge_rec) const {\n    return ((edge_rec & letter_mask_) >> LETTER_START_BIT);\n  }\n  /// Sets the next node link for this edge in the Dawg.\n  inline void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value) {\n    *edge_rec &= (~next_node_mask_);\n    *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);\n  }\n  /// Sets this edge record to be the last one in a sequence of edges.\n  inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {\n    *edge_rec |= (MARKER_FLAG << flag_start_bit_);\n  }\n  /// Sequentially compares the given values of unichar ID, next node\n  /// and word end marker with the values in the given EDGE_RECORD.\n  /// Returns: 1 if at any step the given input value exceeds\n  ///            that of edge_rec (and all the values already\n  ///            checked are the same)\n  ///          0 if 
edge_rec_match() returns true\n  ///         -1 otherwise\n  inline int given_greater_than_edge_rec(NODE_REF next_node, bool word_end,\n                                         UNICHAR_ID unichar_id,\n                                         const EDGE_RECORD &edge_rec) const {\n    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);\n    NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);\n    bool curr_word_end = end_of_word_from_edge_rec(edge_rec);\n    if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,\n                       curr_word_end, curr_unichar_id)) {\n      return 0;\n    }\n    if (unichar_id > curr_unichar_id) {\n      return 1;\n    }\n    if (unichar_id == curr_unichar_id) {\n      if (next_node > curr_next_node) {\n        return 1;\n      }\n      if (next_node == curr_next_node) {\n        if (word_end > curr_word_end) {\n          return 1;\n        }\n      }\n    }\n    return -1;\n  }\n  /// Returns true if all the values are equal (any value matches\n  /// next_node if next_node == NO_EDGE, any value matches word_end\n  /// if word_end is false).\n  inline bool edge_rec_match(NODE_REF next_node, bool word_end,\n                             UNICHAR_ID unichar_id, NODE_REF other_next_node,\n                             bool other_word_end,\n                             UNICHAR_ID other_unichar_id) const {\n    return ((unichar_id == other_unichar_id) &&\n            (next_node == NO_EDGE || next_node == other_next_node) &&\n            (!word_end || (word_end == other_word_end)));\n  }\n\n  /// Sets unicharset_size_.\n  /// Initializes the values of various masks from unicharset_size_.\n  void init(int unicharset_size);\n\n  /// Matches all of the words that are represented by this string.\n  /// If wildcard is set to something other than INVALID_UNICHAR_ID,\n  /// the *'s in this string are interpreted as wildcards.\n  /// WERD_CHOICE param is not passed by const so that wildcard searches\n  /// can 
modify it and work without having to copy WERD_CHOICEs.\n  bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,\n                   UNICHAR_ID wildcard) const;\n\n  // Recursively iterate over all words in a dawg (see public iterate_words).\n  void iterate_words_rec(\n      const WERD_CHOICE &word_so_far, NODE_REF to_explore,\n      const std::function<void(const WERD_CHOICE *)> &cb) const;\n\n  // Member Variables.\n  std::string lang_;\n  DawgType type_;\n  /// Permuter code that should be used if the word is found in this Dawg.\n  PermuterType perm_;\n  // Variables to construct various edge masks. Formerly:\n  // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64\n  // #define FLAGS_MASK     (int64_t) 0x0000000700000000i64\n  // #define LETTER_MASK    (int64_t) 0x00000000ffffffffi64\n  uint64_t next_node_mask_ = 0;\n  uint64_t flags_mask_ = 0;\n  uint64_t letter_mask_ = 0;\n  int unicharset_size_;\n  int flag_start_bit_ = 0;\n  int next_node_start_bit_ = 0;\n  // Level of debug statements to print to stdout.\n  int debug_level_;\n};\n\n//\n// DawgPosition keeps track of where we are in the primary dawg we're searching\n// as well as where we may be in the \"punctuation dawg\" which may provide\n// surrounding context.\n//\n// Example:\n//   punctuation dawg  -- space is the \"pattern character\"\n//     \" \"     // no punctuation\n//     \"' '\"   // leading and trailing apostrophes\n//     \" '\"    // trailing apostrophe\n//   word dawg:\n//     \"cat\"\n//     \"cab\"\n//     \"cat's\"\n//\n//  DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)\n//\n//  DawgPosition(-1, NO_EDGE, p, pe, false)\n//    We're in the punctuation dawg, no other dawg has been started.\n//    (1) If there's a pattern edge as a punc dawg child of us,\n//        for each punc-following dawg starting with ch, produce:\n//        Result: DawgPosition(k, w, p', false)\n//    (2) If there's a valid continuation in the punc dawg, produce:\n//        Result: 
DawgPosition(-k, NO_EDGE, p', false)\n//\n//  DawgPosition(k, w, -1, NO_EDGE, false)\n//    We're in dawg k.  Going back to punctuation dawg is not an option.\n//    Follow ch in dawg k.\n//\n//  DawgPosition(k, w, p, pe, false)\n//    We're in dawg k.  Continue in dawg k and/or go back to the punc dawg.\n//    If ending, check that the punctuation dawg is also ok to end here.\n//\n//  DawgPosition(k, w, p, pe true)\n//    We're back in the punctuation dawg.  Continuing there is the only option.\nstruct DawgPosition {\n  DawgPosition() = default;\n  DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref,\n               bool backtopunc)\n      : dawg_ref(dawgref),\n        punc_ref(puncref),\n        dawg_index(dawg_idx),\n        punc_index(punc_idx),\n        back_to_punc(backtopunc) {}\n  bool operator==(const DawgPosition &other) const {\n    return dawg_index == other.dawg_index && dawg_ref == other.dawg_ref &&\n           punc_index == other.punc_index && punc_ref == other.punc_ref &&\n           back_to_punc == other.back_to_punc;\n  }\n\n  EDGE_REF dawg_ref = NO_EDGE;\n  EDGE_REF punc_ref = NO_EDGE;\n  int8_t dawg_index = -1;\n  int8_t punc_index = -1;\n  // Have we returned to the punc dawg at the end of the word?\n  bool back_to_punc = false;\n};\n\nclass DawgPositionVector : public std::vector<DawgPosition> {\npublic:\n  /// Adds an entry for the given dawg_index with the given node to the vec.\n  /// Returns false if the same entry already exists in the vector,\n  /// true otherwise.\n  inline bool add_unique(const DawgPosition &new_pos, bool debug,\n                         const char *debug_msg) {\n    for (auto &&position : *this) {\n      if (position == new_pos) {\n        return false;\n      }\n    }\n    push_back(new_pos);\n    if (debug) {\n      tprintf(\"%s[%d, \" REFFORMAT \"] [punc: \" REFFORMAT \"%s]\\n\", debug_msg,\n              new_pos.dawg_index, new_pos.dawg_ref, new_pos.punc_ref,\n              
new_pos.back_to_punc ? \" returned\" : \"\");\n    }\n    return true;\n  }\n};\n\n//\n/// Concrete class that can operate on a compacted (squished) Dawg (read,\n/// search and write to file). This class is read-only in the sense that\n/// new words cannot be added to an instance of SquishedDawg.\n/// The underlying representation of the nodes and edges in SquishedDawg\n/// is stored as a contiguous EDGE_ARRAY (read from file or given as an\n/// argument to the constructor).\n//\nclass TESS_API SquishedDawg : public Dawg {\npublic:\n  SquishedDawg(DawgType type, const std::string &lang, PermuterType perm,\n               int debug_level)\n      : Dawg(type, lang, perm, debug_level) {}\n  SquishedDawg(const char *filename, DawgType type, const std::string &lang,\n               PermuterType perm, int debug_level)\n      : Dawg(type, lang, perm, debug_level) {\n    TFile file;\n    ASSERT_HOST(file.Open(filename, nullptr));\n    ASSERT_HOST(read_squished_dawg(&file));\n    num_forward_edges_in_node0 = num_forward_edges(0);\n  }\n  SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,\n               const std::string &lang, PermuterType perm, int unicharset_size,\n               int debug_level)\n      : Dawg(type, lang, perm, debug_level),\n        edges_(edges),\n        num_edges_(num_edges) {\n    init(unicharset_size);\n    num_forward_edges_in_node0 = num_forward_edges(0);\n    if (debug_level > 3) {\n      print_all(\"SquishedDawg:\");\n    }\n  }\n  ~SquishedDawg() override;\n\n  // Loads using the given TFile. 
Returns false on failure.\n  bool Load(TFile *fp) {\n    if (!read_squished_dawg(fp)) {\n      return false;\n    }\n    num_forward_edges_in_node0 = num_forward_edges(0);\n    return true;\n  }\n\n  int NumEdges() {\n    return num_edges_;\n  }\n\n  /// Returns the edge that corresponds to the letter out of this node.\n  EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,\n                        bool word_end) const override;\n\n  /// Fills the given NodeChildVector with all the unichar ids (and the\n  /// corresponding EDGE_REFs) for which there is an edge out of this node.\n  void unichar_ids_of(NODE_REF node, NodeChildVector *vec,\n                      bool word_end) const override {\n    EDGE_REF edge = node;\n    if (!edge_occupied(edge) || edge == NO_EDGE) {\n      return;\n    }\n    assert(forward_edge(edge)); // we don't expect any backward edges to\n    do {                        // be present when this function is called\n      if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {\n        vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));\n      }\n    } while (!last_edge(edge++));\n  }\n\n  /// Returns the next node visited by following the edge\n  /// indicated by the given EDGE_REF.\n  NODE_REF next_node(EDGE_REF edge) const override {\n    return next_node_from_edge_rec((edges_[edge]));\n  }\n\n  /// Returns true if the edge indicated by the given EDGE_REF\n  /// marks the end of a word.\n  bool end_of_word(EDGE_REF edge_ref) const override {\n    return end_of_word_from_edge_rec((edges_[edge_ref]));\n  }\n\n  /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.\n  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {\n    return unichar_id_from_edge_rec((edges_[edge_ref]));\n  }\n\n  /// Prints the contents of the node indicated by the given NODE_REF.\n  /// At most max_num_edges will be printed.\n  void print_node(NODE_REF node, int max_num_edges) const override;\n\n  /// Writes 
the squished/reduced Dawg to a file.\n  bool write_squished_dawg(TFile *file);\n\n  /// Opens the file with the given filename and writes the\n  /// squished/reduced Dawg to the file.\n  bool write_squished_dawg(const char *filename) {\n    TFile file;\n    file.OpenWrite(nullptr);\n    if (!this->write_squished_dawg(&file)) {\n      tprintf(\"Error serializing %s\\n\", filename);\n      return false;\n    }\n    if (!file.CloseWrite(filename, nullptr)) {\n      tprintf(\"Error writing file %s\\n\", filename);\n      return false;\n    }\n    return true;\n  }\n\nprivate:\n  /// Sets the next node link for this edge.\n  inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {\n    set_next_node_in_edge_rec(&(edges_[edge_ref]), value);\n  }\n  /// Sets the edge to be empty.\n  inline void set_empty_edge(EDGE_REF edge_ref) {\n    (edges_[edge_ref] = next_node_mask_);\n  }\n  /// Goes through all the edges and clears each one out.\n  inline void clear_all_edges() {\n    for (int edge = 0; edge < num_edges_; edge++) {\n      set_empty_edge(edge);\n    }\n  }\n  /// Clears the last flag of this edge.\n  inline void clear_marker_flag(EDGE_REF edge_ref) {\n    (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));\n  }\n  /// Returns true if this edge is in the forward direction.\n  inline bool forward_edge(EDGE_REF edge_ref) const {\n    return (edge_occupied(edge_ref) &&\n            (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));\n  }\n  /// Returns true if this edge is in the backward direction.\n  inline bool backward_edge(EDGE_REF edge_ref) const {\n    return (edge_occupied(edge_ref) &&\n            (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));\n  }\n  /// Returns true if the edge spot in this location is occupied.\n  inline bool edge_occupied(EDGE_REF edge_ref) const {\n    return (edges_[edge_ref] != next_node_mask_);\n  }\n  /// Returns true if this edge is the last edge in a sequence.\n  inline bool last_edge(EDGE_REF 
edge_ref) const {\n    return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;\n  }\n\n  /// Counts and returns the number of forward edges in this node.\n  int32_t num_forward_edges(NODE_REF node) const;\n\n  /// Reads SquishedDawg from a file.\n  bool read_squished_dawg(TFile *file);\n\n  /// Prints the contents of an edge indicated by the given EDGE_REF.\n  void print_edge(EDGE_REF edge) const;\n\n  /// Prints the contents of the SquishedDawg.\n  void print_all(const char *msg) {\n    tprintf(\"\\n__________________________\\n%s\\n\", msg);\n    for (int i = 0; i < num_edges_; ++i) {\n      print_edge(i);\n    }\n    tprintf(\"__________________________\\n\");\n  }\n  /// Constructs a mapping from the memory node indices to disk node indices.\n  std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const;\n\n  // Member variables.\n  EDGE_ARRAY edges_ = nullptr;\n  int32_t num_edges_ = 0;\n  int num_forward_edges_in_node0 = 0;\n};\n\n} // namespace tesseract\n\n#endif // DICT_DAWG_H_\n"
  },
  {
    "path": "src/dict/dawg_cache.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dawg_cache.cpp\n// Description: A class that knows about loading and caching dawgs.\n// Author:      David Eger\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"dawg_cache.h\"\n\n#include \"dawg.h\"\n#include \"object_cache.h\"\n#include \"tessdatamanager.h\"\n\nnamespace tesseract {\n\nstruct DawgLoader {\n  DawgLoader(const std::string &lang, TessdataType tessdata_dawg_type, int dawg_debug_level,\n             TessdataManager *data_file)\n      : lang_(lang)\n      , data_file_(data_file)\n      , tessdata_dawg_type_(tessdata_dawg_type)\n      , dawg_debug_level_(dawg_debug_level) {}\n\n  Dawg *Load();\n\n  std::string lang_;\n  TessdataManager *data_file_;\n  TessdataType tessdata_dawg_type_;\n  int dawg_debug_level_;\n};\n\nDawg *DawgCache::GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type,\n                                 int debug_level, TessdataManager *data_file) {\n  std::string data_id = data_file->GetDataFileName();\n  data_id += kTessdataFileSuffixes[tessdata_dawg_type];\n  DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file);\n  return dawgs_.Get(data_id, std::bind(&DawgLoader::Load, &loader));\n}\n\nDawg *DawgLoader::Load() {\n  TFile fp;\n  if 
(!data_file_->GetComponent(tessdata_dawg_type_, &fp)) {\n    return nullptr;\n  }\n  DawgType dawg_type;\n  PermuterType perm_type;\n  switch (tessdata_dawg_type_) {\n    case TESSDATA_PUNC_DAWG:\n    case TESSDATA_LSTM_PUNC_DAWG:\n      dawg_type = DAWG_TYPE_PUNCTUATION;\n      perm_type = PUNC_PERM;\n      break;\n    case TESSDATA_SYSTEM_DAWG:\n    case TESSDATA_LSTM_SYSTEM_DAWG:\n      dawg_type = DAWG_TYPE_WORD;\n      perm_type = SYSTEM_DAWG_PERM;\n      break;\n    case TESSDATA_NUMBER_DAWG:\n    case TESSDATA_LSTM_NUMBER_DAWG:\n      dawg_type = DAWG_TYPE_NUMBER;\n      perm_type = NUMBER_PERM;\n      break;\n    case TESSDATA_BIGRAM_DAWG:\n      dawg_type = DAWG_TYPE_WORD; // doesn't actually matter\n      perm_type = COMPOUND_PERM;  // doesn't actually matter\n      break;\n    case TESSDATA_UNAMBIG_DAWG:\n      dawg_type = DAWG_TYPE_WORD;\n      perm_type = SYSTEM_DAWG_PERM;\n      break;\n    case TESSDATA_FREQ_DAWG:\n      dawg_type = DAWG_TYPE_WORD;\n      perm_type = FREQ_DAWG_PERM;\n      break;\n    default:\n      return nullptr;\n  }\n  auto *retval = new SquishedDawg(dawg_type, lang_, perm_type, dawg_debug_level_);\n  if (retval->Load(&fp)) {\n    return retval;\n  }\n  delete retval;\n  return nullptr;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/dict/dawg_cache.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dawg_cache.h\n// Description: A class that knows about loading and caching dawgs.\n// Author:      David Eger\n// Created:     Fri Jan 27 12:08:00 PST 2012\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_DICT_DAWG_CACHE_H_\n#define TESSERACT_DICT_DAWG_CACHE_H_\n\n#include \"dawg.h\"\n#include \"object_cache.h\"\n#include \"tessdatamanager.h\"\n\nnamespace tesseract {\n\nclass DawgCache {\npublic:\n  Dawg *GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level,\n                        TessdataManager *data_file);\n\n  // If we manage the given dawg, decrement its count,\n  // and possibly delete it if the count reaches zero.\n  // If dawg is unknown to us, return false.\n  bool FreeDawg(Dawg *dawg) {\n    return dawgs_.Free(dawg);\n  }\n\n  // Free up any currently unused dawgs.\n  void DeleteUnusedDawgs() {\n    dawgs_.DeleteUnusedObjects();\n  }\n\nprivate:\n  ObjectCache<Dawg> dawgs_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_DICT_DAWG_CACHE_H_\n"
  },
  {
    "path": "src/dict/dict.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dict.cpp\n// Description: dict class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"dict.h\"\n\n#include \"tesserrstream.h\"  // for tesserr\n#include \"tprintf.h\"\n\n#include <cstdio>\n\nnamespace tesseract {\n\nclass Image;\n\nDict::Dict(CCUtil *ccutil)\n    : letter_is_okay_(&tesseract::Dict::def_letter_is_okay)\n    , probability_in_context_(&tesseract::Dict::def_probability_in_context)\n    , ccutil_(ccutil)\n    , wildcard_unichar_id_(INVALID_UNICHAR_ID)\n    , apostrophe_unichar_id_(INVALID_UNICHAR_ID)\n    , question_unichar_id_(INVALID_UNICHAR_ID)\n    , slash_unichar_id_(INVALID_UNICHAR_ID)\n    , hyphen_unichar_id_(INVALID_UNICHAR_ID)\n    , STRING_MEMBER(user_words_file, \"\", \"A filename of user-provided words.\",\n                    getCCUtil()->params())\n    , STRING_INIT_MEMBER(user_words_suffix, \"\",\n                         \"A suffix of user-provided words located in tessdata.\",\n                         getCCUtil()->params())\n    , STRING_MEMBER(user_patterns_file, \"\", \"A filename of user-provided patterns.\",\n                    getCCUtil()->params())\n    , STRING_INIT_MEMBER(user_patterns_suffix, \"\",\n                         \"A suffix of user-provided patterns located in \"\n                         \"tessdata.\",\n                         getCCUtil()->params())\n    , BOOL_INIT_MEMBER(load_system_dawg, true, \"Load system word dawg.\", getCCUtil()->params())\n    , BOOL_INIT_MEMBER(load_freq_dawg, true, \"Load frequent word dawg.\", getCCUtil()->params())\n    , BOOL_INIT_MEMBER(load_unambig_dawg, true, \"Load unambiguous word dawg.\",\n                       getCCUtil()->params())\n    , BOOL_INIT_MEMBER(load_punc_dawg, true,\n                       \"Load dawg with punctuation\"\n                       \" patterns.\",\n                       getCCUtil()->params())\n    , BOOL_INIT_MEMBER(load_number_dawg, true,\n                       \"Load dawg with number\"\n                       \" patterns.\",\n                       getCCUtil()->params())\n    , BOOL_INIT_MEMBER(load_bigram_dawg, true,\n                       \"Load dawg with special word \"\n                       \"bigrams.\",\n                       getCCUtil()->params())\n    , double_MEMBER(xheight_penalty_subscripts, 0.125,\n                    \"Score penalty (0.1 = 10%) added if there are subscripts \"\n                    \"or superscripts in a word, but it is otherwise OK.\",\n                    getCCUtil()->params())\n    , double_MEMBER(xheight_penalty_inconsistent, 0.25,\n                    \"Score penalty (0.1 = 10%) added if an xheight is \"\n                    \"inconsistent.\",\n                    getCCUtil()->params())\n    , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,\n                    \"Score multiplier for word matches which have good case and\"\n                    \" are frequent in the given language (lower is better).\",\n                    getCCUtil()->params())\n    , double_MEMBER(segment_penalty_dict_case_ok, 1.1,\n                    \"Score multiplier for word matches that have good case \"\n                    \"(lower is better).\",\n                    getCCUtil()->params())\n    , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,\n                    \"Default score multiplier for word matches, which may have \"\n                    \"case issues (lower is better).\",\n                    getCCUtil()->params())\n    , double_MEMBER(segment_penalty_dict_nonword, 1.25,\n                    \"Score multiplier for glyph fragment segmentations which \"\n                    \"do not match a dictionary word (lower is better).\",\n                    getCCUtil()->params())\n    , double_MEMBER(segment_penalty_garbage, 1.50,\n                    \"Score multiplier for poorly cased strings that are not in\"\n                    \" the dictionary and generally look like garbage (lower is\"\n                    \" better).\",\n                    getCCUtil()->params())\n    , STRING_MEMBER(output_ambig_words_file, \"\",\n                    \"Output file for ambiguities found in the dictionary\", getCCUtil()->params())\n    , INT_MEMBER(dawg_debug_level, 0,\n                 \"Set to 1 for general debug info\"\n                 \", to 2 for more details, to 3 to see all the debug messages\",\n                 getCCUtil()->params())\n    , INT_MEMBER(hyphen_debug_level, 0, \"Debug level for hyphenated words.\", getCCUtil()->params())\n    , BOOL_MEMBER(use_only_first_uft8_step, false,\n                  \"Use only the first UTF8 step of the given string\"\n                  \" when computing log probabilities.\",\n                  getCCUtil()->params())\n    , double_MEMBER(certainty_scale, 20.0, \"Certainty scaling factor\", getCCUtil()->params())\n    , double_MEMBER(stopper_nondict_certainty_base, -2.50, \"Certainty threshold for non-dict words\",\n                    getCCUtil()->params())\n    , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, \"Reject certainty offset\",\n                    getCCUtil()->params())\n    , INT_MEMBER(stopper_smallword_size, 2, \"Size of dict word to be treated as non-dict word\",\n                 getCCUtil()->params())\n    , double_MEMBER(stopper_certainty_per_char, -0.50,\n                    \"Certainty to add\"\n                    \" for each dict char above small word size.\",\n                    getCCUtil()->params())\n    , double_MEMBER(stopper_allowable_character_badness, 3.0,\n                    \"Max certainty variation allowed in a word (in sigma)\", getCCUtil()->params())\n    , INT_MEMBER(stopper_debug_level, 0, \"Stopper debug level\", getCCUtil()->params())\n    , BOOL_MEMBER(stopper_no_acceptable_choices, false,\n                  \"Make AcceptableChoice() always return false. Useful\"\n                  \" when there is a need to explore all segmentations\",\n                  getCCUtil()->params())\n    , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, \"Max words to keep in list\",\n                 getCCUtil()->params())\n    , STRING_MEMBER(word_to_debug, \"\",\n                    \"Word for which stopper debug\"\n                    \" information should be printed to stdout\",\n                    getCCUtil()->params())\n    , BOOL_MEMBER(segment_nonalphabetic_script, false,\n                  \"Don't use any alphabetic-specific tricks.\"\n                  \" Set to true in the traineddata config file for\"\n                  \" scripts that are cursive or inherently fixed-pitch\",\n                  getCCUtil()->params())\n    , BOOL_MEMBER(save_doc_words, 0, \"Save Document Words\", getCCUtil()->params())\n    , double_MEMBER(doc_dict_pending_threshold, 0.0, \"Worst certainty for using pending dictionary\",\n                    getCCUtil()->params())\n    , double_MEMBER(doc_dict_certainty_threshold, -2.25,\n                    \"Worst certainty for words that can be inserted into the\"\n                    \" document dictionary\",\n                    getCCUtil()->params())\n    , INT_MEMBER(max_permuter_attempts, 10000,\n                 \"Maximum number of different\"\n                 \" character choices to consider during permutation.\"\n                 \" This limit is especially useful when user patterns\"\n                 \" are specified, since overly generic patterns can result in\"\n                 \" dawg search exploring an overly large number of options.\",\n                 getCCUtil()->params()) {\n  reject_offset_ = 0.0;\n  go_deeper_fxn_ = nullptr;\n  hyphen_word_ = nullptr;\n  last_word_on_line_ = false;\n  document_words_ = nullptr;\n  dawg_cache_ = nullptr;\n  dawg_cache_is_ours_ = false;\n  pending_words_ = nullptr;\n  bigram_dawg_ = nullptr;\n  freq_dawg_ = nullptr;\n  punc_dawg_ = nullptr;\n  unambig_dawg_ = nullptr;\n  wordseg_rating_adjust_factor_ = -1.0f;\n  output_ambig_words_file_ = nullptr;\n}\n\nDict::~Dict() {\n  End();\n  delete hyphen_word_;\n  if (output_ambig_words_file_ != nullptr) {\n    fclose(output_ambig_words_file_);\n  }\n}\n\nDawgCache *Dict::GlobalDawgCache() {\n  // This global cache (a singleton) will outlive every Tesseract instance\n  // (even those that someone else might declare as global static variables).\n  static DawgCache cache;\n  return &cache;\n}\n\n// Sets up ready for a Load or LoadLSTM.\nvoid Dict::SetupForLoad(DawgCache *dawg_cache) {\n  if (dawgs_.size() != 0) {\n    this->End();\n  }\n\n  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);\n  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);\n  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);\n  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);\n\n  if (dawg_cache != nullptr) {\n    dawg_cache_ = dawg_cache;\n    dawg_cache_is_ours_ = false;\n  } else {\n    dawg_cache_ = new DawgCache();\n    dawg_cache_is_ours_ = true;\n  }\n}\n\n// Loads the dawgs needed by Tesseract. Call FinishLoad() after.\nvoid Dict::Load(const std::string &lang, TessdataManager *data_file) {\n  // Load dawgs_.\n  if (load_punc_dawg) {\n    punc_dawg_ =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);\n    if (punc_dawg_) {\n      dawgs_.push_back(punc_dawg_);\n    }\n  }\n  if (load_system_dawg) {\n    Dawg *system_dawg =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);\n    if (system_dawg) {\n      dawgs_.push_back(system_dawg);\n    }\n  }\n  if (load_number_dawg) {\n    Dawg *number_dawg =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);\n    if (number_dawg) {\n      dawgs_.push_back(number_dawg);\n    }\n  }\n  if (load_bigram_dawg) {\n    bigram_dawg_ =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);\n    // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the\n    // dawgs_!!\n  }\n  if (load_freq_dawg) {\n    freq_dawg_ =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);\n    if (freq_dawg_) {\n      dawgs_.push_back(freq_dawg_);\n    }\n  }\n  if (load_unambig_dawg) {\n    unambig_dawg_ =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);\n    if (unambig_dawg_) {\n      dawgs_.push_back(unambig_dawg_);\n    }\n  }\n\n  std::string name;\n  if (!user_words_suffix.empty() || !user_words_file.empty()) {\n    Trie *trie_ptr =\n        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);\n    if (!user_words_file.empty()) {\n      name = user_words_file;\n    } else {\n      name = getCCUtil()->language_data_path_prefix;\n      name += user_words_suffix;\n    }\n    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),\n                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {\n      tprintf(\"Error: failed to load %s\\n\", name.c_str());\n      delete trie_ptr;\n    } else {\n      dawgs_.push_back(trie_ptr);\n    }\n  }\n\n  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {\n    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),\n                              dawg_debug_level);\n    trie_ptr->initialize_patterns(&(getUnicharset()));\n    if (!user_patterns_file.empty()) {\n      name = user_patterns_file;\n    } else {\n      name = getCCUtil()->language_data_path_prefix;\n      name += user_patterns_suffix;\n    }\n    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {\n      tprintf(\"Error: failed to load %s\\n\", name.c_str());\n      delete trie_ptr;\n    } else {\n      dawgs_.push_back(trie_ptr);\n    }\n  }\n\n  document_words_ =\n      new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);\n  dawgs_.push_back(document_words_);\n\n  // This dawg is temporary and should not be searched by letter_is_ok.\n  pending_words_ =\n      new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);\n}\n\n// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.\nvoid Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {\n  // Load dawgs_.\n  if (load_punc_dawg) {\n    punc_dawg_ =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);\n    if (punc_dawg_) {\n      dawgs_.push_back(punc_dawg_);\n    }\n  }\n  if (load_system_dawg) {\n    Dawg *system_dawg =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);\n    if (system_dawg) {\n      dawgs_.push_back(system_dawg);\n    }\n  }\n  if (load_number_dawg) {\n    Dawg *number_dawg =\n        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);\n    if (number_dawg) {\n      dawgs_.push_back(number_dawg);\n    }\n  }\n\n  // stolen from Dict::Load (but needs params_ from Tesseract\n  // langdata/config/api):\n  std::string name;\n  if (!user_words_suffix.empty() || !user_words_file.empty()) {\n    Trie *trie_ptr =\n        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);\n    if (!user_words_file.empty()) {\n      name = user_words_file;\n    } else {\n      name = getCCUtil()->language_data_path_prefix;\n      name += user_words_suffix;\n    }\n    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),\n                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {\n      tprintf(\"Error: failed to load %s\\n\", name.c_str());\n      delete trie_ptr;\n    } else {\n      dawgs_.push_back(trie_ptr);\n    }\n  }\n\n  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {\n    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),\n                              dawg_debug_level);\n    trie_ptr->initialize_patterns(&(getUnicharset()));\n    if (!user_patterns_file.empty()) {\n      name = user_patterns_file;\n    } else {\n      name = getCCUtil()->language_data_path_prefix;\n      name += user_patterns_suffix;\n    }\n    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {\n      tprintf(\"Error: failed to load %s\\n\", name.c_str());\n      delete trie_ptr;\n    } else {\n      dawgs_.push_back(trie_ptr);\n    }\n  }\n}\n\n// Completes the loading process after Load() and/or LoadLSTM().\n// Returns false if no dictionaries were loaded.\nbool Dict::FinishLoad() {\n  if (dawgs_.empty()) {\n    return false;\n  }\n  // Construct a list of corresponding successors for each dawg. Each entry, i,\n  // in the successors_ vector is a vector of integers that represent the\n  // indices into the dawgs_ vector of the successors for dawg i.\n  successors_.reserve(dawgs_.size());\n  for (auto dawg : dawgs_) {\n    auto *lst = new SuccessorList();\n    for (unsigned j = 0; j < dawgs_.size(); ++j) {\n      const Dawg *other = dawgs_[j];\n      if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&\n          kDawgSuccessors[dawg->type()][other->type()]) {\n        lst->push_back(j);\n      }\n    }\n    successors_.push_back(lst);\n  }\n  return true;\n}\n\nvoid Dict::End() {\n  if (dawgs_.empty()) {\n    return; // Not safe to call twice.\n  }\n  for (auto &dawg : dawgs_) {\n    if (!dawg_cache_->FreeDawg(dawg)) {\n      delete dawg;\n    }\n  }\n  dawg_cache_->FreeDawg(bigram_dawg_);\n  if (dawg_cache_is_ours_) {\n    delete dawg_cache_;\n    dawg_cache_ = nullptr;\n  }\n  for (auto successor : successors_) {\n    delete successor;\n  }\n  dawgs_.clear();\n  successors_.clear();\n  document_words_ = nullptr;\n  delete pending_words_;\n  pending_words_ = nullptr;\n}\n\n// Returns true if in light of the current state unichar_id is allowed\n// according to at least one of the dawgs in the dawgs_ vector.\n// See more extensive comments in dict.h where this function is declared.\nint Dict::def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset,\n                             UNICHAR_ID unichar_id, bool word_end) const {\n  auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);\n\n  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));\n\n  if (dawg_debug_level >= 3) {\n    tesserr << \"def_letter_is_okay: current unichar=\"\n            << getUnicharset().debug_str(unichar_id)\n            << \" word_end=\" << word_end\n            << \" num active dawgs=\" << dawg_args->active_dawgs->size() << '\\n';\n  }\n\n  // Do not accept words that contain kPatternUnicharID.\n  // (otherwise pattern dawgs would not function correctly).\n  // Do not accept words containing INVALID_UNICHAR_IDs.\n  if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {\n    dawg_args->permuter = NO_PERM;\n    return NO_PERM;\n  }\n\n  // Initialization.\n  PermuterType curr_perm = NO_PERM;\n  dawg_args->updated_dawgs->clear();\n  dawg_args->valid_end = false;\n\n  // Go over the active_dawgs vector and insert DawgPosition records\n  // with the updated ref (an edge with the corresponding unichar id) into\n  // dawg_args->updated_pos.\n  for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {\n    const DawgPosition &pos = (*dawg_args->active_dawgs)[a];\n    const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;\n    const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;\n\n    if (!dawg && !punc_dawg) {\n      // shouldn't happen.\n      tprintf(\"Received DawgPosition with no dawg or punc_dawg.  wth?\\n\");\n      continue;\n    }\n    if (!dawg) {\n      // We're in the punctuation dawg.  A core dawg has not been chosen.\n      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);\n      EDGE_REF punc_transition_edge =\n          punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);\n      if (punc_transition_edge != NO_EDGE) {\n        // Find all successors, and see which can transition.\n        const SuccessorList &slist = *(successors_[pos.punc_index]);\n        for (int sdawg_index : slist) {\n          const Dawg *sdawg = dawgs_[sdawg_index];\n          UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);\n          EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);\n          if (dawg_edge != NO_EDGE) {\n            if (dawg_debug_level >= 3) {\n              tprintf(\"Letter found in dawg %d\\n\", sdawg_index);\n            }\n            dawg_args->updated_dawgs->add_unique(\n                DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),\n                dawg_debug_level > 0, \"Append transition from punc dawg to current dawgs: \");\n            if (sdawg->permuter() > curr_perm) {\n              curr_perm = sdawg->permuter();\n            }\n            if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {\n              dawg_args->valid_end = true;\n            }\n          }\n        }\n      }\n      EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);\n      if (punc_edge != NO_EDGE) {\n        if (dawg_debug_level >= 3) {\n          tprintf(\"Letter found in punctuation dawg\\n\");\n        }\n        dawg_args->updated_dawgs->add_unique(\n            DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,\n            \"Extend punctuation dawg: \");\n        if (PUNC_PERM > curr_perm) {\n          curr_perm = PUNC_PERM;\n        }\n        if (punc_dawg->end_of_word(punc_edge)) {\n          dawg_args->valid_end = true;\n        }\n      }\n      continue;\n    }\n\n    if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {\n      // We can end the main word here.\n      //  If we can continue on the punc ref, add that possibility.\n      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);\n      EDGE_REF punc_edge =\n          punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);\n      if (punc_edge != NO_EDGE) {\n        dawg_args->updated_dawgs->add_unique(\n            DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),\n            dawg_debug_level > 0, \"Return to punctuation dawg: \");\n        if (dawg->permuter() > curr_perm) {\n          curr_perm = dawg->permuter();\n        }\n        if (punc_dawg->end_of_word(punc_edge)) {\n          dawg_args->valid_end = true;\n        }\n      }\n    }\n\n    if (pos.back_to_punc) {\n      continue;\n    }\n\n    // If we are dealing with the pattern dawg, look up all the\n    // possible edges, not only for the exact unichar_id, but also\n    // for all its character classes (alpha, digit, etc).\n    if (dawg->type() == DAWG_TYPE_PATTERN) {\n      ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);\n      // There can't be any successors to dawg that is of type\n      // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.\n      continue;\n    }\n\n    // Find the edge out of the node for the unichar_id.\n    NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);\n    EDGE_REF edge =\n        (node == NO_EDGE)\n            ? NO_EDGE\n            : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);\n\n    if (dawg_debug_level >= 3) {\n      tprintf(\"Active dawg: [%d, \" REFFORMAT \"] edge=\" REFFORMAT \"\\n\", pos.dawg_index, node, edge);\n    }\n\n    if (edge != NO_EDGE) { // the unichar was found in the current dawg\n      if (dawg_debug_level >= 3) {\n        tprintf(\"Letter found in dawg %d\\n\", pos.dawg_index);\n      }\n      if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {\n        if (dawg_debug_level >= 3) {\n          tprintf(\"Punctuation constraint not satisfied at end of word.\\n\");\n        }\n        continue;\n      }\n      if (dawg->permuter() > curr_perm) {\n        curr_perm = dawg->permuter();\n      }\n      if (dawg->end_of_word(edge) &&\n          (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {\n        dawg_args->valid_end = true;\n      }\n      dawg_args->updated_dawgs->add_unique(\n          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),\n          dawg_debug_level > 0, \"Append current dawg to updated active dawgs: \");\n    }\n  } // end for\n  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM\n  // or if we found the current letter in a non-punctuation dawg. This\n  // allows preserving information on which dawg the \"core\" word came from.\n  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.\n  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||\n      (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {\n    dawg_args->permuter = curr_perm;\n  }\n  if (dawg_debug_level >= 2) {\n    tprintf(\"Returning %d for permuter code for this character.\\n\", dawg_args->permuter);\n  }\n  return dawg_args->permuter;\n}\n\nvoid Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHAR_ID unichar_id,\n                               bool word_end, DawgArgs *dawg_args, PermuterType *curr_perm) const {\n  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);\n  // Try to find the edge corresponding to the exact unichar_id and to all the\n  // edges corresponding to the character class of unichar_id.\n  std::vector<UNICHAR_ID> unichar_id_patterns;\n  unichar_id_patterns.push_back(unichar_id);\n  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);\n  for (int unichar_id_pattern : unichar_id_patterns) {\n    // On the first iteration check all the outgoing edges.\n    // On the second iteration check all self-loops.\n    for (int k = 0; k < 2; ++k) {\n      EDGE_REF edge = (k == 0)\n                          ? dawg->edge_char_of(node, unichar_id_pattern, word_end)\n                          : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);\n      if (edge == NO_EDGE) {\n        continue;\n      }\n      if (dawg_debug_level >= 3) {\n        tprintf(\"Pattern dawg: [%d, \" REFFORMAT \"] edge=\" REFFORMAT \"\\n\", pos.dawg_index, node,\n                edge);\n        tprintf(\"Letter found in pattern dawg %d\\n\", pos.dawg_index);\n      }\n      if (dawg->permuter() > *curr_perm) {\n        *curr_perm = dawg->permuter();\n      }\n      if (dawg->end_of_word(edge)) {\n        dawg_args->valid_end = true;\n      }\n      dawg_args->updated_dawgs->add_unique(\n          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),\n          dawg_debug_level > 0, \"Append current dawg to updated active dawgs: \");\n    }\n  }\n}\n\n// Fill the given active_dawgs vector with dawgs that could contain the\n// beginning of the word. If hyphenated() returns true, copy the entries\n// from hyphen_active_dawgs_ instead.\nvoid Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const {\n  if (hyphenated()) {\n    *active_dawgs = hyphen_active_dawgs_;\n    if (dawg_debug_level >= 3) {\n      for (const auto &dawg : hyphen_active_dawgs_) {\n        tprintf(\"Adding hyphen beginning dawg [%d, \" REFFORMAT \"]\\n\",\n                dawg.dawg_index, dawg.dawg_ref);\n      }\n    }\n  } else {\n    default_dawgs(active_dawgs, ambigs_mode);\n  }\n}\n\nvoid Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_patterns) const {\n  bool punc_dawg_available = (punc_dawg_ != nullptr) &&\n                             punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;\n\n  for (unsigned i = 0; i < dawgs_.size(); i++) {\n    if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {\n      int dawg_ty = dawgs_[i]->type();\n      bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];\n      if (dawg_ty == DAWG_TYPE_PUNCTUATION) {\n        dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));\n        if (dawg_debug_level >= 3) {\n          tprintf(\"Adding beginning punc dawg [%u, \" REFFORMAT \"]\\n\", i, NO_EDGE);\n        }\n      } else if (!punc_dawg_available || !subsumed_by_punc) {\n        dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));\n        if (dawg_debug_level >= 3) {\n          tprintf(\"Adding beginning dawg [%u, \" REFFORMAT \"]\\n\", i, NO_EDGE);\n        }\n      }\n    }\n  }\n}\n\nvoid Dict::add_document_word(const WERD_CHOICE &best_choice) {\n  // Do not add hyphenated word parts to the document dawg.\n  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is\n  // called when the first part of the hyphenated word is\n  // discovered and while the second part of the word is recognized.\n  // hyphen_word_ is cleared in cc_recg() before the next word on\n  // the line is recognized.\n  if (hyphen_word_) {\n    return;\n  }\n\n  int stringlen = best_choice.length();\n\n  if (valid_word(best_choice) || stringlen < 2) {\n    return;\n  }\n\n  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.\n  if (best_choice.length() >= kDocDictMaxRepChars) {\n    int num_rep_chars = 1;\n    UNICHAR_ID uch_id = best_choice.unichar_id(0);\n    for (unsigned i = 1; i < best_choice.length(); ++i) {\n      if (best_choice.unichar_id(i) != uch_id) {\n        num_rep_chars = 1;\n        uch_id = best_choice.unichar_id(i);\n      } else {\n        ++num_rep_chars;\n        if (num_rep_chars == kDocDictMaxRepChars) {\n          return;\n        }\n      }\n    }\n  }\n\n  if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {\n    if (best_choice.certainty() < doc_dict_pending_threshold) {\n      return;\n    }\n\n    if (!pending_words_->word_in_dawg(best_choice)) {\n      if (stringlen > 2 ||\n          (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&\n           getUnicharset().get_isupper(best_choice.unichar_id(1)))) {\n        pending_words_->add_word_to_dawg(best_choice);\n      }\n      return;\n    }\n  }\n\n  if (save_doc_words) {\n    std::string filename(getCCUtil()->imagefile);\n    filename += \".doc\";\n    FILE *doc_word_file = fopen(filename.c_str(), \"a\");\n    if (doc_word_file == nullptr) {\n      tprintf(\"Error: Could not open file %s\\n\", filename.c_str());\n      ASSERT_HOST(doc_word_file);\n    }\n    fprintf(doc_word_file, \"%s\\n\", best_choice.debug_string().c_str());\n    fclose(doc_word_file);\n  }\n  document_words_->add_word_to_dawg(best_choice);\n}\n\nvoid Dict::adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,\n                       float additional_adjust, bool modify_rating, bool debug) {\n  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&\n                 word->GetTopScriptID() == getUnicharset().han_sid());\n  bool case_is_ok = (is_han || case_ok(*word));\n  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));\n\n  float adjust_factor = additional_adjust;\n  float new_rating = word->rating();\n  new_rating += kRatingPad;\n  const char *xheight_triggered = \"\";\n  if (word->length() > 1) {\n    // Calculate x-height and y-offset consistency penalties.\n    switch (xheight_consistency) {\n      case XH_INCONSISTENT:\n        adjust_factor += xheight_penalty_inconsistent;\n        xheight_triggered = \", xhtBAD\";\n        break;\n      case XH_SUBNORMAL:\n        adjust_factor += xheight_penalty_subscripts;\n        xheight_triggered = \", xhtSUB\";\n        break;\n      case XH_GOOD:\n        // leave the factor alone - all good!\n        break;\n    }\n    // TODO(eger): if nonword is true, but there is a \"core\" that is a dict\n    // word, negate nonword status.\n  } else {\n    if (debug) {\n      tprintf(\"Consistency could not be calculated.\\n\");\n    }\n  }\n  if (debug) {\n    tprintf(\"%sWord: %s %4.2f%s\", nonword ? \"Non-\" : \"\", word->unichar_string().c_str(),\n            word->rating(), xheight_triggered);\n  }\n\n  if (nonword) { // non-dictionary word\n    if (case_is_ok && punc_is_ok) {\n      adjust_factor += segment_penalty_dict_nonword;\n      new_rating *= adjust_factor;\n      if (debug) {\n        tprintf(\", W\");\n      }\n    } else {\n      adjust_factor += segment_penalty_garbage;\n      new_rating *= adjust_factor;\n      if (debug) {\n        if (!case_is_ok) {\n          tprintf(\", C\");\n        }\n        if (!punc_is_ok) {\n          tprintf(\", P\");\n        }\n      }\n    }\n  } else { // dictionary word\n    if (case_is_ok) {\n      if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {\n        word->set_permuter(FREQ_DAWG_PERM);\n        adjust_factor += segment_penalty_dict_frequent_word;\n        new_rating *= adjust_factor;\n        if (debug) {\n          tprintf(\", F\");\n        }\n      } else {\n        adjust_factor += segment_penalty_dict_case_ok;\n        new_rating *= adjust_factor;\n        if (debug) {\n          tprintf(\", \");\n        }\n      }\n    } else {\n      adjust_factor += segment_penalty_dict_case_bad;\n      new_rating *= adjust_factor;\n      if (debug) {\n        tprintf(\", C\");\n      }\n    }\n  }\n  new_rating -= kRatingPad;\n  if (modify_rating) {\n    word->set_rating(new_rating);\n  }\n  if (debug) {\n    tprintf(\" %4.2f --> %4.2f\\n\", adjust_factor, new_rating);\n  }\n  word->set_adjust_factor(adjust_factor);\n}\n\nint Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {\n  const WERD_CHOICE *word_ptr = &word;\n  WERD_CHOICE temp_word(word.unicharset());\n  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {\n    copy_hyphen_info(&temp_word);\n    temp_word += word;\n    word_ptr = &temp_word;\n  }\n  if (word_ptr->empty()) {\n    return NO_PERM;\n  }\n  // Allocate vectors for holding current and updated\n  // active_dawgs and initialize them.\n  DawgPositionVector active_dawgs[2];\n  init_active_dawgs(&(active_dawgs[0]), false);\n  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);\n  int last_index = word_ptr->length() - 1;\n  // Call letter_is_okay for each letter in the word.\n  for (int i = hyphen_base_size(); i <= last_index; ++i) {\n    if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),\n                                   i == last_index))) {\n      break;\n    }\n    // Swap active_dawgs, constraints with the corresponding updated vector.\n    if (dawg_args.updated_dawgs == &(active_dawgs[1])) {\n      dawg_args.updated_dawgs = &(active_dawgs[0]);\n      ++(dawg_args.active_dawgs);\n    } else {\n      ++(dawg_args.updated_dawgs);\n      dawg_args.active_dawgs = &(active_dawgs[0]);\n    }\n  }\n  return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;\n}\n\nbool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const {\n  if (bigram_dawg_ == nullptr) {\n    return false;\n  }\n\n  // Extract the core word from the middle of each word with any digits\n  //         replaced with question marks.\n  unsigned w1start, w1end, w2start, w2end;\n  word1.punct_stripped(&w1start, &w1end);\n  word2.punct_stripped(&w2start, &w2end);\n\n  // We don't want to penalize a single guillemet, hyphen, etc.\n  // But our bigram list doesn't have any information about punctuation.\n  if (w1start >= w1end) {\n    return word1.length() < 3;\n  }\n  if (w2start >= w2end) {\n    return word2.length() < 3;\n  }\n\n  const UNICHARSET &uchset = getUnicharset();\n  std::vector<UNICHAR_ID> bigram_string;\n  bigram_string.reserve(w1end + w2end + 1);\n  for (auto i = w1start; i < w1end; i++) {\n    const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));\n    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {\n      bigram_string.push_back(question_unichar_id_);\n    } else {\n      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());\n    }\n  }\n  bigram_string.push_back(UNICHAR_SPACE);\n  for (auto i = w2start; i < w2end; i++) {\n    const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));\n    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {\n      bigram_string.push_back(question_unichar_id_);\n    } else {\n      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());\n    }\n  }\n  WERD_CHOICE normalized_word(&uchset, bigram_string.size());\n  for (int i : bigram_string) {\n    normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f);\n  }\n  return bigram_dawg_->word_in_dawg(normalized_word);\n}\n\nbool Dict::valid_punctuation(const WERD_CHOICE &word) {\n  if (word.empty()) {\n    return NO_PERM;\n  }\n  WERD_CHOICE new_word(word.unicharset());\n  auto last_index = word.length() - 1;\n  int new_len;\n  for (unsigned i = 0; i <= last_index; ++i) {\n    UNICHAR_ID unichar_id = (word.unichar_id(i));\n    if (getUnicharset().get_ispunctuation(unichar_id)) {\n      new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);\n    } else if (!getUnicharset().get_isalpha(unichar_id) &&\n               !getUnicharset().get_isdigit(unichar_id)) {\n      return false; // neither punc, nor alpha, nor digit\n    } else if ((new_len = new_word.length()) == 0 ||\n               new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {\n      new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);\n    }\n  }\n  for (auto dawg : dawgs_) {\n    if (dawg != nullptr && dawg->type() == DAWG_TYPE_PUNCTUATION &&\n        dawg->word_in_dawg(new_word)) {\n      return true;\n    }\n  }\n  return false;\n}\n\n/// Returns true if the language is space-delimited (not CJ, or T).\nbool Dict::IsSpaceDelimitedLang() const {\n  const UNICHARSET &u_set = getUnicharset();\n  if (u_set.han_sid() > 0) {\n    return false;\n  }\n  if (u_set.katakana_sid() > 0) {\n    return false;\n  }\n  if (u_set.thai_sid() > 0) {\n    return false;\n  }\n  return true;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/dict/dict.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dict.h\n// Description: dict class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_DICT_DICT_H_\n#define TESSERACT_DICT_DICT_H_\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"ambigs.h\"\n#endif\n#include \"dawg.h\"\n#include \"dawg_cache.h\"\n#include \"ratngs.h\"\n#include \"stopper.h\"\n#include \"trie.h\"\n#include \"unicharset.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"params_training_featdef.h\"\n#endif // ndef DISABLED_LEGACY_ENGINE\n\nnamespace tesseract {\n\nclass MATRIX;\nclass WERD_RES;\n\n#define CHARS_PER_LINE 500\n#define MAX_WERD_LENGTH (int64_t)128\n#define NO_RATING -1\n\n/** Struct used to hold temporary information about fragments. 
*/\nstruct CHAR_FRAGMENT_INFO {\n  UNICHAR_ID unichar_id;\n  const CHAR_FRAGMENT *fragment;\n  int num_fragments;\n  float rating;\n  float certainty;\n};\n\nusing DawgVector = std::vector<Dawg *>;\n\n//\n// Constants\n//\nstatic const int kRatingPad = 4;\nstatic const int kDictMaxWildcards = 2; // max wildcards for a word\n// TODO(daria): If hyphens are different in different languages and can be\n// inferred from training data we should load their values dynamically.\nstatic const char kHyphenSymbol[] = \"-\";\nstatic const char kSlashSymbol[] = \"/\";\nstatic const char kQuestionSymbol[] = \"?\";\nstatic const char kApostropheSymbol[] = \"'\";\nstatic const float kSimCertaintyScale = -10.0;  // similarity matcher scaling\nstatic const float kSimCertaintyOffset = -10.0; // similarity matcher offset\nstatic const float kSimilarityFloor = 100.0;    // worst E*L product to stop on\nstatic const int kDocDictMaxRepChars = 4;\n\n// Enum for describing whether the x-height for the word is consistent:\n//  0 - everything is good.\n//  1 - there are one or two secondary (but consistent) baselines\n//      [think subscript and superscript], or there is an oversized\n//      first character.\n//  2 - the word is inconsistent.\nenum XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT };\n\nstruct DawgArgs {\n  DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)\n      : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}\n\n  DawgPositionVector *active_dawgs;\n  DawgPositionVector *updated_dawgs;\n  PermuterType permuter;\n  // True if the current position is a valid word end.\n  bool valid_end;\n};\n\nclass TESS_API Dict {\npublic:\n  Dict(CCUtil *image_ptr);\n  ~Dict();\n  const CCUtil *getCCUtil() const {\n    return ccutil_;\n  }\n  CCUtil *getCCUtil() {\n    return ccutil_;\n  }\n  const UNICHARSET &getUnicharset() const {\n    return getCCUtil()->unicharset;\n  }\n  UNICHARSET &getUnicharset() {\n    return 
getCCUtil()->unicharset;\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  const UnicharAmbigs &getUnicharAmbigs() const {\n    return getCCUtil()->unichar_ambigs;\n  }\n#endif\n  // Returns true if unichar_id is a word compounding character like - or /.\n  inline bool compound_marker(UNICHAR_ID unichar_id) {\n    const UNICHARSET &unicharset = getUnicharset();\n    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));\n    const auto &normed_ids = unicharset.normed_ids(unichar_id);\n    return normed_ids.size() == 1 &&\n           (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);\n  }\n  // Returns true if unichar_id is an apostrophe-like character that may\n  // separate prefix/suffix words from a main body word.\n  inline bool is_apostrophe(UNICHAR_ID unichar_id) {\n    const UNICHARSET &unicharset = getUnicharset();\n    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));\n    const auto &normed_ids = unicharset.normed_ids(unichar_id);\n    return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;\n  }\n\n  /* hyphen.cpp ************************************************************/\n\n  /// Returns true if we've recorded the beginning of a hyphenated word.\n  inline bool hyphenated() const {\n    return !last_word_on_line_ && hyphen_word_;\n  }\n  /// Size of the base word (the part on the line before) of a hyphenated word.\n  inline int hyphen_base_size() const {\n    return this->hyphenated() ? 
hyphen_word_->length() : 0;\n  }\n  /// If this word is hyphenated copy the base word (the part on\n  /// the line before) of a hyphenated word into the given word.\n  /// This function assumes that word is not nullptr.\n  inline void copy_hyphen_info(WERD_CHOICE *word) const {\n    if (this->hyphenated()) {\n      *word = *hyphen_word_;\n      if (hyphen_debug_level) {\n        word->print(\"copy_hyphen_info: \");\n      }\n    }\n  }\n  /// Check whether the word has a hyphen at the end.\n  inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id,\n                             bool first_pos) const {\n    if (!last_word_on_line_ || first_pos) {\n      return false;\n    }\n    ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));\n    const auto &normed_ids = unicharset->normed_ids(unichar_id);\n    return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;\n  }\n  /// Same as above, but check the unichar at the end of the word.\n  inline bool has_hyphen_end(const WERD_CHOICE &word) const {\n    int word_index = word.length() - 1;\n    return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);\n  }\n  /// Unless the previous word was the last one on the line, and the current\n  /// one is not (thus it is the first one on the line), erase hyphen_word_,\n  /// clear hyphen_active_dawgs_, update last_word_on_line_.\n  void reset_hyphen_vars(bool last_word_on_line);\n  /// Update hyphen_word_, and copy the given DawgPositionVectors into\n  /// hyphen_active_dawgs_ .\n  void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs);\n\n  /* permdawg.cpp ************************************************************/\n  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().\n  // When this function is refactored, permdawg.cpp can be removed.\n\n  /// Copies word into best_choice if its rating is smaller\n  /// than that of best_choice.\n  inline void 
update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) {\n    if (word.rating() < best_choice->rating()) {\n      *best_choice = word;\n    }\n  }\n  /// Fill the given active_dawgs vector with dawgs that could contain the\n  /// beginning of the word. If hyphenated() returns true, copy the entries\n  /// from hyphen_active_dawgs_ instead.\n  void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const;\n  // Fill the given vector with the default collection of any-length dawgs\n  void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const;\n\n  /// Recursively explore all the possible character combinations in\n  /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the\n  /// dawgs in the dawgs_ vector in parallel and discard invalid words.\n  ///\n  /// Allocate and return a WERD_CHOICE with the best valid word found.\n  WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                                       float rating_limit);\n  /// If the choice being composed so far could be a dictionary word\n  /// and we have not reached the end of the word keep exploring the\n  /// char_choices further.\n  void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                          int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,\n                          bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,\n                          WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args);\n\n  /// Pointer to go_deeper function.\n  void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                               int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,\n                               bool word_ending, WERD_CHOICE *word, float certainties[],\n                               float *limit, WERD_CHOICE 
*best_choice, int *attempts_left,\n                               void *void_more_args);\n  //\n  // Helper functions for dawg_permute_and_select().\n  //\n  void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,\n                       WERD_CHOICE *word, float certainties[], float *limit,\n                       WERD_CHOICE *best_choice, int *attempts_left, void *more_args);\n\n  void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                      const BLOB_CHOICE &blob_choice, int char_choice_index,\n                      const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,\n                      float certainties[], float *limit, WERD_CHOICE *best_choice,\n                      int *attempts_left, void *more_args);\n\n  bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,\n                           const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,\n                           int word_ending, CHAR_FRAGMENT_INFO *char_frag_info);\n\n  /* stopper.cpp *************************************************************/\n#if !defined(DISABLED_LEGACY_ENGINE)\n  bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable,\n                        MATRIX *ratings);\n#endif // !defined(DISABLED_LEGACY_ENGINE)\n  // Replaces the corresponding wrong ngram in werd_choice with the correct\n  // one. The whole correct n-gram is inserted into the ratings matrix and\n  // the werd_choice: no more fragments!. Rating and certainty of new entries\n  // in matrix and werd_choice are the sum and mean of the wrong ngram\n  // respectively.\n  // E.g. 
for werd_choice mystring'' and ambiguity ''->\": werd_choice becomes\n  // mystring\", with a new entry in the ratings matrix for \".\n  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id,\n                    WERD_CHOICE *werd_choice, MATRIX *ratings);\n\n  /// Returns the length of the shortest alpha run in WordChoice.\n  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;\n  /// Returns true if the certainty of the BestChoice word is within a\n  /// reasonable range of the average certainties for the best choices for\n  /// each character in the segmentation.  This test is used to catch words\n  /// in which one character is much worse than the other characters in the\n  /// word (i.e. false will be returned in that case). The algorithm computes\n  /// the mean and std deviation of the certainties in the word with the worst\n  /// certainty thrown out.\n  int UniformCertainties(const WERD_CHOICE &word);\n  /// Returns true if the given best_choice is good enough to stop.\n  bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency);\n  /// Returns false if the best choice for the current word is questionable\n  /// and should be tried again on the second pass or should be flagged to\n  /// the user.\n  bool AcceptableResult(WERD_RES *word) const;\n#if !defined(DISABLED_LEGACY_ENGINE)\n  void EndDangerousAmbigs();\n#endif // !defined(DISABLED_LEGACY_ENGINE)\n  /// Prints the current choices for this word to stdout.\n  void DebugWordChoices();\n  /// Sets up stopper variables in preparation for the first pass.\n  void SetupStopperPass1();\n  /// Sets up stopper variables in preparation for the second pass.\n  void SetupStopperPass2();\n  /* context.cpp *************************************************************/\n  /// Check a string to see if it matches a set of lexical rules.\n  int case_ok(const WERD_CHOICE &word) const;\n  /// Returns true if the word looks 
like an absolute garbage\n  /// (e.g. image mistakenly recognized as text).\n  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);\n\n  /* dict.cpp ****************************************************************/\n\n  /// Initialize Dict class - load dawgs from [lang].traineddata and\n  /// user-specified wordlist and pattern list.\n  static DawgCache *GlobalDawgCache();\n  // Sets up ready for a Load or LoadLSTM.\n  void SetupForLoad(DawgCache *dawg_cache);\n  // Loads the dawgs needed by Tesseract. Call FinishLoad() after.\n  void Load(const std::string &lang, TessdataManager *data_file);\n  // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.\n  void LoadLSTM(const std::string &lang, TessdataManager *data_file);\n  // Completes the loading process after Load() and/or LoadLSTM().\n  // Returns false if no dictionaries were loaded.\n  bool FinishLoad();\n  void End();\n\n  // Resets the document dictionary analogous to ResetAdaptiveClassifier.\n  void ResetDocumentDictionary() {\n    if (pending_words_ != nullptr) {\n      pending_words_->clear();\n    }\n    if (document_words_ != nullptr) {\n      document_words_->clear();\n    }\n  }\n\n  /**\n   * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light\n   * of the current state the letter at word_index in the given word\n   * is allowed according to at least one of the dawgs in dawgs_,\n   * otherwise returns NO_PERM.\n   *\n   * The state is described by void_dawg_args, which are interpreted as\n   * DawgArgs and contain relevant active dawg positions.\n   * Each entry in the active_dawgs vector contains an index\n   * into the dawgs_ vector and an EDGE_REF that indicates the last edge\n   * followed in the dawg.  
It also may contain a position in the punctuation\n   * dawg which describes surrounding punctuation (see struct DawgPosition).\n   *\n   * Input:\n   * At word_index 0 dawg_args->active_dawgs should contain an entry for each\n   * dawg that may start at the beginning of a word, with punc_ref and edge_ref\n   * initialized to NO_EDGE.  Since the punctuation dawg includes the empty\n   * pattern \" \" (meaning anything without surrounding punctuation), having a\n   * single entry for the punctuation dawg will cover all dawgs reachable\n   * there from -- that includes all number and word dawgs. The only dawg\n   * non-reachable from the punctuation_dawg is the pattern dawg.\n   * If hyphen state needs to be applied, initial dawg_args->active_dawgs can\n   * be copied from the saved hyphen state (maintained by Dict).\n   * For word_index > 0 the corresponding state (active_dawgs and punc position)\n   * can be obtained from dawg_args->updated_dawgs passed to\n   * def_letter_is_okay for word_index-1.\n   * Note: the function assumes that active_dawgs, and updated_dawgs\n   * member variables of dawg_args are not nullptr.\n   *\n   * Output:\n   * The function fills in dawg_args->updated_dawgs vector with the\n   * entries for dawgs that contain the word up to the letter at word_index.\n   *\n   */\n\n  //\n  int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,\n                         bool word_end) const;\n\n  int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset,\n                               UNICHAR_ID unichar_id, bool word_end) const;\n  /// Calls letter_is_okay_ member function.\n  int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,\n                   bool word_end) const {\n    return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);\n  }\n\n  /// Probability in context function used by the ngram permuter.\n  double 
(Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes,\n                                          const char *character, int character_bytes);\n  /// Calls probability_in_context_ member function.\n  double ProbabilityInContext(const char *context, int context_bytes, const char *character,\n                              int character_bytes) {\n    return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes,\n                                            character, character_bytes);\n  }\n\n  /// Default (no-op) implementation of probability in context function.\n  double def_probability_in_context(const char *lang, const char *context, int context_bytes,\n                                    const char *character, int character_bytes) {\n    (void)lang;\n    (void)context;\n    (void)context_bytes;\n    (void)character;\n    (void)character_bytes;\n    return 0.0;\n  }\n\n  inline void SetWildcardID(UNICHAR_ID id) {\n    wildcard_unichar_id_ = id;\n  }\n  inline UNICHAR_ID WildcardID() const {\n    return wildcard_unichar_id_;\n  }\n  /// Return the number of dawgs in the dawgs_ vector.\n  inline int NumDawgs() const {\n    return dawgs_.size();\n  }\n  /// Return i-th dawg pointer recorded in the dawgs_ vector.\n  inline const Dawg *GetDawg(int index) const {\n    return dawgs_[index];\n  }\n  /// Return the points to the punctuation dawg.\n  inline const Dawg *GetPuncDawg() const {\n    return punc_dawg_;\n  }\n  /// Return the points to the unambiguous words dawg.\n  inline const Dawg *GetUnambigDawg() const {\n    return unambig_dawg_;\n  }\n  /// Returns the appropriate next node given the EDGE_REF.\n  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {\n    if (edge_ref == NO_EDGE) {\n      return 0; // beginning to explore the dawg\n    }\n    NODE_REF node = dawg->next_node(edge_ref);\n    if (node == 0) {\n      node = NO_EDGE; // end of word\n    }\n    return node;\n 
 }\n\n  // Given a unichar from a string and a given dawg, return the unichar\n  // we should use to match in that dawg type.  (for example, in the number\n  // dawg, all numbers are transformed to kPatternUnicharId).\n  UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const {\n    if (!dawg) {\n      return ch;\n    }\n    switch (dawg->type()) {\n      case DAWG_TYPE_NUMBER:\n        return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;\n      default:\n        return ch;\n    }\n  }\n\n  /// For each of the character classes of the given unichar_id (and the\n  /// unichar_id itself) finds the corresponding outgoing node or self-loop\n  /// in the given dawg and (after checking that it is valid) records it in\n  /// dawg_args->updated_active_dawgs. Updates current_permuter if any valid\n  /// edges were found.\n  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id,\n                           bool word_end, DawgArgs *dawg_args,\n                           PermuterType *current_permuter) const;\n\n  /// Read/Write/Access special purpose dawgs which contain words\n  /// only of a certain length (used for phrase search for\n  /// non-space-delimited languages).\n\n  /// Check all the DAWGs to see if this word is in any of them.\n  inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {\n    return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM ||\n            perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||\n            (numbers_ok && perm == NUMBER_PERM));\n  }\n  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;\n  int valid_word(const WERD_CHOICE &word) const {\n    return valid_word(word, false); // return NO_PERM for words with digits\n  }\n  int valid_word_or_number(const WERD_CHOICE &word) const {\n    return valid_word(word, true); // return NUMBER_PERM for valid numbers\n  }\n  
/// This function is used by api/tesseract_cube_combiner.cpp\n  int valid_word(const char *string) const {\n    WERD_CHOICE word(string, getUnicharset());\n    return valid_word(word);\n  }\n  // Do the two WERD_CHOICEs form a meaningful bigram?\n  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;\n  /// Returns true if the word contains a valid punctuation pattern.\n  /// Note: Since the domains of punctuation symbols and symblos\n  /// used in numbers are not disjoint, a valid number might contain\n  /// an invalid punctuation pattern (e.g. .99).\n  bool valid_punctuation(const WERD_CHOICE &word);\n  /// Returns true if a good answer is found for the unknown blob rating.\n  int good_choice(const WERD_CHOICE &choice);\n  /// Adds a word found on this document to the document specific dictionary.\n  void add_document_word(const WERD_CHOICE &best_choice);\n  /// Adjusts the rating of the given word.\n  void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,\n                   float additional_adjust, bool modify_rating, bool debug);\n  /// Set wordseg_rating_adjust_factor_ to the given value.\n  inline void SetWordsegRatingAdjustFactor(float f) {\n    wordseg_rating_adjust_factor_ = f;\n  }\n  /// Returns true if the language is space-delimited (not CJ, or T).\n  bool IsSpaceDelimitedLang() const;\n\nprivate:\n  /** Private member variables. */\n  CCUtil *ccutil_;\n  /**\n   * Table that stores ambiguities computed during training\n   * (loaded when NoDangerousAmbigs() is called for the first time).\n   * Each entry i in the table stores a set of amibiguities whose\n   * wrong ngram starts with unichar id i.\n   */\n#ifndef DISABLED_LEGACY_ENGINE\n  UnicharAmbigs *dang_ambigs_table_ = nullptr;\n  /** Same as above, but for ambiguities with replace flag set. */\n  UnicharAmbigs *replace_ambigs_table_ = nullptr;\n#endif\n  /** Additional certainty padding allowed before a word is rejected. 
*/\n  float reject_offset_;\n  // Cached UNICHAR_IDs:\n  UNICHAR_ID wildcard_unichar_id_;   // kDictWildcard.\n  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.\n  UNICHAR_ID question_unichar_id_;   // kQuestionSymbol.\n  UNICHAR_ID slash_unichar_id_;      // kSlashSymbol.\n  UNICHAR_ID hyphen_unichar_id_;     // kHyphenSymbol.\n  // Hyphen-related variables.\n  WERD_CHOICE *hyphen_word_;\n  DawgPositionVector hyphen_active_dawgs_;\n  bool last_word_on_line_;\n  // List of lists of \"equivalent\" UNICHAR_IDs for the purposes of dictionary\n  // matching.  The first member of each list is taken as canonical.  For\n  // example, the first list contains hyphens and dashes with the first symbol\n  // being the ASCII hyphen minus.\n  std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;\n  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.\n  DawgCache *dawg_cache_;\n  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_\n  // Dawgs.\n  DawgVector dawgs_;\n  SuccessorListsVector successors_;\n  Trie *pending_words_;\n  /// The following pointers are only cached for convenience.\n  /// The dawgs will be deleted when dawgs_ vector is destroyed.\n  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if\n  // any of them are present on the best choices list for a word pair.\n  // the bigrams are stored as space-separated words where:\n  // (1) leading and trailing punctuation has been removed from each word and\n  // (2) any digits have been replaced with '?' 
marks.\n  Dawg *bigram_dawg_;\n  // TODO(daria): need to support multiple languages in the future,\n  // so maybe will need to maintain a list of dawgs of each kind.\n  Dawg *freq_dawg_;\n  Dawg *unambig_dawg_;\n  Dawg *punc_dawg_;\n  Trie *document_words_;\n  /// Current segmentation cost adjust factor for word rating.\n  /// See comments in incorporate_segcost.\n  float wordseg_rating_adjust_factor_;\n  // File for recording ambiguities discovered during dictionary search.\n  FILE *output_ambig_words_file_;\n\npublic:\n  /// Variable members.\n  /// These have to be declared and initialized after image_ptr_, which contains\n  /// the pointer to the params vector - the member of its base CCUtil class.\n  STRING_VAR_H(user_words_file);\n  STRING_VAR_H(user_words_suffix);\n  STRING_VAR_H(user_patterns_file);\n  STRING_VAR_H(user_patterns_suffix);\n  BOOL_VAR_H(load_system_dawg);\n  BOOL_VAR_H(load_freq_dawg);\n  BOOL_VAR_H(load_unambig_dawg);\n  BOOL_VAR_H(load_punc_dawg);\n  BOOL_VAR_H(load_number_dawg);\n  BOOL_VAR_H(load_bigram_dawg);\n  double_VAR_H(xheight_penalty_subscripts);\n  double_VAR_H(xheight_penalty_inconsistent);\n  double_VAR_H(segment_penalty_dict_frequent_word);\n  double_VAR_H(segment_penalty_dict_case_ok);\n  double_VAR_H(segment_penalty_dict_case_bad);\n  double_VAR_H(segment_penalty_dict_nonword);\n  double_VAR_H(segment_penalty_garbage);\n  STRING_VAR_H(output_ambig_words_file);\n  INT_VAR_H(dawg_debug_level);\n  INT_VAR_H(hyphen_debug_level);\n  BOOL_VAR_H(use_only_first_uft8_step);\n  double_VAR_H(certainty_scale);\n  double_VAR_H(stopper_nondict_certainty_base);\n  double_VAR_H(stopper_phase2_certainty_rejection_offset);\n  INT_VAR_H(stopper_smallword_size);\n  double_VAR_H(stopper_certainty_per_char);\n  double_VAR_H(stopper_allowable_character_badness);\n  INT_VAR_H(stopper_debug_level);\n  BOOL_VAR_H(stopper_no_acceptable_choices);\n  INT_VAR_H(tessedit_truncate_wordchoice_log);\n  STRING_VAR_H(word_to_debug);\n  
BOOL_VAR_H(segment_nonalphabetic_script);\n  BOOL_VAR_H(save_doc_words);\n  double_VAR_H(doc_dict_pending_threshold);\n  double_VAR_H(doc_dict_certainty_threshold);\n  INT_VAR_H(max_permuter_attempts);\n};\n\n} // namespace tesseract\n\n#endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_\n"
  },
  {
    "path": "src/dict/hyphen.cpp",
    "content": "/******************************************************************************\n * File:         hyphen.cpp  (Formerly hyphen.c)\n * Description:  Functions for maintaining information about hyphenated words.\n * Author:       Mark Seaman, OCR Technology\n * Status:       Reusable Software Component\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#include \"dict.h\"\n\nnamespace tesseract {\n\n// Unless the previous word was the last one on the line, and the current\n// one is not (thus it is the first one on the line), erase hyphen_word_,\n// clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_.\nvoid Dict::reset_hyphen_vars(bool last_word_on_line) {\n  if (!(last_word_on_line_ == true && last_word_on_line == false)) {\n    if (hyphen_word_ != nullptr) {\n      delete hyphen_word_;\n      hyphen_word_ = nullptr;\n      hyphen_active_dawgs_.clear();\n    }\n  }\n  if (hyphen_debug_level) {\n    tprintf(\"reset_hyphen_vars: last_word_on_line %d -> %d\\n\", last_word_on_line_,\n            last_word_on_line);\n  }\n  last_word_on_line_ = last_word_on_line;\n}\n\n// Update hyphen_word_, and copy the given DawgPositionVectors into\n// hyphen_active_dawgs_.\nvoid Dict::set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs) {\n  if (hyphen_word_ == nullptr) {\n 
   hyphen_word_ = new WERD_CHOICE(word.unicharset());\n    hyphen_word_->make_bad();\n  }\n  if (hyphen_word_->rating() > word.rating()) {\n    *hyphen_word_ = word;\n    // Remove the last unichar id as it is a hyphen, and remove\n    // any unichar_string/lengths that are present.\n    hyphen_word_->remove_last_unichar_id();\n    hyphen_active_dawgs_ = active_dawgs;\n  }\n  if (hyphen_debug_level) {\n    hyphen_word_->print(\"set_hyphen_word: \");\n  }\n}\n} // namespace tesseract\n"
  },
  {
    "path": "src/dict/matchdefs.h",
    "content": "/******************************************************************************\n ** Filename:    matchdefs.h\n ** Purpose:     Generic interface definitions for feature matchers.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#ifndef MATCHDEFS_H\n#define MATCHDEFS_H\n\n#include <tesseract/unichar.h>\n\n#include <climits> // INT16_MAX\n#include <cstdint> // int16_t\n\nnamespace tesseract {\n\n/* define the maximum number of classes defined for any matcher\n  and the maximum class id for any matcher. This must be changed\n  if more different classes need to be classified */\n#define MAX_NUM_CLASSES INT16_MAX\n\n/** a CLASS_ID is the ascii character to be associated with a class */\nusing CLASS_ID = UNICHAR_ID;\n#define NO_CLASS (0)\n\n/** a PROTO_ID is the index of a prototype within it's class.  Valid proto\n  id's are 0 to N-1 where N is the number of prototypes that make up the\n  class. */\nusing PROTO_ID = int16_t;\n#define NO_PROTO (-1)\n\n/** FEATURE_ID is the index of a feature within a character description\n  The feature id ranges from 0 to N-1 where N is the number\n  of features in a character description. */\nusing FEATURE_ID = uint8_t;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/dict/permdawg.cpp",
    "content": "/******************************************************************************\n *\n * File:         permdawg.cpp  (Formerly permdawg.c)\n * Description:  Scale word choices by a dictionary\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n\n#include \"dawg.h\"\n#include \"params.h\"\n#include \"stopper.h\"\n#include \"tprintf.h\"\n\n#include <algorithm>\n#include <cctype>\n#include \"dict.h\"\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\nnamespace tesseract {\n\n/**\n * @name go_deeper_dawg_fxn\n *\n * If the choice being composed so far could be a dictionary word\n * keep exploring choices.\n */\nvoid Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                              int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,\n                              bool word_ending, WERD_CHOICE *word, float certainties[],\n                              float *limit, WERD_CHOICE *best_choice, 
int *attempts_left,\n                              void *void_more_args) {\n  auto *more_args = static_cast<DawgArgs *>(void_more_args);\n  word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);\n  int word_index = word->length() - 1;\n  if (best_choice->rating() < *limit) {\n    return;\n  }\n  // Look up char in DAWG\n\n  // If the current unichar is an ngram first try calling\n  // letter_is_okay() for each unigram it contains separately.\n  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);\n  bool checked_unigrams = false;\n  if (getUnicharset().get_isngram(orig_uch_id)) {\n    if (dawg_debug_level) {\n      tprintf(\"checking unigrams in an ngram %s\\n\", getUnicharset().debug_str(orig_uch_id).c_str());\n    }\n    int num_unigrams = 0;\n    word->remove_last_unichar_id();\n    std::vector<UNICHAR_ID> encoding;\n    const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);\n    // Since the string came out of the unicharset, failure is impossible.\n    ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));\n    bool unigrams_ok = true;\n    // Construct DawgArgs that reflect the current state.\n    DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);\n    DawgPositionVector unigram_updated_dawgs;\n    DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);\n    // Check unigrams in the ngram with letter_is_okay().\n    for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {\n      UNICHAR_ID uch_id = encoding[i];\n      ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);\n      ++num_unigrams;\n      word->append_unichar_id(uch_id, 1, 0.0, 0.0);\n      unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),\n                                             word->unichar_id(word_index + num_unigrams - 1),\n                                             word_ending && i == encoding.size() - 1);\n      
(*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);\n      if (dawg_debug_level) {\n        tprintf(\"unigram %s is %s\\n\", getUnicharset().debug_str(uch_id).c_str(),\n                unigrams_ok ? \"OK\" : \"not OK\");\n      }\n    }\n    // Restore the word and copy the updated dawg state if needed.\n    while (num_unigrams-- > 0) {\n      word->remove_last_unichar_id();\n    }\n    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);\n    if (unigrams_ok) {\n      checked_unigrams = true;\n      more_args->permuter = unigram_dawg_args.permuter;\n      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);\n    }\n  }\n\n  // Check which dawgs from the dawgs_ vector contain the word\n  // up to and including the current unichar.\n  if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),\n                                                   word->unichar_id(word_index), word_ending)) {\n    // Add a new word choice\n    if (word_ending) {\n      if (dawg_debug_level) {\n        tprintf(\"found word = %s\\n\", word->debug_string().c_str());\n      }\n      if (strcmp(output_ambig_words_file.c_str(), \"\") != 0) {\n        if (output_ambig_words_file_ == nullptr) {\n          output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), \"wb+\");\n          if (output_ambig_words_file_ == nullptr) {\n            tprintf(\"Failed to open output_ambig_words_file %s\\n\", output_ambig_words_file.c_str());\n            exit(1);\n          }\n          std::string word_str;\n          word->string_and_lengths(&word_str, nullptr);\n          word_str += \" \";\n          fprintf(output_ambig_words_file_, \"%s\", word_str.c_str());\n        }\n        std::string word_str;\n        word->string_and_lengths(&word_str, nullptr);\n        word_str += \" \";\n        fprintf(output_ambig_words_file_, \"%s\", word_str.c_str());\n      }\n      WERD_CHOICE *adjusted_word = word;\n      
adjusted_word->set_permuter(more_args->permuter);\n      update_best_choice(*adjusted_word, best_choice);\n    } else { // search the next letter\n      // Make updated_* point to the next entries in the DawgPositionVector\n      // arrays (that were originally created in dawg_permute_and_select)\n      ++(more_args->updated_dawgs);\n      // Make active_dawgs and constraints point to the updated ones.\n      ++(more_args->active_dawgs);\n      permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,\n                      certainties, limit, best_choice, attempts_left, more_args);\n      // Restore previous state to explore another letter in this position.\n      --(more_args->updated_dawgs);\n      --(more_args->active_dawgs);\n    }\n  } else {\n    if (dawg_debug_level) {\n      tprintf(\"last unichar not OK at index %d in %s\\n\", word_index, word->debug_string().c_str());\n    }\n  }\n}\n\n/**\n * dawg_permute_and_select\n *\n * Recursively explore all the possible character combinations in\n * the given char_choices. 
Use go_deeper_dawg_fxn() to search all the\n * dawgs in the dawgs_ vector in parallel and discard invalid words.\n *\n * Allocate and return a WERD_CHOICE with the best valid word found.\n */\nWERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                                           float rating_limit) {\n  auto *best_choice = new WERD_CHOICE(&getUnicharset());\n  best_choice->make_bad();\n  best_choice->set_rating(rating_limit);\n  if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {\n    return best_choice;\n  }\n  auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];\n  init_active_dawgs(&(active_dawgs[0]), true);\n  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);\n  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);\n\n  float certainties[MAX_WERD_LENGTH];\n  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;\n  int attempts_left = max_permuter_attempts;\n  permute_choices((dawg_debug_level) ? 
\"permute_dawg_debug\" : nullptr, char_choices, 0, nullptr,\n                  &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);\n  delete[] active_dawgs;\n  return best_choice;\n}\n\n/**\n * permute_choices\n *\n * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST\n * with the given char_choice_index in char_choices.\n */\nvoid Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                           int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,\n                           WERD_CHOICE *word, float certainties[], float *limit,\n                           WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {\n  if (debug) {\n    tprintf(\n        \"%s permute_choices: char_choice_index=%d\"\n        \" limit=%g rating=%g, certainty=%g word=%s\\n\",\n        debug, char_choice_index, *limit, word->rating(), word->certainty(),\n        word->debug_string().c_str());\n  }\n  if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {\n    BLOB_CHOICE_IT blob_choice_it;\n    blob_choice_it.set_to_list(char_choices.at(char_choice_index));\n    for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {\n      (*attempts_left)--;\n      append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,\n                     prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,\n                     more_args);\n      if (*attempts_left <= 0) {\n        if (debug) {\n          tprintf(\"permute_choices(): attempts_left is 0\\n\");\n        }\n        break;\n      }\n    }\n  }\n}\n\n/**\n * append_choices\n *\n * Checks to see whether or not the next choice is worth appending to\n * the word being generated. 
If so then keeps going deeper into the word.\n *\n * This function assumes that Dict::go_deeper_fxn_ is set.\n */\nvoid Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,\n                          const BLOB_CHOICE &blob_choice, int char_choice_index,\n                          const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,\n                          float certainties[], float *limit, WERD_CHOICE *best_choice,\n                          int *attempts_left, void *more_args) {\n  auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);\n\n  // Deal with fragments.\n  CHAR_FRAGMENT_INFO char_frag_info;\n  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),\n                           prev_char_frag_info, debug, word_ending, &char_frag_info)) {\n    return; // blob_choice must be an invalid fragment\n  }\n  // Search the next letter if this character is a fragment.\n  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {\n    permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,\n                    limit, best_choice, attempts_left, more_args);\n    return;\n  }\n\n  // Add the next unichar.\n  float old_rating = word->rating();\n  float old_certainty = word->certainty();\n  uint8_t old_permuter = word->permuter();\n  certainties[word->length()] = char_frag_info.certainty;\n  word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,\n                                          char_frag_info.rating, char_frag_info.certainty);\n\n  // Explore the next unichar.\n  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,\n                          word, certainties, limit, best_choice, attempts_left, more_args);\n\n  // Remove the unichar we added to explore other choices in its place.\n  word->remove_last_unichar_id();\n  
word->set_rating(old_rating);\n  word->set_certainty(old_certainty);\n  word->set_permuter(old_permuter);\n}\n\n/**\n * @name fragment_state\n *\n * Given the current char choice and information about previously seen\n * fragments, determines whether adjacent character fragments are\n * present and whether they can be concatenated.\n *\n * The given prev_char_frag_info contains:\n * - fragment: if not nullptr contains information about immediately\n *   preceding fragmented character choice\n * - num_fragments: number of fragments that have been used so far\n *   to construct a character\n * - certainty: certainty of the current choice or minimum\n *   certainty of all fragments concatenated so far\n * - rating: rating of the current choice or sum of fragment\n *   ratings concatenated so far\n *\n * The output char_frag_info is filled in as follows:\n * - character: is set to be nullptr if the choice is a non-matching\n *   or non-ending fragment piece; is set to unichar of the given choice\n *   if it represents a regular character or a matching ending fragment\n * - fragment,num_fragments,certainty,rating are set as described above\n *\n * @returns false if a non-matching fragment is discovered, true otherwise.\n */\nbool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,\n                               const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,\n                               int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {\n  const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);\n  const CHAR_FRAGMENT *prev_fragment =\n      prev_char_frag_info != nullptr ? 
prev_char_frag_info->fragment : nullptr;\n\n  // Print debug info for fragments.\n  if (debug && (prev_fragment || this_fragment)) {\n    tprintf(\"%s check fragments: choice=%s word_ending=%d\\n\", debug,\n            getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);\n    if (prev_fragment) {\n      tprintf(\"prev_fragment %s\\n\", prev_fragment->to_string().c_str());\n    }\n    if (this_fragment) {\n      tprintf(\"this_fragment %s\\n\", this_fragment->to_string().c_str());\n    }\n  }\n\n  char_frag_info->unichar_id = curr_unichar_id;\n  char_frag_info->fragment = this_fragment;\n  char_frag_info->rating = curr_rating;\n  char_frag_info->certainty = curr_certainty;\n  char_frag_info->num_fragments = 1;\n  if (prev_fragment && !this_fragment) {\n    if (debug) {\n      tprintf(\"Skip choice with incomplete fragment\\n\");\n    }\n    return false;\n  }\n  if (this_fragment) {\n    // We are dealing with a fragment.\n    char_frag_info->unichar_id = INVALID_UNICHAR_ID;\n    if (prev_fragment) {\n      if (!this_fragment->is_continuation_of(prev_fragment)) {\n        if (debug) {\n          tprintf(\"Non-matching fragment piece\\n\");\n        }\n        return false;\n      }\n      if (this_fragment->is_ending()) {\n        char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());\n        char_frag_info->fragment = nullptr;\n        if (debug) {\n          tprintf(\"Built character %s from fragments\\n\",\n                  getUnicharset().debug_str(char_frag_info->unichar_id).c_str());\n        }\n      } else {\n        if (debug) {\n          tprintf(\"Record fragment continuation\\n\");\n        }\n        char_frag_info->fragment = this_fragment;\n      }\n      // Update certainty and rating.\n      char_frag_info->rating = prev_char_frag_info->rating + curr_rating;\n      char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;\n      char_frag_info->certainty = std::min(curr_certainty, 
prev_char_frag_info->certainty);\n    } else {\n      if (this_fragment->is_beginning()) {\n        if (debug) {\n          tprintf(\"Record fragment beginning\\n\");\n        }\n      } else {\n        if (debug) {\n          tprintf(\"Non-starting fragment piece with no prev_fragment\\n\");\n        }\n        return false;\n      }\n    }\n  }\n  if (word_ending && char_frag_info->fragment) {\n    if (debug) {\n      tprintf(\"Word cannot end with a fragment\\n\");\n    }\n    return false;\n  }\n  return true;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/dict/stopper.cpp",
    "content": "/******************************************************************************\n ** Filename:    stopper.c\n ** Purpose:     Stopping criteria for word classifier.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n#include <cctype>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n\n#include \"stopper.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"ambigs.h\"\n#endif\n#include <tesseract/unichar.h>\n#include \"ccutil.h\"\n#include \"dict.h\"\n#include \"helpers.h\"\n#include \"matchdefs.h\"\n#include \"pageres.h\"\n#include \"params.h\"\n#include \"ratngs.h\"\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\n\nnamespace tesseract {\n\nbool Dict::AcceptableChoice(const WERD_CHOICE &best_choice,\n                            XHeightConsistencyEnum xheight_consistency) {\n  float CertaintyThreshold = stopper_nondict_certainty_base;\n  int WordSize;\n\n  if (stopper_no_acceptable_choices) {\n    return false;\n  }\n\n  if (best_choice.empty()) {\n    return false;\n  }\n\n  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();\n  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);\n  bool is_case_ok = 
case_ok(best_choice);\n\n  if (stopper_debug_level >= 1) {\n    const char *xht = \"UNKNOWN\";\n    switch (xheight_consistency) {\n      case XH_GOOD:\n        xht = \"NORMAL\";\n        break;\n      case XH_SUBNORMAL:\n        xht = \"SUBNORMAL\";\n        break;\n      case XH_INCONSISTENT:\n        xht = \"INCONSISTENT\";\n        break;\n      default:\n        xht = \"UNKNOWN\";\n    }\n    tprintf(\"\\nStopper:  %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\\n\",\n            best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'),\n            (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height());\n  }\n  // Do not accept invalid words in PASS1.\n  if (reject_offset_ <= 0.0f && !is_valid_word) {\n    return false;\n  }\n  if (is_valid_word && is_case_ok) {\n    WordSize = LengthOfShortestAlphaRun(best_choice);\n    WordSize -= stopper_smallword_size;\n    if (WordSize < 0) {\n      WordSize = 0;\n    }\n    CertaintyThreshold += WordSize * stopper_certainty_per_char;\n  }\n\n  if (stopper_debug_level >= 1) {\n    tprintf(\"Stopper:  Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\\n\",\n            best_choice.rating(), best_choice.certainty(), CertaintyThreshold);\n  }\n\n  if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold &&\n      xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) {\n    return true;\n  } else {\n    if (stopper_debug_level >= 1) {\n      tprintf(\n          \"AcceptableChoice() returned false\"\n          \" (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\\n\",\n          no_dang_ambigs, best_choice.certainty(), CertaintyThreshold,\n          UniformCertainties(best_choice));\n    }\n    return false;\n  }\n}\n\nbool Dict::AcceptableResult(WERD_RES *word) const {\n  if (word->best_choice == nullptr) {\n    return false;\n  }\n  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;\n  int WordSize;\n\n  if (stopper_debug_level >= 
1) {\n    tprintf(\"\\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\\n\",\n            word->best_choice->debug_string().c_str(), (valid_word(*word->best_choice) ? 'y' : 'n'),\n            (case_ok(*word->best_choice) ? 'y' : 'n'),\n            word->best_choice->dangerous_ambig_found() ? 'n' : 'y',\n            word->best_choices.singleton() ? 'n' : 'y');\n  }\n\n  if (word->best_choice->empty() || !word->best_choices.singleton()) {\n    return false;\n  }\n  if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {\n    WordSize = LengthOfShortestAlphaRun(*word->best_choice);\n    WordSize -= stopper_smallword_size;\n    if (WordSize < 0) {\n      WordSize = 0;\n    }\n    CertaintyThreshold += WordSize * stopper_certainty_per_char;\n  }\n\n  if (stopper_debug_level >= 1) {\n    tprintf(\"Rejecter: Certainty = %4.1f, Threshold = %4.1f   \", word->best_choice->certainty(),\n            CertaintyThreshold);\n  }\n\n  if (word->best_choice->certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) {\n    if (stopper_debug_level >= 1) {\n      tprintf(\"ACCEPTED\\n\");\n    }\n    return true;\n  } else {\n    if (stopper_debug_level >= 1) {\n      tprintf(\"REJECTED\\n\");\n    }\n    return false;\n  }\n}\n\n#if !defined(DISABLED_LEGACY_ENGINE)\n\nbool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_replaceable,\n                            MATRIX *ratings) {\n  if (stopper_debug_level > 2) {\n    tprintf(\"\\nRunning NoDangerousAmbig() for %s\\n\", best_choice->debug_string().c_str());\n  }\n\n  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities\n  // for each unichar id in BestChoice.\n  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;\n  bool ambigs_found = false;\n  // For each position in best_choice:\n  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]\n  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]\n  // -- look for ambiguities corresponding to 
wrong_ngram in the list while\n  //    adding the following unichar_ids from best_choice to wrong_ngram\n  //\n  // Repeat the above procedure twice: first time look through\n  // ambigs to be replaced and replace all the ambiguities found;\n  // second time look through dangerous ambiguities and construct\n  // ambig_blob_choices with a fake blob choice for each ambiguity\n  // and pass them to dawg_permute_and_select() to search for\n  // ambiguous words in the dictionaries.\n  //\n  // Note that during the execution of the for loop (on the first pass)\n  // if replacements are made the length of best_choice might change.\n  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {\n    bool replace = (fix_replaceable && pass == 0);\n    const UnicharAmbigsVector &table =\n        replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();\n    if (!replace) {\n      // Initialize ambig_blob_choices with lists containing a single\n      // unichar id for the corresponding position in best_choice.\n      // best_choice consisting of only the original letters will\n      // have a rating of 0.0.\n      for (unsigned i = 0; i < best_choice->length(); ++i) {\n        auto *lst = new BLOB_CHOICE_LIST();\n        BLOB_CHOICE_IT lst_it(lst);\n        // TODO(rays/antonova) Put real xheights and y shifts here.\n        lst_it.add_to_end(\n            new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));\n        ambig_blob_choices.push_back(lst);\n      }\n    }\n    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];\n    int wrong_ngram_index;\n    int blob_index = 0;\n    for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {\n      auto curr_unichar_id = best_choice->unichar_id(i);\n      if (stopper_debug_level > 2) {\n        tprintf(\"Looking for %s ngrams starting with %s:\\n\", replace ? 
\"replaceable\" : \"ambiguous\",\n                getUnicharset().debug_str(curr_unichar_id).c_str());\n      }\n      int num_wrong_blobs = best_choice->state(i);\n      wrong_ngram_index = 0;\n      wrong_ngram[wrong_ngram_index] = curr_unichar_id;\n      if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() ||\n          table[curr_unichar_id] == nullptr) {\n        continue; // there is no ambig spec for this unichar id\n      }\n      AmbigSpec_IT spec_it(table[curr_unichar_id]);\n      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {\n        const AmbigSpec *ambig_spec = spec_it.data();\n        wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;\n        int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram);\n        if (stopper_debug_level > 2) {\n          tprintf(\"candidate ngram: \");\n          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());\n          tprintf(\"current ngram from spec: \");\n          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());\n          tprintf(\"comparison result: %d\\n\", compare);\n        }\n        if (compare == 0) {\n          // Record the place where we found an ambiguity.\n          if (fixpt != nullptr) {\n            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];\n            fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace,\n                                          getUnicharset().get_isngram(ambig_spec->correct_ngram_id),\n                                          leftmost_id));\n            if (stopper_debug_level > 1) {\n              tprintf(\"fixpt+=(%d %d %d %d %s)\\n\", blob_index, blob_index + num_wrong_blobs, false,\n                      getUnicharset().get_isngram(ambig_spec->correct_ngram_id),\n                      getUnicharset().id_to_unichar(leftmost_id));\n            }\n          }\n\n          if (replace) {\n            if 
(stopper_debug_level > 2) {\n              tprintf(\"replace ambiguity with %s : \",\n                      getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id));\n              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());\n            }\n            ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice,\n                         ratings);\n          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {\n            // We found dang ambig - update ambig_blob_choices.\n            if (stopper_debug_level > 2) {\n              tprintf(\"found ambiguity: \");\n              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());\n            }\n            ambigs_found = true;\n            for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) {\n              // Add a blob choice for the corresponding fragment of the\n              // ambiguity. These fake blob choices are initialized with\n              // negative ratings (which are not possible for real blob\n              // choices), so that dawg_permute_and_select() considers any\n              // word not consisting of only the original letters a better\n              // choice and stops searching for alternatives once such a\n              // choice is found.\n              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]);\n              bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,\n                                               -1, 0, 1, 0, BCC_AMBIG));\n            }\n          }\n          spec_it.forward();\n        } else if (compare == -1) {\n          unsigned next_index;\n          if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&\n              ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) {\n            // Add the next unichar id to wrong_ngram and keep looking for\n            // more ambigs starting with 
curr_unichar_id in AMBIG_SPEC_LIST.\n            wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index);\n            num_wrong_blobs += best_choice->state(next_index);\n          } else {\n            break; // no more matching ambigs in this AMBIG_SPEC_LIST\n          }\n        } else {\n          spec_it.forward();\n        }\n      } // end searching AmbigSpec_LIST\n    }   // end searching best_choice\n  }     // end searching replace and dangerous ambigs\n\n  // If any ambiguities were found permute the constructed ambig_blob_choices\n  // to see if an alternative dictionary word can be found.\n  if (ambigs_found) {\n    if (stopper_debug_level > 2) {\n      tprintf(\"\\nResulting ambig_blob_choices:\\n\");\n      for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) {\n        print_ratings_list(\"\", ambig_blob_choices.at(i), getUnicharset());\n        tprintf(\"\\n\");\n      }\n    }\n    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);\n    ambigs_found = (alt_word->rating() < 0.0);\n    if (ambigs_found) {\n      if (stopper_debug_level >= 1) {\n        tprintf(\"Stopper: Possible ambiguous word = %s\\n\", alt_word->debug_string().c_str());\n      }\n      if (fixpt != nullptr) {\n        // Note: Currently character choices combined from fragments can only\n        // be generated by NoDangerousAmbigs(). 
This code should be updated if\n        // the capability to produce classifications combined from character\n        // fragments is added to other functions.\n        int orig_i = 0;\n        for (unsigned i = 0; i < alt_word->length(); ++i) {\n          const UNICHARSET &uchset = getUnicharset();\n          bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i));\n          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);\n          if (replacement_is_ngram) {\n            // we have to extract the leftmost unichar from the ngram.\n            const char *str = uchset.id_to_unichar(leftmost_id);\n            int step = uchset.step(str);\n            if (step) {\n              leftmost_id = uchset.unichar_to_id(str, step);\n            }\n          }\n          int end_i = orig_i + alt_word->state(i);\n          if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) {\n            // Compute proper blob indices.\n            int blob_start = 0;\n            for (int j = 0; j < orig_i; ++j) {\n              blob_start += best_choice->state(j);\n            }\n            int blob_end = blob_start;\n            for (int j = orig_i; j < end_i; ++j) {\n              blob_end += best_choice->state(j);\n            }\n            fixpt->push_back(\n                DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id));\n            if (stopper_debug_level > 1) {\n              tprintf(\"fixpt->dangerous+=(%d %d %d %d %s)\\n\", orig_i, end_i, true,\n                      replacement_is_ngram, uchset.id_to_unichar(leftmost_id));\n            }\n          }\n          orig_i += alt_word->state(i);\n        }\n      }\n    }\n    delete alt_word;\n  }\n  if (output_ambig_words_file_ != nullptr) {\n    fprintf(output_ambig_words_file_, \"\\n\");\n  }\n\n  for (auto data : ambig_blob_choices) {\n    delete data;\n  }\n  return !ambigs_found;\n}\n\nvoid Dict::EndDangerousAmbigs() {}\n\n#endif // 
!defined(DISABLED_LEGACY_ENGINE)\n\nvoid Dict::SetupStopperPass1() {\n  reject_offset_ = 0.0;\n}\n\nvoid Dict::SetupStopperPass2() {\n  reject_offset_ = stopper_phase2_certainty_rejection_offset;\n}\n\nvoid Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,\n                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) {\n  int num_blobs_to_replace = 0;\n  int begin_blob_index = 0;\n  int i;\n  // Rating and certainty for the new BLOB_CHOICE are derived from the\n  // replaced choices.\n  float new_rating = 0.0f;\n  float new_certainty = 0.0f;\n  BLOB_CHOICE *old_choice = nullptr;\n  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {\n    if (i >= wrong_ngram_begin_index) {\n      int num_blobs = werd_choice->state(i);\n      int col = begin_blob_index + num_blobs_to_replace;\n      int row = col + num_blobs - 1;\n      BLOB_CHOICE_LIST *choices = ratings->get(col, row);\n      ASSERT_HOST(choices != nullptr);\n      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);\n      ASSERT_HOST(old_choice != nullptr);\n      new_rating += old_choice->rating();\n      new_certainty += old_choice->certainty();\n      num_blobs_to_replace += num_blobs;\n    } else {\n      begin_blob_index += werd_choice->state(i);\n    }\n  }\n  new_certainty /= wrong_ngram_size;\n  // If there is no entry in the ratings matrix, add it.\n  MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1);\n  if (!coord.Valid(*ratings)) {\n    ratings->IncreaseBandSize(coord.row - coord.col + 1);\n  }\n  if (ratings->get(coord.col, coord.row) == nullptr) {\n    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);\n  }\n  BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row);\n  BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices);\n  if (choice != nullptr) {\n    // Already there. 
Upgrade if new rating better.\n    if (new_rating < choice->rating()) {\n      choice->set_rating(new_rating);\n    }\n    if (new_certainty < choice->certainty()) {\n      choice->set_certainty(new_certainty);\n    }\n    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.\n  } else {\n    // Need a new choice with the correct_ngram_id.\n    choice = new BLOB_CHOICE(*old_choice);\n    choice->set_unichar_id(correct_ngram_id);\n    choice->set_rating(new_rating);\n    choice->set_certainty(new_certainty);\n    choice->set_classifier(BCC_AMBIG);\n    choice->set_matrix_cell(coord.col, coord.row);\n    BLOB_CHOICE_IT it(new_choices);\n    it.add_to_end(choice);\n  }\n  // Remove current unichar from werd_choice. On the last iteration\n  // set the correct replacement unichar instead of removing a unichar.\n  for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) {\n    if (replaced_count + 1 == wrong_ngram_size) {\n      werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice);\n    } else {\n      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);\n    }\n  }\n  if (stopper_debug_level >= 1) {\n    werd_choice->print(\"ReplaceAmbig() \");\n    tprintf(\"Modified blob_choices: \");\n    print_ratings_list(\"\\n\", new_choices, getUnicharset());\n  }\n}\n\nint Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {\n  int shortest = INT32_MAX;\n  int curr_len = 0;\n  for (unsigned w = 0; w < WordChoice.length(); ++w) {\n    if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {\n      curr_len++;\n    } else if (curr_len > 0) {\n      if (curr_len < shortest) {\n        shortest = curr_len;\n      }\n      curr_len = 0;\n    }\n  }\n  if (curr_len > 0 && curr_len < shortest) {\n    shortest = curr_len;\n  } else if (shortest == INT32_MAX) {\n    shortest = 0;\n  }\n  return shortest;\n}\n\nint Dict::UniformCertainties(const WERD_CHOICE &word) {\n  
float Certainty;\n  float WorstCertainty = FLT_MAX;\n  float CertaintyThreshold;\n  double TotalCertainty;\n  double TotalCertaintySquared;\n  double Variance;\n  float Mean, StdDev;\n  int word_length = word.length();\n\n  if (word_length < 3) {\n    return true;\n  }\n\n  TotalCertainty = TotalCertaintySquared = 0.0;\n  for (int i = 0; i < word_length; ++i) {\n    Certainty = word.certainty(i);\n    TotalCertainty += Certainty;\n    TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;\n    if (Certainty < WorstCertainty) {\n      WorstCertainty = Certainty;\n    }\n  }\n\n  // Subtract off worst certainty from statistics.\n  word_length--;\n  TotalCertainty -= WorstCertainty;\n  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;\n\n  Mean = TotalCertainty / word_length;\n  Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) /\n              (word_length * (word_length - 1)));\n  if (Variance < 0.0) {\n    Variance = 0.0;\n  }\n  StdDev = sqrt(Variance);\n\n  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;\n  if (CertaintyThreshold > stopper_nondict_certainty_base) {\n    CertaintyThreshold = stopper_nondict_certainty_base;\n  }\n\n  if (word.certainty() < CertaintyThreshold) {\n    if (stopper_debug_level >= 1) {\n      tprintf(\n          \"Stopper: Non-uniform certainty = %4.1f\"\n          \" (m=%4.1f, s=%4.1f, t=%4.1f)\\n\",\n          word.certainty(), Mean, StdDev, CertaintyThreshold);\n    }\n    return false;\n  } else {\n    return true;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/dict/stopper.h",
    "content": "/******************************************************************************\n ** Filename:    stopper.h\n ** Purpose:     Stopping criteria for word classifier.\n ** Author:      Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n#ifndef STOPPER_H\n#define STOPPER_H\n\n#include \"params.h\"\n#include \"ratngs.h\"\n\n#include <tesseract/unichar.h>\n\nnamespace tesseract {\n\nclass WERD_CHOICE;\n\nusing BLOB_WIDTH = uint8_t;\n\nstruct DANGERR_INFO {\n  DANGERR_INFO()\n      : begin(-1)\n      , end(-1)\n      , dangerous(false)\n      , correct_is_ngram(false)\n      , leftmost(INVALID_UNICHAR_ID) {}\n  DANGERR_INFO(int b, int e, bool d, bool n, UNICHAR_ID l)\n      : begin(b), end(e), dangerous(d), correct_is_ngram(n), leftmost(l) {}\n  int begin;\n  int end;\n  bool dangerous;\n  bool correct_is_ngram;\n  UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?\n};\n\nusing DANGERR = std::vector<DANGERR_INFO>;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/dict/trie.cpp",
    "content": "/******************************************************************************\n *\n * File:         trie.cpp  (Formerly trie.c)\n * Description:  Functions to build a trie data structure.\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n\n#include \"trie.h\"\n\n#include \"dawg.h\"\n#include \"dict.h\"\n#include \"helpers.h\"\n#include \"kdpair.h\"\n\nnamespace tesseract {\n\nconst char kDoNotReverse[] = \"RRP_DO_NO_REVERSE\";\nconst char kReverseIfHasRTL[] = \"RRP_REVERSE_IF_HAS_RTL\";\nconst char kForceReverse[] = \"RRP_FORCE_REVERSE\";\n\nconst char *const RTLReversePolicyNames[] = {kDoNotReverse, kReverseIfHasRTL, kForceReverse};\n\nconst char Trie::kAlphaPatternUnicode[] = \"\\u2000\";\nconst char Trie::kDigitPatternUnicode[] = \"\\u2001\";\nconst char Trie::kAlphanumPatternUnicode[] = \"\\u2002\";\nconst char Trie::kPuncPatternUnicode[] = \"\\u2003\";\nconst char Trie::kLowerPatternUnicode[] = \"\\u2004\";\nconst char Trie::kUpperPatternUnicode[] = \"\\u2005\";\n\nconst char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {\n  return 
RTLReversePolicyNames[reverse_policy];\n}\n\n// Reset the Trie to empty.\nvoid Trie::clear() {\n  for (auto node : nodes_) {\n    delete node;\n  }\n  nodes_.clear();\n  root_back_freelist_.clear();\n  num_edges_ = 0;\n  new_dawg_node(); // Need to allocate node 0.\n}\n\nbool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,\n                        UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr,\n                        EDGE_INDEX *edge_index) const {\n  if (debug_level_ == 3) {\n    tprintf(\"edge_char_of() given node_ref \" REFFORMAT \" next_node \" REFFORMAT\n            \" direction %d word_end %d unichar_id %d, exploring node:\\n\",\n            node_ref, next_node, direction, word_end, unichar_id);\n    if (node_ref != NO_EDGE) {\n      print_node(node_ref, nodes_[node_ref]->forward_edges.size());\n    }\n  }\n  if (node_ref == NO_EDGE) {\n    return false;\n  }\n  assert(static_cast<size_t>(node_ref) < nodes_.size());\n  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? 
nodes_[node_ref]->forward_edges\n                                                 : nodes_[node_ref]->backward_edges;\n  int vec_size = vec.size();\n  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search\n    EDGE_INDEX start = 0;\n    EDGE_INDEX end = vec_size - 1;\n    EDGE_INDEX k;\n    int compare;\n    while (start <= end) {\n      k = (start + end) >> 1; // (start + end) / 2\n      compare = given_greater_than_edge_rec(next_node, word_end, unichar_id, vec[k]);\n      if (compare == 0) { // given == vec[k]\n        *edge_ptr = &(vec[k]);\n        *edge_index = k;\n        return true;\n      } else if (compare == 1) { // given > vec[k]\n        start = k + 1;\n      } else { // given < vec[k]\n        end = k - 1;\n      }\n    }\n  } else { // linear search\n    for (int i = 0; i < vec_size; ++i) {\n      EDGE_RECORD &edge_rec = vec[i];\n      if (edge_rec_match(next_node, word_end, unichar_id, next_node_from_edge_rec(edge_rec),\n                         end_of_word_from_edge_rec(edge_rec), unichar_id_from_edge_rec(edge_rec))) {\n        *edge_ptr = &(edge_rec);\n        *edge_index = i;\n        return true;\n      }\n    }\n  }\n  return false; // not found\n}\n\nbool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, int direction,\n                            bool word_end, UNICHAR_ID unichar_id) {\n  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? 
&(nodes_[node1]->forward_edges)\n                                                 : &(nodes_[node1]->backward_edges);\n  unsigned search_index;\n  if (node1 == 0 && direction == FORWARD_EDGE) {\n    search_index = 0; // find the index to make the add sorted\n    while (search_index < vec->size() &&\n           given_greater_than_edge_rec(node2, word_end, unichar_id, (*vec)[search_index]) == 1) {\n      search_index++;\n    }\n  } else {\n    search_index = vec->size(); // add is unsorted, so index does not matter\n  }\n  EDGE_RECORD edge_rec;\n  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);\n  if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {\n    EDGE_INDEX edge_index = root_back_freelist_.back();\n    root_back_freelist_.pop_back();\n    (*vec)[edge_index] = edge_rec;\n  } else if (search_index < vec->size()) {\n    vec->insert(vec->begin() + search_index, edge_rec);\n  } else {\n    vec->push_back(edge_rec);\n  }\n  if (debug_level_ > 1) {\n    tprintf(\"new edge in nodes_[\" REFFORMAT \"]: \", node1);\n    print_edge_rec(edge_rec);\n    tprintf(\"\\n\");\n  }\n  num_edges_++;\n  return true;\n}\n\nvoid Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool marker_flag,\n                           UNICHAR_ID unichar_id) {\n  EDGE_RECORD *back_edge_ptr;\n  EDGE_INDEX back_edge_index;\n  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, unichar_id, &back_edge_ptr,\n                           &back_edge_index));\n  if (marker_flag) {\n    *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);\n    *edge_ptr |= (MARKER_FLAG << flag_start_bit_);\n  }\n  // Mark both directions as end of word.\n  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);\n  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);\n}\n\nbool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {\n  if (word.length() <= 0) {\n    return false; // can't add empty words\n  
}\n  if (repetitions != nullptr) {\n    ASSERT_HOST(repetitions->size() == word.length());\n  }\n  // Make sure the word does not contain invalid unchar ids.\n  for (unsigned i = 0; i < word.length(); ++i) {\n    if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {\n      return false;\n    }\n  }\n\n  EDGE_RECORD *edge_ptr;\n  NODE_REF last_node = 0;\n  NODE_REF the_next_node;\n  bool marker_flag = false;\n  EDGE_INDEX edge_index;\n  int32_t still_finding_chars = true;\n  int32_t word_end = false;\n  bool add_failed = false;\n  bool found;\n\n  if (debug_level_ > 1) {\n    word.print(\"\\nAdding word: \");\n  }\n\n  UNICHAR_ID unichar_id;\n  unsigned i;\n  for (i = 0; i < word.length() - 1; ++i) {\n    unichar_id = word.unichar_id(i);\n    marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;\n    if (debug_level_ > 1) {\n      tprintf(\"Adding letter %d\\n\", unichar_id);\n    }\n    if (still_finding_chars) {\n      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,\n                           &edge_index);\n      if (found && debug_level_ > 1) {\n        tprintf(\"exploring edge \" REFFORMAT \" in node \" REFFORMAT \"\\n\", edge_index, last_node);\n      }\n      if (!found) {\n        still_finding_chars = false;\n      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {\n        // We hit the end of an existing word, but the new word is longer.\n        // In this case we have to disconnect the existing word from the\n        // backwards root node, mark the current position as end-of-word\n        // and add new nodes for the increased length. 
Disconnecting the\n        // existing word from the backwards root node requires a linear\n        // search, so it is much faster to add the longest words first,\n        // to avoid having to come here.\n        word_end = true;\n        still_finding_chars = false;\n        remove_edge(last_node, 0, word_end, unichar_id);\n      } else {\n        // We have to add a new branch here for the new word.\n        if (marker_flag) {\n          set_marker_flag_in_edge_rec(edge_ptr);\n        }\n        last_node = next_node_from_edge_rec(*edge_ptr);\n      }\n    }\n    if (!still_finding_chars) {\n      the_next_node = new_dawg_node();\n      if (debug_level_ > 1) {\n        tprintf(\"adding node \" REFFORMAT \"\\n\", the_next_node);\n      }\n      if (the_next_node == 0) {\n        add_failed = true;\n        break;\n      }\n      if (!add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) {\n        add_failed = true;\n        break;\n      }\n      word_end = false;\n      last_node = the_next_node;\n    }\n  }\n  the_next_node = 0;\n  unichar_id = word.unichar_id(i);\n  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;\n  if (debug_level_ > 1) {\n    tprintf(\"Adding letter %d\\n\", unichar_id);\n  }\n  if (still_finding_chars &&\n      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, unichar_id, &edge_ptr, &edge_index)) {\n    // An extension of this word already exists in the trie, so we\n    // only have to add the ending flags in both directions.\n    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), marker_flag, unichar_id);\n  } else {\n    // Add a link to node 0. All leaves connect to node 0 so the back links can\n    // be used in reduction to a dawg. 
This root backward node has one edge\n    // entry for every word, (except prefixes of longer words) so it is huge.\n    if (!add_failed && !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) {\n      add_failed = true;\n    }\n  }\n  if (add_failed) {\n    tprintf(\"Re-initializing document dictionary...\\n\");\n    clear();\n    return false;\n  } else {\n    return true;\n  }\n}\n\nNODE_REF Trie::new_dawg_node() {\n  auto *node = new TRIE_NODE_RECORD();\n  nodes_.push_back(node);\n  return nodes_.size() - 1;\n}\n\nbool Trie::read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,\n                                  Trie::RTLReversePolicy reverse_policy) {\n  std::vector<std::string> word_list;\n  if (!read_word_list(filename, &word_list)) {\n    return false;\n  }\n  std::sort(word_list.begin(), word_list.end(),\n            [](auto &s1, auto &s2) { return s1.size() > s2.size(); });\n  return add_word_list(word_list, unicharset, reverse_policy);\n}\n\nbool Trie::read_word_list(const char *filename, std::vector<std::string> *words) {\n  FILE *word_file;\n  char line_str[CHARS_PER_LINE];\n  int word_count = 0;\n\n  word_file = fopen(filename, \"rb\");\n  if (word_file == nullptr) {\n    return false;\n  }\n\n  while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {\n    chomp_string(line_str); // remove newline\n    std::string word_str(line_str);\n    ++word_count;\n    if (debug_level_ && word_count % 10000 == 0) {\n      tprintf(\"Read %d words so far\\n\", word_count);\n    }\n    words->push_back(word_str);\n  }\n  if (debug_level_) {\n    tprintf(\"Read %d words total.\\n\", word_count);\n  }\n  fclose(word_file);\n  return true;\n}\n\nbool Trie::add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,\n                         Trie::RTLReversePolicy reverse_policy) {\n  for (const auto &i : words) {\n    WERD_CHOICE word(i.c_str(), unicharset);\n    if (word.empty() || 
word.contains_unichar_id(INVALID_UNICHAR_ID)) {\n      continue;\n    }\n    if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && word.has_rtl_unichar_id()) ||\n        reverse_policy == RRP_FORCE_REVERSE) {\n      word.reverse_and_mirror_unichar_ids();\n    }\n    if (!word_in_dawg(word)) {\n      add_word_to_dawg(word);\n      if (!word_in_dawg(word)) {\n        tprintf(\"Error: word '%s' not in DAWG after adding it\\n\", i.c_str());\n        return false;\n      }\n    }\n  }\n  return true;\n}\n\nvoid Trie::initialize_patterns(UNICHARSET *unicharset) {\n  unicharset->unichar_insert(kAlphaPatternUnicode);\n  alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);\n  unicharset->unichar_insert(kDigitPatternUnicode);\n  digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);\n  unicharset->unichar_insert(kAlphanumPatternUnicode);\n  alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);\n  unicharset->unichar_insert(kPuncPatternUnicode);\n  punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);\n  unicharset->unichar_insert(kLowerPatternUnicode);\n  lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);\n  unicharset->unichar_insert(kUpperPatternUnicode);\n  upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);\n  initialized_patterns_ = true;\n  unicharset_size_ = unicharset->size();\n}\n\nvoid Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,\n                                  std::vector<UNICHAR_ID> *vec) const {\n  bool is_alpha = unicharset.get_isalpha(unichar_id);\n  if (is_alpha) {\n    vec->push_back(alpha_pattern_);\n    vec->push_back(alphanum_pattern_);\n    if (unicharset.get_islower(unichar_id)) {\n      vec->push_back(lower_pattern_);\n    } else if (unicharset.get_isupper(unichar_id)) {\n      vec->push_back(upper_pattern_);\n    }\n  }\n  if (unicharset.get_isdigit(unichar_id)) {\n    vec->push_back(digit_pattern_);\n    if (!is_alpha) {\n      
vec->push_back(alphanum_pattern_);\n    }\n  }\n  if (unicharset.get_ispunctuation(unichar_id)) {\n    vec->push_back(punc_pattern_);\n  }\n}\n\nUNICHAR_ID Trie::character_class_to_pattern(char ch) {\n  if (ch == 'c') {\n    return alpha_pattern_;\n  } else if (ch == 'd') {\n    return digit_pattern_;\n  } else if (ch == 'n') {\n    return alphanum_pattern_;\n  } else if (ch == 'p') {\n    return punc_pattern_;\n  } else if (ch == 'a') {\n    return lower_pattern_;\n  } else if (ch == 'A') {\n    return upper_pattern_;\n  } else {\n    return INVALID_UNICHAR_ID;\n  }\n}\n\nbool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset) {\n  if (!initialized_patterns_) {\n    tprintf(\"please call initialize_patterns() before read_pattern_list()\\n\");\n    return false;\n  }\n\n  FILE *pattern_file = fopen(filename, \"rb\");\n  if (pattern_file == nullptr) {\n    tprintf(\"Error opening pattern file %s\\n\", filename);\n    return false;\n  }\n\n  int pattern_count = 0;\n  char string[CHARS_PER_LINE];\n  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {\n    chomp_string(string); // remove newline\n    // Parse the pattern and construct a unichar id vector.\n    // Record the number of repetitions of each unichar in the parallel vector.\n    WERD_CHOICE word(&unicharset);\n    std::vector<bool> repetitions_vec;\n    const char *str_ptr = string;\n    int step = unicharset.step(str_ptr);\n    bool failed = false;\n    while (step > 0) {\n      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;\n      if (step == 1 && *str_ptr == '\\\\') {\n        ++str_ptr;\n        if (*str_ptr == '\\\\') { // regular '\\' unichar that was escaped\n          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);\n        } else {\n#if 0 // TODO: This code should be enabled if kSaneNumConcreteChars != 0.\n          if (word.length() < kSaneNumConcreteChars) {\n            tprintf(\n                \"Please provide at least %d concrete characters at 
the\"\n                \" beginning of the pattern\\n\",\n                kSaneNumConcreteChars);\n            failed = true;\n            break;\n          }\n#endif\n          // Parse character class from expression.\n          curr_unichar_id = character_class_to_pattern(*str_ptr);\n        }\n      } else {\n        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);\n      }\n      if (curr_unichar_id == INVALID_UNICHAR_ID) {\n        failed = true;\n        break; // failed to parse this pattern\n      }\n      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);\n      repetitions_vec.push_back(false);\n      str_ptr += step;\n      step = unicharset.step(str_ptr);\n      // Check if there is a repetition pattern specified after this unichar.\n      if (step == 1 && *str_ptr == '\\\\' && *(str_ptr + 1) == '*') {\n        repetitions_vec[repetitions_vec.size() - 1] = true;\n        str_ptr += 2;\n        step = unicharset.step(str_ptr);\n      }\n    }\n    if (failed) {\n      tprintf(\"Invalid user pattern %s\\n\", string);\n      continue;\n    }\n    // Insert the pattern into the trie.\n    if (debug_level_ > 2) {\n      tprintf(\"Inserting expanded user pattern %s\\n\", word.debug_string().c_str());\n    }\n    if (!this->word_in_dawg(word)) {\n      this->add_word_to_dawg(word, &repetitions_vec);\n      if (!this->word_in_dawg(word)) {\n        tprintf(\"Error: failed to insert pattern '%s'\\n\", string);\n      }\n    }\n    ++pattern_count;\n  }\n  if (debug_level_) {\n    tprintf(\"Read %d valid patterns from %s\\n\", pattern_count, filename);\n  }\n  fclose(pattern_file);\n  return true;\n}\n\nvoid Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,\n                               UNICHAR_ID unichar_id) {\n  EDGE_RECORD *edge_ptr = nullptr;\n  EDGE_INDEX edge_index = 0;\n  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, unichar_id, &edge_ptr, &edge_index));\n  if (debug_level_ > 1) {\n    
tprintf(\"removed edge in nodes_[\" REFFORMAT \"]: \", node1);\n    print_edge_rec(*edge_ptr);\n    tprintf(\"\\n\");\n  }\n  if (direction == FORWARD_EDGE) {\n    nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);\n  } else if (node1 == 0) {\n    KillEdge(&nodes_[node1]->backward_edges[edge_index]);\n    root_back_freelist_.push_back(edge_index);\n  } else {\n    nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);\n  }\n  --num_edges_;\n}\n\n// Some optimizations employed in add_word_to_dawg and trie_to_dawg:\n// 1 Avoid insertion sorting or bubble sorting the tail root node\n//   (back links on node 0, a list of all the leaves.). The node is\n//   huge, and sorting it with n^2 time is terrible.\n// 2 Avoid using vector::erase on the tail root node.\n//   (a) During add of words to the trie, zero-out the unichars and\n//       keep a freelist of spaces to re-use.\n//   (b) During reduction, just zero-out the unichars of deleted back\n//       links, skipping zero entries while searching.\n// 3 Avoid linear search of the tail root node. This has to be done when\n//   a suffix is added to an existing word. Adding words by decreasing\n//   length avoids this problem entirely. 
Words can still be added in\n//   any order, but it is faster to add the longest first.\nSquishedDawg *Trie::trie_to_dawg() {\n  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.\n  if (debug_level_ > 2) {\n    print_all(\"Before reduction:\", MAX_NODE_EDGES_DISPLAY);\n  }\n  std::vector<bool> reduced_nodes(nodes_.size());\n  this->reduce_node_input(0, reduced_nodes);\n\n  if (debug_level_ > 2) {\n    print_all(\"After reduction:\", MAX_NODE_EDGES_DISPLAY);\n  }\n  // Build a translation map from node indices in nodes_ vector to\n  // their target indices in EDGE_ARRAY.\n  std::vector<NODE_REF> node_ref_map(nodes_.size() + 1);\n  unsigned i;\n  for (i = 0; i < nodes_.size(); ++i) {\n    node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();\n  }\n  int num_forward_edges = node_ref_map[i];\n\n  // Convert nodes_ vector into EDGE_ARRAY translating the next node references\n  // in edges using node_ref_map. Empty nodes and backward edges are dropped.\n  auto edge_array = new EDGE_RECORD[num_forward_edges];\n  EDGE_ARRAY edge_array_ptr = edge_array;\n  for (i = 0; i < nodes_.size(); ++i) {\n    TRIE_NODE_RECORD *node_ptr = nodes_[i];\n    int end = node_ptr->forward_edges.size();\n    for (int j = 0; j < end; ++j) {\n      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];\n      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);\n      ASSERT_HOST(static_cast<size_t>(node_ref) < nodes_.size());\n      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);\n      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,\n                end_of_word_from_edge_rec(edge_rec), unichar_id);\n      if (j == end - 1) {\n        set_marker_flag_in_edge_rec(edge_array_ptr);\n      }\n      ++edge_array_ptr;\n    }\n  }\n\n  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,\n                          debug_level_);\n}\n\nbool Trie::eliminate_redundant_edges(NODE_REF node, const 
EDGE_RECORD &edge1,\n                                     const EDGE_RECORD &edge2) {\n  if (debug_level_ > 1) {\n    tprintf(\"\\nCollapsing node %\" PRIi64 \":\\n\", node);\n    print_node(node, MAX_NODE_EDGES_DISPLAY);\n    tprintf(\"Candidate edges: \");\n    print_edge_rec(edge1);\n    tprintf(\", \");\n    print_edge_rec(edge2);\n    tprintf(\"\\n\\n\");\n  }\n  NODE_REF next_node1 = next_node_from_edge_rec(edge1);\n  NODE_REF next_node2 = next_node_from_edge_rec(edge2);\n  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];\n  // Translate all edges going to/from next_node2 to go to/from next_node1.\n  EDGE_RECORD *edge_ptr = nullptr;\n  EDGE_INDEX edge_index;\n  // The backward link in node to next_node2 will be zeroed out by the caller.\n  // Copy all the backward links in next_node2 to node next_node1\n  for (unsigned i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {\n    const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];\n    NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);\n    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);\n    int curr_word_end = end_of_word_from_edge_rec(bkw_edge);\n    bool marker_flag = marker_flag_from_edge_rec(bkw_edge);\n    add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, curr_word_end,\n                     curr_unichar_id);\n    // Relocate the corresponding forward edge in curr_next_node\n    ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, curr_word_end,\n                             curr_unichar_id, &edge_ptr, &edge_index));\n    set_next_node_in_edge_rec(edge_ptr, next_node1);\n  }\n  int next_node2_num_edges =\n      (next_node2_ptr->forward_edges.size() + next_node2_ptr->backward_edges.size());\n  if (debug_level_ > 1) {\n    tprintf(\"removed %d edges from node \" REFFORMAT \"\\n\", next_node2_num_edges, next_node2);\n  }\n  next_node2_ptr->forward_edges.clear();\n  next_node2_ptr->backward_edges.clear();\n  num_edges_ -= 
next_node2_num_edges;\n  return true;\n}\n\nbool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,\n                                 EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes) {\n  if (debug_level_ > 1) {\n    tprintf(\"reduce_lettered_edges(edge=\" REFFORMAT \")\\n\", edge_index);\n  }\n  // Compare each of the edge pairs with the given unichar_id.\n  bool did_something = false;\n  for (unsigned i = edge_index; i < backward_edges->size() - 1; ++i) {\n    // Find the first edge that can be eliminated.\n    UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;\n    while (i < backward_edges->size()) {\n      if (!DeadEdge((*backward_edges)[i])) {\n        curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);\n        if (curr_unichar_id != unichar_id) {\n          return did_something;\n        }\n        if (can_be_eliminated((*backward_edges)[i])) {\n          break;\n        }\n      }\n      ++i;\n    }\n    if (i == backward_edges->size()) {\n      break;\n    }\n    const EDGE_RECORD &edge_rec = (*backward_edges)[i];\n    // Compare it to the rest of the edges with the given unichar_id.\n    for (auto j = i + 1; j < backward_edges->size(); ++j) {\n      const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];\n      if (DeadEdge(next_edge_rec)) {\n        continue;\n      }\n      UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);\n      if (next_id != unichar_id) {\n        break;\n      }\n      if (end_of_word_from_edge_rec(next_edge_rec) == end_of_word_from_edge_rec(edge_rec) &&\n          can_be_eliminated(next_edge_rec) &&\n          eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {\n        reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;\n        did_something = true;\n        KillEdge(&(*backward_edges)[j]);\n      }\n    }\n  }\n  return did_something;\n}\n\nvoid Trie::sort_edges(EDGE_VECTOR *edges) {\n  int num_edges = edges->size();\n  if (num_edges <= 
1) {\n    return;\n  }\n  std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;\n  sort_vec.reserve(num_edges);\n  for (int i = 0; i < num_edges; ++i) {\n    sort_vec.emplace_back(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]);\n  }\n  std::sort(sort_vec.begin(), sort_vec.end());\n  for (int i = 0; i < num_edges; ++i) {\n    (*edges)[i] = sort_vec[i].data();\n  }\n}\n\nvoid Trie::reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes) {\n  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;\n  sort_edges(&backward_edges);\n  if (debug_level_ > 1) {\n    tprintf(\"reduce_node_input(node=\" REFFORMAT \")\\n\", node);\n    print_node(node, MAX_NODE_EDGES_DISPLAY);\n  }\n\n  EDGE_INDEX edge_index = 0;\n  while (static_cast<size_t>(edge_index) < backward_edges.size()) {\n    if (DeadEdge(backward_edges[edge_index])) {\n      continue;\n    }\n    UNICHAR_ID unichar_id = unichar_id_from_edge_rec(backward_edges[edge_index]);\n    while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) {\n      ;\n    }\n    while (static_cast<size_t>(++edge_index) < backward_edges.size()) {\n      UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);\n      if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) {\n        break;\n      }\n    }\n  }\n  reduced_nodes[node] = true; // mark as reduced\n\n  if (debug_level_ > 1) {\n    tprintf(\"Node \" REFFORMAT \" after reduction:\\n\", node);\n    print_node(node, MAX_NODE_EDGES_DISPLAY);\n  }\n\n  for (auto &backward_edge : backward_edges) {\n    if (DeadEdge(backward_edge)) {\n      continue;\n    }\n    NODE_REF next_node = next_node_from_edge_rec(backward_edge);\n    if (next_node != 0 && !reduced_nodes[next_node]) {\n      reduce_node_input(next_node, reduced_nodes);\n    }\n  }\n}\n\nvoid Trie::print_node(NODE_REF node, int max_num_edges) const {\n  if (node == NO_EDGE) {\n    return; // nothing to print\n  }\n  TRIE_NODE_RECORD *node_ptr = 
nodes_[node];\n  int num_fwd = node_ptr->forward_edges.size();\n  int num_bkw = node_ptr->backward_edges.size();\n  EDGE_VECTOR *vec;\n  for (int dir = 0; dir < 2; ++dir) {\n    if (dir == 0) {\n      vec = &(node_ptr->forward_edges);\n      tprintf(REFFORMAT \" (%d %d): \", node, num_fwd, num_bkw);\n    } else {\n      vec = &(node_ptr->backward_edges);\n      tprintf(\"\\t\");\n    }\n    int i;\n    for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && i < max_num_edges; ++i) {\n      if (DeadEdge((*vec)[i])) {\n        continue;\n      }\n      print_edge_rec((*vec)[i]);\n      tprintf(\" \");\n    }\n    if (dir == 0 ? i < num_fwd : i < num_bkw) {\n      tprintf(\"...\");\n    }\n    tprintf(\"\\n\");\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/dict/trie.h",
    "content": "/******************************************************************************\n *\n * File:        trie.h\n * Description: Functions to build a trie data structure.\n * Author:      Mark Seaman, SW Productivity\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n#ifndef TRIE_H\n#define TRIE_H\n\n#include \"dawg.h\"\n\nnamespace tesseract {\n\nclass UNICHARSET;\n\n// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed\n// max int32, we will need to change vector to use int64 for size\n// and address indices. 
This does not seem to be needed immediately,\n// since currently the largest number of edges limit used by tesseract\n// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.\n// There are also int casts below to satisfy the WIN32 compiler that would\n// need to be changed.\n// It might be cleanest to change the types of most of the Trie/Dawg related\n// typedefs to int and restrict the casts to extracting these values from\n// the 64 bit EDGE_RECORD.\nusing EDGE_INDEX = int64_t; // index of an edge in a given node\nusing EDGE_VECTOR = std::vector<EDGE_RECORD>;\n\nstruct TRIE_NODE_RECORD {\n  EDGE_VECTOR forward_edges;\n  EDGE_VECTOR backward_edges;\n};\nusing TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;\n\n/**\n * Concrete class for Trie data structure that allows to store a list of\n * words (extends Dawg base class) as well as dynamically add new words.\n * This class stores a vector of pointers to TRIE_NODE_RECORDs, each of\n * which has a vector of forward and backward edges.\n */\nclass TESS_API Trie : public Dawg {\npublic:\n  enum RTLReversePolicy {\n    RRP_DO_NO_REVERSE,\n    RRP_REVERSE_IF_HAS_RTL,\n    RRP_FORCE_REVERSE,\n  };\n\n  // Minimum number of concrete characters at the beginning of user patterns.\n  static const int kSaneNumConcreteChars = 0;\n  // Various unicode whitespace characters are used to denote unichar patterns,\n  // (character classifier would never produce these whitespace characters as a\n  // valid classification).\n  static const char kAlphaPatternUnicode[];\n  static const char kDigitPatternUnicode[];\n  static const char kAlphanumPatternUnicode[];\n  static const char kPuncPatternUnicode[];\n  static const char kLowerPatternUnicode[];\n  static const char kUpperPatternUnicode[];\n\n  static const char *get_reverse_policy_name(RTLReversePolicy reverse_policy);\n\n  // max_num_edges argument allows limiting the amount of memory this\n  // Trie can consume (if a new word insert would cause the Trie to\n  // contain 
more edges than max_num_edges, all the edges are cleared\n  // so that new inserts can proceed).\n  Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)\n      : Dawg(type, lang, perm, debug_level) {\n    init(unicharset_size);\n    deref_node_index_mask_ = ~letter_mask_;\n    new_dawg_node(); // need to allocate node 0\n  }\n  ~Trie() override {\n    for (auto node : nodes_) {\n      delete node;\n    }\n  }\n\n  // Reset the Trie to empty.\n  void clear();\n\n  /** Returns the edge that corresponds to the letter out of this node. */\n  EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override {\n    EDGE_RECORD *edge_ptr;\n    EDGE_INDEX edge_index;\n    if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,\n                      &edge_index)) {\n      return NO_EDGE;\n    }\n    return make_edge_ref(node_ref, edge_index);\n  }\n\n  /**\n   * Fills the given NodeChildVector with all the unichar ids (and the\n   * corresponding EDGE_REFs) for which there is an edge out of this node.\n   */\n  void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override {\n    const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges;\n    for (auto &edge : forward_edges) {\n      if (!word_end || end_of_word_from_edge_rec(edge)) {\n        vec->push_back(\n            NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));\n      }\n    }\n  }\n\n  /**\n   * Returns the next node visited by following the edge\n   * indicated by the given EDGE_REF.\n   */\n  NODE_REF next_node(EDGE_REF edge_ref) const override {\n    if (edge_ref == NO_EDGE || num_edges_ == 0) {\n      return NO_EDGE;\n    }\n    return next_node_from_edge_rec(*deref_edge_ref(edge_ref));\n  }\n\n  /**\n   * Returns true if the edge indicated by the given EDGE_REF\n   * marks the end of a word.\n   */\n  bool 
end_of_word(EDGE_REF edge_ref) const override {\n    if (edge_ref == NO_EDGE || num_edges_ == 0) {\n      return false;\n    }\n    return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));\n  }\n\n  /** Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. */\n  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {\n    if (edge_ref == NO_EDGE || num_edges_ == 0) {\n      return INVALID_UNICHAR_ID;\n    }\n    return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));\n  }\n  // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking\n  // the edge dead.\n  void KillEdge(EDGE_RECORD *edge_rec) const {\n    *edge_rec &= ~letter_mask_;\n    *edge_rec |= (unicharset_size_ << LETTER_START_BIT);\n  }\n  bool DeadEdge(const EDGE_RECORD &edge_rec) const {\n    return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;\n  }\n\n  // Prints the contents of the node indicated by the given NODE_REF.\n  // At most max_num_edges will be printed.\n  void print_node(NODE_REF node, int max_num_edges) const override;\n\n  // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.\n  // Eliminates redundant edges and returns the pointer to the SquishedDawg.\n  // Note: the caller is responsible for deallocating memory associated\n  // with the returned SquishedDawg pointer.\n  SquishedDawg *trie_to_dawg();\n\n  // Reads a list of words from the given file and adds into the Trie.\n  // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse\n  // policy and information in the unicharset.\n  // Returns false on error.\n  bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,\n                              Trie::RTLReversePolicy reverse);\n\n  // Reads a list of words from the given file.\n  // Returns false on error.\n  bool read_word_list(const char *filename, std::vector<std::string> *words);\n  // Adds a list of words previously read using read_word_list to the trie\n  // using the 
given unicharset and reverse_policy to convert to unichar-ids.\n  // Returns false on error.\n  bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,\n                     Trie::RTLReversePolicy reverse_policy);\n\n  // Inserts the list of patterns from the given file into the Trie.\n  // The pattern list file should contain one pattern per line in UTF-8 format.\n  //\n  // Each pattern can contain any non-whitespace characters, however only the\n  // patterns that contain characters from the unicharset of the corresponding\n  // language will be useful.\n  // The only meta character is '\\'. To be used in a pattern as an ordinary\n  // string it should be escaped with '\\' (e.g. string \"C:\\Documents\" should\n  // be written in the patterns file as \"C:\\\\Documents\").\n  // This function supports a very limited regular expression syntax. One can\n  // express a character, a certain character class and a number of times the\n  // entity should be repeated in the pattern.\n  //\n  // To denote a character class use one of:\n  // \\c - unichar for which UNICHARSET::get_isalpha() is true (character)\n  // \\d - unichar for which UNICHARSET::get_isdigit() is true\n  // \\n - unichar for which UNICHARSET::get_isdigit() or\n  //      UNICHARSET::isalpha() are true\n  // \\p - unichar for which UNICHARSET::get_ispunct() is true\n  // \\a - unichar for which UNICHARSET::get_islower() is true\n  // \\A - unichar for which UNICHARSET::get_isupper() is true\n  //\n  // \\* could be specified after each character or pattern to indicate that\n  // the character/pattern can be repeated any number of times before the next\n  // character/pattern occurs.\n  //\n  // Examples:\n  // 1-8\\d\\d-GOOG-411 will be expanded to strings:\n  // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.\n  //\n  // http://www.\\n\\*.com will be expanded to strings like:\n  // http://www.a.com http://www.a123.com ... 
http://www.ABCDefgHIJKLMNop.com\n  //\n  // Note: In choosing which patterns to include please be aware of the fact\n  // providing very generic patterns will make tesseract run slower.\n  // For example \\n\\* at the beginning of the pattern will make Tesseract\n  // consider all the combinations of proposed character choices for each\n  // of the segmentations, which will be unacceptably slow.\n  // Because of potential problems with speed that could be difficult to\n  // identify, each user pattern has to have at least kSaneNumConcreteChars\n  // concrete characters from the unicharset at the beginning.\n  bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);\n\n  // Initializes the values of *_pattern_ unichar ids.\n  // This function should be called before calling read_pattern_list().\n  void initialize_patterns(UNICHARSET *unicharset);\n\n  // Fills in the given unichar id vector with the unichar ids that represent\n  // the patterns of the character classes of the given unichar_id.\n  void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,\n                              std::vector<UNICHAR_ID> *vec) const override;\n\n  // Returns the given EDGE_REF if the EDGE_RECORD that it points to has\n  // a self loop and the given unichar_id matches the unichar_id stored in the\n  // EDGE_RECORD, returns NO_EDGE otherwise.\n  EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id,\n                             bool word_end) const override {\n    if (edge_ref == NO_EDGE) {\n      return NO_EDGE;\n    }\n    EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);\n    return (marker_flag_from_edge_rec(*edge_rec) &&\n            unichar_id == unichar_id_from_edge_rec(*edge_rec) &&\n            word_end == end_of_word_from_edge_rec(*edge_rec))\n               ? 
edge_ref\n               : NO_EDGE;\n  }\n\n  // Adds a word to the Trie (creates the necessary nodes and edges).\n  //\n  // If repetitions vector is not nullptr, each entry in the vector indicates\n  // whether the unichar id with the corresponding index in the word is allowed\n  // to repeat an unlimited number of times. For each entry that is true, MARKER\n  // flag of the corresponding edge created for this unichar id is set to true).\n  //\n  // Return true if add succeeded, false otherwise (e.g. when a word contained\n  // an invalid unichar id or the trie was getting too large and was cleared).\n  bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);\n  bool add_word_to_dawg(const WERD_CHOICE &word) {\n    return add_word_to_dawg(word, nullptr);\n  }\n\nprotected:\n  // The structure of an EDGE_REF for Trie edges is as follows:\n  // [LETTER_START_BIT, flag_start_bit_):\n  //                             edge index in *_edges in a TRIE_NODE_RECORD\n  // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)\n  //\n  // With this arrangement there are enough bits to represent edge indices\n  // (each node can have at most unicharset_size_ forward edges and\n  // the position of flag_start_bit is set to be log2(unicharset_size_)).\n  // It is also possible to accommodate a maximum number of nodes that is at\n  // least as large as that of the SquishedDawg representation (in SquishedDawg\n  // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent\n  // the next node index).\n  //\n\n  // Returns the pointer to EDGE_RECORD after decoding the location\n  // of the edge from the information in the given EDGE_REF.\n  // This function assumes that EDGE_REF holds valid node/edge indices.\n  inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {\n    int edge_index = static_cast<int>((edge_ref & letter_mask_) >> LETTER_START_BIT);\n    int node_index = static_cast<int>((edge_ref & 
deref_node_index_mask_) >> flag_start_bit_);\n    TRIE_NODE_RECORD *node_rec = nodes_[node_index];\n    return &(node_rec->forward_edges[edge_index]);\n  }\n  /** Constructs EDGE_REF from the given node_index and edge_index. */\n  inline EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const {\n    return ((node_index << flag_start_bit_) | (edge_index << LETTER_START_BIT));\n  }\n  /** Sets up this edge record to the requested values. */\n  inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end,\n                        UNICHAR_ID unichar_id) {\n    EDGE_RECORD flags = 0;\n    if (repeats) {\n      flags |= MARKER_FLAG;\n    }\n    if (word_end) {\n      flags |= WERD_END_FLAG;\n    }\n    if (direction == BACKWARD_EDGE) {\n      flags |= DIRECTION_FLAG;\n    }\n    *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |\n             (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));\n  }\n  /** Prints the given EDGE_RECORD. */\n  inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {\n    tprintf(\"|\" REFFORMAT \"|%s%s%s|%d|\", next_node_from_edge_rec(edge_rec),\n            marker_flag_from_edge_rec(edge_rec) ? \"R,\" : \"\",\n            (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? \"F\" : \"B\",\n            end_of_word_from_edge_rec(edge_rec) ? 
\",E\" : \"\", unichar_id_from_edge_rec(edge_rec));\n  }\n  // Returns true if the next node in recorded the given EDGE_RECORD\n  // has exactly one forward edge.\n  inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {\n    NODE_REF node_ref = next_node_from_edge_rec(edge_rec);\n    return (node_ref != NO_EDGE && nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);\n  }\n\n  // Prints the contents of the Trie.\n  // At most max_num_edges will be printed for each node.\n  void print_all(const char *msg, int max_num_edges) {\n    tprintf(\"\\n__________________________\\n%s\\n\", msg);\n    for (size_t i = 0; i < nodes_.size(); ++i) {\n      print_node(i, max_num_edges);\n    }\n    tprintf(\"__________________________\\n\");\n  }\n\n  // Finds the edge with the given direction, word_end and unichar_id\n  // in the node indicated by node_ref. Fills in the pointer to the\n  // EDGE_RECORD and the index of the edge with the values\n  // corresponding to the edge found. Returns true if an edge was found.\n  bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,\n                    UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;\n\n  // Adds an single edge linkage between node1 and node2 in the direction\n  // indicated by direction argument.\n  bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end,\n                        UNICHAR_ID unichar_id);\n\n  // Adds forward edge linkage from node1 to node2 and the corresponding\n  // backward edge linkage in the other direction.\n  bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end,\n                    UNICHAR_ID unichar_id) {\n    return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) &&\n            add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id));\n  }\n\n  // Sets the word ending flags in an already existing edge 
pair.\n  // Returns true on success.\n  void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats,\n                       UNICHAR_ID unichar_id);\n\n  // Allocates space for a new node in the Trie.\n  NODE_REF new_dawg_node();\n\n  // Removes a single edge linkage to between node1 and node2 in the\n  // direction indicated by direction argument.\n  void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,\n                           UNICHAR_ID unichar_id);\n\n  // Removes forward edge linkage from node1 to node2 and the corresponding\n  // backward edge linkage in the other direction.\n  void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) {\n    remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);\n    remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);\n  }\n\n  // Compares edge1 and edge2 in the given node to see if they point to two\n  // next nodes that could be collapsed. If they do, performs the reduction\n  // and returns true.\n  bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2);\n\n  // Assuming that edge_index indicates the first edge in a group of edges\n  // in this node with a particular letter value, looks through these edges\n  // to see if any of them can be collapsed. If so does it. Returns to the\n  // caller when all edges with this letter have been reduced.\n  // Returns true if further reduction is possible with this same letter.\n  bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,\n                             EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);\n\n  /**\n   * Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in\n   * increasing order of unichar ids. 
This function is normally called\n   * for all edges in a single node, and since number of edges in each node\n   * is usually quite small, selection sort is used.\n   */\n  void sort_edges(EDGE_VECTOR *edges);\n\n  /** Eliminates any redundant edges from this node in the Trie. */\n  void reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes);\n\n  // Returns the pattern unichar id for the given character class code.\n  UNICHAR_ID character_class_to_pattern(char ch);\n\n  // Member variables\n  TRIE_NODES nodes_; // vector of nodes in the Trie\n  // Freelist of edges in the root backwards node that were previously zeroed.\n  std::vector<EDGE_INDEX> root_back_freelist_;\n  uint64_t num_edges_ = 0;             // sum of all edges (forward and backward)\n  uint64_t deref_direction_mask_ = 0;  // mask for EDGE_REF to extract direction\n  uint64_t deref_node_index_mask_ = 0; // mask for EDGE_REF to extract node index\n  // Variables for translating character class codes denoted in user patterns\n  // file to the unichar ids used to represent them in a Trie.\n  UNICHAR_ID alpha_pattern_ = 0;\n  UNICHAR_ID digit_pattern_ = 0;\n  UNICHAR_ID alphanum_pattern_ = 0;\n  UNICHAR_ID punc_pattern_ = 0;\n  UNICHAR_ID lower_pattern_ = 0;\n  UNICHAR_ID upper_pattern_ = 0;\n  bool initialized_patterns_ = false;\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/lstm/convolve.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        convolve.cpp\n// Description: Convolutional layer that stacks the inputs over its rectangle\n//              and pulls in random data to fill out-of-input inputs.\n//              Output is therefore same size as its input, but deeper.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"convolve.h\"\n\n#include \"networkscratch.h\"\n#include \"serialis.h\"\n\nnamespace tesseract {\n\nConvolve::Convolve(const std::string &name, int ni, int half_x, int half_y)\n    : Network(NT_CONVOLVE, name, ni, ni * (2 * half_x + 1) * (2 * half_y + 1))\n    , half_x_(half_x)\n    , half_y_(half_y) {}\n\n// Writes to the given file. Returns false in case of error.\nbool Convolve::Serialize(TFile *fp) const {\n  return Network::Serialize(fp) && fp->Serialize(&half_x_) && fp->Serialize(&half_y_);\n}\n\n// Reads from the given file. 
Returns false in case of error.\nbool Convolve::DeSerialize(TFile *fp) {\n  if (!fp->DeSerialize(&half_x_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&half_y_)) {\n    return false;\n  }\n  no_ = ni_ * (2 * half_x_ + 1) * (2 * half_y_ + 1);\n  return true;\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid Convolve::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                       NetworkScratch *scratch, NetworkIO *output) {\n  output->Resize(input, no_);\n  int y_scale = 2 * half_y_ + 1;\n  StrideMap::Index dest_index(output->stride_map());\n  do {\n    // Stack x_scale groups of y_scale * ni_ inputs together.\n    int t = dest_index.t();\n    int out_ix = 0;\n    for (int x = -half_x_; x <= half_x_; ++x, out_ix += y_scale * ni_) {\n      StrideMap::Index x_index(dest_index);\n      if (!x_index.AddOffset(x, FD_WIDTH)) {\n        // This x is outside the image.\n        output->Randomize(t, out_ix, y_scale * ni_, randomizer_);\n      } else {\n        int out_iy = out_ix;\n        for (int y = -half_y_; y <= half_y_; ++y, out_iy += ni_) {\n          StrideMap::Index y_index(x_index);\n          if (!y_index.AddOffset(y, FD_HEIGHT)) {\n            // This y is outside the image.\n            output->Randomize(t, out_iy, ni_, randomizer_);\n          } else {\n            output->CopyTimeStepGeneral(t, out_iy, ni_, input, y_index.t(), 0);\n          }\n        }\n      }\n    }\n  } while (dest_index.Increment());\n#ifndef GRAPHICS_DISABLED\n  if (debug) {\n    DisplayForward(*output);\n  }\n#endif\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool Convolve::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                        NetworkIO *back_deltas) {\n  back_deltas->Resize(fwd_deltas, ni_);\n  NetworkScratch::IO 
delta_sum;\n  delta_sum.ResizeFloat(fwd_deltas, ni_, scratch);\n  delta_sum->Zero();\n  int y_scale = 2 * half_y_ + 1;\n  StrideMap::Index src_index(fwd_deltas.stride_map());\n  do {\n    // Stack x_scale groups of y_scale * ni_ inputs together.\n    int t = src_index.t();\n    int out_ix = 0;\n    for (int x = -half_x_; x <= half_x_; ++x, out_ix += y_scale * ni_) {\n      StrideMap::Index x_index(src_index);\n      if (x_index.AddOffset(x, FD_WIDTH)) {\n        int out_iy = out_ix;\n        for (int y = -half_y_; y <= half_y_; ++y, out_iy += ni_) {\n          StrideMap::Index y_index(x_index);\n          if (y_index.AddOffset(y, FD_HEIGHT)) {\n            fwd_deltas.AddTimeStepPart(t, out_iy, ni_, delta_sum->f(y_index.t()));\n          }\n        }\n      }\n    }\n  } while (src_index.Increment());\n  back_deltas->CopyAll(*delta_sum);\n  return true;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/convolve.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        convolve.h\n// Description: Convolutional layer that stacks the inputs over its rectangle\n//              and pulls in random data to fill out-of-input inputs.\n//              Output is therefore same size as its input, but deeper.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_CONVOLVE_H_\n#define TESSERACT_LSTM_CONVOLVE_H_\n\n#include \"matrix.h\"\n#include \"network.h\"\n\nnamespace tesseract {\n\n// Makes each time-step deeper by stacking inputs over its rectangle. Does not\n// affect the size of its input. Achieves this by bringing in random values in\n// out-of-input areas.\nclass Convolve : public Network {\npublic:\n  // The area of convolution is 2*half_x + 1 by 2*half_y + 1, forcing it to\n  // always be odd, so the center is the current pixel.\n  TESS_API\n  Convolve(const std::string &name, int ni, int half_x, int half_y);\n  ~Convolve() override = default;\n\n  std::string spec() const override {\n    return \"C\" + std::to_string(half_y_ * 2 + 1) + \",\" + std::to_string(half_x_ * 2 + 1);\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const override;\n  // Reads from the given file. 
Returns false in case of error.\n  bool DeSerialize(TFile *fp) override;\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n\nprivate:\n  void DebugWeights() override {\n    tprintf(\"Must override Network::DebugWeights for type %d\\n\", type_);\n  }\n\nprotected:\n  // Serialized data.\n  int32_t half_x_;\n  int32_t half_y_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_SUBSAMPLE_H_\n"
  },
  {
    "path": "src/lstm/fullyconnected.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        fullyconnected.cpp\n// Description: Simple feed-forward layer with various non-linearities.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"fullyconnected.h\"\n\n#ifdef _OPENMP\n#  include <omp.h>\n#endif\n#include <cstdio>\n#include <cstdlib>\n\n#include \"functions.h\"\n#include \"networkscratch.h\"\n\n// Number of threads to use for parallel calculation of Forward and Backward.\n#ifdef _OPENMP\nconst int kNumThreads = 4;\n#else\nconst int kNumThreads = 1;\n#endif\n\nnamespace tesseract {\n\nFullyConnected::FullyConnected(const std::string &name, int ni, int no, NetworkType type)\n    : Network(type, name, ni, no), external_source_(nullptr), int_mode_(false) {}\n\n// Returns the shape output from the network given an input shape (which may\n// be partially unknown ie zero).\nStaticShape FullyConnected::OutputShape(const StaticShape &input_shape) const {\n  LossType loss_type = LT_NONE;\n  if (type_ == NT_SOFTMAX) {\n    loss_type = LT_CTC;\n  } else if (type_ == NT_SOFTMAX_NO_CTC) {\n    loss_type = LT_SOFTMAX;\n  } else if (type_ == NT_LOGISTIC) {\n    loss_type = LT_LOGISTIC;\n  }\n  StaticShape result(input_shape);\n  result.set_depth(no_);\n  
result.set_loss_type(loss_type);\n  return result;\n}\n\n// Suspends/Enables training by setting the training_ flag.\nvoid FullyConnected::SetEnableTraining(TrainingState state) {\n  if (state == TS_RE_ENABLE) {\n    // Enable only from temp disabled.\n    if (training_ == TS_TEMP_DISABLE) {\n      training_ = TS_ENABLED;\n    }\n  } else if (state == TS_TEMP_DISABLE) {\n    // Temp disable only from enabled.\n    if (training_ == TS_ENABLED) {\n      training_ = state;\n    }\n  } else {\n    if (state == TS_ENABLED && training_ != TS_ENABLED) {\n      weights_.InitBackward();\n    }\n    training_ = state;\n  }\n}\n\n// Sets up the network for training. Initializes weights using weights of\n// scale `range` picked according to the random number generator `randomizer`.\nint FullyConnected::InitWeights(float range, TRand *randomizer) {\n  Network::SetRandomizer(randomizer);\n  num_weights_ = weights_.InitWeightsFloat(no_, ni_ + 1, TestFlag(NF_ADAM), range, randomizer);\n  return num_weights_;\n}\n\n// Recursively searches the network for softmaxes with old_no outputs,\n// and remaps their outputs according to code_map. See network.h for details.\n\nint FullyConnected::RemapOutputs(int old_no, const std::vector<int> &code_map) {\n  if (type_ == NT_SOFTMAX && no_ == old_no) {\n    num_weights_ = weights_.RemapOutputs(code_map);\n    no_ = code_map.size();\n  }\n  return num_weights_;\n}\n\n// Converts a float network to an int network.\nvoid FullyConnected::ConvertToInt() {\n  weights_.ConvertToInt();\n}\n\n// Provides debug output on the weights.\nvoid FullyConnected::DebugWeights() {\n  weights_.Debug2D(name_.c_str());\n}\n\n// Writes to the given file. Returns false in case of error.\nbool FullyConnected::Serialize(TFile *fp) const {\n  if (!Network::Serialize(fp)) {\n    return false;\n  }\n  if (!weights_.Serialize(IsTraining(), fp)) {\n    return false;\n  }\n  return true;\n}\n\n// Reads from the given file. 
Returns false in case of error.\nbool FullyConnected::DeSerialize(TFile *fp) {\n  return weights_.DeSerialize(IsTraining(), fp);\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid FullyConnected::Forward(bool debug, const NetworkIO &input,\n                             const TransposedArray *input_transpose, NetworkScratch *scratch,\n                             NetworkIO *output) {\n  int width = input.Width();\n  if (type_ == NT_SOFTMAX) {\n    output->ResizeFloat(input, no_);\n  } else {\n    output->Resize(input, no_);\n  }\n  SetupForward(input, input_transpose);\n  std::vector<NetworkScratch::FloatVec> temp_lines(kNumThreads);\n  std::vector<NetworkScratch::FloatVec> curr_input(kNumThreads);\n  int ro = no_;\n  if (IntSimdMatrix::intSimdMatrix) {\n    ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);\n  }\n  for (int i = 0; i < kNumThreads; ++i) {\n    temp_lines[i].Init(ro, scratch);\n    curr_input[i].Init(ni_, scratch);\n  }\n#ifdef _OPENMP\n#  pragma omp parallel for num_threads(kNumThreads)\n  for (int t = 0; t < width; ++t) {\n    // Thread-local pointer to temporary storage.\n    int thread_id = omp_get_thread_num();\n#else\n  for (int t = 0; t < width; ++t) {\n    // Thread-local pointer to temporary storage.\n    int thread_id = 0;\n#endif\n    TFloat *temp_line = temp_lines[thread_id];\n    if (input.int_mode()) {\n      ForwardTimeStep(input.i(t), t, temp_line);\n    } else {\n      input.ReadTimeStep(t, curr_input[thread_id]);\n      ForwardTimeStep(curr_input[thread_id], t, temp_line);\n    }\n    output->WriteTimeStep(t, temp_line);\n    if (IsTraining() && type_ != NT_SOFTMAX) {\n      acts_.CopyTimeStepFrom(t, *output, t);\n    }\n  }\n  // Zero all the elements that are in the padding around images that allows\n  // multiple different-sized images to exist in a single array.\n  // acts_ is only used if this is not a softmax op.\n  if (IsTraining() && 
type_ != NT_SOFTMAX) {\n    acts_.ZeroInvalidElements();\n  }\n  output->ZeroInvalidElements();\n#if DEBUG_DETAIL > 0\n  tprintf(\"F Output:%s\\n\", name_.c_str());\n  output->Print(10);\n#endif\n#ifndef GRAPHICS_DISABLED\n  if (debug) {\n    DisplayForward(*output);\n  }\n#endif\n}\n\n// Components of Forward so FullyConnected can be reused inside LSTM.\nvoid FullyConnected::SetupForward(const NetworkIO &input, const TransposedArray *input_transpose) {\n  // Softmax output is always float, so save the input type.\n  int_mode_ = input.int_mode();\n  if (IsTraining()) {\n    acts_.Resize(input, no_);\n    // Source_ is a transposed copy of input. It isn't needed if provided.\n    external_source_ = input_transpose;\n    if (external_source_ == nullptr) {\n      source_t_.ResizeNoInit(ni_, input.Width());\n    }\n  }\n}\n\nvoid FullyConnected::ForwardTimeStep(int t, TFloat *output_line) {\n  if (type_ == NT_TANH) {\n    FuncInplace<GFunc>(no_, output_line);\n  } else if (type_ == NT_LOGISTIC) {\n    FuncInplace<FFunc>(no_, output_line);\n  } else if (type_ == NT_POSCLIP) {\n    FuncInplace<ClipFFunc>(no_, output_line);\n  } else if (type_ == NT_SYMCLIP) {\n    FuncInplace<ClipGFunc>(no_, output_line);\n  } else if (type_ == NT_RELU) {\n    FuncInplace<Relu>(no_, output_line);\n  } else if (type_ == NT_SOFTMAX || type_ == NT_SOFTMAX_NO_CTC) {\n    SoftmaxInPlace(no_, output_line);\n  } else if (type_ != NT_LINEAR) {\n    ASSERT_HOST(\"Invalid fully-connected type!\" == nullptr);\n  }\n}\n\nvoid FullyConnected::ForwardTimeStep(const TFloat *d_input, int t, TFloat *output_line) {\n  // input is copied to source_ line-by-line for cache coherency.\n  if (IsTraining() && external_source_ == nullptr) {\n    source_t_.WriteStrided(t, d_input);\n  }\n  weights_.MatrixDotVector(d_input, output_line);\n  ForwardTimeStep(t, output_line);\n}\n\nvoid FullyConnected::ForwardTimeStep(const int8_t *i_input, int t, TFloat *output_line) {\n  // input is copied to source_ line-by-line 
for cache coherency.\n  weights_.MatrixDotVector(i_input, output_line);\n  ForwardTimeStep(t, output_line);\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool FullyConnected::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                              NetworkIO *back_deltas) {\n#ifndef GRAPHICS_DISABLED\n  if (debug) {\n    DisplayBackward(fwd_deltas);\n  }\n#endif\n  back_deltas->Resize(fwd_deltas, ni_);\n  std::vector<NetworkScratch::FloatVec> errors(kNumThreads);\n  for (int i = 0; i < kNumThreads; ++i) {\n    errors[i].Init(no_, scratch);\n  }\n  std::vector<NetworkScratch::FloatVec> temp_backprops;\n  if (needs_to_backprop_) {\n    temp_backprops.resize(kNumThreads);\n    for (int i = 0; i < kNumThreads; ++i) {\n      temp_backprops[i].Init(ni_, scratch);\n    }\n  }\n  int width = fwd_deltas.Width();\n  NetworkScratch::GradientStore errors_t;\n  errors_t.Init(no_, width, scratch);\n#ifdef _OPENMP\n#  pragma omp parallel for num_threads(kNumThreads)\n  for (int t = 0; t < width; ++t) {\n    int thread_id = omp_get_thread_num();\n#else\n  for (int t = 0; t < width; ++t) {\n    int thread_id = 0;\n#endif\n    TFloat *backprop = nullptr;\n    if (needs_to_backprop_) {\n      backprop = temp_backprops[thread_id];\n    }\n    TFloat *curr_errors = errors[thread_id];\n    BackwardTimeStep(fwd_deltas, t, curr_errors, errors_t.get(), backprop);\n    if (backprop != nullptr) {\n      back_deltas->WriteTimeStep(t, backprop);\n    }\n  }\n  FinishBackward(*errors_t.get());\n  if (needs_to_backprop_) {\n    back_deltas->ZeroInvalidElements();\n#if DEBUG_DETAIL > 0\n    tprintf(\"F Backprop:%s\\n\", name_.c_str());\n    back_deltas->Print(10);\n#endif\n    return true;\n  }\n  return false; // No point going further back.\n}\n\nvoid FullyConnected::BackwardTimeStep(const NetworkIO &fwd_deltas, int t, TFloat *curr_errors,\n                                      
TransposedArray *errors_t, TFloat *backprop) {\n  if (type_ == NT_TANH) {\n    acts_.FuncMultiply<GPrime>(fwd_deltas, t, curr_errors);\n  } else if (type_ == NT_LOGISTIC) {\n    acts_.FuncMultiply<FPrime>(fwd_deltas, t, curr_errors);\n  } else if (type_ == NT_POSCLIP) {\n    acts_.FuncMultiply<ClipFPrime>(fwd_deltas, t, curr_errors);\n  } else if (type_ == NT_SYMCLIP) {\n    acts_.FuncMultiply<ClipGPrime>(fwd_deltas, t, curr_errors);\n  } else if (type_ == NT_RELU) {\n    acts_.FuncMultiply<ReluPrime>(fwd_deltas, t, curr_errors);\n  } else if (type_ == NT_SOFTMAX || type_ == NT_SOFTMAX_NO_CTC || type_ == NT_LINEAR) {\n    fwd_deltas.ReadTimeStep(t, curr_errors); // fwd_deltas are the errors.\n  } else {\n    ASSERT_HOST(\"Invalid fully-connected type!\" == nullptr);\n  }\n  // Generate backprop only if needed by the lower layer.\n  if (backprop != nullptr) {\n    weights_.VectorDotMatrix(curr_errors, backprop);\n  }\n  errors_t->WriteStrided(t, curr_errors);\n}\n\nvoid FullyConnected::FinishBackward(const TransposedArray &errors_t) {\n  if (external_source_ == nullptr) {\n    weights_.SumOuterTransposed(errors_t, source_t_, true);\n  } else {\n    weights_.SumOuterTransposed(errors_t, *external_source_, true);\n  }\n}\n\n// Updates the weights using the given learning rate, momentum and adam_beta.\n// num_samples is used in the adam computation iff use_adam_ is true.\nvoid FullyConnected::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {\n  weights_.Update(learning_rate, momentum, adam_beta, num_samples);\n}\n\n// Sums the products of weight updates in *this and other, splitting into\n// positive (same direction) in *same and negative (different direction) in\n// *changed.\nvoid FullyConnected::CountAlternators(const Network &other, TFloat *same, TFloat *changed) const {\n  ASSERT_HOST(other.type() == type_);\n  const auto *fc = static_cast<const FullyConnected *>(&other);\n  weights_.CountAlternators(fc->weights_, same, 
changed);\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/fullyconnected.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        fullyconnected.h\n// Description: Simple feed-forward layer with various non-linearities.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_FULLYCONNECTED_H_\n#define TESSERACT_LSTM_FULLYCONNECTED_H_\n\n#include \"network.h\"\n#include \"networkscratch.h\"\n#include \"tesstypes.h\"\n\nnamespace tesseract {\n\n// C++ Implementation of the Softmax (output) class from lstm.py.\nclass FullyConnected : public Network {\npublic:\n  TESS_API\n  FullyConnected(const std::string &name, int ni, int no, NetworkType type);\n  ~FullyConnected() override = default;\n\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  StaticShape OutputShape(const StaticShape &input_shape) const override;\n\n  std::string spec() const override {\n    std::string spec;\n    if (type_ == NT_TANH) {\n      spec += \"Ft\" + std::to_string(no_);\n    } else if (type_ == NT_LOGISTIC) {\n      spec += \"Fs\" + std::to_string(no_);\n    } else if (type_ == NT_RELU) {\n      spec += \"Fr\" + std::to_string(no_);\n    } else if (type_ == NT_LINEAR) {\n      spec += \"Fl\" + std::to_string(no_);\n    } else if (type_ == NT_POSCLIP) {\n      spec += \"Fp\" + 
std::to_string(no_);\n    } else if (type_ == NT_SYMCLIP) {\n      spec += \"Fn\" + std::to_string(no_);\n    } else if (type_ == NT_SOFTMAX) {\n      spec += \"Fc\" + std::to_string(no_);\n    } else {\n      spec += \"Fm\" + std::to_string(no_);\n    }\n    return spec;\n  }\n\n  // Changes the type to the given type. Used to commute a softmax to a\n  // non-output type for adding on other networks.\n  void ChangeType(NetworkType type) {\n    type_ = type;\n  }\n\n  // Suspends/Enables training by setting the training_ flag. Serialize and\n  // DeSerialize only operate on the run-time data if state is false.\n  void SetEnableTraining(TrainingState state) override;\n\n  // Sets up the network for training. Initializes weights using weights of\n  // scale `range` picked according to the random number generator `randomizer`.\n  int InitWeights(float range, TRand *randomizer) override;\n  // Recursively searches the network for softmaxes with old_no outputs,\n  // and remaps their outputs according to code_map. See network.h for details.\n  int RemapOutputs(int old_no, const std::vector<int> &code_map) override;\n\n  // Converts a float network to an int network.\n  void ConvertToInt() override;\n\n  // Provides debug output on the weights.\n  void DebugWeights() override;\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const override;\n  // Reads from the given file. 
Returns false in case of error.\n  bool DeSerialize(TFile *fp) override;\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n  // Components of Forward so FullyConnected can be reused inside LSTM.\n  void SetupForward(const NetworkIO &input, const TransposedArray *input_transpose);\n  void ForwardTimeStep(int t, TFloat *output_line);\n  void ForwardTimeStep(const TFloat *d_input, int t, TFloat *output_line);\n  void ForwardTimeStep(const int8_t *i_input, int t, TFloat *output_line);\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n  // Components of Backward so FullyConnected can be reused inside LSTM.\n  void BackwardTimeStep(const NetworkIO &fwd_deltas, int t, TFloat *curr_errors,\n                        TransposedArray *errors_t, TFloat *backprop);\n  void FinishBackward(const TransposedArray &errors_t);\n\n  // Updates the weights using the given learning rate, momentum and adam_beta.\n  // num_samples is used in the adam computation iff use_adam_ is true.\n  void Update(float learning_rate, float momentum, float adam_beta, int num_samples) override;\n  // Sums the products of weight updates in *this and other, splitting into\n  // positive (same direction) in *same and negative (different direction) in\n  // *changed.\n  void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const override;\n\nprotected:\n  // Weight arrays of size [no, ni + 1].\n  WeightMatrix weights_;\n  // Transposed copy of input used during training of size [ni, width].\n  TransposedArray source_t_;\n  // Pointer to 
transposed input stored elsewhere. If not null, this is used\n  // in preference to calculating the transpose and storing it in source_t_.\n  const TransposedArray *external_source_;\n  // Activations from forward pass of size [width, no].\n  NetworkIO acts_;\n  // Memory of the integer mode input to forward as softmax always outputs\n  // float, so the information is otherwise lost.\n  bool int_mode_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_FULLYCONNECTED_H_\n"
  },
  {
    "path": "src/lstm/functions.cpp",
    "content": "// Generated code with lookup tables (see generate_lut.py)\n#include \"functions.h\"\nnamespace tesseract {\nconst TFloat TanhTable[] = {\n    0.0,\n    0.00390623013190634,\n    0.007812341058161014,\n    0.011718213587663012,\n    0.015623728558408866,\n    0.019528766852031983,\n    0.023433209408330664,\n    0.02733693723978106,\n    0.031239831446031256,\n    0.03514177322837281,\n    0.039042643904185916,\n    0.04294232492135461,\n    0.04684069787264807,\n    0.0507376445100646,\n    0.05463304675913431,\n    0.0585267867331772,\n    0.062418746747512514,\n    0.0663088093336163,\n    0.07019685725322307,\n    0.0740827735123683,\n    0.07796644137536818,\n    0.08184774437873293,\n    0.0857265663450104,\n    0.08960279139655628,\n    0.09347630396922774,\n    0.09734698882599686,\n    0.10121473107048072,\n    0.10507941616038445,\n    0.10894092992085458,\n    0.11279915855773871,\n    0.11665398867074886,\n    0.12050530726652507,\n    0.1243530017715962,\n    0.12819696004523476,\n    0.13203707039220292,\n    0.1358732215753865,\n    0.1397053028283142,\n    0.14353320386755908,\n    0.14735681490501934,\n    0.15117602666007585,\n    0.1549907303716235,\n    0.15880081780997396,\n    0.16260618128862667,\n    0.16640671367590604,\n    0.17020230840646236,\n    0.1739928594926332,\n    0.177778261535664,\n    0.18155840973678453,\n    0.18533319990813948,\n    0.18910252848357115,\n    0.1928662925292509,\n    0.1966243897541591,\n    0.20037671852040995,\n    0.20412317785341963,\n    0.20786366745191662,\n    0.2115980876977913,\n    0.21532633966578324,\n    0.21904832513300543,\n    0.22276394658830215,\n    0.2264731072414406,\n    0.23017571103213297,\n    0.23387166263888973,\n    0.2375608674877001,\n    0.24124323176054063,\n    0.24491866240370913,\n    0.2485870671359841,\n    0.25224835445660676,\n    0.2559024336530864,\n    0.2595492148088268,\n    0.2631886088105734,\n    0.2668205273556803,\n    0.2704448829591958,\n    
0.2740615889607664,\n    0.27767055953135844,\n    0.2812717096797961,\n    0.28486495525911637,\n    0.2884502129727393,\n    0.2920274003804542,\n    0.2955964359042207,\n    0.2991572388337846,\n    0.3027097293321085,\n    0.3062538284406168,\n    0.3097894580842551,\n    0.31331654107636375,\n    0.31683500112336604,\n    0.3203447628292705,\n    0.32384575169998836,\n    0.3273378941474652,\n    0.330821117493628,\n    0.33429534997414756,\n    0.3377605207420171,\n    0.34121655987094607,\n    0.34466339835857224,\n    0.3481009681294895,\n    0.3515292020380951,\n    0.3549480338712546,\n    0.35835739835078595,\n    0.36175723113576447,\n    0.36514746882464827,\n    0.3685280489572257,\n    0.37189891001638503,\n    0.3752599914297089,\n    0.37861123357089205,\n    0.3819525777609865,\n    0.38528396626947237,\n    0.3886053423151571,\n    0.3919166500669051,\n    0.3952178346441962,\n    0.3985088421175169,\n    0.40178961950858566,\n    0.40506011479041065,\n    0.40832027688718564,\n    0.4115700556740224,\n    0.41480940197652194,\n    0.4180382675701864,\n    0.42125660517967356,\n    0.4244643684778938,\n    0.42766151208495395,\n    0.4308479915669466,\n    0.43402376343458904,\n    0.4371887851417123,\n    0.44034301508360263,\n    0.4434864125951957,\n    0.4466189379491292,\n    0.44974055235364957,\n    0.4528512179503811,\n    0.455950897811955,\n    0.4590395559395018,\n    0.46211715726000974,\n    0.465183667623549,\n    0.4682390538003661,\n    0.47128328347784937,\n    0.4743163252573668,\n    0.4773381486509793,\n    0.4803487240780326,\n    0.48334802286162554,\n    0.4863360172249622,\n    0.4893126802875867,\n    0.4922779860615022,\n    0.4952319094471798,\n    0.49817442622945507,\n    0.5011055130733177,\n    0.5040251475195945,\n    0.5069333079805285,\n    0.5098299737352566,\n    0.5127151249251873,\n    0.515588742549281,\n    0.5184508084592355,\n    0.5213013053545766,\n    0.5241402167776607,\n    0.5269675271085849,\n    
0.5297832215600139,\n    0.5325872861719194,\n    0.5353797078062398,\n    0.5381604741414564,\n    0.540929573667095,\n    0.5436869956781493,\n    0.5464327302694306,\n    0.5491667683298467,\n    0.5518891015366105,\n    0.5545997223493823,\n    0.5572986240043442,\n    0.5599858005082137,\n    0.562661246632196,\n    0.5653249579058751,\n    0.5679769306110504,\n    0.5706171617755161,\n    0.5732456491667901,\n    0.5758623912857893,\n    0.5784673873604582,\n    0.5810606373393494,\n    0.58364214188516,\n    0.5862119023682238,\n    0.5887699208599646,\n    0.5913162001263083,\n    0.5938507436210597,\n    0.5963735554792423,\n    0.5988846405104075,\n    0.6013840041919087,\n    0.6038716526621499,\n    0.6063475927138031,\n    0.6088118317870018,\n    0.6112643779625107,\n    0.6137052399548708,\n    0.6161344271055265,\n    0.6185519493759327,\n    0.6209578173406448,\n    0.623352042180394,\n    0.6257346356751469,\n    0.6281056101971566,\n    0.6304649787039999,\n    0.6328127547316077,\n    0.6351489523872873,\n    0.6374735863427397,\n    0.6397866718270722,\n    0.6420882246198091,\n    0.644378261043901,\n    0.6466567979587338,\n    0.6489238527531414,\n    0.6511794433384197,\n    0.6534235881413468,\n    0.6556563060972085,\n    0.6578776166428312,\n    0.6600875397096249,\n    0.6622860957166337,\n    0.6644733055636015,\n    0.6666491906240467,\n    0.6688137727383526,\n    0.6709670742068737,\n    0.6731091177830563,\n    0.6752399266665784,\n    0.6773595244965075,\n    0.6794679353444786,\n    0.6815651837078924,\n    0.6836512945031366,\n    0.6857262930588289,\n    0.6877902051090853,\n    0.6898430567868128,\n    0.6918848746170292,\n    0.6939156855102073,\n    0.6959355167556515,\n    0.6979443960148981,\n    0.6999423513151498,\n    0.7019294110427386,\n    0.7039056039366212,\n    0.7058709590819044,\n    0.7078255059034086,\n    0.7097692741592597,\n    0.7117022939345188,\n    0.7136245956348475,\n    0.7155362099802073,\n    
0.7174371679985984,\n    0.7193275010198334,\n    0.7212072406693522,\n    0.7230764188620713,\n    0.7249350677962784,\n    0.7267832199475612,\n    0.7286209080627815,\n    0.7304481651540864,\n    0.7322650244929644,\n    0.7340715196043416,\n    0.7358676842607217,\n    0.7376535524763688,\n    0.7394291585015331,\n    0.7411945368167218,\n    0.7429497221270137,\n    0.7446947493564187,\n    0.746429653642282,\n    0.7481544703297355,\n    0.7498692349661928,\n    0.7515739832958932,\n    0.7532687512544888,\n    0.7549535749636815,\n    0.7566284907259059,\n    0.7582935350190586,\n    0.7599487444912773,\n    0.7615941559557649,\n    0.7632298063856651,\n    0.7648557329089823,\n    0.7664719728035536,\n    0.7680785634920665,\n    0.7696755425371268,\n    0.771262947636375,\n    0.7728408166176522,\n    0.7744091874342136,\n    0.7759680981599926,\n    0.7775175869849139,\n    0.7790576922102551,\n    0.7805884522440585,\n    0.782109905596592,\n    0.7836220908758593,\n    0.7851250467831598,\n    0.7866188121086977,\n    0.7881034257272403,\n    0.7895789265938273,\n    0.7910453537395261,\n    0.7925027462672405,\n    0.7939511433475658,\n    0.7953905842146939,\n    0.7968211081623683,\n    0.7982427545398869,\n    0.799655562748155,\n    0.8010595722357858,\n    0.8024548224952512,\n    0.80384135305908,\n    0.8052192034961044,\n    0.8065884134077569,\n    0.8079490224244125,\n    0.809301070201781,\n    0.8106445964173468,\n    0.8119796407668555,\n    0.8133062429608491,\n    0.8146244427212479,\n    0.81593427977798,\n    0.8172357938656573,\n    0.8185290247202979,\n    0.8198140120760964,\n    0.8210907956622387,\n    0.8223594151997646,\n    0.8236199103984744,\n    0.8248723209538826,\n    0.8261166865442167,\n    0.8273530468274595,\n    0.8285814414384387,\n    0.8298019099859595,\n    0.8310144920499815,\n    0.8322192271788401,\n    0.8334161548865118,\n    0.8346053146499223,\n    0.8357867459062983,\n    0.8369604880505621,\n    
0.8381265804327689,\n    0.8392850623555859,\n    0.8404359730718141,\n    0.8415793517819513,\n    0.8427152376317971,\n    0.8438436697100982,\n    0.8449646870462346,\n    0.8460783286079477,\n    0.847184633299106,\n    0.8482836399575129,\n    0.849375387352753,\n    0.8504599141840773,\n    0.8515372590783284,\n    0.852607460587903,\n    0.8536705571887534,\n    0.854726587278426,\n    0.855775589174139,\n    0.856817601110895,\n    0.8578526612396329,\n    0.8588808076254141,\n    0.8599020782456466,\n    0.8609165109883439,\n    0.86192414365042,\n    0.8629250139360195,\n    0.8639191594548821,\n    0.8649066177207418,\n    0.8658874261497609,\n    0.8668616220589966,\n    0.8678292426649025,\n    0.8687903250818614,\n    0.869744906320753,\n    0.8706930232875522,\n    0.8716347127819606,\n    0.8725700114960693,\n    0.8734989560130535,\n    0.8744215828058973,\n    0.8753379282361504,\n    0.8762480285527146,\n    0.8771519198906597,\n    0.8780496382700704,\n    0.878941219594922,\n    0.8798266996519848,\n    0.8807061141097572,\n    0.8815794985174281,\n    0.8824468883038663,\n    0.8833083187766378,\n    0.8841638251210511,\n    0.8850134423992284,\n    0.8858572055492046,\n    0.8866951493840524,\n    0.8875273085910325,\n    0.8883537177307711,\n    0.8891744112364617,\n    0.8899894234130917,\n    0.8907987884366952,\n    0.8916025403536288,\n    0.892400713079872,\n    0.8931933404003516,\n    0.8939804559682891,\n    0.8947620933045725,\n    0.8955382857971486,\n    0.8963090667004403,\n    0.8970744691347852,\n    0.897834526085895,\n    0.898589270404339,\n    0.8993387348050462,\n    0.9000829518668307,\n    0.9008219540319365,\n    0.9015557736056028,\n    0.9022844427556502,\n    0.9030079935120863,\n    0.9037264577667303,\n    0.9044398672728585,\n    0.9051482536448664,\n    0.9058516483579518,\n    0.9065500827478152,\n    0.9072435880103779,\n    0.9079321952015192,\n    0.9086159352368307,\n    0.9092948388913872,\n    
0.9099689367995354,\n    0.9106382594546996,\n    0.9113028372092032,\n    0.9119627002741072,\n    0.9126178787190637,\n    0.9132684024721864,\n    0.9139143013199357,\n    0.9145556049070188,\n    0.9151923427363066,\n    0.9158245441687624,\n    0.9164522384233875,\n    0.9170754545771808,\n    0.9176942215651103,\n    0.9183085681801013,\n    0.9189185230730362,\n    0.919524114752768,\n    0.9201253715861468,\n    0.92072232179806,\n    0.9213149934714836,\n    0.9219034145475472,\n    0.9224876128256104,\n    0.9230676159633514,\n    0.9236434514768671,\n    0.9242151467407852,\n    0.9247827289883869,\n    0.925346225311741,\n    0.9259056626618496,\n    0.9264610678488026,\n    0.9270124675419444,\n    0.9275598882700505,\n    0.9281033564215134,\n    0.9286428982445389,\n    0.9291785398473523,\n    0.9297103071984134,\n    0.9302382261266414,\n    0.9307623223216483,\n    0.931282621333982,\n    0.9317991485753769,\n    0.9323119293190149,\n    0.932820988699792,\n    0.9333263517145964,\n    0.9338280432225917,\n    0.9343260879455095,\n    0.9348205104679493,\n    0.9353113352376858,\n    0.9357985865659837,\n    0.9362822886279191,\n    0.936762465462709,\n    0.9372391409740465,\n    0.937712338930443,\n    0.9381820829655777,\n    0.9386483965786518,\n    0.9391113031347509,\n    0.939570825865212,\n    0.940026987867997,\n    0.9404798121080715,\n    0.9409293214177905,\n    0.9413755384972874,\n    0.9418184859148706,\n    0.9422581861074242,\n    0.9426946613808131,\n    0.9431279339102947,\n    0.943558025740934,\n    0.9439849587880237,\n    0.9444087548375099,\n    0.94482943554642,\n    0.9452470224432973,\n    0.9456615369286379,\n    0.9460730002753329,\n    0.9464814336291137,\n    0.9468868580090015,\n    0.9472892943077605,\n    0.9476887632923543,\n    0.9480852856044063,\n    0.948478881760663,\n    0.9488695721534608,\n    0.9492573770511957,\n    0.9496423165987963,\n    0.9500244108181993,\n    0.9504036796088287,\n    
0.9507801427480761,\n    0.9511538198917856,\n    0.9515247305747395,\n    0.9518928942111476,\n    0.9522583300951382,\n    0.9526210574012511,\n    0.952981095184934,\n    0.953338462383039,\n    0.9536931778143231,\n    0.9540452601799487,\n    0.9543947280639877,\n    0.9547415999339255,\n    0.9550858941411678,\n    0.955427628921549,\n    0.955766822395841,\n    0.9561034925702643,\n    0.9564376573370004,\n    0.9567693344747047,\n    0.9570985416490212,\n    0.9574252964130979,\n    0.9577496162081036,\n    0.958071518363745,\n    0.9583910200987853,\n    0.9587081385215631,\n    0.9590228906305123,\n    0.9593352933146825,\n    0.95964536335426,\n    0.9599531174210895,\n    0.9602585720791958,\n    0.9605617437853068,\n    0.9608626488893753,\n    0.9611613036351028,\n    0.9614577241604624,\n    0.9617519264982222,\n    0.9620439265764688,\n    0.9623337402191307,\n    0.9626213831465018,\n    0.9629068709757654,\n    0.9631902192215166,\n    0.963471443296286,\n    0.9637505585110627,\n    0.9640275800758169,\n    0.9643025231000221,\n    0.9645754025931771,\n    0.9648462334653282,\n    0.9651150305275893,\n    0.9653818084926636,\n    0.9656465819753632,\n    0.9659093654931283,\n    0.966170173466547,\n    0.9664290202198726,\n    0.9666859199815419,\n    0.9669408868846915,\n    0.9671939349676739,\n    0.967445078174573,\n    0.9676943303557181,\n    0.9679417052681973,\n    0.9681872165763705,\n    0.9684308778523805,\n    0.9686727025766634,\n    0.9689127041384588,\n    0.9691508958363177,\n    0.9693872908786104,\n    0.9696219023840321,\n    0.9698547433821086,\n    0.9700858268137,\n    0.9703151655315034,\n    0.9705427723005546,\n    0.9707686597987281,\n    0.970992840617236,\n    0.9712153272611261,\n    0.9714361321497776,\n    0.9716552676173966,\n    0.971872745913509,\n    0.9720885792034534,\n    0.9723027795688709,\n    0.9725153590081953,\n    0.9727263294371402,\n    0.9729357026891858,\n    0.973143490516063,\n    
0.9733497045882371,\n    0.9735543564953897,\n    0.973757457746898,\n    0.9739590197723144,\n    0.9741590539218421,\n    0.9743575714668116,\n    0.9745545836001535,\n    0.9747501014368711,\n    0.9749441360145104,\n    0.9751366982936284,\n    0.9753277991582605,\n    0.9755174494163856,\n    0.9757056598003897,\n    0.9758924409675271,\n    0.9760778035003809,\n    0.9762617579073211,\n    0.9764443146229608,\n    0.9766254840086112,\n    0.9768052763527341,\n    0.9769837018713932,\n    0.9771607707087032,\n    0.9773364929372771,\n    0.9775108785586718,\n    0.977683937503832,\n    0.9778556796335314,\n    0.9780261147388136,\n    0.9781952525414295,\n    0.9783631026942736,\n    0.9785296747818186,\n    0.9786949783205476,\n    0.9788590227593849,\n    0.979021817480124,\n    0.9791833717978551,\n    0.9793436949613896,\n    0.9795027961536827,\n    0.9796606844922547,\n    0.97981736902961,\n    0.9799728587536537,\n    0.9801271625881075,\n    0.9802802893929224,\n    0.98043224796469,\n    0.9805830470370519,\n    0.980732695281107,\n    0.9808812013058169,\n    0.9810285736584096,\n    0.9811748208247804,\n    0.9813199512298918,\n    0.9814639732381714,\n    0.9816068951539065,\n    0.9817487252216389,\n    0.9818894716265558,\n    0.9820291424948796,\n    0.982167745894256,\n    0.9823052898341392,\n    0.9824417822661764,\n    0.9825772310845889,\n    0.9827116441265529,\n    0.982845029172576,\n    0.9829773939468749,\n    0.9831087461177479,\n    0.9832390932979479,\n    0.9833684430450518,\n    0.983496802861829,\n    0.9836241801966076,\n    0.9837505824436382,\n    0.9838760169434567,\n    0.9840004909832442,\n    0.9841240117971853,\n    0.9842465865668251,\n    0.9843682224214227,\n    0.9844889264383049,\n    0.9846087056432157,\n    0.9847275670106655,\n    0.9848455174642784,\n    0.9849625638771357,\n    0.9850787130721197,\n    0.9851939718222548,\n    0.9853083468510458,\n    0.9854218448328161,\n    0.9855344723930419,\n    
0.9856462361086861,\n    0.9857571425085297,\n    0.9858671980735016,\n    0.9859764092370058,\n    0.9860847823852472,\n    0.9861923238575563,\n    0.9862990399467101,\n    0.9864049368992532,\n    0.986510020915816,\n    0.9866142981514303,\n    0.9867177747158452,\n    0.9868204566738389,\n    0.98692235004553,\n    0.9870234608066866,\n    0.9871237948890335,\n    0.9872233581805574,\n    0.9873221565258112,\n    0.9874201957262151,\n    0.9875174815403567,\n    0.9876140196842896,\n    0.9877098158318297,\n    0.9878048756148494,\n    0.9878992046235708,\n    0.987992808406857,\n    0.9880856924725008,\n    0.9881778622875124,\n    0.9882693232784058,\n    0.988360080831482,\n    0.9884501402931117,\n    0.9885395069700158,\n    0.9886281861295444,\n    0.9887161829999533,\n    0.9888035027706802,\n    0.9888901505926178,\n    0.9889761315783859,\n    0.989061450802602,\n    0.9891461133021493,\n    0.9892301240764441,\n    0.9893134880877008,\n    0.9893962102611957,\n    0.9894782954855286,\n    0.9895597486128832,\n    0.9896405744592859,\n    0.9897207778048623,\n    0.9898003633940932,\n    0.9898793359360676,\n    0.9899577001047354,\n    0.9900354605391575,\n    0.9901126218437549,\n    0.990189188588556,\n    0.9902651653094419,\n    0.9903405565083914,\n    0.9904153666537223,\n    0.9904896001803339,\n    0.990563261489945,\n    0.9906363549513325,\n    0.990708884900568,\n    0.9907808556412516,\n    0.9908522714447464,\n    0.9909231365504092,\n    0.9909934551658216,\n    0.9910632314670178,\n    0.9911324695987126,\n    0.991201173674527,\n    0.9912693477772121,\n    0.991336995958872,\n    0.9914041222411856,\n    0.9914707306156254,\n    0.9915368250436774,\n    0.9916024094570562,\n    0.9916674877579225,\n    0.9917320638190951,\n    0.9917961414842655,\n    0.9918597245682077,\n    0.9919228168569889,\n    0.9919854221081772,\n    0.9920475440510494,\n    0.9921091863867959,\n    0.9921703527887257,\n    0.9922310469024684,\n    
0.9922912723461766,\n    0.9923510327107256,\n    0.9924103315599122,\n    0.9924691724306522,\n    0.992527558833177,\n    0.9925854942512276,\n    0.9926429821422486,\n    0.9927000259375803,\n    0.9927566290426497,\n    0.9928127948371598,\n    0.9928685266752779,\n    0.9929238278858229,\n    0.9929787017724506,\n    0.9930331516138386,\n    0.9930871806638691,\n    0.9931407921518111,\n    0.9931939892825006,\n    0.993246775236521,\n    0.9932991531703803,\n    0.9933511262166888,\n    0.9934026974843341,\n    0.9934538700586567,\n    0.9935046470016224,\n    0.9935550313519951,\n    0.9936050261255072,\n    0.9936546343150297,\n    0.9937038588907408,\n    0.9937527028002934,\n    0.9938011689689811,\n    0.9938492602999035,\n    0.9938969796741302,\n    0.9939443299508632,\n    0.9939913139675994,\n    0.9940379345402902,\n    0.9940841944635017,\n    0.9941300965105725,\n    0.9941756434337713,\n    0.9942208379644527,\n    0.9942656828132125,\n    0.994310180670041,\n    0.9943543342044768,\n    0.994398146065758,\n    0.9944416188829728,\n    0.9944847552652092,\n    0.9945275578017039,\n    0.9945700290619897,\n    0.9946121715960415,\n    0.9946539879344227,\n    0.9946954805884287,\n    0.9947366520502309,\n    0.9947775047930187,\n    0.9948180412711414,\n    0.9948582639202478,\n    0.9948981751574265,\n    0.9949377773813436,\n    0.9949770729723802,\n    0.9950160642927693,\n    0.9950547536867305,\n    0.9950931434806051,\n    0.9951312359829894,\n    0.9951690334848673,\n    0.995206538259742,\n    0.9952437525637667,\n    0.9952806786358739,\n    0.995317318697905,\n    0.9953536749547375,\n    0.9953897495944124,\n    0.9954255447882603,\n    0.9954610626910261,\n    0.995496305440994,\n    0.99553127516011,\n    0.9955659739541052,\n    0.995600403912617,\n    0.9956345671093096,\n    0.9956684656019946,\n    0.9957021014327492,\n    0.9957354766280346,\n    0.9957685931988137,\n    0.995801453140667,\n    0.9958340584339084,\n    
0.9958664110436998,\n    0.9958985129201653,\n    0.995930365998504,\n    0.9959619721991029,\n    0.9959933334276472,\n    0.9960244515752322,\n    0.9960553285184723,\n    0.9960859661196108,\n    0.996116366226627,\n    0.996146530673345,\n    0.9961764612795397,\n    0.9962061598510425,\n    0.9962356281798472,\n    0.9962648680442134,\n    0.9962938812087712,\n    0.9963226694246229,\n    0.9963512344294461,\n    0.9963795779475947,\n    0.9964077016901992,\n    0.9964356073552673,\n    0.9964632966277819,\n    0.996490771179801,\n    0.9965180326705543,\n    0.9965450827465406,\n    0.996571923041624,\n    0.9965985551771295,\n    0.9966249807619377,\n    0.9966512013925798,\n    0.9966772186533295,\n    0.9967030341162975,\n    0.9967286493415222,\n    0.9967540658770623,\n    0.9967792852590868,\n    0.9968043090119653,\n    0.9968291386483575,\n    0.996853775669302,\n    0.9968782215643042,\n    0.996902477811424,\n    0.9969265458773626,\n    0.9969504272175485,\n    0.9969741232762231,\n    0.996997635486526,\n    0.9970209652705786,\n    0.997044114039568,\n    0.9970670831938309,\n    0.9970898741229343,\n    0.997112488205759,\n    0.9971349268105798,\n    0.9971571912951466,\n    0.997179283006764,\n    0.997201203282371,\n    0.99722295344862,\n    0.9972445348219545,\n    0.9972659487086872,\n    0.9972871964050772,\n    0.9973082791974058,\n    0.9973291983620532,\n    0.9973499551655737,\n    0.99737055086477,\n    0.9973909867067684,\n    0.9974112639290914,\n    0.9974313837597321,\n    0.9974513474172254,\n    0.9974711561107216,\n    0.9974908110400564,\n    0.9975103133958233,\n    0.9975296643594429,\n    0.9975488651032339,\n    0.9975679167904815,\n    0.9975868205755072,\n    0.9976055776037366,\n    0.9976241890117673,\n    0.9976426559274368,\n    0.997660979469889,\n    0.9976791607496401,\n    0.9976972008686456,\n    0.9977151009203645,\n    0.997732861989825,\n    0.9977504851536886,\n    0.9977679714803138,\n    
0.9977853220298202,\n    0.9978025378541507,\n    0.9978196199971345,\n    0.9978365694945488,\n    0.9978533873741807,\n    0.9978700746558881,\n    0.9978866323516602,\n    0.997903061465678,\n    0.9979193629943737,\n    0.9979355379264904,\n    0.9979515872431404,\n    0.9979675119178639,\n    0.9979833129166874,\n    0.99799899119818,\n    0.9980145477135118,\n    0.9980299834065098,\n    0.9980452992137143,\n    0.9980604960644348,\n    0.9980755748808052,\n    0.9980905365778389,\n    0.9981053820634833,\n    0.9981201122386743,\n    0.9981347279973892,\n    0.9981492302267008,\n    0.9981636198068301,\n    0.9981778976111987,\n    0.998192064506481,\n    0.9982061213526561,\n    0.9982200690030588,\n    0.9982339083044309,\n    0.9982476400969715,\n    0.9982612652143875,\n    0.9982747844839432,\n    0.9982881987265095,\n    0.9983015087566136,\n    0.9983147153824872,\n    0.9983278194061147,\n    0.9983408216232817,\n    0.9983537228236223,\n    0.9983665237906661,\n    0.9983792253018854,\n    0.9983918281287415,\n    0.9984043330367313,\n    0.9984167407854325,\n    0.9984290521285495,\n    0.9984412678139584,\n    0.9984533885837515,\n    0.9984654151742826,\n    0.9984773483162099,\n    0.998489188734541,\n    0.998500937148675,\n    0.9985125942724473,\n    0.9985241608141705,\n    0.9985356374766783,\n    0.998547024957367,\n    0.9985583239482376,\n    0.9985695351359368,\n    0.9985806592017988,\n    0.9985916968218859,\n    0.9986026486670286,\n    0.998613515402867,\n    0.9986242976898895,\n    0.9986349961834732,\n    0.9986456115339228,\n    0.9986561443865103,\n    0.9986665953815128,\n    0.9986769651542518,\n    0.9986872543351306,\n    0.9986974635496731,\n    0.9987075934185605,\n    0.998717644557669,\n    0.9987276175781065,\n    0.9987375130862499,\n    0.9987473316837812,\n    0.9987570739677233,\n    0.9987667405304765,\n    0.998776331959854,\n    0.998785848839117,\n    0.9987952917470095,\n    0.9988046612577943,\n    
0.9988139579412859,\n    0.9988231823628861,\n    0.9988323350836175,\n    0.9988414166601571,\n    0.9988504276448703,\n    0.9988593685858433,\n    0.9988682400269172,\n    0.9988770425077198,\n    0.9988857765636986,\n    0.9988944427261528,\n    0.9989030415222654,\n    0.9989115734751349,\n    0.9989200391038069,\n    0.9989284389233052,\n    0.998936773444663,\n    0.9989450431749537,\n    0.998953248617321,\n    0.9989613902710102,\n    0.9989694686313969,\n    0.9989774841900182,\n    0.9989854374346016,\n    0.9989933288490944,\n    0.9990011589136932,\n    0.9990089281048729,\n    0.9990166368954146,\n    0.9990242857544355,\n    0.9990318751474156,\n    0.9990394055362274,\n    0.9990468773791623,\n    0.9990542911309593,\n    0.9990616472428321,\n    0.999068946162496,\n    0.9990761883341952,\n    0.99908337419873,\n    0.9990905041934823,\n    0.9990975787524436,\n    0.9991045983062395,\n    0.9991115632821571,\n    0.9991184741041698,\n    0.9991253311929634,\n    0.9991321349659616,\n    0.9991388858373508,\n    0.9991455842181054,\n    0.9991522305160125,\n    0.9991588251356968,\n    0.9991653684786446,\n    0.9991718609432283,\n    0.9991783029247308,\n    0.9991846948153684,\n    0.9991910370043159,\n    0.9991973298777287,\n    0.9992035738187673,\n    0.9992097692076197,\n    0.9992159164215247,\n    0.9992220158347948,\n    0.9992280678188384,\n    0.9992340727421829,\n    0.9992400309704963,\n    0.9992459428666097,\n    0.9992518087905395,\n    0.9992576290995084,\n    0.9992634041479679,\n    0.9992691342876189,\n    0.9992748198674336,\n    0.9992804612336763,\n    0.9992860587299247,\n    0.9992916126970901,\n    0.9992971234734385,\n    0.9993025913946108,\n    0.9993080167936436,\n    0.9993134000009886,\n    0.9993187413445331,\n    0.99932404114962,\n    0.999329299739067,\n    0.9993345174331864,\n    0.9993396945498044,\n    0.9993448314042807,\n    0.999349928309527,\n    0.9993549855760263,\n    0.9993600035118521,\n    
0.999364982422686,\n    0.9993699226118373,\n    0.9993748243802607,\n    0.9993796880265746,\n    0.9993845138470797,\n    0.9993893021357763,\n    0.9993940531843823,\n    0.9993987672823514,\n    0.9994034447168897,\n    0.9994080857729738,\n    0.999412690733368,\n    0.9994172598786412,\n    0.9994217934871837,\n    0.999426291835225,\n    0.9994307551968495,\n    0.9994351838440136,\n    0.9994395780465622,\n    0.999443938072245,\n    0.9994482641867324,\n    0.9994525566536324,\n    0.9994568157345057,\n    0.9994610416888823,\n    0.9994652347742766,\n    0.9994693952462034,\n    0.9994735233581935,\n    0.9994776193618086,\n    0.9994816835066569,\n    0.9994857160404085,\n    0.9994897172088095,\n    0.9994936872556981,\n    0.9994976264230185,\n    0.9995015349508359,\n    0.9995054130773511,\n    0.9995092610389148,\n    0.999513079070042,\n    0.9995168674034266,\n    0.9995206262699546,\n    0.9995243558987194,\n    0.9995280565170341,\n    0.9995317283504472,\n    0.9995353716227545,\n    0.9995389865560137,\n    0.9995425733705577,\n    0.999546132285008,\n    0.9995496635162875,\n    0.9995531672796345,\n    0.9995566437886149,\n    0.9995600932551358,\n    0.9995635158894584,\n    0.9995669119002097,\n    0.9995702814943971,\n    0.9995736248774187,\n    0.9995769422530777,\n    0.9995802338235937,\n    0.9995834997896154,\n    0.9995867403502323,\n    0.9995899557029877,\n    0.9995931460438896,\n    0.9995963115674235,\n    0.9995994524665637,\n    0.9996025689327853,\n    0.9996056611560755,\n    0.9996087293249456,\n    0.999611773626442,\n    0.999614794246158,\n    0.9996177913682445,\n    0.9996207651754216,\n    0.9996237158489901,\n    0.9996266435688413,\n    0.9996295485134694,\n    0.999632430859981,\n    0.9996352907841067,\n    0.9996381284602118,\n    0.9996409440613064,\n    0.9996437377590562,\n    0.9996465097237929,\n    0.9996492601245245,\n    0.9996519891289458,\n    0.9996546969034483,\n    0.9996573836131304,\n    
0.9996600494218074,\n    0.9996626944920217,\n    0.9996653189850525,\n    0.9996679230609253,\n    0.9996705068784224,\n    0.9996730705950916,\n    0.9996756143672566,\n    0.9996781383500262,\n    0.9996806426973033,\n    0.9996831275617949,\n    0.9996855930950214,\n    0.9996880394473251,\n    0.9996904667678801,\n    0.9996928752047008,\n    0.9996952649046514,\n    0.9996976360134545,\n    0.9996999886757001,\n    0.9997023230348543,\n    0.9997046392332679,\n    0.9997069374121856,\n    0.999709217711754,\n    0.9997114802710304,\n    0.9997137252279912,\n    0.9997159527195401,\n    0.9997181628815168,\n    0.9997203558487053,\n    0.9997225317548416,\n    0.999724690732622,\n    0.9997268329137116,\n    0.9997289584287519,\n    0.9997310674073687,\n    0.9997331599781802,\n    0.9997352362688051,\n    0.9997372964058695,\n    0.9997393405150153,\n    0.9997413687209078,\n    0.9997433811472431,\n    0.9997453779167558,\n    0.9997473591512258,\n    0.999749324971487,\n    0.9997512754974336,\n    0.9997532108480275,\n    0.9997551311413061,\n    0.9997570364943889,\n    0.9997589270234848,\n    0.9997608028438996,\n    0.9997626640700421,\n    0.9997645108154323,\n    0.9997663431927069,\n    0.9997681613136273,\n    0.999769965289086,\n    0.9997717552291131,\n    0.9997735312428833,\n    0.9997752934387226,\n    0.9997770419241145,\n    0.9997787768057073,\n    0.9997804981893194,\n    0.9997822061799468,\n    0.9997839008817695,\n    0.9997855823981568,\n    0.9997872508316746,\n    0.9997889062840913,\n    0.9997905488563841,\n    0.9997921786487448,\n    0.9997937957605862,\n    0.9997954002905484,\n    0.9997969923365041,\n    0.9997985719955652,\n    0.9998001393640886,\n    0.9998016945376816,\n    0.9998032376112084,\n    0.9998047686787952,\n    0.9998062878338367,\n    0.9998077951690011,\n    0.9998092907762359,\n    0.9998107747467739,\n    0.9998122471711384,\n    0.9998137081391485,\n    0.9998151577399251,\n    0.9998165960618961,\n    
0.9998180231928016,\n    0.9998194392196994,\n    0.9998208442289704,\n    0.9998222383063238,\n    0.9998236215368022,\n    0.9998249940047867,\n    0.9998263557940027,\n    0.9998277069875239,\n    0.9998290476677782,\n    0.9998303779165528,\n    0.9998316978149984,\n    0.9998330074436348,\n    0.9998343068823555,\n    0.9998355962104329,\n    0.9998368755065229,\n    0.9998381448486693,\n    0.9998394043143093,\n    0.9998406539802779,\n    0.9998418939228124,\n    0.9998431242175573,\n    0.9998443449395685,\n    0.9998455561633186,\n    0.9998467579627008,\n    0.9998479504110337,\n    0.9998491335810655,\n    0.9998503075449787,\n    0.9998514723743946,\n    0.9998526281403772,\n    0.9998537749134379,\n    0.9998549127635397,\n    0.9998560417601013,\n    0.999857161972002,\n    0.9998582734675849,\n    0.9998593763146617,\n    0.9998604705805166,\n    0.9998615563319107,\n    0.9998626336350855,\n    0.9998637025557677,\n    0.9998647631591725,\n    0.999865815510008,\n    0.9998668596724789,\n    0.9998678957102907,\n    0.9998689236866533,\n    0.9998699436642847,\n    0.9998709557054156,\n    0.9998719598717923,\n    0.999872956224681,\n    0.9998739448248711,\n    0.9998749257326796,\n    0.999875899007954,\n    0.9998768647100764,\n    0.999877822897967,\n    0.9998787736300878,\n    0.999879716964446,\n    0.9998806529585975,\n    0.9998815816696507,\n    0.9998825031542696,\n    0.9998834174686777,\n    0.9998843246686608,\n    0.9998852248095712,\n    0.9998861179463303,\n    0.9998870041334325,\n    0.999887883424948,\n    0.9998887558745271,\n    0.9998896215354018,\n    0.999890480460391,\n    0.999891332701902,\n    0.9998921783119349,\n    0.9998930173420851,\n    0.9998938498435467,\n    0.9998946758671154,\n    0.9998954954631922,\n    0.9998963086817857,\n    0.9998971155725156,\n    0.9998979161846154,\n    0.9998987105669361,\n    0.9998994987679483,\n    0.9999002808357454,\n    0.9999010568180473,\n    0.9999018267622022,\n    
0.99990259071519,\n    0.9999033487236255,\n    0.9999041008337604,\n    0.9999048470914869,\n    0.99990558754234,\n    0.999906322231501,\n    0.999907051203799,\n    0.9999077745037147,\n    0.999908492175383,\n    0.9999092042625951,\n    0.9999099108088018,\n    0.9999106118571157,\n    0.9999113074503141,\n    0.9999119976308415,\n    0.9999126824408122,\n    0.9999133619220131,\n    0.9999140361159056,\n    0.9999147050636289,\n    0.999915368806002,\n    0.9999160273835264,\n    0.9999166808363885,\n    0.9999173292044622,\n    0.999917972527311,\n    0.9999186108441906,\n    0.9999192441940518,\n    0.9999198726155416,\n    0.999920496147007,\n    0.9999211148264961,\n    0.9999217286917618,\n    0.9999223377802624,\n    0.9999229421291654,\n    0.999923541775349,\n    0.9999241367554044,\n    0.9999247271056381,\n    0.9999253128620742,\n    0.9999258940604568,\n    0.9999264707362516,\n    0.9999270429246484,\n    0.9999276106605633,\n    0.999928173978641,\n    0.9999287329132562,\n    0.9999292874985165,\n    0.999929837768264,\n    0.9999303837560778,\n    0.9999309254952752,\n    0.9999314630189149,\n    0.999931996359798,\n    0.9999325255504705,\n    0.9999330506232254,\n    0.9999335716101041,\n    0.9999340885428991,\n    0.9999346014531553,\n    0.9999351103721722,\n    0.999935615331006,\n    0.9999361163604709,\n    0.9999366134911416,\n    0.9999371067533551,\n    0.9999375961772121,\n    0.9999380817925791,\n    0.9999385636290905,\n    0.9999390417161499,\n    0.9999395160829321,\n    0.9999399867583851,\n    0.9999404537712318,\n    0.9999409171499711,\n    0.9999413769228808,\n    0.9999418331180182,\n    0.9999422857632225,\n    0.9999427348861165,\n    0.9999431805141078,\n    0.9999436226743907,\n    0.9999440613939483,\n    0.9999444966995535,\n    0.9999449286177708,\n    0.9999453571749584,\n    0.9999457823972688,\n    0.9999462043106518,\n    0.9999466229408548,\n    0.9999470383134248,\n    0.9999474504537104,\n    
0.9999478593868627,\n    0.9999482651378373,\n    0.9999486677313955,\n    0.9999490671921059,\n    0.9999494635443461,\n    0.9999498568123041,\n    0.9999502470199793,\n    0.9999506341911848,\n    0.9999510183495482,\n    0.9999513995185134,\n    0.9999517777213418,\n    0.9999521529811138,\n    0.9999525253207303,\n    0.9999528947629139,\n    0.9999532613302106,\n    0.9999536250449906,\n    0.9999539859294506,\n    0.9999543440056142,\n    0.9999546992953338,\n    0.9999550518202914,\n    0.999955401602001,\n    0.9999557486618086,\n    0.9999560930208944,\n    0.9999564347002737,\n    0.9999567737207985,\n    0.9999571101031581,\n    0.9999574438678814,\n    0.999957775035337,\n    0.9999581036257355,\n    0.9999584296591298,\n    0.9999587531554174,\n    0.9999590741343402,\n    0.9999593926154869,\n    0.9999597086182941,\n    0.9999600221620466,\n    0.9999603332658794,\n    0.9999606419487787,\n    0.9999609482295827,\n    0.9999612521269834,\n    0.9999615536595269,\n    0.9999618528456155,\n    0.9999621497035079,\n    0.999962444251321,\n    0.9999627365070306,\n    0.9999630264884726,\n    0.9999633142133443,\n    0.999963599699205,\n    0.9999638829634777,\n    0.9999641640234497,\n    0.9999644428962736,\n    0.9999647195989689,\n    0.9999649941484224,\n    0.9999652665613896,\n    0.9999655368544957,\n    0.9999658050442364,\n    0.9999660711469789,\n    0.9999663351789635,\n    0.9999665971563038,\n    0.9999668570949881,\n    0.9999671150108803,\n    0.9999673709197209,\n    0.9999676248371276,\n    0.9999678767785973,\n    0.9999681267595056,\n    0.9999683747951089,\n    0.9999686209005445,\n    0.9999688650908324,\n    0.9999691073808752,\n    0.99996934778546,\n    0.9999695863192587,\n    0.9999698229968288,\n    0.9999700578326147,\n    0.9999702908409487,\n    0.999970522036051,\n    0.9999707514320316,\n    0.9999709790428906,\n    0.9999712048825189,\n    0.9999714289646998,\n    0.9999716513031089,\n    0.9999718719113158,\n    
0.9999720908027839,\n    0.9999723079908724,\n    0.9999725234888366,\n    0.9999727373098282,\n    0.9999729494668967,\n    0.9999731599729904,\n    0.9999733688409564,\n    0.9999735760835422,\n    0.9999737817133959,\n    0.9999739857430671,\n    0.9999741881850078,\n    0.9999743890515735,\n    0.9999745883550231,\n    0.9999747861075201,\n    0.9999749823211337,\n    0.9999751770078388,\n    0.9999753701795174,\n    0.999975561847959,\n    0.9999757520248613,\n    0.9999759407218309,\n    0.9999761279503843,\n    0.9999763137219481,\n    0.9999764980478602,\n    0.9999766809393703,\n    0.9999768624076404,\n    0.9999770424637459,\n    0.9999772211186756,\n    0.9999773983833331,\n    0.9999775742685373,\n    0.9999777487850224,\n    0.9999779219434396,\n    0.999978093754357,\n    0.9999782642282604,\n    0.999978433375554,\n    0.9999786012065612,\n    0.999978767731525,\n    0.9999789329606088,\n    0.9999790969038965,\n    0.999979259571394,\n    0.9999794209730292,\n    0.9999795811186526,\n    0.9999797400180382,\n    0.999979897680884,\n    0.9999800541168123,\n    0.9999802093353708,\n    0.9999803633460327,\n    0.9999805161581975,\n    0.9999806677811918,\n    0.9999808182242692,\n    0.9999809674966117,\n    0.9999811156073296,\n    0.9999812625654625,\n    0.9999814083799794,\n    0.9999815530597798,\n    0.9999816966136937,\n    0.9999818390504825,\n    0.9999819803788396,\n    0.9999821206073904,\n    0.9999822597446933,\n    0.9999823977992404,\n    0.9999825347794573,\n    0.9999826706937042,\n    0.9999828055502763,\n    0.9999829393574042,\n    0.9999830721232544,\n    0.9999832038559301,\n    0.9999833345634711,\n    0.9999834642538548,\n    0.9999835929349966,\n    0.9999837206147502,\n    0.9999838473009081,\n    0.9999839730012023,\n    0.9999840977233048,\n    0.9999842214748275,\n    0.9999843442633234,\n    0.9999844660962866,\n    0.9999845869811527,\n    0.9999847069252997,\n    0.9999848259360483,\n    0.9999849440206618,\n    
0.9999850611863474,\n    0.9999851774402561,\n    0.9999852927894829,\n    0.9999854072410681,\n    0.9999855208019969,\n    0.9999856334792003,\n    0.9999857452795553,\n    0.9999858562098853,\n    0.9999859662769609,\n    0.9999860754874997,\n    0.9999861838481672,\n    0.9999862913655767,\n    0.9999863980462906,\n    0.9999865038968199,\n    0.9999866089236249,\n    0.9999867131331157,\n    0.9999868165316526,\n    0.9999869191255462,\n    0.9999870209210582,\n    0.9999871219244014,\n    0.9999872221417405,\n    0.999987321579192,\n    0.9999874202428249,\n    0.9999875181386609,\n    0.9999876152726751,\n    0.9999877116507956,\n    0.9999878072789048,\n    0.9999879021628392,\n    0.9999879963083899,\n    0.9999880897213027,\n    0.9999881824072793,\n    0.9999882743719762,\n    0.9999883656210067,\n    0.9999884561599397,\n    0.9999885459943012,\n    0.9999886351295743,\n    0.999988723571199,\n    0.9999888113245732,\n    0.9999888983950528,\n    0.9999889847879521,\n    0.9999890705085438,\n    0.9999891555620599,\n    0.9999892399536915,\n    0.9999893236885892,\n    0.9999894067718638,\n    0.9999894892085859,\n    0.999989571003787,\n    0.9999896521624595,\n    0.9999897326895567,\n    0.9999898125899935,\n    0.9999898918686465,\n    0.9999899705303543,\n    0.9999900485799179,\n    0.9999901260221011,\n    0.9999902028616303,\n    0.9999902791031954,\n    0.9999903547514497,\n    0.9999904298110103,\n    0.9999905042864582,\n    0.9999905781823392,\n    0.9999906515031631,\n    0.9999907242534053,\n    0.9999907964375058,\n    0.9999908680598704,\n    0.9999909391248702,\n    0.999991009636843,\n    0.999991079600092,\n    0.9999911490188876,\n    0.9999912178974667,\n    0.9999912862400331,\n    0.999991354050758,\n    0.9999914213337802,\n    0.9999914880932063,\n    0.9999915543331107,\n    0.9999916200575365,\n    0.999991685270495,\n    0.9999917499759664,\n    0.9999918141779001,\n    0.9999918778802143,\n    0.9999919410867972,\n    
0.9999920038015065,\n    0.9999920660281701,\n    0.9999921277705857,\n    0.9999921890325217,\n    0.9999922498177174,\n    0.9999923101298825,\n    0.9999923699726982,\n    0.999992429349817,\n    0.9999924882648629,\n    0.9999925467214317,\n    0.9999926047230911,\n    0.9999926622733815,\n    0.9999927193758151,\n    0.9999927760338775,\n    0.9999928322510263,\n    0.9999928880306931,\n    0.999992943376282,\n    0.9999929982911712,\n    0.9999930527787123,\n    0.999993106842231,\n    0.9999931604850268,\n    0.9999932137103741,\n    0.9999932665215211,\n    0.9999933189216913,\n    0.9999933709140828,\n    0.9999934225018691,\n    0.9999934736881986,\n    0.9999935244761955,\n    0.9999935748689597,\n    0.9999936248695668,\n    0.9999936744810685,\n    0.9999937237064929,\n    0.9999937725488444,\n    0.999993821011104,\n    0.9999938690962297,\n    0.9999939168071562,\n    0.9999939641467956,\n    0.9999940111180372,\n    0.9999940577237479,\n    0.9999941039667724,\n    0.9999941498499327,\n    0.9999941953760296,\n    0.9999942405478416,\n    0.9999942853681258,\n    0.9999943298396178,\n    0.9999943739650319,\n    0.9999944177470611,\n    0.9999944611883779,\n    0.9999945042916334,\n    0.9999945470594586,\n    0.9999945894944637,\n    0.9999946315992388,\n    0.9999946733763537,\n    0.9999947148283583,\n    0.9999947559577825,\n    0.9999947967671367,\n    0.9999948372589115,\n    0.9999948774355785,\n    0.9999949172995898,\n    0.9999949568533785,\n    0.9999949960993586,\n    0.9999950350399257,\n    0.9999950736774563,\n    0.9999951120143088,\n    0.999995150052823,\n    0.9999951877953206,\n    0.9999952252441051,\n    0.9999952624014622,\n    0.9999952992696599,\n    0.9999953358509484,\n    0.9999953721475602,\n    0.999995408161711,\n    0.9999954438955986,\n    0.9999954793514042,\n    0.9999955145312918,\n    0.9999955494374085,\n    0.9999955840718849,\n    0.9999956184368348,\n    0.9999956525343558,\n    0.9999956863665289,\n    
0.999995719935419,\n    0.9999957532430751,\n    0.9999957862915301,\n    0.999995819082801,\n    0.9999958516188893,\n    0.9999958839017807,\n    0.9999959159334458,\n    0.9999959477158396,\n    0.9999959792509017,\n    0.999996010540557,\n    0.9999960415867152,\n    0.9999960723912713,\n    0.9999961029561053,\n    0.9999961332830829,\n    0.999996163374055,\n    0.9999961932308581,\n    0.9999962228553146,\n    0.9999962522492327,\n    0.9999962814144063,\n    0.9999963103526156,\n    0.9999963390656268,\n    0.9999963675551924,\n    0.9999963958230512,\n    0.9999964238709286,\n    0.9999964517005364,\n    0.9999964793135733,\n    0.9999965067117246,\n    0.9999965338966625,\n    0.9999965608700463,\n    0.9999965876335223,\n    0.9999966141887239,\n    0.9999966405372721,\n    0.9999966666807748,\n    0.9999966926208279,\n    0.9999967183590146,\n    0.9999967438969058,\n    0.9999967692360602,\n    0.9999967943780244,\n    0.9999968193243328,\n    0.9999968440765081,\n    0.9999968686360612,\n    0.9999968930044908,\n    0.9999969171832844,\n    0.9999969411739177,\n    0.9999969649778551,\n    0.9999969885965492,\n    0.9999970120314418,\n    0.9999970352839631,\n    0.9999970583555324,\n    0.9999970812475578,\n    0.9999971039614367,\n    0.9999971264985552,\n    0.999997148860289,\n    0.9999971710480028,\n    0.999997193063051,\n    0.9999972149067773,\n    0.9999972365805148,\n    0.9999972580855863,\n    0.9999972794233045,\n    0.9999973005949717,\n    0.9999973216018802,\n    0.999997342445312,\n    0.9999973631265394,\n    0.9999973836468247,\n    0.9999974040074202,\n    0.9999974242095687,\n    0.9999974442545032,\n    0.9999974641434473,\n    0.9999974838776147,\n    0.99999750345821,\n    0.9999975228864283,\n    0.9999975421634554,\n    0.9999975612904678,\n    0.999997580268633,\n    0.9999975990991092,\n    0.999997617783046,\n    0.9999976363215834,\n    0.9999976547158532,\n    0.9999976729669781,\n    0.9999976910760717,\n    
0.9999977090442398,\n    0.9999977268725787,\n    0.9999977445621766,\n    0.9999977621141134,\n    0.9999977795294602,\n    0.99999779680928,\n    0.9999978139546275,\n    0.999997830966549,\n    0.9999978478460831,\n    0.9999978645942599,\n    0.9999978812121016,\n    0.9999978977006225,\n    0.999997914060829,\n    0.9999979302937196,\n    0.9999979464002852,\n    0.9999979623815086,\n    0.9999979782383656,\n    0.9999979939718235,\n    0.9999980095828431,\n    0.9999980250723771,\n    0.9999980404413708,\n    0.9999980556907622,\n    0.9999980708214821,\n    0.999998085834454,\n    0.9999981007305944,\n    0.9999981155108122,\n    0.9999981301760097,\n    0.9999981447270819,\n    0.9999981591649171,\n    0.9999981734903962,\n    0.9999981877043939,\n    0.9999982018077774,\n    0.9999982158014079,\n    0.9999982296861392,\n    0.9999982434628188,\n    0.9999982571322877,\n    0.9999982706953802,\n    0.9999982841529239,\n    0.9999982975057404,\n    0.9999983107546446,\n    0.9999983239004453,\n    0.9999983369439447,\n    0.9999983498859389,\n    0.9999983627272179,\n    0.9999983754685654,\n    0.9999983881107591,\n    0.9999984006545707,\n    0.9999984131007656,\n    0.9999984254501038,\n    0.9999984377033387,\n    0.9999984498612183,\n    0.9999984619244848,\n    0.9999984738938743,\n    0.9999984857701174,\n    0.999998497553939,\n    0.9999985092460583,\n    0.9999985208471889,\n    0.9999985323580389,\n    0.999998543779311,\n    0.9999985551117021,\n    0.999998566355904,\n    0.9999985775126029,\n    0.9999985885824798,\n    0.9999985995662104,\n    0.9999986104644651,\n    0.999998621277909,\n    0.9999986320072021,\n    0.9999986426529992,\n    0.9999986532159503,\n    0.9999986636966999,\n    0.9999986740958878,\n    0.9999986844141486,\n    0.9999986946521123,\n    0.9999987048104035,\n    0.9999987148896425,\n    0.9999987248904443,\n    0.9999987348134193,\n    0.9999987446591732,\n    0.9999987544283069,\n    0.9999987641214167,\n    
0.9999987737390943,\n    0.9999987832819266,\n    0.9999987927504961,\n    0.9999988021453806,\n    0.9999988114671536,\n    0.9999988207163841,\n    0.9999988298936368,\n    0.9999988389994714,\n    0.9999988480344439,\n    0.999998856999106,\n    0.9999988658940044,\n    0.9999988747196823,\n    0.9999988834766783,\n    0.999998892165527,\n    0.9999989007867585,\n    0.9999989093408991,\n    0.999998917828471,\n    0.9999989262499921,\n    0.9999989346059766,\n    0.9999989428969343,\n    0.9999989511233712,\n    0.9999989592857896,\n    0.9999989673846877,\n    0.9999989754205597,\n    0.9999989833938961,\n    0.9999989913051835,\n    0.999998999154905,\n    0.9999990069435395,\n    0.9999990146715624,\n    0.9999990223394454,\n    0.9999990299476564,\n    0.9999990374966601,\n    0.9999990449869168,\n    0.9999990524188841,\n    0.9999990597930154,\n    0.9999990671097608,\n    0.9999990743695669,\n    0.9999990815728768,\n    0.9999990887201301,\n    0.999999095811763,\n    0.9999991028482086,\n    0.9999991098298961,\n    0.9999991167572517,\n    0.9999991236306983,\n    0.9999991304506554,\n    0.9999991372175392,\n    0.9999991439317627,\n    0.9999991505937357,\n    0.9999991572038649,\n    0.9999991637625537,\n    0.9999991702702025,\n    0.9999991767272083,\n    0.9999991831339654,\n    0.9999991894908647,\n    0.9999991957982943,\n    0.9999992020566392,\n    0.9999992082662812,\n    0.9999992144275994,\n    0.99999922054097,\n    0.999999226606766,\n    0.9999992326253575,\n    0.9999992385971122,\n    0.9999992445223942,\n    0.9999992504015653,\n    0.9999992562349844,\n    0.9999992620230076,\n    0.999999267765988,\n    0.9999992734642762,\n    0.9999992791182201,\n    0.9999992847281646,\n    0.9999992902944521,\n    0.9999992958174226,\n    0.999999301297413,\n    0.9999993067347578,\n    0.9999993121297889,\n    0.9999993174828354,\n    0.9999993227942244,\n    0.9999993280642797,\n    0.9999993332933232,\n    0.9999993384816741,\n    
0.9999993436296488,\n    0.9999993487375618,\n    0.9999993538057247,\n    0.9999993588344468,\n    0.9999993638240351,\n    0.9999993687747942,\n    0.9999993736870262,\n    0.9999993785610309,\n    0.9999993833971058,\n    0.9999993881955461,\n    0.9999993929566448,\n    0.9999993976806922,\n    0.9999994023679768,\n    0.9999994070187848,\n    0.9999994116333998,\n    0.9999994162121036,\n    0.9999994207551758,\n    0.9999994252628935,\n    0.9999994297355319,\n    0.9999994341733639,\n    0.9999994385766605,\n    0.9999994429456904,\n    0.9999994472807202,\n    0.9999994515820146,\n    0.9999994558498361,\n    0.9999994600844451,\n    0.9999994642861001,\n    0.9999994684550577,\n    0.9999994725915722,\n    0.9999994766958961,\n    0.9999994807682798,\n    0.999999484808972,\n    0.9999994888182194,\n    0.9999994927962665,\n    0.9999994967433564,\n    0.9999995006597296,\n    0.9999995045456255,\n    0.999999508401281,\n    0.9999995122269317,\n    0.999999516022811,\n    0.9999995197891505,\n    0.99999952352618,\n    0.9999995272341278,\n    0.9999995309132201,\n    0.9999995345636816,\n    0.999999538185735,\n    0.9999995417796013,\n    0.9999995453454998,\n    0.9999995488836484,\n    0.999999552394263,\n    0.9999995558775576,\n    0.9999995593337452,\n    0.9999995627630364,\n    0.9999995661656407,\n    0.9999995695417657,\n    0.9999995728916176,\n    0.9999995762154007,\n    0.999999579513318,\n    0.9999995827855706,\n    0.9999995860323584,\n    0.9999995892538794,\n    0.9999995924503304,\n    0.9999995956219065,\n    0.9999995987688012,\n    0.9999996018912065,\n    0.9999996049893131,\n    0.99999960806331,\n    0.999999611113385,\n    0.999999614139724,\n    0.999999617142512,\n    0.999999620121932,\n    0.999999623078166,\n    0.9999996260113945,\n    0.9999996289217963,\n    0.9999996318095493,\n    0.9999996346748296,\n    0.9999996375178121,\n    0.9999996403386704,\n    0.9999996431375765,\n    0.9999996459147015,\n    
0.9999996486702146,\n    0.9999996514042843,\n    0.9999996541170771,\n    0.9999996568087589,\n    0.9999996594794939,\n    0.999999662129445,\n    0.9999996647587741,\n    0.9999996673676417,\n    0.9999996699562067,\n    0.9999996725246275,\n    0.9999996750730606,\n    0.9999996776016618,\n    0.9999996801105852,\n    0.999999682599984,\n    0.99999968507001,\n    0.9999996875208143,\n    0.9999996899525462,\n    0.9999996923653542,\n    0.9999996947593857,\n    0.9999996971347866,\n    0.9999996994917021,\n    0.9999997018302759,\n    0.9999997041506508,\n    0.9999997064529683,\n    0.9999997087373691,\n    0.9999997110039925,\n    0.999999713252977,\n    0.9999997154844598,\n    0.9999997176985769,\n    0.9999997198954637,\n    0.9999997220752542,\n    0.9999997242380814,\n    0.9999997263840774,\n    0.9999997285133732,\n    0.9999997306260985,\n    0.9999997327223825,\n    0.9999997348023533,\n    0.9999997368661374,\n    0.9999997389138611,\n    0.9999997409456493,\n    0.999999742961626,\n    0.9999997449619142,\n    0.9999997469466361,\n    0.9999997489159128,\n    0.9999997508698644,\n    0.9999997528086102,\n    0.9999997547322687,\n    0.9999997566409571,\n    0.999999758534792,\n    0.9999997604138888,\n    0.9999997622783625,\n    0.9999997641283268,\n    0.9999997659638945,\n    0.9999997677851777,\n    0.9999997695922875,\n    0.9999997713853345,\n    0.9999997731644277,\n    0.9999997749296758,\n    0.9999997766811869,\n    0.9999997784190674,\n    0.9999997801434237,\n    0.999999781854361,\n    0.9999997835519837,\n    0.9999997852363953,\n    0.9999997869076988,\n    0.999999788565996,\n    0.9999997902113883,\n    0.9999997918439761,\n    0.9999997934638589,\n    0.9999997950711358,\n    0.9999997966659047,\n    0.999999798248263,\n    0.9999997998183073,\n    0.9999998013761335,\n    0.9999998029218365,\n    0.9999998044555108,\n    0.9999998059772499,\n    0.9999998074871468,\n    0.9999998089852936,\n    0.9999998104717817,\n    
0.9999998119467018,\n    0.9999998134101441,\n    0.9999998148621977,\n    0.9999998163029514,\n    0.999999817732493,\n    0.99999981915091,\n    0.9999998205582886,\n    0.9999998219547149,\n    0.9999998233402742,\n    0.999999824715051,\n    0.9999998260791292,\n    0.999999827432592,\n    0.9999998287755221,\n    0.9999998301080014,\n    0.9999998314301114,\n    0.9999998327419325,\n    0.999999834043545,\n    0.9999998353350283,\n    0.9999998366164612,\n    0.9999998378879219,\n    0.999999839149488,\n    0.9999998404012365,\n    0.9999998416432438,\n    0.9999998428755857,\n    0.9999998440983375,\n    0.9999998453115738,\n    0.9999998465153687,\n    0.9999998477097954,\n    0.9999998488949271,\n    0.999999850070836,\n    0.9999998512375939,\n    0.9999998523952721,\n    0.999999853543941,\n    0.9999998546836711,\n    0.9999998558145317,\n    0.9999998569365918,\n    0.99999985804992,\n    0.9999998591545842,\n    0.9999998602506518,\n    0.9999998613381899,\n    0.9999998624172646,\n    0.9999998634879418,\n    0.999999864550287,\n    0.9999998656043649,\n    0.99999986665024,\n    0.999999867687976,\n    0.9999998687176362,\n    0.9999998697392837,\n    0.9999998707529806,\n    0.9999998717587888,\n    0.9999998727567697,\n    0.9999998737469843,\n    0.999999874729493,\n    0.9999998757043558,\n    0.9999998766716321,\n    0.999999877631381,\n    0.9999998785836611,\n    0.9999998795285305,\n    0.9999998804660468,\n    0.9999998813962674,\n    0.999999882319249,\n    0.9999998832350477,\n    0.9999998841437198,\n    0.9999998850453204,\n    0.9999998859399047,\n    0.9999998868275275,\n    0.9999998877082426,\n    0.9999998885821039,\n    0.9999998894491648,\n    0.9999998903094782,\n    0.9999998911630965,\n    0.999999892010072,\n    0.9999998928504562,\n    0.9999998936843005,\n    0.9999998945116558,\n    0.9999998953325726,\n    0.9999998961471009,\n    0.9999998969552906,\n    0.9999998977571909,\n    0.9999998985528507,\n    
0.9999998993423186,\n    0.9999999001256429,\n    0.9999999009028713,\n    0.9999999016740512,\n    0.9999999024392298,\n    0.9999999031984537,\n    0.9999999039517692,\n    0.9999999046992224,\n    0.9999999054408589,\n    0.999999906176724,\n    0.9999999069068625,\n    0.999999907631319,\n    0.9999999083501377,\n    0.9999999090633626,\n    0.999999909771037,\n    0.9999999104732044,\n    0.9999999111699074,\n    0.9999999118611886,\n    0.9999999125470903,\n    0.9999999132276542,\n    0.999999913902922,\n    0.9999999145729347,\n    0.9999999152377334,\n    0.9999999158973585,\n    0.9999999165518504,\n    0.999999917201249,\n    0.999999917845594,\n    0.9999999184849246,\n    0.99999991911928,\n    0.9999999197486987,\n    0.9999999203732193,\n    0.9999999209928797,\n    0.999999921607718,\n    0.9999999222177716,\n    0.9999999228230777,\n    0.9999999234236732,\n    0.9999999240195949,\n    0.999999924610879,\n    0.9999999251975619,\n    0.999999925779679,\n    0.9999999263572661,\n    0.9999999269303584,\n    0.9999999274989908,\n    0.9999999280631982,\n    0.9999999286230148,\n    0.9999999291784748,\n    0.9999999297296123,\n    0.9999999302764608,\n    0.9999999308190537,\n    0.999999931357424,\n    0.9999999318916047,\n    0.9999999324216285,\n    0.9999999329475275,\n    0.999999933469334,\n    0.9999999339870798,\n    0.9999999345007964,\n    0.9999999350105152,\n    0.9999999355162673,\n    0.9999999360180837,\n    0.999999936515995,\n    0.9999999370100313,\n    0.9999999375002232,\n    0.9999999379866003,\n    0.9999999384691923,\n    0.9999999389480289,\n    0.999999939423139,\n    0.9999999398945519,\n    0.9999999403622962,\n    0.9999999408264005,\n    0.9999999412868931,\n    0.9999999417438021,\n    0.9999999421971554,\n    0.9999999426469807,\n    0.9999999430933054,\n    0.9999999435361568,\n    0.9999999439755619,\n    0.9999999444115476,\n    0.9999999448441403,\n    0.9999999452733667,\n    0.9999999456992527,\n    
0.9999999461218244,\n    0.9999999465411077,\n    0.999999946957128,\n    0.9999999473699109,\n    0.9999999477794815,\n    0.9999999481858648,\n    0.9999999485890856,\n    0.9999999489891686,\n    0.999999949386138,\n    0.9999999497800182,\n    0.9999999501708331,\n    0.9999999505586068,\n    0.9999999509433627,\n    0.9999999513251245,\n    0.9999999517039154,\n    0.9999999520797584,\n    0.9999999524526768,\n    0.9999999528226929,\n    0.9999999531898297,\n    0.9999999535541093,\n    0.9999999539155541,\n    0.9999999542741861,\n    0.9999999546300272,\n    0.9999999549830991,\n    0.9999999553334233,\n    0.9999999556810214,\n    0.9999999560259144,\n    0.9999999563681234,\n    0.9999999567076694,\n    0.999999957044573,\n    0.9999999573788548,\n    0.9999999577105352,\n    0.9999999580396344,\n    0.9999999583661725,\n    0.9999999586901696,\n    0.9999999590116452,\n    0.9999999593306191,\n    0.9999999596471107,\n    0.9999999599611394,\n    0.9999999602727243,\n    0.9999999605818843,\n    0.9999999608886386,\n    0.9999999611930056,\n    0.999999961495004,\n    0.9999999617946523,\n    0.9999999620919686,\n    0.9999999623869713,\n    0.9999999626796782,\n    0.9999999629701072,\n    0.9999999632582761,\n    0.9999999635442025,\n    0.9999999638279038,\n    0.9999999641093973,\n    0.9999999643887001,\n    0.9999999646658294,\n    0.9999999649408021,\n    0.999999965213635,\n    0.9999999654843447,\n    0.9999999657529476,\n    0.9999999660194603,\n    0.9999999662838989,\n    0.9999999665462797,\n    0.9999999668066186,\n    0.9999999670649315,\n    0.9999999673212343,\n    0.9999999675755424,\n    0.9999999678278716,\n    0.999999968078237,\n    0.9999999683266542,\n    0.9999999685731381,\n    0.9999999688177039,\n    0.9999999690603665,\n    0.9999999693011405,\n    0.999999969540041,\n    0.9999999697770823,\n    0.9999999700122789,\n    0.9999999702456451,\n    0.9999999704771954,\n    0.9999999707069437,\n    0.999999970934904,\n    
0.9999999711610904,\n    0.9999999713855166,\n    0.9999999716081962,\n    0.999999971829143,\n    0.9999999720483703,\n    0.9999999722658917,\n    0.9999999724817202,\n    0.9999999726958692,\n    0.9999999729083516,\n    0.9999999731191804,\n    0.9999999733283687,\n    0.999999973535929,\n    0.999999973741874,\n    0.9999999739462163,\n    0.9999999741489685,\n    0.9999999743501428,\n    0.9999999745497515,\n    0.999999974747807,\n    0.9999999749443211,\n    0.9999999751393059,\n    0.9999999753327734,\n    0.9999999755247353,\n    0.9999999757152033,\n    0.9999999759041891,\n    0.9999999760917041,\n    0.9999999762777599,\n    0.9999999764623679,\n    0.9999999766455392,\n    0.999999976827285,\n    0.9999999770076166,\n    0.9999999771865448,\n    0.9999999773640804,\n    0.9999999775402345,\n    0.9999999777150178,\n    0.999999977888441,\n    0.9999999780605144,\n    0.9999999782312489,\n    0.9999999784006547,\n    0.9999999785687421,\n    0.9999999787355215,\n    0.9999999789010029,\n    0.9999999790651967,\n    0.9999999792281126,\n    0.9999999793897607,\n    0.9999999795501509,\n    0.9999999797092928,\n    0.9999999798671964,\n    0.999999980023871,\n    0.9999999801793266,\n    0.9999999803335722,\n    0.9999999804866175,\n    0.999999980638472,\n    0.9999999807891445,\n    0.9999999809386446,\n    0.9999999810869813,\n    0.9999999812341636,\n    0.9999999813802004,\n    0.9999999815251008,\n    0.9999999816688736,\n    0.9999999818115276,\n    0.9999999819530714,\n    0.9999999820935137,\n    0.9999999822328631,\n    0.9999999823711281,\n    0.999999982508317,\n    0.9999999826444383,\n    0.9999999827795004,\n    0.9999999829135114,\n    0.9999999830464795,\n    0.9999999831784129,\n    0.9999999833093195,\n    0.9999999834392074,\n    0.9999999835680845,\n    0.9999999836959587,\n    0.9999999838228377,\n    0.9999999839487295,\n    0.9999999840736414,\n    0.9999999841975813,\n    0.9999999843205567,\n    0.9999999844425751,\n    
0.9999999845636439,\n    0.9999999846837706,\n    0.9999999848029625,\n    0.9999999849212268,\n    0.9999999850385707,\n    0.9999999851550014,\n    0.9999999852705261,\n    0.9999999853851518,\n    0.9999999854988855,\n    0.9999999856117341,\n    0.9999999857237044,\n    0.9999999858348034,\n    0.9999999859450379,\n    0.9999999860544144,\n    0.9999999861629398,\n    0.9999999862706207,\n    0.9999999863774636,\n    0.999999986483475,\n    0.9999999865886614,\n    0.9999999866930293,\n    0.999999986796585,\n    0.9999999868993348,\n    0.9999999870012849,\n    0.9999999871024418,\n    0.9999999872028114,\n    0.9999999873023998,\n    0.9999999874012133,\n    0.9999999874992579,\n    0.9999999875965394,\n    0.999999987693064,\n    0.9999999877888373,\n    0.9999999878838652,\n    0.9999999879781538,\n    0.9999999880717085,\n    0.9999999881645353,\n    0.9999999882566396,\n    0.9999999883480272,\n    0.9999999884387035,\n    0.9999999885286742,\n    0.9999999886179448,\n    0.9999999887065206,\n    0.9999999887944072,\n    0.9999999888816098,\n    0.9999999889681338,\n    0.9999999890539845,\n    0.999999989139167,\n    0.9999999892236867,\n    0.9999999893075486,\n    0.999999989390758,\n    0.9999999894733197,\n    0.999999989555239,\n    0.9999999896365208,\n    0.99999998971717,\n    0.9999999897971916,\n    0.9999999898765906,\n    0.9999999899553715,\n    0.9999999900335395,\n    0.9999999901110991,\n    0.9999999901880551,\n    0.9999999902644122,\n    0.9999999903401752,\n    0.9999999904153486,\n    0.9999999904899369,\n    0.9999999905639448,\n    0.9999999906373768,\n    0.9999999907102373,\n    0.9999999907825308,\n    0.9999999908542617,\n    0.9999999909254343,\n    0.9999999909960532,\n    0.9999999910661225,\n    0.9999999911356464,\n    0.9999999912046295,\n    0.9999999912730755,\n    0.999999991340989,\n    0.9999999914083739,\n    0.9999999914752346,\n    0.9999999915415748,\n    0.9999999916073987,\n    0.9999999916727105,\n    
0.999999991737514,\n    0.9999999918018132,\n    0.999999991865612,\n    0.9999999919289143,\n    0.999999991991724,\n    0.9999999920540449,\n    0.9999999921158808,\n    0.9999999921772355,\n    0.9999999922381126,\n    0.9999999922985162,\n    0.9999999923584496,\n    0.9999999924179166,\n    0.9999999924769208,\n    0.9999999925354659,\n    0.9999999925935554,\n    0.9999999926511928,\n    0.9999999927083817,\n    0.9999999927651255,\n    0.9999999928214277,\n    0.9999999928772918,\n    0.9999999929327212,\n    0.9999999929877191,\n    0.9999999930422891,\n    0.9999999930964345,\n    0.9999999931501585,\n    0.9999999932034643,\n    0.9999999932563555,\n    0.9999999933088349,\n    0.999999993360906,\n    0.9999999934125718,\n    0.9999999934638356,\n    0.9999999935147004,\n    0.9999999935651694,\n    0.9999999936152456,\n    0.9999999936649322,\n    0.9999999937142321,\n    0.9999999937631483,\n    0.9999999938116839,\n    0.9999999938598417,\n    0.9999999939076248,\n    0.9999999939550361,\n    0.9999999940020784,\n    0.9999999940487546,\n    0.9999999940950676,\n    0.99999999414102,\n    0.999999994186615,\n    0.9999999942318551,\n    0.9999999942767432,\n    0.999999994321282,\n    0.999999994365474,\n    0.9999999944093223,\n    0.9999999944528293,\n    0.9999999944959977,\n    0.9999999945388303,\n    0.9999999945813294,\n    0.9999999946234978,\n    0.9999999946653381,\n    0.9999999947068527,\n    0.9999999947480444,\n    0.9999999947889153,\n    0.9999999948294683,\n    0.9999999948697058,\n    0.99999999490963,\n    0.9999999949492436,\n    0.9999999949885489,\n    0.9999999950275483,\n    0.9999999950662443,\n    0.999999995104639,\n    0.9999999951427351,\n    0.9999999951805346,\n    0.99999999521804,\n    0.9999999952552534,\n    0.9999999952921773,\n    0.9999999953288139,\n    0.9999999953651654,\n    0.999999995401234,\n    0.9999999954370219,\n    0.9999999954725313,\n    0.9999999955077643,\n    0.9999999955427231,\n    
0.9999999955774099,\n    0.9999999956118268,\n    0.9999999956459759,\n    0.9999999956798591,\n    0.9999999957134788,\n    0.9999999957468367,\n    0.999999995779935,\n    0.9999999958127759,\n    0.9999999958453611,\n    0.9999999958776927,\n    0.9999999959097728,\n    0.9999999959416032,\n    0.9999999959731859,\n    0.9999999960045228,\n    0.9999999960356158,\n    0.999999996066467,\n    0.9999999960970779,\n    0.9999999961274507,\n    0.9999999961575872,\n    0.999999996187489,\n    0.9999999962171583,\n    0.9999999962465965,\n    0.9999999962758057,\n    0.9999999963047876,\n    0.9999999963335441,\n    0.9999999963620766,\n    0.9999999963903872,\n    0.9999999964184774,\n    0.9999999964463491,\n    0.9999999964740037,\n    0.9999999965014433,\n    0.9999999965286692,\n    0.9999999965556834,\n    0.9999999965824873,\n    0.9999999966090826,\n    0.9999999966354709,\n    0.9999999966616538,\n    0.9999999966876331,\n    0.9999999967134101,\n    0.9999999967389865,\n    0.999999996764364,\n    0.9999999967895439,\n    0.9999999968145279,\n    0.9999999968393174,\n    0.999999996863914,\n    0.9999999968883192,\n    0.9999999969125345,\n    0.9999999969365614,\n    0.9999999969604012,\n    0.9999999969840556,\n    0.9999999970075258,\n    0.9999999970308134,\n    0.9999999970539198,\n    0.9999999970768465,\n    0.9999999970995946,\n    0.9999999971221657,\n    0.9999999971445612,\n    0.9999999971667825,\n    0.9999999971888307,\n    0.9999999972107074,\n    0.9999999972324138,\n    0.9999999972539514,\n    0.9999999972753213,\n    0.9999999972965249,\n    0.9999999973175635,\n    0.9999999973384384,\n    0.9999999973591508,\n    0.9999999973797021,\n    0.9999999974000934,\n    0.9999999974203261,\n    0.9999999974404012,\n    0.9999999974603202,\n    0.9999999974800842,\n    0.9999999974996943,\n    0.9999999975191518,\n    0.9999999975384579,\n    0.9999999975576138,\n    0.9999999975766206,\n    0.9999999975954795,\n    0.9999999976141916,\n    
0.9999999976327582,\n    0.9999999976511801,\n    0.9999999976694588,\n    0.9999999976875952,\n    0.9999999977055904,\n    0.9999999977234457,\n    0.999999997741162,\n    0.9999999977587404,\n    0.9999999977761821,\n    0.9999999977934879,\n    0.9999999978106592,\n    0.9999999978276968,\n    0.9999999978446017,\n    0.9999999978613752,\n    0.9999999978780181,\n    0.9999999978945315,\n    0.9999999979109163,\n    0.9999999979271738,\n    0.9999999979433046,\n    0.9999999979593099,\n    0.9999999979751907,\n    0.9999999979909479,\n    0.9999999980065825,\n    0.9999999980220954,\n    0.9999999980374875,\n    0.99999999805276,\n    0.9999999980679135,\n    0.9999999980829491,\n    0.9999999980978677,\n    0.9999999981126703,\n    0.9999999981273575,\n    0.9999999981419305,\n    0.9999999981563902,\n    0.9999999981707373,\n    0.9999999981849726,\n    0.9999999981990974,\n    0.999999998213112,\n    0.9999999982270178,\n    0.9999999982408152,\n    0.9999999982545053,\n    0.9999999982680888,\n    0.9999999982815667,\n    0.9999999982949397,\n    0.9999999983082085,\n    0.9999999983213741,\n    0.9999999983344373,\n    0.9999999983473988,\n    0.9999999983602594,\n    0.99999999837302,\n    0.9999999983856813,\n    0.999999998398244,\n    0.999999998410709,\n    0.999999998423077,\n    0.9999999984353487,\n    0.9999999984475249,\n    0.9999999984596063,\n    0.9999999984715937,\n    0.9999999984834879,\n    0.9999999984952895,\n    0.9999999985069993,\n    0.9999999985186179,\n    0.9999999985301461,\n    0.9999999985415846,\n    0.9999999985529341,\n    0.9999999985641952,\n    0.9999999985753687,\n    0.9999999985864553,\n    0.9999999985974556,\n    0.9999999986083703,\n    0.9999999986192001,\n    0.9999999986299455,\n    0.9999999986406074,\n    0.9999999986511863,\n    0.9999999986616828,\n    0.9999999986720977,\n    0.9999999986824315,\n    0.9999999986926849,\n    0.9999999987028585,\n    0.9999999987129529,\n    0.9999999987229687,\n    
0.9999999987329067,\n    0.9999999987427673,\n    0.9999999987525512,\n    0.9999999987622589,\n    0.999999998771891,\n    0.9999999987814483,\n    0.9999999987909312,\n    0.9999999988003402,\n    0.999999998809676,\n    0.9999999988189392,\n    0.9999999988281303,\n    0.9999999988372499,\n    0.9999999988462984,\n    0.9999999988552766,\n    0.9999999988641849,\n    0.9999999988730239,\n    0.9999999988817941,\n    0.9999999988904961,\n    0.9999999988991303,\n    0.9999999989076973,\n    0.9999999989161977,\n    0.9999999989246319,\n    0.9999999989330005,\n    0.999999998941304,\n    0.9999999989495428,\n    0.9999999989577175,\n    0.9999999989658286,\n    0.9999999989738766,\n    0.999999998981862,\n    0.9999999989897852,\n    0.9999999989976467,\n    0.9999999990054471,\n    0.9999999990131868,\n    0.9999999990208662,\n    0.9999999990284859,\n    0.9999999990360463,\n    0.9999999990435479,\n    0.999999999050991,\n    0.9999999990583762,\n    0.999999999065704,\n    0.9999999990729748,\n    0.9999999990801889,\n    0.9999999990873469,\n    0.9999999990944493,\n    0.9999999991014963,\n    0.9999999991084886,\n    0.9999999991154264,\n    0.9999999991223102,\n    0.9999999991291404,\n    0.9999999991359174,\n    0.9999999991426418,\n    0.9999999991493138,\n    0.999999999155934,\n    0.9999999991625025,\n    0.9999999991690199,\n    0.9999999991754867,\n    0.9999999991819031,\n    0.9999999991882695,\n    0.9999999991945865,\n    0.9999999992008543,\n    0.9999999992070733,\n    0.9999999992132439,\n    0.9999999992193664,\n    0.9999999992254415,\n    0.9999999992314691,\n    0.9999999992374499,\n    0.999999999243384,\n    0.9999999992492721,\n    0.9999999992551143,\n    0.999999999260911,\n    0.9999999992666627,\n    0.9999999992723696,\n    0.9999999992780321,\n    0.9999999992836505,\n    0.9999999992892251,\n    0.9999999992947564,\n    0.9999999993002446,\n    0.9999999993056902,\n    0.9999999993110933,\n    0.9999999993164544,\n    
0.9999999993217739,\n    0.9999999993270519,\n    0.9999999993322888,\n    0.999999999337485,\n    0.9999999993426407,\n    0.9999999993477563,\n    0.9999999993528321,\n    0.9999999993578684,\n    0.9999999993628655,\n    0.9999999993678237,\n    0.9999999993727433,\n    0.9999999993776247,\n    0.9999999993824681,\n    0.9999999993872737,\n    0.999999999392042,\n    0.9999999993967732,\n    0.9999999994014676,\n    0.9999999994061254,\n    0.9999999994107469,\n    0.9999999994153325,\n    0.9999999994198824,\n    0.999999999424397,\n    0.9999999994288763,\n    0.9999999994333209,\n    0.9999999994377308,\n    0.9999999994421064,\n    0.9999999994464479,\n    0.9999999994507557,\n    0.9999999994550299,\n    0.999999999459271,\n    0.999999999463479,\n    0.9999999994676542,\n    0.999999999471797,\n    0.9999999994759075,\n    0.999999999479986,\n    0.9999999994840327,\n    0.9999999994880481,\n    0.9999999994920321,\n    0.9999999994959852,\n    0.9999999994999074,\n    0.9999999995037991,\n    0.9999999995076606,\n    0.999999999511492,\n    0.9999999995152936,\n    0.9999999995190656,\n    0.9999999995228083,\n    0.9999999995265219,\n    0.9999999995302065,\n    0.9999999995338624,\n    0.99999999953749,\n    0.9999999995410892,\n    0.9999999995446605,\n    0.999999999548204,\n    0.9999999995517199,\n    0.9999999995552085,\n    0.9999999995586698,\n    0.9999999995621043,\n    0.999999999565512,\n    0.9999999995688932,\n    0.9999999995722482,\n    0.999999999575577,\n    0.9999999995788798,\n    0.999999999582157,\n    0.9999999995854086,\n    0.9999999995886351,\n    0.9999999995918363,\n    0.9999999995950126,\n    0.9999999995981643,\n    0.9999999996012914,\n    0.9999999996043942,\n    0.9999999996074728,\n    0.9999999996105274,\n    0.9999999996135583,\n    0.9999999996165657,\n    0.9999999996195497,\n    0.9999999996225103,\n    0.9999999996254479,\n    0.9999999996283627,\n    0.9999999996312549,\n    0.9999999996341244,\n    
0.9999999996369717,\n    0.9999999996397968,\n    0.9999999996425999,\n    0.9999999996453812,\n    0.9999999996481409,\n    0.999999999650879,\n    0.999999999653596,\n    0.9999999996562917,\n    0.9999999996589665,\n    0.9999999996616205,\n    0.9999999996642537,\n    0.9999999996668665,\n    0.999999999669459,\n    0.9999999996720312,\n    0.9999999996745835,\n    0.999999999677116,\n    0.9999999996796286,\n    0.9999999996821218,\n    0.9999999996845955,\n    0.99999999968705,\n    0.9999999996894854,\n    0.9999999996919019,\n    0.9999999996942995,\n    0.9999999996966785,\n    0.999999999699039,\n    0.9999999997013811,\n    0.9999999997037049,\n    0.9999999997060107,\n    0.9999999997082986,\n    0.9999999997105686,\n    0.9999999997128209,\n    0.9999999997150558,\n    0.9999999997172733,\n    0.9999999997194735,\n    0.9999999997216565,\n    0.9999999997238226,\n    0.9999999997259719,\n    0.9999999997281044,\n    0.9999999997302202,\n    0.9999999997323197,\n    0.9999999997344028,\n    0.9999999997364697,\n    0.9999999997385205,\n    0.9999999997405554,\n    0.9999999997425744,\n    0.9999999997445777,\n    0.9999999997465654,\n    0.9999999997485376,\n    0.9999999997504946,\n    0.9999999997524363,\n    0.9999999997543627,\n    0.9999999997562743,\n    0.999999999758171,\n    0.9999999997600529,\n    0.9999999997619202,\n    0.999999999763773,\n    0.9999999997656113,\n    0.9999999997674354,\n    0.9999999997692451,\n    0.9999999997710409,\n    0.9999999997728227,\n    0.9999999997745906,\n    0.9999999997763448,\n    0.9999999997780853,\n    0.9999999997798122,\n    0.9999999997815258,\n    0.9999999997832258,\n    0.9999999997849128,\n    0.9999999997865867,\n    0.9999999997882475,\n    0.9999999997898953,\n    0.9999999997915304,\n    0.9999999997931527,\n    0.9999999997947624,\n    0.9999999997963596,\n    0.9999999997979443,\n    0.9999999997995167,\n    0.9999999998010769,\n    0.9999999998026249,\n    0.9999999998041609,\n    
0.999999999805685,\n    0.9999999998071971,\n    0.9999999998086976,\n    0.9999999998101863,\n    0.9999999998116634,\n    0.999999999813129,\n    0.9999999998145833,\n    0.9999999998160262,\n    0.9999999998174579,\n    0.9999999998188784,\n    0.999999999820288,\n    0.9999999998216865,\n    0.9999999998230741,\n    0.999999999824451,\n    0.9999999998258171,\n    0.9999999998271726,\n    0.9999999998285176,\n    0.9999999998298521,\n    0.9999999998311762,\n    0.99999999983249,\n    0.9999999998337935,\n    0.9999999998350869,\n    0.9999999998363703,\n    0.9999999998376436,\n    0.9999999998389072,\n    0.9999999998401607,\n    0.9999999998414046,\n    0.9999999998426389,\n    0.9999999998438635,\n    0.9999999998450786,\n    0.9999999998462842,\n    0.9999999998474803,\n    0.9999999998486673,\n    0.999999999849845,\n    0.9999999998510135,\n    0.9999999998521729,\n    0.9999999998533233,\n    0.9999999998544647,\n    0.9999999998555973,\n    0.9999999998567211,\n    0.999999999857836,\n    0.9999999998589424,\n    0.9999999998600401,\n    0.9999999998611293,\n    0.99999999986221,\n    0.9999999998632823,\n    0.9999999998643463,\n    0.9999999998654019,\n    0.9999999998664494,\n    0.9999999998674887,\n    0.9999999998685198,\n    0.999999999869543,\n    0.9999999998705583,\n    0.9999999998715656,\n    0.999999999872565,\n    0.9999999998735568,\n    0.9999999998745408,\n    0.9999999998755171,\n    0.9999999998764858,\n    0.999999999877447,\n    0.9999999998784007,\n    0.9999999998793471,\n    0.999999999880286,\n    0.9999999998812176,\n    0.9999999998821419,\n    0.9999999998830591,\n    0.9999999998839691,\n    0.9999999998848721,\n    0.999999999885768,\n    0.999999999886657,\n    0.9999999998875391,\n    0.9999999998884143,\n    0.9999999998892827,\n    0.9999999998901442,\n    0.9999999998909992,\n    0.9999999998918474,\n    0.9999999998926891,\n    0.9999999998935242,\n    0.9999999998943527,\n    0.9999999998951749,\n    
0.9999999998959906,\n    0.9999999998968001,\n    0.9999999998976031,\n    0.9999999998984,\n    0.9999999998991906,\n    0.9999999998999752,\n    0.9999999999007536,\n    0.9999999999015259,\n    0.9999999999022923,\n    0.9999999999030527,\n    0.9999999999038071,\n    0.9999999999045557,\n    0.9999999999052984,\n    0.9999999999060354,\n    0.9999999999067666,\n    0.9999999999074922,\n    0.9999999999082121,\n    0.9999999999089264,\n    0.9999999999096351,\n    0.9999999999103384,\n    0.999999999911036,\n    0.9999999999117284,\n    0.9999999999124154,\n    0.999999999913097,\n    0.9999999999137732,\n    0.9999999999144442,\n    0.99999999991511,\n    0.9999999999157707,\n    0.9999999999164262,\n    0.9999999999170766,\n    0.9999999999177218,\n    0.9999999999183621,\n    0.9999999999189975,\n    0.9999999999196278,\n    0.9999999999202532,\n    0.9999999999208738,\n    0.9999999999214896,\n    0.9999999999221006,\n    0.9999999999227068,\n    0.9999999999233083,\n    0.9999999999239051,\n    0.9999999999244973,\n    0.9999999999250849,\n    0.9999999999256679,\n    0.9999999999262463,\n    0.9999999999268203,\n    0.9999999999273897,\n    0.9999999999279549,\n    0.9999999999285155,\n    0.9999999999290717,\n    0.9999999999296237,\n    0.9999999999301714,\n    0.9999999999307149,\n    0.999999999931254,\n    0.999999999931789,\n    0.9999999999323198,\n    0.9999999999328465,\n    0.9999999999333691,\n    0.9999999999338877,\n    0.9999999999344021,\n    0.9999999999349126,\n    0.9999999999354191,\n    0.9999999999359217,\n    0.9999999999364204,\n    0.9999999999369151,\n    0.9999999999374061,\n    0.9999999999378932,\n    0.9999999999383765,\n    0.999999999938856,\n    0.9999999999393319,\n    0.999999999939804,\n    0.9999999999402724,\n    0.9999999999407373,\n    0.9999999999411985,\n    0.999999999941656,\n    0.9999999999421101,\n    0.9999999999425606,\n    0.9999999999430076,\n    0.9999999999434511,\n    0.9999999999438912,\n    
0.9999999999443278,\n    0.9999999999447611,\n    0.999999999945191,\n    0.9999999999456175,\n    0.9999999999460406,\n    0.9999999999464606,\n    0.9999999999468773,\n    0.9999999999472906,\n    0.9999999999477008,\n    0.9999999999481078,\n    0.9999999999485116,\n    0.9999999999489123,\n    0.9999999999493099,\n    0.9999999999497043,\n    0.9999999999500958,\n    0.9999999999504842,\n    0.9999999999508694,\n    0.9999999999512518,\n    0.9999999999516311,\n    0.9999999999520076,\n    0.9999999999523811,\n    0.9999999999527516,\n    0.9999999999531193,\n    0.9999999999534841,\n    0.9999999999538461,\n    0.9999999999542053,\n    0.9999999999545617,\n    0.9999999999549153,\n    0.9999999999552661,\n    0.9999999999556143,\n    0.9999999999559597,\n    0.9999999999563024,\n    0.9999999999566425,\n    0.9999999999569799,\n    0.9999999999573147,\n    0.9999999999576469,\n    0.9999999999579764,\n    0.9999999999583035,\n    0.999999999958628,\n    0.99999999995895,\n    0.9999999999592694,\n    0.9999999999595863,\n    0.9999999999599009,\n    0.9999999999602128,\n    0.9999999999605225,\n    0.9999999999608297,\n    0.9999999999611345,\n    0.999999999961437,\n    0.999999999961737,\n    0.9999999999620348,\n    0.9999999999623304,\n    0.9999999999626235,\n    0.9999999999629143,\n    0.9999999999632029,\n    0.9999999999634893,\n    0.9999999999637734,\n    0.9999999999640553,\n    0.9999999999643351,\n    0.9999999999646126,\n    0.999999999964888,\n    0.9999999999651612,\n    0.9999999999654323,\n    0.9999999999657013,\n    0.9999999999659682,\n    0.9999999999662331,\n    0.9999999999664959,\n    0.9999999999667566,\n    0.9999999999670153,\n    0.999999999967272,\n    0.9999999999675266,\n    0.9999999999677794,\n    0.9999999999680301,\n    0.9999999999682789,\n    0.9999999999685258,\n    0.9999999999687708,\n    0.9999999999690138,\n    0.9999999999692549,\n    0.9999999999694942,\n    0.9999999999697315,\n    0.9999999999699671,\n    
0.9999999999702008,\n    0.9999999999704328,\n    0.9999999999706628,\n    0.9999999999708912,\n    0.9999999999711177,\n    0.9999999999713424,\n    0.9999999999715654,\n    0.9999999999717867,\n    0.9999999999720063,\n    0.9999999999722241,\n    0.9999999999724403,\n    0.9999999999726548,\n    0.9999999999728676,\n    0.9999999999730786,\n    0.9999999999732881,\n    0.9999999999734961,\n    0.9999999999737024,\n    0.999999999973907,\n    0.99999999997411,\n    0.9999999999743115,\n    0.9999999999745114,\n    0.9999999999747098,\n    0.9999999999749066,\n    0.9999999999751018,\n    0.9999999999752957,\n    0.9999999999754878,\n    0.9999999999756786,\n    0.9999999999758679,\n    0.9999999999760557,\n    0.999999999976242,\n    0.9999999999764269,\n    0.9999999999766104,\n    0.9999999999767923,\n    0.999999999976973,\n    0.9999999999771522,\n    0.99999999997733,\n    0.9999999999775064,\n    0.9999999999776814,\n    0.9999999999778552,\n    0.9999999999780275,\n    0.9999999999781984,\n    0.9999999999783681,\n    0.9999999999785365,\n    0.9999999999787035,\n    0.9999999999788692,\n    0.9999999999790337,\n    0.9999999999791969,\n    0.9999999999793587,\n    0.9999999999795194,\n    0.9999999999796787,\n    0.9999999999798369,\n    0.9999999999799938,\n    0.9999999999801494,\n    0.999999999980304,\n    0.9999999999804572,\n    0.9999999999806093,\n    0.9999999999807602,\n    0.9999999999809099,\n    0.9999999999810585,\n    0.9999999999812059,\n    0.9999999999813521,\n    0.9999999999814972,\n    0.9999999999816412,\n    0.9999999999817841,\n    0.9999999999819259,\n    0.9999999999820666,\n    0.9999999999822061,\n    0.9999999999823446,\n    0.999999999982482,\n    0.9999999999826183,\n    0.9999999999827536,\n    0.9999999999828878,\n    0.9999999999830209,\n    0.999999999983153,\n    0.9999999999832841,\n    0.9999999999834143,\n    0.9999999999835434,\n    0.9999999999836714,\n    0.9999999999837985,\n    0.9999999999839245,\n    
0.9999999999840496,\n    0.9999999999841738,\n    0.9999999999842969,\n    0.9999999999844191,\n    0.9999999999845404,\n    0.9999999999846607,\n    0.9999999999847801,\n    0.9999999999848985,\n    0.999999999985016,\n    0.9999999999851327,\n    0.9999999999852484,\n    0.9999999999853632,\n    0.9999999999854771,\n    0.9999999999855901,\n    0.9999999999857022,\n    0.9999999999858135,\n    0.9999999999859238,\n    0.9999999999860334,\n    0.9999999999861421,\n    0.9999999999862499,\n    0.9999999999863569,\n    0.9999999999864632,\n    0.9999999999865684,\n    0.999999999986673,\n    0.9999999999867767,\n    0.9999999999868796,\n    0.9999999999869817,\n    0.999999999987083,\n    0.9999999999871836,\n    0.9999999999872833,\n    0.9999999999873822,\n    0.9999999999874805,\n    0.9999999999875778,\n    0.9999999999876745,\n    0.9999999999877704,\n    0.9999999999878656,\n    0.9999999999879601,\n    0.9999999999880538,\n    0.9999999999881467,\n    0.999999999988239,\n    0.9999999999883304,\n    0.9999999999884213,\n    0.9999999999885114,\n    0.9999999999886008,\n    0.9999999999886895,\n    0.9999999999887775,\n    0.9999999999888649,\n    0.9999999999889515,\n    0.9999999999890375,\n    0.9999999999891228,\n    0.9999999999892074,\n    0.9999999999892915,\n    0.9999999999893747,\n    0.9999999999894574,\n    0.9999999999895395,\n    0.9999999999896209,\n    0.9999999999897017,\n    0.9999999999897818,\n    0.9999999999898613,\n    0.9999999999899403,\n    0.9999999999900185,\n    0.9999999999900963,\n    0.9999999999901733,\n    0.9999999999902498,\n    0.9999999999903256,\n    0.9999999999904009,\n    0.9999999999904756,\n    0.9999999999905498,\n    0.9999999999906233,\n    0.9999999999906962,\n    0.9999999999907686,\n    0.9999999999908404,\n    0.9999999999909117,\n    0.9999999999909824,\n    0.9999999999910527,\n    0.9999999999911223,\n    0.9999999999911914,\n    0.9999999999912599,\n    0.9999999999913279,\n    0.9999999999913954,\n    
0.9999999999914624,\n    0.9999999999915289,\n    0.9999999999915947,\n    0.9999999999916601,\n    0.9999999999917251,\n    0.9999999999917895,\n    0.9999999999918534,\n    0.9999999999919168,\n    0.9999999999919796,\n    0.999999999992042,\n    0.999999999992104,\n    0.9999999999921655,\n    0.9999999999922264,\n    0.999999999992287,\n    0.9999999999923469,\n    0.9999999999924065,\n    0.9999999999924656,\n    0.9999999999925242,\n    0.9999999999925824,\n    0.9999999999926401,\n    0.9999999999926974,\n    0.9999999999927542,\n    0.9999999999928106,\n    0.9999999999928666,\n    0.9999999999929221,\n    0.9999999999929772,\n    0.9999999999930318,\n    0.9999999999930861,\n    0.9999999999931398,\n    0.9999999999931932,\n    0.9999999999932462,\n    0.9999999999932988,\n    0.9999999999933509,\n    0.9999999999934026,\n    0.999999999993454,\n    0.999999999993505,\n    0.9999999999935555,\n    0.9999999999936057,\n    0.9999999999936554,\n    0.9999999999937048,\n    0.9999999999937538,\n    0.9999999999938024,\n    0.9999999999938506,\n    0.9999999999938984,\n    0.999999999993946,\n    0.999999999993993,\n    0.9999999999940398,\n    0.9999999999940862,\n    0.9999999999941322,\n    0.9999999999941779,\n    0.9999999999942232,\n    0.9999999999942681,\n    0.9999999999943128,\n    0.999999999994357,\n    0.9999999999944009,\n    0.9999999999944444,\n    0.9999999999944877,\n    0.9999999999945306,\n    0.9999999999945731,\n    0.9999999999946154,\n    0.9999999999946573,\n    0.9999999999946989,\n    0.9999999999947401,\n    0.9999999999947811,\n    0.9999999999948217,\n    0.999999999994862,\n    0.999999999994902,\n    0.9999999999949416,\n    0.999999999994981,\n    0.9999999999950201,\n    0.9999999999950588,\n    0.9999999999950973,\n    0.9999999999951354,\n    0.9999999999951733,\n    0.9999999999952108,\n    0.9999999999952481,\n    0.9999999999952851,\n    0.9999999999953217,\n    0.9999999999953582,\n    0.9999999999953944,\n    
0.9999999999954301,\n    0.9999999999954657,\n    0.999999999995501,\n    0.999999999995536,\n    0.9999999999955708,\n    0.9999999999956052,\n    0.9999999999956394,\n    0.9999999999956733,\n    0.999999999995707,\n    0.9999999999957404,\n    0.9999999999957736,\n    0.9999999999958065,\n    0.9999999999958391,\n    0.9999999999958715,\n    0.9999999999959036,\n    0.9999999999959355,\n    0.9999999999959671,\n    0.9999999999959985,\n    0.9999999999960296,\n    0.9999999999960606,\n    0.9999999999960912,\n    0.9999999999961217,\n    0.9999999999961517,\n    0.9999999999961817,\n    0.9999999999962115,\n    0.9999999999962409,\n    0.9999999999962702,\n    0.9999999999962992,\n    0.999999999996328,\n    0.9999999999963566,\n    0.999999999996385,\n    0.9999999999964131,\n    0.999999999996441,\n    0.9999999999964687,\n    0.9999999999964961,\n    0.9999999999965234,\n    0.9999999999965505,\n    0.9999999999965773,\n    0.9999999999966039,\n    0.9999999999966304,\n    0.9999999999966567,\n    0.9999999999966827,\n    0.9999999999967084,\n    0.9999999999967341,\n    0.9999999999967595,\n    0.9999999999967847,\n    0.9999999999968098,\n    0.9999999999968345,\n    0.9999999999968592,\n    0.9999999999968836,\n    0.9999999999969079,\n    0.9999999999969319,\n    0.9999999999969558,\n    0.9999999999969795,\n    0.9999999999970031,\n    0.9999999999970264,\n    0.9999999999970495,\n    0.9999999999970725,\n    0.9999999999970952,\n    0.9999999999971179,\n    0.9999999999971403,\n    0.9999999999971625,\n    0.9999999999971846,\n    0.9999999999972065,\n    0.9999999999972282,\n    0.9999999999972499,\n    0.9999999999972712,\n    0.9999999999972925,\n    0.9999999999973135,\n    0.9999999999973345,\n    0.9999999999973552,\n    0.9999999999973758,\n    0.9999999999973962,\n    0.9999999999974164,\n    0.9999999999974365,\n    0.9999999999974565,\n    0.9999999999974762,\n    0.9999999999974959,\n    0.9999999999975154,\n    0.9999999999975347,\n    
0.999999999997554,\n    0.9999999999975729,\n    0.9999999999975918,\n    0.9999999999976106,\n    0.9999999999976292,\n    0.9999999999976477,\n    0.999999999997666,\n    0.9999999999976841,\n    0.9999999999977022,\n    0.99999999999772,\n    0.9999999999977378,\n    0.9999999999977554,\n    0.9999999999977728,\n    0.9999999999977902,\n    0.9999999999978073,\n    0.9999999999978244,\n    0.9999999999978414,\n    0.9999999999978582,\n    0.9999999999978748,\n    0.9999999999978914,\n    0.9999999999979078,\n    0.9999999999979241,\n    0.9999999999979402,\n    0.9999999999979562,\n    0.9999999999979722,\n    0.9999999999979879,\n    0.9999999999980036,\n    0.9999999999980191,\n    0.9999999999980346,\n    0.9999999999980498,\n    0.999999999998065,\n    0.9999999999980801,\n    0.999999999998095,\n    0.9999999999981098,\n    0.9999999999981245,\n    0.9999999999981392,\n    0.9999999999981536,\n    0.999999999998168,\n    0.9999999999981822,\n    0.9999999999981963,\n    0.9999999999982104,\n    0.9999999999982243,\n    0.9999999999982382,\n    0.9999999999982518,\n    0.9999999999982655,\n    0.9999999999982789,\n    0.9999999999982924,\n    0.9999999999983057,\n    0.9999999999983189,\n    0.9999999999983319,\n    0.9999999999983449,\n    0.9999999999983578,\n    0.9999999999983705,\n    0.9999999999983833,\n    0.9999999999983958,\n    0.9999999999984083,\n    0.9999999999984207,\n    0.999999999998433,\n    0.9999999999984451,\n    0.9999999999984572,\n    0.9999999999984693,\n    0.9999999999984812,\n    0.999999999998493,\n    0.9999999999985048,\n    0.9999999999985164,\n    0.999999999998528,\n    0.9999999999985394,\n    0.9999999999985507,\n    0.999999999998562,\n    0.9999999999985733,\n    0.9999999999985844,\n    0.9999999999985953,\n    0.9999999999986062,\n    0.9999999999986171,\n    0.9999999999986279,\n    0.9999999999986385,\n    0.9999999999986492,\n    0.9999999999986596,\n    0.9999999999986701,\n    0.9999999999986805,\n    
0.9999999999986907,\n    0.9999999999987009,\n    0.999999999998711,\n    0.999999999998721,\n    0.999999999998731,\n    0.9999999999987409,\n    0.9999999999987507,\n    0.9999999999987604,\n    0.9999999999987701,\n    0.9999999999987796,\n    0.9999999999987891,\n    0.9999999999987985,\n    0.9999999999988078,\n    0.9999999999988172,\n    0.9999999999988264,\n    0.9999999999988355,\n    0.9999999999988446,\n    0.9999999999988536,\n    0.9999999999988625,\n    0.9999999999988713,\n    0.9999999999988801,\n    0.9999999999988888,\n    0.9999999999988974,\n    0.9999999999989061,\n    0.9999999999989145,\n    0.999999999998923,\n    0.9999999999989314,\n    0.9999999999989397,\n    0.999999999998948,\n    0.9999999999989562,\n    0.9999999999989643,\n    0.9999999999989724,\n    0.9999999999989804,\n    0.9999999999989883,\n    0.9999999999989961,\n    0.9999999999990039,\n    0.9999999999990117,\n    0.9999999999990193,\n    0.999999999999027,\n    0.9999999999990346,\n    0.9999999999990421,\n    0.9999999999990495,\n    0.999999999999057,\n    0.9999999999990643,\n    0.9999999999990716,\n    0.9999999999990788,\n    0.999999999999086,\n    0.9999999999990931,\n    0.9999999999991002,\n    0.9999999999991072,\n    0.999999999999114,\n    0.999999999999121,\n    0.9999999999991278,\n    0.9999999999991346,\n    0.9999999999991414,\n    0.999999999999148,\n    0.9999999999991547,\n    0.9999999999991612,\n    0.9999999999991678,\n    0.9999999999991742,\n    0.9999999999991807,\n    0.9999999999991871,\n    0.9999999999991934,\n    0.9999999999991996,\n    0.9999999999992059,\n    0.9999999999992121,\n    0.9999999999992182,\n    0.9999999999992243,\n    0.9999999999992303,\n    0.9999999999992363,\n    0.9999999999992423,\n    0.9999999999992482,\n    0.999999999999254,\n    0.9999999999992598,\n    0.9999999999992656,\n    0.9999999999992712,\n    0.9999999999992769,\n    0.9999999999992826,\n    0.9999999999992881,\n    0.9999999999992937,\n    
0.9999999999992992,\n    0.9999999999993047,\n    0.9999999999993101,\n    0.9999999999993154,\n    0.9999999999993208,\n    0.999999999999326,\n    0.9999999999993313,\n    0.9999999999993365,\n    0.9999999999993416,\n    0.9999999999993467,\n    0.9999999999993519,\n    0.9999999999993568,\n    0.999999999999362,\n    0.9999999999993668,\n    0.9999999999993718,\n    0.9999999999993767,\n    0.9999999999993815,\n    0.9999999999993864,\n    0.9999999999993912,\n    0.9999999999993958,\n    0.9999999999994006,\n    0.9999999999994053,\n    0.9999999999994098,\n    0.9999999999994145,\n    0.999999999999419,\n    0.9999999999994236,\n    0.999999999999428,\n    0.9999999999994325,\n    0.9999999999994369,\n    0.9999999999994412,\n    0.9999999999994457,\n    0.9999999999994499,\n    0.9999999999994542,\n    0.9999999999994584,\n    0.9999999999994627,\n    0.9999999999994669,\n    0.999999999999471,\n    0.9999999999994751,\n    0.9999999999994792,\n    0.9999999999994833,\n    0.9999999999994873,\n    0.9999999999994913,\n    0.9999999999994952,\n    0.9999999999994992,\n    0.9999999999995031,\n    0.999999999999507,\n    0.9999999999995107,\n    0.9999999999995146,\n    0.9999999999995184,\n    0.999999999999522,\n    0.9999999999995258,\n    0.9999999999995295,\n    0.9999999999995332,\n    0.9999999999995368,\n    0.9999999999995404,\n    0.9999999999995439,\n    0.9999999999995475,\n    0.999999999999551,\n    0.9999999999995546,\n    0.999999999999558,\n    0.9999999999995615,\n    0.9999999999995649,\n    0.9999999999995682,\n    0.9999999999995716,\n    0.9999999999995749,\n    0.9999999999995782,\n    0.9999999999995816,\n    0.9999999999995848,\n    0.999999999999588,\n    0.9999999999995912,\n    0.9999999999995944,\n    0.9999999999995975,\n    0.9999999999996007,\n    0.9999999999996038,\n    0.9999999999996069,\n    0.99999999999961,\n    0.999999999999613,\n    0.999999999999616,\n    0.999999999999619,\n    0.999999999999622,\n    
0.9999999999996249,\n    0.9999999999996279,\n    0.9999999999996307,\n    0.9999999999996336,\n    0.9999999999996364,\n    0.9999999999996393,\n    0.9999999999996421,\n    0.9999999999996448,\n    0.9999999999996476,\n    0.9999999999996504,\n    0.999999999999653,\n    0.9999999999996557,\n    0.9999999999996585,\n    0.9999999999996612,\n    0.9999999999996637,\n    0.9999999999996664,\n    0.9999999999996689,\n    0.9999999999996715,\n    0.999999999999674,\n    0.9999999999996766,\n    0.9999999999996791,\n    0.9999999999996816,\n    0.9999999999996841,\n    0.9999999999996866,\n    0.999999999999689,\n    0.9999999999996915,\n    0.9999999999996938,\n    0.9999999999996962,\n    0.9999999999996986,\n    0.9999999999997009,\n    0.9999999999997032,\n    0.9999999999997056,\n    0.9999999999997079,\n    0.9999999999997101,\n    0.9999999999997123,\n    0.9999999999997147,\n    0.9999999999997169,\n    0.999999999999719,\n    0.9999999999997212,\n    0.9999999999997234,\n    0.9999999999997256,\n    0.9999999999997277,\n    0.9999999999997298,\n    0.9999999999997319,\n    0.999999999999734,\n    0.9999999999997361,\n    0.9999999999997381,\n    0.9999999999997402,\n    0.9999999999997422,\n    0.9999999999997442,\n    0.9999999999997462,\n    0.9999999999997482,\n    0.9999999999997501,\n    0.9999999999997521,\n    0.999999999999754,\n    0.9999999999997559,\n    0.9999999999997579,\n    0.9999999999997596,\n    0.9999999999997615,\n    0.9999999999997634,\n    0.9999999999997653,\n    0.9999999999997671,\n    0.9999999999997689,\n    0.9999999999997707,\n    0.9999999999997725,\n    0.9999999999997743,\n    0.999999999999776,\n    0.9999999999997777,\n    0.9999999999997795,\n    0.9999999999997812,\n    0.999999999999783,\n    0.9999999999997846,\n    0.9999999999997863,\n    0.999999999999788,\n    0.9999999999997896,\n    0.9999999999997912,\n    0.9999999999997928,\n    0.9999999999997945,\n    0.999999999999796,\n    0.9999999999997976,\n    
0.9999999999997992,\n    0.9999999999998008,\n    0.9999999999998024,\n    0.9999999999998038,\n    0.9999999999998054,\n    0.9999999999998069,\n    0.9999999999998084,\n    0.9999999999998099,\n    0.9999999999998114,\n    0.9999999999998128,\n    0.9999999999998143,\n    0.9999999999998157,\n    0.9999999999998171,\n    0.9999999999998186,\n    0.99999999999982,\n    0.9999999999998214,\n    0.9999999999998228,\n    0.9999999999998241,\n    0.9999999999998256,\n    0.9999999999998269,\n    0.9999999999998282,\n    0.9999999999998296,\n    0.9999999999998309,\n    0.9999999999998322,\n    0.9999999999998336,\n    0.9999999999998348,\n    0.9999999999998361,\n    0.9999999999998374,\n    0.9999999999998387,\n    0.9999999999998399,\n    0.9999999999998411,\n    0.9999999999998423,\n    0.9999999999998437,\n    0.9999999999998448,\n    0.999999999999846,\n    0.9999999999998472,\n    0.9999999999998485,\n    0.9999999999998496,\n    0.9999999999998508,\n    0.9999999999998519,\n    0.9999999999998531,\n    0.9999999999998542,\n    0.9999999999998553,\n    0.9999999999998566,\n    0.9999999999998577,\n    0.9999999999998588,\n    0.9999999999998598,\n    0.9999999999998609,\n    0.999999999999862,\n    0.9999999999998631,\n    0.9999999999998641,\n    0.9999999999998652,\n    0.9999999999998662,\n    0.9999999999998673,\n    0.9999999999998683,\n    0.9999999999998693,\n    0.9999999999998703,\n    0.9999999999998713,\n    0.9999999999998723,\n    0.9999999999998733,\n    0.9999999999998743,\n    0.9999999999998753,\n    0.9999999999998763,\n    0.9999999999998772,\n    0.9999999999998782,\n    0.9999999999998792,\n    0.9999999999998801,\n    0.999999999999881,\n    0.999999999999882,\n    0.9999999999998829,\n    0.9999999999998838,\n    0.9999999999998846,\n    0.9999999999998856,\n    0.9999999999998865,\n    0.9999999999998874,\n    0.9999999999998882,\n    0.9999999999998891,\n    0.99999999999989,\n    0.9999999999998909,\n    0.9999999999998916,\n    
0.9999999999998925,\n    0.9999999999998934,\n    0.9999999999998942,\n    0.999999999999895,\n    0.9999999999998959,\n    0.9999999999998966,\n    0.9999999999998974,\n    0.9999999999998982,\n    0.9999999999998991,\n    0.9999999999998999,\n    0.9999999999999006,\n    0.9999999999999014,\n    0.9999999999999022,\n    0.9999999999999029,\n    0.9999999999999036,\n    0.9999999999999044,\n    0.9999999999999052,\n    0.9999999999999059,\n    0.9999999999999066,\n    0.9999999999999074,\n    0.9999999999999081,\n    0.9999999999999087,\n    0.9999999999999095,\n    0.9999999999999102,\n    0.9999999999999108,\n    0.9999999999999116,\n    0.9999999999999123,\n    0.999999999999913,\n    0.9999999999999136,\n    0.9999999999999143,\n    0.999999999999915,\n    0.9999999999999156,\n    0.9999999999999163,\n    0.999999999999917,\n    0.9999999999999176,\n    0.9999999999999183,\n    0.9999999999999188,\n    0.9999999999999195,\n    0.9999999999999202,\n    0.9999999999999207,\n    0.9999999999999214,\n    0.999999999999922,\n    0.9999999999999226,\n    0.9999999999999232,\n    0.9999999999999238,\n    0.9999999999999244,\n    0.999999999999925,\n    0.9999999999999255,\n    0.9999999999999262,\n    0.9999999999999267,\n    0.9999999999999273,\n    0.9999999999999278,\n    0.9999999999999284,\n    0.999999999999929,\n    0.9999999999999295,\n    0.9999999999999301,\n    0.9999999999999306,\n    0.9999999999999312,\n    0.9999999999999317,\n    0.9999999999999322,\n    0.9999999999999327,\n    0.9999999999999333,\n    0.9999999999999338,\n    0.9999999999999343,\n    0.9999999999999348,\n    0.9999999999999353,\n    0.9999999999999358,\n    0.9999999999999363,\n    0.9999999999999368,\n    0.9999999999999373,\n    0.9999999999999378,\n    0.9999999999999383,\n    0.9999999999999387,\n    0.9999999999999393,\n    0.9999999999999397,\n    0.9999999999999402,\n    0.9999999999999406,\n    0.9999999999999412,\n    0.9999999999999416,\n    0.999999999999942,\n    
0.9999999999999425,\n    0.9999999999999429,\n    0.9999999999999434,\n    0.9999999999999438,\n    0.9999999999999443,\n    0.9999999999999447,\n    0.9999999999999452,\n    0.9999999999999455,\n    0.9999999999999459,\n    0.9999999999999464,\n    0.9999999999999468,\n    0.9999999999999473,\n    0.9999999999999476,\n    0.999999999999948,\n    0.9999999999999485,\n    0.9999999999999488,\n    0.9999999999999493,\n    0.9999999999999496,\n    0.99999999999995,\n    0.9999999999999504,\n    0.9999999999999508,\n    0.9999999999999512,\n    0.9999999999999516,\n    0.9999999999999519,\n    0.9999999999999523,\n    0.9999999999999527,\n    0.999999999999953,\n    0.9999999999999534,\n    0.9999999999999538,\n    0.9999999999999541,\n    0.9999999999999545,\n    0.9999999999999548,\n    0.9999999999999551,\n    0.9999999999999556,\n    0.9999999999999559,\n    0.9999999999999563,\n    0.9999999999999566,\n    0.9999999999999569,\n    0.9999999999999573,\n    0.9999999999999576,\n    0.9999999999999579,\n    0.9999999999999583,\n    0.9999999999999586,\n    0.9999999999999589,\n    0.9999999999999593,\n    0.9999999999999595,\n    0.9999999999999598,\n    0.9999999999999601,\n    0.9999999999999605,\n    0.9999999999999608,\n    0.999999999999961,\n    0.9999999999999614,\n    0.9999999999999617,\n    0.999999999999962,\n    0.9999999999999623,\n    0.9999999999999626,\n    0.9999999999999628,\n    0.9999999999999631,\n    0.9999999999999635,\n    0.9999999999999637,\n    0.999999999999964,\n    0.9999999999999643,\n    0.9999999999999646,\n    0.9999999999999648,\n    0.9999999999999651,\n    0.9999999999999654,\n    0.9999999999999657,\n    0.9999999999999659,\n    0.9999999999999661,\n    0.9999999999999665,\n    0.9999999999999667,\n    0.9999999999999669,\n    0.9999999999999672,\n    0.9999999999999675,\n    0.9999999999999677,\n    0.999999999999968,\n    0.9999999999999682,\n    0.9999999999999685,\n    0.9999999999999687,\n    0.9999999999999689,\n    
0.9999999999999692,\n    0.9999999999999695,\n    0.9999999999999697,\n    0.9999999999999699,\n    0.9999999999999701,\n    0.9999999999999704,\n    0.9999999999999706,\n    0.9999999999999708,\n    0.999999999999971,\n    0.9999999999999712,\n    0.9999999999999715,\n    0.9999999999999717,\n    0.9999999999999719,\n    0.9999999999999721,\n    0.9999999999999724,\n    0.9999999999999726,\n    0.9999999999999728,\n    0.999999999999973,\n    0.9999999999999732,\n    0.9999999999999735,\n    0.9999999999999737,\n    0.9999999999999739,\n    0.999999999999974,\n    0.9999999999999742,\n    0.9999999999999745,\n};\nconst TFloat LogisticTable[] = {\n    0.5,\n    0.5009765612582384,\n    0.5019531150659532,\n    0.5029296539728477,\n    0.5039061705290805,\n    0.5048826572854919,\n    0.5058591067938315,\n    0.5068355116069857,\n    0.5078118642792044,\n    0.5087881573663283,\n    0.5097643834260159,\n    0.5107405350179702,\n    0.5117166047041654,\n    0.5126925850490733,\n    0.5136684686198906,\n    0.5146442479867636,\n    0.5156199157230156,\n    0.5165954644053721,\n    0.5175708866141864,\n    0.5185461749336651,\n    0.5195213219520929,\n    0.5204963202620584,\n    0.5214711624606773,\n    0.5224458411498177,\n    0.523420348936324,\n    0.5243946784322399,\n    0.5253688222550323,\n    0.5263427730278142,\n    0.5273165233795671,\n    0.5282900659453637,\n    0.5292633933665886,\n    0.5302364982911614,\n    0.5312093733737563,\n    0.5321820112760233,\n    0.5331544046668082,\n    0.5341265462223723,\n    0.5350984286266115,\n    0.5360700445712753,\n    0.5370413867561842,\n    0.5380124478894484,\n    0.5389832206876841,\n    0.5399536978762307,\n    0.5409238721893666,\n    0.541893736370524,\n    0.5428632831725052,\n    0.5438325053576957,\n    0.5448013956982781,\n    0.5457699469764457,\n    0.5467381519846138,\n    0.5477060035256329,\n    0.5486734944129984,\n    0.5496406174710619,\n    0.5506073655352404,\n    0.5515737314522255,\n    
0.5525397080801923,\n    0.5535052882890054,\n    0.5544704649604273,\n    0.5554352309883238,\n    0.5563995792788694,\n    0.5573635027507514,\n    0.5583269943353745,\n    0.5592900469770627,\n    0.5602526536332625,\n    0.5612148072747433,\n    0.5621765008857981,\n    0.5631377274644433,\n    0.5640984800226174,\n    0.5650587515863784,\n    0.5660185351961015,\n    0.5669778239066745,\n    0.5679366107876933,\n    0.5688948889236563,\n    0.5698526514141571,\n    0.5708098913740778,\n    0.5717666019337796,\n    0.5727227762392935,\n    0.5736784074525096,\n    0.5746334887513667,\n    0.575588013330038,\n    0.5765419743991188,\n    0.5774953651858118,\n    0.5784481789341114,\n    0.579400408904987,\n    0.5803520483765656,\n    0.5813030906443133,\n    0.5822535290212147,\n    0.583203356837953,\n    0.5841525674430875,\n    0.5851011542032312,\n    0.5860491105032255,\n    0.5869964297463166,\n    0.5879431053543276,\n    0.588889130767832,\n    0.5898344994463247,\n    0.5907792048683922,\n    0.591723240531882,\n    0.5926665999540697,\n    0.5936092766718268,\n    0.5945512642417856,\n    0.5954925562405032,\n    0.5964331462646254,\n    0.5973730279310485,\n    0.5983121948770795,\n    0.5992506407605962,\n    0.600188359260205,\n    0.6011253440753982,\n    0.6020615889267098,\n    0.60299708755587,\n    0.6039318337259583,\n    0.6048658212215559,\n    0.6057990438488957,\n    0.6067314954360127,\n    0.6076631698328917,\n    0.6085940609116138,\n    0.6095241625665028,\n    0.6104534687142686,\n    0.6113819732941511,\n    0.6123096702680609,\n    0.6132365536207203,\n    0.6141626173598018,\n    0.6150878555160665,\n    0.6160122621434994,\n    0.616935831319445,\n    0.6178585571447408,\n    0.6187804337438501,\n    0.6197014552649915,\n    0.6206216158802703,\n    0.6215409097858049,\n    0.6224593312018546,\n    0.6233768743729451,\n    0.624293533567992,\n    0.625209303080424,\n    0.6261241772283034,\n    0.6270381503544469,\n    
0.6279512168265432,\n    0.628863371037271,\n    0.6297746074044134,\n    0.6306849203709737,\n    0.6315943044052867,\n    0.6325027540011314,\n    0.6334102636778401,\n    0.6343168279804076,\n    0.6352224414795979,\n    0.6361270987720502,\n    0.6370307944803831,\n    0.6379335232532979,\n    0.6388352797656791,\n    0.6397360587186955,\n    0.640635854839898,\n    0.6415346628833171,\n    0.6424324776295582,\n    0.6433292938858967,\n    0.6442251064863697,\n    0.6451199102918683,\n    0.6460137001902271,\n    0.6469064710963125,\n    0.6477982179521103,\n    0.6486889357268106,\n    0.6495786194168923,\n    0.6504672640462058,\n    0.6513548646660542,\n    0.652241416355273,\n    0.6531269142203084,\n    0.6540113533952943,\n    0.6548947290421275,\n    0.6557770363505423,\n    0.6566582705381819,\n    0.6575384268506705,\n    0.658417500561683,\n    0.6592954869730118,\n    0.6601723814146353,\n    0.6610481792447818,\n    0.6619228758499942,\n    0.6627964666451921,\n    0.6636689470737326,\n    0.6645403126074702,\n    0.665410558746814,\n    0.6662796810207858,\n    0.6671476749870737,\n    0.6680145362320873,\n    0.6688802603710086,\n    0.6697448430478439,\n    0.670608279935473,\n    0.6714705667356973,\n    0.672331699179286,\n    0.6731916730260219,\n    0.6740504840647448,\n    0.6749081281133934,\n    0.6757646010190476,\n    0.6766198986579665,\n    0.6774740169356273,\n    0.6783269517867622,\n    0.679178699175393,\n    0.6800292550948656,\n    0.6808786155678822,\n    0.6817267766465328,\n    0.682573734412324,\n    0.6834194849762091,\n    0.6842640244786128,\n    0.6851073490894588,\n    0.6859494550081925,\n    0.6867903384638052,\n    0.6876299957148544,\n    0.6884684230494847,\n    0.689305616785446,\n    0.6901415732701115,\n    0.6909762888804932,\n    0.6918097600232571,\n    0.6926419831347361,\n    0.6934729546809425,\n    0.6943026711575786,\n    0.6951311290900456,\n    0.6959583250334526,\n    0.6967842555726225,\n    
0.6976089173220981,\n    0.6984323069261458,\n    0.6992544210587585,\n    0.7000752564236576,\n    0.7008948097542927,\n    0.7017130778138413,\n    0.7025300573952054,\n    0.7033457453210089,\n    0.7041601384435928,\n    0.7049732336450086,\n    0.7057850278370112,\n    0.7065955179610504,\n    0.7074047009882609,\n    0.7082125739194518,\n    0.7090191337850932,\n    0.7098243776453041,\n    0.7106283025898368,\n    0.711430905738061,\n    0.712232184238947,\n    0.7130321352710473,\n    0.713830756042477,\n    0.7146280437908932,\n    0.7154239957834733,\n    0.7162186093168913,\n    0.7170118817172945,\n    0.7178038103402775,\n    0.7185943925708561,\n    0.7193836258234395,\n    0.7201715075418013,\n    0.72095803519905,\n    0.7217432062975979,\n    0.7225270183691288,\n    0.7233094689745646,\n    0.7240905557040317,\n    0.7248702761768248,\n    0.7256486280413706,\n    0.7264256089751905,\n    0.7272012166848617,\n    0.7279754489059775,\n    0.7287483034031065,\n    0.7295197779697509,\n    0.7302898704283038,\n    0.7310585786300049,\n    0.7318259004548959,\n    0.7325918338117745,\n    0.7333563766381476,\n    0.734119526900183,\n    0.7348812825926613,\n    0.7356416417389247,\n    0.7364006023908275,\n    0.7371581626286834,\n    0.7379143205612129,\n    0.7386690743254897,\n    0.7394224220868858,\n    0.7401743620390163,\n    0.740924892403682,\n    0.7416740114308128,\n    0.7424217173984088,\n    0.7431680086124811,\n    0.743912883406992,\n    0.7446563401437933,\n    0.7453983772125647,\n    0.746138993030751,\n    0.7468781860434985,\n    0.74761595472359,\n    0.7483522975713798,\n    0.7490872131147275,\n    0.7498206999089312,\n    0.7505527565366589,\n    0.7512833816078802,\n    0.7520125737597972,\n    0.7527403316567737,\n    0.7534666539902642,\n    0.7541915394787425,\n    0.7549149868676283,\n    0.7556369949292148,\n    0.7563575624625937,\n    0.7570766882935809,\n    0.7577943712746406,\n    0.758510610284809,\n    
0.7592254042296177,\n    0.7599387520410151,\n    0.7606506526772884,\n    0.7613611051229845,\n    0.7620701083888303,\n    0.7627776615116516,\n    0.7634837635542924,\n    0.7641884136055329,\n    0.7648916107800069,\n    0.7655933542181188,\n    0.7662936430859597,\n    0.7669924765752232,\n    0.76768985390312,\n    0.7683857743122923,\n    0.7690802370707283,\n    0.769773241471674,\n    0.7704647868335476,\n    0.7711548724998493,\n    0.7718434978390747,\n    0.7725306622446245,\n    0.7732163651347154,\n    0.7739006059522892,\n    0.7745833841649233,\n    0.775264699264738,\n    0.7759445507683053,\n    0.776622938216557,\n    0.7772998611746911,\n    0.7779753192320792,\n    0.7786493120021721,\n    0.7793218391224057,\n    0.7799929002541067,\n    0.780662495082397,\n    0.7813306233160979,\n    0.7819972846876342,\n    0.7826624789529376,\n    0.7833262058913498,\n    0.7839884653055251,\n    0.7846492570213327,\n    0.785308580887758,\n    0.7859664367768047,\n    0.786622824583395,\n    0.7872777442252712,\n    0.7879311956428947,\n    0.7885831787993471,\n    0.7892336936802291,\n    0.7898827402935596,\n    0.7905303186696747,\n    0.7911764288611264,\n    0.79182107094258,\n    0.792464245010713,\n    0.7931059511841119,\n    0.7937461896031696,\n    0.7943849604299823,\n    0.7950222638482464,\n    0.7956581000631541,\n    0.7962924693012908,\n    0.7969253718105298,\n    0.7975568078599278,\n    0.7981867777396212,\n    0.79881528176072,\n    0.7994423202552038,\n    0.800067893575815,\n    0.8006920020959544,\n    0.8013146462095748,\n    0.801935826331075,\n    0.8025555428951934,\n    0.8031737963569016,\n    0.8037905871912979,\n    0.804405915893501,\n    0.8050197829785412,\n    0.8056321889812553,\n    0.8062431344561782,\n    0.8068526199774353,\n    0.8074606461386356,\n    0.8080672135527632,\n    0.8086723228520697,\n    0.8092759746879663,\n    0.8098781697309154,\n    0.8104789086703224,\n    0.8110781922144276,\n    
0.811676021090197,\n    0.8122723960432147,\n    0.8128673178375735,\n    0.8134607872557665,\n    0.8140528050985784,\n    0.8146433721849758,\n    0.8152324893520001,\n    0.8158201574546557,\n    0.8164063773658039,\n    0.8169911499760516,\n    0.8175744761936437,\n    0.8181563569443524,\n    0.8187367931713698,\n    0.819315785835197,\n    0.8198933359135361,\n    0.8204694444011801,\n    0.8210441123099046,\n    0.8216173406683573,\n    0.8221891305219503,\n    0.8227594829327498,\n    0.8233283989793668,\n    0.823895879756849,\n    0.8244619263765707,\n    0.825026539966124,\n    0.8255897216692099,\n    0.8261514726455292,\n    0.8267117940706734,\n    0.8272706871360163,\n    0.8278281530486042,\n    0.8283841930310485,\n    0.8289388083214156,\n    0.8294920001731195,\n    0.8300437698548124,\n    0.8305941186502768,\n    0.8311430478583168,\n    0.8316905587926504,\n    0.8322366527818007,\n    0.8327813311689884,\n    0.8333245953120233,\n    0.8338664465831974,\n    0.8344068863691763,\n    0.8349459160708924,\n    0.8354835371034369,\n    0.8360197508959527,\n    0.8365545588915281,\n    0.8370879625470882,\n    0.8376199633332891,\n    0.8381505627344117,\n    0.8386797622482537,\n    0.8392075633860248,\n    0.8397339676722393,\n    0.8402589766446108,\n    0.8407825918539462,\n    0.8413048148640399,\n    0.8418256472515684,\n    0.8423450906059847,\n    0.8428631465294143,\n    0.843379816636549,\n    0.8438951025545426,\n    0.8444090059229068,\n    0.8449215283934065,\n    0.8454326716299558,\n    0.8459424373085146,\n    0.8464508271169839,\n    0.8469578427551038,\n    0.8474634859343486,\n    0.8479677583778257,\n    0.8484706618201715,\n    0.8489721980074492,\n    0.8494723686970466,\n    0.849971175657575,\n    0.8504686206687655,\n    0.8509647055213694,\n    0.8514594320170558,\n    0.8519528019683106,\n    0.8524448171983366,\n    0.8529354795409523,\n    0.8534247908404916,\n    0.8539127529517043,\n    0.8543993677396563,\n    
0.8548846370796298,\n    0.8553685628570248,\n    0.8558511469672594,\n    0.8563323913156724,\n    0.8568122978174237,\n    0.8572908683973974,\n    0.8577681049901036,\n    0.8582440095395809,\n    0.8587185839992991,\n    0.8591918303320635,\n    0.8596637505099167,\n    0.8601343465140441,\n    0.8606036203346761,\n    0.8610715739709943,\n    0.8615382094310357,\n    0.8620035287315965,\n    0.8624675338981391,\n    0.862930226964697,\n    0.8633916099737806,\n    0.863851684976284,\n    0.8643104540313907,\n    0.8647679192064821,\n    0.8652240825770433,\n    0.865678946226571,\n    0.8661325122464822,\n    0.8665847827360217,\n    0.8670357598021706,\n    0.8674854455595564,\n    0.8679338421303608,\n    0.8683809516442299,\n    0.8688267762381844,\n    0.8692713180565296,\n    0.8697145792507666,\n    0.870156561979502,\n    0.870597268408361,\n    0.8710367007098975,\n    0.8714748610635069,\n    0.8719117516553389,\n    0.8723473746782093,\n    0.8727817323315137,\n    0.8732148268211409,\n    0.8736466603593868,\n    0.8740772351648677,\n    0.8745065534624367,\n    0.8749346174830964,\n    0.8753614294639163,\n    0.8757869916479466,\n    0.8762113062841353,\n    0.8766343756272444,\n    0.8770562019377661,\n    0.8774767874818407,\n    0.8778961345311737,\n    0.878314245362953,\n    0.8787311222597683,\n    0.8791467675095294,\n    0.8795611834053839,\n    0.8799743722456386,\n    0.8803863363336775,\n    0.8807970779778823,\n    0.8812065994915543,\n    0.8816149031928324,\n    0.8820219914046175,\n    0.8824278664544911,\n    0.8828325306746402,\n    0.8832359864017768,\n    0.8836382359770631,\n    0.8840392817460332,\n    0.8844391260585168,\n    0.8848377712685633,\n    0.8852352197343663,\n    0.8856314738181876,\n    0.8860265358862826,\n    0.8864204083088262,\n    0.8868130934598374,\n    0.8872045937171068,\n    0.8875949114621223,\n    0.8879840490799963,\n    0.8883720089593933,\n    0.888758793492457,\n    0.8891444050747391,\n    
0.8895288461051276,\n    0.8899121189857752,\n    0.8902942261220291,\n    0.8906751699223611,\n    0.8910549527982959,\n    0.8914335771643437,\n    0.8918110454379297,\n    0.8921873600393251,\n    0.8925625233915799,\n    0.8929365379204537,\n    0.8933094060543487,\n    0.8936811302242426,\n    0.8940517128636203,\n    0.8944211564084097,\n    0.8947894632969136,\n    0.8951566359697456,\n    0.8955226768697631,\n    0.8958875884420037,\n    0.8962513731336202,\n    0.8966140333938164,\n    0.896975571673783,\n    0.8973359904266346,\n    0.8976952921073469,\n    0.8980534791726937,\n    0.8984105540811842,\n    0.8987665192930021,\n    0.8991213772699436,\n    0.8994751304753559,\n    0.8998277813740775,\n    0.9001793324323768,\n    0.9005297861178929,\n    0.9008791448995754,\n    0.9012274112476257,\n    0.9015745876334377,\n    0.90192067652954,\n    0.9022656804095371,\n    0.9026096017480523,\n    0.9029524430206697,\n    0.9032942067038784,\n    0.9036348952750146,\n    0.9039745112122062,\n    0.904313056994317,\n    0.9046505351008906,\n    0.9049869480120961,\n    0.9053222982086733,\n    0.9056565881718779,\n    0.9059898203834277,\n    0.9063219973254493,\n    0.9066531214804244,\n    0.9069831953311379,\n    0.9073122213606241,\n    0.9076402020521152,\n    0.90796713988899,\n    0.9082930373547222,\n    0.9086178969328287,\n    0.9089417211068199,\n    0.909264512360149,\n    0.9095862731761619,\n    0.9099070060380482,\n    0.910226713428791,\n    0.9105453978311193,\n    0.9108630617274585,\n    0.9111797075998822,\n    0.9114953379300658,\n    0.9118099551992372,\n    0.9121235618881309,\n    0.9124361604769414,\n    0.9127477534452755,\n    0.9130583432721083,\n    0.9133679324357356,\n    0.9136765234137297,\n    0.9139841186828943,\n    0.9142907207192192,\n    0.9145963319978375,\n    0.9149009549929797,\n    0.9152045921779318,\n    0.9155072460249908,\n    0.9158089190054228,\n    0.91610961358942,\n    0.9164093322460585,\n    
0.916708077443256,\n    0.9170058516477309,\n    0.9173026573249612,\n    0.9175984969391425,\n    0.9178933729531492,\n    0.9181872878284922,\n    0.9184802440252811,\n    0.9187722440021829,\n    0.9190632902163844,\n    0.9193533851235517,\n    0.9196425311777929,\n    0.9199307308316192,\n    0.9202179865359069,\n    0.9205043007398609,\n    0.9207896758909757,\n    0.9210741144349998,\n    0.9213576188158985,\n    0.9216401914758181,\n    0.9219218348550491,\n    0.922202551391991,\n    0.9224823435231172,\n    0.9227612136829397,\n    0.9230391643039738,\n    0.9233161978167049,\n    0.923592316649553,\n    0.9238675232288402,\n    0.9241418199787566,\n    0.9244152093213263,\n    0.9246876936763765,\n    0.9249592754615029,\n    0.9252299570920387,\n    0.9254997409810218,\n    0.9257686295391643,\n    0.9260366251748193,\n    0.9263037302939515,\n    0.9265699473001059,\n    0.9268352785943768,\n    0.927099726575378,\n    0.927363293639213,\n    0.9276259821794456,\n    0.9278877945870695,\n    0.9281487332504805,\n    0.9284088005554476,\n    0.9286679988850831,\n    0.9289263306198166,\n    0.929183798137365,\n    0.9294404038127071,\n    0.929696150018054,\n    0.9299510391228232,\n    0.9302050734936125,\n    0.9304582554941719,\n    0.9307105874853785,\n    0.9309620718252101,\n    0.9312127108687195,\n    0.9314625069680097,\n    0.9317114624722079,\n    0.9319595797274409,\n    0.9322068610768112,\n    0.9324533088603709,\n    0.9326989254150999,\n    0.9329437130748804,\n    0.933187674170474,\n    0.9334308110294983,\n    0.9336731259764038,\n    0.9339146213324513,\n    0.9341552994156889,\n    0.9343951625409306,\n    0.9346342130197337,\n    0.9348724531603766,\n    0.935109885267838,\n    0.9353465116437761,\n    0.935582334586506,\n    0.9358173563909802,\n    0.9360515793487681,\n    0.9362850057480346,\n    0.9365176378735215,\n    0.9367494780065266,\n    0.9369805284248849,\n    0.9372107914029487,\n    0.9374402692115686,\n    
0.9376689641180752,\n    0.9378968783862601,\n    0.9381240142763573,\n    0.9383503740450254,\n    0.9385759599453297,\n    0.9388007742267246,\n    0.9390248191350352,\n    0.9392480969124417,\n    0.9394706097974611,\n    0.9396923600249307,\n    0.9399133498259924,\n    0.9401335814280749,\n    0.9403530570548787,\n    0.9405717789263597,\n    0.9407897492587141,\n    0.9410069702643621,\n    0.9412234441519332,\n    0.941439173126251,\n    0.941654159388319,\n    0.9418684051353043,\n    0.9420819125605256,\n    0.9422946838534367,\n    0.9425067211996143,\n    0.9427180267807429,\n    0.9429286027746023,\n    0.9431384513550537,\n    0.9433475746920261,\n    0.9435559749515046,\n    0.9437636542955161,\n    0.9439706148821183,\n    0.9441768588653856,\n    0.9443823883953985,\n    0.944587205618231,\n    0.9447913126759381,\n    0.9449947117065459,\n    0.9451974048440384,\n    0.9453993942183476,\n    0.9456006819553416,\n    0.9458012701768145,\n    0.9460011610004747,\n    0.946200356539936,\n    0.9463988589047058,\n    0.9465966702001757,\n    0.9467937925276119,\n    0.9469902279841446,\n    0.947185978662759,\n    0.9473810466522863,\n    0.9475754340373931,\n    0.9477691428985743,\n    0.9479621753121426,\n    0.94815453335022,\n    0.9483462190807312,\n    0.9485372345673925,\n    0.9487275818697054,\n    0.9489172630429474,\n    0.9491062801381662,\n    0.9492946352021694,\n    0.949482330277519,\n    0.9496693674025232,\n    0.9498557486112288,\n    0.9500414759334155,\n    0.9502265513945876,\n    0.9504109770159683,\n    0.9505947548144925,\n    0.9507778868028013,\n    0.9509603749892346,\n    0.9511422213778251,\n    0.9513234279682939,\n    0.9515039967560431,\n    0.9516839297321508,\n    0.9518632288833652,\n    0.9520418961921002,\n    0.9522199336364293,\n    0.9523973431900807,\n    0.9525741268224334,\n    0.9527502864985101,\n    0.9529258241789759,\n    0.9531007418201309,\n    0.9532750413739076,\n    0.9534487247878658,\n    
0.953621794005189,\n    0.9537942509646805,\n    0.9539660976007597,\n    0.954137335843458,\n    0.9543079676184154,\n    0.954477994846878,\n    0.9546474194456936,\n    0.9548162433273092,\n    0.9549844683997677,\n    0.9551520965667056,\n    0.9553191297273498,\n    0.9554855697765151,\n    0.9556514186046017,\n    0.955816678097593,\n    0.9559813501370537,\n    0.9561454366001265,\n    0.956308939359532,\n    0.9564718602835648,\n    0.9566342012360932,\n    0.9567959640765575,\n    0.9569571506599678,\n    0.9571177628369031,\n    0.9572778024535095,\n    0.9574372713515003,\n    0.9575961713681533,\n    0.9577545043363116,\n    0.9579122720843811,\n    0.9580694764363313,\n    0.9582261192116938,\n    0.9583822022255619,\n    0.9585377272885904,\n    0.9586926962069957,\n    0.9588471107825551,\n    0.959000972812607,\n    0.9591542840900507,\n    0.959307046403347,\n    0.9594592615365182,\n    0.9596109312691483,\n    0.9597620573763841,\n    0.9599126416289347,\n    0.9600626857930734,\n    0.9602121916306376,\n    0.96036116089903,\n    0.9605095953512194,\n    0.9606574967357419,\n    0.9608048667967017,\n    0.9609517072737735,\n    0.9610980199022023,\n    0.9612438064128053,\n    0.961389068531974,\n    0.9615338079816756,\n    0.9616780264794544,\n    0.9618217257384336,\n    0.9619649074673171,\n    0.9621075733703927,\n    0.9622497251475317,\n    0.9623913644941934,\n    0.9625324931014263,\n    0.9626731126558706,\n    0.9628132248397597,\n    0.9629528313309248,\n    0.9630919338027948,\n    0.9632305339244013,\n    0.9633686333603795,\n    0.9635062337709722,\n    0.9636433368120323,\n    0.9637799441350253,\n    0.9639160573870332,\n    0.9640516782107567,\n    0.9641868082445192,\n    0.9643214491222695,\n    0.9644556024735851,\n    0.9645892699236761,\n    0.9647224530933884,\n    0.9648551535992067,\n    0.9649873730532592,\n    0.9651191130633208,\n    0.965250375232816,\n    0.9653811611608242,\n    0.9655114724420827,\n    
0.965641310666991,\n    0.9657706774216144,\n    0.9658995742876885,\n    0.9660280028426234,\n    0.9661559646595075,\n    0.9662834613071121,\n    0.9664104943498959,\n    0.9665370653480092,\n    0.9666631758572982,\n    0.9667888274293097,\n    0.9669140216112958,\n    0.9670387599462187,\n    0.9671630439727548,\n    0.9672868752253001,\n    0.9674102552339746,\n    0.9675331855246277,\n    0.9676556676188429,\n    0.9677777030339422,\n    0.9678992932829918,\n    0.9680204398748077,\n    0.9681411443139596,\n    0.9682614081007771,\n    0.9683812327313545,\n    0.9685006196975565,\n    0.9686195704870232,\n    0.9687380865831757,\n    0.9688561694652216,\n    0.9689738206081602,\n    0.9690910414827888,\n    0.9692078335557077,\n    0.9693241982893259,\n    0.9694401371418673,\n    0.9695556515673754,\n    0.969670743015721,\n    0.9697854129326061,\n    0.9698996627595705,\n    0.9700134939339985,\n    0.9701269078891237,\n    0.9702399060540357,\n    0.9703524898536865,\n    0.9704646607088951,\n    0.9705764200363561,\n    0.9706877692486436,\n    0.9707987097542187,\n    0.9709092429574353,\n    0.9710193702585468,\n    0.9711290930537122,\n    0.9712384127350021,\n    0.9713473306904065,\n    0.9714558483038396,\n    0.9715639669551474,\n    0.9716716880201136,\n    0.9717790128704671,\n    0.9718859428738872,\n    0.9719924793940118,\n    0.9720986237904434,\n    0.9722043774187549,\n    0.9723097416304984,\n    0.9724147177732099,\n    0.972519307190418,\n    0.9726235112216486,\n    0.9727273312024342,\n    0.9728307684643189,\n    0.9729338243348664,\n    0.9730365001376665,\n    0.9731387971923422,\n    0.9732407168145568,\n    0.9733422603160213,\n    0.9734434290045008,\n    0.9735442241838218,\n    0.9736446471538801,\n    0.9737446992106472,\n    0.9738443816461771,\n    0.9739436957486151,\n    0.9740426428022031,\n    0.9741412240872889,\n    0.9742394408803315,\n    0.9743372944539102,\n    0.9744347860767304,\n    0.9745319170136325,\n    
0.9746286885255978,\n    0.9747251018697572,\n    0.9748211582993981,\n    0.9749168590639715,\n    0.9750122054090997,\n    0.9751071985765847,\n    0.9752018398044143,\n    0.9752961303267709,\n    0.975390071374038,\n    0.9754836641728087,\n    0.9755769099458929,\n    0.9756698099123244,\n    0.9757623652873698,\n    0.975854577282535,\n    0.9759464471055739,\n    0.9760379759604947,\n    0.976129165047569,\n    0.9762200155633391,\n    0.9763105287006256,\n    0.9764007056485348,\n    0.9764905475924671,\n    0.9765800557141248,\n    0.9766692311915195,\n    0.9767580751989804,\n    0.9768465889071616,\n    0.9769347734830501,\n    0.9770226300899744,\n    0.977110159887611,\n    0.9771973640319939,\n    0.9772842436755209,\n    0.9773707999669629,\n    0.9774570340514706,\n    0.9775429470705839,\n    0.9776285401622385,\n    0.9777138144607745,\n    0.9777987710969442,\n    0.9778834111979205,\n    0.977967735887304,\n    0.9780517462851323,\n    0.9781354435078863,\n    0.9782188286685003,\n    0.9783019028763682,\n    0.9783846672373524,\n    0.9784671228537921,\n    0.9785492708245106,\n    0.9786311122448239,\n    0.9787126482065489,\n    0.9787938797980108,\n    0.9788748081040518,\n    0.9789554342060389,\n    0.9790357591818724,\n    0.9791157841059936,\n    0.9791955100493928,\n    0.9792749380796175,\n    0.9793540692607815,\n    0.9794329046535715,\n    0.9795114453152562,\n    0.979589692299694,\n    0.9796676466573412,\n    0.979745309435261,\n    0.97982268167713,\n    0.9798997644232479,\n    0.9799765587105447,\n    0.9800530655725895,\n    0.980129286039598,\n    0.9802052211384412,\n    0.9802808718926534,\n    0.9803562393224404,\n    0.9804313244446876,\n    0.9805061282729682,\n    0.9805806518175514,\n    0.9806548960854106,\n    0.9807288620802312,\n    0.9808025508024198,\n    0.9808759632491112,\n    0.980949100414177,\n    0.9810219632882344,\n    0.9810945528586532,\n    0.9811668701095654,\n    0.9812389160218716,\n    
0.9813106915732508,\n    0.9813821977381683,\n    0.9814534354878828,\n    0.9815244057904553,\n    0.9815951096107584,\n    0.9816655479104817,\n    0.9817357216481429,\n    0.9818056317790944,\n    0.9818752792555313,\n    0.9819446650265006,\n    0.9820137900379085,\n    0.9820826552325287,\n    0.982151261550011,\n    0.9822196099268891,\n    0.9822877012965885,\n    0.9823555365894354,\n    0.982423116732664,\n    0.9824904426504252,\n    0.9825575152637948,\n    0.9826243354907803,\n    0.9826909042463318,\n    0.9827572224423471,\n    0.9828232909876817,\n    0.982889110788156,\n    0.9829546827465642,\n    0.9830200077626816,\n    0.9830850867332734,\n    0.9831499205521023,\n    0.9832145101099363,\n    0.9832788562945579,\n    0.983342959990771,\n    0.9834068220804095,\n    0.9834704434423458,\n    0.9835338249524974,\n    0.983596967483837,\n    0.9836598719063984,\n    0.9837225390872865,\n    0.9837849698906836,\n    0.983847165177859,\n    0.9839091258071755,\n    0.9839708526340988,\n    0.9840323465112041,\n    0.9840936082881853,\n    0.9841546388118627,\n    0.9842154389261902,\n    0.9842760094722643,\n    0.9843363512883317,\n    0.9843964652097964,\n    0.9844563520692292,\n    0.9845160126963748,\n    0.9845754479181588,\n    0.9846346585586978,\n    0.9846936454393053,\n    0.9847524093785005,\n    0.984810951192016,\n    0.9848692716928064,\n    0.9849273716910543,\n    0.9849852519941804,\n    0.98504291340685,\n    0.9851003567309808,\n    0.9851575827657517,\n    0.9852145923076094,\n    0.9852713861502774,\n    0.9853279650847626,\n    0.9853843298993641,\n    0.9854404813796805,\n    0.985496420308618,\n    0.9855521474663975,\n    0.9856076636305632,\n    0.9856629695759892,\n    0.9857180660748888,\n    0.985772953896821,\n    0.9858276338086983,\n    0.985882106574795,\n    0.9859363729567544,\n    0.985990433713597,\n    0.9860442896017267,\n    0.9860979413749407,\n    0.9861513897844354,\n    0.9862046355788149,\n    
0.9862576795040977,\n    0.9863105223037255,\n    0.9863631647185701,\n    0.986415607486941,\n    0.9864678513445929,\n    0.9865198970247337,\n    0.9865717452580314,\n    0.9866233967726227,\n    0.9866748522941186,\n    0.9867261125456144,\n    0.9867771782476948,\n    0.9868280501184434,\n    0.986878728873449,\n    0.9869292152258129,\n    0.9869795098861571,\n    0.9870296135626316,\n    0.9870795269609209,\n    0.9871292507842533,\n    0.9871787857334058,\n    0.9872281325067137,\n    0.9872772918000768,\n    0.9873262643069669,\n    0.9873750507184357,\n    0.9874236517231209,\n    0.9874720680072551,\n    0.9875203002546722,\n    0.9875683491468141,\n    0.9876162153627396,\n    0.9876638995791303,\n    0.987711402470298,\n    0.9877587247081928,\n    0.9878058669624095,\n    0.9878528299001947,\n    0.9878996141864553,\n    0.9879462204837635,\n    0.9879926494523661,\n    0.9880389017501905,\n    0.9880849780328519,\n    0.9881308789536605,\n    0.9881766051636294,\n    0.9882221573114803,\n    0.9882675360436518,\n    0.9883127420043056,\n    0.9883577758353339,\n    0.9884026381763671,\n    0.9884473296647792,\n    0.9884918509356967,\n    0.9885362026220039,\n    0.9885803853543514,\n    0.9886243997611625,\n    0.9886682464686385,\n    0.9887119261007691,\n    0.988755439279336,\n    0.9887987866239217,\n    0.988841968751916,\n    0.9888849862785224,\n    0.9889278398167658,\n    0.9889705299774982,\n    0.9890130573694068,\n    0.9890554225990201,\n    0.9890976262707148,\n    0.9891396689867223,\n    0.9891815513471369,\n    0.9892232739499199,\n    0.9892648373909093,\n    0.9893062422638245,\n    0.9893474891602738,\n    0.9893885786697612,\n    0.9894295113796924,\n    0.9894702878753826,\n    0.9895109087400621,\n    0.9895513745548833,\n    0.9895916858989275,\n    0.9896318433492118,\n    0.9896718474806949,\n    0.9897116988662837,\n    0.9897513980768414,\n    0.9897909456811917,\n    0.9898303422461273,\n    0.9898695883364156,\n    
0.989908684514805,\n    0.9899476313420319,\n    0.9899864293768268,\n    0.9900250791759213,\n    0.9900635812940538,\n    0.9901019362839765,\n    0.9901401446964613,\n    0.9901782070803069,\n    0.990216123982345,\n    0.9902538959474461,\n    0.9902915235185259,\n    0.9903290072365529,\n    0.9903663476405535,\n    0.9904035452676181,\n    0.9904406006529085,\n    0.9904775143296634,\n    0.9905142868292047,\n    0.9905509186809447,\n    0.9905874104123901,\n    0.9906237625491514,\n    0.9906599756149459,\n    0.9906960501316063,\n    0.9907319866190857,\n    0.9907677855954636,\n    0.9908034475769532,\n    0.9908389730779061,\n    0.9908743626108194,\n    0.9909096166863416,\n    0.9909447358132779,\n    0.9909797204985978,\n    0.9910145712474399,\n    0.9910492885631179,\n    0.991083872947128,\n    0.991118324899153,\n    0.9911526449170697,\n    0.991186833496954,\n    0.9912208911330882,\n    0.9912548183179649,\n    0.9912886155422945,\n    0.9913222832950107,\n    0.9913558220632764,\n    0.9913892323324893,\n    0.991422514586288,\n    0.9914556693065582,\n    0.9914886969734374,\n    0.9915215980653225,\n    0.991554373058874,\n    0.9915870224290223,\n    0.9916195466489739,\n    0.9916519461902167,\n    0.9916842215225259,\n    0.9917163731139695,\n    0.9917484014309144,\n    0.9917803069380321,\n    0.9918120900983038,\n    0.9918437513730266,\n    0.991875291221819,\n    0.9919067101026269,\n    0.9919380084717284,\n    0.9919691867837398,\n    0.9920002454916221,\n    0.9920311850466846,\n    0.9920620058985926,\n    0.9920927084953715,\n    0.9921232932834126,\n    0.9921537607074791,\n    0.9921841112107114,\n    0.9922143452346321,\n    0.9922444632191525,\n    0.9922744656025764,\n    0.9923043528216078,\n    0.9923341253113541,\n    0.9923637835053328,\n    0.9923933278354767,\n    0.9924227587321393,\n    0.9924520766240994,\n    0.9924812819385679,\n    0.9925103751011916,\n    0.9925393565360598,\n    0.9925682266657089,\n    
0.9925969859111274,\n    0.9926256346917622,\n    0.9926541734255229,\n    0.9926826025287876,\n    0.992710922416408,\n    0.9927391335017144,\n    0.992767236196521,\n    0.9927952309111313,\n    0.9928231180543431,\n    0.9928508980334536,\n    0.9928785712542649,\n    0.9929061381210885,\n    0.9929335990367509,\n    0.9929609544025984,\n    0.992988204618503,\n    0.9930153500828657,\n    0.9930423911926235,\n    0.9930693283432538,\n    0.993096161928778,\n    0.9931228923417693,\n    0.993149519973355,\n    0.9931760452132232,\n    0.9932024684496267,\n    0.9932287900693889,\n    0.993255010457908,\n    0.9932811299991624,\n    0.9933071490757153,\n    0.9933330680687192,\n    0.9933588873579225,\n    0.993384607321672,\n    0.9934102283369194,\n    0.9934357507792255,\n    0.993461175022765,\n    0.9934865014403319,\n    0.9935117304033434,\n    0.9935368622818452,\n    0.9935618974445167,\n    0.9935868362586746,\n    0.9936116790902788,\n    0.993636426303936,\n    0.9936610782629057,\n    0.9936856353291037,\n    0.9937100978631075,\n    0.993734466224161,\n    0.9937587407701782,\n    0.9937829218577495,\n    0.9938070098421449,\n    0.9938310050773187,\n    0.9938549079159148,\n    0.9938787187092715,\n    0.9939024378074247,\n    0.9939260655591138,\n    0.9939496023117854,\n    0.9939730484115986,\n    0.9939964042034285,\n    0.9940196700308718,\n    0.9940428462362503,\n    0.9940659331606164,\n    0.9940889311437562,\n    0.9941118405241954,\n    0.9941346616392029,\n    0.9941573948247951,\n    0.994180040415741,\n    0.9942025987455659,\n    0.9942250701465559,\n    0.9942474549497627,\n    0.994269753485008,\n    0.9942919660808865,\n    0.9943140930647723,\n    0.9943361347628211,\n    0.9943580914999767,\n    0.9943799635999729,\n    0.99440175138534,\n    0.9944234551774075,\n    0.994445075296309,\n    0.994466612060986,\n    0.994488065789193,\n    0.9945094367975008,\n    0.9945307254013009,\n    0.9945519319148104,\n    
0.9945730566510748,\n    0.9945940999219731,\n    0.9946150620382221,\n    0.99463594330938,\n    0.9946567440438505,\n    0.9946774645488873,\n    0.994698105130598,\n    0.9947186660939477,\n    0.9947391477427643,\n    0.994759550379741,\n    0.9947798743064417,\n    0.9948001198233035,\n    0.9948202872296429,\n    0.9948403768236574,\n    0.9948603889024311,\n    0.9948803237619381,\n    0.9949001816970466,\n    0.9949199630015225,\n    0.9949396679680339,\n    0.9949592968881543,\n    0.9949788500523676,\n    0.994998327750071,\n    0.9950177302695786,\n    0.9950370578981272,\n    0.9950563109218775,\n    0.99507548962592,\n    0.9950945942942779,\n    0.9951136252099114,\n    0.9951325826547209,\n    0.9951514669095514,\n    0.9951702782541956,\n    0.9951890169673988,\n    0.995207683326861,\n    0.9952262776092429,\n    0.9952448000901669,\n    0.9952632510442234,\n    0.9952816307449724,\n    0.9952999394649492,\n    0.9953181774756662,\n    0.995336345047618,\n    0.995354442450284,\n    0.9953724699521328,\n    0.9953904278206259,\n    0.9954083163222202,\n    0.9954261357223732,\n    0.9954438862855455,\n    0.9954615682752046,\n    0.9954791819538289,\n    0.9954967275829107,\n    0.9955142054229602,\n    0.9955316157335089,\n    0.9955489587731127,\n    0.9955662347993564,\n    0.9955834440688559,\n    0.9956005868372635,\n    0.9956176633592693,\n    0.9956346738886059,\n    0.9956516186780524,\n    0.9956684979794361,\n    0.9956853120436374,\n    0.9957020611205928,\n    0.9957187454592983,\n    0.9957353653078126,\n    0.9957519209132613,\n    0.9957684125218387,\n    0.995784840378813,\n    0.9958012047285281,\n    0.9958175058144085,\n    0.9958337438789611,\n    0.9958499191637799,\n    0.9958660319095476,\n    0.9958820823560408,\n    0.9958980707421329,\n    0.9959139973057952,\n    0.995929862284104,\n    0.9959456659132401,\n    0.9959614084284943,\n    0.9959770900642708,\n    0.9959927110540886,\n    0.9960082716305863,\n    
0.9960237720255247,\n    0.9960392124697902,\n    0.9960545931933981,\n    0.9960699144254949,\n    0.9960851763943629,\n    0.9961003793274221,\n    0.9961155234512341,\n    0.9961306089915051,\n    0.9961456361730884,\n    0.9961606052199882,\n    0.9961755163553627,\n    0.9961903698015271,\n    0.9962051657799561,\n    0.9962199045112877,\n    0.9962345862153261,\n    0.9962492111110447,\n    0.9962637794165885,\n    0.9962782913492785,\n    0.9962927471256138,\n    0.9963071469612743,\n    0.9963214910711243,\n    0.9963357796692157,\n    0.9963500129687901,\n    0.9963641911822829,\n    0.996378314521325,\n    0.9963923831967464,\n    0.9964063974185798,\n    0.9964203573960624,\n    0.9964342633376391,\n    0.9964481154509656,\n    0.9964619139429115,\n    0.9964756590195629,\n    0.9964893508862253,\n    0.9965029897474263,\n    0.9965165758069192,\n    0.9965301092676847,\n    0.9965435903319345,\n    0.9965570192011142,\n    0.9965703960759055,\n    0.9965837211562298,\n    0.9965969946412504,\n    0.9966102167293753,\n    0.9966233876182606,\n    0.9966365075048124,\n    0.9966495765851902,\n    0.9966625950548095,\n    0.9966755631083445,\n    0.9966884809397303,\n    0.9967013487421672,\n    0.9967141667081211,\n    0.9967269350293284,\n    0.9967396538967973,\n    0.9967523235008112,\n    0.996764944030931,\n    0.9967775156759975,\n    0.9967900386241352,\n    0.9968025130627537,\n    0.9968149391785505,\n    0.9968273171575148,\n    0.9968396471849287,\n    0.9968519294453704,\n    0.9968641641227171,\n    0.9968763514001466,\n    0.9968884914601416,\n    0.9969005844844906,\n    0.9969126306542909,\n    0.9969246301499518,\n    0.9969365831511966,\n    0.9969484898370652,\n    0.9969603503859165,\n    0.9969721649754317,\n    0.9969839337826155,\n    0.9969956569837995,\n    0.9970073347546454,\n    0.9970189672701452,\n    0.9970305547046259,\n    0.997042097231751,\n    0.9970535950245228,\n    0.9970650482552862,\n    0.9970764570957289,\n    
0.9970878217168856,\n    0.99709914228914,\n    0.9971104189822263,\n    0.9971216519652336,\n    0.9971328414066062,\n    0.9971439874741471,\n    0.9971550903350206,\n    0.9971661501557534,\n    0.9971771671022386,\n    0.9971881413397367,\n    0.9971990730328789,\n    0.9972099623456693,\n    0.9972208094414864,\n    0.9972316144830862,\n    0.9972423776326046,\n    0.9972530990515591,\n    0.997263778900852,\n    0.9972744173407714,\n    0.9972850145309949,\n    0.9972955706305906,\n    0.9973060857980207,\n    0.9973165601911425,\n    0.9973269939672114,\n    0.9973373872828826,\n    0.9973477402942144,\n    0.9973580531566689,\n    0.9973683260251155,\n    0.9973785590538324,\n    0.9973887523965093,\n    0.9973989062062492,\n    0.9974090206355707,\n    0.9974190958364102,\n    0.997429131960124,\n    0.9974391291574908,\n    0.9974490875787134,\n    0.997459007373421,\n    0.9974688886906717,\n    0.9974787316789544,\n    0.9974885364861902,\n    0.9974983032597357,\n    0.9975080321463846,\n    0.9975177232923699,\n    0.9975273768433653,\n    0.9975369929444886,\n    0.9975465717403025,\n    0.997556113374818,\n    0.9975656179914948,\n    0.9975750857332447,\n    0.9975845167424336,\n    0.9975939111608827,\n    0.997603269129871,\n    0.9976125907901375,\n    0.9976218762818835,\n    0.9976311257447729,\n    0.9976403393179369,\n    0.9976495171399739,\n    0.9976586593489525,\n    0.9976677660824127,\n    0.9976768374773688,\n    0.9976858736703106,\n    0.9976948747972061,\n    0.9977038409935028,\n    0.9977127723941301,\n    0.9977216691335006,\n    0.997730531345513,\n    0.9977393591635534,\n    0.997748152720497,\n    0.9977569121487109,\n    0.997765637580055,\n    0.9977743291458849,\n    0.9977829869770526,\n    0.9977916112039098,\n    0.9978002019563085,\n    0.9978087593636037,\n    0.9978172835546547,\n    0.9978257746578281,\n    0.9978342328009973,\n    0.9978426581115474,\n    0.9978510507163746,\n    0.9978594107418892,\n    
0.9978677383140173,\n    0.9978760335582023,\n    0.9978842965994068,\n    0.997892527562115,\n    0.9979007265703336,\n    0.9979088937475938,\n    0.9979170292169542,\n    0.9979251331010007,\n    0.9979332055218498,\n    0.9979412466011499,\n    0.9979492564600826,\n    0.9979572352193652,\n    0.997965182999252,\n    0.9979730999195362,\n    0.9979809860995514,\n    0.9979888416581738,\n    0.9979966667138236,\n    0.9980044613844665,\n    0.998012225787616,\n    0.9980199600403347,\n    0.9980276642592362,\n    0.9980353385604862,\n    0.9980429830598053,\n    0.9980505978724697,\n    0.9980581831133134,\n    0.9980657388967297,\n    0.9980732653366725,\n    0.998080762546659,\n    0.9980882306397698,\n    0.9980956697286526,\n    0.9981030799255213,\n    0.9981104613421601,\n    0.9981178140899236,\n    0.9981251382797387,\n    0.9981324340221067,\n    0.9981397014271043,\n    0.9981469406043857,\n    0.9981541516631836,\n    0.9981613347123114,\n    0.9981684898601648,\n    0.998175617214723,\n    0.9981827168835503,\n    0.9981897889737974,\n    0.998196833592204,\n    0.9982038508450997,\n    0.9982108408384048,\n    0.9982178036776337,\n    0.998224739467894,\n    0.998231648313891,\n    0.998238530319926,\n    0.9982453855899006,\n    0.9982522142273162,\n    0.9982590163352771,\n    0.998265792016491,\n    0.9982725413732704,\n    0.9982792645075347,\n    0.998285961520812,\n    0.9982926325142392,\n    0.9982992775885648,\n    0.9983058968441496,\n    0.9983124903809688,\n    0.998319058298613,\n    0.99832560069629,\n    0.9983321176728251,\n    0.9983386093266648,\n    0.9983450757558759,\n    0.9983515170581486,\n    0.9983579333307973,\n    0.998364324670761,\n    0.9983706911746074,\n    0.9983770329385311,\n    0.9983833500583577,\n    0.9983896426295433,\n    0.9983959107471775,\n    0.9984021545059827,\n    0.998408374000318,\n    0.9984145693241788,\n    0.9984207405711987,\n    0.998426887834651,\n    0.99843301120745,\n    
0.9984391107821522,\n    0.9984451866509576,\n    0.998451238905712,\n    0.9984572676379068,\n    0.9984632729386814,\n    0.9984692548988241,\n    0.9984752136087741,\n    0.9984811491586221,\n    0.9984870616381116,\n    0.9984929511366406,\n    0.998498817743263,\n    0.9985046615466895,\n    0.9985104826352893,\n    0.998516281097091,\n    0.9985220570197841,\n    0.9985278104907205,\n    0.9985335415969153,\n    0.998539250425049,\n    0.9985449370614672,\n    0.9985506015921835,\n    0.9985562441028796,\n    0.9985618646789075,\n    0.9985674634052898,\n    0.9985730403667223,\n    0.9985785956475733,\n    0.9985841293318867,\n    0.9985896415033819,\n    0.9985951322454563,\n    0.9986006016411856,\n    0.9986060497733246,\n    0.99861147672431,\n    0.9986168825762602,\n    0.9986222674109773,\n    0.9986276313099476,\n    0.9986329743543437,\n    0.9986382966250248,\n    0.9986435982025386,\n    0.9986488791671222,\n    0.998654139598703,\n    0.9986593795769002,\n    0.9986645991810267,\n    0.9986697984900882,\n    0.9986749775827868,\n    0.9986801365375205,\n    0.9986852754323851,\n    0.9986903943451748,\n    0.9986954933533843,\n    0.9987005725342086,\n    0.9987056319645456,\n    0.9987106717209965,\n    0.9987156918798661,\n    0.9987206925171654,\n    0.9987256737086126,\n    0.9987306355296329,\n    0.9987355780553607,\n    0.9987405013606405,\n    0.9987454055200282,\n    0.9987502906077915,\n    0.9987551566979116,\n    0.9987600038640845,\n    0.9987648321797215,\n    0.9987696417179502,\n    0.998774432551617,\n    0.9987792047532855,\n    0.9987839583952408,\n    0.9987886935494876,\n    0.9987934102877537,\n    0.998798108681489,\n    0.9988027888018683,\n    0.9988074507197912,\n    0.9988120945058837,\n    0.9988167202304989,\n    0.9988213279637184,\n    0.9988259177753532,\n    0.9988304897349445,\n    0.9988350439117649,\n    0.99883958037482,\n    0.9988440991928482,\n    0.9988486004343227,\n    0.9988530841674523,\n    
0.9988575504601823,\n    0.9988619993801953,\n    0.9988664309949126,\n    0.9988708453714951,\n    0.9988752425768442,\n    0.9988796226776031,\n    0.9988839857401568,\n    0.9988883318306349,\n    0.99889266101491,\n    0.998896973358602,\n    0.9989012689270753,\n    0.9989055477854435,\n    0.9989098099985672,\n    0.9989140556310573,\n    0.9989182847472744,\n    0.9989224974113308,\n    0.9989266936870904,\n    0.998930873638171,\n    0.9989350373279441,\n    0.9989391848195363,\n    0.9989433161758301,\n    0.9989474314594652,\n    0.9989515307328389,\n    0.9989556140581076,\n    0.998959681497187,\n    0.9989637331117535,\n    0.9989677689632452,\n    0.9989717891128628,\n    0.9989757936215702,\n    0.9989797825500956,\n    0.9989837559589321,\n    0.9989877139083393,\n    0.9989916564583438,\n    0.9989955836687396,\n    0.99899949559909,\n    0.9990033923087277,\n    0.9990072738567558,\n    0.9990111403020493,\n    0.9990149917032549,\n    0.999018828118793,\n    0.9990226496068572,\n    0.9990264562254172,\n    0.9990302480322174,\n    0.9990340250847796,\n    0.9990377874404025,\n    0.9990415351561638,\n    0.9990452682889194,\n    0.9990489868953062,\n    0.9990526910317417,\n    0.9990563807544247,\n    0.9990600561193372,\n    0.9990637171822441,\n    0.9990673639986946,\n    0.9990709966240232,\n    0.9990746151133504,\n    0.9990782195215828,\n    0.9990818099034151,\n    0.99908538631333,\n    0.9990889488055994,\n    0.9990924974342854,\n    0.9990960322532406,\n    0.9990995533161093,\n    0.9991030606763281,\n    0.9991065543871269,\n    0.9991100345015295,\n    0.9991135010723542,\n    0.9991169541522155,\n    0.9991203937935235,\n    0.9991238200484857,\n    0.9991272329691079,\n    0.9991306326071938,\n    0.9991340190143472,\n    0.9991373922419715,\n    0.999140752341272,\n    0.9991440993632549,\n    0.999147433358729,\n    0.9991507543783068,\n    0.9991540624724046,\n    0.9991573576912437,\n    0.9991606400848502,\n    
0.9991639097030574,\n    0.999167166595505,\n    0.9991704108116409,\n    0.9991736424007212,\n    0.9991768614118112,\n    0.9991800678937864,\n    0.999183261895333,\n    0.9991864434649486,\n    0.9991896126509426,\n    0.9991927695014381,\n    0.9991959140643708,\n    0.9991990463874916,\n    0.9992021665183656,\n    0.9992052745043747,\n    0.9992083703927161,\n    0.9992114542304054,\n    0.9992145260642746,\n    0.9992175859409759,\n    0.9992206339069791,\n    0.9992236700085755,\n    0.9992266942918758,\n    0.9992297068028129,\n    0.9992327075871412,\n    0.9992356966904383,\n    0.9992386741581051,\n    0.9992416400353659,\n    0.9992445943672705,\n    0.9992475371986941,\n    0.9992504685743375,\n    0.999253388538729,\n    0.9992562971362237,\n    0.999259194411005,\n    0.9992620804070853,\n    0.9992649551683059,\n    0.9992678187383391,\n    0.999270671160687,\n    0.9992735124786836,\n    0.9992763427354946,\n    0.9992791619741188,\n    0.9992819702373881,\n    0.9992847675679684,\n    0.9992875540083602,\n    0.9992903296008995,\n    0.9992930943877577,\n    0.9992958484109428,\n    0.9992985917123008,\n    0.9993013243335144,\n    0.999304046316105,\n    0.9993067577014336,\n    0.9993094585307,\n    0.9993121488449448,\n    0.9993148286850494,\n    0.9993174980917366,\n    0.9993201571055714,\n    0.9993228057669614,\n    0.9993254441161578,\n    0.9993280721932551,\n    0.9993306900381931,\n    0.9993332976907565,\n    0.9993358951905751,\n    0.999338482577126,\n    0.9993410598897322,\n    0.9993436271675653,\n    0.9993461844496442,\n    0.9993487317748365,\n    0.9993512691818597,\n    0.9993537967092804,\n    0.9993563143955159,\n    0.9993588222788345,\n    0.9993613203973564,\n    0.9993638087890532,\n    0.9993662874917502,\n    0.9993687565431251,\n    0.9993712159807097,\n    0.9993736658418905,\n    0.9993761061639089,\n    0.9993785369838617,\n    0.9993809583387016,\n    0.9993833702652382,\n    0.9993857728001386,\n    
0.9993881659799271,\n    0.9993905498409864,\n    0.9993929244195585,\n    0.9993952897517442,\n    0.9993976458735048,\n    0.9993999928206616,\n    0.999402330628897,\n    0.9994046593337557,\n    0.999406978970643,\n    0.9994092895748281,\n    0.9994115911814431,\n    0.9994138838254832,\n    0.9994161675418086,\n    0.9994184423651439,\n    0.9994207083300786,\n    0.9994229654710683,\n    0.9994252138224351,\n    0.9994274534183676,\n    0.9994296842929216,\n    0.9994319064800213,\n    0.9994341200134585,\n    0.9994363249268947,\n    0.99943852125386,\n    0.9994407090277545,\n    0.9994428882818492,\n    0.9994450590492857,\n    0.9994472213630764,\n    0.9994493752561064,\n    0.9994515207611326,\n    0.9994536579107853,\n    0.9994557867375674,\n    0.9994579072738564,\n    0.9994600195519033,\n    0.999462123603835,\n    0.9994642194616526,\n    0.9994663071572335,\n    0.9994683867223316,\n    0.9994704581885768,\n    0.9994725215874769,\n    0.9994745769504169,\n    0.9994766243086606,\n    0.9994786636933496,\n    0.999480695135505,\n    0.9994827186660278,\n    0.9994847343156984,\n    0.9994867421151781,\n    0.9994887420950092,\n    0.9994907342856149,\n    0.9994927187173008,\n    0.9994946954202548,\n    0.9994966644245472,\n    0.999498625760132,\n    0.9995005794568467,\n    0.9995025255444128,\n    0.9995044640524365,\n    0.9995063950104092,\n    0.9995083184477073,\n    0.9995102343935939,\n    0.9995121428772178,\n    0.9995140439276147,\n    0.9995159375737079,\n    0.9995178238443079,\n    0.9995197027681136,\n    0.9995215743737126,\n    0.999523438689581,\n    0.9995252957440849,\n    0.9995271455654797,\n    0.9995289881819112,\n    0.999530823621416,\n    0.999532651911922,\n    0.9995344730812479,\n    0.9995362871571053,\n    0.9995380941670976,\n    0.999539894138721,\n    0.999541687099365,\n    0.9995434730763126,\n    0.9995452520967412,\n    0.999547024187722,\n    0.9995487893762218,\n    0.999550547689102,\n    
0.9995522991531197,\n    0.9995540437949288,\n    0.9995557816410785,\n    0.999557512718016,\n    0.9995592370520848,\n    0.9995609546695269,\n    0.9995626655964818,\n    0.9995643698589873,\n    0.9995660674829807,\n    0.9995677584942982,\n    0.9995694429186754,\n    0.9995711207817481,\n    0.9995727921090527,\n    0.9995744569260261,\n    0.9995761152580062,\n    0.9995777671302333,\n    0.9995794125678485,\n    0.9995810515958958,\n    0.9995826842393223,\n    0.9995843105229772,\n    0.9995859304716141,\n    0.9995875441098898,\n    0.9995891514623653,\n    0.9995907525535066,\n    0.9995923474076841,\n    0.9995939360491741,\n    0.9995955185021579,\n    0.9995970947907232,\n    0.9995986649388644,\n    0.9996002289704817,\n    0.9996017869093836,\n    0.9996033387792852,\n    0.9996048846038099,\n    0.9996064244064888,\n    0.9996079582107623,\n    0.9996094860399791,\n    0.9996110079173974,\n    0.9996125238661848,\n    0.9996140339094192,\n    0.9996155380700886,\n    0.9996170363710914,\n    0.9996185288352377,\n    0.999620015485248,\n    0.9996214963437557,\n    0.9996229714333049,\n    0.999624440776353,\n    0.9996259043952698,\n    0.9996273623123381,\n    0.9996288145497543,\n    0.9996302611296282,\n    0.999631702073984,\n    0.99963313740476,\n    0.9996345671438095,\n    0.9996359913129005,\n    0.9996374099337169,\n    0.9996388230278576,\n    0.9996402306168382,\n    0.9996416327220904,\n    0.9996430293649623,\n    0.9996444205667198,\n    0.9996458063485449,\n    0.9996471867315387,\n    0.9996485617367191,\n    0.999649931385023,\n    0.9996512956973053,\n    0.9996526546943404,\n    0.9996540083968218,\n    0.999655356825362,\n    0.9996567000004942,\n    0.9996580379426713,\n    0.9996593706722667,\n    0.9996606982095744,\n    0.9996620205748099,\n    0.9996633377881102,\n    0.9996646498695336,\n    0.9996659568390602,\n    0.9996672587165932,\n    0.9996685555219577,\n    0.9996698472749022,\n    0.9996711339950981,\n    
0.9996724157021403,\n    0.9996736924155477,\n    0.9996749641547633,\n    0.9996762309391545,\n    0.9996774927880132,\n    0.9996787497205563,\n    0.9996800017559261,\n    0.9996812489131904,\n    0.9996824912113431,\n    0.9996837286693037,\n    0.9996849613059188,\n    0.999686189139961,\n    0.9996874121901304,\n    0.9996886304750542,\n    0.9996898440132873,\n    0.9996910528233123,\n    0.9996922569235399,\n    0.9996934563323092,\n    0.999694651067888,\n    0.9996958411484734,\n    0.9996970265921912,\n    0.9996982074170968,\n    0.9996993836411757,\n    0.9997005552823433,\n    0.9997017223584449,\n    0.999702884887257,\n    0.999704042886487,\n    0.9997051963737726,\n    0.999706345366684,\n    0.9997074898827222,\n    0.9997086299393205,\n    0.9997097655538445,\n    0.9997108967435919,\n    0.9997120235257934,\n    0.9997131459176125,\n    0.9997142639361463,\n    0.9997153775984248,\n    0.9997164869214124,\n    0.9997175919220068,\n    0.9997186926170408,\n    0.9997197890232813,\n    0.9997208811574295,\n    0.9997219690361224,\n    0.9997230526759323,\n    0.9997241320933663,\n    0.9997252073048677,\n    0.9997262783268163,\n    0.9997273451755272,\n    0.9997284078672529,\n    0.9997294664181822,\n    0.9997305208444413,\n    0.999731571162093,\n    0.9997326173871383,\n    0.9997336595355156,\n    0.9997346976231016,\n    0.999735731665711,\n    0.9997367616790969,\n    0.9997377876789509,\n    0.9997388096809043,\n    0.9997398277005269,\n    0.9997408417533283,\n    0.9997418518547578,\n    0.9997428580202044,\n    0.9997438602649966,\n    0.9997448586044048,\n    0.9997458530536386,\n    0.9997468436278489,\n    0.9997478303421281,\n    0.9997488132115092,\n    0.9997497922509672,\n    0.9997507674754179,\n    0.9997517388997206,\n    0.9997527065386754,\n    0.9997536704070258,\n    0.9997546305194572,\n    0.9997555868905984,\n    0.9997565395350211,\n    0.9997574884672398,\n    0.9997584337017134,\n    0.9997593752528438,\n    
0.9997603131349773,\n    0.9997612473624043,\n    0.9997621779493597,\n    0.9997631049100225,\n    0.9997640282585171,\n    0.9997649480089127,\n    0.9997658641752237,\n    0.9997667767714098,\n    0.9997676858113772,\n    0.999768591308977,\n    0.9997694932780068,\n    0.9997703917322106,\n    0.9997712866852789,\n    0.9997721781508487,\n    0.999773066142504,\n    0.9997739506737762,\n    0.9997748317581437,\n    0.9997757094090328,\n    0.9997765836398173,\n    0.9997774544638187,\n    0.9997783218943074,\n    0.9997791859445015,\n    0.9997800466275678,\n    0.9997809039566223,\n    0.9997817579447292,\n    0.9997826086049023,\n    0.9997834559501049,\n    0.9997842999932496,\n    0.9997851407471986,\n    0.9997859782247642,\n    0.9997868124387095,\n    0.9997876434017465,\n    0.9997884711265388,\n    0.9997892956257006,\n    0.9997901169117969,\n    0.9997909349973435,\n    0.9997917498948077,\n    0.9997925616166083,\n    0.9997933701751162,\n    0.9997941755826532,\n    0.9997949778514938,\n    0.9997957769938647,\n    0.9997965730219448,\n    0.9997973659478657,\n    0.9997981557837118,\n    0.9997989425415202,\n    0.9997997262332818,\n    0.9998005068709402,\n    0.9998012844663927,\n    0.9998020590314901,\n    0.9998028305780378,\n    0.9998035991177943,\n    0.9998043646624727,\n    0.999805127223741,\n    0.9998058868132211,\n    0.9998066434424898,\n    0.9998073971230789,\n    0.9998081478664758,\n    0.9998088956841223,\n    0.9998096405874162,\n    0.9998103825877107,\n    0.9998111216963155,\n    0.9998118579244951,\n    0.999812591283471,\n    0.9998133217844207,\n    0.9998140494384784,\n    0.9998147742567347,\n    0.9998154962502372,\n    0.9998162154299904,\n    0.9998169318069563,\n    0.9998176453920533,\n    0.9998183561961586,\n    0.9998190642301059,\n    0.9998197695046875,\n    0.9998204720306533,\n    0.9998211718187113,\n    0.9998218688795282,\n    0.9998225632237286,\n    0.9998232548618965,\n    0.9998239438045738,\n    
0.9998246300622622,\n    0.999825313645422,\n    0.9998259945644729,\n    0.999826672829794,\n    0.999827348451724,\n    0.9998280214405616,\n    0.9998286918065652,\n    0.9998293595599529,\n    0.9998300247109037,\n    0.9998306872695565,\n    0.9998313472460109,\n    0.9998320046503272,\n    0.9998326594925261,\n    0.9998333117825905,\n    0.9998339615304626,\n    0.9998346087460476,\n    0.9998352534392112,\n    0.9998358956197809,\n    0.9998365352975459,\n    0.9998371724822572,\n    0.9998378071836282,\n    0.9998384394113344,\n    0.9998390691750131,\n    0.9998396964842645,\n    0.9998403213486515,\n    0.9998409437776998,\n    0.9998415637808975,\n    0.9998421813676962,\n    0.9998427965475107,\n    0.9998434093297189,\n    0.9998440197236627,\n    0.9998446277386465,\n    0.99984523338394,\n    0.9998458366687758,\n    0.9998464376023504,\n    0.9998470361938254,\n    0.9998476324523258,\n    0.9998482263869415,\n    0.9998488180067273,\n    0.9998494073207022,\n    0.9998499943378502,\n    0.9998505790671203,\n    0.999851161517427,\n    0.99985174169765,\n    0.999852319616634,\n    0.9998528952831894,\n    0.9998534687060929,\n    0.9998540398940859,\n    0.9998546088558771,\n    0.99985517560014,\n    0.9998557401355151,\n    0.9998563024706094,\n    0.9998568626139955,\n    0.9998574205742137,\n    0.9998579763597699,\n    0.9998585299791383,\n    0.9998590814407583,\n    0.9998596307530382,\n    0.9998601779243528,\n    0.9998607229630435,\n    0.9998612658774207,\n    0.9998618066757615,\n    0.9998623453663109,\n    0.9998628819572821,\n    0.9998634164568558,\n    0.9998639488731812,\n    0.9998644792143758,\n    0.9998650074885256,\n    0.9998655337036843,\n    0.9998660578678753,\n    0.9998665799890902,\n    0.9998671000752894,\n    0.9998676181344025,\n    0.9998681341743284,\n    0.9998686482029348,\n    0.9998691602280588,\n    0.9998696702575076,\n    0.9998701782990573,\n    0.9998706843604539,\n    0.9998711884494135,\n    
0.9998716905736217,\n    0.9998721907407344,\n    0.9998726889583779,\n    0.9998731852341485,\n    0.9998736795756129,\n    0.9998741719903086,\n    0.9998746624857435,\n    0.9998751510693965,\n    0.9998756377487168,\n    0.9998761225311253,\n    0.9998766054240137,\n    0.9998770864347449,\n    0.999877565570653,\n    0.9998780428390438,\n    0.9998785182471945,\n    0.9998789918023538,\n    0.9998794635117424,\n    0.999879933382553,\n    0.9998804014219498,\n    0.9998808676370696,\n    0.9998813320350209,\n    0.9998817946228854,\n    0.9998822554077161,\n    0.9998827143965393,\n    0.9998831715963533,\n    0.9998836270141301,\n    0.9998840806568136,\n    0.9998845325313211,\n    0.9998849826445431,\n    0.9998854310033425,\n    0.9998858776145566,\n    0.9998863224849951,\n    0.9998867656214416,\n    0.9998872070306536,\n    0.9998876467193613,\n    0.9998880846942698,\n    0.9998885209620572,\n    0.9998889555293764,\n    0.9998893884028537,\n    0.9998898195890897,\n    0.9998902490946596,\n    0.9998906769261129,\n    0.9998911030899734,\n    0.9998915275927396,\n    0.9998919504408847,\n    0.9998923716408566,\n    0.9998927911990784,\n    0.9998932091219477,\n    0.9998936254158373,\n    0.9998940400870956,\n    0.9998944531420457,\n    0.9998948645869864,\n    0.9998952744281919,\n    0.9998956826719122,\n    0.9998960893243724,\n    0.9998964943917736,\n    0.999896897880293,\n    0.9998972997960838,\n    0.9998977001452741,\n    0.99989809893397,\n    0.999898496168252,\n    0.9998988918541782,\n    0.9998992859977825,\n    0.9998996786050756,\n    0.9999000696820444,\n    0.9999004592346524,\n    0.9999008472688409,\n    0.9999012337905265,\n    0.9999016188056041,\n    0.999902002319945,\n    0.9999023843393975,\n    0.9999027648697877,\n    0.9999031439169183,\n    0.99990352148657,\n    0.9999038975845005,\n    0.9999042722164455,\n    0.9999046453881179,\n    0.9999050171052088,\n    0.999905387373387,\n    0.9999057561982988,\n    
0.9999061235855692,\n    0.9999064895408006,\n    0.9999068540695742,\n    0.999907217177449,\n    0.9999075788699625,\n    0.9999079391526307,\n    0.999908298030948,\n    0.9999086555103874,\n    0.9999090115964009,\n    0.9999093662944185,\n    0.9999097196098496,\n    0.999910071548083,\n    0.9999104221144851,\n    0.999910771314403,\n    0.999911119153162,\n    0.9999114656360666,\n    0.9999118107684012,\n    0.9999121545554291,\n    0.9999124970023934,\n    0.9999128381145168,\n    0.9999131778970014,\n    0.9999135163550291,\n    0.9999138534937618,\n    0.9999141893183413,\n    0.999914523833889,\n    0.999914857045507,\n    0.9999151889582765,\n    0.9999155195772598,\n    0.9999158489074993,\n    0.9999161769540174,\n    0.9999165037218174,\n    0.9999168292158827,\n    0.9999171534411778,\n    0.9999174764026472,\n    0.9999177981052164,\n    0.9999181185537922,\n    0.9999184377532614,\n    0.9999187557084925,\n    0.9999190724243346,\n    0.9999193879056183,\n    0.9999197021571548,\n    0.999920015183737,\n    0.9999203269901391,\n    0.9999206375811165,\n    0.9999209469614062,\n    0.9999212551357268,\n    0.9999215621087787,\n    0.9999218678852432,\n    0.9999221724697843,\n    0.9999224758670472,\n    0.9999227780816594,\n    0.99992307911823,\n    0.9999233789813504,\n    0.9999236776755942,\n    0.999923975205517,\n    0.9999242715756563,\n    0.9999245667905327,\n    0.9999248608546487,\n    0.9999251537724895,\n    0.9999254455485221,\n    0.9999257361871973,\n    0.9999260256929475,\n    0.9999263140701885,\n    0.9999266013233187,\n    0.9999268874567189,\n    0.9999271724747535,\n    0.9999274563817697,\n    0.9999277391820977,\n    0.9999280208800507,\n    0.9999283014799253,\n    0.9999285809860011,\n    0.9999288594025413,\n    0.9999291367337926,\n    0.9999294129839844,\n    0.9999296881573309,\n    0.9999299622580285,\n    0.9999302352902583,\n    0.9999305072581846,\n    0.9999307781659553,\n    0.9999310480177028,\n    
0.9999313168175428,\n    0.9999315845695751,\n    0.9999318512778838,\n    0.9999321169465368,\n    0.9999323815795863,\n    0.9999326451810684,\n    0.999932907755004,\n    0.999933169305398,\n    0.9999334298362395,\n    0.9999336893515024,\n    0.9999339478551454,\n    0.9999342053511108,\n    0.9999344618433266,\n    0.9999347173357047,\n    0.9999349718321424,\n    0.9999352253365212,\n    0.9999354778527079,\n    0.9999357293845541,\n    0.9999359799358962,\n    0.9999362295105562,\n    0.9999364781123405,\n    0.9999367257450412,\n    0.9999369724124355,\n    0.9999372181182858,\n    0.9999374628663398,\n    0.9999377066603307,\n    0.9999379495039771,\n    0.999938191400983,\n    0.9999384323550382,\n    0.9999386723698181,\n    0.9999389114489835,\n    0.9999391495961814,\n    0.9999393868150438,\n    0.9999396231091898,\n    0.9999398584822231,\n    0.9999400929377339,\n    0.9999403264792988,\n    0.9999405591104797,\n    0.9999407908348255,\n    0.9999410216558702,\n    0.9999412515771349,\n    0.9999414806021265,\n    0.9999417087343389,\n    0.9999419359772515,\n    0.9999421623343305,\n    0.9999423878090287,\n    0.9999426124047855,\n    0.999942836125027,\n    0.9999430589731653,\n    0.9999432809525995,\n    0.9999435020667163,\n    0.9999437223188877,\n    0.9999439417124741,\n    0.9999441602508214,\n    0.9999443779372634,\n    0.9999445947751209,\n    0.9999448107677009,\n    0.9999450259182986,\n    0.9999452402301955,\n    0.9999454537066609,\n    0.9999456663509511,\n    0.9999458781663095,\n    0.9999460891559674,\n    0.9999462993231432,\n    0.9999465086710425,\n    0.9999467172028589,\n    0.9999469249217734,\n    0.9999471318309542,\n    0.9999473379335577,\n    0.9999475432327278,\n    0.999947747731596,\n    0.999947951433282,\n    0.9999481543408928,\n    0.9999483564575237,\n    0.9999485577862579,\n    0.999948758330166,\n    0.9999489580923077,\n    0.99994915707573,\n    0.9999493552834681,\n    0.9999495527185456,\n    
0.9999497493839742,\n    0.9999499452827538,\n    0.9999501404178727,\n    0.9999503347923077,\n    0.9999505284090237,\n    0.9999507212709743,\n    0.999950913381101,\n    0.9999511047423352,\n    0.9999512953575951,\n    0.9999514852297888,\n    0.9999516743618128,\n    0.9999518627565519,\n    0.9999520504168802,\n    0.9999522373456601,\n    0.9999524235457434,\n    0.9999526090199702,\n    0.99995279377117,\n    0.9999529778021611,\n    0.9999531611157506,\n    0.9999533437147349,\n    0.9999535256018995,\n    0.999953706780019,\n    0.9999538872518573,\n    0.9999540670201674,\n    0.9999542460876916,\n    0.9999544244571611,\n    0.9999546021312976,\n    0.9999547791128108,\n    0.9999549554044008,\n    0.999955131008757,\n    0.9999553059285579,\n    0.9999554801664718,\n    0.999955653725157,\n    0.999955826607261,\n    0.9999559988154207,\n    0.9999561703522636,\n    0.9999563412204061,\n    0.9999565114224549,\n    0.9999566809610065,\n    0.9999568498386471,\n    0.9999570180579529,\n    0.9999571856214898,\n    0.9999573525318144,\n    0.9999575187914727,\n    0.999957684403001,\n    0.9999578493689257,\n    0.9999580136917633,\n    0.9999581773740205,\n    0.9999583404181943,\n    0.999958502826772,\n    0.999958664602231,\n    0.9999588257470395,\n    0.9999589862636554,\n    0.9999591461545276,\n    0.9999593054220953,\n    0.999959464068788,\n    0.9999596220970259,\n    0.9999597795092198,\n    0.9999599363077709,\n    0.9999600924950712,\n    0.9999602480735034,\n    0.999960403045441,\n    0.9999605574132482,\n    0.9999607111792794,\n    0.9999608643458809,\n    0.999961016915389,\n    0.9999611688901312,\n    0.999961320272426,\n    0.9999614710645828,\n    0.9999616212689018,\n    0.9999617708876745,\n    0.9999619199231834,\n    0.9999620683777022,\n    0.9999622162534955,\n    0.9999623635528191,\n    0.9999625102779202,\n    0.9999626564310371,\n    0.9999628020143996,\n    0.9999629470302284,\n    0.9999630914807359,\n    
0.9999632353681258,\n    0.9999633786945932,\n    0.9999635214623243,\n    0.9999636636734973,\n    0.9999638053302817,\n    0.9999639464348384,\n    0.9999640869893204,\n    0.9999642269958717,\n    0.9999643664566282,\n    0.9999645053737171,\n    0.9999646437492582,\n    0.9999647815853623,\n    0.9999649188841321,\n    0.9999650556476621,\n    0.9999651918780389,\n    0.9999653275773406,\n    0.9999654627476376,\n    0.9999655973909919,\n    0.9999657315094574,\n    0.9999658651050805,\n    0.9999659981798991,\n    0.9999661307359431,\n    0.9999662627752351,\n    0.9999663942997896,\n    0.9999665253116128,\n    0.9999666558127032,\n    0.9999667858050522,\n    0.9999669152906423,\n    0.9999670442714497,\n    0.9999671727494412,\n    0.9999673007265776,\n    0.999967428204811,\n    0.9999675551860863,\n    0.9999676816723402,\n    0.9999678076655029,\n    0.9999679331674964,\n    0.9999680581802354,\n    0.9999681827056268,\n    0.9999683067455709,\n    0.9999684303019593,\n    0.9999685533766776,\n    0.9999686759716031,\n    0.9999687980886061,\n    0.9999689197295496,\n    0.9999690408962896,\n    0.9999691615906743,\n    0.9999692818145453,\n    0.9999694015697365,\n    0.999969520858075,\n    0.9999696396813804,\n    0.9999697580414662,\n    0.9999698759401372,\n    0.9999699933791927,\n    0.9999701103604238,\n    0.9999702268856159,\n    0.9999703429565462,\n    0.9999704585749857,\n    0.9999705737426979,\n    0.9999706884614404,\n    0.9999708027329629,\n    0.9999709165590092,\n    0.9999710299413154,\n    0.9999711428816113,\n    0.9999712553816202,\n    0.9999713674430583,\n    0.9999714790676352,\n    0.999971590257054,\n    0.9999717010130107,\n    0.9999718113371955,\n    0.9999719212312911,\n    0.9999720306969742,\n    0.9999721397359149,\n    0.9999722483497768,\n    0.9999723565402167,\n    0.9999724643088853,\n    0.999972571657427,\n    0.9999726785874792,\n    0.9999727851006732,\n    0.9999728911986345,\n    0.9999729968829811,\n    
0.999973102155326,\n    0.9999732070172748,\n    0.9999733114704275,\n    0.9999734155163775,\n    0.9999735191567124,\n    0.9999736223930134,\n    0.9999737252268551,\n    0.9999738276598068,\n    0.9999739296934315,\n    0.999974031329285,\n    0.9999741325689185,\n    0.9999742334138766,\n    0.9999743338656978,\n    0.9999744339259143,\n    0.9999745335960529,\n    0.9999746328776344,\n    0.999974731772173,\n    0.9999748302811781,\n    0.999974928406152,\n    0.9999750261485922,\n    0.9999751235099896,\n    0.99997522049183,\n    0.9999753170955924,\n    0.9999754133227512,\n    0.9999755091747741,\n    0.9999756046531239,\n    0.9999756997592568,\n    0.9999757944946239,\n    0.999975888860671,\n    0.9999759828588372,\n    0.999976076490557,\n    0.9999761697572584,\n    0.9999762626603651,\n    0.999976355201294,\n    0.999976447381457,\n    0.9999765392022604,\n    0.9999766306651053,\n    0.9999767217713869,\n    0.9999768125224954,\n    0.9999769029198151,\n    0.9999769929647252,\n    0.9999770826585999,\n    0.9999771720028072,\n    0.9999772609987101,\n    0.9999773496476667,\n    0.9999774379510297,\n    0.9999775259101457,\n    0.9999776135263571,\n    0.9999777008010005,\n    0.9999777877354074,\n    0.9999778743309043,\n    0.9999779605888122,\n    0.9999780465104471,\n    0.9999781320971202,\n    0.999978217350137,\n    0.999978302270798,\n    0.9999783868603993,\n    0.9999784711202312,\n    0.9999785550515792,\n    0.9999786386557237,\n    0.9999787219339408,\n    0.9999788048875003,\n    0.9999788875176685,\n    0.9999789698257058,\n    0.9999790518128678,\n    0.9999791334804055,\n    0.9999792148295649,\n    0.9999792958615872,\n    0.9999793765777086,\n    0.9999794569791607,\n    0.99997953706717,\n    0.9999796168429586,\n    0.9999796963077434,\n    0.9999797754627371,\n    0.999979854309147,\n    0.9999799328481763,\n    0.9999800110810233,\n    0.9999800890088814,\n    0.9999801666329398,\n    0.9999802439543825,\n    
0.9999803209743893,\n    0.9999803976941355,\n    0.9999804741147913,\n    0.9999805502375229,\n    0.9999806260634916,\n    0.9999807015938544,\n    0.9999807768297634,\n    0.9999808517723667,\n    0.9999809264228077,\n    0.9999810007822253,\n    0.999981074851754,\n    0.9999811486325237,\n    0.9999812221256604,\n    0.9999812953322854,\n    0.9999813682535154,\n    0.9999814408904628,\n    0.9999815132442363,\n    0.9999815853159396,\n    0.9999816571066721,\n    0.9999817286175293,\n    0.9999817998496026,\n    0.999981870803978,\n    0.9999819414817388,\n    0.999982011883963,\n    0.9999820820117249,\n    0.9999821518660943,\n    0.9999822214481369,\n    0.9999822907589145,\n    0.9999823597994846,\n    0.9999824285709003,\n    0.9999824970742113,\n    0.9999825653104624,\n    0.9999826332806948,\n    0.9999827009859457,\n    0.999982768427248,\n    0.9999828356056304,\n    0.9999829025221182,\n    0.9999829691777323,\n    0.9999830355734896,\n    0.9999831017104029,\n    0.9999831675894818,\n    0.999983233211731,\n    0.999983298578152,\n    0.9999833636897418,\n    0.9999834285474941,\n    0.9999834931523983,\n    0.9999835575054402,\n    0.9999836216076015,\n    0.9999836854598604,\n    0.9999837490631911,\n    0.9999838124185639,\n    0.9999838755269453,\n    0.9999839383892987,\n    0.9999840010065828,\n    0.9999840633797528,\n    0.9999841255097607,\n    0.9999841873975545,\n    0.9999842490440781,\n    0.9999843104502723,\n    0.999984371617074,\n    0.999984432545416,\n    0.9999844932362287,\n    0.9999845536904376,\n    0.9999846139089651,\n    0.99998467389273,\n    0.9999847336426476,\n    0.9999847931596293,\n    0.9999848524445835,\n    0.9999849114984143,\n    0.9999849703220232,\n    0.9999850289163075,\n    0.9999850872821608,\n    0.9999851454204743,\n    0.9999852033321346,\n    0.9999852610180254,\n    0.999985318479027,\n    0.9999853757160159,\n    0.9999854327298654,\n    0.9999854895214452,\n    0.9999855460916223,\n    
0.9999856024412594,\n    0.9999856585712167,\n    0.9999857144823499,\n    0.9999857701755127,\n    0.9999858256515545,\n    0.9999858809113218,\n    0.9999859359556579,\n    0.9999859907854024,\n    0.999986045401392,\n    0.9999860998044598,\n    0.9999861539954363,\n    0.9999862079751479,\n    0.9999862617444183,\n    0.9999863153040679,\n    0.9999863686549141,\n    0.9999864217977705,\n    0.9999864747334485,\n    0.9999865274627551,\n    0.9999865799864952,\n    0.9999866323054701,\n    0.9999866844204783,\n    0.9999867363323144,\n    0.9999867880417712,\n    0.9999868395496371,\n    0.999986890856698,\n    0.9999869419637368,\n    0.9999869928715335,\n    0.9999870435808645,\n    0.9999870940925041,\n    0.999987144407222,\n    0.9999871945257869,\n    0.9999872444489627,\n    0.9999872941775114,\n    0.999987343712192,\n    0.9999873930537599,\n    0.9999874422029684,\n    0.9999874911605668,\n    0.9999875399273026,\n    0.9999875885039194,\n    0.9999876368911587,\n    0.9999876850897587,\n    0.9999877331004549,\n    0.9999877809239794,\n    0.9999878285610626,\n    0.9999878760124306,\n    0.9999879232788079,\n    0.9999879703609155,\n    0.9999880172594717,\n    0.999988063975192,\n    0.9999881105087897,\n    0.9999881568609741,\n    0.9999882030324528,\n    0.9999882490239302,\n    0.999988294836108,\n    0.9999883404696851,\n    0.999988385925358,\n    0.9999884312038202,\n    0.9999884763057625,\n    0.999988521231873,\n    0.999988565982837,\n    0.9999886105593377,\n    0.9999886549620552,\n    0.9999886991916666,\n    0.9999887432488469,\n    0.9999887871342686,\n    0.9999888308486011,\n    0.9999888743925112,\n    0.9999889177666637,\n    0.99998896097172,\n    0.9999890040083393,\n    0.9999890468771786,\n    0.9999890895788917,\n    0.9999891321141301,\n    0.9999891744835431,\n    0.999989216687777,\n    0.9999892587274757,\n    0.9999893006032806,\n    0.9999893423158307,\n    0.9999893838657626,\n    0.9999894252537099,\n    
0.9999894664803045,\n    0.999989507546175,\n    0.9999895484519483,\n    0.9999895891982484,\n    0.9999896297856969,\n    0.9999896702149134,\n    0.9999897104865146,\n    0.9999897506011148,\n    0.9999897905593262,\n    0.9999898303617587,\n    0.9999898700090192,\n    0.9999899095017128,\n    0.9999899488404421,\n    0.9999899880258071,\n    0.9999900270584062,\n    0.9999900659388344,\n    0.9999901046676853,\n    0.9999901432455499,\n    0.9999901816730163,\n    0.9999902199506713,\n    0.9999902580790988,\n    0.9999902960588805,\n    0.999990333890596,\n    0.9999903715748223,\n    0.9999904091121345,\n    0.9999904465031056,\n    0.9999904837483059,\n    0.9999905208483034,\n    0.9999905578036649,\n    0.9999905946149534,\n    0.9999906312827312,\n    0.9999906678075575,\n    0.9999907041899896,\n    0.9999907404305829,\n    0.9999907765298898,\n    0.9999908124884618,\n    0.9999908483068468,\n    0.9999908839855919,\n    0.9999909195252414,\n    0.9999909549263373,\n    0.9999909901894198,\n    0.9999910253150271,\n    0.9999910603036951,\n    0.9999910951559577,\n    0.9999911298723466,\n    0.9999911644533916,\n    0.9999911988996202,\n    0.999991233211558,\n    0.9999912673897285,\n    0.9999913014346535,\n    0.9999913353468521,\n    0.9999913691268418,\n    0.9999914027751382,\n    0.9999914362922543,\n    0.9999914696787021,\n    0.9999915029349906,\n    0.9999915360616273,\n    0.9999915690591175,\n    0.9999916019279651,\n    0.9999916346686712,\n    0.9999916672817356,\n    0.9999916997676557,\n    0.9999917321269275,\n    0.9999917643600443,\n    0.9999917964674984,\n    0.9999918284497793,\n    0.9999918603073751,\n    0.9999918920407719,\n    0.9999919236504541,\n    0.9999919551369035,\n    0.9999919865006011,\n    0.9999920177420251,\n    0.9999920488616524,\n    0.9999920798599575,\n    0.9999921107374138,\n    0.9999921414944921,\n    0.9999921721316617,\n    0.9999922026493901,\n    0.9999922330481432,\n    0.9999922633283846,\n    
0.9999922934905763,\n    0.9999923235351785,\n    0.9999923534626499,\n    0.9999923832734467,\n    0.9999924129680242,\n    0.9999924425468351,\n    0.9999924720103309,\n    0.9999925013589611,\n    0.9999925305931738,\n    0.9999925597134146,\n    0.9999925887201281,\n    0.9999926176137567,\n    0.9999926463947413,\n    0.9999926750635213,\n    0.9999927036205339,\n    0.9999927320662151,\n    0.9999927604009985,\n    0.9999927886253165,\n    0.9999928167396002,\n    0.999992844744278,\n    0.9999928726397775,\n    0.9999929004265247,\n    0.9999929281049428,\n    0.9999929556754544,\n    0.9999929831384805,\n    0.9999930104944398,\n    0.9999930377437499,\n    0.9999930648868264,\n    0.9999930919240836,\n    0.999993118855934,\n    0.9999931456827884,\n    0.9999931724050563,\n    0.9999931990231453,\n    0.9999932255374616,\n    0.9999932519484098,\n    0.9999932782563931,\n    0.9999933044618126,\n    0.999993330565068,\n    0.9999933565665579,\n    0.9999933824666789,\n    0.9999934082658262,\n    0.9999934339643937,\n    0.9999934595627731,\n    0.9999934850613552,\n    0.9999935104605292,\n    0.9999935357606823,\n    0.9999935609622008,\n    0.9999935860654692,\n    0.9999936110708703,\n    0.9999936359787859,\n    0.9999936607895961,\n    0.9999936855036792,\n    0.9999937101214125,\n    0.9999937346431714,\n    0.9999937590693306,\n    0.999993783400262,\n    0.9999938076363376,\n    0.9999938317779268,\n    0.9999938558253978,\n    0.9999938797791179,\n    0.9999939036394523,\n    0.9999939274067654,\n    0.9999939510814195,\n    0.9999939746637762,\n    0.9999939981541949,\n    0.9999940215530343,\n    0.9999940448606512,\n    0.9999940680774019,\n    0.9999940912036396,\n    0.9999941142397178,\n    0.9999941371859881,\n    0.9999941600428003,\n    0.9999941828105033,\n    0.9999942054894445,\n    0.9999942280799697,\n    0.9999942505824242,\n    0.9999942729971506,\n    0.9999942953244914,\n    0.9999943175647871,\n    0.9999943397183771,\n    
0.9999943617855995,\n    0.9999943837667907,\n    0.9999944056622865,\n    0.999994427472421,\n    0.9999944491975263,\n    0.9999944708379347,\n    0.999994492393976,\n    0.9999945138659793,\n    0.999994535254272,\n    0.9999945565591806,\n    0.9999945777810301,\n    0.9999945989201442,\n    0.9999946199768459,\n    0.9999946409514558,\n    0.9999946618442948,\n    0.9999946826556807,\n    0.9999947033859318,\n    0.9999947240353642,\n    0.999994744604293,\n    0.9999947650930318,\n    0.9999947855018936,\n    0.9999948058311894,\n    0.9999948260812298,\n    0.9999948462523234,\n    0.9999948663447785,\n    0.9999948863589009,\n    0.9999949062949967,\n    0.9999949261533698,\n    0.9999949459343233,\n    0.9999949656381589,\n    0.999994985265177,\n    0.9999950048156778,\n    0.999995024289959,\n    0.9999950436883179,\n    0.9999950630110505,\n    0.9999950822584517,\n    0.9999951014308153,\n    0.9999951205284334,\n    0.9999951395515977,\n    0.9999951585005985,\n    0.999995177375725,\n    0.9999951961772648,\n    0.9999952149055051,\n    0.9999952335607317,\n    0.999995252143229,\n    0.9999952706532809,\n    0.9999952890911695,\n    0.9999953074571765,\n    0.9999953257515817,\n    0.9999953439746643,\n    0.9999953621267027,\n    0.9999953802079736,\n    0.9999953982187529,\n    0.9999954161593154,\n    0.9999954340299353,\n    0.9999954518308845,\n    0.9999954695624351,\n    0.9999954872248578,\n    0.9999955048184215,\n    0.9999955223433952,\n    0.999995539800046,\n    0.9999955571886405,\n    0.9999955745094438,\n    0.9999955917627203,\n    0.9999956089487334,\n    0.999995626067745,\n    0.9999956431200164,\n    0.999995660105808,\n    0.9999956770253791,\n    0.9999956938789871,\n    0.9999957106668901,\n    0.9999957273893437,\n    0.9999957440466031,\n    0.9999957606389227,\n    0.9999957771665553,\n    0.9999957936297534,\n    0.9999958100287683,\n    0.9999958263638499,\n    0.9999958426352475,\n    0.9999958588432095,\n    
0.9999958749879833,\n    0.999995891069815,\n    0.99999590708895,\n    0.999995923045633,\n    0.9999959389401071,\n    0.9999959547726153,\n    0.9999959705433985,\n    0.9999959862526981,\n    0.9999960019007533,\n    0.9999960174878029,\n    0.999996033014085,\n    0.9999960484798365,\n    0.9999960638852928,\n    0.9999960792306896,\n    0.9999960945162609,\n    0.9999961097422398,\n    0.9999961249088587,\n    0.999996140016349,\n    0.9999961550649413,\n    0.999996170054865,\n    0.999996184986349,\n    0.9999961998596212,\n    0.9999962146749086,\n    0.9999962294324368,\n    0.9999962441324314,\n    0.9999962587751166,\n    0.9999962733607158,\n    0.9999962878894517,\n    0.9999963023615457,\n    0.9999963167772187,\n    0.9999963311366906,\n    0.9999963454401809,\n    0.9999963596879077,\n    0.9999963738800881,\n    0.9999963880169387,\n    0.9999964020986756,\n    0.999996416125513,\n    0.9999964300976658,\n    0.9999964440153465,\n    0.9999964578787677,\n    0.999996471688141,\n    0.9999964854436769,\n    0.9999964991455856,\n    0.9999965127940759,\n    0.9999965263893561,\n    0.9999965399316338,\n    0.9999965534211155,\n    0.999996566858007,\n    0.9999965802425134,\n    0.9999965935748388,\n    0.9999966068551871,\n    0.9999966200837603,\n    0.9999966332607606,\n    0.999996646386389,\n    0.9999966594608457,\n    0.9999966724843302,\n    0.9999966854570413,\n    0.9999966983791772,\n    0.9999967112509346,\n    0.9999967240725102,\n    0.9999967368440993,\n    0.9999967495658972,\n    0.9999967622380979,\n    0.9999967748608946,\n    0.99999678743448,\n    0.9999967999590458,\n    0.9999968124347834,\n    0.9999968248618829,\n    0.9999968372405343,\n    0.999996849570926,\n    0.9999968618532465,\n    0.9999968740876829,\n    0.9999968862744222,\n    0.99999689841365,\n    0.999996910505552,\n    0.9999969225503124,\n    0.9999969345481149,\n    0.9999969464991427,\n    0.999996958403578,\n    0.9999969702616027,\n    
0.9999969820733979,\n    0.9999969938391431,\n    0.9999970055590187,\n    0.9999970172332029,\n    0.9999970288618739,\n    0.9999970404452094,\n    0.9999970519833861,\n    0.9999970634765799,\n    0.9999970749249663,\n    0.9999970863287201,\n    0.9999970976880148,\n    0.9999971090030241,\n    0.9999971202739207,\n    0.9999971315008767,\n    0.9999971426840629,\n    0.9999971538236504,\n    0.9999971649198088,\n    0.999997175972708,\n    0.999997186982516,\n    0.9999971979494011,\n    0.9999972088735305,\n    0.9999972197550713,\n    0.9999972305941889,\n    0.9999972413910492,\n    0.9999972521458168,\n    0.9999972628586555,\n    0.9999972735297293,\n    0.9999972841592005,\n    0.9999972947472318,\n    0.9999973052939845,\n    0.9999973157996194,\n    0.999997326264297,\n    0.9999973366881769,\n    0.9999973470714182,\n    0.9999973574141792,\n    0.9999973677166178,\n    0.9999973779788912,\n    0.999997388201156,\n    0.9999973983835683,\n    0.9999974085262833,\n    0.9999974186294557,\n    0.9999974286932398,\n    0.9999974387177893,\n    0.9999974487032567,\n    0.999997458649795,\n    0.9999974685575552,\n    0.9999974784266893,\n    0.9999974882573471,\n    0.9999974980496794,\n    0.9999975078038348,\n    0.9999975175199628,\n    0.9999975271982114,\n    0.9999975368387282,\n    0.9999975464416603,\n    0.9999975560071545,\n    0.9999975655353563,\n    0.9999975750264115,\n    0.9999975844804648,\n    0.9999975938976604,\n    0.9999976032781419,\n    0.9999976126220526,\n    0.999997621929535,\n    0.9999976312007312,\n    0.9999976404357824,\n    0.99999764963483,\n    0.9999976587980138,\n    0.9999976679254741,\n    0.9999976770173499,\n    0.9999976860737801,\n    0.9999976950949028,\n    0.9999977040808555,\n    0.9999977130317754,\n    0.9999977219477992,\n    0.999997730829063,\n    0.999997739675702,\n    0.9999977484878515,\n    0.9999977572656459,\n    0.999997766009219,\n    0.9999977747187042,\n    0.9999977833942346,\n    
0.9999977920359425,\n    0.9999978006439595,\n    0.9999978092184174,\n    0.9999978177594468,\n    0.9999978262671778,\n    0.9999978347417408,\n    0.9999978431832643,\n    0.9999978515918779,\n    0.9999978599677096,\n    0.999997868310887,\n    0.9999978766215375,\n    0.9999978848997882,\n    0.9999978931457651,\n    0.9999979013595941,\n    0.9999979095414004,\n    0.9999979176913093,\n    0.9999979258094446,\n    0.9999979338959306,\n    0.9999979419508905,\n    0.9999979499744471,\n    0.999997957966723,\n    0.9999979659278401,\n    0.9999979738579198,\n    0.9999979817570831,\n    0.999997989625451,\n    0.9999979974631428,\n    0.9999980052702785,\n    0.9999980130469772,\n    0.9999980207933576,\n    0.9999980285095379,\n    0.9999980361956357,\n    0.9999980438517684,\n    0.9999980514780527,\n    0.9999980590746052,\n    0.9999980666415416,\n    0.9999980741789773,\n    0.9999980816870275,\n    0.9999980891658067,\n    0.9999980966154292,\n    0.9999981040360082,\n    0.9999981114276573,\n    0.9999981187904892,\n    0.9999981261246162,\n    0.9999981334301504,\n    0.9999981407072032,\n    0.9999981479558854,\n    0.9999981551763079,\n    0.9999981623685805,\n    0.9999981695328135,\n    0.9999981766691156,\n    0.9999981837775962,\n    0.9999981908583636,\n    0.9999981979115257,\n    0.9999982049371903,\n    0.9999982119354642,\n    0.9999982189064549,\n    0.9999982258502681,\n    0.9999982327670104,\n    0.9999982396567868,\n    0.9999982465197024,\n    0.9999982533558623,\n    0.9999982601653706,\n    0.9999982669483313,\n    0.9999982737048476,\n    0.999998280435023,\n    0.9999982871389601,\n    0.9999982938167612,\n    0.9999983004685279,\n    0.9999983070943619,\n    0.9999983136943643,\n    0.999998320268636,\n    0.999998326817277,\n    0.9999983333403875,\n    0.9999983398380666,\n    0.9999983463104141,\n    0.999998352757528,\n    0.9999983591795074,\n    0.9999983655764496,\n    0.9999983719484529,\n    0.9999983782956141,\n    
0.99999838461803,\n    0.9999983909157973,\n    0.9999983971890123,\n    0.9999984034377699,\n    0.9999984096621665,\n    0.9999984158622963,\n    0.9999984220382541,\n    0.9999984281901343,\n    0.9999984343180306,\n    0.9999984404220366,\n    0.9999984465022453,\n    0.9999984525587498,\n    0.9999984585916423,\n    0.9999984646010147,\n    0.9999984705869588,\n    0.9999984765495661,\n    0.9999984824889274,\n    0.9999984884051335,\n    0.9999984942982746,\n    0.9999985001684406,\n    0.9999985060157208,\n    0.999998511840205,\n    0.9999985176419814,\n    0.9999985234211393,\n    0.9999985291777661,\n    0.9999985349119502,\n    0.9999985406237789,\n    0.9999985463133394,\n    0.9999985519807184,\n    0.9999985576260023,\n    0.9999985632492775,\n    0.9999985688506297,\n    0.9999985744301444,\n    0.9999985799879068,\n    0.9999985855240013,\n    0.9999985910385131,\n    0.9999985965315255,\n    0.9999986020031228,\n    0.9999986074533888,\n    0.9999986128824058,\n    0.9999986182902574,\n    0.9999986236770257,\n    0.999998629042793,\n    0.9999986343876415,\n    0.9999986397116524,\n    0.9999986450149067,\n    0.9999986502974858,\n    0.9999986555594703,\n    0.9999986608009401,\n    0.9999986660219755,\n    0.9999986712226561,\n    0.9999986764030611,\n    0.9999986815632697,\n    0.9999986867033607,\n    0.9999986918234123,\n    0.9999986969235028,\n    0.9999987020037101,\n    0.9999987070641115,\n    0.9999987121047844,\n    0.9999987171258056,\n    0.9999987221272515,\n    0.999998727109199,\n    0.9999987320717236,\n    0.9999987370149013,\n    0.9999987419388074,\n    0.9999987468435171,\n    0.999998751729105,\n    0.9999987565956461,\n    0.9999987614432141,\n    0.9999987662718834,\n    0.9999987710817276,\n    0.99999877587282,\n    0.999998780645234,\n    0.9999987853990417,\n    0.9999987901343165,\n    0.9999987948511302,\n    0.9999987995495545,\n    0.9999988042296618,\n    0.9999988088915229,\n    0.9999988135352093,\n    
0.9999988181607918,\n    0.9999988227683406,\n    0.9999988273579266,\n    0.9999988319296195,\n    0.9999988364834891,\n    0.9999988410196047,\n    0.9999988455380359,\n    0.9999988500388514,\n    0.99999885452212,\n    0.99999885898791,\n    0.9999988634362894,\n    0.9999988678673263,\n    0.9999988722810883,\n    0.9999988766776429,\n    0.9999988810570567,\n    0.9999988854193971,\n    0.9999988897647301,\n    0.9999988940931224,\n    0.99999889840464,\n    0.9999989026993487,\n    0.9999989069773138,\n    0.9999989112386006,\n    0.9999989154832745,\n    0.9999989197114001,\n    0.9999989239230416,\n    0.9999989281182636,\n    0.9999989322971299,\n    0.9999989364597046,\n    0.9999989406060509,\n    0.9999989447362321,\n    0.9999989488503112,\n    0.9999989529483512,\n    0.9999989570304144,\n    0.9999989610965634,\n    0.9999989651468597,\n    0.9999989691813657,\n    0.9999989732001424,\n    0.9999989772032517,\n    0.9999989811907543,\n    0.9999989851627111,\n    0.9999989891191827,\n    0.9999989930602297,\n    0.9999989969859118,\n    0.9999990008962892,\n    0.9999990047914217,\n    0.9999990086713684,\n    0.9999990125361885,\n    0.9999990163859414,\n    0.9999990202206853,\n    0.9999990240404791,\n    0.9999990278453811,\n    0.999999031635449,\n    0.999999035410741,\n    0.9999990391713146,\n    0.9999990429172271,\n    0.9999990466485356,\n    0.9999990503652972,\n    0.9999990540675685,\n    0.9999990577554062,\n    0.9999990614288662,\n    0.999999065088005,\n    0.9999990687328779,\n    0.9999990723635409,\n    0.9999990759800494,\n    0.9999990795824586,\n    0.9999990831708231,\n    0.999999086745198,\n    0.9999990903056379,\n    0.999999093852197,\n    0.9999990973849291,\n    0.9999991009038888,\n    0.9999991044091291,\n    0.9999991079007038,\n    0.9999991113786665,\n    0.9999991148430695,\n    0.9999991182939663,\n    0.9999991217314095,\n    0.9999991251554512,\n    0.9999991285661438,\n    0.9999991319635396,\n    
0.99999913534769,\n    0.9999991387186471,\n    0.999999142076462,\n    0.9999991454211861,\n    0.9999991487528702,\n    0.9999991520715654,\n    0.9999991553773224,\n    0.9999991586701913,\n    0.9999991619502226,\n    0.9999991652174663,\n    0.9999991684719722,\n    0.99999917171379,\n    0.9999991749429694,\n    0.9999991781595594,\n    0.9999991813636089,\n    0.9999991845551671,\n    0.9999991877342828,\n    0.999999190901004,\n    0.9999991940553795,\n    0.9999991971974574,\n    0.9999992003272853,\n    0.9999992034449113,\n    0.999999206550383,\n    0.9999992096437473,\n    0.999999212725052,\n    0.9999992157943437,\n    0.9999992188516693,\n    0.9999992218970756,\n    0.9999992249306092,\n    0.9999992279523161,\n    0.9999992309622424,\n    0.9999992339604342,\n    0.999999236946937,\n    0.999999239921797,\n    0.9999992428850587,\n    0.9999992458367678,\n    0.9999992487769694,\n    0.9999992517057084,\n    0.9999992546230292,\n    0.9999992575289763,\n    0.9999992604235944,\n    0.9999992633069275,\n    0.9999992661790194,\n    0.9999992690399143,\n    0.9999992718896554,\n    0.9999992747282866,\n    0.9999992775558509,\n    0.9999992803723919,\n    0.9999992831779521,\n    0.9999992859725744,\n    0.9999992887563014,\n    0.9999992915291759,\n    0.9999992942912399,\n    0.9999992970425357,\n    0.9999992997831051,\n    0.9999993025129903,\n    0.9999993052322325,\n    0.9999993079408735,\n    0.9999993106389544,\n    0.9999993133265167,\n    0.9999993160036009,\n    0.9999993186702484,\n    0.9999993213264996,\n    0.999999323972395,\n    0.9999993266079752,\n    0.9999993292332802,\n    0.9999993318483499,\n    0.9999993344532248,\n    0.999999337047944,\n    0.9999993396325473,\n    0.9999993422070742,\n    0.9999993447715642,\n    0.9999993473260561,\n    0.9999993498705891,\n    0.9999993524052017,\n    0.9999993549299331,\n    0.9999993574448213,\n    0.9999993599499049,\n    0.999999362445222,\n    0.9999993649308111,\n    
0.9999993674067096,\n    0.9999993698729555,\n    0.9999993723295867,\n    0.9999993747766401,\n    0.9999993772141536,\n    0.9999993796421638,\n    0.9999993820607084,\n    0.999999384469824,\n    0.9999993868695471,\n    0.9999993892599148,\n    0.9999993916409633,\n    0.999999394012729,\n    0.999999396375248,\n    0.9999993987285565,\n    0.9999994010726904,\n    0.9999994034076851,\n    0.9999994057335769,\n    0.9999994080504007,\n    0.9999994103581922,\n    0.9999994126569863,\n    0.9999994149468184,\n    0.9999994172277232,\n    0.9999994194997356,\n    0.9999994217628905,\n    0.9999994240172219,\n    0.9999994262627648,\n    0.999999428499553,\n    0.9999994307276209,\n    0.9999994329470021,\n    0.999999435157731,\n    0.9999994373598412,\n    0.9999994395533661,\n    0.9999994417383392,\n    0.9999994439147938,\n    0.9999994460827635,\n    0.9999994482422809,\n    0.9999994503933792,\n    0.9999994525360913,\n    0.9999994546704497,\n    0.9999994567964869,\n    0.9999994589142355,\n    0.9999994610237278,\n    0.999999463124996,\n    0.9999994652180721,\n    0.9999994673029882,\n    0.999999469379776,\n    0.999999471448467,\n    0.9999994735090932,\n    0.9999994755616856,\n    0.9999994776062757,\n    0.9999994796428949,\n    0.9999994816715738,\n    0.9999994836923439,\n    0.9999994857052357,\n    0.9999994877102798,\n    0.9999994897075072,\n    0.999999491696948,\n    0.9999994936786328,\n    0.9999994956525918,\n    0.999999497618855,\n    0.9999994995774526,\n    0.9999995015284142,\n    0.9999995034717698,\n    0.9999995054075489,\n    0.9999995073357812,\n    0.999999509256496,\n    0.9999995111697226,\n    0.9999995130754904,\n    0.9999995149738282,\n    0.9999995168647651,\n    0.9999995187483299,\n    0.9999995206245516,\n    0.9999995224934584,\n    0.9999995243550792,\n    0.9999995262094421,\n    0.9999995280565754,\n    0.9999995298965078,\n    0.9999995317292666,\n    0.9999995335548804,\n    0.9999995353733766,\n    
0.9999995371847833,\n    0.9999995389891281,\n    0.9999995407864383,\n    0.9999995425767415,\n    0.9999995443600651,\n    0.999999546136436,\n    0.9999995479058815,\n    0.9999995496684287,\n    0.9999995514241042,\n    0.9999995531729351,\n    0.9999995549149481,\n    0.9999995566501694,\n    0.999999558378626,\n    0.9999995601003436,\n    0.9999995618153491,\n    0.9999995635236685,\n    0.9999995652253278,\n    0.9999995669203526,\n    0.9999995686087695,\n    0.9999995702906038,\n    0.9999995719658814,\n    0.9999995736346275,\n    0.9999995752968679,\n    0.9999995769526278,\n    0.9999995786019323,\n    0.9999995802448071,\n    0.9999995818812769,\n    0.9999995835113666,\n    0.9999995851351011,\n    0.9999995867525056,\n    0.9999995883636041,\n    0.9999995899684218,\n    0.9999995915669827,\n    0.9999995931593114,\n    0.9999995947454324,\n    0.9999995963253696,\n    0.9999995978991472,\n    0.9999995994667893,\n    0.9999996010283196,\n    0.9999996025837622,\n    0.9999996041331407,\n    0.9999996056764787,\n    0.9999996072137998,\n    0.9999996087451274,\n    0.9999996102704849,\n    0.9999996117898958,\n    0.9999996133033829,\n    0.9999996148109696,\n    0.9999996163126788,\n    0.9999996178085333,\n    0.999999619298556,\n    0.9999996207827698,\n    0.999999622261197,\n    0.9999996237338605,\n    0.9999996252007827,\n    0.9999996266619858,\n    0.9999996281174923,\n    0.9999996295673241,\n    0.9999996310115038,\n    0.9999996324500532,\n    0.9999996338829941,\n    0.9999996353103484,\n    0.9999996367321381,\n    0.9999996381483848,\n    0.99999963955911,\n    0.9999996409643354,\n    0.9999996423640822,\n    0.9999996437583721,\n    0.9999996451472262,\n    0.9999996465306654,\n    0.9999996479087113,\n    0.9999996492813847,\n    0.9999996506487064,\n    0.9999996520106976,\n    0.999999653367379,\n    0.9999996547187711,\n    0.9999996560648944,\n    0.9999996574057698,\n    0.9999996587414178,\n    0.9999996600718585,\n    
0.9999996613971122,\n    0.9999996627171993,\n    0.9999996640321399,\n    0.999999665341954,\n    0.9999996666466616,\n    0.9999996679462827,\n    0.9999996692408369,\n    0.9999996705303444,\n    0.9999996718148245,\n    0.9999996730942967,\n    0.9999996743687808,\n    0.9999996756382963,\n    0.9999996769028623,\n    0.9999996781624984,\n    0.9999996794172235,\n    0.9999996806670568,\n    0.9999996819120175,\n    0.9999996831521247,\n    0.9999996843873972,\n    0.9999996856178537,\n    0.9999996868435131,\n    0.999999688064394,\n    0.9999996892805154,\n    0.9999996904918956,\n    0.999999691698553,\n    0.9999996929005058,\n    0.999999694097773,\n    0.9999996952903725,\n    0.9999996964783224,\n    0.9999996976616409,\n    0.999999698840346,\n    0.9999997000144559,\n    0.9999997011839883,\n    0.9999997023489613,\n    0.9999997035093924,\n    0.9999997046652993,\n    0.9999997058167001,\n    0.9999997069636116,\n    0.9999997081060518,\n    0.9999997092440381,\n    0.999999710377588,\n    0.9999997115067183,\n    0.9999997126314468,\n    0.9999997137517902,\n    0.999999714867766,\n    0.9999997159793909,\n    0.999999717086682,\n    0.9999997181896562,\n    0.9999997192883303,\n    0.9999997203827211,\n    0.9999997214728452,\n    0.9999997225587193,\n    0.9999997236403602,\n    0.999999724717784,\n    0.9999997257910073,\n    0.9999997268600465,\n    0.9999997279249181,\n    0.9999997289856379,\n    0.9999997300422225,\n    0.9999997310946879,\n    0.99999973214305,\n    0.999999733187325,\n    0.9999997342275289,\n    0.9999997352636772,\n    0.999999736295786,\n    0.999999737323871,\n    0.9999997383479481,\n    0.9999997393680324,\n    0.99999974038414,\n    0.999999741396286,\n    0.999999742404486,\n    0.9999997434087554,\n    0.9999997444091098,\n    0.999999745405564,\n    0.9999997463981333,\n    0.9999997473868331,\n    0.999999748371678,\n    0.9999997493526838,\n    0.9999997503298649,\n    0.9999997513032363,\n    
0.9999997522728127,\n    0.9999997532386093,\n    0.9999997542006405,\n    0.9999997551589213,\n    0.9999997561134659,\n    0.9999997570642891,\n    0.9999997580114054,\n    0.9999997589548294,\n    0.9999997598945752,\n    0.9999997608306573,\n    0.9999997617630899,\n    0.9999997626918874,\n    0.9999997636170638,\n    0.9999997645386335,\n    0.9999997654566101,\n    0.9999997663710078,\n    0.9999997672818408,\n    0.9999997681891226,\n    0.9999997690928675,\n    0.9999997699930888,\n    0.9999997708898006,\n    0.9999997717830164,\n    0.99999977267275,\n    0.9999997735590147,\n    0.9999997744418241,\n    0.9999997753211921,\n    0.9999997761971314,\n    0.999999777069656,\n    0.9999997779387788,\n    0.9999997788045133,\n    0.9999997796668726,\n    0.9999997805258698,\n    0.9999997813815182,\n    0.9999997822338308,\n    0.9999997830828204,\n    0.9999997839285001,\n    0.9999997847708829,\n    0.9999997856099816,\n    0.9999997864458089,\n    0.9999997872783775,\n    0.9999997881077003,\n    0.99999978893379,\n    0.999999789756659,\n    0.9999997905763199,\n    0.9999997913927854,\n    0.9999997922060676,\n    0.9999997930161793,\n    0.9999997938231325,\n    0.9999997946269397,\n    0.9999997954276133,\n    0.9999997962251652,\n    0.9999997970196078,\n    0.9999997978109532,\n    0.9999997985992134,\n    0.9999997993844006,\n    0.9999998001665265,\n    0.9999998009456034,\n    0.9999998017216427,\n    0.9999998024946565,\n    0.9999998032646568,\n    0.9999998040316551,\n    0.999999804795663,\n    0.9999998055566924,\n    0.999999806314755,\n    0.999999807069862,\n    0.9999998078220252,\n    0.999999808571256,\n    0.9999998093175658,\n    0.999999810060966,\n    0.9999998108014678,\n    0.9999998115390829,\n    0.9999998122738224,\n    0.9999998130056972,\n    0.9999998137347189,\n    0.9999998144608981,\n    0.9999998151842464,\n    0.9999998159047746,\n    0.9999998166224938,\n    0.9999998173374148,\n    0.9999998180495486,\n    
0.9999998187589061,\n    0.999999819465498,\n    0.9999998201693351,\n    0.9999998208704283,\n    0.9999998215687883,\n    0.9999998222644256,\n    0.9999998229573508,\n    0.9999998236475744,\n    0.9999998243351074,\n    0.9999998250199598,\n    0.9999998257021421,\n    0.999999826381665,\n    0.9999998270585385,\n    0.9999998277327732,\n    0.9999998284043795,\n    0.9999998290733673,\n    0.9999998297397469,\n    0.9999998304035286,\n    0.9999998310647226,\n    0.9999998317233386,\n    0.9999998323793872,\n    0.9999998330328778,\n    0.9999998336838208,\n    0.999999834332226,\n    0.9999998349781033,\n    0.9999998356214627,\n    0.9999998362623136,\n    0.9999998369006664,\n    0.9999998375365304,\n    0.9999998381699153,\n    0.9999998388008309,\n    0.9999998394292868,\n    0.9999998400552925,\n    0.9999998406788578,\n    0.9999998412999919,\n    0.9999998419187045,\n    0.9999998425350051,\n    0.9999998431489028,\n    0.9999998437604072,\n    0.9999998443695275,\n    0.999999844976273,\n    0.9999998455806532,\n    0.9999998461826772,\n    0.999999846782354,\n    0.9999998473796929,\n    0.999999847974703,\n    0.9999998485673934,\n    0.999999849157773,\n    0.999999849745851,\n    0.9999998503316363,\n    0.999999850915138,\n    0.9999998514963647,\n    0.9999998520753254,\n    0.9999998526520291,\n    0.9999998532264842,\n    0.9999998537986999,\n    0.9999998543686844,\n    0.9999998549364473,\n    0.9999998555019962,\n    0.9999998560653406,\n    0.9999998566264884,\n    0.9999998571854487,\n    0.9999998577422298,\n    0.9999998582968402,\n    0.9999998588492883,\n    0.9999998593995829,\n    0.9999998599477319,\n    0.9999998604937438,\n    0.9999998610376271,\n    0.99999986157939,\n    0.9999998621190406,\n    0.9999998626565876,\n    0.9999998631920387,\n    0.9999998637254023,\n    0.9999998642566865,\n    0.9999998647858995,\n    0.9999998653130492,\n    0.9999998658381439,\n    0.9999998663611912,\n    0.9999998668821997,\n    
0.9999998674011765,\n    0.9999998679181304,\n    0.9999998684330687,\n    0.9999998689459996,\n    0.9999998694569304,\n    0.9999998699658696,\n    0.9999998704728247,\n    0.9999998709778033,\n    0.999999871480813,\n    0.9999998719818617,\n    0.9999998724809571,\n    0.9999998729781067,\n    0.999999873473318,\n    0.9999998739665988,\n    0.9999998744579564,\n    0.9999998749473983,\n    0.9999998754349322,\n    0.9999998759205654,\n    0.9999998764043052,\n    0.999999876886159,\n    0.9999998773661344,\n    0.9999998778442383,\n    0.9999998783204785,\n    0.9999998787948619,\n    0.999999879267396,\n    0.9999998797380878,\n    0.9999998802069444,\n    0.9999998806739732,\n    0.9999998811391811,\n    0.9999998816025757,\n    0.9999998820641635,\n    0.9999998825239516,\n    0.9999998829819473,\n    0.9999998834381574,\n    0.9999998838925889,\n    0.9999998843452487,\n    0.9999998847961438,\n    0.999999885245281,\n    0.9999998856926673,\n    0.9999998861383091,\n    0.999999886582214,\n    0.9999998870243879,\n};\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/functions.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        functions.h\n// Description: Collection of function-objects used by the network layers.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_FUNCTIONS_H_\n#define TESSERACT_LSTM_FUNCTIONS_H_\n\n#include \"helpers.h\"\n#include \"tesstypes.h\"\n\n// Setting this to 1 or more causes massive dumps of debug data: weights,\n// updates, internal calculations etc, and reduces the number of test iterations\n// to a small number, so outputs can be diffed.\n#define DEBUG_DETAIL 0\n#if DEBUG_DETAIL > 0\n#  undef _OPENMP // Disable open mp to get the outputs in sync.\n#endif\n\nnamespace tesseract {\n\n// Size of static tables.\nconstexpr int kTableSize = 4096;\n// Scale factor for float arg to int index.\nconstexpr TFloat kScaleFactor = 256.0;\n\n// Generated lookup tables.\nextern const TFloat TanhTable[];\nextern const TFloat LogisticTable[];\n\n// Non-linearity (sigmoid) functions with cache tables and clipping.\ninline TFloat Tanh(TFloat x) {\n  if (x < 0) {\n    return -Tanh(-x);\n  }\n  x *= kScaleFactor;\n  auto index = static_cast<unsigned>(x);\n  if (index >= (kTableSize - 1)) {\n    return 1;\n  }\n  TFloat tanh_i0 = TanhTable[index];\n  TFloat tanh_i1 = TanhTable[index + 1];\n  // Linear interpolation.\n 
 return tanh_i0 + (tanh_i1 - tanh_i0) * (x - index);\n}\n\ninline TFloat Logistic(TFloat x) {\n  if (x < 0) {\n    return 1 - Logistic(-x);\n  }\n  x *= kScaleFactor;\n  auto index = static_cast<unsigned>(x);\n  if (index >= (kTableSize - 1)) {\n    return 1;\n  }\n  TFloat l0 = LogisticTable[index];\n  TFloat l1 = LogisticTable[index + 1];\n  // Linear interpolation.\n  return l0 + (l1 - l0) * (x - index);\n}\n\n// Non-linearity (sigmoid) functions and their derivatives.\nstruct FFunc {\n  inline TFloat operator()(TFloat x) const {\n    return Logistic(x);\n  }\n};\nstruct FPrime {\n  inline TFloat operator()(TFloat y) const {\n    return y * (1 - y);\n  }\n};\nstruct ClipFFunc {\n  inline TFloat operator()(TFloat x) const {\n    if (x <= 0) {\n      return 0;\n    }\n    if (x >= 1) {\n      return 1;\n    }\n    return x;\n  }\n};\nstruct ClipFPrime {\n  inline TFloat operator()(TFloat y) const {\n    return 0 < y && y < 1 ? 1 : 0;\n  }\n};\nstruct Relu {\n  inline TFloat operator()(TFloat x) const {\n    if (x <= 0) {\n      return 0;\n    }\n    return x;\n  }\n};\nstruct ReluPrime {\n  inline TFloat operator()(TFloat y) const {\n    return 0 < y ? 1 : 0;\n  }\n};\nstruct GFunc {\n  inline TFloat operator()(TFloat x) const {\n    return Tanh(x);\n  }\n};\nstruct GPrime {\n  inline TFloat operator()(TFloat y) const {\n    return 1 - y * y;\n  }\n};\nstruct ClipGFunc {\n  inline TFloat operator()(TFloat x) const {\n    if (x <= -1) {\n      return -1;\n    }\n    if (x >= 1) {\n      return 1;\n    }\n    return x;\n  }\n};\nstruct ClipGPrime {\n  inline TFloat operator()(TFloat y) const {\n    return -1 < y && y < 1 ? 
1 : 0;\n  }\n};\nstruct HFunc {\n  inline TFloat operator()(TFloat x) const {\n    return Tanh(x);\n  }\n};\nstruct HPrime {\n  inline TFloat operator()(TFloat y) const {\n    TFloat u = Tanh(y);\n    return 1 - u * u;\n  }\n};\nstruct UnityFunc {\n  inline TFloat operator()(TFloat /*x*/) const {\n    return 1.0;\n  }\n};\nstruct IdentityFunc {\n  inline TFloat operator()(TFloat x) const {\n    return x;\n  }\n};\n\n// Applies Func in-place to inout, of size n.\ntemplate <class Func>\ninline void FuncInplace(int n, TFloat *inout) {\n  Func f;\n  for (int i = 0; i < n; ++i) {\n    inout[i] = f(inout[i]);\n  }\n}\n// Applies Func to u and multiplies the result by v component-wise,\n// putting the product in out, all of size n.\ntemplate <class Func>\ninline void FuncMultiply(const TFloat *u, const TFloat *v, int n, TFloat *out) {\n  Func f;\n  for (int i = 0; i < n; ++i) {\n    out[i] = f(u[i]) * v[i];\n  }\n}\n// Applies the Softmax function in-place to inout, of size n.\ntemplate <typename T>\ninline void SoftmaxInPlace(int n, T *inout) {\n  if (n <= 0) {\n    return;\n  }\n  // A limit on the negative range input to exp to guarantee non-zero output.\n  const T kMaxSoftmaxActivation = 86;\n\n  T max_output = inout[0];\n  for (int i = 1; i < n; i++) {\n    T output = inout[i];\n    if (output > max_output) {\n      max_output = output;\n    }\n  }\n  T prob_total = 0;\n  for (int i = 0; i < n; i++) {\n    T prob = inout[i] - max_output;\n    prob = std::exp(ClipToRange(prob, -kMaxSoftmaxActivation, static_cast<T>(0)));\n    prob_total += prob;\n    inout[i] = prob;\n  }\n  if (prob_total > 0) {\n    for (int i = 0; i < n; i++) {\n      inout[i] /= prob_total;\n    }\n  }\n}\n\n// Copies n values of the given src vector to dest.\ninline void CopyVector(unsigned n, const TFloat *src, TFloat *dest) {\n  memcpy(dest, src, n * sizeof(dest[0]));\n}\n\n// Adds n values of the given src vector to dest.\ninline void AccumulateVector(int n, const TFloat *src, TFloat *dest) 
{\n  for (int i = 0; i < n; ++i) {\n    dest[i] += src[i];\n  }\n}\n\n// Multiplies n values of inout in-place element-wise by the given src vector.\ninline void MultiplyVectorsInPlace(int n, const TFloat *src, TFloat *inout) {\n  for (int i = 0; i < n; ++i) {\n    inout[i] *= src[i];\n  }\n}\n\n// Multiplies n values of u by v, element-wise, accumulating to out.\ninline void MultiplyAccumulate(int n, const TFloat *u, const TFloat *v, TFloat *out) {\n  for (int i = 0; i < n; i++) {\n    out[i] += u[i] * v[i];\n  }\n}\n\n// Sums the given 5 n-vectors putting the result into sum.\ninline void SumVectors(int n, const TFloat *v1, const TFloat *v2, const TFloat *v3,\n                       const TFloat *v4, const TFloat *v5, TFloat *sum) {\n  for (int i = 0; i < n; ++i) {\n    sum[i] = v1[i] + v2[i] + v3[i] + v4[i] + v5[i];\n  }\n}\n\n// Sets the given n-vector vec to 0.\ntemplate <typename T>\ninline void ZeroVector(unsigned n, T *vec) {\n  memset(vec, 0, n * sizeof(*vec));\n}\n\n// Clips the given vector vec, of size n to [lower, upper].\ntemplate <typename T>\ninline void ClipVector(int n, T lower, T upper, T *vec) {\n  for (int i = 0; i < n; ++i) {\n    vec[i] = ClipToRange(vec[i], lower, upper);\n  }\n}\n\n// Converts the given n-vector to a binary encoding of the maximum value,\n// encoded as vector of nf binary values.\ninline void CodeInBinary(int n, int nf, TFloat *vec) {\n  if (nf <= 0 || n < nf) {\n    return;\n  }\n  int index = 0;\n  TFloat best_score = vec[0];\n  for (int i = 1; i < n; ++i) {\n    if (vec[i] > best_score) {\n      best_score = vec[i];\n      index = i;\n    }\n  }\n  int mask = 1;\n  for (int i = 0; i < nf; ++i, mask *= 2) {\n    vec[i] = (index & mask) ? 1.0 : 0.0;\n  }\n}\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_FUNCTIONS_H_\n"
  },
  {
    "path": "src/lstm/generate_lut.py",
    "content": "#!/usr/bin/env python3\n\n# Create C/C++ code for two lookup tables.\n\nimport math\n\n# kTableSize and kScaleFactor must match the values in functions.h.\n\n# Size of static tables.\nkTableSize = 4096\n# Scale factor for float arg to int index.\nkScaleFactor = 256.0\n\nprint(\"// Generated code with lookup tables (see generate_lut.py)\")\nprint('#include \"functions.h\"')\nprint(\"namespace tesseract {\")\n\nprint(\"const TFloat TanhTable[] = {\")\nfor i in range(kTableSize):\n    print(\"    %a,\" % math.tanh(i / kScaleFactor))\nprint(\"};\")\n\nprint(\"const TFloat LogisticTable[] = {\")\nfor i in range(kTableSize):\n    print(\"    %a,\" % (1 / (1 + math.exp(-i / kScaleFactor))))\nprint(\"};\")\nprint(\"} // namespace tesseract.\")\n"
  },
  {
    "path": "src/lstm/input.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        input.cpp\n// Description: Input layer class for neural network implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"input.h\"\n\n#include <allheaders.h>\n#include \"imagedata.h\"\n#include \"pageres.h\"\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n// Max height for variable height inputs before scaling anyway.\nconst int kMaxInputHeight = 48;\n\nInput::Input(const std::string &name, int ni, int no)\n    : Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {}\nInput::Input(const std::string &name, const StaticShape &shape)\n    : Network(NT_INPUT, name, shape.height(), shape.depth()), shape_(shape), cached_x_scale_(1) {\n  if (shape.height() == 1) {\n    ni_ = shape.depth();\n  }\n}\n\n// Writes to the given file. Returns false in case of error.\nbool Input::Serialize(TFile *fp) const {\n  return Network::Serialize(fp) && shape_.Serialize(fp);\n}\n\n// Reads from the given file. Returns false in case of error.\nbool Input::DeSerialize(TFile *fp) {\n  return shape_.DeSerialize(fp);\n}\n\n// Returns an integer reduction factor that the network applies to the\n// time sequence. Assumes that any 2-d is already eliminated. 
Used for\n// scaling bounding boxes of truth data.\nint Input::XScaleFactor() const {\n  return 1;\n}\n\n// Provides the (minimum) x scale factor to the network (of interest only to\n// input units) so they can determine how to scale bounding boxes.\nvoid Input::CacheXScaleFactor(int factor) {\n  cached_x_scale_ = factor;\n}\n\n// Runs forward propagation of activations on the input line.\n// See Network for a detailed discussion of the arguments.\nvoid Input::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                    NetworkScratch *scratch, NetworkIO *output) {\n  *output = input;\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool Input::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                     NetworkIO *back_deltas) {\n  tprintf(\"Input::Backward should not be called!!\\n\");\n  return false;\n}\n\n// Creates and returns a Pix of appropriate size for the network from the\n// image_data. If non-null, *image_scale returns the image scale factor used.\n// Returns nullptr on error.\n/* static */\nImage Input::PrepareLSTMInputs(const ImageData &image_data, const Network *network, int min_width,\n                              TRand *randomizer, float *image_scale) {\n  // Note that NumInputs() is defined as input image height.\n  int target_height = network->NumInputs();\n  int width, height;\n  Image pix =\n      image_data.PreScale(target_height, kMaxInputHeight, image_scale, &width, &height, nullptr);\n  if (pix == nullptr) {\n    tprintf(\"Bad pix from ImageData!\\n\");\n    return nullptr;\n  }\n  if (width < min_width || height < min_width) {\n    tprintf(\"Image too small to scale!! 
(%dx%d vs min width of %d)\\n\", width, height, min_width);\n    pix.destroy();\n    return nullptr;\n  }\n  return pix;\n}\n\n// Converts the given pix to a NetworkIO of height and depth appropriate to the\n// given StaticShape:\n// If depth == 3, convert to 24 bit color, otherwise normalized grey.\n// Scale to target height, if the shape's height is > 1, or its depth if the\n// height == 1. If height == 0 then no scaling.\n// NOTE: It isn't safe for multiple threads to call this on the same pix.\n/* static */\nvoid Input::PreparePixInput(const StaticShape &shape, const Image pix, TRand *randomizer,\n                            NetworkIO *input) {\n  bool color = shape.depth() == 3;\n  Image var_pix = pix;\n  int depth = pixGetDepth(var_pix);\n  Image normed_pix = nullptr;\n  // On input to BaseAPI, an image is forced to be 1, 8 or 24 bit, without\n  // colormap, so we just have to deal with depth conversion here.\n  if (color) {\n    // Force RGB.\n    if (depth == 32) {\n      normed_pix = var_pix.clone();\n    } else {\n      normed_pix = pixConvertTo32(var_pix);\n    }\n  } else {\n    // Convert non-8-bit images to 8 bit.\n    if (depth == 8) {\n      normed_pix = var_pix.clone();\n    } else {\n      normed_pix = pixConvertTo8(var_pix, false);\n    }\n  }\n  int height = pixGetHeight(normed_pix);\n  int target_height = shape.height();\n  if (target_height == 1) {\n    target_height = shape.depth();\n  }\n  if (target_height != 0 && target_height != height) {\n    // Get the scaled image.\n    float im_factor = static_cast<float>(target_height) / height;\n    Image scaled_pix = pixScale(normed_pix, im_factor, im_factor);\n    normed_pix.destroy();\n    normed_pix = scaled_pix;\n  }\n  input->FromPix(shape, normed_pix, randomizer);\n  normed_pix.destroy();\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/input.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        input.h\n// Description: Input layer class for neural network implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_INPUT_H_\n#define TESSERACT_LSTM_INPUT_H_\n\n#include \"network.h\"\n\nnamespace tesseract {\n\nclass ScrollView;\n\nclass Input : public Network {\npublic:\n  TESS_API\n  Input(const std::string &name, int ni, int no);\n  TESS_API\n  Input(const std::string &name, const StaticShape &shape);\n  ~Input() override = default;\n\n  std::string spec() const override {\n    return std::to_string(shape_.batch()) + \",\" +\n           std::to_string(shape_.height()) + \",\" +\n           std::to_string(shape_.width()) + \",\" +\n           std::to_string(shape_.depth());\n  }\n\n  // Returns the required shape input to the network.\n  StaticShape InputShape() const override {\n    return shape_;\n  }\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  StaticShape OutputShape(\n      [[maybe_unused]] const StaticShape &input_shape) const override {\n    return shape_;\n  }\n  // Writes to the given file. 
Returns false in case of error.\n  // Should be overridden by subclasses, but called by their Serialize.\n  bool Serialize(TFile *fp) const override;\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp) override;\n\n  // Returns an integer reduction factor that the network applies to the\n  // time sequence. Assumes that any 2-d is already eliminated. Used for\n  // scaling bounding boxes of truth data.\n  // WARNING: if GlobalMinimax is used to vary the scale, this will return\n  // the last used scale factor. Call it before any forward, and it will return\n  // the minimum scale factor of the paths through the GlobalMinimax.\n  int XScaleFactor() const override;\n\n  // Provides the (minimum) x scale factor to the network (of interest only to\n  // input units) so they can determine how to scale bounding boxes.\n  void CacheXScaleFactor(int factor) override;\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input,\n               const TransposedArray *input_transpose, NetworkScratch *scratch,\n               NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas,\n                NetworkScratch *scratch, NetworkIO *back_deltas) override;\n  // Creates and returns a Pix of appropriate size for the network from the\n  // image_data. 
If non-null, *image_scale returns the image scale factor used.\n  // Returns nullptr on error.\n  /* static */\n  static Image PrepareLSTMInputs(const ImageData &image_data,\n                                 const Network *network, int min_width,\n                                 TRand *randomizer, float *image_scale);\n  // Converts the given pix to a NetworkIO of height and depth appropriate to\n  // the given StaticShape:\n  // If depth == 3, convert to 24 bit color, otherwise normalized grey.\n  // Scale to target height, if the shape's height is > 1, or its depth if the\n  // height == 1. If height == 0 then no scaling.\n  // NOTE: It isn't safe for multiple threads to call this on the same pix.\n  static void PreparePixInput(const StaticShape &shape, const Image pix,\n                              TRand *randomizer, NetworkIO *input);\n\nprivate:\n  void DebugWeights() override {\n    tprintf(\"Must override Network::DebugWeights for type %d\\n\", type_);\n  }\n\n  // Input shape determines how images are dealt with.\n  StaticShape shape_;\n  // Cached total network x scale factor for scaling bounding boxes.\n  int cached_x_scale_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_INPUT_H_\n"
  },
  {
    "path": "src/lstm/lstm.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstm.cpp\n// Description: Long-term-short-term-memory Recurrent neural network.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"lstm.h\"\n\n#ifdef _OPENMP\n#  include <omp.h>\n#endif\n#include <cstdio>\n#include <cstdlib>\n#include <sstream> // for std::ostringstream\n\n#if defined(_MSC_VER) && !defined(__clang__)\n#  include <intrin.h> // _BitScanReverse\n#endif\n\n#include \"fullyconnected.h\"\n#include \"functions.h\"\n#include \"networkscratch.h\"\n#include \"tprintf.h\"\n\n// Macros for openmp code if it is available, otherwise empty macros.\n#ifdef _OPENMP\n#  define PARALLEL_IF_OPENMP(__num_threads)                                  \\\n    PRAGMA(omp parallel if (__num_threads > 1) num_threads(__num_threads)) { \\\n      PRAGMA(omp sections nowait) {                                          \\\n        PRAGMA(omp section) {\n#  define SECTION_IF_OPENMP \\\n    }                       \\\n    PRAGMA(omp section) {\n#  define END_PARALLEL_IF_OPENMP \\\n    }                            \\\n    } /* end of sections */      \\\n    } /* end of parallel section */\n\n// Define the portable PRAGMA macro.\n#  ifdef _MSC_VER // Different _Pragma\n#    
define PRAGMA(x) __pragma(x)\n#  else\n#    define PRAGMA(x) _Pragma(#    x)\n#  endif // _MSC_VER\n\n#else // _OPENMP\n#  define PARALLEL_IF_OPENMP(__num_threads)\n#  define SECTION_IF_OPENMP\n#  define END_PARALLEL_IF_OPENMP\n#endif // _OPENMP\n\nnamespace tesseract {\n\n// Max absolute value of state_. It is reasonably high to enable the state\n// to count things.\nconst TFloat kStateClip = 100.0;\n// Max absolute value of gate_errors (the gradients).\nconst TFloat kErrClip = 1.0f;\n\n// Calculate ceil(log2(n)).\nstatic inline uint32_t ceil_log2(uint32_t n) {\n  // l2 = (unsigned)log2(n).\n#if defined(__GNUC__)\n  // Use fast inline assembler code for gcc or clang.\n  uint32_t l2 = 31 - __builtin_clz(n);\n#elif defined(_MSC_VER)\n  // Use fast intrinsic function for MS compiler.\n  unsigned long l2 = 0;\n  _BitScanReverse(&l2, n);\n#else\n  if (n == 0)\n    return UINT_MAX;\n  if (n == 1)\n    return 0;\n  uint32_t val = n;\n  uint32_t l2 = 0;\n  while (val > 1) {\n    val >>= 1;\n    l2++;\n  }\n#endif\n  // Round up if n is not a power of 2.\n  return (n == (1u << l2)) ? l2 : l2 + 1;\n}\n\nLSTM::LSTM(const std::string &name, int ni, int ns, int no, bool two_dimensional, NetworkType type)\n    : Network(type, name, ni, no)\n    , na_(ni + ns)\n    , ns_(ns)\n    , nf_(0)\n    , is_2d_(two_dimensional)\n    , softmax_(nullptr)\n    , input_width_(0) {\n  if (two_dimensional) {\n    na_ += ns_;\n  }\n  if (type_ == NT_LSTM || type_ == NT_LSTM_SUMMARY) {\n    nf_ = 0;\n    // networkbuilder ensures this is always true.\n    ASSERT_HOST(no == ns);\n  } else if (type_ == NT_LSTM_SOFTMAX || type_ == NT_LSTM_SOFTMAX_ENCODED) {\n    nf_ = type_ == NT_LSTM_SOFTMAX ? 
no_ : ceil_log2(no_);\n    softmax_ = new FullyConnected(\"LSTM Softmax\", ns_, no_, NT_SOFTMAX);\n  } else {\n    tprintf(\"%d is invalid type of LSTM!\\n\", type);\n    ASSERT_HOST(false);\n  }\n  na_ += nf_;\n}\n\nLSTM::~LSTM() {\n  delete softmax_;\n}\n\n// Returns the shape output from the network given an input shape (which may\n// be partially unknown ie zero).\nStaticShape LSTM::OutputShape(const StaticShape &input_shape) const {\n  StaticShape result = input_shape;\n  result.set_depth(no_);\n  if (type_ == NT_LSTM_SUMMARY) {\n    result.set_width(1);\n  }\n  if (softmax_ != nullptr) {\n    return softmax_->OutputShape(result);\n  }\n  return result;\n}\n\n// Suspends/Enables training by setting the training_ flag. Serialize and\n// DeSerialize only operate on the run-time data if state is false.\nvoid LSTM::SetEnableTraining(TrainingState state) {\n  if (state == TS_RE_ENABLE) {\n    // Enable only from temp disabled.\n    if (training_ == TS_TEMP_DISABLE) {\n      training_ = TS_ENABLED;\n    }\n  } else if (state == TS_TEMP_DISABLE) {\n    // Temp disable only from enabled.\n    if (training_ == TS_ENABLED) {\n      training_ = state;\n    }\n  } else {\n    if (state == TS_ENABLED && training_ != TS_ENABLED) {\n      for (int w = 0; w < WT_COUNT; ++w) {\n        if (w == GFS && !Is2D()) {\n          continue;\n        }\n        gate_weights_[w].InitBackward();\n      }\n    }\n    training_ = state;\n  }\n  if (softmax_ != nullptr) {\n    softmax_->SetEnableTraining(state);\n  }\n}\n\n// Sets up the network for training. 
Initializes weights using weights of\n// scale `range` picked according to the random number generator `randomizer`.\nint LSTM::InitWeights(float range, TRand *randomizer) {\n  Network::SetRandomizer(randomizer);\n  num_weights_ = 0;\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    num_weights_ +=\n        gate_weights_[w].InitWeightsFloat(ns_, na_ + 1, TestFlag(NF_ADAM), range, randomizer);\n  }\n  if (softmax_ != nullptr) {\n    num_weights_ += softmax_->InitWeights(range, randomizer);\n  }\n  return num_weights_;\n}\n\n// Recursively searches the network for softmaxes with old_no outputs,\n// and remaps their outputs according to code_map. See network.h for details.\nint LSTM::RemapOutputs(int old_no, const std::vector<int> &code_map) {\n  if (softmax_ != nullptr) {\n    num_weights_ -= softmax_->num_weights();\n    num_weights_ += softmax_->RemapOutputs(old_no, code_map);\n  }\n  return num_weights_;\n}\n\n// Converts a float network to an int network.\nvoid LSTM::ConvertToInt() {\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    gate_weights_[w].ConvertToInt();\n  }\n  if (softmax_ != nullptr) {\n    softmax_->ConvertToInt();\n  }\n}\n\n// Sets up the network for training using the given weight_range.\nvoid LSTM::DebugWeights() {\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    std::ostringstream msg;\n    msg << name_ << \" Gate weights \" << w;\n    gate_weights_[w].Debug2D(msg.str().c_str());\n  }\n  if (softmax_ != nullptr) {\n    softmax_->DebugWeights();\n  }\n}\n\n// Writes to the given file. 
Returns false in case of error.\nbool LSTM::Serialize(TFile *fp) const {\n  if (!Network::Serialize(fp)) {\n    return false;\n  }\n  if (!fp->Serialize(&na_)) {\n    return false;\n  }\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    if (!gate_weights_[w].Serialize(IsTraining(), fp)) {\n      return false;\n    }\n  }\n  if (softmax_ != nullptr && !softmax_->Serialize(fp)) {\n    return false;\n  }\n  return true;\n}\n\n// Reads from the given file. Returns false in case of error.\n\nbool LSTM::DeSerialize(TFile *fp) {\n  if (!fp->DeSerialize(&na_)) {\n    return false;\n  }\n  if (type_ == NT_LSTM_SOFTMAX) {\n    nf_ = no_;\n  } else if (type_ == NT_LSTM_SOFTMAX_ENCODED) {\n    nf_ = ceil_log2(no_);\n  } else {\n    nf_ = 0;\n  }\n  is_2d_ = false;\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    if (!gate_weights_[w].DeSerialize(IsTraining(), fp)) {\n      return false;\n    }\n    if (w == CI) {\n      ns_ = gate_weights_[CI].NumOutputs();\n      is_2d_ = na_ - nf_ == ni_ + 2 * ns_;\n    }\n  }\n  delete softmax_;\n  if (type_ == NT_LSTM_SOFTMAX || type_ == NT_LSTM_SOFTMAX_ENCODED) {\n    softmax_ = static_cast<FullyConnected *>(Network::CreateFromFile(fp));\n    if (softmax_ == nullptr) {\n      return false;\n    }\n  } else {\n    softmax_ = nullptr;\n  }\n  return true;\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                   NetworkScratch *scratch, NetworkIO *output) {\n  input_map_ = input.stride_map();\n  input_width_ = input.Width();\n  if (softmax_ != nullptr) {\n    output->ResizeFloat(input, no_);\n  } else if (type_ == NT_LSTM_SUMMARY) {\n    output->ResizeXTo1(input, no_);\n  } else {\n    output->Resize(input, no_);\n  }\n  ResizeForward(input);\n  // 
Temporary storage of forward computation for each gate.\n  NetworkScratch::FloatVec temp_lines[WT_COUNT];\n  int ro = ns_;\n  if (source_.int_mode() && IntSimdMatrix::intSimdMatrix) {\n    ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);\n  }\n  for (auto &temp_line : temp_lines) {\n    temp_line.Init(ns_, ro, scratch);\n  }\n  // Single timestep buffers for the current/recurrent output and state.\n  NetworkScratch::FloatVec curr_state, curr_output;\n  curr_state.Init(ns_, scratch);\n  ZeroVector<TFloat>(ns_, curr_state);\n  curr_output.Init(ns_, scratch);\n  ZeroVector<TFloat>(ns_, curr_output);\n  // Rotating buffers of width buf_width allow storage of the state and output\n  // for the other dimension, used only when working in true 2D mode. The width\n  // is enough to hold an entire strip of the major direction.\n  int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1;\n  std::vector<NetworkScratch::FloatVec> states, outputs;\n  if (Is2D()) {\n    states.resize(buf_width);\n    outputs.resize(buf_width);\n    for (int i = 0; i < buf_width; ++i) {\n      states[i].Init(ns_, scratch);\n      ZeroVector<TFloat>(ns_, states[i]);\n      outputs[i].Init(ns_, scratch);\n      ZeroVector<TFloat>(ns_, outputs[i]);\n    }\n  }\n  // Used only if a softmax LSTM.\n  NetworkScratch::FloatVec softmax_output;\n  NetworkScratch::IO int_output;\n  if (softmax_ != nullptr) {\n    softmax_output.Init(no_, scratch);\n    ZeroVector<TFloat>(no_, softmax_output);\n    int rounded_softmax_inputs = gate_weights_[CI].RoundInputs(ns_);\n    if (input.int_mode()) {\n      int_output.Resize2d(true, 1, rounded_softmax_inputs, scratch);\n    }\n    softmax_->SetupForward(input, nullptr);\n  }\n  NetworkScratch::FloatVec curr_input;\n  curr_input.Init(na_, scratch);\n  StrideMap::Index src_index(input_map_);\n  // Used only by NT_LSTM_SUMMARY.\n  StrideMap::Index dest_index(output->stride_map());\n  do {\n    int t = src_index.t();\n    // True if there is a valid old state for the 2nd 
dimension.\n    bool valid_2d = Is2D();\n    if (valid_2d) {\n      StrideMap::Index dim_index(src_index);\n      if (!dim_index.AddOffset(-1, FD_HEIGHT)) {\n        valid_2d = false;\n      }\n    }\n    // Index of the 2-D revolving buffers (outputs, states).\n    int mod_t = Modulo(t, buf_width); // Current timestep.\n    // Setup the padded input in source.\n    source_.CopyTimeStepGeneral(t, 0, ni_, input, t, 0);\n    if (softmax_ != nullptr) {\n      source_.WriteTimeStepPart(t, ni_, nf_, softmax_output);\n    }\n    source_.WriteTimeStepPart(t, ni_ + nf_, ns_, curr_output);\n    if (Is2D()) {\n      source_.WriteTimeStepPart(t, ni_ + nf_ + ns_, ns_, outputs[mod_t]);\n    }\n    if (!source_.int_mode()) {\n      source_.ReadTimeStep(t, curr_input);\n    }\n    // Matrix multiply the inputs with the source.\n    PARALLEL_IF_OPENMP(GFS)\n    // It looks inefficient to create the threads on each t iteration, but the\n    // alternative of putting the parallel outside the t loop, a single around\n    // the t-loop and then tasks in place of the sections is a *lot* slower.\n    // Cell inputs.\n    if (source_.int_mode()) {\n      gate_weights_[CI].MatrixDotVector(source_.i(t), temp_lines[CI]);\n    } else {\n      gate_weights_[CI].MatrixDotVector(curr_input, temp_lines[CI]);\n    }\n    FuncInplace<GFunc>(ns_, temp_lines[CI]);\n\n    SECTION_IF_OPENMP\n    // Input Gates.\n    if (source_.int_mode()) {\n      gate_weights_[GI].MatrixDotVector(source_.i(t), temp_lines[GI]);\n    } else {\n      gate_weights_[GI].MatrixDotVector(curr_input, temp_lines[GI]);\n    }\n    FuncInplace<FFunc>(ns_, temp_lines[GI]);\n\n    SECTION_IF_OPENMP\n    // 1-D forget gates.\n    if (source_.int_mode()) {\n      gate_weights_[GF1].MatrixDotVector(source_.i(t), temp_lines[GF1]);\n    } else {\n      gate_weights_[GF1].MatrixDotVector(curr_input, temp_lines[GF1]);\n    }\n    FuncInplace<FFunc>(ns_, temp_lines[GF1]);\n\n    // 2-D forget gates.\n    if (Is2D()) {\n      if 
(source_.int_mode()) {\n        gate_weights_[GFS].MatrixDotVector(source_.i(t), temp_lines[GFS]);\n      } else {\n        gate_weights_[GFS].MatrixDotVector(curr_input, temp_lines[GFS]);\n      }\n      FuncInplace<FFunc>(ns_, temp_lines[GFS]);\n    }\n\n    SECTION_IF_OPENMP\n    // Output gates.\n    if (source_.int_mode()) {\n      gate_weights_[GO].MatrixDotVector(source_.i(t), temp_lines[GO]);\n    } else {\n      gate_weights_[GO].MatrixDotVector(curr_input, temp_lines[GO]);\n    }\n    FuncInplace<FFunc>(ns_, temp_lines[GO]);\n    END_PARALLEL_IF_OPENMP\n\n    // Apply forget gate to state.\n    MultiplyVectorsInPlace(ns_, temp_lines[GF1], curr_state);\n    if (Is2D()) {\n      // Max-pool the forget gates (in 2-d) instead of blindly adding.\n      int8_t *which_fg_col = which_fg_[t];\n      memset(which_fg_col, 1, ns_ * sizeof(which_fg_col[0]));\n      if (valid_2d) {\n        const TFloat *stepped_state = states[mod_t];\n        for (int i = 0; i < ns_; ++i) {\n          if (temp_lines[GF1][i] < temp_lines[GFS][i]) {\n            curr_state[i] = temp_lines[GFS][i] * stepped_state[i];\n            which_fg_col[i] = 2;\n          }\n        }\n      }\n    }\n    MultiplyAccumulate(ns_, temp_lines[CI], temp_lines[GI], curr_state);\n    // Clip curr_state to a sane range.\n    ClipVector<TFloat>(ns_, -kStateClip, kStateClip, curr_state);\n    if (IsTraining()) {\n      // Save the gate node values.\n      node_values_[CI].WriteTimeStep(t, temp_lines[CI]);\n      node_values_[GI].WriteTimeStep(t, temp_lines[GI]);\n      node_values_[GF1].WriteTimeStep(t, temp_lines[GF1]);\n      node_values_[GO].WriteTimeStep(t, temp_lines[GO]);\n      if (Is2D()) {\n        node_values_[GFS].WriteTimeStep(t, temp_lines[GFS]);\n      }\n    }\n    FuncMultiply<HFunc>(curr_state, temp_lines[GO], ns_, curr_output);\n    if (IsTraining()) {\n      state_.WriteTimeStep(t, curr_state);\n    }\n    if (softmax_ != nullptr) {\n      if (input.int_mode()) {\n        
int_output->WriteTimeStepPart(0, 0, ns_, curr_output);\n        softmax_->ForwardTimeStep(int_output->i(0), t, softmax_output);\n      } else {\n        softmax_->ForwardTimeStep(curr_output, t, softmax_output);\n      }\n      output->WriteTimeStep(t, softmax_output);\n      if (type_ == NT_LSTM_SOFTMAX_ENCODED) {\n        CodeInBinary(no_, nf_, softmax_output);\n      }\n    } else if (type_ == NT_LSTM_SUMMARY) {\n      // Output only at the end of a row.\n      if (src_index.IsLast(FD_WIDTH)) {\n        output->WriteTimeStep(dest_index.t(), curr_output);\n        dest_index.Increment();\n      }\n    } else {\n      output->WriteTimeStep(t, curr_output);\n    }\n    // Save states for use by the 2nd dimension only if needed.\n    if (Is2D()) {\n      CopyVector(ns_, curr_state, states[mod_t]);\n      CopyVector(ns_, curr_output, outputs[mod_t]);\n    }\n    // Always zero the states at the end of every row, but only for the major\n    // direction. The 2-D state remains intact.\n    if (src_index.IsLast(FD_WIDTH)) {\n      ZeroVector<TFloat>(ns_, curr_state);\n      ZeroVector<TFloat>(ns_, curr_output);\n    }\n  } while (src_index.Increment());\n#if DEBUG_DETAIL > 0\n  tprintf(\"Source:%s\\n\", name_.c_str());\n  source_.Print(10);\n  tprintf(\"State:%s\\n\", name_.c_str());\n  state_.Print(10);\n  tprintf(\"Output:%s\\n\", name_.c_str());\n  output->Print(10);\n#endif\n#ifndef GRAPHICS_DISABLED\n  if (debug) {\n    DisplayForward(*output);\n  }\n#endif\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                    NetworkIO *back_deltas) {\n#ifndef GRAPHICS_DISABLED\n  if (debug) {\n    DisplayBackward(fwd_deltas);\n  }\n#endif\n  back_deltas->ResizeToMap(fwd_deltas.int_mode(), input_map_, ni_);\n  // ======Scratch space.======\n  // Output errors from deltas with recurrence from 
sourceerr.\n  NetworkScratch::FloatVec outputerr;\n  outputerr.Init(ns_, scratch);\n  // Recurrent error in the state/source.\n  NetworkScratch::FloatVec curr_stateerr, curr_sourceerr;\n  curr_stateerr.Init(ns_, scratch);\n  curr_sourceerr.Init(na_, scratch);\n  ZeroVector<TFloat>(ns_, curr_stateerr);\n  ZeroVector<TFloat>(na_, curr_sourceerr);\n  // Errors in the gates.\n  NetworkScratch::FloatVec gate_errors[WT_COUNT];\n  for (auto &gate_error : gate_errors) {\n    gate_error.Init(ns_, scratch);\n  }\n  // Rotating buffers of width buf_width allow storage of the recurrent time-\n  // steps used only for true 2-D. Stores one full strip of the major direction.\n  int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1;\n  std::vector<NetworkScratch::FloatVec> stateerr, sourceerr;\n  if (Is2D()) {\n    stateerr.resize(buf_width);\n    sourceerr.resize(buf_width);\n    for (int t = 0; t < buf_width; ++t) {\n      stateerr[t].Init(ns_, scratch);\n      sourceerr[t].Init(na_, scratch);\n      ZeroVector<TFloat>(ns_, stateerr[t]);\n      ZeroVector<TFloat>(na_, sourceerr[t]);\n    }\n  }\n  // Parallel-generated sourceerr from each of the gates.\n  NetworkScratch::FloatVec sourceerr_temps[WT_COUNT];\n  for (auto &sourceerr_temp : sourceerr_temps) {\n    sourceerr_temp.Init(na_, scratch);\n  }\n  int width = input_width_;\n  // Transposed gate errors stored over all timesteps for sum outer.\n  NetworkScratch::GradientStore gate_errors_t[WT_COUNT];\n  for (auto &w : gate_errors_t) {\n    w.Init(ns_, width, scratch);\n  }\n  // Used only if softmax_ != nullptr.\n  NetworkScratch::FloatVec softmax_errors;\n  NetworkScratch::GradientStore softmax_errors_t;\n  if (softmax_ != nullptr) {\n    softmax_errors.Init(no_, scratch);\n    softmax_errors_t.Init(no_, width, scratch);\n  }\n  TFloat state_clip = Is2D() ? 
9.0 : 4.0;\n#if DEBUG_DETAIL > 1\n  tprintf(\"fwd_deltas:%s\\n\", name_.c_str());\n  fwd_deltas.Print(10);\n#endif\n  StrideMap::Index dest_index(input_map_);\n  dest_index.InitToLast();\n  // Used only by NT_LSTM_SUMMARY.\n  StrideMap::Index src_index(fwd_deltas.stride_map());\n  src_index.InitToLast();\n  do {\n    int t = dest_index.t();\n    bool at_last_x = dest_index.IsLast(FD_WIDTH);\n    // up_pos is the 2-D back step, down_pos is the 2-D fwd step, and are only\n    // valid if >= 0, which is true if 2d and not on the top/bottom.\n    int up_pos = -1;\n    int down_pos = -1;\n    if (Is2D()) {\n      if (dest_index.index(FD_HEIGHT) > 0) {\n        StrideMap::Index up_index(dest_index);\n        if (up_index.AddOffset(-1, FD_HEIGHT)) {\n          up_pos = up_index.t();\n        }\n      }\n      if (!dest_index.IsLast(FD_HEIGHT)) {\n        StrideMap::Index down_index(dest_index);\n        if (down_index.AddOffset(1, FD_HEIGHT)) {\n          down_pos = down_index.t();\n        }\n      }\n    }\n    // Index of the 2-D revolving buffers (sourceerr, stateerr).\n    int mod_t = Modulo(t, buf_width); // Current timestep.\n    // Zero the state in the major direction only at the end of every row.\n    if (at_last_x) {\n      ZeroVector<TFloat>(na_, curr_sourceerr);\n      ZeroVector<TFloat>(ns_, curr_stateerr);\n    }\n    // Setup the outputerr.\n    if (type_ == NT_LSTM_SUMMARY) {\n      if (dest_index.IsLast(FD_WIDTH)) {\n        fwd_deltas.ReadTimeStep(src_index.t(), outputerr);\n        src_index.Decrement();\n      } else {\n        ZeroVector<TFloat>(ns_, outputerr);\n      }\n    } else if (softmax_ == nullptr) {\n      fwd_deltas.ReadTimeStep(t, outputerr);\n    } else {\n      softmax_->BackwardTimeStep(fwd_deltas, t, softmax_errors, softmax_errors_t.get(), outputerr);\n    }\n    if (!at_last_x) {\n      AccumulateVector(ns_, curr_sourceerr + ni_ + nf_, outputerr);\n    }\n    if (down_pos >= 0) {\n      AccumulateVector(ns_, sourceerr[mod_t] + ni_ + 
nf_ + ns_, outputerr);\n    }\n    // Apply the 1-d forget gates.\n    if (!at_last_x) {\n      const float *next_node_gf1 = node_values_[GF1].f(t + 1);\n      for (int i = 0; i < ns_; ++i) {\n        curr_stateerr[i] *= next_node_gf1[i];\n      }\n    }\n    if (Is2D() && t + 1 < width) {\n      for (int i = 0; i < ns_; ++i) {\n        if (which_fg_[t + 1][i] != 1) {\n          curr_stateerr[i] = 0.0;\n        }\n      }\n      if (down_pos >= 0) {\n        const float *right_node_gfs = node_values_[GFS].f(down_pos);\n        const TFloat *right_stateerr = stateerr[mod_t];\n        for (int i = 0; i < ns_; ++i) {\n          if (which_fg_[down_pos][i] == 2) {\n            curr_stateerr[i] += right_stateerr[i] * right_node_gfs[i];\n          }\n        }\n      }\n    }\n    state_.FuncMultiply3Add<HPrime>(node_values_[GO], t, outputerr, curr_stateerr);\n    // Clip stateerr_ to a sane range.\n    ClipVector<TFloat>(ns_, -state_clip, state_clip, curr_stateerr);\n#if DEBUG_DETAIL > 1\n    if (t + 10 > width) {\n      tprintf(\"t=%d, stateerr=\", t);\n      for (int i = 0; i < ns_; ++i)\n        tprintf(\" %g,%g,%g\", curr_stateerr[i], outputerr[i], curr_sourceerr[ni_ + nf_ + i]);\n      tprintf(\"\\n\");\n    }\n#endif\n    // Matrix multiply to get the source errors.\n    PARALLEL_IF_OPENMP(GFS)\n\n    // Cell inputs.\n    node_values_[CI].FuncMultiply3<GPrime>(t, node_values_[GI], t, curr_stateerr, gate_errors[CI]);\n    ClipVector(ns_, -kErrClip, kErrClip, gate_errors[CI].get());\n    gate_weights_[CI].VectorDotMatrix(gate_errors[CI], sourceerr_temps[CI]);\n    gate_errors_t[CI].get()->WriteStrided(t, gate_errors[CI]);\n\n    SECTION_IF_OPENMP\n    // Input Gates.\n    node_values_[GI].FuncMultiply3<FPrime>(t, node_values_[CI], t, curr_stateerr, gate_errors[GI]);\n    ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GI].get());\n    gate_weights_[GI].VectorDotMatrix(gate_errors[GI], sourceerr_temps[GI]);\n    gate_errors_t[GI].get()->WriteStrided(t, 
gate_errors[GI]);\n\n    SECTION_IF_OPENMP\n    // 1-D forget Gates.\n    if (t > 0) {\n      node_values_[GF1].FuncMultiply3<FPrime>(t, state_, t - 1, curr_stateerr, gate_errors[GF1]);\n      ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GF1].get());\n      gate_weights_[GF1].VectorDotMatrix(gate_errors[GF1], sourceerr_temps[GF1]);\n    } else {\n      memset(gate_errors[GF1], 0, ns_ * sizeof(gate_errors[GF1][0]));\n      memset(sourceerr_temps[GF1], 0, na_ * sizeof(*sourceerr_temps[GF1]));\n    }\n    gate_errors_t[GF1].get()->WriteStrided(t, gate_errors[GF1]);\n\n    // 2-D forget Gates.\n    if (up_pos >= 0) {\n      node_values_[GFS].FuncMultiply3<FPrime>(t, state_, up_pos, curr_stateerr, gate_errors[GFS]);\n      ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GFS].get());\n      gate_weights_[GFS].VectorDotMatrix(gate_errors[GFS], sourceerr_temps[GFS]);\n    } else {\n      memset(gate_errors[GFS], 0, ns_ * sizeof(gate_errors[GFS][0]));\n      memset(sourceerr_temps[GFS], 0, na_ * sizeof(*sourceerr_temps[GFS]));\n    }\n    if (Is2D()) {\n      gate_errors_t[GFS].get()->WriteStrided(t, gate_errors[GFS]);\n    }\n\n    SECTION_IF_OPENMP\n    // Output gates.\n    state_.Func2Multiply3<HFunc, FPrime>(node_values_[GO], t, outputerr, gate_errors[GO]);\n    ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GO].get());\n    gate_weights_[GO].VectorDotMatrix(gate_errors[GO], sourceerr_temps[GO]);\n    gate_errors_t[GO].get()->WriteStrided(t, gate_errors[GO]);\n    END_PARALLEL_IF_OPENMP\n\n    SumVectors(na_, sourceerr_temps[CI], sourceerr_temps[GI], sourceerr_temps[GF1],\n               sourceerr_temps[GO], sourceerr_temps[GFS], curr_sourceerr);\n    back_deltas->WriteTimeStep(t, curr_sourceerr);\n    // Save states for use by the 2nd dimension only if needed.\n    if (Is2D()) {\n      CopyVector(ns_, curr_stateerr, stateerr[mod_t]);\n      CopyVector(na_, curr_sourceerr, sourceerr[mod_t]);\n    }\n  } while (dest_index.Decrement());\n#if DEBUG_DETAIL > 2\n  
for (int w = 0; w < WT_COUNT; ++w) {\n    tprintf(\"%s gate errors[%d]\\n\", name_.c_str(), w);\n    gate_errors_t[w].get()->PrintUnTransposed(10);\n  }\n#endif\n  // Transposed source_ used to speed-up SumOuter.\n  NetworkScratch::GradientStore source_t, state_t;\n  source_t.Init(na_, width, scratch);\n  source_.Transpose(source_t.get());\n  state_t.Init(ns_, width, scratch);\n  state_.Transpose(state_t.get());\n#ifdef _OPENMP\n#  pragma omp parallel for num_threads(GFS) if (!Is2D())\n#endif\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    gate_weights_[w].SumOuterTransposed(*gate_errors_t[w], *source_t, false);\n  }\n  if (softmax_ != nullptr) {\n    softmax_->FinishBackward(*softmax_errors_t);\n  }\n  return needs_to_backprop_;\n}\n\n// Updates the weights using the given learning rate, momentum and adam_beta.\n// num_samples is used in the adam computation iff use_adam_ is true.\nvoid LSTM::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {\n#if DEBUG_DETAIL > 3\n  PrintW();\n#endif\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    gate_weights_[w].Update(learning_rate, momentum, adam_beta, num_samples);\n  }\n  if (softmax_ != nullptr) {\n    softmax_->Update(learning_rate, momentum, adam_beta, num_samples);\n  }\n#if DEBUG_DETAIL > 3\n  PrintDW();\n#endif\n}\n\n// Sums the products of weight updates in *this and other, splitting into\n// positive (same direction) in *same and negative (different direction) in\n// *changed.\nvoid LSTM::CountAlternators(const Network &other, TFloat *same, TFloat *changed) const {\n  ASSERT_HOST(other.type() == type_);\n  const LSTM *lstm = static_cast<const LSTM *>(&other);\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    gate_weights_[w].CountAlternators(lstm->gate_weights_[w], same, changed);\n  }\n  if (softmax_ != nullptr) {\n    
softmax_->CountAlternators(*lstm->softmax_, same, changed);\n  }\n}\n\n#if DEBUG_DETAIL > 3\n\n// Prints the weights for debug purposes.\nvoid LSTM::PrintW() {\n  tprintf(\"Weight state:%s\\n\", name_.c_str());\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    tprintf(\"Gate %d, inputs\\n\", w);\n    for (int i = 0; i < ni_; ++i) {\n      tprintf(\"Row %d:\", i);\n      for (int s = 0; s < ns_; ++s) {\n        tprintf(\" %g\", gate_weights_[w].GetWeights(s)[i]);\n      }\n      tprintf(\"\\n\");\n    }\n    tprintf(\"Gate %d, outputs\\n\", w);\n    for (int i = ni_; i < ni_ + ns_; ++i) {\n      tprintf(\"Row %d:\", i - ni_);\n      for (int s = 0; s < ns_; ++s) {\n        tprintf(\" %g\", gate_weights_[w].GetWeights(s)[i]);\n      }\n      tprintf(\"\\n\");\n    }\n    tprintf(\"Gate %d, bias\\n\", w);\n    for (int s = 0; s < ns_; ++s) {\n      tprintf(\" %g\", gate_weights_[w].GetWeights(s)[na_]);\n    }\n    tprintf(\"\\n\");\n  }\n}\n\n// Prints the weight deltas for debug purposes.\nvoid LSTM::PrintDW() {\n  tprintf(\"Delta state:%s\\n\", name_.c_str());\n  for (int w = 0; w < WT_COUNT; ++w) {\n    if (w == GFS && !Is2D()) {\n      continue;\n    }\n    tprintf(\"Gate %d, inputs\\n\", w);\n    for (int i = 0; i < ni_; ++i) {\n      tprintf(\"Row %d:\", i);\n      for (int s = 0; s < ns_; ++s) {\n        tprintf(\" %g\", gate_weights_[w].GetDW(s, i));\n      }\n      tprintf(\"\\n\");\n    }\n    tprintf(\"Gate %d, outputs\\n\", w);\n    for (int i = ni_; i < ni_ + ns_; ++i) {\n      tprintf(\"Row %d:\", i - ni_);\n      for (int s = 0; s < ns_; ++s) {\n        tprintf(\" %g\", gate_weights_[w].GetDW(s, i));\n      }\n      tprintf(\"\\n\");\n    }\n    tprintf(\"Gate %d, bias\\n\", w);\n    for (int s = 0; s < ns_; ++s) {\n      tprintf(\" %g\", gate_weights_[w].GetDW(s, na_));\n    }\n    tprintf(\"\\n\");\n  }\n}\n\n#endif\n\n// Resizes forward data to cope with an input image of the given width.\nvoid 
LSTM::ResizeForward(const NetworkIO &input) {\n  int rounded_inputs = gate_weights_[CI].RoundInputs(na_);\n  source_.Resize(input, rounded_inputs);\n  which_fg_.ResizeNoInit(input.Width(), ns_);\n  if (IsTraining()) {\n    state_.ResizeFloat(input, ns_);\n    for (int w = 0; w < WT_COUNT; ++w) {\n      if (w == GFS && !Is2D()) {\n        continue;\n      }\n      node_values_[w].ResizeFloat(input, ns_);\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/lstm.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstm.h\n// Description: Long-term-short-term-memory Recurrent neural network.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_LSTM_H_\n#define TESSERACT_LSTM_LSTM_H_\n\n#include \"fullyconnected.h\"\n#include \"network.h\"\n\nnamespace tesseract {\n\n// C++ Implementation of the LSTM class from lstm.py.\nclass LSTM : public Network {\npublic:\n  // Enum for the different weights in LSTM, to reduce some of the I/O and\n  // setup code to loops. 
The elements of the enum correspond to elements of an\n  // array of WeightMatrix or a corresponding array of NetworkIO.\n  enum WeightType {\n    CI,  // Cell Inputs.\n    GI,  // Gate at the input.\n    GF1, // Forget gate at the memory (1-d or looking back 1 timestep).\n    GO,  // Gate at the output.\n    GFS, // Forget gate at the memory, looking back in the other dimension.\n\n    WT_COUNT // Number of WeightTypes.\n  };\n\n  // Constructor for NT_LSTM (regular 1 or 2-d LSTM), NT_LSTM_SOFTMAX (LSTM with\n  // additional softmax layer included and fed back into the input at the next\n  // timestep), or NT_LSTM_SOFTMAX_ENCODED (as LSTM_SOFTMAX, but the feedback\n  // is binary encoded instead of categorical) only.\n  // 2-d and bidi softmax LSTMs are not rejected, but are impossible to build\n  // in the conventional way because the output feedback both forwards and\n  // backwards in time does become impossible.\n  TESS_API\n  LSTM(const std::string &name, int num_inputs, int num_states, int num_outputs,\n       bool two_dimensional, NetworkType type);\n  ~LSTM() override;\n\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  StaticShape OutputShape(const StaticShape &input_shape) const override;\n\n  std::string spec() const override {\n    std::string spec;\n    if (type_ == NT_LSTM) {\n      spec += \"Lfx\" + std::to_string(ns_);\n    } else if (type_ == NT_LSTM_SUMMARY) {\n      spec += \"Lfxs\" + std::to_string(ns_);\n    } else if (type_ == NT_LSTM_SOFTMAX) {\n      spec += \"LS\" + std::to_string(ns_);\n    } else if (type_ == NT_LSTM_SOFTMAX_ENCODED) {\n      spec += \"LE\" + std::to_string(ns_);\n    }\n    if (softmax_ != nullptr) {\n      spec += softmax_->spec();\n    }\n    return spec;\n  }\n\n  // Suspends/Enables training by setting the training_ flag. 
Serialize and\n  // DeSerialize only operate on the run-time data if state is false.\n  void SetEnableTraining(TrainingState state) override;\n\n  // Sets up the network for training. Initializes weights using weights of\n  // scale `range` picked according to the random number generator `randomizer`.\n  int InitWeights(float range, TRand *randomizer) override;\n  // Recursively searches the network for softmaxes with old_no outputs,\n  // and remaps their outputs according to code_map. See network.h for details.\n  int RemapOutputs(int old_no, const std::vector<int> &code_map) override;\n\n  // Converts a float network to an int network.\n  void ConvertToInt() override;\n\n  // Provides debug output on the weights.\n  void DebugWeights() override;\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const override;\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp) override;\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n  // Updates the weights using the given learning rate, momentum and adam_beta.\n  // num_samples is used in the adam computation iff use_adam_ is true.\n  void Update(float learning_rate, float momentum, float adam_beta, int num_samples) override;\n  // Sums the products of weight updates in *this and other, splitting into\n  // positive (same direction) in *same and negative (different direction) in\n  // *changed.\n  void CountAlternators(const Network &other, 
TFloat *same, TFloat *changed) const override;\n  // Prints the weights for debug purposes.\n  void PrintW();\n  // Prints the weight deltas for debug purposes.\n  void PrintDW();\n\n  // Returns true of this is a 2-d lstm.\n  bool Is2D() const {\n    return is_2d_;\n  }\n\nprivate:\n  // Resizes forward data to cope with an input image of the given width.\n  void ResizeForward(const NetworkIO &input);\n\nprivate:\n  // Size of padded input to weight matrices = ni_ + no_ for 1-D operation\n  // and ni_ + 2 * no_ for 2-D operation. Note that there is a phantom 1 input\n  // for the bias that makes the weight matrices of size [na + 1][no].\n  int32_t na_;\n  // Number of internal states. Equal to no_ except for a softmax LSTM.\n  // ns_ is NOT serialized, but is calculated from gate_weights_.\n  int32_t ns_;\n  // Number of additional feedback states. The softmax types feed back\n  // additional output information on top of the ns_ internal states.\n  // In the case of a binary-coded (EMBEDDED) softmax, nf_ < no_.\n  int32_t nf_;\n  // Flag indicating 2-D operation.\n  bool is_2d_;\n\n  // Gate weight arrays of size [na + 1, no].\n  WeightMatrix gate_weights_[WT_COUNT];\n  // Used only if this is a softmax LSTM.\n  FullyConnected *softmax_;\n  // Input padded with previous output of size [width, na].\n  NetworkIO source_;\n  // Internal state used during forward operation, of size [width, ns].\n  NetworkIO state_;\n  // State of the 2-d maxpool, generated during forward, used during backward.\n  GENERIC_2D_ARRAY<int8_t> which_fg_;\n  // Internal state saved from forward, but used only during backward.\n  NetworkIO node_values_[WT_COUNT];\n  // Preserved input stride_map used for Backward when NT_LSTM_SQUASHED.\n  StrideMap input_map_;\n  int input_width_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_LSTM_H_\n"
  },
  {
    "path": "src/lstm/lstmrecognizer.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmrecognizer.cpp\n// Description: Top-level line recognizer class for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"lstmrecognizer.h\"\n\n#include <allheaders.h>\n#include \"dict.h\"\n#include \"genericheap.h\"\n#include \"helpers.h\"\n#include \"imagedata.h\"\n#include \"input.h\"\n#include \"lstm.h\"\n#include \"normalis.h\"\n#include \"pageres.h\"\n#include \"ratngs.h\"\n#include \"recodebeam.h\"\n#include \"scrollview.h\"\n#include \"statistc.h\"\n#include \"tprintf.h\"\n\n#include <unordered_set>\n#include <vector>\n\nnamespace tesseract {\n\n// Default ratio between dict and non-dict words.\nconst double kDictRatio = 2.25;\n// Default certainty offset to give the dictionary a chance.\nconst double kCertOffset = -0.085;\n\nLSTMRecognizer::LSTMRecognizer(const std::string &language_data_path_prefix)\n    : LSTMRecognizer::LSTMRecognizer() {\n  ccutil_.language_data_path_prefix = language_data_path_prefix;\n}\n\nLSTMRecognizer::LSTMRecognizer()\n    : network_(nullptr)\n    , training_flags_(0)\n    , training_iteration_(0)\n    , 
sample_iteration_(0)\n    , null_char_(UNICHAR_BROKEN)\n    , learning_rate_(0.0f)\n    , momentum_(0.0f)\n    , adam_beta_(0.0f)\n    , dict_(nullptr)\n    , search_(nullptr)\n    , debug_win_(nullptr) {}\n\nLSTMRecognizer::~LSTMRecognizer() {\n  delete network_;\n  delete dict_;\n  delete search_;\n}\n\n// Loads a model from mgr, including the dictionary only if lang is not null.\nbool LSTMRecognizer::Load(const ParamsVectors *params, const std::string &lang,\n                          TessdataManager *mgr) {\n  TFile fp;\n  if (!mgr->GetComponent(TESSDATA_LSTM, &fp)) {\n    return false;\n  }\n  if (!DeSerialize(mgr, &fp)) {\n    return false;\n  }\n  if (lang.empty()) {\n    return true;\n  }\n  // Allow it to run without a dictionary.\n  LoadDictionary(params, lang, mgr);\n  return true;\n}\n\n// Writes to the given file. Returns false in case of error.\nbool LSTMRecognizer::Serialize(const TessdataManager *mgr, TFile *fp) const {\n  bool include_charsets = mgr == nullptr || !mgr->IsComponentAvailable(TESSDATA_LSTM_RECODER) ||\n                          !mgr->IsComponentAvailable(TESSDATA_LSTM_UNICHARSET);\n  if (!network_->Serialize(fp)) {\n    return false;\n  }\n  if (include_charsets && !GetUnicharset().save_to_file(fp)) {\n    return false;\n  }\n  if (!fp->Serialize(network_str_)) {\n    return false;\n  }\n  if (!fp->Serialize(&training_flags_)) {\n    return false;\n  }\n  if (!fp->Serialize(&training_iteration_)) {\n    return false;\n  }\n  if (!fp->Serialize(&sample_iteration_)) {\n    return false;\n  }\n  if (!fp->Serialize(&null_char_)) {\n    return false;\n  }\n  if (!fp->Serialize(&adam_beta_)) {\n    return false;\n  }\n  if (!fp->Serialize(&learning_rate_)) {\n    return false;\n  }\n  if (!fp->Serialize(&momentum_)) {\n    return false;\n  }\n  if (include_charsets && IsRecoding() && !recoder_.Serialize(fp)) {\n    return false;\n  }\n  return true;\n}\n\n// Reads from the given file. 
Returns false in case of error.\nbool LSTMRecognizer::DeSerialize(const TessdataManager *mgr, TFile *fp) {\n  delete network_;\n  network_ = Network::CreateFromFile(fp);\n  if (network_ == nullptr) {\n    return false;\n  }\n  bool include_charsets = mgr == nullptr || !mgr->IsComponentAvailable(TESSDATA_LSTM_RECODER) ||\n                          !mgr->IsComponentAvailable(TESSDATA_LSTM_UNICHARSET);\n  if (include_charsets && !ccutil_.unicharset.load_from_file(fp, false)) {\n    return false;\n  }\n  if (!fp->DeSerialize(network_str_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&training_flags_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&training_iteration_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&sample_iteration_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&null_char_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&adam_beta_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&learning_rate_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&momentum_)) {\n    return false;\n  }\n  if (include_charsets && !LoadRecoder(fp)) {\n    return false;\n  }\n  if (!include_charsets && !LoadCharsets(mgr)) {\n    return false;\n  }\n  network_->SetRandomizer(&randomizer_);\n  network_->CacheXScaleFactor(network_->XScaleFactor());\n  return true;\n}\n\n// Loads the charsets from mgr.\nbool LSTMRecognizer::LoadCharsets(const TessdataManager *mgr) {\n  TFile fp;\n  if (!mgr->GetComponent(TESSDATA_LSTM_UNICHARSET, &fp)) {\n    return false;\n  }\n  if (!ccutil_.unicharset.load_from_file(&fp, false)) {\n    return false;\n  }\n  if (!mgr->GetComponent(TESSDATA_LSTM_RECODER, &fp)) {\n    return false;\n  }\n  if (!LoadRecoder(&fp)) {\n    return false;\n  }\n  return true;\n}\n\n// Loads the Recoder.\nbool LSTMRecognizer::LoadRecoder(TFile *fp) {\n  if (IsRecoding()) {\n    if (!recoder_.DeSerialize(fp)) {\n      return false;\n    }\n    RecodedCharID code;\n    recoder_.EncodeUnichar(UNICHAR_SPACE, &code);\n    if (code(0) != 
UNICHAR_SPACE) {\n      tprintf(\"Space was garbled in recoding!!\\n\");\n      return false;\n    }\n  } else {\n    recoder_.SetupPassThrough(GetUnicharset());\n    training_flags_ |= TF_COMPRESS_UNICHARSET;\n  }\n  return true;\n}\n\n// Loads the dictionary if possible from the traineddata file.\n// Prints a warning message, and returns false but otherwise fails silently\n// and continues to work without it if loading fails.\n// Note that dictionary load is independent from DeSerialize, but dependent\n// on the unicharset matching. This enables training to deserialize a model\n// from checkpoint or restore without having to go back and reload the\n// dictionary.\n// Some parameters have to be passed in (from langdata/config/api via Tesseract)\nbool LSTMRecognizer::LoadDictionary(const ParamsVectors *params, const std::string &lang,\n                                    TessdataManager *mgr) {\n  delete dict_;\n  dict_ = new Dict(&ccutil_);\n  dict_->user_words_file.ResetFrom(params);\n  dict_->user_words_suffix.ResetFrom(params);\n  dict_->user_patterns_file.ResetFrom(params);\n  dict_->user_patterns_suffix.ResetFrom(params);\n  dict_->SetupForLoad(Dict::GlobalDawgCache());\n  dict_->LoadLSTM(lang, mgr);\n  if (dict_->FinishLoad()) {\n    return true; // Success.\n  }\n  if (log_level <= 0) {\n    tprintf(\"Failed to load any lstm-specific dictionaries for lang %s!!\\n\", lang.c_str());\n  }\n  delete dict_;\n  dict_ = nullptr;\n  return false;\n}\n\n// Recognizes the line image, contained within image_data, returning the\n// ratings matrix and matching box_word for each WERD_RES in the output.\nvoid LSTMRecognizer::RecognizeLine(const ImageData &image_data,\n                                   float invert_threshold, bool debug,\n                                   double worst_dict_cert, const TBOX &line_box,\n                                   PointerVector<WERD_RES> *words, int lstm_choice_mode,\n                                   int lstm_choice_amount) {\n  
NetworkIO outputs;\n  float scale_factor;\n  NetworkIO inputs;\n  if (!RecognizeLine(image_data, invert_threshold, debug, false, false, &scale_factor, &inputs, &outputs)) {\n    return;\n  }\n  if (search_ == nullptr) {\n    search_ = new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_);\n  }\n  search_->excludedUnichars.clear();\n  search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, &GetUnicharset(),\n                  lstm_choice_mode);\n  search_->ExtractBestPathAsWords(line_box, scale_factor, debug, &GetUnicharset(), words,\n                                  lstm_choice_mode);\n  if (lstm_choice_mode) {\n    search_->extractSymbolChoices(&GetUnicharset());\n    for (int i = 0; i < lstm_choice_amount; ++i) {\n      search_->DecodeSecondaryBeams(outputs, kDictRatio, kCertOffset, worst_dict_cert,\n                                    &GetUnicharset(), lstm_choice_mode);\n      search_->extractSymbolChoices(&GetUnicharset());\n    }\n    search_->segmentTimestepsByCharacters();\n    unsigned char_it = 0;\n    for (size_t i = 0; i < words->size(); ++i) {\n      for (int j = 0; j < words->at(i)->end; ++j) {\n        if (char_it < search_->ctc_choices.size()) {\n          words->at(i)->CTC_symbol_choices.push_back(search_->ctc_choices[char_it]);\n        }\n        if (char_it < search_->segmentedTimesteps.size()) {\n          words->at(i)->segmented_timesteps.push_back(search_->segmentedTimesteps[char_it]);\n        }\n        ++char_it;\n      }\n      words->at(i)->timesteps =\n          search_->combineSegmentedTimesteps(&words->at(i)->segmented_timesteps);\n    }\n    search_->segmentedTimesteps.clear();\n    search_->ctc_choices.clear();\n    search_->excludedUnichars.clear();\n  }\n}\n\n// Helper computes min and mean best results in the output.\nvoid LSTMRecognizer::OutputStats(const NetworkIO &outputs, float *min_output, float *mean_output,\n                                 float *sd) {\n  const int kOutputScale = INT8_MAX;\n  
STATS stats(0, kOutputScale);\n  for (int t = 0; t < outputs.Width(); ++t) {\n    int best_label = outputs.BestLabel(t, nullptr);\n    if (best_label != null_char_) {\n      float best_output = outputs.f(t)[best_label];\n      stats.add(static_cast<int>(kOutputScale * best_output), 1);\n    }\n  }\n  // If the output is all nulls it could be that the photometric interpretation\n  // is wrong, so make it look bad, so the other way can win, even if not great.\n  if (stats.get_total() == 0) {\n    *min_output = 0.0f;\n    *mean_output = 0.0f;\n    *sd = 1.0f;\n  } else {\n    *min_output = static_cast<float>(stats.min_bucket()) / kOutputScale;\n    *mean_output = stats.mean() / kOutputScale;\n    *sd = stats.sd() / kOutputScale;\n  }\n}\n\n// Recognizes the image_data, returning the labels,\n// scores, and corresponding pairs of start, end x-coords in coords.\nbool LSTMRecognizer::RecognizeLine(const ImageData &image_data,\n                                   float invert_threshold, bool debug,\n                                   bool re_invert, bool upside_down, float *scale_factor,\n                                   NetworkIO *inputs, NetworkIO *outputs) {\n  // This ensures consistent recognition results.\n  SetRandomSeed();\n  int min_width = network_->XScaleFactor();\n  Image pix = Input::PrepareLSTMInputs(image_data, network_, min_width, &randomizer_, scale_factor);\n  if (pix == nullptr) {\n    tprintf(\"Line cannot be recognized!!\\n\");\n    return false;\n  }\n  // Maximum width of image to train on.\n  const int kMaxImageWidth = 128 * pixGetHeight(pix);\n  if (network_->IsTraining() && pixGetWidth(pix) > kMaxImageWidth) {\n    tprintf(\"Image too large to learn!! 
Size = %dx%d\\n\", pixGetWidth(pix), pixGetHeight(pix));\n    pix.destroy();\n    return false;\n  }\n  if (upside_down) {\n    pixRotate180(pix, pix);\n  }\n  // Reduction factor from image to coords.\n  *scale_factor = min_width / *scale_factor;\n  inputs->set_int_mode(IsIntMode());\n  SetRandomSeed();\n  Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, inputs);\n  network_->Forward(debug, *inputs, nullptr, &scratch_space_, outputs);\n  // Check for auto inversion.\n  if (invert_threshold > 0.0f) {\n    float pos_min, pos_mean, pos_sd;\n    OutputStats(*outputs, &pos_min, &pos_mean, &pos_sd);\n    if (pos_mean < invert_threshold) {\n      // Run again inverted and see if it is any better.\n      NetworkIO inv_inputs, inv_outputs;\n      inv_inputs.set_int_mode(IsIntMode());\n      SetRandomSeed();\n      pixInvert(pix, pix);\n      Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, &inv_inputs);\n      network_->Forward(debug, inv_inputs, nullptr, &scratch_space_, &inv_outputs);\n      float inv_min, inv_mean, inv_sd;\n      OutputStats(inv_outputs, &inv_min, &inv_mean, &inv_sd);\n      if (inv_mean > pos_mean) {\n        // Inverted did better. 
Use inverted data.\n        if (debug) {\n          tprintf(\"Inverting image: old min=%g, mean=%g, sd=%g, inv %g,%g,%g\\n\", pos_min, pos_mean,\n                  pos_sd, inv_min, inv_mean, inv_sd);\n        }\n        *outputs = std::move(inv_outputs);\n        *inputs = std::move(inv_inputs);\n      } else if (re_invert) {\n        // Inverting was not an improvement, so undo and run again, so the\n        // outputs match the best forward result.\n        SetRandomSeed();\n        network_->Forward(debug, *inputs, nullptr, &scratch_space_, outputs);\n      }\n    }\n  }\n\n  pix.destroy();\n  if (debug) {\n    std::vector<int> labels, coords;\n    LabelsFromOutputs(*outputs, &labels, &coords);\n#ifndef GRAPHICS_DISABLED\n    DisplayForward(*inputs, labels, coords, \"LSTMForward\", &debug_win_);\n#endif\n    DebugActivationPath(*outputs, labels, coords);\n  }\n  return true;\n}\n\n// Converts an array of labels to utf-8, whether or not the labels are\n// augmented with character boundaries.\nstd::string LSTMRecognizer::DecodeLabels(const std::vector<int> &labels) {\n  std::string result;\n  unsigned end = 1;\n  for (unsigned start = 0; start < labels.size(); start = end) {\n    if (labels[start] == null_char_) {\n      end = start + 1;\n    } else {\n      result += DecodeLabel(labels, start, &end, nullptr);\n    }\n  }\n  return result;\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Displays the forward results in a window with the characters and\n// boundaries as determined by the labels and label_coords.\nvoid LSTMRecognizer::DisplayForward(const NetworkIO &inputs, const std::vector<int> &labels,\n                                    const std::vector<int> &label_coords, const char *window_name,\n                                    ScrollView **window) {\n  Image input_pix = inputs.ToPix();\n  Network::ClearWindow(false, window_name, pixGetWidth(input_pix), pixGetHeight(input_pix), window);\n  int line_height = Network::DisplayImage(input_pix, *window);\n  
DisplayLSTMOutput(labels, label_coords, line_height, *window);\n}\n\n// Displays the labels and cuts at the corresponding xcoords.\n// Size of labels should match xcoords.\nvoid LSTMRecognizer::DisplayLSTMOutput(const std::vector<int> &labels,\n                                       const std::vector<int> &xcoords, int height,\n                                       ScrollView *window) {\n  int x_scale = network_->XScaleFactor();\n  window->TextAttributes(\"Arial\", height / 4, false, false, false);\n  unsigned end = 1;\n  for (unsigned start = 0; start < labels.size(); start = end) {\n    int xpos = xcoords[start] * x_scale;\n    if (labels[start] == null_char_) {\n      end = start + 1;\n      window->Pen(ScrollView::RED);\n    } else {\n      window->Pen(ScrollView::GREEN);\n      const char *str = DecodeLabel(labels, start, &end, nullptr);\n      if (*str == '\\\\') {\n        str = \"\\\\\\\\\";\n      }\n      xpos = xcoords[(start + end) / 2] * x_scale;\n      window->Text(xpos, height, str);\n    }\n    window->Line(xpos, 0, xpos, height * 3 / 2);\n  }\n  window->Update();\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Prints debug output detailing the activation path that is implied by the\n// label_coords.\nvoid LSTMRecognizer::DebugActivationPath(const NetworkIO &outputs, const std::vector<int> &labels,\n                                         const std::vector<int> &xcoords) {\n  if (xcoords[0] > 0) {\n    DebugActivationRange(outputs, \"<null>\", null_char_, 0, xcoords[0]);\n  }\n  unsigned end = 1;\n  for (unsigned start = 0; start < labels.size(); start = end) {\n    if (labels[start] == null_char_) {\n      end = start + 1;\n      DebugActivationRange(outputs, \"<null>\", null_char_, xcoords[start], xcoords[end]);\n      continue;\n    } else {\n      int decoded;\n      const char *label = DecodeLabel(labels, start, &end, &decoded);\n      DebugActivationRange(outputs, label, labels[start], xcoords[start], xcoords[start + 1]);\n      for (unsigned i = 
start + 1; i < end; ++i) {\n        DebugActivationRange(outputs, DecodeSingleLabel(labels[i]), labels[i], xcoords[i],\n                             xcoords[i + 1]);\n      }\n    }\n  }\n}\n\n// Prints debug output detailing activations and 2nd choice over a range\n// of positions.\nvoid LSTMRecognizer::DebugActivationRange(const NetworkIO &outputs, const char *label,\n                                          int best_choice, int x_start, int x_end) {\n  tprintf(\"%s=%d On [%d, %d), scores=\", label, best_choice, x_start, x_end);\n  double max_score = 0.0;\n  double mean_score = 0.0;\n  const int width = x_end - x_start;\n  for (int x = x_start; x < x_end; ++x) {\n    const float *line = outputs.f(x);\n    const double score = line[best_choice] * 100.0;\n    if (score > max_score) {\n      max_score = score;\n    }\n    mean_score += score / width;\n    int best_c = 0;\n    double best_score = 0.0;\n    for (int c = 0; c < outputs.NumFeatures(); ++c) {\n      if (c != best_choice && line[c] > best_score) {\n        best_c = c;\n        best_score = line[c];\n      }\n    }\n    tprintf(\" %.3g(%s=%d=%.3g)\", score, DecodeSingleLabel(best_c), best_c, best_score * 100.0);\n  }\n  tprintf(\", Mean=%g, max=%g\\n\", mean_score, max_score);\n}\n\n// Helper returns true if the null_char is the winner at t, and it beats the\n// null_threshold, or the next choice is space, in which case we will use the\n// null anyway.\n#if 0 // TODO: unused, remove if still unused after 2020.\nstatic bool NullIsBest(const NetworkIO& output, float null_thr,\n                       int null_char, int t) {\n  if (output.f(t)[null_char] >= null_thr) return true;\n  if (output.BestLabel(t, null_char, null_char, nullptr) != UNICHAR_SPACE)\n    return false;\n  return output.f(t)[null_char] > output.f(t)[UNICHAR_SPACE];\n}\n#endif\n\n// Converts the network output to a sequence of labels. 
Outputs labels, scores\n// and start xcoords of each char, and each null_char_, with an additional\n// final xcoord for the end of the output.\n// The conversion method is determined by internal state.\nvoid LSTMRecognizer::LabelsFromOutputs(const NetworkIO &outputs, std::vector<int> *labels,\n                                       std::vector<int> *xcoords) {\n  if (SimpleTextOutput()) {\n    LabelsViaSimpleText(outputs, labels, xcoords);\n  } else {\n    LabelsViaReEncode(outputs, labels, xcoords);\n  }\n}\n\n// As LabelsViaCTC except that this function constructs the best path that\n// contains only legal sequences of subcodes for CJK.\nvoid LSTMRecognizer::LabelsViaReEncode(const NetworkIO &output, std::vector<int> *labels,\n                                       std::vector<int> *xcoords) {\n  if (search_ == nullptr) {\n    search_ = new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_);\n  }\n  search_->Decode(output, 1.0, 0.0, RecodeBeamSearch::kMinCertainty, nullptr);\n  search_->ExtractBestPathAsLabels(labels, xcoords);\n}\n\n// Converts the network output to a sequence of labels, with scores, using\n// the simple character model (each position is a char, and the null_char_ is\n// mainly intended for tail padding.)\nvoid LSTMRecognizer::LabelsViaSimpleText(const NetworkIO &output, std::vector<int> *labels,\n                                         std::vector<int> *xcoords) {\n  labels->clear();\n  xcoords->clear();\n  const int width = output.Width();\n  for (int t = 0; t < width; ++t) {\n    float score = 0.0f;\n    const int label = output.BestLabel(t, &score);\n    if (label != null_char_) {\n      labels->push_back(label);\n      xcoords->push_back(t);\n    }\n  }\n  xcoords->push_back(width);\n}\n\n// Returns a string corresponding to the label starting at start. 
Sets *end\n// to the next start and if non-null, *decoded to the unichar id.\nconst char *LSTMRecognizer::DecodeLabel(const std::vector<int> &labels, unsigned start, unsigned *end,\n                                        int *decoded) {\n  *end = start + 1;\n  if (IsRecoding()) {\n    // Decode labels via recoder_.\n    RecodedCharID code;\n    if (labels[start] == null_char_) {\n      if (decoded != nullptr) {\n        code.Set(0, null_char_);\n        *decoded = recoder_.DecodeUnichar(code);\n      }\n      return \"<null>\";\n    }\n    unsigned index = start;\n    while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen) {\n      code.Set(code.length(), labels[index++]);\n      while (index < labels.size() && labels[index] == null_char_) {\n        ++index;\n      }\n      int uni_id = recoder_.DecodeUnichar(code);\n      // If the next label isn't a valid first code, then we need to continue\n      // extending even if we have a valid uni_id from this prefix.\n      if (uni_id != INVALID_UNICHAR_ID &&\n          (index == labels.size() || code.length() == RecodedCharID::kMaxCodeLen ||\n           recoder_.IsValidFirstCode(labels[index]))) {\n        *end = index;\n        if (decoded != nullptr) {\n          *decoded = uni_id;\n        }\n        if (uni_id == UNICHAR_SPACE) {\n          return \" \";\n        }\n        return GetUnicharset().get_normed_unichar(uni_id);\n      }\n    }\n    return \"<Undecodable>\";\n  } else {\n    if (decoded != nullptr) {\n      *decoded = labels[start];\n    }\n    if (labels[start] == null_char_) {\n      return \"<null>\";\n    }\n    if (labels[start] == UNICHAR_SPACE) {\n      return \" \";\n    }\n    return GetUnicharset().get_normed_unichar(labels[start]);\n  }\n}\n\n// Returns a string corresponding to a given single label id, falling back to\n// a default of \"..\" for part of a multi-label unichar-id.\nconst char *LSTMRecognizer::DecodeSingleLabel(int label) {\n  if (label == null_char_) {\n   
 return \"<null>\";\n  }\n  if (IsRecoding()) {\n    // Decode label via recoder_.\n    RecodedCharID code;\n    code.Set(0, label);\n    label = recoder_.DecodeUnichar(code);\n    if (label == INVALID_UNICHAR_ID) {\n      return \"..\"; // Part of a bigger code.\n    }\n  }\n  if (label == UNICHAR_SPACE) {\n    return \" \";\n  }\n  return GetUnicharset().get_normed_unichar(label);\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/lstmrecognizer.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmrecognizer.h\n// Description: Top-level line recognizer class for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_LSTMRECOGNIZER_H_\n#define TESSERACT_LSTM_LSTMRECOGNIZER_H_\n\n#include \"ccutil.h\"\n#include \"helpers.h\"\n#include \"matrix.h\"\n#include \"network.h\"\n#include \"networkscratch.h\"\n#include \"params.h\"\n#include \"recodebeam.h\"\n#include \"series.h\"\n#include \"unicharcompress.h\"\n\nclass BLOB_CHOICE_IT;\nstruct Pix;\nclass ROW_RES;\nclass ScrollView;\nclass TBOX;\nclass WERD_RES;\n\nnamespace tesseract {\n\nclass Dict;\nclass ImageData;\n\n// Enum indicating training mode control flags.\nenum TrainingFlags {\n  TF_INT_MODE = 1,\n  TF_COMPRESS_UNICHARSET = 64,\n};\n\n// Top-level line recognizer class for LSTM-based networks.\n// Note that a sub-class, LSTMTrainer is used for training.\nclass TESS_API LSTMRecognizer {\npublic:\n  LSTMRecognizer();\n  LSTMRecognizer(const std::string &language_data_path_prefix);\n  ~LSTMRecognizer();\n\n  int NumOutputs() const {\n    return network_->NumOutputs();\n  }\n\n  // Return the training iterations.\n  int training_iteration() const {\n    return training_iteration_;\n  }\n\n  // Return the sample 
iterations.\n  int sample_iteration() const {\n    return sample_iteration_;\n  }\n\n  // Return the learning rate.\n  float learning_rate() const {\n    return learning_rate_;\n  }\n\n  LossType OutputLossType() const {\n    if (network_ == nullptr) {\n      return LT_NONE;\n    }\n    StaticShape shape;\n    shape = network_->OutputShape(shape);\n    return shape.loss_type();\n  }\n  bool SimpleTextOutput() const {\n    return OutputLossType() == LT_SOFTMAX;\n  }\n  bool IsIntMode() const {\n    return (training_flags_ & TF_INT_MODE) != 0;\n  }\n  // True if recoder_ is active to re-encode text to a smaller space.\n  bool IsRecoding() const {\n    return (training_flags_ & TF_COMPRESS_UNICHARSET) != 0;\n  }\n  // Returns true if the network is a TensorFlow network.\n  bool IsTensorFlow() const {\n    return network_->type() == NT_TENSORFLOW;\n  }\n  // Returns a vector of layer ids that can be passed to other layer functions\n  // to access a specific layer.\n  std::vector<std::string> EnumerateLayers() const {\n    ASSERT_HOST(network_ != nullptr && network_->type() == NT_SERIES);\n    auto *series = static_cast<Series *>(network_);\n    std::vector<std::string> layers;\n    series->EnumerateLayers(nullptr, layers);\n    return layers;\n  }\n  // Returns a specific layer from its id (from EnumerateLayers).\n  Network *GetLayer(const std::string &id) const {\n    ASSERT_HOST(network_ != nullptr && network_->type() == NT_SERIES);\n    ASSERT_HOST(id.length() > 1 && id[0] == ':');\n    auto *series = static_cast<Series *>(network_);\n    return series->GetLayer(&id[1]);\n  }\n  // Returns the learning rate of the layer from its id.\n  float GetLayerLearningRate(const std::string &id) const {\n    ASSERT_HOST(network_ != nullptr && network_->type() == NT_SERIES);\n    if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) {\n      ASSERT_HOST(id.length() > 1 && id[0] == ':');\n      auto *series = static_cast<Series *>(network_);\n      return 
series->LayerLearningRate(&id[1]);\n    } else {\n      return learning_rate_;\n    }\n  }\n\n  // Return the network string.\n  const char *GetNetwork() const {\n    return network_str_.c_str();\n  }\n\n  // Return the adam beta.\n  float GetAdamBeta() const {\n    return adam_beta_;\n  }\n\n  // Return the momentum.\n  float GetMomentum() const {\n    return momentum_;\n  }\n\n  // Multiplies the all the learning rate(s) by the given factor.\n  void ScaleLearningRate(double factor) {\n    ASSERT_HOST(network_ != nullptr && network_->type() == NT_SERIES);\n    learning_rate_ *= factor;\n    if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) {\n      std::vector<std::string> layers = EnumerateLayers();\n      for (auto &layer : layers) {\n        ScaleLayerLearningRate(layer, factor);\n      }\n    }\n  }\n  // Multiplies the learning rate of the layer with id, by the given factor.\n  void ScaleLayerLearningRate(const std::string &id, double factor) {\n    ASSERT_HOST(network_ != nullptr && network_->type() == NT_SERIES);\n    ASSERT_HOST(id.length() > 1 && id[0] == ':');\n    auto *series = static_cast<Series *>(network_);\n    series->ScaleLayerLearningRate(&id[1], factor);\n  }\n\n  // Set the all the learning rate(s) to the given value.\n  void SetLearningRate(float learning_rate)\n  {\n    ASSERT_HOST(network_ != nullptr && network_->type() == NT_SERIES);\n    learning_rate_ = learning_rate;\n    if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) {\n      for (auto &id : EnumerateLayers()) {\n        SetLayerLearningRate(id, learning_rate);\n      }\n    }\n  }\n  // Set the learning rate of the layer with id, by the given value.\n  void SetLayerLearningRate(const std::string &id, float learning_rate)\n  {\n    ASSERT_HOST(network_ != nullptr && network_->type() == NT_SERIES);\n    ASSERT_HOST(id.length() > 1 && id[0] == ':');\n    auto *series = static_cast<Series *>(network_);\n    series->SetLayerLearningRate(&id[1], learning_rate);\n  }\n\n  // Converts the network 
to int if not already.\n  void ConvertToInt() {\n    if ((training_flags_ & TF_INT_MODE) == 0) {\n      network_->ConvertToInt();\n      training_flags_ |= TF_INT_MODE;\n    }\n  }\n\n  // Provides access to the UNICHARSET that this classifier works with.\n  const UNICHARSET &GetUnicharset() const {\n    return ccutil_.unicharset;\n  }\n  UNICHARSET &GetUnicharset() {\n    return ccutil_.unicharset;\n  }\n  // Provides access to the UnicharCompress that this classifier works with.\n  const UnicharCompress &GetRecoder() const {\n    return recoder_;\n  }\n  // Provides access to the Dict that this classifier works with.\n  const Dict *GetDict() const {\n    return dict_;\n  }\n  Dict *GetDict() {\n    return dict_;\n  }\n  // Sets the sample iteration to the given value. The sample_iteration_\n  // determines the seed for the random number generator. The training\n  // iteration is incremented only by a successful training iteration.\n  void SetIteration(int iteration) {\n    sample_iteration_ = iteration;\n  }\n  // Accessors for textline image normalization.\n  int NumInputs() const {\n    return network_->NumInputs();\n  }\n\n  // Return the null char index.\n  int null_char() const {\n    return null_char_;\n  }\n\n  // Loads a model from mgr, including the dictionary only if lang is not null.\n  bool Load(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr);\n\n  // Writes to the given file. Returns false in case of error.\n  // If mgr contains a unicharset and recoder, then they are not encoded to fp.\n  bool Serialize(const TessdataManager *mgr, TFile *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  // If mgr contains a unicharset and recoder, then they are taken from there,\n  // otherwise, they are part of the serialization in fp.\n  bool DeSerialize(const TessdataManager *mgr, TFile *fp);\n  // Loads the charsets from mgr.\n  bool LoadCharsets(const TessdataManager *mgr);\n  // Loads the Recoder.\n  bool LoadRecoder(TFile *fp);\n  // Loads the dictionary if possible from the traineddata file.\n  // Prints a warning message, and returns false but otherwise fails silently\n  // and continues to work without it if loading fails.\n  // Note that dictionary load is independent from DeSerialize, but dependent\n  // on the unicharset matching. This enables training to deserialize a model\n  // from checkpoint or restore without having to go back and reload the\n  // dictionary.\n  bool LoadDictionary(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr);\n\n  // Recognizes the line image, contained within image_data, returning the\n  // recognized tesseract WERD_RES for the words.\n  // If invert_threshold > 0, tries inverted as well if the normal\n  // interpretation doesn't produce a result which at least reaches\n  // that threshold. The line_box is used for computing the\n  // box_word in the output words. 
worst_dict_cert is the worst certainty that\n  // will be used in a dictionary word.\n  void RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, double worst_dict_cert,\n                     const TBOX &line_box, PointerVector<WERD_RES> *words, int lstm_choice_mode = 0,\n                     int lstm_choice_amount = 5);\n\n  // Helper computes min and mean best results in the output.\n  void OutputStats(const NetworkIO &outputs, float *min_output, float *mean_output, float *sd);\n  // Recognizes the image_data, returning the labels,\n  // scores, and corresponding pairs of start, end x-coords in coords.\n  // Returned in scale_factor is the reduction factor\n  // between the image and the output coords, for computing bounding boxes.\n  // If re_invert is true, the input is inverted back to its original\n  // photometric interpretation if inversion is attempted but fails to\n  // improve the results. This ensures that outputs contains the correct\n  // forward outputs for the best photometric interpretation.\n  // inputs is filled with the used inputs to the network.\n  bool RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, bool re_invert,\n                     bool upside_down, float *scale_factor, NetworkIO *inputs, NetworkIO *outputs);\n\n  // Converts an array of labels to utf-8, whether or not the labels are\n  // augmented with character boundaries.\n  std::string DecodeLabels(const std::vector<int> &labels);\n\n  // Displays the forward results in a window with the characters and\n  // boundaries as determined by the labels and label_coords.\n  void DisplayForward(const NetworkIO &inputs, const std::vector<int> &labels,\n                      const std::vector<int> &label_coords, const char *window_name,\n                      ScrollView **window);\n  // Converts the network output to a sequence of labels. 
Outputs labels, scores\n  // and start xcoords of each char, and each null_char_, with an additional\n  // final xcoord for the end of the output.\n  // The conversion method is determined by internal state.\n  void LabelsFromOutputs(const NetworkIO &outputs, std::vector<int> *labels,\n                         std::vector<int> *xcoords);\n\nprotected:\n  // Sets the random seed from the sample_iteration_;\n  void SetRandomSeed() {\n    int64_t seed = sample_iteration_ * 0x10000001LL;\n    randomizer_.set_seed(seed);\n    randomizer_.IntRand();\n  }\n\n  // Displays the labels and cuts at the corresponding xcoords.\n  // Size of labels should match xcoords.\n  void DisplayLSTMOutput(const std::vector<int> &labels, const std::vector<int> &xcoords,\n                         int height, ScrollView *window);\n\n  // Prints debug output detailing the activation path that is implied by the\n  // xcoords.\n  void DebugActivationPath(const NetworkIO &outputs, const std::vector<int> &labels,\n                           const std::vector<int> &xcoords);\n\n  // Prints debug output detailing activations and 2nd choice over a range\n  // of positions.\n  void DebugActivationRange(const NetworkIO &outputs, const char *label, int best_choice,\n                            int x_start, int x_end);\n\n  // As LabelsViaCTC except that this function constructs the best path that\n  // contains only legal sequences of subcodes for recoder_.\n  void LabelsViaReEncode(const NetworkIO &output, std::vector<int> *labels,\n                         std::vector<int> *xcoords);\n  // Converts the network output to a sequence of labels, with scores, using\n  // the simple character model (each position is a char, and the null_char_ is\n  // mainly intended for tail padding.)\n  void LabelsViaSimpleText(const NetworkIO &output, std::vector<int> *labels,\n                           std::vector<int> *xcoords);\n\n  // Returns a string corresponding to the label starting at start. 
Sets *end\n  // to the next start and if non-null, *decoded to the unichar id.\n  const char *DecodeLabel(const std::vector<int> &labels, unsigned start, unsigned *end, int *decoded);\n\n  // Returns a string corresponding to a given single label id, falling back to\n  // a default of \"..\" for part of a multi-label unichar-id.\n  const char *DecodeSingleLabel(int label);\n\nprotected:\n  // The network hierarchy.\n  Network *network_;\n  // The unicharset. Only the unicharset element is serialized.\n  // Has to be a CCUtil, so Dict can point to it.\n  CCUtil ccutil_;\n  // For backward compatibility, recoder_ is serialized iff\n  // training_flags_ & TF_COMPRESS_UNICHARSET.\n  // Further encode/decode ccutil_.unicharset's ids to simplify the unicharset.\n  UnicharCompress recoder_;\n\n  // ==Training parameters that are serialized to provide a record of them.==\n  std::string network_str_;\n  // Flags used to determine the training method of the network.\n  // See enum TrainingFlags above.\n  int32_t training_flags_;\n  // Number of actual backward training steps used.\n  int32_t training_iteration_;\n  // Index into training sample set. sample_iteration >= training_iteration_.\n  int32_t sample_iteration_;\n  // Index in softmax of null character. May take the value UNICHAR_BROKEN or\n  // ccutil_.unicharset.size().\n  int32_t null_char_;\n  // Learning rate and momentum multipliers of deltas in backprop.\n  float learning_rate_;\n  float momentum_;\n  // Smoothing factor for 2nd moment of gradients.\n  float adam_beta_;\n\n  // === NOT SERIALIZED.\n  TRand randomizer_;\n  NetworkScratch scratch_space_;\n  // Language model (optional) to use with the beam search.\n  Dict *dict_;\n  // Beam search held between uses to optimize memory allocation/use.\n  RecodeBeamSearch *search_;\n\n  // == Debugging parameters.==\n  // Recognition debug display window.\n  ScrollView *debug_win_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_LSTMRECOGNIZER_H_\n"
  },
  {
    "path": "src/lstm/maxpool.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        maxpool.cpp\n// Description: Standard Max-Pooling layer.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"maxpool.h\"\n\nnamespace tesseract {\n\nMaxpool::Maxpool(const std::string &name, int ni, int x_scale, int y_scale)\n    : Reconfig(name, ni, x_scale, y_scale) {\n  type_ = NT_MAXPOOL;\n  no_ = ni;\n}\n\n// Reads from the given file. 
Returns false in case of error.\nbool Maxpool::DeSerialize(TFile *fp) {\n  bool result = Reconfig::DeSerialize(fp);\n  no_ = ni_;\n  return result;\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid Maxpool::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                      NetworkScratch *scratch, NetworkIO *output) {\n  output->ResizeScaled(input, x_scale_, y_scale_, no_);\n  maxes_.ResizeNoInit(output->Width(), ni_);\n  back_map_ = input.stride_map();\n\n  StrideMap::Index dest_index(output->stride_map());\n  do {\n    int out_t = dest_index.t();\n    StrideMap::Index src_index(input.stride_map(), dest_index.index(FD_BATCH),\n                               dest_index.index(FD_HEIGHT) * y_scale_,\n                               dest_index.index(FD_WIDTH) * x_scale_);\n    // Find the max input out of x_scale_ groups of y_scale_ inputs.\n    // Do it independently for each input dimension.\n    int *max_line = maxes_[out_t];\n    int in_t = src_index.t();\n    output->CopyTimeStepFrom(out_t, input, in_t);\n    for (int i = 0; i < ni_; ++i) {\n      max_line[i] = in_t;\n    }\n    for (int x = 0; x < x_scale_; ++x) {\n      for (int y = 0; y < y_scale_; ++y) {\n        StrideMap::Index src_xy(src_index);\n        if (src_xy.AddOffset(x, FD_WIDTH) && src_xy.AddOffset(y, FD_HEIGHT)) {\n          output->MaxpoolTimeStep(out_t, input, src_xy.t(), max_line);\n        }\n      }\n    }\n  } while (dest_index.Increment());\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool Maxpool::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                       NetworkIO *back_deltas) {\n  back_deltas->ResizeToMap(fwd_deltas.int_mode(), back_map_, ni_);\n  back_deltas->MaxpoolBackward(fwd_deltas, maxes_);\n  return true;\n}\n\n} // namespace 
tesseract.\n"
  },
  {
    "path": "src/lstm/maxpool.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        maxpool.h\n// Description: Standard Max-Pooling layer.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_MAXPOOL_H_\n#define TESSERACT_LSTM_MAXPOOL_H_\n\n#include \"reconfig.h\"\n\nnamespace tesseract {\n\n// Maxpooling reduction. Independently for each input, selects the location\n// in the rectangle that contains the max value.\n// Backprop propagates only to the position that was the max.\nclass Maxpool : public Reconfig {\npublic:\n  TESS_API\n  Maxpool(const std::string &name, int ni, int x_scale, int y_scale);\n  ~Maxpool() override = default;\n\n  // Accessors.\n  std::string spec() const override {\n    return \"Mp\" + std::to_string(y_scale_) + \",\" + std::to_string(x_scale_);\n  }\n\n  // Reads from the given file. 
Returns false in case of error.\n  bool DeSerialize(TFile *fp) override;\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n\nprivate:\n  // Memory of which input was the max.\n  GENERIC_2D_ARRAY<int> maxes_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_MAXPOOL_H_\n"
  },
  {
    "path": "src/lstm/network.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        network.cpp\n// Description: Base class for neural network implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"network.h\"\n\n#include <cstdlib>\n\n// This base class needs to know about all its sub-classes because of the\n// factory deserializing method: CreateFromFile.\n#include <allheaders.h>\n#include \"convolve.h\"\n#include \"fullyconnected.h\"\n#include \"input.h\"\n#include \"lstm.h\"\n#include \"maxpool.h\"\n#include \"parallel.h\"\n#include \"reconfig.h\"\n#include \"reversed.h\"\n#include \"scrollview.h\"\n#include \"series.h\"\n#include \"statistc.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n#ifndef GRAPHICS_DISABLED\n\n// Min and max window sizes.\nconst int kMinWinSize = 500;\nconst int kMaxWinSize = 2000;\n// Window frame sizes need adding on to make the content fit.\nconst int kXWinFrameSize = 30;\nconst int kYWinFrameSize = 80;\n\n#endif // !GRAPHICS_DISABLED\n\n// String names corresponding to the NetworkType enum.\n// Keep in sync with NetworkType.\n// Names used in Serialization to allow re-ordering/addition/deletion of\n// layer 
types in NetworkType without invalidating existing network files.\nstatic char const *const kTypeNames[NT_COUNT] = {\n    \"Invalid\",     \"Input\",\n    \"Convolve\",    \"Maxpool\",\n    \"Parallel\",    \"Replicated\",\n    \"ParBidiLSTM\", \"DepParUDLSTM\",\n    \"Par2dLSTM\",   \"Series\",\n    \"Reconfig\",    \"RTLReversed\",\n    \"TTBReversed\", \"XYTranspose\",\n    \"LSTM\",        \"SummLSTM\",\n    \"Logistic\",    \"LinLogistic\",\n    \"LinTanh\",     \"Tanh\",\n    \"Relu\",        \"Linear\",\n    \"Softmax\",     \"SoftmaxNoCTC\",\n    \"LSTMSoftmax\", \"LSTMBinarySoftmax\",\n    \"TensorFlow\",\n};\n\nNetwork::Network()\n    : type_(NT_NONE)\n    , training_(TS_ENABLED)\n    , needs_to_backprop_(true)\n    , network_flags_(0)\n    , ni_(0)\n    , no_(0)\n    , num_weights_(0)\n    , forward_win_(nullptr)\n    , backward_win_(nullptr)\n    , randomizer_(nullptr) {}\nNetwork::Network(NetworkType type, const std::string &name, int ni, int no)\n    : type_(type)\n    , training_(TS_ENABLED)\n    , needs_to_backprop_(true)\n    , network_flags_(0)\n    , ni_(ni)\n    , no_(no)\n    , num_weights_(0)\n    , name_(name)\n    , forward_win_(nullptr)\n    , backward_win_(nullptr)\n    , randomizer_(nullptr) {}\n\n// Suspends/Enables/Permanently disables training by setting the training_\n// flag. Serialize and DeSerialize only operate on the run-time data if state\n// is TS_DISABLED or TS_TEMP_DISABLE. Specifying TS_TEMP_DISABLE will\n// temporarily disable layers in state TS_ENABLED, allowing a trainer to\n// serialize as if it were a recognizer.\n// TS_RE_ENABLE will re-enable layers that were previously in any disabled\n// state. 
If in TS_TEMP_DISABLE then the flag is just changed, but if in\n// TS_DISABLED, the deltas in the weight matrices are reinitialized so that a\n// recognizer can be converted back to a trainer.\nvoid Network::SetEnableTraining(TrainingState state) {\n  if (state == TS_RE_ENABLE) {\n    // Enable only from temp disabled.\n    if (training_ == TS_TEMP_DISABLE) {\n      training_ = TS_ENABLED;\n    }\n  } else if (state == TS_TEMP_DISABLE) {\n    // Temp disable only from enabled.\n    if (training_ == TS_ENABLED) {\n      training_ = state;\n    }\n  } else {\n    training_ = state;\n  }\n}\n\n// Sets flags that control the action of the network. See NetworkFlags enum\n// for bit values.\nvoid Network::SetNetworkFlags(uint32_t flags) {\n  network_flags_ = flags;\n}\n\n// Sets up the network for training. Initializes weights using weights of\n// scale `range` picked according to the random number generator `randomizer`.\nint Network::InitWeights([[maybe_unused]] float range, TRand *randomizer) {\n  randomizer_ = randomizer;\n  return 0;\n}\n\n// Provides a pointer to a TRand for any networks that care to use it.\n// Note that randomizer is a borrowed pointer that should outlive the network\n// and should not be deleted by any of the networks.\nvoid Network::SetRandomizer(TRand *randomizer) {\n  randomizer_ = randomizer;\n}\n\n// Sets needs_to_backprop_ to needs_backprop and returns true if\n// needs_backprop || any weights in this network so the next layer forward\n// can be told to produce backprop for this layer if needed.\nbool Network::SetupNeedsBackprop(bool needs_backprop) {\n  needs_to_backprop_ = needs_backprop;\n  return needs_backprop || num_weights_ > 0;\n}\n\n// Writes to the given file. 
Returns false in case of error.\nbool Network::Serialize(TFile *fp) const {\n  int8_t data = NT_NONE;\n  if (!fp->Serialize(&data)) {\n    return false;\n  }\n  std::string type_name = kTypeNames[type_];\n  if (!fp->Serialize(type_name)) {\n    return false;\n  }\n  data = training_;\n  if (!fp->Serialize(&data)) {\n    return false;\n  }\n  data = needs_to_backprop_;\n  if (!fp->Serialize(&data)) {\n    return false;\n  }\n  if (!fp->Serialize(&network_flags_)) {\n    return false;\n  }\n  if (!fp->Serialize(&ni_)) {\n    return false;\n  }\n  if (!fp->Serialize(&no_)) {\n    return false;\n  }\n  if (!fp->Serialize(&num_weights_)) {\n    return false;\n  }\n  uint32_t length = name_.length();\n  if (!fp->Serialize(&length)) {\n    return false;\n  }\n  return fp->Serialize(name_.c_str(), length);\n}\n\nstatic NetworkType getNetworkType(TFile *fp) {\n  int8_t data;\n  if (!fp->DeSerialize(&data)) {\n    return NT_NONE;\n  }\n  if (data == NT_NONE) {\n    std::string type_name;\n    if (!fp->DeSerialize(type_name)) {\n      return NT_NONE;\n    }\n    for (data = 0; data < NT_COUNT && type_name != kTypeNames[data]; ++data) {\n    }\n    if (data == NT_COUNT) {\n      tprintf(\"Invalid network layer type:%s\\n\", type_name.c_str());\n      return NT_NONE;\n    }\n  }\n  return static_cast<NetworkType>(data);\n}\n\n// Reads from the given file. 
Returns nullptr in case of error.\n// Determines the type of the serialized class and calls its DeSerialize\n// on a new object of the appropriate type, which is returned.\nNetwork *Network::CreateFromFile(TFile *fp) {\n  NetworkType type;       // Type of the derived network class.\n  TrainingState training; // Are we currently training?\n  bool needs_to_backprop; // This network needs to output back_deltas.\n  int32_t network_flags;  // Behavior control flags in NetworkFlags.\n  int32_t ni;             // Number of input values.\n  int32_t no;             // Number of output values.\n  int32_t num_weights;    // Number of weights in this and sub-network.\n  std::string name;       // A unique name for this layer.\n  int8_t data;\n  Network *network = nullptr;\n  type = getNetworkType(fp);\n  if (!fp->DeSerialize(&data)) {\n    return nullptr;\n  }\n  training = data == TS_ENABLED ? TS_ENABLED : TS_DISABLED;\n  if (!fp->DeSerialize(&data)) {\n    return nullptr;\n  }\n  needs_to_backprop = data != 0;\n  if (!fp->DeSerialize(&network_flags)) {\n    return nullptr;\n  }\n  if (!fp->DeSerialize(&ni)) {\n    return nullptr;\n  }\n  if (!fp->DeSerialize(&no)) {\n    return nullptr;\n  }\n  if (!fp->DeSerialize(&num_weights)) {\n    return nullptr;\n  }\n  if (!fp->DeSerialize(name)) {\n    return nullptr;\n  }\n\n  switch (type) {\n    case NT_CONVOLVE:\n      network = new Convolve(name, ni, 0, 0);\n      break;\n    case NT_INPUT:\n      network = new Input(name, ni, no);\n      break;\n    case NT_LSTM:\n    case NT_LSTM_SOFTMAX:\n    case NT_LSTM_SOFTMAX_ENCODED:\n    case NT_LSTM_SUMMARY:\n      network = new LSTM(name, ni, no, no, false, type);\n      break;\n    case NT_MAXPOOL:\n      network = new Maxpool(name, ni, 0, 0);\n      break;\n    // All variants of Parallel.\n    case NT_PARALLEL:\n    case NT_REPLICATED:\n    case NT_PAR_RL_LSTM:\n    case NT_PAR_UD_LSTM:\n    case NT_PAR_2D_LSTM:\n      network = new Parallel(name, type);\n      break;\n    case 
NT_RECONFIG:\n      network = new Reconfig(name, ni, 0, 0);\n      break;\n    // All variants of reversed.\n    case NT_XREVERSED:\n    case NT_YREVERSED:\n    case NT_XYTRANSPOSE:\n      network = new Reversed(name, type);\n      break;\n    case NT_SERIES:\n      network = new Series(name);\n      break;\n    case NT_TENSORFLOW:\n      tprintf(\"Unsupported TensorFlow model\\n\");\n      break;\n    // All variants of FullyConnected.\n    case NT_SOFTMAX:\n    case NT_SOFTMAX_NO_CTC:\n    case NT_RELU:\n    case NT_TANH:\n    case NT_LINEAR:\n    case NT_LOGISTIC:\n    case NT_POSCLIP:\n    case NT_SYMCLIP:\n      network = new FullyConnected(name, ni, no, type);\n      break;\n    default:\n      break;\n  }\n  if (network) {\n    network->training_ = training;\n    network->needs_to_backprop_ = needs_to_backprop;\n    network->network_flags_ = network_flags;\n    network->num_weights_ = num_weights;\n    if (!network->DeSerialize(fp)) {\n      delete network;\n      network = nullptr;\n    }\n  }\n  return network;\n}\n\n// Returns a random number in [-range, range].\nTFloat Network::Random(TFloat range) {\n  ASSERT_HOST(randomizer_ != nullptr);\n  return randomizer_->SignedRand(range);\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// === Debug image display methods. 
===\n// Displays the image of the matrix to the forward window.\nvoid Network::DisplayForward(const NetworkIO &matrix) {\n  Image image = matrix.ToPix();\n  ClearWindow(false, name_.c_str(), pixGetWidth(image), pixGetHeight(image), &forward_win_);\n  DisplayImage(image, forward_win_);\n  forward_win_->Update();\n}\n\n// Displays the image of the matrix to the backward window.\nvoid Network::DisplayBackward(const NetworkIO &matrix) {\n  Image image = matrix.ToPix();\n  std::string window_name = name_ + \"-back\";\n  ClearWindow(false, window_name.c_str(), pixGetWidth(image), pixGetHeight(image), &backward_win_);\n  DisplayImage(image, backward_win_);\n  backward_win_->Update();\n}\n\n// Creates the window if needed, otherwise clears it.\nvoid Network::ClearWindow(bool tess_coords, const char *window_name, int width, int height,\n                          ScrollView **window) {\n  if (*window == nullptr) {\n    int min_size = std::min(width, height);\n    if (min_size < kMinWinSize) {\n      if (min_size < 1) {\n        min_size = 1;\n      }\n      width = width * kMinWinSize / min_size;\n      height = height * kMinWinSize / min_size;\n    }\n    width += kXWinFrameSize;\n    height += kYWinFrameSize;\n    if (width > kMaxWinSize) {\n      width = kMaxWinSize;\n    }\n    if (height > kMaxWinSize) {\n      height = kMaxWinSize;\n    }\n    *window = new ScrollView(window_name, 80, 100, width, height, width, height, tess_coords);\n    tprintf(\"Created window %s of size %d, %d\\n\", window_name, width, height);\n  } else {\n    (*window)->Clear();\n  }\n}\n\n// Displays the pix in the given window. and returns the height of the pix.\n// The pix is pixDestroyed.\nint Network::DisplayImage(Image pix, ScrollView *window) {\n  int height = pixGetHeight(pix);\n  window->Draw(pix, 0, 0);\n  pix.destroy();\n  return height;\n}\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/network.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        network.h\n// Description: Base class for neural network implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_NETWORK_H_\n#define TESSERACT_LSTM_NETWORK_H_\n\n#include \"helpers.h\"\n#include \"matrix.h\"\n#include \"networkio.h\"\n#include \"serialis.h\"\n#include \"static_shape.h\"\n\n#include <cmath>\n#include <cstdio>\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass ScrollView;\nclass TBOX;\nclass ImageData;\nclass NetworkScratch;\n\n// Enum to store the run-time type of a Network. 
Keep in sync with kTypeNames.\nenum NetworkType {\n  NT_NONE,  // The naked base class.\n  NT_INPUT, // Inputs from an image.\n  // Plumbing networks combine other networks or rearrange the inputs.\n  NT_CONVOLVE,    // Duplicates inputs in a sliding window neighborhood.\n  NT_MAXPOOL,     // Chooses the max result from a rectangle.\n  NT_PARALLEL,    // Runs networks in parallel.\n  NT_REPLICATED,  // Runs identical networks in parallel.\n  NT_PAR_RL_LSTM, // Runs LTR and RTL LSTMs in parallel.\n  NT_PAR_UD_LSTM, // Runs Up and Down LSTMs in parallel.\n  NT_PAR_2D_LSTM, // Runs 4 LSTMs in parallel.\n  NT_SERIES,      // Executes a sequence of layers.\n  NT_RECONFIG,    // Scales the time/y size but makes the output deeper.\n  NT_XREVERSED,   // Reverses the x direction of the inputs/outputs.\n  NT_YREVERSED,   // Reverses the y-direction of the inputs/outputs.\n  NT_XYTRANSPOSE, // Transposes x and y (for just a single op).\n  // Functional networks actually calculate stuff.\n  NT_LSTM,           // Long-Short-Term-Memory block.\n  NT_LSTM_SUMMARY,   // LSTM that only keeps its last output.\n  NT_LOGISTIC,       // Fully connected logistic nonlinearity.\n  NT_POSCLIP,        // Fully connected rect lin version of logistic.\n  NT_SYMCLIP,        // Fully connected rect lin version of tanh.\n  NT_TANH,           // Fully connected with tanh nonlinearity.\n  NT_RELU,           // Fully connected with rectifier nonlinearity.\n  NT_LINEAR,         // Fully connected with no nonlinearity.\n  NT_SOFTMAX,        // Softmax uses exponential normalization, with CTC.\n  NT_SOFTMAX_NO_CTC, // Softmax uses exponential normalization, no CTC.\n  // The SOFTMAX LSTMs both have an extra softmax layer on top, but inside, with\n  // the outputs fed back to the input of the LSTM at the next timestep.\n  // The ENCODED version binary encodes the softmax outputs, providing log2 of\n  // the number of outputs as additional inputs, and the other version just\n  // provides all the 
softmax outputs as additional inputs.\n  NT_LSTM_SOFTMAX,         // 1-d LSTM with built-in fully connected softmax.\n  NT_LSTM_SOFTMAX_ENCODED, // 1-d LSTM with built-in binary encoded softmax.\n  // A TensorFlow graph encapsulated as a Tesseract network.\n  NT_TENSORFLOW,\n\n  NT_COUNT // Array size.\n};\n\n// Enum of Network behavior flags. Can in theory be set for each individual\n// network element.\nenum NetworkFlags {\n  // Network forward/backprop behavior.\n  NF_LAYER_SPECIFIC_LR = 64, // Separate learning rate for each layer.\n  NF_ADAM = 128,             // Weight-specific learning rate.\n};\n\n// State of training and desired state used in SetEnableTraining.\nenum TrainingState {\n  // Valid states of training_.\n  TS_DISABLED,     // Disabled permanently.\n  TS_ENABLED,      // Enabled for backprop and to write a training dump.\n                   // Re-enable from ANY disabled state.\n  TS_TEMP_DISABLE, // Temporarily disabled to write a recognition dump.\n  // Valid only for SetEnableTraining.\n  TS_RE_ENABLE, // Re-Enable from TS_TEMP_DISABLE, but not TS_DISABLED.\n};\n\n// Base class for network types. 
Not quite an abstract base class, but almost.\n// Most of the time no isolated Network exists, except prior to\n// deserialization.\nclass TESS_API Network {\npublic:\n  Network();\n  Network(NetworkType type, const std::string &name, int ni, int no);\n  virtual ~Network() = default;\n\n  // Accessors.\n  NetworkType type() const {\n    return type_;\n  }\n  bool IsTraining() const {\n    return training_ == TS_ENABLED;\n  }\n  bool needs_to_backprop() const {\n    return needs_to_backprop_;\n  }\n  int num_weights() const {\n    return num_weights_;\n  }\n  int NumInputs() const {\n    return ni_;\n  }\n  int NumOutputs() const {\n    return no_;\n  }\n  // Returns the required shape input to the network.\n  virtual StaticShape InputShape() const {\n    StaticShape result;\n    return result;\n  }\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  virtual StaticShape OutputShape(const StaticShape &input_shape) const {\n    StaticShape result(input_shape);\n    result.set_depth(no_);\n    return result;\n  }\n  const std::string &name() const {\n    return name_;\n  }\n  virtual std::string spec() const = 0;\n  bool TestFlag(NetworkFlags flag) const {\n    return (network_flags_ & flag) != 0;\n  }\n\n  // Initialization and administrative functions that are mostly provided\n  // by Plumbing.\n  // Returns true if the given type is derived from Plumbing, and thus contains\n  // multiple sub-networks that can have their own learning rate.\n  virtual bool IsPlumbingType() const {\n    return false;\n  }\n\n  // Suspends/Enables/Permanently disables training by setting the training_\n  // flag. Serialize and DeSerialize only operate on the run-time data if state\n  // is TS_DISABLED or TS_TEMP_DISABLE. 
Specifying TS_TEMP_DISABLE will\n  // temporarily disable layers in state TS_ENABLED, allowing a trainer to\n  // serialize as if it were a recognizer.\n  // TS_RE_ENABLE will re-enable layers that were previously in any disabled\n  // state. If in TS_TEMP_DISABLE then the flag is just changed, but if in\n  // TS_DISABLED, the deltas in the weight matrices are reinitialized so that a\n  // recognizer can be converted back to a trainer.\n  virtual void SetEnableTraining(TrainingState state);\n\n  // Sets flags that control the action of the network. See NetworkFlags enum\n  // for bit values.\n  virtual void SetNetworkFlags(uint32_t flags);\n\n  // Sets up the network for training. Initializes weights using weights of\n  // scale `range` picked according to the random number generator `randomizer`.\n  // Note that randomizer is a borrowed pointer that should outlive the network\n  // and should not be deleted by any of the networks.\n  // Returns the number of weights initialized.\n  virtual int InitWeights(float range, TRand *randomizer);\n  // Changes the number of outputs to the outside world to the size of the given\n  // code_map. Recursively searches the entire network for Softmax layers that\n  // have exactly old_no outputs, and operates only on those, leaving all others\n  // unchanged. This enables networks with multiple output layers to get all\n  // their softmaxes updated, but if an internal layer, uses one of those\n  // softmaxes for input, then the inputs will effectively be scrambled.\n  // TODO(rays) Fix this before any such network is implemented.\n  // The softmaxes are resized by copying the old weight matrix entries for each\n  // output from code_map[output] where non-negative, and uses the mean (over\n  // all outputs) of the existing weights for all outputs with negative code_map\n  // entries. 
Returns the new number of weights.\n  virtual int RemapOutputs([[maybe_unused]] int old_no,\n                           [[maybe_unused]] const std::vector<int> &code_map) {\n    return 0;\n  }\n\n  // Converts a float network to an int network.\n  virtual void ConvertToInt() {}\n\n  // Provides a pointer to a TRand for any networks that care to use it.\n  // Note that randomizer is a borrowed pointer that should outlive the network\n  // and should not be deleted by any of the networks.\n  virtual void SetRandomizer(TRand *randomizer);\n\n  // Sets needs_to_backprop_ to needs_backprop and returns true if\n  // needs_backprop || any weights in this network so the next layer forward\n  // can be told to produce backprop for this layer if needed.\n  virtual bool SetupNeedsBackprop(bool needs_backprop);\n\n  // Returns the most recent reduction factor that the network applied to the\n  // time sequence. Assumes that any 2-d is already eliminated. Used for\n  // scaling bounding boxes of truth data and calculating result bounding boxes.\n  // WARNING: if GlobalMinimax is used to vary the scale, this will return\n  // the last used scale factor. Call it before any forward, and it will return\n  // the minimum scale factor of the paths through the GlobalMinimax.\n  virtual int XScaleFactor() const {\n    return 1;\n  }\n\n  // Provides the (minimum) x scale factor to the network (of interest only to\n  // input units) so they can determine how to scale bounding boxes.\n  virtual void CacheXScaleFactor([[maybe_unused]] int factor) {}\n\n  // Provides debug output on the weights.\n  virtual void DebugWeights() = 0;\n\n  // Writes to the given file. Returns false in case of error.\n  // Should be overridden by subclasses, but called by their Serialize.\n  virtual bool Serialize(TFile *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  // Should be overridden by subclasses, but NOT called by their DeSerialize.\n  virtual bool DeSerialize(TFile *fp) = 0;\n\npublic:\n  // Updates the weights using the given learning rate, momentum and adam_beta.\n  // num_samples is used in the adam computation iff use_adam_ is true.\n  virtual void Update([[maybe_unused]] float learning_rate,\n                      [[maybe_unused]] float momentum,\n                      [[maybe_unused]] float adam_beta,\n                      [[maybe_unused]] int num_samples) {}\n  // Sums the products of weight updates in *this and other, splitting into\n  // positive (same direction) in *same and negative (different direction) in\n  // *changed.\n  virtual void CountAlternators([[maybe_unused]] const Network &other,\n                                [[maybe_unused]] TFloat *same,\n                                [[maybe_unused]] TFloat *changed) const {}\n\n  // Reads from the given file. Returns nullptr in case of error.\n  // Determines the type of the serialized class and calls its DeSerialize\n  // on a new object of the appropriate type, which is returned.\n  static Network *CreateFromFile(TFile *fp);\n\n  // Runs forward propagation of activations on the input line.\n  // Note that input and output are both 2-d arrays.\n  // The 1st index is the time element. In a 1-d network, it might be the pixel\n  // position on the textline. In a 2-d network, the linearization is defined\n  // by the stride_map. (See networkio.h).\n  // The 2nd index of input is the network inputs/outputs, and the dimension\n  // of the input must match NumInputs() of this network.\n  // The output array will be resized as needed so that its 1st dimension is\n  // always equal to the number of output values, and its second dimension is\n  // always NumOutputs(). Note that all this detail is encapsulated away inside\n  // NetworkIO, as are the internals of the scratch memory space used by the\n  // network. 
See networkscratch.h for that.\n  // If input_transpose is not nullptr, then it contains the transpose of input,\n  // and the caller guarantees that it will still be valid on the next call to\n  // backward. The callee is therefore at liberty to save the pointer and\n  // reference it on a call to backward. This is a bit ugly, but it makes it\n  // possible for a replicating parallel to calculate the input transpose once\n  // instead of all the replicated networks having to do it.\n  virtual void Forward(bool debug, const NetworkIO &input,\n                       const TransposedArray *input_transpose,\n                       NetworkScratch *scratch, NetworkIO *output) = 0;\n\n  // Runs backward propagation of errors on fwdX_deltas.\n  // Note that fwd_deltas and back_deltas are both 2-d arrays as with Forward.\n  // Returns false if back_deltas was not set, due to there being no point in\n  // propagating further backwards. Thus most complete networks will always\n  // return false from Backward!\n  virtual bool Backward(bool debug, const NetworkIO &fwd_deltas,\n                        NetworkScratch *scratch, NetworkIO *back_deltas) = 0;\n\n  // === Debug image display methods. ===\n  // Displays the image of the matrix to the forward window.\n  void DisplayForward(const NetworkIO &matrix);\n  // Displays the image of the matrix to the backward window.\n  void DisplayBackward(const NetworkIO &matrix);\n\n  // Creates the window if needed, otherwise clears it.\n  static void ClearWindow(bool tess_coords, const char *window_name, int width,\n                          int height, ScrollView **window);\n\n  // Displays the pix in the given window. 
and returns the height of the pix.\n  // The pix is pixDestroyed.\n  static int DisplayImage(Image pix, ScrollView *window);\n\nprotected:\n  // Returns a random number in [-range, range].\n  TFloat Random(TFloat range);\n\nprotected:\n  NetworkType type_;       // Type of the derived network class.\n  TrainingState training_; // Are we currently training?\n  bool needs_to_backprop_; // This network needs to output back_deltas.\n  int32_t network_flags_;  // Behavior control flags in NetworkFlags.\n  int32_t ni_;             // Number of input values.\n  int32_t no_;             // Number of output values.\n  int32_t num_weights_;    // Number of weights in this and sub-network.\n  std::string name_;       // A unique name for this layer.\n\n  // NOT-serialized debug data.\n  ScrollView *forward_win_;  // Recognition debug display window.\n  ScrollView *backward_win_; // Training debug display window.\n  TRand *randomizer_;        // Random number generator.\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_NETWORK_H_\n"
  },
  {
    "path": "src/lstm/networkio.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        networkio.cpp\n// Description: Network input/output data, allowing float/int implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"networkio.h\"\n#include <cfloat> // for FLT_MAX\n#include <cmath>\n\n#include <allheaders.h>\n#include \"functions.h\"\n#include \"statistc.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Minimum value to output for certainty.\nconst float kMinCertainty = -20.0f;\n// Probability corresponding to kMinCertainty.\nconst float kMinProb = std::exp(kMinCertainty);\n\n// Resizes to a specific size as a 2-d temp buffer. 
No batches, no y-dim.\nvoid NetworkIO::Resize2d(bool int_mode, int width, int num_features) {\n  stride_map_ = StrideMap();\n  int_mode_ = int_mode;\n  if (int_mode_) {\n    i_.ResizeNoInit(width, num_features, GetPadding(num_features));\n  } else {\n    f_.ResizeNoInit(width, num_features);\n  }\n}\n\n// Resizes to a specific stride_map.\nvoid NetworkIO::ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features) {\n  // If this method crashes with this == nullptr,\n  // it most likely got here through an uninitialized scratch element,\n  // ie call NetworkScratch::IO::Resizexxx() not NetworkIO::Resizexxx()!!\n  stride_map_ = stride_map;\n  int_mode_ = int_mode;\n  if (int_mode_) {\n    i_.ResizeNoInit(stride_map.Width(), num_features, GetPadding(num_features));\n  } else {\n    f_.ResizeNoInit(stride_map.Width(), num_features);\n  }\n  ZeroInvalidElements();\n}\n\n// Shrinks image size by x_scale,y_scale, and use given number of features.\nvoid NetworkIO::ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features) {\n  StrideMap stride_map = src.stride_map_;\n  stride_map.ScaleXY(x_scale, y_scale);\n  ResizeToMap(src.int_mode_, stride_map, num_features);\n}\n\n// Resizes to just 1 x-coord, whatever the input.\nvoid NetworkIO::ResizeXTo1(const NetworkIO &src, int num_features) {\n  StrideMap stride_map = src.stride_map_;\n  stride_map.ReduceWidthTo1();\n  ResizeToMap(src.int_mode_, stride_map, num_features);\n}\n\n// Initialize all the array to zero.\nvoid NetworkIO::Zero() {\n  int width = Width();\n  // Zero out the everything. Column-by-column in case it is aligned.\n  for (int t = 0; t < width; ++t) {\n    ZeroTimeStep(t);\n  }\n}\n\n// Initializes to zero all elements of the array that do not correspond to\n// valid image positions. 
(If a batch of different-sized images are packed\n// together, then there will be padding pixels.)\nvoid NetworkIO::ZeroInvalidElements() {\n  int num_features = NumFeatures();\n  int full_width = stride_map_.Size(FD_WIDTH);\n  int full_height = stride_map_.Size(FD_HEIGHT);\n  StrideMap::Index b_index(stride_map_);\n  do {\n    int end_x = b_index.MaxIndexOfDim(FD_WIDTH) + 1;\n    if (end_x < full_width) {\n      // The width is small, so fill for every valid y.\n      StrideMap::Index y_index(b_index);\n      int fill_size = num_features * (full_width - end_x);\n      do {\n        StrideMap::Index z_index(y_index);\n        z_index.AddOffset(end_x, FD_WIDTH);\n        if (int_mode_) {\n          ZeroVector(fill_size, i_[z_index.t()]);\n        } else {\n          ZeroVector(fill_size, f_[z_index.t()]);\n        }\n      } while (y_index.AddOffset(1, FD_HEIGHT));\n    }\n    int end_y = b_index.MaxIndexOfDim(FD_HEIGHT) + 1;\n    if (end_y < full_height) {\n      // The height is small, so fill in the space in one go.\n      StrideMap::Index y_index(b_index);\n      y_index.AddOffset(end_y, FD_HEIGHT);\n      int fill_size = num_features * full_width * (full_height - end_y);\n      if (int_mode_) {\n        ZeroVector(fill_size, i_[y_index.t()]);\n      } else {\n        ZeroVector(fill_size, f_[y_index.t()]);\n      }\n    }\n  } while (b_index.AddOffset(1, FD_BATCH));\n}\n\n// Helper computes a black point and white point to contrast-enhance an image.\n// The computation is based on the assumption that the image is of a single line\n// of text, so a horizontal line through the middle of the image passes through\n// at least some of it, so local minima and maxima are a good proxy for black\n// and white pixel samples.\nstatic void ComputeBlackWhite(Image pix, float *black, float *white) {\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  STATS mins(0, 255), maxes(0, 255);\n  if (width >= 3) {\n    int y = height / 2;\n    l_uint32 *line = 
pixGetData(pix) + pixGetWpl(pix) * y;\n    int prev = GET_DATA_BYTE(line, 0);\n    int curr = GET_DATA_BYTE(line, 1);\n    for (int x = 1; x + 1 < width; ++x) {\n      int next = GET_DATA_BYTE(line, x + 1);\n      if ((curr < prev && curr <= next) || (curr <= prev && curr < next)) {\n        // Local minimum.\n        mins.add(curr, 1);\n      }\n      if ((curr > prev && curr >= next) || (curr >= prev && curr > next)) {\n        // Local maximum.\n        maxes.add(curr, 1);\n      }\n      prev = curr;\n      curr = next;\n    }\n  }\n  if (mins.get_total() == 0) {\n    mins.add(0, 1);\n  }\n  if (maxes.get_total() == 0) {\n    maxes.add(255, 1);\n  }\n  *black = mins.ile(0.25);\n  *white = maxes.ile(0.75);\n}\n\n// Sets up the array from the given image, using the currently set int_mode_.\n// If the image width doesn't match the shape, the image is truncated or padded\n// with noise to match.\nvoid NetworkIO::FromPix(const StaticShape &shape, const Image pix, TRand *randomizer) {\n  std::vector<Image> pixes(1, pix);\n  FromPixes(shape, pixes, randomizer);\n}\n\n// Sets up the array from the given set of images, using the currently set\n// int_mode_. 
If the image width doesn't match the shape, the images are\n// truncated or padded with noise to match.\nvoid NetworkIO::FromPixes(const StaticShape &shape, const std::vector<Image> &pixes,\n                          TRand *randomizer) {\n  int target_height = shape.height();\n  int target_width = shape.width();\n  std::vector<std::pair<int, int>> h_w_pairs;\n  for (auto &&pix : pixes) {\n    Image var_pix = pix;\n    int width = pixGetWidth(var_pix);\n    if (target_width != 0) {\n      width = target_width;\n    }\n    int height = pixGetHeight(var_pix);\n    if (target_height != 0) {\n      height = target_height;\n    }\n    h_w_pairs.emplace_back(height, width);\n  }\n  stride_map_.SetStride(h_w_pairs);\n  ResizeToMap(int_mode(), stride_map_, shape.depth());\n  // Iterate over the images again to copy the data.\n  for (size_t b = 0; b < pixes.size(); ++b) {\n    Image pix = pixes[b];\n    float black = 0.0f, white = 255.0f;\n    if (shape.depth() != 3) {\n      ComputeBlackWhite(pix, &black, &white);\n    }\n    float contrast = (white - black) / 2.0f;\n    if (contrast <= 0.0f) {\n      contrast = 1.0f;\n    }\n    if (shape.height() == 1) {\n      Copy1DGreyImage(b, pix, black, contrast, randomizer);\n    } else {\n      Copy2DImage(b, pix, black, contrast, randomizer);\n    }\n  }\n}\n\n// Copies the given pix to *this at the given batch index, stretching and\n// clipping the pixel values so that [black, black + 2*contrast] maps to the\n// dynamic range of *this, ie [-1,1] for a float and (-127,127) for int.\n// This is a 2-d operation in the sense that the output depth is the number\n// of input channels, the height is the height of the image, and the width\n// is the width of the image, or truncated/padded with noise if the width\n// is a fixed size.\nvoid NetworkIO::Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer) {\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  int wpl = pixGetWpl(pix);\n  
StrideMap::Index index(stride_map_);\n  index.AddOffset(batch, FD_BATCH);\n  int t = index.t();\n  int target_height = stride_map_.Size(FD_HEIGHT);\n  int target_width = stride_map_.Size(FD_WIDTH);\n  int num_features = NumFeatures();\n  bool color = num_features == 3;\n  if (width > target_width) {\n    width = target_width;\n  }\n  uint32_t *line = pixGetData(pix);\n  for (int y = 0; y < target_height; ++y, line += wpl) {\n    int x = 0;\n    if (y < height) {\n      for (x = 0; x < width; ++x, ++t) {\n        if (color) {\n          int f = 0;\n          for (int c = COLOR_RED; c <= COLOR_BLUE; ++c) {\n            int pixel = GET_DATA_BYTE(line + x, c);\n            SetPixel(t, f++, pixel, black, contrast);\n          }\n        } else {\n          int pixel = GET_DATA_BYTE(line, x);\n          SetPixel(t, 0, pixel, black, contrast);\n        }\n      }\n    }\n    for (; x < target_width; ++x) {\n      Randomize(t++, 0, num_features, randomizer);\n    }\n  }\n}\n\n// Copies the given pix to *this at the given batch index, as Copy2DImage\n// above, except that the output depth is the height of the input image, the\n// output height is 1, and the output width as for Copy2DImage.\n// The image is thus treated as a 1-d set of vertical pixel strips.\nvoid NetworkIO::Copy1DGreyImage(int batch, Image pix, float black, float contrast,\n                                TRand *randomizer) {\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  ASSERT_HOST(height == NumFeatures());\n  int wpl = pixGetWpl(pix);\n  StrideMap::Index index(stride_map_);\n  index.AddOffset(batch, FD_BATCH);\n  int t = index.t();\n  int target_width = stride_map_.Size(FD_WIDTH);\n  if (width > target_width) {\n    width = target_width;\n  }\n  int x;\n  for (x = 0; x < width; ++x, ++t) {\n    for (int y = 0; y < height; ++y) {\n      uint32_t *line = pixGetData(pix) + wpl * y;\n      int pixel = GET_DATA_BYTE(line, x);\n      SetPixel(t, y, pixel, black, contrast);\n    }\n  }\n 
 for (; x < target_width; ++x) {\n    Randomize(t++, 0, height, randomizer);\n  }\n}\n\n// Helper stores the pixel value in i_ or f_ according to int_mode_.\n// t: is the index from the StrideMap corresponding to the current\n//   [batch,y,x] position\n// f: is the index into the depth/channel\n// pixel: the value of the pixel from the image (in one channel)\n// black: the pixel value to map to the lowest of the range of *this\n// contrast: the range of pixel values to stretch to half the range of *this.\nvoid NetworkIO::SetPixel(int t, int f, int pixel, float black, float contrast) {\n  float float_pixel = (pixel - black) / contrast - 1.0f;\n  if (int_mode_) {\n    i_[t][f] = ClipToRange<int>(IntCastRounded((INT8_MAX + 1) * float_pixel), -INT8_MAX, INT8_MAX);\n  } else {\n    f_[t][f] = float_pixel;\n  }\n}\n\n// Converts the array to a Pix. Must be pixDestroyed after use.\nImage NetworkIO::ToPix() const {\n  // Count the width of the image, and find the max multiplication factor.\n  int im_width = stride_map_.Size(FD_WIDTH);\n  int im_height = stride_map_.Size(FD_HEIGHT);\n  int num_features = NumFeatures();\n  int feature_factor = 1;\n  if (num_features == 3) {\n    // Special hack for color.\n    num_features = 1;\n    feature_factor = 3;\n  }\n  Image pix = pixCreate(im_width, im_height * num_features, 32);\n  StrideMap::Index index(stride_map_);\n  do {\n    int im_x = index.index(FD_WIDTH);\n    int top_im_y = index.index(FD_HEIGHT);\n    int im_y = top_im_y;\n    int t = index.t();\n    if (int_mode_) {\n      const int8_t *features = i_[t];\n      for (int y = 0; y < num_features; ++y, im_y += im_height) {\n        int pixel = features[y * feature_factor];\n        // 1 or 2 features use greyscale.\n        int red = ClipToRange<int>(pixel + 128, 0, 255);\n        int green = red, blue = red;\n        if (feature_factor == 3) {\n          // With 3 features assume RGB color.\n          green = ClipToRange<int>(features[y * feature_factor + 1] + 128, 0, 
255);\n          blue = ClipToRange<int>(features[y * feature_factor + 2] + 128, 0, 255);\n        } else if (num_features > 3) {\n          // More than 3 features use false yellow/blue color, assuming a signed\n          // input in the range [-1,1].\n          red = abs(pixel) * 2;\n          if (pixel >= 0) {\n            green = red;\n            blue = 0;\n          } else {\n            blue = red;\n            green = red = 0;\n          }\n        }\n        pixSetPixel(pix, im_x, im_y,\n                    (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT));\n      }\n    } else {\n      const float *features = f_[t];\n      for (int y = 0; y < num_features; ++y, im_y += im_height) {\n        float pixel = features[y * feature_factor];\n        // 1 or 2 features use greyscale.\n        int red = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);\n        int green = red, blue = red;\n        if (feature_factor == 3) {\n          // With 3 features assume RGB color.\n          pixel = features[y * feature_factor + 1];\n          green = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);\n          pixel = features[y * feature_factor + 2];\n          blue = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);\n        } else if (num_features > 3) {\n          // More than 3 features use false yellow/blue color, assuming a signed\n          // input in the range [-1,1].\n          red = ClipToRange<int>(IntCastRounded(std::fabs(pixel) * 255), 0, 255);\n          if (pixel >= 0) {\n            green = red;\n            blue = 0;\n          } else {\n            blue = red;\n            green = red = 0;\n          }\n        }\n        pixSetPixel(pix, im_x, im_y,\n                    (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT));\n      }\n    }\n  } while (index.Increment());\n  return pix;\n}\n\n// Prints the first and last num timesteps of the array for each 
feature.\nvoid NetworkIO::Print(int num) const {\n  int num_features = NumFeatures();\n  for (int y = 0; y < num_features; ++y) {\n    for (int t = 0; t < Width(); ++t) {\n      if (num == 0 || t < num || t + num >= Width()) {\n        if (int_mode_) {\n          tprintf(\" %g\", static_cast<float>(i_[t][y]) / INT8_MAX);\n        } else {\n          tprintf(\" %g\", f_[t][y]);\n        }\n      }\n    }\n    tprintf(\"\\n\");\n  }\n}\n\n// Copies a single time step from src.\nvoid NetworkIO::CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t) {\n  ASSERT_HOST(int_mode_ == src.int_mode_);\n  if (int_mode_) {\n    memcpy(i_[dest_t], src.i_[src_t], i_.dim2() * sizeof(i_[0][0]));\n  } else {\n    memcpy(f_[dest_t], src.f_[src_t], f_.dim2() * sizeof(f_[0][0]));\n  }\n}\n\n// Copies a part of single time step from src.\nvoid NetworkIO::CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features,\n                                    const NetworkIO &src, int src_t, int src_offset) {\n  ASSERT_HOST(int_mode_ == src.int_mode_);\n  if (int_mode_) {\n    memcpy(i_[dest_t] + dest_offset, src.i_[src_t] + src_offset, num_features * sizeof(i_[0][0]));\n  } else {\n    memcpy(f_[dest_t] + dest_offset, src.f_[src_t] + src_offset, num_features * sizeof(f_[0][0]));\n  }\n}\n\n// Sets the given range to random values.\nvoid NetworkIO::Randomize(int t, int offset, int num_features, TRand *randomizer) {\n  if (int_mode_) {\n    int8_t *line = i_[t] + offset;\n    for (int i = 0; i < num_features; ++i) {\n      line[i] = IntCastRounded(randomizer->SignedRand(INT8_MAX));\n    }\n  } else {\n    // float mode.\n    float *line = f_[t] + offset;\n    for (int i = 0; i < num_features; ++i) {\n      line[i] = randomizer->SignedRand(1.0);\n    }\n  }\n}\n\n// Helper returns the label and score of the best choice over a range.\nint NetworkIO::BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating,\n                                   float 
*certainty) const {\n  if (t_end <= t_start) {\n    return -1;\n  }\n  int max_char = -1;\n  float min_score = 0.0f;\n  for (int c = 0; c < NumFeatures(); ++c) {\n    if (c == not_this || c == null_ch) {\n      continue;\n    }\n    ScoresOverRange(t_start, t_end, c, null_ch, rating, certainty);\n    if (max_char < 0 || *rating < min_score) {\n      min_score = *rating;\n      max_char = c;\n    }\n  }\n  ScoresOverRange(t_start, t_end, max_char, null_ch, rating, certainty);\n  return max_char;\n}\n\n// Helper returns the rating and certainty of the choice over a range in output.\nvoid NetworkIO::ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating,\n                                float *certainty) const {\n  ASSERT_HOST(!int_mode_);\n  *rating = 0.0f;\n  *certainty = 0.0f;\n  if (t_end <= t_start || t_end <= 0) {\n    return;\n  }\n  float ratings[3] = {0.0f, 0.0f, 0.0f};\n  float certs[3] = {0.0f, 0.0f, 0.0f};\n  for (int t = t_start; t < t_end; ++t) {\n    const float *line = f_[t];\n    float score = ProbToCertainty(line[choice]);\n    float zero = ProbToCertainty(line[null_ch]);\n    if (t == t_start) {\n      ratings[2] = FLT_MAX;\n      ratings[1] = -score;\n      certs[1] = score;\n    } else {\n      for (int i = 2; i >= 1; --i) {\n        if (ratings[i] > ratings[i - 1]) {\n          ratings[i] = ratings[i - 1];\n          certs[i] = certs[i - 1];\n        }\n      }\n      ratings[2] -= zero;\n      if (zero < certs[2]) {\n        certs[2] = zero;\n      }\n      ratings[1] -= score;\n      if (score < certs[1]) {\n        certs[1] = score;\n      }\n    }\n    ratings[0] -= zero;\n    if (zero < certs[0]) {\n      certs[0] = zero;\n    }\n  }\n  int best_i = ratings[2] < ratings[1] ? 
2 : 1;\n  *rating = ratings[best_i] + t_end - t_start;\n  *certainty = certs[best_i];\n}\n\n// Returns the index (label) of the best value at the given timestep,\n// excluding not_this and not_that, and if not null, sets the score to the\n// log of the corresponding value.\nint NetworkIO::BestLabel(int t, int not_this, int not_that, float *score) const {\n  ASSERT_HOST(!int_mode_);\n  int best_index = -1;\n  float best_score = -FLT_MAX;\n  const float *line = f_[t];\n  for (int i = 0; i < f_.dim2(); ++i) {\n    if (line[i] > best_score && i != not_this && i != not_that) {\n      best_score = line[i];\n      best_index = i;\n    }\n  }\n  if (score != nullptr) {\n    *score = ProbToCertainty(best_score);\n  }\n  return best_index;\n}\n\n// Returns the best start position out of [start, end) (into which all labels\n// must fit) to obtain the highest cumulative score for the given labels.\nint NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const {\n  int length = labels.size();\n  int last_start = end - length;\n  int best_start = -1;\n  TFloat best_score = 0;\n  for (int s = start; s <= last_start; ++s) {\n    TFloat score = ScoreOfLabels(labels, s);\n    if (score > best_score || best_start < 0) {\n      best_score = score;\n      best_start = s;\n    }\n  }\n  return best_start;\n}\n\n// Returns the cumulative score of the given labels starting at start, and\n// using one label per time-step.\nTFloat NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const {\n  int length = labels.size();\n  TFloat score = 0;\n  for (int i = 0; i < length; ++i) {\n    score += f_(start + i, labels[i]);\n  }\n  return score;\n}\n\n// Helper function sets all the outputs for a single timestep, such that\n// label has value ok_score, and the other labels share 1 - ok_score.\nvoid NetworkIO::SetActivations(int t, int label, float ok_score) {\n  ASSERT_HOST(!int_mode_);\n  int num_classes = NumFeatures();\n  float bad_score = (1.0f - 
ok_score) / (num_classes - 1);\n  float *targets = f_[t];\n  for (int i = 0; i < num_classes; ++i) {\n    targets[i] = bad_score;\n  }\n  targets[label] = ok_score;\n}\n\n// Modifies the values, only if needed, so that the given label is\n// the winner at the given time step t.\nvoid NetworkIO::EnsureBestLabel(int t, int label) {\n  ASSERT_HOST(!int_mode_);\n  if (BestLabel(t, nullptr) != label) {\n    // Output value needs enhancing. Third all the other elements and add the\n    // remainder to best_label.\n    int num_classes = NumFeatures();\n    float *targets = f_[t];\n    for (int c = 0; c < num_classes; ++c) {\n      if (c == label) {\n        targets[c] += (1.0 - targets[c]) * (2 / 3.0);\n      } else {\n        targets[c] /= 3.0;\n      }\n    }\n  }\n}\n\n// Helper function converts prob to certainty taking the minimum into account.\n/* static */\nfloat NetworkIO::ProbToCertainty(float prob) {\n  return prob > kMinProb ? std::log(prob) : kMinCertainty;\n}\n\n// Returns true if there is any bad value that is suspiciously like a GT\n// error. Assuming that *this is the difference(gradient) between target\n// and forward output, returns true if there is a large negative value\n// (correcting a very confident output) for which there is no corresponding\n// positive value in an adjacent timestep for the same feature index. This\n// allows the box-truthed samples to make fine adjustments to position while\n// stopping other disagreements of confident output with ground truth.\nbool NetworkIO::AnySuspiciousTruth(float confidence_thr) const {\n  int num_features = NumFeatures();\n  for (int t = 0; t < Width(); ++t) {\n    const float *features = f_[t];\n    for (int y = 0; y < num_features; ++y) {\n      float grad = features[y];\n      if (grad < -confidence_thr) {\n        // Correcting strong output. 
Check for movement.\n        if ((t == 0 || f_[t - 1][y] < confidence_thr / 2) &&\n            (t + 1 == Width() || f_[t + 1][y] < confidence_thr / 2)) {\n          return true; // No strong positive on either side.\n        }\n      }\n    }\n  }\n  return false;\n}\n\n// Reads a single timestep to floats in the range [-1, 1].\nvoid NetworkIO::ReadTimeStep(int t, TFloat *output) const {\n  if (int_mode_) {\n    const int8_t *line = i_[t];\n    for (int i = 0; i < i_.dim2(); ++i) {\n      output[i] = static_cast<TFloat>(line[i]) / INT8_MAX;\n    }\n  } else {\n    const float *line = f_[t];\n    for (int i = 0; i < f_.dim2(); ++i) {\n      output[i] = static_cast<TFloat>(line[i]);\n    }\n  }\n}\n\n// Adds a single timestep to floats.\nvoid NetworkIO::AddTimeStep(int t, TFloat *inout) const {\n  int num_features = NumFeatures();\n  if (int_mode_) {\n    const int8_t *line = i_[t];\n    for (int i = 0; i < num_features; ++i) {\n      inout[i] += static_cast<TFloat>(line[i]) / INT8_MAX;\n    }\n  } else {\n    const float *line = f_[t];\n    for (int i = 0; i < num_features; ++i) {\n      inout[i] += line[i];\n    }\n  }\n}\n\n// Adds part of a single timestep to floats.\nvoid NetworkIO::AddTimeStepPart(int t, int offset, int num_features, float *inout) const {\n  if (int_mode_) {\n    const int8_t *line = i_[t] + offset;\n    for (int i = 0; i < num_features; ++i) {\n      inout[i] += static_cast<float>(line[i]) / INT8_MAX;\n    }\n  } else {\n    const float *line = f_[t] + offset;\n    for (int i = 0; i < num_features; ++i) {\n      inout[i] += line[i];\n    }\n  }\n}\n\n// Writes a single timestep from floats in the range [-1, 1].\nvoid NetworkIO::WriteTimeStep(int t, const TFloat *input) {\n  WriteTimeStepPart(t, 0, NumFeatures(), input);\n}\n\n// Writes a single timestep from floats in the range [-1, 1] writing only\n// num_features elements of input to (*this)[t], starting at offset.\nvoid NetworkIO::WriteTimeStepPart(int t, int offset, int num_features, const 
TFloat *input) {\n  if (int_mode_) {\n    int8_t *line = i_[t] + offset;\n    for (int i = 0; i < num_features; ++i) {\n      line[i] = ClipToRange<int>(IntCastRounded(input[i] * INT8_MAX), -INT8_MAX, INT8_MAX);\n    }\n  } else {\n    float *line = f_[t] + offset;\n    for (int i = 0; i < num_features; ++i) {\n      line[i] = static_cast<float>(input[i]);\n    }\n  }\n}\n\n// Maxpools a single time step from src.\nvoid NetworkIO::MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line) {\n  ASSERT_HOST(int_mode_ == src.int_mode_);\n  if (int_mode_) {\n    int dim = i_.dim2();\n    int8_t *dest_line = i_[dest_t];\n    const int8_t *src_line = src.i_[src_t];\n    for (int i = 0; i < dim; ++i) {\n      if (dest_line[i] < src_line[i]) {\n        dest_line[i] = src_line[i];\n        max_line[i] = src_t;\n      }\n    }\n  } else {\n    int dim = f_.dim2();\n    float *dest_line = f_[dest_t];\n    const float *src_line = src.f_[src_t];\n    for (int i = 0; i < dim; ++i) {\n      if (dest_line[i] < src_line[i]) {\n        dest_line[i] = src_line[i];\n        max_line[i] = src_t;\n      }\n    }\n  }\n}\n\n// Runs maxpool backward, using maxes to index timesteps in *this.\nvoid NetworkIO::MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes) {\n  ASSERT_HOST(!int_mode_);\n  Zero();\n  StrideMap::Index index(fwd.stride_map_);\n  do {\n    int t = index.t();\n    const int *max_line = maxes[t];\n    const float *fwd_line = fwd.f_[t];\n    int num_features = fwd.f_.dim2();\n    for (int i = 0; i < num_features; ++i) {\n      f_[max_line[i]][i] = fwd_line[i];\n    }\n  } while (index.Increment());\n}\n\n// Returns the min over time of the maxes over features of the outputs.\nfloat NetworkIO::MinOfMaxes() const {\n  float min_max = 0.0f;\n  int width = Width();\n  int num_features = NumFeatures();\n  for (int t = 0; t < width; ++t) {\n    float max_value = -FLT_MAX;\n    if (int_mode_) {\n      const int8_t *column = i_[t];\n      for 
(int i = 0; i < num_features; ++i) {\n        if (column[i] > max_value) {\n          max_value = column[i];\n        }\n      }\n    } else {\n      const float *column = f_[t];\n      for (int i = 0; i < num_features; ++i) {\n        if (column[i] > max_value) {\n          max_value = column[i];\n        }\n      }\n    }\n    if (t == 0 || max_value < min_max) {\n      min_max = max_value;\n    }\n  }\n  return min_max;\n}\n\n// Computes combined results for a combiner that chooses between an existing\n// input and itself, with an additional output to indicate the choice.\nvoid NetworkIO::CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output) {\n  int no = base_output.NumFeatures();\n  ASSERT_HOST(combiner_output.NumFeatures() == no + 1);\n  Resize(base_output, no);\n  int width = Width();\n  if (int_mode_) {\n    // Number of outputs from base and final result.\n    for (int t = 0; t < width; ++t) {\n      int8_t *out_line = i_[t];\n      const int8_t *base_line = base_output.i_[t];\n      const int8_t *comb_line = combiner_output.i_[t];\n      float base_weight = static_cast<float>(comb_line[no]) / INT8_MAX;\n      float boost_weight = 1.0f - base_weight;\n      for (int i = 0; i < no; ++i) {\n        out_line[i] = IntCastRounded(base_line[i] * base_weight + comb_line[i] * boost_weight);\n      }\n    }\n  } else {\n    for (int t = 0; t < width; ++t) {\n      float *out_line = f_[t];\n      const float *base_line = base_output.f_[t];\n      const float *comb_line = combiner_output.f_[t];\n      float base_weight = comb_line[no];\n      float boost_weight = 1.0f - base_weight;\n      for (int i = 0; i < no; ++i) {\n        out_line[i] = base_line[i] * base_weight + comb_line[i] * boost_weight;\n      }\n    }\n  }\n}\n\n// Computes deltas for a combiner that chooses between 2 sets of inputs.\nvoid NetworkIO::ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output) {\n  ASSERT_HOST(!int_mode_);\n  // Compute the 
deltas for the combiner.\n  int width = Width();\n  int no = NumFeatures() - 1;\n  ASSERT_HOST(fwd_deltas.NumFeatures() == no);\n  ASSERT_HOST(base_output.NumFeatures() == no);\n  // Number of outputs from base and final result.\n  for (int t = 0; t < width; ++t) {\n    const float *delta_line = fwd_deltas.f_[t];\n    const float *base_line = base_output.f_[t];\n    float *comb_line = f_[t];\n    float base_weight = comb_line[no];\n    float boost_weight = 1.0f - base_weight;\n    float max_base_delta = 0.0;\n    for (int i = 0; i < no; ++i) {\n      // What did the combiner actually produce?\n      float output = base_line[i] * base_weight + comb_line[i] * boost_weight;\n      // Reconstruct the target from the delta.\n      float comb_target = delta_line[i] + output;\n      comb_line[i] = comb_target - comb_line[i];\n      float base_delta = std::fabs(comb_target - base_line[i]);\n      if (base_delta > max_base_delta) {\n        max_base_delta = base_delta;\n      }\n    }\n    if (max_base_delta >= 0.5) {\n      // The base network got it wrong. The combiner should output the right\n      // answer and 0 for the base network.\n      comb_line[no] = 0.0 - base_weight;\n    } else {\n      // The base network was right. The combiner should flag that.\n      for (int i = 0; i < no; ++i) {\n        // All other targets are 0.\n        if (comb_line[i] > 0.0) {\n          comb_line[i] -= 1.0;\n        }\n      }\n      comb_line[no] = 1.0 - base_weight;\n    }\n  }\n}\n\n// Copies the array checking that the types match.\nvoid NetworkIO::CopyAll(const NetworkIO &src) {\n  ASSERT_HOST(src.int_mode_ == int_mode_);\n  f_ = src.f_;\n}\n\n// Checks that both are floats and adds the src array to *this.\nvoid NetworkIO::AddAllToFloat(const NetworkIO &src) {\n  ASSERT_HOST(!int_mode_);\n  ASSERT_HOST(!src.int_mode_);\n  f_ += src.f_;\n}\n\n// Subtracts the array from a float array. 
src must also be float.\nvoid NetworkIO::SubtractAllFromFloat(const NetworkIO &src) {\n  ASSERT_HOST(!int_mode_);\n  ASSERT_HOST(!src.int_mode_);\n  f_ -= src.f_;\n}\n\n// Copies src to *this, with maxabs normalization to match scale.\nvoid NetworkIO::CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale) {\n  ASSERT_HOST(!int_mode_);\n  ASSERT_HOST(!src.int_mode_);\n  ASSERT_HOST(!scale.int_mode_);\n  float src_max = src.f_.MaxAbs();\n  ASSERT_HOST(std::isfinite(src_max));\n  float scale_max = scale.f_.MaxAbs();\n  ASSERT_HOST(std::isfinite(scale_max));\n  if (src_max > 0.0f) {\n    float factor = scale_max / src_max;\n    for (int t = 0; t < src.Width(); ++t) {\n      const float *src_ptr = src.f_[t];\n      float *dest_ptr = f_[t];\n      for (int i = 0; i < src.f_.dim2(); ++i) {\n        dest_ptr[i] = src_ptr[i] * factor;\n      }\n    }\n  } else {\n    f_.Clear();\n  }\n}\n\n// Copies src to *this with independent reversal of the y dimension.\nvoid NetworkIO::CopyWithYReversal(const NetworkIO &src) {\n  int num_features = src.NumFeatures();\n  Resize(src, num_features);\n  StrideMap::Index b_index(src.stride_map_);\n  do {\n    int width = b_index.MaxIndexOfDim(FD_WIDTH) + 1;\n    StrideMap::Index fwd_index(b_index);\n    StrideMap::Index rev_index(b_index);\n    rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_HEIGHT), FD_HEIGHT);\n    do {\n      int fwd_t = fwd_index.t();\n      int rev_t = rev_index.t();\n      for (int x = 0; x < width; ++x) {\n        CopyTimeStepFrom(rev_t++, src, fwd_t++);\n      }\n    } while (fwd_index.AddOffset(1, FD_HEIGHT) && rev_index.AddOffset(-1, FD_HEIGHT));\n  } while (b_index.AddOffset(1, FD_BATCH));\n}\n\n// Copies src to *this with independent reversal of the x dimension.\nvoid NetworkIO::CopyWithXReversal(const NetworkIO &src) {\n  int num_features = src.NumFeatures();\n  Resize(src, num_features);\n  StrideMap::Index b_index(src.stride_map_);\n  do {\n    StrideMap::Index y_index(b_index);\n    do {\n      
StrideMap::Index fwd_index(y_index);\n      StrideMap::Index rev_index(y_index);\n      rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_WIDTH), FD_WIDTH);\n      do {\n        CopyTimeStepFrom(rev_index.t(), src, fwd_index.t());\n      } while (fwd_index.AddOffset(1, FD_WIDTH) && rev_index.AddOffset(-1, FD_WIDTH));\n    } while (y_index.AddOffset(1, FD_HEIGHT));\n  } while (b_index.AddOffset(1, FD_BATCH));\n}\n\n// Copies src to *this with independent transpose of the x and y dimensions.\nvoid NetworkIO::CopyWithXYTranspose(const NetworkIO &src) {\n  int num_features = src.NumFeatures();\n  stride_map_ = src.stride_map_;\n  stride_map_.TransposeXY();\n  ResizeToMap(src.int_mode(), stride_map_, num_features);\n  StrideMap::Index src_b_index(src.stride_map_);\n  StrideMap::Index dest_b_index(stride_map_);\n  do {\n    StrideMap::Index src_y_index(src_b_index);\n    StrideMap::Index dest_x_index(dest_b_index);\n    do {\n      StrideMap::Index src_x_index(src_y_index);\n      StrideMap::Index dest_y_index(dest_x_index);\n      do {\n        CopyTimeStepFrom(dest_y_index.t(), src, src_x_index.t());\n      } while (src_x_index.AddOffset(1, FD_WIDTH) && dest_y_index.AddOffset(1, FD_HEIGHT));\n    } while (src_y_index.AddOffset(1, FD_HEIGHT) && dest_x_index.AddOffset(1, FD_WIDTH));\n  } while (src_b_index.AddOffset(1, FD_BATCH) && dest_b_index.AddOffset(1, FD_BATCH));\n}\n\n// Copies src to *this, at the given feature_offset, returning the total\n// feature offset after the copy. 
Multiple calls will stack outputs from\n// multiple sources in feature space.\nint NetworkIO::CopyPacking(const NetworkIO &src, int feature_offset) {\n  ASSERT_HOST(int_mode_ == src.int_mode_);\n  int width = src.Width();\n  ASSERT_HOST(width <= Width());\n  int num_features = src.NumFeatures();\n  ASSERT_HOST(num_features + feature_offset <= NumFeatures());\n  if (int_mode_) {\n    for (int t = 0; t < width; ++t) {\n      memcpy(i_[t] + feature_offset, src.i_[t], num_features * sizeof(i_[t][0]));\n    }\n    for (int t = width; t < i_.dim1(); ++t) {\n      memset(i_[t], 0, num_features * sizeof(i_[t][0]));\n    }\n  } else {\n    for (int t = 0; t < width; ++t) {\n      memcpy(f_[t] + feature_offset, src.f_[t], num_features * sizeof(f_[t][0]));\n    }\n    for (int t = width; t < f_.dim1(); ++t) {\n      memset(f_[t], 0, num_features * sizeof(f_[t][0]));\n    }\n  }\n  return num_features + feature_offset;\n}\n\n// Opposite of CopyPacking, fills *this with a part of src, starting at\n// feature_offset, and picking num_features.\nvoid NetworkIO::CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features) {\n  Resize(src, num_features);\n  int width = src.Width();\n  ASSERT_HOST(num_features + feature_offset <= src.NumFeatures());\n  if (int_mode_) {\n    for (int t = 0; t < width; ++t) {\n      memcpy(i_[t], src.i_[t] + feature_offset, num_features * sizeof(i_[t][0]));\n    }\n  } else {\n    for (int t = 0; t < width; ++t) {\n      memcpy(f_[t], src.f_[t] + feature_offset, num_features * sizeof(f_[t][0]));\n    }\n  }\n}\n\n// Transposes the float part of *this into dest.\nvoid NetworkIO::Transpose(TransposedArray *dest) const {\n  int width = Width();\n  dest->ResizeNoInit(NumFeatures(), width);\n  for (int t = 0; t < width; ++t) {\n    dest->WriteStrided(t, f_[t]);\n  }\n}\n\n// Clips the content of a single time-step to +/-range.\nvoid NetworkIO::ClipVector(int t, float range) {\n  ASSERT_HOST(!int_mode_);\n  float *v = f_[t];\n  int dim = 
f_.dim2();\n  for (int i = 0; i < dim; ++i) {\n    v[i] = ClipToRange<float>(v[i], -range, range);\n  }\n}\n\n// Returns the padding required for the given number of features in order\n// for the SIMD operations to be safe.\n/* static */\nint NetworkIO::GetPadding(int num_features) {\n  int padding = 0;\n  if (IntSimdMatrix::intSimdMatrix) {\n    padding = IntSimdMatrix::intSimdMatrix->RoundInputs(num_features) - num_features;\n  }\n  return padding;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/networkio.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        networkio.h\n// Description: Network input/output data, allowing float/int implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_NETWORKIO_H_\n#define TESSERACT_LSTM_NETWORKIO_H_\n\n#include \"helpers.h\"\n#include \"image.h\"\n#include \"static_shape.h\"\n#include \"stridemap.h\"\n#include \"weightmatrix.h\"\n\n#include <cmath>\n#include <cstdio>\n#include <vector>\n\nstruct Pix;\n\nnamespace tesseract {\n\n// Class to contain all the input/output of a network, allowing for fixed or\n// variable-strided 2d to 1d mapping, and float or int8_t values. Provides\n// enough calculating functions to hide the detail of the implementation.\nclass TESS_API NetworkIO {\npublic:\n  NetworkIO() : int_mode_(false) {}\n  // Resizes the array (and stride), avoiding realloc if possible, to the given\n  // size from various size specs:\n  // Same stride size, but given number of features.\n  void Resize(const NetworkIO &src, int num_features) {\n    ResizeToMap(src.int_mode(), src.stride_map(), num_features);\n  }\n  // Resizes to a specific size as a 2-d temp buffer. 
No batches, no y-dim.\n  void Resize2d(bool int_mode, int width, int num_features);\n  // Resizes forcing a float representation with the stridemap of src and the\n  // given number of features.\n  void ResizeFloat(const NetworkIO &src, int num_features) {\n    ResizeToMap(false, src.stride_map(), num_features);\n  }\n  // Resizes to a specific stride_map.\n  void ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features);\n  // Shrinks image size by x_scale,y_scale, and use given number of features.\n  void ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features);\n  // Resizes to just 1 x-coord, whatever the input.\n  void ResizeXTo1(const NetworkIO &src, int num_features);\n  // Initialize all the array to zero.\n  void Zero();\n  // Initializes to zero all elements of the array that do not correspond to\n  // valid image positions. (If a batch of different-sized images are packed\n  // together, then there will be padding pixels.)\n  void ZeroInvalidElements();\n  // Sets up the array from the given image, using the currently set int_mode_.\n  // If the image width doesn't match the shape, the image is truncated or\n  // padded with noise to match.\n  void FromPix(const StaticShape &shape, const Image pix, TRand *randomizer);\n  // Sets up the array from the given set of images, using the currently set\n  // int_mode_. 
If the image width doesn't match the shape, the images are\n  // truncated or padded with noise to match.\n  void FromPixes(const StaticShape &shape, const std::vector<Image> &pixes,\n                 TRand *randomizer);\n  // Copies the given pix to *this at the given batch index, stretching and\n  // clipping the pixel values so that [black, black + 2*contrast] maps to the\n  // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int.\n  // This is a 2-d operation in the sense that the output depth is the number\n  // of input channels, the height is the height of the image, and the width\n  // is the width of the image, or truncated/padded with noise if the width\n  // is a fixed size.\n  void Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer);\n  // Copies the given pix to *this at the given batch index, as Copy2DImage\n  // above, except that the output depth is the height of the input image, the\n  // output height is 1, and the output width as for Copy2DImage.\n  // The image is thus treated as a 1-d set of vertical pixel strips.\n  void Copy1DGreyImage(int batch, Image pix, float black, float contrast, TRand *randomizer);\n  // Helper stores the pixel value in i_ or f_ according to int_mode_.\n  // t: is the index from the StrideMap corresponding to the current\n  //   [batch,y,x] position\n  // f: is the index into the depth/channel\n  // pixel: the value of the pixel from the image (in one channel)\n  // black: the pixel value to map to the lowest of the range of *this\n  // contrast: the range of pixel values to stretch to half the range of *this.\n  void SetPixel(int t, int f, int pixel, float black, float contrast);\n  // Converts the array to a Pix. Must be pixDestroyed after use.\n  Image ToPix() const;\n  // Prints the first and last num timesteps of the array for each feature.\n  void Print(int num) const;\n\n  // Returns the timestep width.\n  int Width() const {\n    return int_mode_ ? 
i_.dim1() : f_.dim1();\n  }\n  // Returns the number of features.\n  int NumFeatures() const {\n    return int_mode_ ? i_.dim2() : f_.dim2();\n  }\n  // Accessor to a timestep of the float matrix.\n  float *f(int t) {\n    ASSERT_HOST(!int_mode_);\n    return f_[t];\n  }\n  const float *f(int t) const {\n    ASSERT_HOST(!int_mode_);\n    return f_[t];\n  }\n  const int8_t *i(int t) const {\n    ASSERT_HOST(int_mode_);\n    return i_[t];\n  }\n  bool int_mode() const {\n    return int_mode_;\n  }\n  void set_int_mode(bool is_quantized) {\n    int_mode_ = is_quantized;\n  }\n  const StrideMap &stride_map() const {\n    return stride_map_;\n  }\n  void set_stride_map(const StrideMap &map) {\n    stride_map_ = map;\n  }\n  const GENERIC_2D_ARRAY<float> &float_array() const {\n    return f_;\n  }\n  GENERIC_2D_ARRAY<float> *mutable_float_array() {\n    return &f_;\n  }\n\n  // Copies a single time step from src.\n  void CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t);\n  // Copies a part of single time step from src.\n  void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, const NetworkIO &src,\n                           int src_t, int src_offset);\n  // Zeroes a single time step.\n  void ZeroTimeStep(int t) {\n    if (int_mode_) {\n      memset(i_[t], 0, sizeof(*i_[t]) * NumFeatures());\n    } else {\n      memset(f_[t], 0, sizeof(*f_[t]) * NumFeatures());\n    }\n  }\n  // Sets the given range to random values.\n  void Randomize(int t, int offset, int num_features, TRand *randomizer);\n\n  // Helper returns the label and score of the best choice over a range.\n  int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating,\n                          float *certainty) const;\n  // Helper returns the rating and certainty of the choice over a range in t.\n  void ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating,\n                       float *certainty) const;\n  // Returns the 
index (label) of the best value at the given timestep,\n  // and if not null, sets the score to the log of the corresponding value.\n  int BestLabel(int t, float *score) const {\n    return BestLabel(t, -1, -1, score);\n  }\n  // Returns the index (label) of the best value at the given timestep,\n  // excluding not_this and not_that, and if not null, sets the score to the\n  // log of the corresponding value.\n  int BestLabel(int t, int not_this, int not_that, float *score) const;\n  // Returns the best start position out of range (into which both start and end\n  // must fit) to obtain the highest cumulative score for the given labels.\n  int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const;\n  // Returns the cumulative score of the given labels starting at start, and\n  // using one label per time-step.\n  TFloat ScoreOfLabels(const std::vector<int> &labels, int start) const;\n  // Helper function sets all the outputs for a single timestep, such that\n  // label has value ok_score, and the other labels share 1 - ok_score.\n  // Assumes float mode.\n  void SetActivations(int t, int label, float ok_score);\n  // Modifies the values, only if needed, so that the given label is\n  // the winner at the given time step t.\n  // Assumes float mode.\n  void EnsureBestLabel(int t, int label);\n  // Helper function converts prob to certainty taking the minimum into account.\n  static float ProbToCertainty(float prob);\n  // Returns true if there is any bad value that is suspiciously like a GT\n  // error. Assuming that *this is the difference(gradient) between target\n  // and forward output, returns true if there is a large negative value\n  // (correcting a very confident output) for which there is no corresponding\n  // positive value in an adjacent timestep for the same feature index. 
This\n  // allows the box-truthed samples to make fine adjustments to position while\n  // stopping other disagreements of confident output with ground truth.\n  bool AnySuspiciousTruth(float confidence_thr) const;\n\n  // Reads a single timestep to floats in the range [-1, 1].\n  void ReadTimeStep(int t, TFloat *output) const;\n  // Adds a single timestep to floats.\n  void AddTimeStep(int t, TFloat *inout) const;\n  // Adds part of a single timestep to floats.\n  void AddTimeStepPart(int t, int offset, int num_features, float *inout) const;\n  // Writes a single timestep from floats in the range [-1, 1].\n  void WriteTimeStep(int t, const TFloat *input);\n  // Writes a single timestep from floats in the range [-1, 1] writing only\n  // num_features elements of input to (*this)[t], starting at offset.\n  void WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input);\n  // Maxpools a single time step from src.\n  void MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line);\n  // Runs maxpool backward, using maxes to index timesteps in *this.\n  void MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes);\n  // Returns the min over time of the maxes over features of the outputs.\n  float MinOfMaxes() const;\n  // Returns the min over time.\n  float Max() const {\n    return int_mode_ ? 
i_.Max() : f_.Max();\n  }\n  // Computes combined results for a combiner that chooses between an existing\n  // input and itself, with an additional output to indicate the choice.\n  void CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output);\n  // Computes deltas for a combiner that chooses between 2 sets of inputs.\n  void ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output);\n\n  // Copies the array checking that the types match.\n  void CopyAll(const NetworkIO &src);\n  // Adds the array to a float array, with scaling to [-1, 1] if the src is int.\n  void AddAllToFloat(const NetworkIO &src);\n  // Subtracts the array from a float array. src must also be float.\n  void SubtractAllFromFloat(const NetworkIO &src);\n\n  // Copies src to *this, with maxabs normalization to match scale.\n  void CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale);\n  // Multiplies the float data by the given factor.\n  void ScaleFloatBy(float factor) {\n    f_ *= factor;\n  }\n  // Copies src to *this with independent reversal of the y dimension.\n  void CopyWithYReversal(const NetworkIO &src);\n  // Copies src to *this with independent reversal of the x dimension.\n  void CopyWithXReversal(const NetworkIO &src);\n  // Copies src to *this with independent transpose of the x and y dimensions.\n  void CopyWithXYTranspose(const NetworkIO &src);\n  // Copies src to *this, at the given feature_offset, returning the total\n  // feature offset after the copy. Multiple calls will stack outputs from\n  // multiple sources in feature space.\n  int CopyPacking(const NetworkIO &src, int feature_offset);\n  // Opposite of CopyPacking, fills *this with a part of src, starting at\n  // feature_offset, and picking num_features. 
Resizes *this to match.\n  void CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features);\n  // Transposes the float part of *this into dest.\n  void Transpose(TransposedArray *dest) const;\n\n  // Clips the content of a single time-step to +/-range.\n  void ClipVector(int t, float range);\n\n  // Applies Func to timestep t of *this (u) and multiplies the result by v\n  // component-wise, putting the product in *product.\n  // *this and v may be int or float, but must match. The outputs are TFloat.\n  template <class Func>\n  void FuncMultiply(const NetworkIO &v_io, int t, TFloat *product) {\n    Func f;\n    ASSERT_HOST(!int_mode_);\n    ASSERT_HOST(!v_io.int_mode_);\n    int dim = f_.dim2();\n    if (int_mode_) {\n      const int8_t *u = i_[t];\n      const int8_t *v = v_io.i_[t];\n      for (int i = 0; i < dim; ++i) {\n        product[i] = f(u[i] / static_cast<TFloat>(INT8_MAX)) * v[i] / INT8_MAX;\n      }\n    } else {\n      const float *u = f_[t];\n      const float *v = v_io.f_[t];\n      for (int i = 0; i < dim; ++i) {\n        product[i] = f(u[i]) * v[i];\n      }\n    }\n  }\n  // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,\n  // component-wise, putting the product in *product.\n  // All NetworkIOs are assumed to be float.\n  template <class Func>\n  void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const TFloat *w,\n                     TFloat *product) const {\n    ASSERT_HOST(!int_mode_);\n    ASSERT_HOST(!v_io.int_mode_);\n    Func f;\n    const float *u = f_[u_t];\n    const float *v = v_io.f_[v_t];\n    int dim = f_.dim2();\n    for (int i = 0; i < dim; ++i) {\n      product[i] = f(u[i]) * v[i] * w[i];\n    }\n  }\n  // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,\n  // component-wise, adding the product to *product.\n  // All NetworkIOs are assumed to be float.\n  template <class Func>\n  void FuncMultiply3Add(const NetworkIO &v_io, int t, const TFloat *w, 
TFloat *product) const {\n    ASSERT_HOST(!int_mode_);\n    ASSERT_HOST(!v_io.int_mode_);\n    Func f;\n    const float *u = f_[t];\n    const float *v = v_io.f_[t];\n    int dim = f_.dim2();\n    for (int i = 0; i < dim; ++i) {\n      product[i] += f(u[i]) * v[i] * w[i];\n    }\n  }\n  // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w,\n  // component-wise, putting the product in product, all at timestep t, except\n  // w, which is a simple array. All NetworkIOs are assumed to be float.\n  template <class Func1, class Func2>\n  void Func2Multiply3(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const {\n    ASSERT_HOST(!int_mode_);\n    ASSERT_HOST(!v_io.int_mode_);\n    Func1 f;\n    Func2 g;\n    const float *u = f_[t];\n    const float *v = v_io.f_[t];\n    int dim = f_.dim2();\n    for (int i = 0; i < dim; ++i) {\n      product[i] = f(u[i]) * g(v[i]) * w[i];\n    }\n  }\n\nprivate:\n  // Returns the padding required for the given number of features in order\n  // for the SIMD operations to be safe.\n  static int GetPadding(int num_features);\n\n  // Choice of float vs 8 bit int for data.\n  GENERIC_2D_ARRAY<float> f_;\n  GENERIC_2D_ARRAY<int8_t> i_;\n  // Which of f_ and i_ are we actually using.\n  bool int_mode_;\n  // Stride for 2d input data.\n  StrideMap stride_map_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_NETWORKIO_H_\n"
  },
  {
    "path": "src/lstm/networkscratch.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        networkscratch.h\n// Description: Scratch space for Network layers that hides distinction\n//              between float/int implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_NETWORKSCRATCH_H_\n#define TESSERACT_LSTM_NETWORKSCRATCH_H_\n\n#include <mutex>\n#include \"matrix.h\"\n#include \"networkio.h\"\n\nnamespace tesseract {\n\n// Generic scratch space for network layers. Provides NetworkIO that can store\n// a complete set (over time) of intermediates, and vector<float>\n// scratch space that auto-frees after use. The aim here is to provide a set\n// of temporary buffers to network layers that can be reused between layers\n// and don't have to be reallocated on each call.\nclass NetworkScratch {\npublic:\n  NetworkScratch() : int_mode_(false) {}\n  ~NetworkScratch() = default;\n\n  // Sets the network representation. If the representation is integer, then\n  // default (integer) NetworkIOs are separated from the always-float variety.\n  // This saves memory by having separate int-specific and float-specific\n  // stacks. 
If the network representation is float, then all NetworkIOs go\n  // to the float stack.\n  void set_int_mode(bool int_mode) {\n    int_mode_ = int_mode;\n  }\n\n  // Class that acts like a NetworkIO (by having an implicit cast operator),\n  // yet actually holds a pointer to NetworkIOs in the source NetworkScratch,\n  // and knows how to unstack the borrowed pointers on destruction.\n  class IO {\n  public:\n    // The NetworkIO should be sized after construction.\n    IO(const NetworkIO &src, NetworkScratch *scratch)\n        : int_mode_(scratch->int_mode_ && src.int_mode()), scratch_space_(scratch) {\n      network_io_ =\n          int_mode_ ? scratch_space_->int_stack_.Borrow() : scratch_space_->float_stack_.Borrow();\n    }\n    // Default constructor for arrays. Use one of the Resize functions\n    // below to initialize and size.\n    IO() : int_mode_(false), network_io_(nullptr), scratch_space_(nullptr) {}\n\n    ~IO() {\n      if (scratch_space_ == nullptr) {\n        ASSERT_HOST(network_io_ == nullptr);\n      } else if (int_mode_) {\n        scratch_space_->int_stack_.Return(network_io_);\n      } else {\n        scratch_space_->float_stack_.Return(network_io_);\n      }\n    }\n    // Resizes the array (and stride), avoiding realloc if possible, to the\n    // size from various size specs:\n    // Same time size, given number of features.\n    void Resize(const NetworkIO &src, int num_features, NetworkScratch *scratch) {\n      if (scratch_space_ == nullptr) {\n        int_mode_ = scratch->int_mode_ && src.int_mode();\n        scratch_space_ = scratch;\n        network_io_ =\n            int_mode_ ? scratch_space_->int_stack_.Borrow() : scratch_space_->float_stack_.Borrow();\n      }\n      network_io_->Resize(src, num_features);\n    }\n    // Resizes to a specific size as a temp buffer. 
No batches, no y-dim.\n    void Resize2d(bool int_mode, int width, int num_features, NetworkScratch *scratch) {\n      if (scratch_space_ == nullptr) {\n        int_mode_ = scratch->int_mode_ && int_mode;\n        scratch_space_ = scratch;\n        network_io_ =\n            int_mode_ ? scratch_space_->int_stack_.Borrow() : scratch_space_->float_stack_.Borrow();\n      }\n      network_io_->Resize2d(int_mode, width, num_features);\n    }\n    // Resize forcing a float representation with the width of src and the given\n    // number of features.\n    void ResizeFloat(const NetworkIO &src, int num_features, NetworkScratch *scratch) {\n      if (scratch_space_ == nullptr) {\n        int_mode_ = false;\n        scratch_space_ = scratch;\n        network_io_ = scratch_space_->float_stack_.Borrow();\n      }\n      network_io_->ResizeFloat(src, num_features);\n    }\n\n    // Returns a ref to a NetworkIO that enables *this to be treated as if\n    // it were just a NetworkIO*.\n    NetworkIO &operator*() {\n      return *network_io_;\n    }\n    NetworkIO *operator->() {\n      return network_io_;\n    }\n    operator NetworkIO *() {\n      return network_io_;\n    }\n\n  private:\n    // True if this is from the always-float stack, otherwise the default stack.\n    bool int_mode_;\n    // The NetworkIO that we have borrowed from the scratch_space_.\n    NetworkIO *network_io_;\n    // The source scratch_space_. Borrowed pointer, used to free the\n    // NetworkIO. 
Don't delete!\n    NetworkScratch *scratch_space_;\n  }; // class IO.\n\n  // Class that acts like a fixed array of float, yet actually uses space\n  // from a vector<float> in the source NetworkScratch, and knows how\n  // to unstack the borrowed vector on destruction.\n  class FloatVec {\n  public:\n    // The array will have size elements in it, uninitialized.\n    FloatVec(int size, NetworkScratch *scratch) : vec_(nullptr), scratch_space_(scratch) {\n      Init(size, scratch);\n    }\n    // Default constructor is for arrays. Use Init to setup.\n    FloatVec() : vec_(nullptr), data_(nullptr), scratch_space_(nullptr) {}\n    ~FloatVec() {\n      if (scratch_space_ != nullptr) {\n        scratch_space_->vec_stack_.Return(vec_);\n      }\n    }\n\n    void Init(int /*size*/, int reserve, NetworkScratch *scratch) {\n      if (scratch_space_ != nullptr && vec_ != nullptr) {\n        scratch_space_->vec_stack_.Return(vec_);\n      }\n      scratch_space_ = scratch;\n      vec_ = scratch_space_->vec_stack_.Borrow();\n      // TODO: optimize.\n      vec_->resize(reserve);\n      data_ = &(*vec_)[0];\n    }\n\n    void Init(int size, NetworkScratch *scratch) {\n      Init(size, size, scratch);\n    }\n\n    // Use the cast operator instead of operator[] so the FloatVec can be used\n    // as a TFloat* argument to a function call.\n    operator TFloat *() const {\n      return data_;\n    }\n    TFloat *get() {\n      return data_;\n    }\n\n  private:\n    // Vector borrowed from the scratch space. Use Return to free it.\n    std::vector<TFloat> *vec_;\n    // Short-cut pointer to the underlying array.\n    TFloat *data_;\n    // The source scratch_space_. Borrowed pointer, used to free the\n    // vector. 
Don't delete!\n    NetworkScratch *scratch_space_;\n  }; // class FloatVec\n\n  // Class that acts like a 2-D array of TFloat, yet actually uses space\n  // from the source NetworkScratch, and knows how to unstack the borrowed\n  // array on destruction.\n  class GradientStore {\n  public:\n    // Default constructor is for arrays. Use Init to setup.\n    GradientStore() : array_(nullptr), scratch_space_(nullptr) {}\n    ~GradientStore() {\n      if (scratch_space_ != nullptr) {\n        scratch_space_->array_stack_.Return(array_);\n      }\n    }\n\n    void Init(int size1, int size2, NetworkScratch *scratch) {\n      if (scratch_space_ != nullptr && array_ != nullptr) {\n        scratch_space_->array_stack_.Return(array_);\n      }\n      scratch_space_ = scratch;\n      array_ = scratch_space_->array_stack_.Borrow();\n      array_->Resize(size1, size2, 0.0);\n    }\n\n    // Accessors to get to the underlying TransposedArray.\n    TransposedArray *get() const {\n      return array_;\n    }\n    const TransposedArray &operator*() const {\n      return *array_;\n    }\n\n  private:\n    // Array borrowed from the scratch space. Use Return to free it.\n    TransposedArray *array_;\n    // The source scratch_space_. Borrowed pointer, used to free the\n    // vector. 
Don't delete!\n    NetworkScratch *scratch_space_;\n  }; // class GradientStore\n\n  // Class that does the work of holding a stack of objects, a stack pointer\n  // and a vector of in-use flags, so objects can be returned out of order.\n  // It is safe to attempt to Borrow/Return in multiple threads.\n  template <typename T>\n  class Stack {\n  public:\n    Stack() = default;\n\n    ~Stack() {\n      for (auto data : stack_) {\n        delete data;\n      }\n    }\n\n    // Lends out the next free item, creating one if none available, sets\n    // the used flags and increments the stack top.\n    T *Borrow() {\n      std::lock_guard<std::mutex> lock(mutex_);\n      if (stack_top_ == stack_.size()) {\n        stack_.push_back(new T);\n        flags_.push_back(false);\n      }\n      flags_[stack_top_] = true;\n      return stack_[stack_top_++];\n    }\n    // Takes back the given item, and marks it free. Item does not have to be\n    // the most recently lent out, but free slots don't get re-used until the\n    // blocking item is returned. The assumption is that there will only be\n    // small, temporary variations from true stack use. (Determined by the order\n    // of destructors within a local scope.)\n    void Return(T *item) {\n      std::lock_guard<std::mutex> lock(mutex_);\n      // Linear search will do.\n      int index = stack_top_;\n      while (--index >= 0 && stack_[index] != item) {\n      }\n      if (index >= 0) {\n        flags_[index] = false;\n      }\n      while (stack_top_ > 0 && !flags_[stack_top_ - 1]) {\n        --stack_top_;\n      }\n    }\n\n  private:\n    std::vector<T *> stack_;\n    std::vector<bool> flags_;\n    unsigned stack_top_ = 0;\n    std::mutex mutex_;\n  }; // class Stack.\n\nprivate:\n  // If true, the network weights are int8_t, if false, float.\n  bool int_mode_;\n  // Stacks of NetworkIO and vector<float>. 
Once allocated, they are not\n  // deleted until the NetworkScratch is deleted.\n  Stack<NetworkIO> int_stack_;\n  Stack<NetworkIO> float_stack_;\n  Stack<std::vector<TFloat>> vec_stack_;\n  Stack<TransposedArray> array_stack_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_NETWORKSCRATCH_H_\n"
  },
  {
    "path": "src/lstm/parallel.cpp",
    "content": "/////////////////////////////////////////////////////////////////////////\n// File:        parallel.cpp\n// Description: Runs networks in parallel on the same input.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"parallel.h\"\n\n#ifdef _OPENMP\n#  include <omp.h>\n#endif\n\n#include \"functions.h\" // For conditional undef of _OPENMP.\n#include \"networkscratch.h\"\n\nnamespace tesseract {\n\n// ni_ and no_ will be set by AddToStack.\nParallel::Parallel(const std::string &name, NetworkType type) : Plumbing(name) {\n  type_ = type;\n}\n\n// Returns the shape output from the network given an input shape (which may\n// be partially unknown ie zero).\nStaticShape Parallel::OutputShape(const StaticShape &input_shape) const {\n  StaticShape result = stack_[0]->OutputShape(input_shape);\n  int stack_size = stack_.size();\n  for (int i = 1; i < stack_size; ++i) {\n    StaticShape shape = stack_[i]->OutputShape(input_shape);\n    result.set_depth(result.depth() + shape.depth());\n  }\n  return result;\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid Parallel::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                       NetworkScratch *scratch, NetworkIO *output) {\n  bool parallel_debug = false;\n  // If this parallel is a replicator of convolvers, or holds a 1-d LSTM pair,\n  // or a 2-d LSTM quad, do debug locally, and don't pass the flag on.\n  if (debug && type_ != NT_PARALLEL) {\n    parallel_debug = true;\n    debug = false;\n  }\n  int stack_size = stack_.size();\n  if (type_ == NT_PAR_2D_LSTM) {\n    // Special case, run parallel in parallel.\n    std::vector<NetworkScratch::IO> results(stack_size);\n    for (int i = 0; i < stack_size; ++i) {\n      results[i].Resize(input, stack_[i]->NumOutputs(), scratch);\n    }\n#ifdef _OPENMP\n#  pragma omp parallel for num_threads(stack_size)\n#endif\n    for (int i = 0; i < stack_size; ++i) {\n      stack_[i]->Forward(debug, input, nullptr, scratch, results[i]);\n    }\n    // Now pack all the results (serially) into the output.\n    int out_offset = 0;\n    output->Resize(*results[0], NumOutputs());\n    for (int i = 0; i < stack_size; ++i) {\n      out_offset = output->CopyPacking(*results[i], out_offset);\n    }\n  } else {\n    // Revolving intermediate result.\n    NetworkScratch::IO result(input, scratch);\n    // Source for divided replicated.\n    NetworkScratch::IO source_part;\n    TransposedArray *src_transpose = nullptr;\n    if (IsTraining() && type_ == NT_REPLICATED) {\n      // Make a transposed copy of the input.\n      input.Transpose(&transposed_input_);\n      src_transpose = &transposed_input_;\n    }\n    // Run each network, putting the outputs into result.\n    int out_offset = 0;\n    for (int i = 0; i < stack_size; ++i) {\n      stack_[i]->Forward(debug, input, src_transpose, scratch, result);\n      // All networks must have the same output width\n      if (i == 0) {\n        output->Resize(*result, NumOutputs());\n      } else {\n        ASSERT_HOST(result->Width() == output->Width());\n      }\n      out_offset = output->CopyPacking(*result, out_offset);\n    }\n  }\n#ifndef GRAPHICS_DISABLED\n  if (parallel_debug) {\n    DisplayForward(*output);\n  }\n#endif\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool Parallel::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                        NetworkIO *back_deltas) {\n  // If this parallel is a replicator of convolvers, or holds a 1-d LSTM pair,\n  // or a 2-d LSTM quad, do debug locally, and don't pass the flag on.\n  if (debug && type_ != NT_PARALLEL) {\n#ifndef GRAPHICS_DISABLED\n    DisplayBackward(fwd_deltas);\n#endif\n    debug = false;\n  }\n  auto stack_size = stack_.size();\n  if (type_ == NT_PAR_2D_LSTM) {\n    // Special case, run parallel in parallel.\n    std::vector<NetworkScratch::IO> in_deltas(stack_size);\n    std::vector<NetworkScratch::IO> out_deltas(stack_size);\n    // Split the forward deltas for each stack element.\n    int feature_offset = 0;\n    for (unsigned i = 0; i < stack_.size(); ++i) {\n      int num_features = stack_[i]->NumOutputs();\n      in_deltas[i].Resize(fwd_deltas, num_features, scratch);\n      out_deltas[i].Resize(fwd_deltas, stack_[i]->NumInputs(), scratch);\n      in_deltas[i]->CopyUnpacking(fwd_deltas, feature_offset, num_features);\n      feature_offset += num_features;\n    }\n#ifdef _OPENMP\n#  pragma omp parallel for num_threads(stack_size)\n#endif\n    for (unsigned i = 0; i < stack_size; ++i) {\n      stack_[i]->Backward(debug, *in_deltas[i], scratch, i == 0 ? back_deltas : out_deltas[i]);\n    }\n    if (needs_to_backprop_) {\n      for (unsigned i = 1; i < stack_size; ++i) {\n        back_deltas->AddAllToFloat(*out_deltas[i]);\n      }\n    }\n  } else {\n    // Revolving partial deltas.\n    NetworkScratch::IO in_deltas(fwd_deltas, scratch);\n    // The sum of deltas from different sources, which will eventually go into\n    // back_deltas.\n    NetworkScratch::IO out_deltas;\n    int feature_offset = 0;\n    for (unsigned i = 0; i < stack_.size(); ++i) {\n      int num_features = stack_[i]->NumOutputs();\n      in_deltas->CopyUnpacking(fwd_deltas, feature_offset, num_features);\n      feature_offset += num_features;\n      if (stack_[i]->Backward(debug, *in_deltas, scratch, back_deltas)) {\n        if (i == 0) {\n          out_deltas.ResizeFloat(*back_deltas, back_deltas->NumFeatures(), scratch);\n          out_deltas->CopyAll(*back_deltas);\n        } else if (back_deltas->NumFeatures() == out_deltas->NumFeatures()) {\n          // Widths are allowed to be different going back, as we may have\n          // input nets, so only accumulate the deltas if the widths are the\n          // same.\n          out_deltas->AddAllToFloat(*back_deltas);\n        }\n      }\n    }\n    if (needs_to_backprop_) {\n      back_deltas->CopyAll(*out_deltas);\n    }\n  }\n  if (needs_to_backprop_) {\n    back_deltas->ScaleFloatBy(1.0f / stack_size);\n  }\n  return needs_to_backprop_;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/parallel.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        parallel.h\n// Description: Runs networks in parallel on the same input.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_PARALLEL_H_\n#define TESSERACT_LSTM_PARALLEL_H_\n\n#include \"plumbing.h\"\n\nnamespace tesseract {\n\n// Runs multiple networks in parallel, interlacing their outputs.\nclass Parallel : public Plumbing {\npublic:\n  // ni_ and no_ will be set by AddToStack.\n  TESS_API\n  Parallel(const std::string &name, NetworkType type);\n\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  StaticShape OutputShape(const StaticShape &input_shape) const override;\n\n  std::string spec() const override {\n    std::string spec;\n    if (type_ == NT_PAR_2D_LSTM) {\n      // We have 4 LSTMs operating in parallel here, so the size of each is\n      // the number of outputs/4.\n      spec += \"L2xy\" + std::to_string(no_ / 4);\n    } else if (type_ == NT_PAR_RL_LSTM) {\n      // We have 2 LSTMs operating in parallel here, so the size of each is\n      // the number of outputs/2.\n      if (stack_[0]->type() == NT_LSTM_SUMMARY) {\n        spec += \"Lbxs\" + std::to_string(no_ / 2);\n      } else {\n        spec += \"Lbx\" + std::to_string(no_ / 2);\n      }\n    } else {\n      if (type_ == NT_REPLICATED) {\n        spec += \"R\" + std::to_string(stack_.size()) + \"(\" + stack_[0]->spec();\n      } else {\n        for (auto &it : stack_) {\n          spec += it->spec();\n        }\n      }\n      spec += \")\";\n    }\n    return spec;\n  }\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n\nprivate:\n  // If *this is a NT_REPLICATED, then it feeds a replicated network with\n  // identical inputs, and it would be extremely wasteful for them to each\n  // calculate and store the same transpose of the inputs, so Parallel does it\n  // and passes a pointer to the replicated network, allowing it to use the\n  // transpose on the next call to Backward.\n  TransposedArray transposed_input_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_PARALLEL_H_\n"
  },
  {
    "path": "src/lstm/plumbing.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        plumbing.cpp\n// Description: Base class for networks that organize other networks\n//              eg series or parallel.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"plumbing.h\"\n\nnamespace tesseract {\n\n// ni_ and no_ will be set by AddToStack.\nPlumbing::Plumbing(const std::string &name) : Network(NT_PARALLEL, name, 0, 0) {}\n\n// Suspends/Enables training by setting the training_ flag. Serialize and\n// DeSerialize only operate on the run-time data if state is false.\nvoid Plumbing::SetEnableTraining(TrainingState state) {\n  Network::SetEnableTraining(state);\n  for (auto &i : stack_) {\n    i->SetEnableTraining(state);\n  }\n}\n\n// Sets flags that control the action of the network. See NetworkFlags enum\n// for bit values.\nvoid Plumbing::SetNetworkFlags(uint32_t flags) {\n  Network::SetNetworkFlags(flags);\n  for (auto &i : stack_) {\n    i->SetNetworkFlags(flags);\n  }\n}\n\n// Sets up the network for training. Initializes weights using weights of\n// scale `range` picked according to the random number generator `randomizer`.\n// Note that randomizer is a borrowed pointer that should outlive the network\n// and should not be deleted by any of the networks.\n// Returns the number of weights initialized.\nint Plumbing::InitWeights(float range, TRand *randomizer) {\n  num_weights_ = 0;\n  for (auto &i : stack_) {\n    num_weights_ += i->InitWeights(range, randomizer);\n  }\n  return num_weights_;\n}\n\n// Recursively searches the network for softmaxes with old_no outputs,\n// and remaps their outputs according to code_map. See network.h for details.\nint Plumbing::RemapOutputs(int old_no, const std::vector<int> &code_map) {\n  num_weights_ = 0;\n  for (auto &i : stack_) {\n    num_weights_ += i->RemapOutputs(old_no, code_map);\n  }\n  return num_weights_;\n}\n\n// Converts a float network to an int network.\nvoid Plumbing::ConvertToInt() {\n  for (auto &i : stack_) {\n    i->ConvertToInt();\n  }\n}\n\n// Provides a pointer to a TRand for any networks that care to use it.\n// Note that randomizer is a borrowed pointer that should outlive the network\n// and should not be deleted by any of the networks.\nvoid Plumbing::SetRandomizer(TRand *randomizer) {\n  for (auto &i : stack_) {\n    i->SetRandomizer(randomizer);\n  }\n}\n\n// Adds the given network to the stack.\nvoid Plumbing::AddToStack(Network *network) {\n  if (stack_.empty()) {\n    ni_ = network->NumInputs();\n    no_ = network->NumOutputs();\n  } else if (type_ == NT_SERIES) {\n    // ni is input of first, no output of last, others match output to input.\n    ASSERT_HOST(no_ == network->NumInputs());\n    no_ = network->NumOutputs();\n  } else {\n    // All parallel types. Output is sum of outputs, inputs all match.\n    ASSERT_HOST(ni_ == network->NumInputs());\n    no_ += network->NumOutputs();\n  }\n  stack_.push_back(network);\n}\n\n// Sets needs_to_backprop_ to needs_backprop and calls on sub-network\n// according to needs_backprop || any weights in this network.\nbool Plumbing::SetupNeedsBackprop(bool needs_backprop) {\n  if (IsTraining()) {\n    needs_to_backprop_ = needs_backprop;\n    bool retval = needs_backprop;\n    for (auto &i : stack_) {\n      if (i->SetupNeedsBackprop(needs_backprop)) {\n        retval = true;\n      }\n    }\n    return retval;\n  }\n  // Frozen networks don't do backprop.\n  needs_to_backprop_ = false;\n  return false;\n}\n\n// Returns an integer reduction factor that the network applies to the\n// time sequence. Assumes that any 2-d is already eliminated. Used for\n// scaling bounding boxes of truth data.\n// WARNING: if GlobalMinimax is used to vary the scale, this will return\n// the last used scale factor. Call it before any forward, and it will return\n// the minimum scale factor of the paths through the GlobalMinimax.\nint Plumbing::XScaleFactor() const {\n  return stack_[0]->XScaleFactor();\n}\n\n// Provides the (minimum) x scale factor to the network (of interest only to\n// input units) so they can determine how to scale bounding boxes.\nvoid Plumbing::CacheXScaleFactor(int factor) {\n  for (auto &i : stack_) {\n    i->CacheXScaleFactor(factor);\n  }\n}\n\n// Provides debug output on the weights.\nvoid Plumbing::DebugWeights() {\n  for (auto &i : stack_) {\n    i->DebugWeights();\n  }\n}\n\n// Returns a set of strings representing the layer-ids of all layers below.\nvoid Plumbing::EnumerateLayers(const std::string *prefix, std::vector<std::string> &layers) const {\n  for (size_t i = 0; i < stack_.size(); ++i) {\n    std::string layer_name;\n    if (prefix) {\n      layer_name = *prefix;\n    }\n    layer_name += \":\" + std::to_string(i);\n    if (stack_[i]->IsPlumbingType()) {\n      auto *plumbing = static_cast<Plumbing *>(stack_[i]);\n      plumbing->EnumerateLayers(&layer_name, layers);\n    } else {\n      layers.push_back(layer_name);\n    }\n  }\n}\n\n// Returns a pointer to the network layer corresponding to the given id.\nNetwork *Plumbing::GetLayer(const char *id) const {\n  char *next_id;\n  int index = strtol(id, &next_id, 10);\n  if (index < 0 || static_cast<unsigned>(index) >= stack_.size()) {\n    return nullptr;\n  }\n  if (stack_[index]->IsPlumbingType()) {\n    auto *plumbing = static_cast<Plumbing *>(stack_[index]);\n    ASSERT_HOST(*next_id == ':');\n    return plumbing->GetLayer(next_id + 1);\n  }\n  return stack_[index];\n}\n\n// Returns a pointer to the learning rate for the given layer id.\nfloat *Plumbing::LayerLearningRatePtr(const char *id) {\n  char *next_id;\n  int index = strtol(id, &next_id, 10);\n  if (index < 0 || static_cast<unsigned>(index) >= stack_.size()) {\n    return nullptr;\n  }\n  if (stack_[index]->IsPlumbingType()) {\n    auto *plumbing = static_cast<Plumbing *>(stack_[index]);\n    ASSERT_HOST(*next_id == ':');\n    return plumbing->LayerLearningRatePtr(next_id + 1);\n  }\n  if (static_cast<unsigned>(index) >= learning_rates_.size()) {\n    return nullptr;\n  }\n  return &learning_rates_[index];\n}\n\n// Writes to the given file. Returns false in case of error.\nbool Plumbing::Serialize(TFile *fp) const {\n  if (!Network::Serialize(fp)) {\n    return false;\n  }\n  uint32_t size = stack_.size();\n  // Can't use PointerVector::Serialize here as we need a special DeSerialize.\n  if (!fp->Serialize(&size)) {\n    return false;\n  }\n  for (uint32_t i = 0; i < size; ++i) {\n    if (!stack_[i]->Serialize(fp)) {\n      return false;\n    }\n  }\n  if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->Serialize(learning_rates_)) {\n    return false;\n  }\n  return true;\n}\n\n// Reads from the given file. Returns false in case of error.\nbool Plumbing::DeSerialize(TFile *fp) {\n  for (auto data : stack_) {\n    delete data;\n  }\n  stack_.clear();\n  no_ = 0; // We will be modifying this as we AddToStack.\n  uint32_t size;\n  if (!fp->DeSerialize(&size)) {\n    return false;\n  }\n  for (uint32_t i = 0; i < size; ++i) {\n    Network *network = CreateFromFile(fp);\n    if (network == nullptr) {\n      return false;\n    }\n    AddToStack(network);\n  }\n  if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->DeSerialize(learning_rates_)) {\n    return false;\n  }\n  return true;\n}\n\n// Updates the weights using the given learning rate, momentum and adam_beta.\n// num_samples is used in the adam computation iff use_adam_ is true.\nvoid Plumbing::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {\n  for (size_t i = 0; i < stack_.size(); ++i) {\n    if (network_flags_ & NF_LAYER_SPECIFIC_LR) {\n      if (i < learning_rates_.size()) {\n        learning_rate = learning_rates_[i];\n      } else {\n        learning_rates_.push_back(learning_rate);\n      }\n    }\n    if (stack_[i]->IsTraining()) {\n      stack_[i]->Update(learning_rate, momentum, adam_beta, num_samples);\n    }\n  }\n}\n\n// Sums the products of weight updates in *this and other, splitting into\n// positive (same direction) in *same and negative (different direction) in\n// *changed.\nvoid Plumbing::CountAlternators(const Network &other, TFloat *same, TFloat *changed) const {\n  ASSERT_HOST(other.type() == type_);\n  const auto *plumbing = static_cast<const Plumbing *>(&other);\n  ASSERT_HOST(plumbing->stack_.size() == stack_.size());\n  for (size_t i = 0; i < stack_.size(); ++i) {\n    stack_[i]->CountAlternators(*plumbing->stack_[i], same, changed);\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/plumbing.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        plumbing.h\n// Description: Base class for networks that organize other networks\n//              eg series or parallel.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_PLUMBING_H_\n#define TESSERACT_LSTM_PLUMBING_H_\n\n#include \"matrix.h\"\n#include \"network.h\"\n\nnamespace tesseract {\n\n// Holds a collection of other networks and forwards calls to each of them.\nclass TESS_API Plumbing : public Network {\npublic:\n  // ni_ and no_ will be set by AddToStack.\n  explicit Plumbing(const std::string &name);\n  ~Plumbing() override {\n    for (auto data : stack_) {\n      delete data;\n    }\n  }\n\n  // Returns the required shape input to the network.\n  StaticShape InputShape() const override {\n    return stack_[0]->InputShape();\n  }\n  std::string spec() const override {\n    return \"Sub-classes of Plumbing must implement spec()!\";\n  }\n\n  // Returns true if the given type is derived from Plumbing, and thus contains\n  // multiple sub-networks that can have their own learning rate.\n  bool IsPlumbingType() const override {\n    return true;\n  }\n\n  // Suspends/Enables training by setting the training_ flag. Serialize and\n  // DeSerialize only operate on the run-time data if state is false.\n  void SetEnableTraining(TrainingState state) override;\n\n  // Sets flags that control the action of the network. See NetworkFlags enum\n  // for bit values.\n  void SetNetworkFlags(uint32_t flags) override;\n\n  // Sets up the network for training. Initializes weights using weights of\n  // scale `range` picked according to the random number generator `randomizer`.\n  // Note that randomizer is a borrowed pointer that should outlive the network\n  // and should not be deleted by any of the networks.\n  // Returns the number of weights initialized.\n  int InitWeights(float range, TRand *randomizer) override;\n  // Recursively searches the network for softmaxes with old_no outputs,\n  // and remaps their outputs according to code_map. See network.h for details.\n  int RemapOutputs(int old_no, const std::vector<int> &code_map) override;\n\n  // Converts a float network to an int network.\n  void ConvertToInt() override;\n\n  // Provides a pointer to a TRand for any networks that care to use it.\n  // Note that randomizer is a borrowed pointer that should outlive the network\n  // and should not be deleted by any of the networks.\n  void SetRandomizer(TRand *randomizer) override;\n\n  // Adds the given network to the stack.\n  virtual void AddToStack(Network *network);\n\n  // Sets needs_to_backprop_ to needs_backprop and returns true if\n  // needs_backprop || any weights in this network so the next layer forward\n  // can be told to produce backprop for this layer if needed.\n  bool SetupNeedsBackprop(bool needs_backprop) override;\n\n  // Returns an integer reduction factor that the network applies to the\n  // time sequence. Assumes that any 2-d is already eliminated. Used for\n  // scaling bounding boxes of truth data.\n  // WARNING: if GlobalMinimax is used to vary the scale, this will return\n  // the last used scale factor. Call it before any forward, and it will return\n  // the minimum scale factor of the paths through the GlobalMinimax.\n  int XScaleFactor() const override;\n\n  // Provides the (minimum) x scale factor to the network (of interest only to\n  // input units) so they can determine how to scale bounding boxes.\n  void CacheXScaleFactor(int factor) override;\n\n  // Provides debug output on the weights.\n  void DebugWeights() override;\n\n  // Returns the current stack.\n  const std::vector<Network *> &stack() const {\n    return stack_;\n  }\n  // Returns a set of strings representing the layer-ids of all layers below.\n  void EnumerateLayers(const std::string *prefix, std::vector<std::string> &layers) const;\n  // Returns a pointer to the network layer corresponding to the given id.\n  Network *GetLayer(const char *id) const;\n  // Returns the learning rate for a specific layer of the stack.\n  float LayerLearningRate(const char *id) {\n    const float *lr_ptr = LayerLearningRatePtr(id);\n    ASSERT_HOST(lr_ptr != nullptr);\n    return *lr_ptr;\n  }\n  // Scales the learning rate for a specific layer of the stack.\n  void ScaleLayerLearningRate(const char *id, double factor) {\n    float *lr_ptr = LayerLearningRatePtr(id);\n    ASSERT_HOST(lr_ptr != nullptr);\n    *lr_ptr *= factor;\n  }\n\n  // Set the learning rate for a specific layer of the stack to the given value.\n  void SetLayerLearningRate(const char *id, float learning_rate) {\n    float *lr_ptr = LayerLearningRatePtr(id);\n    ASSERT_HOST(lr_ptr != nullptr);\n    *lr_ptr = learning_rate;\n  }\n\n  // Returns a pointer to the learning rate for the given layer id.\n  float *LayerLearningRatePtr(const char *id);\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const override;\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp) override;\n\n  // Updates the weights using the given learning rate, momentum and adam_beta.\n  // num_samples is used in the adam computation iff use_adam_ is true.\n  void Update(float learning_rate, float momentum, float adam_beta, int num_samples) override;\n  // Sums the products of weight updates in *this and other, splitting into\n  // positive (same direction) in *same and negative (different direction) in\n  // *changed.\n  void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const override;\n\nprotected:\n  // The networks.\n  std::vector<Network *> stack_;\n  // Layer-specific learning rate iff network_flags_ & NF_LAYER_SPECIFIC_LR.\n  // One element for each element of stack_.\n  std::vector<float> learning_rates_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_PLUMBING_H_\n"
  },
  {
    "path": "src/lstm/recodebeam.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        recodebeam.cpp\n// Description: Beam search to decode from the re-encoded CJK as a sequence of\n//              smaller numbers in place of a single large code.\n// Author:      Ray Smith\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"recodebeam.h\"\n\n#include \"networkio.h\"\n#include \"pageres.h\"\n#include \"unicharcompress.h\"\n\n#include <algorithm> // for std::reverse\n\nnamespace tesseract {\n\n// The beam width at each code position.\nconst int RecodeBeamSearch::kBeamWidths[RecodedCharID::kMaxCodeLen + 1] = {\n    5, 10, 16, 16, 16, 16, 16, 16, 16, 16,\n};\n\nstatic const char *kNodeContNames[] = {\"Anything\", \"OnlyDup\", \"NoDup\"};\n\n// Prints debug details of the node.\nvoid RecodeNode::Print(int null_char, const UNICHARSET &unicharset,\n                       int depth) const {\n  if (code == null_char) {\n    tprintf(\"null_char\");\n  } else {\n    tprintf(\"label=%d, uid=%d=%s\", code, unichar_id,\n            unicharset.debug_str(unichar_id).c_str());\n  }\n  tprintf(\" score=%g, c=%g,%s%s%s perm=%d, hash=%\" PRIx64, score, certainty,\n          start_of_dawg ? \" DawgStart\" : \"\", start_of_word ? \" Start\" : \"\",\n          end_of_word ? \" End\" : \"\", permuter, code_hash);\n  if (depth > 0 && prev != nullptr) {\n    tprintf(\" prev:\");\n    prev->Print(null_char, unicharset, depth - 1);\n  } else {\n    tprintf(\"\\n\");\n  }\n}\n\n// Borrows the pointer, which is expected to survive until *this is deleted.\nRecodeBeamSearch::RecodeBeamSearch(const UnicharCompress &recoder,\n                                   int null_char, bool simple_text, Dict *dict)\n    : recoder_(recoder),\n      beam_size_(0),\n      top_code_(-1),\n      second_code_(-1),\n      dict_(dict),\n      space_delimited_(true),\n      is_simple_text_(simple_text),\n      null_char_(null_char) {\n  if (dict_ != nullptr && !dict_->IsSpaceDelimitedLang()) {\n    space_delimited_ = false;\n  }\n}\n\nRecodeBeamSearch::~RecodeBeamSearch() {\n  for (auto data : beam_) {\n    delete data;\n  }\n  for (auto data : secondary_beam_) {\n    delete data;\n  }\n}\n\n// Decodes the set of network outputs, storing the lattice internally.\nvoid RecodeBeamSearch::Decode(const NetworkIO &output, double dict_ratio,\n                              double cert_offset, double worst_dict_cert,\n                              const UNICHARSET *charset, int lstm_choice_mode) {\n  beam_size_ = 0;\n  int width = output.Width();\n  if (lstm_choice_mode) {\n    timesteps.clear();\n  }\n  for (int t = 0; t < width; ++t) {\n    ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);\n    DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,\n               charset);\n    if (lstm_choice_mode) {\n      SaveMostCertainChoices(output.f(t), output.NumFeatures(), charset, t);\n    }\n  }\n}\nvoid RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float> &output,\n                              double dict_ratio, double cert_offset,\n                              double worst_dict_cert,\n                              const UNICHARSET *charset) {\n  beam_size_ = 0;\n  int width = output.dim1();\n  for (int t = 0; t < width; ++t) {\n    ComputeTopN(output[t], output.dim2(), kBeamWidths[0]);\n    DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset);\n  }\n}\n\nvoid RecodeBeamSearch::DecodeSecondaryBeams(\n    const NetworkIO &output, double dict_ratio, double cert_offset,\n    double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode) {\n  for (auto data : secondary_beam_) {\n    delete data;\n  }\n  secondary_beam_.clear();\n  if (character_boundaries_.size() < 2) {\n    return;\n  }\n  int width = output.Width();\n  unsigned bucketNumber = 0;\n  for (int t = 0; t < width; ++t) {\n    while ((bucketNumber + 1) < character_boundaries_.size() &&\n           t >= character_boundaries_[bucketNumber + 1]) {\n      ++bucketNumber;\n    }\n    ComputeSecTopN(&(excludedUnichars)[bucketNumber], output.f(t),\n                   output.NumFeatures(), kBeamWidths[0]);\n    DecodeSecondaryStep(output.f(t), t, dict_ratio, cert_offset,\n                        worst_dict_cert, charset);\n  }\n}\n\nvoid RecodeBeamSearch::SaveMostCertainChoices(const float *outputs,\n                                              int num_outputs,\n                                              const UNICHARSET *charset,\n                                              int xCoord) {\n  std::vector<std::pair<const char *, float>> choices;\n  for (int i = 0; i < num_outputs; ++i) {\n    if (outputs[i] >= 0.01f) {\n      const char *character;\n      if (i + 2 >= num_outputs) {\n        character = \"\";\n      } else if (i > 0) {\n        character = charset->id_to_unichar_ext(i + 2);\n      } else {\n        character = charset->id_to_unichar_ext(i);\n      }\n      size_t pos = 0;\n      // order the possible choices within one timestep\n      // beginning with the most likely\n      while (choices.size() > pos && choices[pos].second > outputs[i]) {\n        pos++;\n      }\n      choices.insert(choices.begin() + pos,\n                     std::pair<const char *, float>(character, outputs[i]));\n    
    }
  }
  timesteps.push_back(choices);
}

// Groups the accumulated per-timestep choices into per-character segments
// using character_boundaries_, appending one segment per character to
// segmentedTimesteps.
void RecodeBeamSearch::segmentTimestepsByCharacters() {
  for (unsigned i = 1; i < character_boundaries_.size(); ++i) {
    std::vector<std::vector<std::pair<const char *, float>>> segment;
    for (int j = character_boundaries_[i - 1]; j < character_boundaries_[i];
         ++j) {
      segment.push_back(timesteps[j]);
    }
    segmentedTimesteps.push_back(segment);
  }
}

// Flattens the given per-character segments back into one flat sequence of
// per-timestep choices and returns it.
std::vector<std::vector<std::pair<const char *, float>>>
RecodeBeamSearch::combineSegmentedTimesteps(
    std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
        *segmentedTimesteps) {
  std::vector<std::vector<std::pair<const char *, float>>> combined_timesteps;
  for (auto &segmentedTimestep : *segmentedTimesteps) {
    for (auto &j : segmentedTimestep) {
      combined_timesteps.push_back(j);
    }
  }
  return combined_timesteps;
}

// Computes character boundaries as the midpoints between the end of one
// character and the start of the next. The first boundary is 0 and the last
// is maxWidth. Assumes starts has one more entry than ends, with
// starts[i + 1] being the start that follows ends[i] -- the layout produced
// by ExtractPathAsUnicharIds.
void RecodeBeamSearch::calculateCharBoundaries(std::vector<int> *starts,
                                               std::vector<int> *ends,
                                               std::vector<int> *char_bounds_,
                                               int maxWidth) {
  char_bounds_->push_back(0);
  for (unsigned i = 0; i < ends->size(); ++i) {
    int middle = ((*starts)[i + 1] - (*ends)[i]) / 2;
    char_bounds_->push_back((*ends)[i] + middle);
  }
  // Replace the final midpoint with the full width so the last character
  // extends to the end of the line.
  char_bounds_->pop_back();
  char_bounds_->push_back(maxWidth);
}

// Returns the best path as labels/scores/xcoords similar to simple CTC.
void RecodeBeamSearch::ExtractBestPathAsLabels(
    std::vector<int> *labels, std::vector<int> *xcoords) const {
  labels->clear();
  xcoords->clear();
  std::vector<const RecodeNode *> best_nodes;
  ExtractBestPaths(&best_nodes, nullptr);
  // Now just run CTC on the best nodes.
  int t = 0;
  int width = best_nodes.size();
  while (t < width) {
    int label = best_nodes[t]->code;
    if (label != null_char_) {
      labels->push_back(label);
      xcoords->push_back(t);
    }
    // Skip repeats of this label (CTC collapsing), except in simple-text
    // mode where every timestep is significant.
    while (++t < width && !is_simple_text_ && best_nodes[t]->code == label) {
    }
  }
  xcoords->push_back(width);
}

// Returns the best path as unichar-ids/certs/ratings/xcoords skipping
// duplicates, nulls and intermediate parts.
void RecodeBeamSearch::ExtractBestPathAsUnicharIds(
    bool debug, const UNICHARSET *unicharset, std::vector<int> *unichar_ids,
    std::vector<float> *certs, std::vector<float> *ratings,
    std::vector<int> *xcoords) const {
  std::vector<const RecodeNode *> best_nodes;
  ExtractBestPaths(&best_nodes, nullptr);
  ExtractPathAsUnicharIds(best_nodes, unichar_ids, certs, ratings, xcoords);
  if (debug) {
    DebugPath(unicharset, best_nodes);
    DebugUnicharPath(unicharset, best_nodes, *unichar_ids, *certs, *ratings,
                     *xcoords);
  }
}

// Returns the best path as a set of WERD_RES.
void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX &line_box,
                                              float scale_factor, bool debug,
                                              const UNICHARSET *unicharset,
                                              PointerVector<WERD_RES> *words,
                                              int lstm_choice_mode) {
  words->truncate(0);
  std::vector<int> unichar_ids;
  std::vector<float> certs;
  std::vector<float> ratings;
  std::vector<int> xcoords;
  std::vector<const RecodeNode *> best_nodes;
  std::vector<const RecodeNode *> second_nodes;
  character_boundaries_.clear();
  ExtractBestPaths(&best_nodes, &second_nodes);
  if (debug) {
    DebugPath(unicharset, best_nodes);
    ExtractPathAsUnicharIds(second_nodes, &unichar_ids, &certs, &ratings,
                            &xcoords);
    tprintf("\nSecond choice path:\n");
    DebugUnicharPath(unicharset, second_nodes, unichar_ids, certs, ratings,
                     xcoords);
  }
  // If lstm choice mode is required in granularity level 2, it
  // stores the x coordinates of every chosen character, to match the
  // alternative choices to it.
  ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords,
                          &character_boundaries_);
  int num_ids = unichar_ids.size();
  if (debug) {
    DebugUnicharPath(unicharset, best_nodes, unichar_ids, certs, ratings,
                     xcoords);
  }
  // Convert labels to unichar-ids.
  int word_end = 0;
  float prev_space_cert = 0.0f;
  for (int word_start = 0; word_start < num_ids; word_start = word_end) {
    for (word_end = word_start + 1; word_end < num_ids; ++word_end) {
      // A word is terminated when a space character or start_of_word flag is
      // hit. We also want to force a separate word for every non
      // space-delimited character when not in a dictionary context.
      if (unichar_ids[word_end] == UNICHAR_SPACE) {
        break;
      }
      int index = xcoords[word_end];
      if (best_nodes[index]->start_of_word) {
        break;
      }
      if (best_nodes[index]->permuter == TOP_CHOICE_PERM &&
          (!unicharset->IsSpaceDelimited(unichar_ids[word_end]) ||
           !unicharset->IsSpaceDelimited(unichar_ids[word_end - 1]))) {
        break;
      }
    }
    // Certainty of the space that terminates this word (if any).
    float space_cert = 0.0f;
    if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) {
      space_cert = certs[word_end];
    }
    bool leading_space =
        word_start > 0 && unichar_ids[word_start - 1] == UNICHAR_SPACE;
    // Create a WERD_RES for the output word.
    WERD_RES *word_res =
        InitializeWord(leading_space, line_box, word_start, word_end,
                       std::min(space_cert, prev_space_cert), unicharset,
                       xcoords, scale_factor);
    // Fill the diagonal of the ratings matrix with one choice per character.
    for (int i = word_start; i < word_end; ++i) {
      auto *choices = new BLOB_CHOICE_LIST;
      BLOB_CHOICE_IT bc_it(choices);
      auto *choice = new BLOB_CHOICE(unichar_ids[i], ratings[i], certs[i],
-1,\n                                     1.0f, static_cast<float>(INT16_MAX), 0.0f,\n                                     BCC_STATIC_CLASSIFIER);\n      int col = i - word_start;\n      choice->set_matrix_cell(col, col);\n      bc_it.add_after_then_move(choice);\n      word_res->ratings->put(col, col, choices);\n    }\n    int index = xcoords[word_end - 1];\n    word_res->FakeWordFromRatings(best_nodes[index]->permuter);\n    words->push_back(word_res);\n    prev_space_cert = space_cert;\n    if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) {\n      ++word_end;\n    }\n  }\n}\n\nstruct greater_than {\n  inline bool operator()(const RecodeNode *&node1, const RecodeNode *&node2) const {\n    return (node1->score > node2->score);\n  }\n};\n\nvoid RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs,\n                                  const UNICHARSET *charset,\n                                  bool secondary) const {\n  std::vector<std::vector<const RecodeNode *>> topology;\n  std::unordered_set<const RecodeNode *> visited;\n  const std::vector<RecodeBeam *> &beam = !secondary ? 
beam_ : secondary_beam_;\n  // create the topology\n  for (int step = beam.size() - 1; step >= 0; --step) {\n    std::vector<const RecodeNode *> layer;\n    topology.push_back(layer);\n  }\n  // fill the topology with depths first\n  for (int step = beam.size() - 1; step >= 0; --step) {\n    std::vector<tesseract::RecodePair> &heaps = beam.at(step)->beams_->heap();\n    for (auto &&node : heaps) {\n      int backtracker = 0;\n      const RecodeNode *curr = &node.data();\n      while (curr != nullptr && !visited.count(curr)) {\n        visited.insert(curr);\n        topology[step - backtracker].push_back(curr);\n        curr = curr->prev;\n        ++backtracker;\n      }\n    }\n  }\n  int ct = 0;\n  unsigned cb = 1;\n  for (const std::vector<const RecodeNode *> &layer : topology) {\n    if (cb >= character_boundaries_.size()) {\n      break;\n    }\n    if (ct == character_boundaries_[cb]) {\n      tprintf(\"***\\n\");\n      ++cb;\n    }\n    for (const RecodeNode *node : layer) {\n      const char *code;\n      int intCode;\n      if (node->unichar_id != INVALID_UNICHAR_ID) {\n        code = charset->id_to_unichar(node->unichar_id);\n        intCode = node->unichar_id;\n      } else if (node->code == null_char_) {\n        intCode = 0;\n        code = \" \";\n      } else {\n        intCode = 666;\n        code = \"*\";\n      }\n      int intPrevCode = 0;\n      const char *prevCode;\n      float prevScore = 0;\n      if (node->prev != nullptr) {\n        prevScore = node->prev->score;\n        if (node->prev->unichar_id != INVALID_UNICHAR_ID) {\n          prevCode = charset->id_to_unichar(node->prev->unichar_id);\n          intPrevCode = node->prev->unichar_id;\n        } else if (node->code == null_char_) {\n          intPrevCode = 0;\n          prevCode = \" \";\n        } else {\n          prevCode = \"*\";\n          intPrevCode = 666;\n        }\n      } else {\n        prevCode = \" \";\n      }\n      if (uids) {\n        
        tprintf("%x(|)%f(>)%x(|)%f\n", intPrevCode, prevScore, intCode,
                node->score);
      } else {
        tprintf("%s(|)%f(>)%s(|)%f\n", prevCode, prevScore, code, node->score);
      }
    }
    tprintf("-\n");
    ++ct;
  }
  tprintf("***\n");
}

// Extracts the best remaining symbol choice for every character segment into
// ctc_choices, and adds the codes used to excludedUnichars so that a
// following secondary decode is forced onto different choices.
void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
  if (character_boundaries_.size() < 2) {
    return;
  }
  // For the first iteration the original beam is analyzed. After that a
  // new beam is calculated based on the results from the original beam.
  std::vector<RecodeBeam *> &currentBeam =
      secondary_beam_.empty() ? beam_ : secondary_beam_;
  character_boundaries_[0] = 0;
  for (unsigned j = 1; j < character_boundaries_.size(); ++j) {
    std::vector<int> unichar_ids;
    std::vector<float> certs;
    std::vector<float> ratings;
    std::vector<int> xcoords;
    // Number of timesteps that make up this character.
    int backpath = character_boundaries_[j] - character_boundaries_[j - 1];
    // Heap at the last timestep of this character.
    std::vector<tesseract::RecodePair> &heaps =
        currentBeam.at(character_boundaries_[j] - 1)->beams_->heap();
    std::vector<const RecodeNode *> best_nodes;
    std::vector<const RecodeNode *> best;
    // Scan the segmented node chain for valid unichar ids.
    for (auto &&entry : heaps) {
      bool validChar = false;
      int backcounter = 0;
      const RecodeNode *node = &entry.data();
      while (node != nullptr && backcounter < backpath) {
        if (node->code != null_char_ &&
            node->unichar_id != INVALID_UNICHAR_ID) {
          validChar = true;
          break;
        }
        node = node->prev;
        ++backcounter;
      }
      if (validChar) {
        best.push_back(&entry.data());
      }
    }
    // find the best rated segmented node chain and extract the unichar id.
    if (!best.empty()) {
      std::sort(best.begin(), best.end(), greater_than());
      ExtractPath(best[0], &best_nodes, backpath);
      ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
                              &xcoords);
    }
    if (!unichar_ids.empty()) {
      // Pick the unichar with the lowest (best) rating.
      int bestPos = 0;
      for (unsigned i = 1; i < unichar_ids.size(); ++i) {
        if (ratings[i] < ratings[bestPos]) {
          bestPos = i;
        }
      }
#if 0 // TODO: bestCode is currently unused (see commit 2dd5d0d60).
      int bestCode = -10;
      for (auto &node : best_nodes) {
        if (node->unichar_id == unichar_ids[bestPos]) {
          bestCode = node->code;
        }
      }
#endif
      // Exclude the best choice for the followup decoding.
      std::unordered_set<int> excludeCodeList;
      for (auto &best_node : best_nodes) {
        if (best_node->code != null_char_) {
          excludeCodeList.insert(best_node->code);
        }
      }
      // Merge into the existing exclusion set for this character, or start
      // a new one if this is the first pass for this position.
      if (j - 1 < excludedUnichars.size()) {
        for (auto elem : excludeCodeList) {
          excludedUnichars[j - 1].insert(elem);
        }
      } else {
        excludedUnichars.push_back(excludeCodeList);
      }
      // Save the best choice for the choice iterator.
      if (j - 1 < ctc_choices.size()) {
        int id = unichar_ids[bestPos];
        const char *result = unicharset->id_to_unichar_ext(id);
        float rating = ratings[bestPos];
        ctc_choices[j - 1].push_back(
            std::pair<const char *, float>(result, rating));
      } else {
        std::vector<std::pair<const char *, float>> choice;
        int id = unichar_ids[bestPos];
        const char *result = unicharset->id_to_unichar_ext(id);
        float rating = ratings[bestPos];
        choice.emplace_back(result, rating);
        ctc_choices.push_back(choice);
      }
      // fill the blank spot with an empty array
    } else {
      if (j - 1 >= excludedUnichars.size()) {
        std::unordered_set<int> excludeCodeList;
        excludedUnichars.push_back(excludeCodeList);
      }
      if (j - 1 >=
          ctc_choices.size()) {
        std::vector<std::pair<const char *, float>> choice;
        ctc_choices.push_back(choice);
      }
    }
  }
  // The secondary beam has been consumed; free it for the next iteration.
  for (auto data : secondary_beam_) {
    delete data;
  }
  secondary_beam_.clear();
}

// Generates debug output of the content of the beams after a Decode.
void RecodeBeamSearch::DebugBeams(const UNICHARSET &unicharset) const {
  for (int p = 0; p < beam_size_; ++p) {
    for (int d = 0; d < 2; ++d) {
      // d indexes non-dict (0) vs dict (1) beams.
      for (int c = 0; c < NC_COUNT; ++c) {
        auto cont = static_cast<NodeContinuation>(c);
        int index = BeamIndex(d, cont, 0);
        if (beam_[p]->beams_[index].empty()) {
          continue;
        }
        // Print all the best scoring nodes for each unichar found.
        tprintf("Position %d: %s+%s beam\n", p, d ? "Dict" : "Non-Dict",
                kNodeContNames[c]);
        DebugBeamPos(unicharset, beam_[p]->beams_[index]);
      }
    }
  }
}

// Generates debug output of the content of a single beam position.
void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset,
                                    const RecodeHeap &heap) const {
  // Best node seen per unichar-id, plus the best pure null node.
  std::vector<const RecodeNode *> unichar_bests(unicharset.size());
  const RecodeNode *null_best = nullptr;
  int heap_size = heap.size();
  for (int i = 0; i < heap_size; ++i) {
    const RecodeNode *node = &heap.get(i).data();
    if (node->unichar_id == INVALID_UNICHAR_ID) {
      if (null_best == nullptr || null_best->score < node->score) {
        null_best = node;
      }
    } else {
      if (unichar_bests[node->unichar_id] == nullptr ||
          unichar_bests[node->unichar_id]->score < node->score) {
        unichar_bests[node->unichar_id] = node;
      }
    }
  }
  for (auto &unichar_best : unichar_bests) {
    if (unichar_best != nullptr) {
      const RecodeNode &node = *unichar_best;
      node.Print(null_char_, unicharset, 1);
    }
  }
  if (null_best != nullptr) {
    null_best->Print(null_char_, unicharset, 1);
  }
}

// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
// duplicates, nulls and intermediate parts. If character_boundaries is
// non-null, also computes the boundary positions between characters.
/* static */
void RecodeBeamSearch::ExtractPathAsUnicharIds(
    const std::vector<const RecodeNode *> &best_nodes,
    std::vector<int> *unichar_ids, std::vector<float> *certs,
    std::vector<float> *ratings, std::vector<int> *xcoords,
    std::vector<int> *character_boundaries) {
  unichar_ids->clear();
  certs->clear();
  ratings->clear();
  xcoords->clear();
  std::vector<int> starts;
  std::vector<int> ends;
  // Backtrack extracting only valid, non-duplicate unichar-ids.
  int t = 0;
  int width = best_nodes.size();
  while (t < width) {
    double certainty = 0.0;
    double rating = 0.0;
    // Skip nodes with no unichar-id, folding their certainty and rating
    // into the adjacent character.
    while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) {
      double cert = best_nodes[t++]->certainty;
      if (cert < certainty) {
        certainty = cert;
      }
      rating -= cert;
    }
    starts.push_back(t);
    if (t < width) {
      int unichar_id = best_nodes[t]->unichar_id;
      if (unichar_id == UNICHAR_SPACE && !certs->empty() &&
          best_nodes[t]->permuter != NO_PERM) {
        // All the rating and certainty go on the previous character except
        // for the space itself.
        if (certainty < certs->back()) {
          certs->back() = certainty;
        }
        ratings->back() += rating;
        certainty = 0.0;
        rating = 0.0;
      }
      unichar_ids->push_back(unichar_id);
      xcoords->push_back(t);
      do {
        double cert = best_nodes[t++]->certainty;
        // Special-case NO-PERM space to forget the certainty of the previous
        // nulls.
        // (See long comment in ContinueContext.)
        if (cert < certainty || (unichar_id == UNICHAR_SPACE &&
                                 best_nodes[t - 1]->permuter == NO_PERM)) {
          certainty = cert;
        }
        rating -= cert;
      } while (t < width && best_nodes[t]->duplicate);
      ends.push_back(t);
      certs->push_back(certainty);
      ratings->push_back(rating);
    } else if (!certs->empty()) {
      // Trailing nulls: fold their certainty/rating into the last character.
      if (certainty < certs->back()) {
        certs->back() = certainty;
      }
      ratings->back() += rating;
    }
  }
  // starts gets one extra entry so starts[i + 1] is defined for every end.
  starts.push_back(width);
  if (character_boundaries != nullptr) {
    calculateCharBoundaries(&starts, &ends, character_boundaries, width);
  }
  xcoords->push_back(width);
}

// Sets up a word with the ratings matrix and fake blobs with boxes in the
// right places.
WERD_RES *RecodeBeamSearch::InitializeWord(bool leading_space,
                                           const TBOX &line_box, int word_start,
                                           int word_end, float space_certainty,
                                           const UNICHARSET *unicharset,
                                           const std::vector<int> &xcoords,
                                           float scale_factor) {
  // Make a fake blob for each non-zero label.
  C_BLOB_LIST blobs;
  C_BLOB_IT b_it(&blobs);
  for (int i = word_start; i < word_end; ++i) {
    if (static_cast<unsigned>(i + 1) < character_boundaries_.size()) {
      // Scale the character boundaries back into image coordinates relative
      // to the line box.
      TBOX box(static_cast<int16_t>(
                   std::floor(character_boundaries_[i] * scale_factor)) +
                   line_box.left(),
               line_box.bottom(),
               static_cast<int16_t>(
                   std::ceil(character_boundaries_[i + 1] * scale_factor)) +
                   line_box.left(),
               line_box.top());
      b_it.add_after_then_move(C_BLOB::FakeBlob(box));
    }
  }
  // Make a fake word from the blobs.
  WERD *word =
      new WERD(&blobs, leading_space, nullptr);
  // Make a WERD_RES from the word.
  auto *word_res = new WERD_RES(word);
  word_res->end = word_end - word_start + leading_space;
  word_res->uch_set = unicharset;
  word_res->combination = true; // Give it ownership of the word.
  word_res->space_certainty = space_certainty;
  word_res->ratings = new MATRIX(word_end - word_start, 1);
  return word_res;
}

// Fills top_n_flags_ with bools that are true iff the corresponding output
// is one of the top_n.
void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs,
                                   int top_n) {
  top_n_flags_.clear();
  top_n_flags_.resize(num_outputs, TN_ALSO_RAN);
  top_code_ = -1;
  second_code_ = -1;
  top_heap_.clear();
  // Maintain a min-heap of the top_n best outputs.
  for (int i = 0; i < num_outputs; ++i) {
    if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key()) {
      TopPair entry(outputs[i], i);
      top_heap_.Push(&entry);
      if (top_heap_.size() > top_n) {
        top_heap_.Pop(&entry);
      }
    }
  }
  // Pop in ascending order; the final two popped entries are the top-2 and
  // also set second_code_ then top_code_.
  while (!top_heap_.empty()) {
    TopPair entry;
    top_heap_.Pop(&entry);
    if (top_heap_.size() > 1) {
      top_n_flags_[entry.data()] = TN_TOPN;
    } else {
      top_n_flags_[entry.data()] = TN_TOP2;
      if (top_heap_.empty()) {
        top_code_ = entry.data();
      } else {
        second_code_ = entry.data();
      }
    }
  }
  // The null char is always treated as a top-2 choice.
  top_n_flags_[null_char_] = TN_TOP2;
}

// Variant of ComputeTopN that skips the codes listed in exList when
// selecting the top_n outputs (used by the secondary beam decode).
void RecodeBeamSearch::ComputeSecTopN(std::unordered_set<int> *exList,
                                      const float *outputs, int num_outputs,
                                      int top_n) {
  top_n_flags_.clear();
  top_n_flags_.resize(num_outputs, TN_ALSO_RAN);
  top_code_ = -1;
  second_code_ = -1;
  top_heap_.clear();
  for (int i = 0; i < num_outputs; ++i) {
    if ((top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key()) &&
        !exList->count(i)) {
      TopPair entry(outputs[i], i);
      top_heap_.Push(&entry);
      if (top_heap_.size() > top_n) {
        top_heap_.Pop(&entry);
      }
    }
  }
  // Same flag assignment as ComputeTopN: last two popped are the top-2.
  while (!top_heap_.empty()) {
    TopPair entry;
    top_heap_.Pop(&entry);
    if (top_heap_.size() > 1) {
      top_n_flags_[entry.data()] = TN_TOPN;
    } else {
      top_n_flags_[entry.data()] = TN_TOP2;
      if (top_heap_.empty()) {
        top_code_ = entry.data();
      } else {
        second_code_ = entry.data();
      }
    }
  }
  top_n_flags_[null_char_] = TN_TOP2;
}

// Adds the computation for the current time-step to the beam. Call at each
// time-step in sequence from left to right. outputs is the activation vector
// for the current timestep.
void RecodeBeamSearch::DecodeStep(const float *outputs, int t,
                                  double dict_ratio, double cert_offset,
                                  double worst_dict_cert,
                                  const UNICHARSET *charset, bool debug) {
  // Grow the beam lazily, one step per timestep.
  if (t == static_cast<int>(beam_.size())) {
    beam_.push_back(new RecodeBeam);
  }
  RecodeBeam *step = beam_[t];
  beam_size_ = t + 1;
  step->Clear();
  if (t == 0) {
    // The first step can only use singles and initials.
    ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2,
                    charset, dict_ratio, cert_offset, worst_dict_cert, step);
    if (dict_ != nullptr) {
      ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs,
                      TN_TOP2, charset, dict_ratio, cert_offset,
                      worst_dict_cert, step);
    }
  } else {
    RecodeBeam *prev = beam_[t - 1];
    if (debug) {
      int beam_index = BeamIndex(true, NC_ANYTHING, 0);
      for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
        std::vector<const RecodeNode *> path;
        ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
        tprintf("Step %d: Dawg beam %d:\n", t, i);
        DebugPath(charset,
                  path);
      }
      beam_index = BeamIndex(false, NC_ANYTHING, 0);
      for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
        std::vector<const RecodeNode *> path;
        ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
        tprintf("Step %d: Non-Dawg beam %d:\n", t, i);
        DebugPath(charset, path);
      }
    }
    int total_beam = 0;
    // Work through the scores by group (top-2, top-n, the rest) while the beam
    // is empty. This enables extending the context using only the top-n results
    // first, which may have an empty intersection with the valid codes, so we
    // fall back to the rest if the beam is empty.
    for (int tn = 0; tn < TN_COUNT && total_beam == 0; ++tn) {
      auto top_n = static_cast<TopNState>(tn);
      for (int index = 0; index < kNumBeams; ++index) {
        // Working backwards through the heaps doesn't guarantee that we see the
        // best first, but it comes before a lot of the worst, so it is slightly
        // more efficient than going forwards.
        for (int i = prev->beams_[index].size() - 1; i >= 0; --i) {
          ContinueContext(&prev->beams_[index].get(i).data(), index, outputs,
                          top_n, charset, dict_ratio, cert_offset,
                          worst_dict_cert, step);
        }
      }
      // Only the NC_ANYTHING beams count towards the emptiness check.
      for (int index = 0; index < kNumBeams; ++index) {
        if (ContinuationFromBeamsIndex(index) == NC_ANYTHING) {
          total_beam += step->beams_[index].size();
        }
      }
    }
    // Special case for the best initial dawg.
    // Push it on the heap if good enough, but there is only one, so it
    // doesn't blow up the beam.
    for (int c = 0; c < NC_COUNT; ++c) {
      if (step->best_initial_dawgs_[c].code >= 0) {
        int index = BeamIndex(true, static_cast<NodeContinuation>(c), 0);
        RecodeHeap *dawg_heap = &step->beams_[index];
        PushHeapIfBetter(kBeamWidths[0], &step->best_initial_dawgs_[c],
                         dawg_heap);
      }
    }
  }
}

// Secondary-beam counterpart of DecodeStep: same logic, but operates on
// secondary_beam_ and does not update beam_size_.
void RecodeBeamSearch::DecodeSecondaryStep(
    const float *outputs, int t, double dict_ratio, double cert_offset,
    double worst_dict_cert, const UNICHARSET *charset, bool debug) {
  // Grow the secondary beam lazily, one step per timestep.
  if (t == static_cast<int>(secondary_beam_.size())) {
    secondary_beam_.push_back(new RecodeBeam);
  }
  RecodeBeam *step = secondary_beam_[t];
  step->Clear();
  if (t == 0) {
    // The first step can only use singles and initials.
    ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2,
                    charset, dict_ratio, cert_offset, worst_dict_cert, step);
    if (dict_ != nullptr) {
      ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs,
                      TN_TOP2, charset, dict_ratio, cert_offset,
                      worst_dict_cert, step);
    }
  } else {
    RecodeBeam *prev = secondary_beam_[t - 1];
    if (debug) {
      int beam_index = BeamIndex(true, NC_ANYTHING, 0);
      for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
        std::vector<const RecodeNode *> path;
        ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
        tprintf("Step %d: Dawg beam %d:\n", t, i);
        DebugPath(charset, path);
      }
      beam_index = BeamIndex(false, NC_ANYTHING, 0);
      for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
        std::vector<const RecodeNode *> path;
        ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
        tprintf("Step %d: Non-Dawg beam %d:\n", t,
                i);
        DebugPath(charset, path);
      }
    }
    int total_beam = 0;
    // Work through the scores by group (top-2, top-n, the rest) while the beam
    // is empty. This enables extending the context using only the top-n results
    // first, which may have an empty intersection with the valid codes, so we
    // fall back to the rest if the beam is empty.
    for (int tn = 0; tn < TN_COUNT && total_beam == 0; ++tn) {
      auto top_n = static_cast<TopNState>(tn);
      for (int index = 0; index < kNumBeams; ++index) {
        // Working backwards through the heaps doesn't guarantee that we see the
        // best first, but it comes before a lot of the worst, so it is slightly
        // more efficient than going forwards.
        for (int i = prev->beams_[index].size() - 1; i >= 0; --i) {
          ContinueContext(&prev->beams_[index].get(i).data(), index, outputs,
                          top_n, charset, dict_ratio, cert_offset,
                          worst_dict_cert, step);
        }
      }
      // Only the NC_ANYTHING beams count towards the emptiness check.
      for (int index = 0; index < kNumBeams; ++index) {
        if (ContinuationFromBeamsIndex(index) == NC_ANYTHING) {
          total_beam += step->beams_[index].size();
        }
      }
    }
    // Special case for the best initial dawg. Push it on the heap if good
    // enough, but there is only one, so it doesn't blow up the beam.
    for (int c = 0; c < NC_COUNT; ++c) {
      if (step->best_initial_dawgs_[c].code >= 0) {
        int index = BeamIndex(true, static_cast<NodeContinuation>(c), 0);
        RecodeHeap *dawg_heap = &step->beams_[index];
        PushHeapIfBetter(kBeamWidths[0], &step->best_initial_dawgs_[c],
                         dawg_heap);
      }
    }
  }
}

// Adds to the appropriate beams the legal (according to recoder)
// continuations of context prev, which is of the given length, using the
// given network outputs to provide scores to the choices.
// Uses only those choices for which top_n_flags[index] == top_n_flag.
void RecodeBeamSearch::ContinueContext(
    const RecodeNode *prev, int index, const float *outputs,
    TopNState top_n_flag, const UNICHARSET *charset, double dict_ratio,
    double cert_offset, double worst_dict_cert, RecodeBeam *step) {
  RecodedCharID prefix;
  RecodedCharID full_code;
  const RecodeNode *previous = prev;
  int length = LengthFromBeamsIndex(index);
  bool use_dawgs = IsDawgFromBeamsIndex(index);
  NodeContinuation prev_cont = ContinuationFromBeamsIndex(index);
  // Reconstruct the code prefix for this context by walking back over the
  // previous nodes, skipping duplicates and nulls.
  for (int p = length - 1; p >= 0 && previous != nullptr; --p) {
    while (previous->duplicate || previous->code == null_char_) {
      previous = previous->prev;
    }
    prefix.Set(p, previous->code);
    full_code.Set(p, previous->code);
    previous = previous->prev;
  }
  if (prev != nullptr && !is_simple_text_) {
    if (top_n_flags_[prev->code] == top_n_flag) {
      // Continue the previous code as a duplicate.
      if (prev_cont != NC_NO_DUP) {
        float cert =
            NetworkIO::ProbToCertainty(outputs[prev->code]) + cert_offset;
        PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id,
                                cert, worst_dict_cert, dict_ratio, use_dawgs,
                                NC_ANYTHING, prev, step);
      }
      // For TN_TOP2, also consider a duplicate whose probability combines
      // the code with null, but which forbids further duplicates.
      if (prev_cont == NC_ANYTHING && top_n_flag == TN_TOP2 &&
          prev->code != null_char_) {
        float cert = NetworkIO::ProbToCertainty(outputs[prev->code] +
                                                outputs[null_char_]) +
                     cert_offset;
        PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id,
                                cert, worst_dict_cert, dict_ratio, use_dawgs,
                                NC_NO_DUP, prev, step);
      }
    }
    if (prev_cont == NC_ONLY_DUP) {
      return;
    }
    if (prev->code != null_char_ && length > 0 &&
        top_n_flags_[null_char_] == top_n_flag) {
      // Allow nulls within multi code sequences, as the nulls within are not
      // explicitly included in the code sequence.
      float cert =
          NetworkIO::ProbToCertainty(outputs[null_char_]) + cert_offset;
      PushDupOrNoDawgIfBetter(length, false, null_char_, INVALID_UNICHAR_ID,
                              cert, worst_dict_cert, dict_ratio, use_dawgs,
                              NC_ANYTHING, prev, step);
    }
  }
  // Try every code that completes a unichar from the current prefix.
  const std::vector<int> *final_codes = recoder_.GetFinalCodes(prefix);
  if (final_codes != nullptr) {
    for (int code : *final_codes) {
      if (top_n_flags_[code] != top_n_flag) {
        continue;
      }
      // A repeat of the previous code was already handled as a duplicate.
      if (prev != nullptr && prev->code == code && !is_simple_text_) {
        continue;
      }
      float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset;
      if (cert < kMinCertainty && code != null_char_) {
        continue;
      }
      full_code.Set(length, code);
      int unichar_id = recoder_.DecodeUnichar(full_code);
      // Map the null char to INVALID.
      if (length == 0 && code == null_char_) {
        unichar_id = INVALID_UNICHAR_ID;
      }
      if (unichar_id != INVALID_UNICHAR_ID && charset != nullptr &&
          !charset->get_enabled(unichar_id)) {
        continue; // disabled by whitelist/blacklist
      }
      ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio,
                      use_dawgs, NC_ANYTHING, prev, step);
      if (top_n_flag == TN_TOP2 && code != null_char_) {
        // Combine with null (and with prev's output when prev and code are
        // the top-2 codes) for the duplicate-only continuation.
        float prob = outputs[code] + outputs[null_char_];
        if (prev != nullptr && prev_cont == NC_ANYTHING &&
            prev->code != null_char_ &&
            ((prev->code == top_code_ && code == second_code_) ||
             (code == top_code_ && prev->code == second_code_))) {
          prob += outputs[prev->code];
        }
        cert = NetworkIO::ProbToCertainty(prob) + cert_offset;
        ContinueUnichar(code, unichar_id, cert, worst_dict_cert,
dict_ratio,\n                        use_dawgs, NC_ONLY_DUP, prev, step);\n      }\n    }\n  }\n  const std::vector<int> *next_codes = recoder_.GetNextCodes(prefix);\n  if (next_codes != nullptr) {\n    for (int code : *next_codes) {\n      if (top_n_flags_[code] != top_n_flag) {\n        continue;\n      }\n      if (prev != nullptr && prev->code == code && !is_simple_text_) {\n        continue;\n      }\n      float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset;\n      PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID, cert,\n                              worst_dict_cert, dict_ratio, use_dawgs,\n                              NC_ANYTHING, prev, step);\n      if (top_n_flag == TN_TOP2 && code != null_char_) {\n        float prob = outputs[code] + outputs[null_char_];\n        if (prev != nullptr && prev_cont == NC_ANYTHING &&\n            prev->code != null_char_ &&\n            ((prev->code == top_code_ && code == second_code_) ||\n             (code == top_code_ && prev->code == second_code_))) {\n          prob += outputs[prev->code];\n        }\n        cert = NetworkIO::ProbToCertainty(prob) + cert_offset;\n        PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID,\n                                cert, worst_dict_cert, dict_ratio, use_dawgs,\n                                NC_ONLY_DUP, prev, step);\n      }\n    }\n  }\n}\n\n// Continues for a new unichar, using dawg or non-dawg as per flag.\nvoid RecodeBeamSearch::ContinueUnichar(int code, int unichar_id, float cert,\n                                       float worst_dict_cert, float dict_ratio,\n                                       bool use_dawgs, NodeContinuation cont,\n                                       const RecodeNode *prev,\n                                       RecodeBeam *step) {\n  if (use_dawgs) {\n    if (cert > worst_dict_cert) {\n      ContinueDawg(code, unichar_id, cert, cont, prev, step);\n    }\n  } else {\n    RecodeHeap 
*nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)];\n    PushHeapIfBetter(kBeamWidths[0], code, unichar_id, TOP_CHOICE_PERM, false,\n                     false, false, false, cert * dict_ratio, prev, nullptr,\n                     nodawg_heap);\n    if (dict_ != nullptr &&\n        ((unichar_id == UNICHAR_SPACE && cert > worst_dict_cert) ||\n         !dict_->getUnicharset().IsSpaceDelimited(unichar_id))) {\n      // Any top choice position that can start a new word, ie a space or\n      // any non-space-delimited character, should also be considered\n      // by the dawg search, so push initial dawg to the dawg heap.\n      float dawg_cert = cert;\n      PermuterType permuter = TOP_CHOICE_PERM;\n      // Since we use the space either side of a dictionary word in the\n      // certainty of the word, (to properly handle weak spaces) and the\n      // space is coming from a non-dict word, we need special conditions\n      // to avoid degrading the certainty of the dict word that follows.\n      // With a space we don't multiply the certainty by dict_ratio, and we\n      // flag the space with NO_PERM to indicate that we should not use the\n      // predecessor nulls to generate the confidence for the space, as they\n      // have already been multiplied by dict_ratio, and we can't go back to\n      // insert more entries in any previous heaps.\n      if (unichar_id == UNICHAR_SPACE) {\n        permuter = NO_PERM;\n      } else {\n        dawg_cert *= dict_ratio;\n      }\n      PushInitialDawgIfBetter(code, unichar_id, permuter, false, false,\n                              dawg_cert, cont, prev, step);\n    }\n  }\n}\n\n// Adds a RecodeNode composed of the tuple (code, unichar_id, cert, prev,\n// appropriate-dawg-args, cert) to the given heap (dawg_beam_) if unichar_id\n// is a valid continuation of whatever is in prev.\nvoid RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert,\n                                    NodeContinuation cont,\n          
                          const RecodeNode *prev, RecodeBeam *step) {\n  RecodeHeap *dawg_heap = &step->beams_[BeamIndex(true, cont, 0)];\n  RecodeHeap *nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)];\n  if (unichar_id == INVALID_UNICHAR_ID) {\n    PushHeapIfBetter(kBeamWidths[0], code, unichar_id, NO_PERM, false, false,\n                     false, false, cert, prev, nullptr, dawg_heap);\n    return;\n  }\n  // Avoid dictionary probe if score a total loss.\n  float score = cert;\n  if (prev != nullptr) {\n    score += prev->score;\n  }\n  if (dawg_heap->size() >= kBeamWidths[0] &&\n      score <= dawg_heap->PeekTop().data().score &&\n      nodawg_heap->size() >= kBeamWidths[0] &&\n      score <= nodawg_heap->PeekTop().data().score) {\n    return;\n  }\n  const RecodeNode *uni_prev = prev;\n  // Prev may be a partial code, null_char, or duplicate, so scan back to the\n  // last valid unichar_id.\n  while (uni_prev != nullptr &&\n         (uni_prev->unichar_id == INVALID_UNICHAR_ID || uni_prev->duplicate)) {\n    uni_prev = uni_prev->prev;\n  }\n  if (unichar_id == UNICHAR_SPACE) {\n    if (uni_prev != nullptr && uni_prev->end_of_word) {\n      // Space is good. 
Push initial state, to the dawg beam and a regular\n      // space to the top choice beam.\n      PushInitialDawgIfBetter(code, unichar_id, uni_prev->permuter, false,\n                              false, cert, cont, prev, step);\n      PushHeapIfBetter(kBeamWidths[0], code, unichar_id, uni_prev->permuter,\n                       false, false, false, false, cert, prev, nullptr,\n                       nodawg_heap);\n    }\n    return;\n  } else if (uni_prev != nullptr && uni_prev->start_of_dawg &&\n             uni_prev->unichar_id != UNICHAR_SPACE &&\n             dict_->getUnicharset().IsSpaceDelimited(uni_prev->unichar_id) &&\n             dict_->getUnicharset().IsSpaceDelimited(unichar_id)) {\n    return; // Can't break words between space delimited chars.\n  }\n  DawgPositionVector initial_dawgs;\n  auto *updated_dawgs = new DawgPositionVector;\n  DawgArgs dawg_args(&initial_dawgs, updated_dawgs, NO_PERM);\n  bool word_start = false;\n  if (uni_prev == nullptr) {\n    // Starting from beginning of line.\n    dict_->default_dawgs(&initial_dawgs, false);\n    word_start = true;\n  } else if (uni_prev->dawgs != nullptr) {\n    // Continuing a previous dict word.\n    dawg_args.active_dawgs = uni_prev->dawgs;\n    word_start = uni_prev->start_of_dawg;\n  } else {\n    return; // Can't continue if not a dict word.\n  }\n  auto permuter = static_cast<PermuterType>(dict_->def_letter_is_okay(\n      &dawg_args, dict_->getUnicharset(), unichar_id, false));\n  if (permuter != NO_PERM) {\n    PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false,\n                     word_start, dawg_args.valid_end, false, cert, prev,\n                     dawg_args.updated_dawgs, dawg_heap);\n    if (dawg_args.valid_end && !space_delimited_) {\n      // We can start another word right away, so push initial state as well,\n      // to the dawg beam, and the regular character to the top choice beam,\n      // since non-dict words can start here too.\n      
PushInitialDawgIfBetter(code, unichar_id, permuter, word_start, true,\n                              cert, cont, prev, step);\n      PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false,\n                       word_start, true, false, cert, prev, nullptr,\n                       nodawg_heap);\n    }\n  } else {\n    delete updated_dawgs;\n  }\n}\n\n// Adds a RecodeNode composed of the tuple (code, unichar_id,\n// initial-dawg-state, prev, cert) to the given heap if/ there is room or if\n// better than the current worst element if already full.\nvoid RecodeBeamSearch::PushInitialDawgIfBetter(int code, int unichar_id,\n                                               PermuterType permuter,\n                                               bool start, bool end, float cert,\n                                               NodeContinuation cont,\n                                               const RecodeNode *prev,\n                                               RecodeBeam *step) {\n  RecodeNode *best_initial_dawg = &step->best_initial_dawgs_[cont];\n  float score = cert;\n  if (prev != nullptr) {\n    score += prev->score;\n  }\n  if (best_initial_dawg->code < 0 || score > best_initial_dawg->score) {\n    auto *initial_dawgs = new DawgPositionVector;\n    dict_->default_dawgs(initial_dawgs, false);\n    RecodeNode node(code, unichar_id, permuter, true, start, end, false, cert,\n                    score, prev, initial_dawgs,\n                    ComputeCodeHash(code, false, prev));\n    *best_initial_dawg = node;\n  }\n}\n\n// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter,\n// false, false, false, false, cert, prev, nullptr) to heap if there is room\n// or if better than the current worst element if already full.\n/* static */\nvoid RecodeBeamSearch::PushDupOrNoDawgIfBetter(\n    int length, bool dup, int code, int unichar_id, float cert,\n    float worst_dict_cert, float dict_ratio, bool use_dawgs,\n    NodeContinuation cont, const 
RecodeNode *prev, RecodeBeam *step) {\n  int index = BeamIndex(use_dawgs, cont, length);\n  if (use_dawgs) {\n    if (cert > worst_dict_cert) {\n      PushHeapIfBetter(kBeamWidths[length], code, unichar_id,\n                       prev ? prev->permuter : NO_PERM, false, false, false,\n                       dup, cert, prev, nullptr, &step->beams_[index]);\n    }\n  } else {\n    cert *= dict_ratio;\n    if (cert >= kMinCertainty || code == null_char_) {\n      PushHeapIfBetter(kBeamWidths[length], code, unichar_id,\n                       prev ? prev->permuter : TOP_CHOICE_PERM, false, false,\n                       false, dup, cert, prev, nullptr, &step->beams_[index]);\n    }\n  }\n}\n\n// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter,\n// dawg_start, word_start, end, dup, cert, prev, d) to heap if there is room\n// or if better than the current worst element if already full.\nvoid RecodeBeamSearch::PushHeapIfBetter(int max_size, int code, int unichar_id,\n                                        PermuterType permuter, bool dawg_start,\n                                        bool word_start, bool end, bool dup,\n                                        float cert, const RecodeNode *prev,\n                                        DawgPositionVector *d,\n                                        RecodeHeap *heap) {\n  float score = cert;\n  if (prev != nullptr) {\n    score += prev->score;\n  }\n  if (heap->size() < max_size || score > heap->PeekTop().data().score) {\n    uint64_t hash = ComputeCodeHash(code, dup, prev);\n    RecodeNode node(code, unichar_id, permuter, dawg_start, word_start, end,\n                    dup, cert, score, prev, d, hash);\n    if (UpdateHeapIfMatched(&node, heap)) {\n      return;\n    }\n    RecodePair entry(score, node);\n    heap->Push(&entry);\n    ASSERT_HOST(entry.data().dawgs == nullptr);\n    if (heap->size() > max_size) {\n      heap->Pop(&entry);\n    }\n  } else {\n    delete d;\n  }\n}\n\n// Adds a 
RecodeNode to heap if there is room\n// or if better than the current worst element if already full.\nvoid RecodeBeamSearch::PushHeapIfBetter(int max_size, RecodeNode *node,\n                                        RecodeHeap *heap) {\n  if (heap->size() < max_size || node->score > heap->PeekTop().data().score) {\n    if (UpdateHeapIfMatched(node, heap)) {\n      return;\n    }\n    RecodePair entry(node->score, *node);\n    heap->Push(&entry);\n    ASSERT_HOST(entry.data().dawgs == nullptr);\n    if (heap->size() > max_size) {\n      heap->Pop(&entry);\n    }\n  }\n}\n\n// Searches the heap for a matching entry, and updates the score with\n// reshuffle if needed. Returns true if there was a match.\nbool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node,\n                                           RecodeHeap *heap) {\n  // TODO(rays) consider hash map instead of linear search.\n  // It might not be faster because the hash map would have to be updated\n  // every time a heap reshuffle happens, and that would be a lot of overhead.\n  std::vector<RecodePair> &nodes = heap->heap();\n  for (auto &i : nodes) {\n    RecodeNode &node = i.data();\n    if (node.code == new_node->code && node.code_hash == new_node->code_hash &&\n        node.permuter == new_node->permuter &&\n        node.start_of_dawg == new_node->start_of_dawg) {\n      if (new_node->score > node.score) {\n        // The new one is better. Update the entire node in the heap and\n        // reshuffle.\n        node = *new_node;\n        i.key() = node.score;\n        heap->Reshuffle(&i);\n      }\n      return true;\n    }\n  }\n  return false;\n}\n\n// Computes and returns the code-hash for the given code and prev.\nuint64_t RecodeBeamSearch::ComputeCodeHash(int code, bool dup,\n                                           const RecodeNode *prev) const {\n  uint64_t hash = prev == nullptr ? 
0 : prev->code_hash;\n  if (!dup && code != null_char_) {\n    int num_classes = recoder_.code_range();\n    uint64_t carry = (((hash >> 32) * num_classes) >> 32);\n    hash *= num_classes;\n    hash += carry;\n    hash += code;\n  }\n  return hash;\n}\n\n// Backtracks to extract the best path through the lattice that was built\n// during Decode. On return the best_nodes vector essentially contains the set\n// of code, score pairs that make the optimal path with the constraint that\n// the recoder can decode the code sequence back to a sequence of unichar-ids.\nvoid RecodeBeamSearch::ExtractBestPaths(\n    std::vector<const RecodeNode *> *best_nodes,\n    std::vector<const RecodeNode *> *second_nodes) const {\n  // Scan both beams to extract the best and second best paths.\n  const RecodeNode *best_node = nullptr;\n  const RecodeNode *second_best_node = nullptr;\n  const RecodeBeam *last_beam = beam_[beam_size_ - 1];\n  for (int c = 0; c < NC_COUNT; ++c) {\n    if (c == NC_ONLY_DUP) {\n      continue;\n    }\n    auto cont = static_cast<NodeContinuation>(c);\n    for (int is_dawg = 0; is_dawg < 2; ++is_dawg) {\n      int beam_index = BeamIndex(is_dawg, cont, 0);\n      int heap_size = last_beam->beams_[beam_index].size();\n      for (int h = 0; h < heap_size; ++h) {\n        const RecodeNode *node = &last_beam->beams_[beam_index].get(h).data();\n        if (is_dawg) {\n          // dawg_node may be a null_char, or duplicate, so scan back to the\n          // last valid unichar_id.\n          const RecodeNode *dawg_node = node;\n          while (dawg_node != nullptr &&\n                 (dawg_node->unichar_id == INVALID_UNICHAR_ID ||\n                  dawg_node->duplicate)) {\n            dawg_node = dawg_node->prev;\n          }\n          if (dawg_node == nullptr ||\n              (!dawg_node->end_of_word &&\n               dawg_node->unichar_id != UNICHAR_SPACE)) {\n            // Dawg node is not valid.\n            continue;\n          }\n        }\n        if 
(best_node == nullptr || node->score > best_node->score) {\n          second_best_node = best_node;\n          best_node = node;\n        } else if (second_best_node == nullptr ||\n                   node->score > second_best_node->score) {\n          second_best_node = node;\n        }\n      }\n    }\n  }\n  if (second_nodes != nullptr) {\n    ExtractPath(second_best_node, second_nodes);\n  }\n  ExtractPath(best_node, best_nodes);\n}\n\n// Helper backtracks through the lattice from the given node, storing the\n// path and reversing it.\nvoid RecodeBeamSearch::ExtractPath(\n    const RecodeNode *node, std::vector<const RecodeNode *> *path) const {\n  path->clear();\n  while (node != nullptr) {\n    path->push_back(node);\n    node = node->prev;\n  }\n  std::reverse(path->begin(), path->end());\n}\n\nvoid RecodeBeamSearch::ExtractPath(const RecodeNode *node,\n                                   std::vector<const RecodeNode *> *path,\n                                   int limiter) const {\n  int pathcounter = 0;\n  path->clear();\n  while (node != nullptr && pathcounter < limiter) {\n    path->push_back(node);\n    node = node->prev;\n    ++pathcounter;\n  }\n  std::reverse(path->begin(), path->end());\n}\n\n// Helper prints debug information on the given lattice path.\nvoid RecodeBeamSearch::DebugPath(\n    const UNICHARSET *unicharset,\n    const std::vector<const RecodeNode *> &path) const {\n  for (unsigned c = 0; c < path.size(); ++c) {\n    const RecodeNode &node = *path[c];\n    tprintf(\"%u \", c);\n    node.Print(null_char_, *unicharset, 1);\n  }\n}\n\n// Helper prints debug information on the given unichar path.\nvoid RecodeBeamSearch::DebugUnicharPath(\n    const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path,\n    const std::vector<int> &unichar_ids, const std::vector<float> &certs,\n    const std::vector<float> &ratings, const std::vector<int> &xcoords) const {\n  auto num_ids = unichar_ids.size();\n  double total_rating = 0.0;\n  
for (unsigned c = 0; c < num_ids; ++c) {\n    int coord = xcoords[c];\n    tprintf(\"%d %d=%s r=%g, c=%g, s=%d, e=%d, perm=%d\\n\", coord, unichar_ids[c],\n            unicharset->debug_str(unichar_ids[c]).c_str(), ratings[c], certs[c],\n            path[coord]->start_of_word, path[coord]->end_of_word,\n            path[coord]->permuter);\n    total_rating += ratings[c];\n  }\n  tprintf(\"Path total rating = %g\\n\", total_rating);\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/recodebeam.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        recodebeam.h\n// Description: Beam search to decode from the re-encoded CJK as a sequence of\n//              smaller numbers in place of a single large code.\n// Author:      Ray Smith\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_\n#define THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_\n\n#include \"dawg.h\"\n#include \"dict.h\"\n#include \"genericheap.h\"\n#include \"genericvector.h\"\n#include \"kdpair.h\"\n#include \"networkio.h\"\n#include \"ratngs.h\"\n#include \"unicharcompress.h\"\n\n#include <unordered_set> // for std::unordered_set\n#include <vector>        // for std::vector\n\nnamespace tesseract {\n\n// Enum describing what can follow the current node.\n// Consider the following softmax outputs:\n// Timestep    0    1    2    3    4    5    6    7    8\n// X-score    0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01\n// Y-score    0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01\n// Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98\n// Then the correct CTC decoding (in which adjacent equal classes are folded,\n// and then all nulls are dropped) is clearly XYX, but simple decoding (taking\n// the max at each timestep) leads to:\n// Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 
X@0.95 Null@0.98,\n// which folds to the correct XYX. The conversion to Tesseract rating and\n// certainty uses the sum of the log probs (log of the product of probabilities)\n// for the Rating and the minimum log prob for the certainty, but that yields a\n// minimum certainty of log(0.55), which is poor for such an obvious case.\n// CTC says that the probability of the result is the SUM of the products of the\n// probabilities over ALL PATHS that decode to the same result, which includes:\n// NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX.\n// That is intractable, so some compromise between simple and ideal is needed.\n// Observing that evenly split timesteps rarely happen next to each other, we\n// allow scores at a transition between classes to be added for decoding thus:\n// N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98.\n// This works because NNX and NXX both decode to X, so in the middle we can use\n// N+X. Note that the classes either side of a sum must stand alone, i.e. use a\n// single score, to force all paths to pass through them and decode to the same\n// result. Also in the special case of a transition from X to Y, with only one\n// timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all\n// decode to XY.\n// An important condition is that we cannot combine X and Null between two\n// stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for\n// X and Null have to go in separate paths. 
Combining scores in this way\n// provides a much better minimum certainty of log(0.95).\n// In the implementation of the beam search, we have to place the possibilities\n// X, X+N and X+Y+N in the beam under appropriate conditions of the previous\n// node, and constrain what can follow, to enforce the rules explained above.\n// We therefore have 3 different types of node determined by what can follow:\nenum NodeContinuation {\n  NC_ANYTHING, // This node used just its own score, so anything can follow.\n  NC_ONLY_DUP, // The current node combined another score with the score for\n               // itself, without a stand-alone duplicate before, so must be\n               // followed by a stand-alone duplicate.\n  NC_NO_DUP,   // The current node combined another score with the score for\n               // itself, after a stand-alone, so can only be followed by\n               // something other than a duplicate of the current node.\n  NC_COUNT\n};\n\n// Enum describing the top-n status of a code.\nenum TopNState {\n  TN_TOP2,     // Winner or 2nd.\n  TN_TOPN,     // Runner up in top-n, but not 1st or 2nd.\n  TN_ALSO_RAN, // Not in the top-n.\n  TN_COUNT\n};\n\n// Lattice element for Re-encode beam search.\nstruct RecodeNode {\n  RecodeNode()\n      : code(-1)\n      , unichar_id(INVALID_UNICHAR_ID)\n      , permuter(TOP_CHOICE_PERM)\n      , start_of_dawg(false)\n      , start_of_word(false)\n      , end_of_word(false)\n      , duplicate(false)\n      , certainty(0.0f)\n      , score(0.0f)\n      , prev(nullptr)\n      , dawgs(nullptr)\n      , code_hash(0) {}\n  RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start, bool word_start, bool end,\n             bool dup, float cert, float s, const RecodeNode *p, DawgPositionVector *d,\n             uint64_t hash)\n      : code(c)\n      , unichar_id(uni_id)\n      , permuter(perm)\n      , start_of_dawg(dawg_start)\n      , start_of_word(word_start)\n      , end_of_word(end)\n      , duplicate(dup)\n      , 
certainty(cert)\n      , score(s)\n      , prev(p)\n      , dawgs(d)\n      , code_hash(hash) {}\n  // NOTE: If we could use C++11, then this would be a move constructor.\n  // Instead we have copy constructor that does a move!! This is because we\n  // don't want to copy the whole DawgPositionVector each time, and true\n  // copying isn't necessary for this struct. It does get moved around a lot\n  // though inside the heap and during heap push, hence the move semantics.\n  RecodeNode(const RecodeNode &src) : dawgs(nullptr) {\n    *this = src;\n    ASSERT_HOST(src.dawgs == nullptr);\n  }\n  RecodeNode &operator=(const RecodeNode &src) {\n    delete dawgs;\n    memcpy(this, &src, sizeof(src));\n    ((RecodeNode &)src).dawgs = nullptr;\n    return *this;\n  }\n  ~RecodeNode() {\n    delete dawgs;\n  }\n  // Prints details of the node.\n  void Print(int null_char, const UNICHARSET &unicharset, int depth) const;\n\n  // The re-encoded code here = index to network output.\n  int code;\n  // The decoded unichar_id is only valid for the final code of a sequence.\n  int unichar_id;\n  // The type of permuter active at this point. Intervals between start_of_word\n  // and end_of_word make valid words of type given by permuter where\n  // end_of_word is true. These aren't necessarily delimited by spaces.\n  PermuterType permuter;\n  // True if this is the initial dawg state. May be attached to a space or,\n  // in a non-space-delimited lang, the end of the previous word.\n  bool start_of_dawg;\n  // True if this is the first node in a dictionary word.\n  bool start_of_word;\n  // True if this represents a valid candidate end of word position. Does not\n  // necessarily mark the end of a word, since a word can be extended beyond a\n  // candidate end by a continuation, eg 'the' continues to 'these'.\n  bool end_of_word;\n  // True if this->code is a duplicate of prev->code. 
Some training modes\n  // allow the network to output duplicate characters and crush them with CTC,\n  // but that would mess up the dictionary search, so we just smash them\n  // together on the fly using the duplicate flag.\n  bool duplicate;\n  // Certainty (log prob) of (just) this position.\n  float certainty;\n  // Total certainty of the path to this position.\n  float score;\n  // The previous node in this chain. Borrowed pointer.\n  const RecodeNode *prev;\n  // The currently active dawgs at this position. Owned pointer.\n  DawgPositionVector *dawgs;\n  // A hash of all codes in the prefix and this->code as well. Used for\n  // duplicate path removal.\n  uint64_t code_hash;\n};\n\nusing RecodePair = KDPairInc<double, RecodeNode>;\nusing RecodeHeap = GenericHeap<RecodePair>;\n\n// Class that holds the entire beam search for recognition of a text line.\nclass TESS_API RecodeBeamSearch {\npublic:\n  // Borrows the pointer, which is expected to survive until *this is deleted.\n  RecodeBeamSearch(const UnicharCompress &recoder, int null_char, bool simple_text, Dict *dict);\n  ~RecodeBeamSearch();\n\n  // Decodes the set of network outputs, storing the lattice internally.\n  // If charset is not null, it enables detailed debugging of the beam search.\n  void Decode(const NetworkIO &output, double dict_ratio, double cert_offset,\n              double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode = 0);\n  void Decode(const GENERIC_2D_ARRAY<float> &output, double dict_ratio, double cert_offset,\n              double worst_dict_cert, const UNICHARSET *charset);\n\n  void DecodeSecondaryBeams(const NetworkIO &output, double dict_ratio, double cert_offset,\n                            double worst_dict_cert, const UNICHARSET *charset,\n                            int lstm_choice_mode = 0);\n\n  // Returns the best path as labels/scores/xcoords similar to simple CTC.\n  void ExtractBestPathAsLabels(std::vector<int> *labels, std::vector<int> *xcoords) 
const;\n  // Returns the best path as unichar-ids/certs/ratings/xcoords skipping\n  // duplicates, nulls and intermediate parts.\n  void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET *unicharset,\n                                   std::vector<int> *unichar_ids, std::vector<float> *certs,\n                                   std::vector<float> *ratings, std::vector<int> *xcoords) const;\n\n  // Returns the best path as a set of WERD_RES.\n  void ExtractBestPathAsWords(const TBOX &line_box, float scale_factor, bool debug,\n                              const UNICHARSET *unicharset, PointerVector<WERD_RES> *words,\n                              int lstm_choice_mode = 0);\n\n  // Generates debug output of the content of the beams after a Decode.\n  void DebugBeams(const UNICHARSET &unicharset) const;\n\n  // Extract the best characters from the current decode iteration and block\n  // those symbols for the next iteration. In contrast to Tesseract's standard\n  // method to chose the best overall node chain, this methods looks at a short\n  // node chain segmented by the character boundaries and chooses the best\n  // option independent of the remaining node chain.\n  void extractSymbolChoices(const UNICHARSET *unicharset);\n\n  // Generates debug output of the content of the beams after a Decode.\n  void PrintBeam2(bool uids, int num_outputs, const UNICHARSET *charset, bool secondary) const;\n  // Segments the timestep bundle by the character_boundaries.\n  void segmentTimestepsByCharacters();\n  std::vector<std::vector<std::pair<const char *, float>>>\n  // Unions the segmented timestep character bundles to one big bundle.\n  combineSegmentedTimesteps(\n      std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *segmentedTimesteps);\n  // Stores the alternative characters of every timestep together with their\n  // probability.\n  std::vector<std::vector<std::pair<const char *, float>>> timesteps;\n  
std::vector<std::vector<std::vector<std::pair<const char *, float>>>> segmentedTimesteps;\n  // Stores the character choices found in the ctc algorithm\n  std::vector<std::vector<std::pair<const char *, float>>> ctc_choices;\n  // Stores all unicharids which are excluded for future iterations\n  std::vector<std::unordered_set<int>> excludedUnichars;\n  // Stores the character boundaries regarding timesteps.\n  std::vector<int> character_boundaries_;\n  // Clipping value for certainty inside Tesseract. Reflects the minimum value\n  // of certainty that will be returned by ExtractBestPathAsUnicharIds.\n  // Supposedly on a uniform scale that can be compared across languages and\n  // engines.\n  static constexpr float kMinCertainty = -20.0f;\n  // Number of different code lengths for which we have a separate beam.\n  static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1;\n  // Total number of beams: dawg/nodawg * number of NodeContinuation * number\n  // of different lengths.\n  static const int kNumBeams = 2 * NC_COUNT * kNumLengths;\n  // Returns the relevant factor in the beams_ index.\n  static int LengthFromBeamsIndex(int index) {\n    return index % kNumLengths;\n  }\n  static NodeContinuation ContinuationFromBeamsIndex(int index) {\n    return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);\n  }\n  static bool IsDawgFromBeamsIndex(int index) {\n    return index / (kNumLengths * NC_COUNT) > 0;\n  }\n  // Computes a beams_ index from the given factors.\n  static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {\n    return (is_dawg * NC_COUNT + cont) * kNumLengths + length;\n  }\n\nprivate:\n  // Struct for the Re-encode beam search. This struct holds the data for\n  // a single time-step position of the output. 
Use a vector<RecodeBeam>\n  // to hold all the timesteps and prevent reallocation of the individual heaps.\n  struct RecodeBeam {\n    // Resets to the initial state without deleting all the memory.\n    void Clear() {\n      for (auto &beam : beams_) {\n        beam.clear();\n      }\n      RecodeNode empty;\n      for (auto &best_initial_dawg : best_initial_dawgs_) {\n        best_initial_dawg = empty;\n      }\n    }\n\n    // A separate beam for each combination of code length,\n    // NodeContinuation, and dictionary flag. Separating out all these types\n    // allows the beam to be quite narrow, and yet still have a low chance of\n    // losing the best path.\n    // We have to keep all these beams separate, since the highest scoring paths\n    // come from the paths that are most likely to dead-end at any time, like\n    // dawg paths, NC_ONLY_DUP etc.\n    // Each heap is stored with the WORST result at the top, so we can quickly\n    // get the top-n values.\n    RecodeHeap beams_[kNumBeams];\n    // While the language model is only a single word dictionary, we can use\n    // word starts as a choke point in the beam, and keep only a single dict\n    // start node at each step (for each NodeContinuation type), so we find the\n    // best one here and push it on the heap, if it qualifies, after processing\n    // all of the step.\n    RecodeNode best_initial_dawgs_[NC_COUNT];\n  };\n  using TopPair = KDPairInc<float, int>;\n\n  // Generates debug output of the content of a single beam position.\n  void DebugBeamPos(const UNICHARSET &unicharset, const RecodeHeap &heap) const;\n\n  // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping\n  // duplicates, nulls and intermediate parts.\n  static void ExtractPathAsUnicharIds(const std::vector<const RecodeNode *> &best_nodes,\n                                      std::vector<int> *unichar_ids, std::vector<float> *certs,\n                                      std::vector<float> *ratings, 
std::vector<int> *xcoords,\n                                      std::vector<int> *character_boundaries = nullptr);\n\n  // Sets up a word with the ratings matrix and fake blobs with boxes in the\n  // right places.\n  WERD_RES *InitializeWord(bool leading_space, const TBOX &line_box, int word_start, int word_end,\n                           float space_certainty, const UNICHARSET *unicharset,\n                           const std::vector<int> &xcoords, float scale_factor);\n\n  // Fills top_n_flags_ with bools that are true iff the corresponding output\n  // is one of the top_n.\n  void ComputeTopN(const float *outputs, int num_outputs, int top_n);\n\n  void ComputeSecTopN(std::unordered_set<int> *exList, const float *outputs, int num_outputs,\n                      int top_n);\n\n  // Adds the computation for the current time-step to the beam. Call at each\n  // time-step in sequence from left to right. outputs is the activation vector\n  // for the current timestep.\n  void DecodeStep(const float *outputs, int t, double dict_ratio, double cert_offset,\n                  double worst_dict_cert, const UNICHARSET *charset, bool debug = false);\n\n  void DecodeSecondaryStep(const float *outputs, int t, double dict_ratio, double cert_offset,\n                           double worst_dict_cert, const UNICHARSET *charset, bool debug = false);\n\n  // Saves the most certain choices for the current time-step.\n  void SaveMostCertainChoices(const float *outputs, int num_outputs, const UNICHARSET *charset,\n                              int xCoord);\n\n  // Calculates more accurate character boundaries which can be used to\n  // provide more accurate alternative symbol choices.\n  static void calculateCharBoundaries(std::vector<int> *starts, std::vector<int> *ends,\n                                      std::vector<int> *character_boundaries_, int maxWidth);\n\n  // Adds to the appropriate beams the legal (according to recoder)\n  // continuations of context prev, which is 
from the given index to beams_,\n  // using the given network outputs to provide scores to the choices. Uses only\n  // those choices for which top_n_flags[code] == top_n_flag.\n  void ContinueContext(const RecodeNode *prev, int index, const float *outputs,\n                       TopNState top_n_flag, const UNICHARSET *unicharset, double dict_ratio,\n                       double cert_offset, double worst_dict_cert, RecodeBeam *step);\n  // Continues for a new unichar, using dawg or non-dawg as per flag.\n  void ContinueUnichar(int code, int unichar_id, float cert, float worst_dict_cert,\n                       float dict_ratio, bool use_dawgs, NodeContinuation cont,\n                       const RecodeNode *prev, RecodeBeam *step);\n  // Adds a RecodeNode composed of the args to the correct heap in step if\n  // unichar_id is a valid dictionary continuation of whatever is in prev.\n  void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,\n                    const RecodeNode *prev, RecodeBeam *step);\n  // Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args\n  // if better than what is already there.\n  void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter, bool start,\n                               bool end, float cert, NodeContinuation cont, const RecodeNode *prev,\n                               RecodeBeam *step);\n  // Adds a RecodeNode composed of the args to the correct heap in step for\n  // partial unichar or duplicate if there is room or if better than the\n  // current worst element if already full.\n  void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id, float cert,\n                               float worst_dict_cert, float dict_ratio, bool use_dawgs,\n                               NodeContinuation cont, const RecodeNode *prev, RecodeBeam *step);\n  // Adds a RecodeNode composed of the args to the correct heap in step if there\n  // is room or if better 
than the current worst element if already full.\n  void PushHeapIfBetter(int max_size, int code, int unichar_id, PermuterType permuter,\n                        bool dawg_start, bool word_start, bool end, bool dup, float cert,\n                        const RecodeNode *prev, DawgPositionVector *d, RecodeHeap *heap);\n  // Adds a RecodeNode to heap if there is room\n  // or if better than the current worst element if already full.\n  void PushHeapIfBetter(int max_size, RecodeNode *node, RecodeHeap *heap);\n  // Searches the heap for an entry matching new_node, and updates the entry\n  // with reshuffle if needed. Returns true if there was a match.\n  bool UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *heap);\n  // Computes and returns the code-hash for the given code and prev.\n  uint64_t ComputeCodeHash(int code, bool dup, const RecodeNode *prev) const;\n  // Backtracks to extract the best path through the lattice that was built\n  // during Decode. On return the best_nodes vector essentially contains the set\n  // of code, score pairs that make the optimal path with the constraint that\n  // the recoder can decode the code sequence back to a sequence of unichar-ids.\n  void ExtractBestPaths(std::vector<const RecodeNode *> *best_nodes,\n                        std::vector<const RecodeNode *> *second_nodes) const;\n  // Helper backtracks through the lattice from the given node, storing the\n  // path and reversing it.\n  void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path) const;\n  void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path,\n                   int limiter) const;\n  // Helper prints debug information on the given lattice path.\n  void DebugPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path) const;\n  // Helper prints debug information on the given unichar path.\n  void DebugUnicharPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path,\n        
                const std::vector<int> &unichar_ids, const std::vector<float> &certs,\n                        const std::vector<float> &ratings, const std::vector<int> &xcoords) const;\n\n  static const int kBeamWidths[RecodedCharID::kMaxCodeLen + 1];\n\n  // The encoder/decoder that we will be using.\n  const UnicharCompress &recoder_;\n  // The beam for each timestep in the output.\n  std::vector<RecodeBeam *> beam_;\n  // Secondary Beam for Results with less Probability\n  std::vector<RecodeBeam *> secondary_beam_;\n  // The number of timesteps valid in beam_;\n  int beam_size_;\n  // A flag to indicate which outputs are the top-n choices. Current timestep\n  // only.\n  std::vector<TopNState> top_n_flags_;\n  // A record of the highest and second scoring codes.\n  int top_code_;\n  int second_code_;\n  // Heap used to compute the top_n_flags_.\n  GenericHeap<TopPair> top_heap_;\n  // Borrowed pointer to the dictionary to use in the search.\n  Dict *dict_;\n  // True if the language is space-delimited, which is true for most languages\n  // except chi*, jpn, tha.\n  bool space_delimited_;\n  // True if the input is simple text, ie adjacent equal chars are not to be\n  // eliminated.\n  bool is_simple_text_;\n  // The encoded (class label) of the null/reject character.\n  int null_char_;\n};\n\n} // namespace tesseract.\n\n#endif // THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_\n"
  },
  {
    "path": "src/lstm/reconfig.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        reconfig.cpp\n// Description: Network layer that reconfigures the scaling vs feature\n//              depth.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"reconfig.h\"\n\nnamespace tesseract {\n\nReconfig::Reconfig(const std::string &name, int ni, int x_scale, int y_scale)\n    : Network(NT_RECONFIG, name, ni, ni * x_scale * y_scale)\n    , x_scale_(x_scale)\n    , y_scale_(y_scale) {}\n\n// Returns the shape output from the network given an input shape (which may\n// be partially unknown ie zero).\nStaticShape Reconfig::OutputShape(const StaticShape &input_shape) const {\n  StaticShape result = input_shape;\n  result.set_height(result.height() / y_scale_);\n  result.set_width(result.width() / x_scale_);\n  if (type_ != NT_MAXPOOL) {\n    result.set_depth(result.depth() * y_scale_ * x_scale_);\n  }\n  return result;\n}\n\n// Returns an integer reduction factor that the network applies to the\n// time sequence. Assumes that any 2-d is already eliminated. Used for\n// scaling bounding boxes of truth data.\n// WARNING: if GlobalMinimax is used to vary the scale, this will return\n// the last used scale factor. 
Call it before any forward, and it will return\n// the minimum scale factor of the paths through the GlobalMinimax.\nint Reconfig::XScaleFactor() const {\n  return x_scale_;\n}\n\n// Writes to the given file. Returns false in case of error.\nbool Reconfig::Serialize(TFile *fp) const {\n  return Network::Serialize(fp) && fp->Serialize(&x_scale_) && fp->Serialize(&y_scale_);\n}\n\n// Reads from the given file. Returns false in case of error.\nbool Reconfig::DeSerialize(TFile *fp) {\n  if (!fp->DeSerialize(&x_scale_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&y_scale_)) {\n    return false;\n  }\n  no_ = ni_ * x_scale_ * y_scale_;\n  return true;\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid Reconfig::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                       NetworkScratch *scratch, NetworkIO *output) {\n  output->ResizeScaled(input, x_scale_, y_scale_, no_);\n  back_map_ = input.stride_map();\n  StrideMap::Index dest_index(output->stride_map());\n  do {\n    int out_t = dest_index.t();\n    StrideMap::Index src_index(input.stride_map(), dest_index.index(FD_BATCH),\n                               dest_index.index(FD_HEIGHT) * y_scale_,\n                               dest_index.index(FD_WIDTH) * x_scale_);\n    // Stack x_scale_ groups of y_scale_ inputs together.\n    for (int x = 0; x < x_scale_; ++x) {\n      for (int y = 0; y < y_scale_; ++y) {\n        StrideMap::Index src_xy(src_index);\n        if (src_xy.AddOffset(x, FD_WIDTH) && src_xy.AddOffset(y, FD_HEIGHT)) {\n          output->CopyTimeStepGeneral(out_t, (x * y_scale_ + y) * ni_, ni_, input, src_xy.t(), 0);\n        }\n      }\n    }\n  } while (dest_index.Increment());\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool Reconfig::Backward(bool debug, const NetworkIO 
&fwd_deltas, NetworkScratch *scratch,\n                        NetworkIO *back_deltas) {\n  back_deltas->ResizeToMap(fwd_deltas.int_mode(), back_map_, ni_);\n  StrideMap::Index src_index(fwd_deltas.stride_map());\n  do {\n    int in_t = src_index.t();\n    StrideMap::Index dest_index(back_deltas->stride_map(), src_index.index(FD_BATCH),\n                                src_index.index(FD_HEIGHT) * y_scale_,\n                                src_index.index(FD_WIDTH) * x_scale_);\n    // Unstack x_scale_ groups of y_scale_ inputs that are together.\n    for (int x = 0; x < x_scale_; ++x) {\n      for (int y = 0; y < y_scale_; ++y) {\n        StrideMap::Index dest_xy(dest_index);\n        if (dest_xy.AddOffset(x, FD_WIDTH) && dest_xy.AddOffset(y, FD_HEIGHT)) {\n          back_deltas->CopyTimeStepGeneral(dest_xy.t(), 0, ni_, fwd_deltas, in_t,\n                                           (x * y_scale_ + y) * ni_);\n        }\n      }\n    }\n  } while (src_index.Increment());\n  return needs_to_backprop_;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/reconfig.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        reconfig.h\n// Description: Network layer that reconfigures the scaling vs feature\n//              depth.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_RECONFIG_H_\n#define TESSERACT_LSTM_RECONFIG_H_\n\n#include \"matrix.h\"\n#include \"network.h\"\n\nnamespace tesseract {\n\n// Reconfigures (Shrinks) the inputs by concatenating an x_scale by y_scale tile\n// of inputs together, producing a single, deeper output per tile.\n// Note that fractional parts are truncated for efficiency, so make sure the\n// input stride is a multiple of the y_scale factor!\nclass Reconfig : public Network {\npublic:\n  TESS_API\n  Reconfig(const std::string &name, int ni, int x_scale, int y_scale);\n  ~Reconfig() override = default;\n\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  StaticShape OutputShape(const StaticShape &input_shape) const override;\n\n  std::string spec() const override {\n    return \"S\" + std::to_string(y_scale_) + \",\" + std::to_string(x_scale_);\n  }\n\n  // Returns an integer reduction factor that the network applies to the\n  // time sequence. Assumes that any 2-d is already eliminated. 
Used for\n  // scaling bounding boxes of truth data.\n  // WARNING: if GlobalMinimax is used to vary the scale, this will return\n  // the last used scale factor. Call it before any forward, and it will return\n  // the minimum scale factor of the paths through the GlobalMinimax.\n  int XScaleFactor() const override;\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(TFile *fp) const override;\n  // Reads from the given file. Returns false in case of error.\n  bool DeSerialize(TFile *fp) override;\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n\nprivate:\n  void DebugWeights() override {\n    tprintf(\"Must override Network::DebugWeights for type %d\\n\", type_);\n  }\n\nprotected:\n  // Non-serialized data used to store parameters between forward and back.\n  StrideMap back_map_;\n  // Serialized data.\n  int32_t x_scale_;\n  int32_t y_scale_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_SUBSAMPLE_H_\n"
  },
  {
    "path": "src/lstm/reversed.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        reversed.cpp\n// Description: Runs a single network on time-reversed input, reversing output.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"reversed.h\"\n\n#include <cstdio>\n\n#include \"networkscratch.h\"\n\nnamespace tesseract {\n\nReversed::Reversed(const std::string &name, NetworkType type) : Plumbing(name) {\n  type_ = type;\n}\n\n// Returns the shape output from the network given an input shape (which may\n// be partially unknown ie zero).\nStaticShape Reversed::OutputShape(const StaticShape &input_shape) const {\n  if (type_ == NT_XYTRANSPOSE) {\n    StaticShape x_shape(input_shape);\n    x_shape.set_width(input_shape.height());\n    x_shape.set_height(input_shape.width());\n    x_shape = stack_[0]->OutputShape(x_shape);\n    x_shape.SetShape(x_shape.batch(), x_shape.width(), x_shape.height(), x_shape.depth());\n    return x_shape;\n  }\n  return stack_[0]->OutputShape(input_shape);\n}\n\n// Takes ownership of the given network to make it the reversed one.\nvoid Reversed::SetNetwork(Network *network) {\n  stack_.clear();\n  AddToStack(network);\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid Reversed::Forward(bool 
debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                       NetworkScratch *scratch, NetworkIO *output) {\n  NetworkScratch::IO rev_input(input, scratch);\n  ReverseData(input, rev_input);\n  NetworkScratch::IO rev_output(input, scratch);\n  stack_[0]->Forward(debug, *rev_input, nullptr, scratch, rev_output);\n  ReverseData(*rev_output, output);\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool Reversed::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                        NetworkIO *back_deltas) {\n  NetworkScratch::IO rev_input(fwd_deltas, scratch);\n  ReverseData(fwd_deltas, rev_input);\n  NetworkScratch::IO rev_output(fwd_deltas, scratch);\n  if (stack_[0]->Backward(debug, *rev_input, scratch, rev_output)) {\n    ReverseData(*rev_output, back_deltas);\n    return true;\n  }\n  return false;\n}\n\n// Copies src to *dest with the reversal according to type_.\nvoid Reversed::ReverseData(const NetworkIO &src, NetworkIO *dest) const {\n  if (type_ == NT_XREVERSED) {\n    dest->CopyWithXReversal(src);\n  } else if (type_ == NT_YREVERSED) {\n    dest->CopyWithYReversal(src);\n  } else {\n    dest->CopyWithXYTranspose(src);\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/reversed.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        reversed.h\n// Description: Runs a single network on time-reversed input, reversing output.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_REVERSED_H_\n#define TESSERACT_LSTM_REVERSED_H_\n\n#include \"matrix.h\"\n#include \"plumbing.h\"\n\nnamespace tesseract {\n\n// C++ Implementation of the Reversed class from lstm.py.\nclass Reversed : public Plumbing {\npublic:\n  TESS_API\n  explicit Reversed(const std::string &name, NetworkType type);\n  ~Reversed() override = default;\n\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  StaticShape OutputShape(const StaticShape &input_shape) const override;\n\n  std::string spec() const override {\n    std::string spec(type_ == NT_XREVERSED ? \"Rx\" : (type_ == NT_YREVERSED ? \"Ry\" : \"Txy\"));\n    // For most simple cases, we will output Rx<net> or Ry<net> where <net> is\n    // the network in stack_[0], but in the special case that <net> is an\n    // LSTM, we will just output the LSTM's spec modified to take the reversal\n    // into account. 
This is because when the user specified Lfy64, we actually\n    // generated TxyLfx64, and if the user specified Lrx64 we actually\n    // generated RxLfx64, and we want to display what the user asked for.\n    std::string net_spec(stack_[0]->spec());\n    if (net_spec[0] == 'L') {\n      // Setup a from and to character according to the type of the reversal\n      // such that the LSTM spec gets modified to the spec that the user\n      // asked for\n      char from = 'f';\n      char to = 'r';\n      if (type_ == NT_XYTRANSPOSE) {\n        from = 'x';\n        to = 'y';\n      }\n      // Change the from char to the to char.\n      for (auto &it : net_spec) {\n        if (it == from) {\n          it = to;\n        }\n      }\n      spec += net_spec;\n      return spec;\n    }\n    spec += net_spec;\n    return spec;\n  }\n\n  // Takes ownership of the given network to make it the reversed one.\n  TESS_API\n  void SetNetwork(Network *network);\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n\nprivate:\n  // Copies src to *dest with the reversal according to type_.\n  void ReverseData(const NetworkIO &src, NetworkIO *dest) const;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_REVERSED_H_\n"
  },
  {
    "path": "src/lstm/series.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        series.cpp\n// Description: Runs networks in series on the same input.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"series.h\"\n\n#include \"fullyconnected.h\"\n#include \"networkscratch.h\"\n#include \"scrollview.h\"\n#include \"tesserrstream.h\"  // for tesserr\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// ni_ and no_ will be set by AddToStack.\nSeries::Series(const std::string &name) : Plumbing(name) {\n  type_ = NT_SERIES;\n}\n\n// Returns the shape output from the network given an input shape (which may\n// be partially unknown ie zero).\nStaticShape Series::OutputShape(const StaticShape &input_shape) const {\n  StaticShape result(input_shape);\n  int stack_size = stack_.size();\n  for (int i = 0; i < stack_size; ++i) {\n    result = stack_[i]->OutputShape(result);\n  }\n  return result;\n}\n\n// Sets up the network for training. 
Initializes weights using weights of\n// scale `range` picked according to the random number generator `randomizer`.\n// Note that series has its own implementation just for debug purposes.\nint Series::InitWeights(float range, TRand *randomizer) {\n  num_weights_ = 0;\n  tprintf(\"Num outputs,weights in Series:\\n\");\n  for (auto &i : stack_) {\n    int weights = i->InitWeights(range, randomizer);\n    tprintf(\"  %s:%d, %d\\n\", i->spec().c_str(), i->NumOutputs(), weights);\n    num_weights_ += weights;\n  }\n  tprintf(\"Total weights = %d\\n\", num_weights_);\n  return num_weights_;\n}\n\n// Recursively searches the network for softmaxes with old_no outputs,\n// and remaps their outputs according to code_map. See network.h for details.\nint Series::RemapOutputs(int old_no, const std::vector<int> &code_map) {\n  num_weights_ = 0;\n  tprintf(\"Num (Extended) outputs,weights in Series:\\n\");\n  for (auto &i : stack_) {\n    int weights = i->RemapOutputs(old_no, code_map);\n    tprintf(\"  %s:%d, %d\\n\", i->spec().c_str(), i->NumOutputs(), weights);\n    num_weights_ += weights;\n  }\n  tprintf(\"Total weights = %d\\n\", num_weights_);\n  no_ = stack_.back()->NumOutputs();\n  return num_weights_;\n}\n\n// Sets needs_to_backprop_ to needs_backprop and returns true if\n// needs_backprop || any weights in this network so the next layer forward\n// can be told to produce backprop for this layer if needed.\nbool Series::SetupNeedsBackprop(bool needs_backprop) {\n  needs_to_backprop_ = needs_backprop;\n  for (auto &i : stack_) {\n    needs_backprop = i->SetupNeedsBackprop(needs_backprop);\n  }\n  return needs_backprop;\n}\n\n// Returns an integer reduction factor that the network applies to the\n// time sequence. Assumes that any 2-d is already eliminated. Used for\n// scaling bounding boxes of truth data.\n// WARNING: if GlobalMinimax is used to vary the scale, this will return\n// the last used scale factor. 
Call it before any forward, and it will return\n// the minimum scale factor of the paths through the GlobalMinimax.\nint Series::XScaleFactor() const {\n  int factor = 1;\n  for (auto i : stack_) {\n    factor *= i->XScaleFactor();\n  }\n  return factor;\n}\n\n// Provides the (minimum) x scale factor to the network (of interest only to\n// input units) so they can determine how to scale bounding boxes.\nvoid Series::CacheXScaleFactor(int factor) {\n  stack_[0]->CacheXScaleFactor(factor);\n}\n\n// Runs forward propagation of activations on the input line.\n// See NetworkCpp for a detailed discussion of the arguments.\nvoid Series::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n                     NetworkScratch *scratch, NetworkIO *output) {\n  int stack_size = stack_.size();\n  ASSERT_HOST(stack_size > 1);\n  // Revolving intermediate buffers.\n  NetworkScratch::IO buffer1(input, scratch);\n  NetworkScratch::IO buffer2(input, scratch);\n  // Run each network in turn, giving the output of n as the input to n + 1,\n  // with the final network providing the real output.\n  stack_[0]->Forward(debug, input, input_transpose, scratch, buffer1);\n  for (int i = 1; i < stack_size; i += 2) {\n    stack_[i]->Forward(debug, *buffer1, nullptr, scratch, i + 1 < stack_size ? buffer2 : output);\n    if (i + 1 == stack_size) {\n      return;\n    }\n    stack_[i + 1]->Forward(debug, *buffer2, nullptr, scratch,\n                           i + 2 < stack_size ? 
buffer1 : output);\n  }\n}\n\n// Runs backward propagation of errors on the deltas line.\n// See NetworkCpp for a detailed discussion of the arguments.\nbool Series::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                      NetworkIO *back_deltas) {\n  if (!IsTraining()) {\n    return false;\n  }\n  int stack_size = stack_.size();\n  ASSERT_HOST(stack_size > 1);\n  // Revolving intermediate buffers.\n  NetworkScratch::IO buffer1(fwd_deltas, scratch);\n  NetworkScratch::IO buffer2(fwd_deltas, scratch);\n  // Run each network in reverse order, giving the back_deltas output of n as\n  // the fwd_deltas input to n-1, with the 0 network providing the real output.\n  if (!stack_.back()->IsTraining() ||\n      !stack_.back()->Backward(debug, fwd_deltas, scratch, buffer1)) {\n    return false;\n  }\n  for (int i = stack_size - 2; i >= 0; i -= 2) {\n    if (!stack_[i]->IsTraining() ||\n        !stack_[i]->Backward(debug, *buffer1, scratch, i > 0 ? buffer2 : back_deltas)) {\n      return false;\n    }\n    if (i == 0) {\n      return needs_to_backprop_;\n    }\n    if (!stack_[i - 1]->IsTraining() ||\n        !stack_[i - 1]->Backward(debug, *buffer2, scratch, i > 1 ? buffer1 : back_deltas)) {\n      return false;\n    }\n  }\n  return needs_to_backprop_;\n}\n\n// Splits the series after the given index, returning the two parts and\n// deletes itself. 
The first part, up to network with index last_start, goes\n// into start, and the rest goes into end.\nvoid Series::SplitAt(unsigned last_start, Series **start, Series **end) {\n  *start = nullptr;\n  *end = nullptr;\n  if (last_start >= stack_.size()) {\n    tesserr << \"Invalid split index \" << last_start\n            << \" must be in range [0,\" << stack_.size() - 1 << \"]!\\n\";\n    return;\n  }\n  auto *master_series = new Series(\"MasterSeries\");\n  auto *boosted_series = new Series(\"BoostedSeries\");\n  for (unsigned s = 0; s <= last_start; ++s) {\n    if (s + 1 == stack_.size() && stack_[s]->type() == NT_SOFTMAX) {\n      // Change the softmax to a tanh.\n      auto *fc = static_cast<FullyConnected *>(stack_[s]);\n      fc->ChangeType(NT_TANH);\n    }\n    master_series->AddToStack(stack_[s]);\n    stack_[s] = nullptr;\n  }\n  for (unsigned s = last_start + 1; s < stack_.size(); ++s) {\n    boosted_series->AddToStack(stack_[s]);\n    stack_[s] = nullptr;\n  }\n  *start = master_series;\n  *end = boosted_series;\n  delete this;\n}\n\n// Appends the elements of the src series to this, removing from src and\n// deleting it.\nvoid Series::AppendSeries(Network *src) {\n  ASSERT_HOST(src->type() == NT_SERIES);\n  auto *src_series = static_cast<Series *>(src);\n  for (auto &s : src_series->stack_) {\n    AddToStack(s);\n    s = nullptr;\n  }\n  delete src;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/series.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        series.h\n// Description: Runs networks in series on the same input.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_SERIES_H_\n#define TESSERACT_LSTM_SERIES_H_\n\n#include \"plumbing.h\"\n\nnamespace tesseract {\n\n// Runs two or more networks in series (layers) on the same input.\nclass Series : public Plumbing {\npublic:\n  // ni_ and no_ will be set by AddToStack.\n  TESS_API\n  explicit Series(const std::string &name);\n  ~Series() override = default;\n\n  // Returns the shape output from the network given an input shape (which may\n  // be partially unknown ie zero).\n  StaticShape OutputShape(const StaticShape &input_shape) const override;\n\n  std::string spec() const override {\n    std::string spec(\"[\");\n    for (auto &it : stack_) {\n      spec += it->spec();\n    }\n    spec += \"]\";\n    return spec;\n  }\n\n  // Sets up the network for training. 
Initializes weights using weights of\n  // scale `range` picked according to the random number generator `randomizer`.\n  // Returns the number of weights initialized.\n  int InitWeights(float range, TRand *randomizer) override;\n  // Recursively searches the network for softmaxes with old_no outputs,\n  // and remaps their outputs according to code_map. See network.h for details.\n  int RemapOutputs(int old_no, const std::vector<int> &code_map) override;\n\n  // Sets needs_to_backprop_ to needs_backprop and returns true if\n  // needs_backprop || any weights in this network so the next layer forward\n  // can be told to produce backprop for this layer if needed.\n  bool SetupNeedsBackprop(bool needs_backprop) override;\n\n  // Returns an integer reduction factor that the network applies to the\n  // time sequence. Assumes that any 2-d is already eliminated. Used for\n  // scaling bounding boxes of truth data.\n  // WARNING: if GlobalMinimax is used to vary the scale, this will return\n  // the last used scale factor. 
Call it before any forward, and it will return\n  // the minimum scale factor of the paths through the GlobalMinimax.\n  int XScaleFactor() const override;\n\n  // Provides the (minimum) x scale factor to the network (of interest only to\n  // input units) so they can determine how to scale bounding boxes.\n  void CacheXScaleFactor(int factor) override;\n\n  // Runs forward propagation of activations on the input line.\n  // See Network for a detailed discussion of the arguments.\n  void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose,\n               NetworkScratch *scratch, NetworkIO *output) override;\n\n  // Runs backward propagation of errors on the deltas line.\n  // See Network for a detailed discussion of the arguments.\n  bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,\n                NetworkIO *back_deltas) override;\n\n  // Splits the series after the given index, returning the two parts and\n  // deletes itself. The first part, up to network with index last_start, goes\n  // into start, and the rest goes into end.\n  TESS_API\n  void SplitAt(unsigned last_start, Series **start, Series **end);\n\n  // Appends the elements of the src series to this, removing from src and\n  // deleting it.\n  TESS_API\n  void AppendSeries(Network *src);\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_SERIES_H_\n"
  },
  {
    "path": "src/lstm/static_shape.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        static_shape.h\n// Description: Defines the size of the 4-d tensor input/output from a network.\n// Author:      Ray Smith\n// Created:     Fri Oct 14 09:07:31 PST 2016\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_STATIC_SHAPE_H_\n#define TESSERACT_LSTM_STATIC_SHAPE_H_\n\n#include \"serialis.h\" // for TFile\n#include \"tprintf.h\"  // for tprintf\n\nnamespace tesseract {\n\n// Enum describing the loss function to apply during training and/or the\n// decoding method to apply at runtime.\nenum LossType {\n  LT_NONE,     // Undefined.\n  LT_CTC,      // Softmax with standard CTC for training/decoding.\n  LT_SOFTMAX,  // Outputs sum to 1 in fixed positions.\n  LT_LOGISTIC, // Logistic outputs with independent values.\n};\n\n// Simple class to hold the tensor shape that is known at network build time\n// and the LossType of the loss function.\nclass StaticShape {\npublic:\n  StaticShape() : batch_(0), height_(0), width_(0), depth_(0), loss_type_(LT_NONE) {}\n  int batch() const {\n    return batch_;\n  }\n  void set_batch(int value) {\n    batch_ = value;\n  }\n  int height() const {\n    return height_;\n  }\n  void set_height(int value) {\n    height_ = value;\n  }\n  int width() const {\n    return width_;\n  }\n  void 
set_width(int value) {\n    width_ = value;\n  }\n  int depth() const {\n    return depth_;\n  }\n  void set_depth(int value) {\n    depth_ = value;\n  }\n  LossType loss_type() const {\n    return loss_type_;\n  }\n  void set_loss_type(LossType value) {\n    loss_type_ = value;\n  }\n  void SetShape(int batch, int height, int width, int depth) {\n    batch_ = batch;\n    height_ = height;\n    width_ = width;\n    depth_ = depth;\n  }\n\n  void Print() const {\n    tprintf(\"Batch=%d, Height=%d, Width=%d, Depth=%d, loss=%d\\n\", batch_, height_, width_, depth_,\n            loss_type_);\n  }\n\n  bool DeSerialize(TFile *fp) {\n    int32_t tmp = LT_NONE;\n    bool result = fp->DeSerialize(&batch_) && fp->DeSerialize(&height_) &&\n                  fp->DeSerialize(&width_) && fp->DeSerialize(&depth_) && fp->DeSerialize(&tmp);\n    loss_type_ = static_cast<LossType>(tmp);\n    return result;\n  }\n\n  bool Serialize(TFile *fp) const {\n    int32_t tmp = loss_type_;\n    return fp->Serialize(&batch_) && fp->Serialize(&height_) && fp->Serialize(&width_) &&\n           fp->Serialize(&depth_) && fp->Serialize(&tmp);\n  }\n\nprivate:\n  // Size of the 4-D tensor input/output to a network. A value of zero is\n  // allowed for all except depth_ and means to be determined at runtime, and\n  // regarded as variable.\n  // Number of elements in a batch, or number of frames in a video stream.\n  int32_t batch_;\n  // Height of the image.\n  int32_t height_;\n  // Width of the image.\n  int32_t width_;\n  // Depth of the image. (Number of \"nodes\").\n  int32_t depth_;\n  // How to train/interpret the output.\n  LossType loss_type_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_LSTM_STATIC_SHAPE_H_\n"
  },
  {
    "path": "src/lstm/stridemap.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        stridemap.cpp\n// Description: Indexing into a 4-d tensor held in a 2-d Array.\n// Author:      Ray Smith\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"stridemap.h\"\n#include <cassert> // for assert\n\nnamespace tesseract {\n\n// Returns true if *this is a valid index.\nbool StrideMap::Index::IsValid() const {\n  // Cheap check first.\n  for (int index : indices_) {\n    if (index < 0) {\n      return false;\n    }\n  }\n  for (int d = 0; d < FD_DIMSIZE; ++d) {\n    if (indices_[d] > MaxIndexOfDim(static_cast<FlexDimensions>(d))) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Returns true if the index of the given dimension is the last.\nbool StrideMap::Index::IsLast(FlexDimensions dimension) const {\n  return MaxIndexOfDim(dimension) == indices_[dimension];\n}\n\n// Given that the dimensions up to and including dim-1 are valid, returns the\n// maximum index for dimension dim.\nint StrideMap::Index::MaxIndexOfDim(FlexDimensions dim) const {\n  int max_index = stride_map_->shape_[dim] - 1;\n  if (dim == FD_BATCH) {\n    return max_index;\n  }\n  assert(0 <= indices_[FD_BATCH]);\n  const size_t batch = indices_[FD_BATCH];\n  if (dim == FD_HEIGHT) {\n    if (batch >= stride_map_->heights_.size() || 
stride_map_->heights_[batch] > max_index) {\n      return max_index;\n    }\n    return stride_map_->heights_[batch] - 1;\n  }\n  if (batch >= stride_map_->widths_.size() || stride_map_->widths_[batch] > max_index) {\n    return max_index;\n  }\n  return stride_map_->widths_[batch] - 1;\n}\n\n// Adds the given offset to the given dimension. Returns true if the result\n// makes a valid index.\nbool StrideMap::Index::AddOffset(int offset, FlexDimensions dimension) {\n  indices_[dimension] += offset;\n  SetTFromIndices();\n  return IsValid();\n}\n\n// Increments the index in some encapsulated way that guarantees to remain\n// valid until it returns false, meaning that the iteration is complete.\nbool StrideMap::Index::Increment() {\n  for (int d = FD_DIMSIZE - 1; d >= 0; --d) {\n    if (!IsLast(static_cast<FlexDimensions>(d))) {\n      t_ += stride_map_->t_increments_[d];\n      ++indices_[d];\n      return true;\n    }\n    t_ -= stride_map_->t_increments_[d] * indices_[d];\n    indices_[d] = 0;\n    // Now carry to the next dimension.\n  }\n  return false;\n}\n\n// Decrements the index in some encapsulated way that guarantees to remain\n// valid until it returns false, meaning that the iteration (that started\n// with InitToLast()) is complete.\nbool StrideMap::Index::Decrement() {\n  for (int d = FD_DIMSIZE - 1; d >= 0; --d) {\n    if (indices_[d] > 0) {\n      --indices_[d];\n      if (d == FD_BATCH) {\n        // The upper limits of the other dimensions may have changed as a result\n        // of a different batch index, so they have to be reset.\n        InitToLastOfBatch(indices_[FD_BATCH]);\n      } else {\n        t_ -= stride_map_->t_increments_[d];\n      }\n      return true;\n    }\n    indices_[d] = MaxIndexOfDim(static_cast<FlexDimensions>(d));\n    t_ += stride_map_->t_increments_[d] * indices_[d];\n    // Now borrow from the next dimension.\n  }\n  return false;\n}\n\n// Initializes the indices to the last valid location in the given batch\n// 
index.\nvoid StrideMap::Index::InitToLastOfBatch(int batch) {\n  indices_[FD_BATCH] = batch;\n  for (int d = FD_BATCH + 1; d < FD_DIMSIZE; ++d) {\n    indices_[d] = MaxIndexOfDim(static_cast<FlexDimensions>(d));\n  }\n  SetTFromIndices();\n}\n\n// Computes and sets t_ from the current indices_.\nvoid StrideMap::Index::SetTFromIndices() {\n  t_ = 0;\n  for (int d = 0; d < FD_DIMSIZE; ++d) {\n    t_ += stride_map_->t_increments_[d] * indices_[d];\n  }\n}\n\n// Sets up the stride for the given array of height, width pairs.\nvoid StrideMap::SetStride(const std::vector<std::pair<int, int>> &h_w_pairs) {\n  int max_height = 0;\n  int max_width = 0;\n  for (const std::pair<int, int> &hw : h_w_pairs) {\n    int height = hw.first;\n    int width = hw.second;\n    heights_.push_back(height);\n    widths_.push_back(width);\n    if (height > max_height) {\n      max_height = height;\n    }\n    if (width > max_width) {\n      max_width = width;\n    }\n  }\n  shape_[FD_BATCH] = heights_.size();\n  shape_[FD_HEIGHT] = max_height;\n  shape_[FD_WIDTH] = max_width;\n  ComputeTIncrements();\n}\n\n// Scales width and height dimensions by the given factors.\nvoid StrideMap::ScaleXY(int x_factor, int y_factor) {\n  for (int &height : heights_) {\n    height /= y_factor;\n  }\n  for (int &width : widths_) {\n    width /= x_factor;\n  }\n  shape_[FD_HEIGHT] /= y_factor;\n  shape_[FD_WIDTH] /= x_factor;\n  ComputeTIncrements();\n}\n\n// Reduces width to 1, across the batch, whatever the input size.\nvoid StrideMap::ReduceWidthTo1() {\n  widths_.assign(widths_.size(), 1);\n  shape_[FD_WIDTH] = 1;\n  ComputeTIncrements();\n}\n\n// Transposes the width and height dimensions.\nvoid StrideMap::TransposeXY() {\n  std::swap(shape_[FD_HEIGHT], shape_[FD_WIDTH]);\n  std::swap(heights_, widths_);\n  ComputeTIncrements();\n}\n\n// Computes t_increments_ from shape_.\nvoid StrideMap::ComputeTIncrements() {\n  t_increments_[FD_DIMSIZE - 1] = 1;\n  for (int d = FD_DIMSIZE - 2; d >= 0; --d) {\n    
t_increments_[d] = t_increments_[d + 1] * shape_[d + 1];\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/lstm/stridemap.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        stridemap.h\n// Description: Indexing into a 4-d tensor held in a 2-d Array.\n// Author:      Ray Smith\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n#ifndef TESSERACT_LSTM_STRIDEMAP_H_\n#define TESSERACT_LSTM_STRIDEMAP_H_\n\n#include <cstring>\n#include <vector>\n\nnamespace tesseract {\n\n// Enum describing the dimensions of the 'Tensor' in a NetworkIO.\n// A NetworkIO is analogous to a TF Tensor, except that the number of dimensions\n// is fixed (4), and they always have the same meaning. The underlying\n// representation is a 2-D array, for which the product batch*height*width\n// is always dim1 and depth is always dim2. FlexDimensions is used only for\n// batch, height, width with the StrideMap, and therefore represents the runtime\n// shape. 
The build-time shape is defined by StaticShape.\nenum FlexDimensions {\n  FD_BATCH,   // Index of multiple images.\n  FD_HEIGHT,  // y-coordinate in image.\n  FD_WIDTH,   // x-coordinate in image.\n  FD_DIMSIZE, // Number of flexible non-depth dimensions.\n};\n\n// Encapsulation of information relating to the mapping from [batch][y][x] to\n// the first index into the 2-d array underlying a NetworkIO.\nclass StrideMap {\npublic:\n  // Class holding the non-depth indices.\n  class Index {\n  public:\n    explicit Index(const StrideMap &stride_map) : stride_map_(&stride_map) {\n      InitToFirst();\n    }\n    Index(const StrideMap &stride_map, int batch, int y, int x) : stride_map_(&stride_map) {\n      indices_[FD_BATCH] = batch;\n      indices_[FD_HEIGHT] = y;\n      indices_[FD_WIDTH] = x;\n      SetTFromIndices();\n    }\n    // Accesses the index to the underlying array.\n    int t() const {\n      return t_;\n    }\n    int index(FlexDimensions dimension) const {\n      return indices_[dimension];\n    }\n    // Initializes the indices to the first valid location.\n    void InitToFirst() {\n      memset(indices_, 0, sizeof(indices_));\n      t_ = 0;\n    }\n    // Initializes the indices to the last valid location.\n    void InitToLast() {\n      InitToLastOfBatch(MaxIndexOfDim(FD_BATCH));\n    }\n    // Returns true if *this is a valid index.\n    bool IsValid() const;\n    // Returns true if the index of the given dimension is the last.\n    bool IsLast(FlexDimensions dimension) const;\n    // Given that the dimensions up to and including dim-1 are valid, returns\n    // the maximum index for dimension dim.\n    int MaxIndexOfDim(FlexDimensions dim) const;\n    // Adds the given offset to the given dimension. 
Returns true if the result\n    // makes a valid index.\n    bool AddOffset(int offset, FlexDimensions dimension);\n    // Increments the index in some encapsulated way that guarantees to remain\n    // valid until it returns false, meaning that the iteration is complete.\n    bool Increment();\n    // Decrements the index in some encapsulated way that guarantees to remain\n    // valid until it returns false, meaning that the iteration (that started\n    // with InitToLast()) is complete.\n    bool Decrement();\n\n  private:\n    // Initializes the indices to the last valid location in the given batch\n    // index.\n    void InitToLastOfBatch(int batch);\n    // Computes and sets t_ from the current indices_.\n    void SetTFromIndices();\n\n    // Map into which *this is an index.\n    const StrideMap *stride_map_;\n    // Index to the first dimension of the underlying array.\n    int t_;\n    // Indices into the individual dimensions.\n    int indices_[FD_DIMSIZE];\n  };\n\n  StrideMap() {\n    memset(shape_, 0, sizeof(shape_));\n    memset(t_increments_, 0, sizeof(t_increments_));\n  }\n  // Default copy constructor and operator= are OK to use here!\n\n  // Sets up the stride for the given array of height, width pairs.\n  void SetStride(const std::vector<std::pair<int, int>> &h_w_pairs);\n  // Scales width and height dimensions by the given factors.\n  void ScaleXY(int x_factor, int y_factor);\n  // Reduces width to 1, across the batch, whatever the input size.\n  void ReduceWidthTo1();\n  // Transposes the width and height dimensions.\n  void TransposeXY();\n  // Returns the size of the given dimension.\n  int Size(FlexDimensions dimension) const {\n    return shape_[dimension];\n  }\n  // Returns the total width required.\n  int Width() const {\n    return t_increments_[FD_BATCH] * shape_[FD_BATCH];\n  }\n\nprivate:\n  // Computes t_increments_ from shape_.\n  void ComputeTIncrements();\n\n  // The size of each non-depth dimension.\n  int 
shape_[FD_DIMSIZE];\n  // Precomputed 't' increments for each dimension. This is the value of\n  // the given dimension in the packed 3-d array that the shape_ represents.\n  int t_increments_[FD_DIMSIZE];\n  // Vector of size shape_[FD_BATCH] holds the height of each image in a batch.\n  std::vector<int> heights_;\n  // Vector of size shape_[FD_BATCH] holds the width of each image in a batch.\n  std::vector<int> widths_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_LSTM_STRIDEMAP_H_\n"
  },
  {
    "path": "src/lstm/weightmatrix.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        weightmatrix.cpp\n// Description: Hides distinction between float/int implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"weightmatrix.h\"\n\n#include <cassert> // for assert\n#include \"intsimdmatrix.h\"\n#include \"simddetect.h\" // for DotProduct\n#include \"statistc.h\"\n#include \"tprintf.h\"    // for tprintf\n\nnamespace tesseract {\n\n#if defined(ANDROID)\nstatic inline TFloat log2(TFloat n) {\n  return log(n) / log(2.0);\n}\n#endif // ANDROID\n\n// Number of iterations after which the correction effectively becomes unity.\nconst int kAdamCorrectionIterations = 200000;\n// Epsilon in Adam to prevent division by zero.\nconst TFloat kAdamEpsilon = 1e-8;\n\n// Utility functions convert between double and float arrays.\n#ifdef FAST_FLOAT\nstatic void DoubleToFloat(const GENERIC_2D_ARRAY<double> &src, GENERIC_2D_ARRAY<float> &dst) {\n  const auto dim1 = src.dim1();\n  const auto dim2 = src.dim2();\n  dst.ResizeNoInit(dim1, dim2);\n  for (int i = 0; i < dim1; ++i) {\n    const auto *src_i = src[i];\n    auto *dst_i = dst[i];\n    for (int j = 0; j < dim2; ++j) {\n      dst_i[j] = static_cast<float>(src_i[j]);\n    }\n  }\n}\n#endif\n\nstatic void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, 
GENERIC_2D_ARRAY<double> &dst) {\n  const auto dim1 = src.dim1();\n  const auto dim2 = src.dim2();\n  dst.ResizeNoInit(dim1, dim2);\n  for (int i = 0; i < dim1; ++i) {\n    const auto *src_i = src[i];\n    auto *dst_i = dst[i];\n    for (int j = 0; j < dim2; ++j) {\n      dst_i[j] = static_cast<double>(src_i[j]);\n    }\n  }\n}\n\nstatic bool DeSerialize(TFile *fp, GENERIC_2D_ARRAY<TFloat> &tfloat_array) {\n#ifdef FAST_FLOAT\n  GENERIC_2D_ARRAY<double> double_array;\n  if (!double_array.DeSerialize(fp)) {\n    return false;\n  }\n  DoubleToFloat(double_array, tfloat_array);\n  return true;\n#else\n  return tfloat_array.DeSerialize(fp);\n#endif\n}\n\nstatic bool Serialize(TFile *fp, const GENERIC_2D_ARRAY<TFloat> &tfloat_array) {\n#ifdef FAST_FLOAT\n  GENERIC_2D_ARRAY<double> double_array;\n  FloatToDouble(tfloat_array, double_array);\n  return double_array.Serialize(fp);\n#else\n  return tfloat_array.Serialize(fp);\n#endif\n}\n\n// Computes matrix.vector v = Wu.\n// u is of size W.dim2() - add_bias_fwd and the output v is of size\n// W.dim1() - skip_bias_back.\n// If add_bias_fwd, u is imagined to have an extra element at the end with value\n// 1, to implement the bias, weight.\n// If skip_bias_back, we are actually performing the backwards product on a\n// transposed matrix, so we need to drop the v output corresponding to the last\n// element in dim1.\nstatic inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<TFloat> &w, bool add_bias_fwd,\n                                           bool skip_bias_back, const TFloat *u, TFloat *v) {\n  int num_results = w.dim1() - skip_bias_back;\n  int extent = w.dim2() - add_bias_fwd;\n  for (int i = 0; i < num_results; ++i) {\n    const TFloat *wi = w[i];\n    TFloat total = DotProduct(wi, u, extent);\n    if (add_bias_fwd) {\n      total += wi[extent]; // The bias value.\n    }\n    v[i] = total;\n  }\n}\n\n// Copies the whole input transposed, converted to TFloat, into *this.\nvoid TransposedArray::Transpose(const 
GENERIC_2D_ARRAY<TFloat> &input) {\n  int width = input.dim1();\n  int num_features = input.dim2();\n  ResizeNoInit(num_features, width);\n  for (int t = 0; t < width; ++t) {\n    WriteStrided(t, input[t]);\n  }\n}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nTransposedArray::~TransposedArray() = default;\n\n// Sets up the network for training. Initializes weights using weights of\n// scale `range` picked according to the random number generator `randomizer`.\nint WeightMatrix::InitWeightsFloat(int no, int ni, bool use_adam, float weight_range,\n                                   TRand *randomizer) {\n  int_mode_ = false;\n  wf_.Resize(no, ni, 0.0);\n  if (randomizer != nullptr) {\n    for (int i = 0; i < no; ++i) {\n      for (int j = 0; j < ni; ++j) {\n        wf_[i][j] = randomizer->SignedRand(weight_range);\n      }\n    }\n  }\n  use_adam_ = use_adam;\n  InitBackward();\n  return ni * no;\n}\n\n// Changes the number of outputs to the size of the given code_map, copying\n// the old weight matrix entries for each output from code_map[output] where\n// non-negative, and uses the mean (over all outputs) of the existing weights\n// for all outputs with negative code_map entries. Returns the new number of\n// weights.\nint WeightMatrix::RemapOutputs(const std::vector<int> &code_map) {\n  GENERIC_2D_ARRAY<TFloat> old_wf(wf_);\n  int old_no = wf_.dim1();\n  int new_no = code_map.size();\n  int ni = wf_.dim2();\n  std::vector<TFloat> means(ni, 0.0);\n  for (int c = 0; c < old_no; ++c) {\n    const TFloat *weights = wf_[c];\n    for (int i = 0; i < ni; ++i) {\n      means[i] += weights[i];\n    }\n  }\n  for (auto &mean : means) {\n    mean /= old_no;\n  }\n  wf_.Resize(new_no, ni, 0.0);\n  InitBackward();\n  for (int dest = 0; dest < new_no; ++dest) {\n    int src = code_map[dest];\n    const TFloat *src_data = src >= 0 ? 
old_wf[src] : means.data();\n    memcpy(wf_[dest], src_data, ni * sizeof(*src_data));\n  }\n  return ni * new_no;\n}\n\n// Converts a float network to an int network. Each set of input weights that\n// corresponds to a single output weight is converted independently:\n// Compute the max absolute value of the weight set.\n// Scale so the max absolute value becomes INT8_MAX.\n// Round to integer.\n// Store a multiplicative scale factor (as a TFloat) that will reproduce\n// the original value, subject to rounding errors.\nvoid WeightMatrix::ConvertToInt() {\n  wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());\n  scales_.reserve(wi_.dim1());\n  int dim2 = wi_.dim2();\n  for (int t = 0; t < wi_.dim1(); ++t) {\n    TFloat *f_line = wf_[t];\n    int8_t *i_line = wi_[t];\n    TFloat max_abs = 0;\n    for (int f = 0; f < dim2; ++f) {\n      TFloat abs_val = fabs(f_line[f]);\n      if (abs_val > max_abs) {\n        max_abs = abs_val;\n      }\n    }\n    TFloat scale = max_abs / INT8_MAX;\n    scales_.push_back(scale / INT8_MAX);\n    if (scale == 0.0) {\n      scale = 1.0;\n    }\n    for (int f = 0; f < dim2; ++f) {\n      i_line[f] = IntCastRounded(f_line[f] / scale);\n    }\n  }\n  wf_.Resize(1, 1, 0.0);\n  int_mode_ = true;\n  if (IntSimdMatrix::intSimdMatrix) {\n    int32_t rounded_num_out;\n    IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);\n    scales_.resize(rounded_num_out);\n  }\n}\n\n// Allocates any needed memory for running Backward, and zeroes the deltas,\n// thus eliminating any existing momentum.\nvoid WeightMatrix::InitBackward() {\n  int no = int_mode_ ? wi_.dim1() : wf_.dim1();\n  int ni = int_mode_ ? 
wi_.dim2() : wf_.dim2();\n  dw_.Resize(no, ni, 0.0);\n  updates_.Resize(no, ni, 0.0);\n  wf_t_.Transpose(wf_);\n  if (use_adam_) {\n    dw_sq_sum_.Resize(no, ni, 0.0);\n  }\n}\n\n// Flag on mode to indicate that this weightmatrix uses int8_t.\nconst int kInt8Flag = 1;\n// Flag on mode to indicate that this weightmatrix uses adam.\nconst int kAdamFlag = 4;\n// Flag on mode to indicate that this weightmatrix uses double. Set\n// independently of kInt8Flag as even in int mode the scales can\n// be float or double.\nconst int kDoubleFlag = 128;\n\n// Writes to the given file. Returns false in case of error.\nbool WeightMatrix::Serialize(bool training, TFile *fp) const {\n  // For backward compatibility, add kDoubleFlag to mode to indicate the doubles\n  // format, without errs, so we can detect and read old format weight matrices.\n  uint8_t mode = (int_mode_ ? kInt8Flag : 0) | (use_adam_ ? kAdamFlag : 0) | kDoubleFlag;\n  if (!fp->Serialize(&mode)) {\n    return false;\n  }\n  if (int_mode_) {\n    if (!wi_.Serialize(fp)) {\n      return false;\n    }\n    uint32_t size = scales_.size();\n    if (!fp->Serialize(&size)) {\n      return false;\n    }\n    for (auto scale : scales_) {\n      // The scales stored in memory have an extra factor applied to them\n      // to allow faster operation. We have to remove that factor here\n      // before writing to disc.\n      double value = scale * INT8_MAX;\n      if (!fp->Serialize(&value)) {\n        return false;\n      }\n    }\n  } else {\n    if (!tesseract::Serialize(fp, wf_)) {\n      return false;\n    }\n    if (training) {\n      if (!tesseract::Serialize(fp, updates_)) {\n        return false;\n      }\n      if (use_adam_ && !tesseract::Serialize(fp, dw_sq_sum_)) {\n        return false;\n      }\n    }\n  }\n  return true;\n}\n\n// Reads from the given file. 
Returns false in case of error.\n\nbool WeightMatrix::DeSerialize(bool training, TFile *fp) {\n  uint8_t mode;\n  if (!fp->DeSerialize(&mode)) {\n    return false;\n  }\n  int_mode_ = (mode & kInt8Flag) != 0;\n  use_adam_ = (mode & kAdamFlag) != 0;\n  if ((mode & kDoubleFlag) == 0) {\n    return DeSerializeOld(training, fp);\n  }\n  if (int_mode_) {\n    if (!wi_.DeSerialize(fp)) {\n      return false;\n    }\n    uint32_t size;\n    if (!fp->DeSerialize(&size)) {\n      return false;\n    }\n#ifdef FAST_FLOAT\n    scales_.reserve(size);\n    for (auto n = size; n > 0; n--) {\n      double val;\n      if (!fp->DeSerialize(&val)) {\n        return false;\n      }\n      scales_.push_back(val / INT8_MAX);\n    }\n#else\n    scales_.resize(size);\n    if (!fp->DeSerialize(&scales_[0], size)) {\n      return false;\n    }\n    for (auto &scale : scales_) {\n      scale /= INT8_MAX;\n    }\n#endif\n    if (IntSimdMatrix::intSimdMatrix) {\n      int32_t rounded_num_out;\n      IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);\n      scales_.resize(rounded_num_out);\n    }\n  } else {\n    if (!tesseract::DeSerialize(fp, wf_)) {\n      return false;\n    }\n    if (training) {\n      InitBackward();\n      if (!tesseract::DeSerialize(fp, updates_)) {\n        return false;\n      }\n      if (use_adam_) {\n        if (!tesseract::DeSerialize(fp, dw_sq_sum_)) {\n          return false;\n        }\n      }\n    }\n  }\n  return true;\n}\n\n// As DeSerialize, but reads an old (float) format WeightMatrix for\n// backward compatibility.\nbool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {\n#ifdef FAST_FLOAT\n  // Not implemented.\n  ASSERT_HOST(!\"not implemented\");\n  return false;\n#else\n  if (int_mode_) {\n    if (!wi_.DeSerialize(fp)) {\n      return false;\n    }\n    std::vector<float> old_scales;\n    if (!fp->DeSerialize(old_scales)) {\n      return false;\n    }\n    scales_.reserve(old_scales.size());\n    for (float old_scale : 
old_scales) {\n      scales_.push_back(old_scale);\n    }\n  } else {\n    GENERIC_2D_ARRAY<float> float_array;\n    if (!float_array.DeSerialize(fp)) {\n      return false;\n    }\n    FloatToDouble(float_array, wf_);\n  }\n  if (training) {\n    InitBackward();\n    GENERIC_2D_ARRAY<float> float_array;\n    if (!float_array.DeSerialize(fp)) {\n      return false;\n    }\n    FloatToDouble(float_array, updates_);\n    // Errs was only used in int training, which is now dead.\n    if (!float_array.DeSerialize(fp)) {\n      return false;\n    }\n  }\n  return true;\n#endif\n}\n\n// Computes matrix.vector v = Wu.\n// u is of size W.dim2() - 1 and the output v is of size W.dim1().\n// u is imagined to have an extra element at the end with value 1, to\n// implement the bias, but it doesn't actually have it.\n// Asserts that the call matches what we have.\nvoid WeightMatrix::MatrixDotVector(const TFloat *u, TFloat *v) const {\n  assert(!int_mode_);\n  MatrixDotVectorInternal(wf_, true, false, u, v);\n}\n\nvoid WeightMatrix::MatrixDotVector(const int8_t *u, TFloat *v) const {\n  assert(int_mode_);\n  if (IntSimdMatrix::intSimdMatrix) {\n    IntSimdMatrix::intSimdMatrix->matrixDotVectorFunction(wi_.dim1(), wi_.dim2(), &shaped_w_[0],\n                                                          &scales_[0], u, v);\n  } else {\n    IntSimdMatrix::MatrixDotVector(wi_, scales_, u, v);\n  }\n}\n\n// MatrixDotVector for peep weights, MultiplyAccumulate adds the\n// component-wise products of *this[0] and v to inout.\nvoid WeightMatrix::MultiplyAccumulate(const TFloat *v, TFloat *inout) {\n  assert(!int_mode_);\n  assert(wf_.dim1() == 1);\n  int n = wf_.dim2();\n  const TFloat *u = wf_[0];\n  for (int i = 0; i < n; ++i) {\n    inout[i] += u[i] * v[i];\n  }\n}\n\n// Computes vector.matrix v = uW.\n// u is of size W.dim1() and the output v is of size W.dim2() - 1.\n// The last result is discarded, as v is assumed to have an imaginary\n// last value of 1, as with 
MatrixDotVector.\nvoid WeightMatrix::VectorDotMatrix(const TFloat *u, TFloat *v) const {\n  assert(!int_mode_);\n  MatrixDotVectorInternal(wf_t_, false, true, u, v);\n}\n\n// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from\n// u and v. In terms of the neural network, u is the gradients and v is the\n// inputs.\n// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.\n// Runs parallel if requested. Note that u and v must be transposed.\nvoid WeightMatrix::SumOuterTransposed(const TransposedArray &u, const TransposedArray &v,\n                                      bool in_parallel) {\n  assert(!int_mode_);\n  int num_outputs = dw_.dim1();\n  assert(u.dim1() == num_outputs);\n  assert(u.dim2() == v.dim2());\n  int num_inputs = dw_.dim2() - 1;\n  int num_samples = u.dim2();\n  // v is missing the last element in dim1.\n  assert(v.dim1() == num_inputs);\n#ifdef _OPENMP\n#  pragma omp parallel for num_threads(4) if (in_parallel)\n#endif\n  for (int i = 0; i < num_outputs; ++i) {\n    TFloat *dwi = dw_[i];\n    const TFloat *ui = u[i];\n    for (int j = 0; j < num_inputs; ++j) {\n      dwi[j] = DotProduct(ui, v[j], num_samples);\n    }\n    // The last element of v is missing, presumed 1.0f.\n    TFloat total = 0;\n    for (int k = 0; k < num_samples; ++k) {\n      total += ui[k];\n    }\n    dwi[num_inputs] = total;\n  }\n}\n\n// Updates the weights using the given learning rate and momentum.\n// num_samples is the quotient to be used in the adam computation iff\n// use_adam_ is true.\nvoid WeightMatrix::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {\n  assert(!int_mode_);\n  if (use_adam_ && momentum > 0.0f && num_samples > 0 && num_samples < kAdamCorrectionIterations) {\n    learning_rate *= sqrt(1.0f - pow(adam_beta, num_samples));\n    learning_rate /= 1.0f - pow(momentum, num_samples);\n  }\n  if (use_adam_ && num_samples > 0 && momentum > 0.0f) {\n    dw_sq_sum_.SumSquares(dw_, 
adam_beta);\n    dw_ *= learning_rate * (1.0f - momentum);\n    updates_ *= momentum;\n    updates_ += dw_;\n    wf_.AdamUpdate(updates_, dw_sq_sum_, learning_rate * kAdamEpsilon);\n  } else {\n    dw_ *= learning_rate;\n    updates_ += dw_;\n    if (momentum > 0.0f) {\n      wf_ += updates_;\n    }\n    if (momentum >= 0.0f) {\n      updates_ *= momentum;\n    }\n  }\n  wf_t_.Transpose(wf_);\n}\n\n// Adds the dw_ in other to the dw_ is *this.\nvoid WeightMatrix::AddDeltas(const WeightMatrix &other) {\n  assert(dw_.dim1() == other.dw_.dim1());\n  assert(dw_.dim2() == other.dw_.dim2());\n  dw_ += other.dw_;\n}\n\n// Sums the products of weight updates in *this and other, splitting into\n// positive (same direction) in *same and negative (different direction) in\n// *changed.\nvoid WeightMatrix::CountAlternators(const WeightMatrix &other, TFloat *same,\n                                    TFloat *changed) const {\n  int num_outputs = updates_.dim1();\n  int num_inputs = updates_.dim2();\n  assert(num_outputs == other.updates_.dim1());\n  assert(num_inputs == other.updates_.dim2());\n  for (int i = 0; i < num_outputs; ++i) {\n    const TFloat *this_i = updates_[i];\n    const TFloat *other_i = other.updates_[i];\n    for (int j = 0; j < num_inputs; ++j) {\n      TFloat product = this_i[j] * other_i[j];\n      if (product < 0.0) {\n        *changed -= product;\n      } else {\n        *same += product;\n      }\n    }\n  }\n}\n\n// Helper computes an integer histogram bucket for a weight and adds it\n// to the histogram.\nconst int kHistogramBuckets = 16;\nstatic void HistogramWeight(TFloat weight, STATS *histogram) {\n  int bucket = kHistogramBuckets - 1;\n  if (weight != 0.0) {\n    TFloat logval = -log2(fabs(weight));\n    bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);\n  }\n  histogram->add(bucket, 1);\n}\n\nvoid WeightMatrix::Debug2D(const char *msg) {\n  STATS histogram(0, kHistogramBuckets - 1);\n  if (int_mode_) {\n    for (int i = 0; i 
< wi_.dim1(); ++i) {\n      for (int j = 0; j < wi_.dim2(); ++j) {\n        HistogramWeight(wi_[i][j] * scales_[i], &histogram);\n      }\n    }\n  } else {\n    for (int i = 0; i < wf_.dim1(); ++i) {\n      for (int j = 0; j < wf_.dim2(); ++j) {\n        HistogramWeight(wf_[i][j], &histogram);\n      }\n    }\n  }\n  tprintf(\"%s\\n\", msg);\n  histogram.print();\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/lstm/weightmatrix.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        weightmatrix.h\n// Description: Hides distinction between float/int implementations.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_WEIGHTMATRIX_H_\n#define TESSERACT_LSTM_WEIGHTMATRIX_H_\n\n#include <memory>\n#include <vector>\n#include \"intsimdmatrix.h\"\n#include \"matrix.h\"\n#include \"tesstypes.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Convenience instantiation of GENERIC_2D_ARRAY<TFloat> with additional\n// operations to write a strided vector, so the transposed form of the input\n// is memory-contiguous.\nclass TransposedArray : public GENERIC_2D_ARRAY<TFloat> {\npublic:\n  // Copies the whole input transposed, converted to TFloat, into *this.\n  void Transpose(const GENERIC_2D_ARRAY<TFloat> &input);\n  // Writes a vector of data representing a timestep (gradients or sources).\n  // The data is assumed to be of size1 in size (the strided dimension).\n  ~TransposedArray() override;\n  void WriteStrided(int t, const float *data) {\n    int size1 = dim1();\n    for (int i = 0; i < size1; ++i) {\n      put(i, t, data[i]);\n    }\n  }\n  void WriteStrided(int t, const double *data) {\n    int size1 = dim1();\n    for (int i = 0; i < size1; ++i) {\n      put(i, t, data[i]);\n    
}\n  }\n  // Prints the first and last num elements of the un-transposed array.\n  void PrintUnTransposed(int num) {\n    int num_features = dim1();\n    int width = dim2();\n    for (int y = 0; y < num_features; ++y) {\n      for (int t = 0; t < width; ++t) {\n        if (num == 0 || t < num || t + num >= width) {\n          tprintf(\" %g\", static_cast<double>((*this)(y, t)));\n        }\n      }\n      tprintf(\"\\n\");\n    }\n  }\n}; // class TransposedArray\n\n// Generic weight matrix for network layers. Can store the matrix as either\n// an array of floats or int8_t. Provides functions to compute the forward and\n// backward steps with the matrix and updates to the weights.\nclass WeightMatrix {\npublic:\n  WeightMatrix() : int_mode_(false), use_adam_(false) {}\n  // Sets up the network for training. Initializes weights using weights of\n  // scale `range` picked according to the random number generator `randomizer`.\n  // Note the order is outputs, inputs, as this is the order of indices to\n  // the matrix, so the adjacent elements are multiplied by the input during\n  // a forward operation.\n  int InitWeightsFloat(int no, int ni, bool use_adam, float weight_range, TRand *randomizer);\n  // Changes the number of outputs to the size of the given code_map, copying\n  // the old weight matrix entries for each output from code_map[output] where\n  // non-negative, and uses the mean (over all outputs) of the existing weights\n  // for all outputs with negative code_map entries. Returns the new number of\n  // weights.\n  int RemapOutputs(const std::vector<int> &code_map);\n\n  // Converts a float network to an int network. 
Each set of input weights that\n  // corresponds to a single output weight is converted independently:\n  // Compute the max absolute value of the weight set.\n  // Scale so the max absolute value becomes INT8_MAX.\n  // Round to integer.\n  // Store a multiplicative scale factor (as a float) that will reproduce\n  // the original value, subject to rounding errors.\n  void ConvertToInt();\n  // Returns the size rounded up to an internal factor used by the SIMD\n  // implementation for its input.\n  int RoundInputs(int size) const {\n    if (!int_mode_ || !IntSimdMatrix::intSimdMatrix) {\n      return size;\n    }\n    return IntSimdMatrix::intSimdMatrix->RoundInputs(size);\n  }\n\n  // Accessors.\n  bool is_int_mode() const {\n    return int_mode_;\n  }\n  int NumOutputs() const {\n    return int_mode_ ? wi_.dim1() : wf_.dim1();\n  }\n  // Provides one set of weights. Only used by peep weight maxpool.\n  const TFloat *GetWeights(int index) const {\n    return wf_[index];\n  }\n  // Provides access to the deltas (dw_).\n  TFloat GetDW(int i, int j) const {\n    return dw_(i, j);\n  }\n\n  // Allocates any needed memory for running Backward, and zeroes the deltas,\n  // thus eliminating any existing momentum.\n  void InitBackward();\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(bool training, TFile *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  bool DeSerialize(bool training, TFile *fp);\n  // As DeSerialize, but reads an old (float) format WeightMatrix for\n  // backward compatibility.\n  bool DeSerializeOld(bool training, TFile *fp);\n\n  // Computes matrix.vector v = Wu.\n  // u is of size W.dim2() - 1 and the output v is of size W.dim1().\n  // u is imagined to have an extra element at the end with value 1, to\n  // implement the bias, but it doesn't actually have it.\n  // Asserts that the call matches what we have.\n  void MatrixDotVector(const TFloat *u, TFloat *v) const;\n  void MatrixDotVector(const int8_t *u, TFloat *v) const;\n  // MatrixDotVector for peep weights, MultiplyAccumulate adds the\n  // component-wise products of *this[0] and v to inout.\n  void MultiplyAccumulate(const TFloat *v, TFloat *inout);\n  // Computes vector.matrix v = uW.\n  // u is of size W.dim1() and the output v is of size W.dim2() - 1.\n  // The last result is discarded, as v is assumed to have an imaginary\n  // last value of 1, as with MatrixDotVector.\n  void VectorDotMatrix(const TFloat *u, TFloat *v) const;\n  // Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements\n  // from u and v, starting with u[i][offset] and v[j][offset].\n  // Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.\n  // Runs parallel if requested. 
Note that inputs must be transposed.\n  void SumOuterTransposed(const TransposedArray &u, const TransposedArray &v, bool parallel);\n  // Updates the weights using the given learning rate, momentum and adam_beta.\n  // num_samples is used in the Adam correction factor.\n  void Update(float learning_rate, float momentum, float adam_beta, int num_samples);\n  // Adds the dw_ in other to the dw_ is *this.\n  void AddDeltas(const WeightMatrix &other);\n  // Sums the products of weight updates in *this and other, splitting into\n  // positive (same direction) in *same and negative (different direction) in\n  // *changed.\n  void CountAlternators(const WeightMatrix &other, TFloat *same, TFloat *changed) const;\n\n  void Debug2D(const char *msg);\n\nprivate:\n  // Choice between float and 8 bit int implementations.\n  GENERIC_2D_ARRAY<TFloat> wf_;\n  GENERIC_2D_ARRAY<int8_t> wi_;\n  // Transposed copy of wf_, used only for Backward, and set with each Update.\n  TransposedArray wf_t_;\n  // Which of wf_ and wi_ are we actually using.\n  bool int_mode_;\n  // True if we are running adam in this weight matrix.\n  bool use_adam_;\n  // If we are using wi_, then scales_ is a factor to restore the row product\n  // with a vector to the correct range.\n  std::vector<TFloat> scales_;\n  // Weight deltas. dw_ is the new delta, and updates_ the momentum-decaying\n  // amount to be added to wf_/wi_.\n  GENERIC_2D_ARRAY<TFloat> dw_;\n  GENERIC_2D_ARRAY<TFloat> updates_;\n  // Iff use_adam_, the sum of squares of dw_. The number of samples is\n  // given to Update(). Serialized iff use_adam_.\n  GENERIC_2D_ARRAY<TFloat> dw_sq_sum_;\n  // The weights matrix reorganized in whatever way suits this instance.\n  std::vector<int8_t> shaped_w_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_WEIGHTMATRIX_H_\n"
  },
  {
    "path": "src/svpaint.cpp",
    "content": "// Copyright 2007 Google Inc. All Rights Reserved.\n//\n// Author: Joern Wanke\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// Simple drawing program to illustrate ScrollView capabilities.\n//\n// Functionality:\n// - The menubar is used to select from different sample styles of input.\n// - With the RMB it is possible to change the RGB values in different\n//   popup menus.\n// - A LMB click either draws point-to-point, point or text.\n// - A LMB dragging either draws a line, a rectangle or ellipse.\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#ifndef GRAPHICS_DISABLED\n#  include \"scrollview.h\"\n#  include \"svmnode.h\"\n\n#  include <cstdlib>\n#  include <iostream>\n\nnamespace tesseract {\n\n// The current color values we use, initially white (== ScrollView::WHITE).\nstatic int rgb[3] = {255, 255, 255};\n\nclass SVPaint : public SVEventHandler {\npublic:\n  explicit SVPaint(const char *server_name);\n  // This is the main event handling function that we need to overwrite, defined\n  // in SVEventHandler.\n  void Notify(const SVEvent *sv_event) override;\n\nprivate:\n  // The Handler take care of the SVET_POPUP, SVET_MENU, SVET_CLICK and\n  // SVET_SELECTION events.\n  void PopupHandler(const SVEvent *sv_event);\n  void MenuBarHandler(const SVEvent *sv_event);\n  void ClickHandler(const SVEvent *sv_event);\n  void SelectionHandler(const 
SVEvent *sv_event);\n\n  // Convenience functions to build little menus.\n  SVMenuNode *BuildPopupMenu();\n  SVMenuNode *BuildMenuBar();\n\n  // Our window.\n  ScrollView *window_;\n\n  // The mode we are in when an SVET_CLICK or an SVET_SELECTION event occurs.\n  int click_mode_;\n  int drag_mode_;\n\n  // In the point-to-point drawing mode, we need to set a start-point the first\n  // time we call it (e.g. call SetCursor).\n  bool has_start_point_;\n};\n\n// Build a sample popup menu.\nSVMenuNode *SVPaint::BuildPopupMenu() {\n  auto *root = new SVMenuNode(); // Empty root node\n  // Initial color is white, so we  all values to 255.\n  root->AddChild(\"R\",                 // Shown caption.\n                 1,                   // assoc. command_id.\n                 \"255\",               // initial value.\n                 \"Red Color Value?\"); // Shown description.\n  root->AddChild(\"G\", 2, \"255\", \"Green Color Value?\");\n  root->AddChild(\"B\", 3, \"255\", \"Blue Color Value?\");\n  return root;\n}\n\n// Build a sample menu bar.\nSVMenuNode *SVPaint::BuildMenuBar() {\n  auto *root = new SVMenuNode(); // Empty root node\n\n  // Create some submenus and add them to the root.\n  SVMenuNode *click = root->AddChild(\"Clicking\");\n  SVMenuNode *drag = root->AddChild(\"Dragging\");\n\n  // Put some nodes into the submenus.\n  click->AddChild(\"Point to Point Drawing\", // Caption.\n                  1);                       // command_id.\n  click->AddChild(\"Point Drawing\", 2);\n  click->AddChild(\"Text Drawing\", 3);\n  drag->AddChild(\"Line Drawing\", 4);\n  drag->AddChild(\"Rectangle Drawing\", 5);\n  drag->AddChild(\"Ellipse Drawing\", 6);\n  return root;\n}\n\n// Takes care of the SVET_POPUP events.\n// In our case, SVET_POPUP is used to set RGB values.\nvoid SVPaint::PopupHandler(const SVEvent *sv_event) {\n  // Since we only have the RGB values as popup items,\n  // we take a shortcut to not bloat up code:\n  rgb[sv_event->command_id - 1] = 
atoi(sv_event->parameter);\n  window_->Pen(rgb[0], rgb[1], rgb[2]);\n}\n\n// Takes care of the SVET_MENU events.\n// In our case, we change either the click_mode_ (commands 1-3)\n// or the drag_mode_ (commands 4-6).\nvoid SVPaint::MenuBarHandler(const SVEvent *sv_event) {\n  if ((sv_event->command_id > 0) && (sv_event->command_id < 4)) {\n    click_mode_ = sv_event->command_id;\n    has_start_point_ = false;\n  } else {\n    drag_mode_ = sv_event->command_id;\n  }\n}\n\n// Takes care of the SVET_CLICK events.\n// Depending on the click_mode_ we are in, either do Point-to-Point drawing,\n// point drawing, or draw text.\nvoid SVPaint::ClickHandler(const SVEvent *sv_event) {\n  switch (click_mode_) {\n    case 1: // Point to Point\n      if (has_start_point_) {\n        window_->DrawTo(sv_event->x, sv_event->y);\n      } else {\n        has_start_point_ = true;\n        window_->SetCursor(sv_event->x, sv_event->y);\n      }\n      break;\n    case 2: // Point Drawing..simulated by drawing a 1 pixel line.\n      window_->Line(sv_event->x, sv_event->y, sv_event->x, sv_event->y);\n      break;\n    case 3: // Text\n      // We show a modal input dialog on our window, then draw the input and\n      // finally delete the input pointer.\n      char *p = window_->ShowInputDialog(\"Text:\");\n      window_->Text(sv_event->x, sv_event->y, p);\n      delete[] p;\n      break;\n  }\n}\n\n// Takes care of the SVET_SELECTION events.\n// Depending on the drag_mode_ we are in, either draw a line, a rectangle or\n// an ellipse.\nvoid SVPaint::SelectionHandler(const SVEvent *sv_event) {\n  switch (drag_mode_) {\n      // FIXME inversed x_size, y_size\n    case 4: // Line\n      window_->Line(sv_event->x, sv_event->y, sv_event->x - sv_event->x_size,\n                    sv_event->y - sv_event->y_size);\n      break;\n    case 5: // Rectangle\n      window_->Rectangle(sv_event->x, sv_event->y, sv_event->x - sv_event->x_size,\n                         sv_event->y - sv_event->y_size);\n   
   break;\n    case 6: // Ellipse\n      window_->Ellipse(sv_event->x - sv_event->x_size, sv_event->y - sv_event->y_size,\n                       sv_event->x_size, sv_event->y_size);\n      break;\n  }\n}\n\n// The event handling function from ScrollView which we have to overwrite.\n// We handle CLICK, SELECTION, MENU and POPUP and throw away all other events.\nvoid SVPaint::Notify(const SVEvent *sv_event) {\n  if (sv_event->type == SVET_CLICK) {\n    ClickHandler(sv_event);\n  } else if (sv_event->type == SVET_SELECTION) {\n    SelectionHandler(sv_event);\n  } else if (sv_event->type == SVET_MENU) {\n    MenuBarHandler(sv_event);\n  } else if (sv_event->type == SVET_POPUP) {\n    PopupHandler(sv_event);\n  }\n  // throw other events away\n}\n\n// Builds a new window, initializes the variables and event handler and builds\n// the menu.\nSVPaint::SVPaint(const char *server_name) {\n  window_ = new ScrollView(\"ScrollView Paint Example\", // window caption\n                           0, 0,                       // x,y window position\n                           500, 500,                   // window size\n                           500, 500,                   // canvas size\n                           false,                      // whether the Y axis is inversed.\n                                                       // this is included due to legacy\n                                                       // reasons for tesseract and enables\n                                                       // us to have (0,0) as the LOWER left\n                                                       // of the coordinate system.\n                           server_name);               // the server address.\n\n  // Set the start modes to point-to-point and line drawing.\n  click_mode_ = 1;\n  drag_mode_ = 4;\n  has_start_point_ = false;\n\n  // Bild our menus and add them to the window. 
The flag illustrates whether\n  // this is a menu bar.\n  SVMenuNode *popup_menu = BuildPopupMenu();\n  popup_menu->BuildMenu(window_, false);\n\n  SVMenuNode *bar_menu = BuildMenuBar();\n  bar_menu->BuildMenu(window_, true);\n\n  // Set the initial color values to White (could also be done by\n  // passing (rgb[0], rgb[1], rgb[2]).\n  window_->Pen(ScrollView::WHITE);\n  window_->Brush(ScrollView::WHITE);\n\n  // Adds the event handler to the window. This actually ensures that Notify\n  // gets called when events occur.\n  window_->AddEventHandler(this);\n\n  // Set the window visible (calling this is important to actually render\n  // everything. Without this call, the window would also be drawn, but the\n  // menu bars would be missing.\n  window_->SetVisible(true);\n\n  // Rest this thread until its window is destroyed.\n  // Note that a special eventhandling thread was created when constructing\n  // the window. Due to this, the application will not deadlock here.\n  window_->AwaitEvent(SVET_DESTROY);\n  // We now have 3 Threads running:\n  // (1) The MessageReceiver thread which fetches messages and distributes them\n  // (2) The EventHandler thread which handles all events for window_\n  // (3) The main thread which waits on window_ for a DESTROY event (blocked)\n}\n\n} // namespace tesseract\n\n// If a parameter is given, we try to connect to the given server.\n// This enables us to test the remote capabilities of ScrollView.\nint main(int argc, char **argv) {\n  const char *server_name;\n  if (argc > 1) {\n    server_name = argv[1];\n  } else {\n    server_name = \"localhost\";\n  }\n  tesseract::SVPaint svp(server_name);\n}\n\n#endif // !GRAPHICS_DISABLED\n"
  },
  {
    "path": "src/tesseract.cpp",
    "content": "/**********************************************************************\n * File:        tesseract.cpp\n * Description: Main program for merge of tess and editor.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <cerrno> // for errno\n#if defined(__USE_GNU)\n#  include <cfenv> // for feenableexcept\n#endif\n#include <climits> // for INT_MIN, INT_MAX\n#include <cstdlib> // for std::getenv\n#include <iostream>\n#include <map>    // for std::map\n#include <memory> // std::unique_ptr\n\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include \"dict.h\"\n#include <tesseract/renderer.h>\n#include \"simddetect.h\"\n#include \"tesseractclass.h\" // for AnyTessLang\n#include \"tprintf.h\" // for tprintf\n\n#ifdef _OPENMP\n#  include <omp.h>\n#endif\n\n#if defined(HAVE_LIBARCHIVE)\n#  include <archive.h>\n#endif\n#if defined(HAVE_LIBCURL)\n#  include <curl/curl.h>\n#endif\n\n#if defined(_WIN32)\n#  include <fcntl.h>\n#  include <io.h>\n#  if defined(HAVE_TIFFIO_H)\n\n#    include <tiffio.h>\n\nstatic void Win32ErrorHandler(const char *module, const char *fmt, va_list ap) {\n  if (module != nullptr) {\n    fprintf(stderr, \"%s: \", module);\n  
}\n  vfprintf(stderr, fmt, ap);\n  fprintf(stderr, \".\\n\");\n}\n\nstatic void Win32WarningHandler(const char *module, const char *fmt, va_list ap) {\n  if (module != nullptr) {\n    fprintf(stderr, \"%s: \", module);\n  }\n  fprintf(stderr, \"Warning, \");\n  vfprintf(stderr, fmt, ap);\n  fprintf(stderr, \".\\n\");\n}\n\n#  endif /* HAVE_TIFFIO_H */\n\nclass AutoWin32ConsoleOutputCP {\npublic:\n  explicit AutoWin32ConsoleOutputCP(UINT codeCP) :\n    oldCP_(GetConsoleOutputCP()) {\n    SetConsoleOutputCP(codeCP);\n  }\n  ~AutoWin32ConsoleOutputCP() {\n    SetConsoleOutputCP(oldCP_);\n  }\n\nprivate:\n  UINT oldCP_;\n};\n\nstatic AutoWin32ConsoleOutputCP autoWin32ConsoleOutputCP(CP_UTF8);\n\n#endif // _WIN32\n\nusing namespace tesseract;\n\nstatic void PrintVersionInfo() {\n  char *versionStrP;\n\n  printf(\"tesseract %s\\n\", tesseract::TessBaseAPI::Version());\n\n  versionStrP = getLeptonicaVersion();\n  printf(\" %s\\n\", versionStrP);\n  lept_free(versionStrP);\n\n  versionStrP = getImagelibVersions();\n  printf(\"  %s\\n\", versionStrP);\n  lept_free(versionStrP);\n\n#if defined(HAVE_NEON) || defined(__aarch64__)\n  if (tesseract::SIMDDetect::IsNEONAvailable())\n    printf(\" Found NEON\\n\");\n#elif defined(HAVE_RVV)\n  if (tesseract::SIMDDetect::IsRVVAvailable())\n    printf(\" Found RVV\\n\");\n#else\n  if (tesseract::SIMDDetect::IsAVX512BWAvailable()) {\n    printf(\" Found AVX512BW\\n\");\n  }\n  if (tesseract::SIMDDetect::IsAVX512FAvailable()) {\n    printf(\" Found AVX512F\\n\");\n  }\n  if (tesseract::SIMDDetect::IsAVX512VNNIAvailable()) {\n    printf(\" Found AVX512VNNI\\n\");\n  }\n  if (tesseract::SIMDDetect::IsAVX2Available()) {\n    printf(\" Found AVX2\\n\");\n  }\n  if (tesseract::SIMDDetect::IsAVXAvailable()) {\n    printf(\" Found AVX\\n\");\n  }\n  if (tesseract::SIMDDetect::IsFMAAvailable()) {\n    printf(\" Found FMA\\n\");\n  }\n  if (tesseract::SIMDDetect::IsSSEAvailable()) {\n    printf(\" Found SSE4.1\\n\");\n  }\n#endif\n#ifdef 
_OPENMP\n  printf(\" Found OpenMP %d\\n\", _OPENMP);\n#endif\n#if defined(HAVE_LIBARCHIVE)\n#  if ARCHIVE_VERSION_NUMBER >= 3002000\n  printf(\" Found %s\\n\", archive_version_details());\n#  else\n  printf(\" Found %s\\n\", archive_version_string());\n#  endif // ARCHIVE_VERSION_NUMBER\n#endif   // HAVE_LIBARCHIVE\n#if defined(HAVE_LIBCURL)\n  printf(\" Found %s\\n\", curl_version());\n#endif\n}\n\nstatic void PrintHelpForPSM() {\n  printf(\n      \"Page segmentation modes (PSM):\\n\"\n      \"  0|osd_only                Orientation and script detection (OSD) only.\\n\"\n      \"  1|auto_osd                Automatic page segmentation with OSD.\\n\"\n      \"  2|auto_only               Automatic page segmentation, but no OSD, or OCR. (not \"\n      \"implemented)\\n\"\n      \"  3|auto                    Fully automatic page segmentation, but no OSD. (Default)\\n\"\n      \"  4|single_column           Assume a single column of text of variable sizes.\\n\"\n      \"  5|single_block_vert_text  Assume a single uniform block of vertically aligned text.\\n\"\n      \"  6|single_block            Assume a single uniform block of text.\\n\"\n      \"  7|single_line             Treat the image as a single text line.\\n\"\n      \"  8|single_word             Treat the image as a single word.\\n\"\n      \"  9|circle_word             Treat the image as a single word in a circle.\\n\"\n      \" 10|single_char             Treat the image as a single character.\\n\"\n      \" 11|sparse_text             Sparse text. Find as much text as possible in no\"\n      \" particular order.\\n\"\n      \" 12|sparse_text_osd         Sparse text with OSD.\\n\"\n      \" 13|raw_line                Raw line. 
Treat the image as a single text line,\\n\"\n      \"                            bypassing hacks that are Tesseract-specific.\\n\"\n  );\n\n#ifdef DISABLED_LEGACY_ENGINE\n  printf(\"\\nNOTE: The OSD modes are currently disabled.\\n\");\n#endif\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\nstatic void PrintHelpForOEM() {\n  printf(\n      \"OCR Engine modes (OEM):\\n\"\n      \"  0|tesseract_only          Legacy engine only.\\n\"\n      \"  1|lstm_only               Neural nets LSTM engine only.\\n\"\n      \"  2|tesseract_lstm_combined Legacy + LSTM engines.\\n\"\n      \"  3|default                 Default, based on what is available.\\n\"\n  );\n}\n#endif // ndef DISABLED_LEGACY_ENGINE\n\nstatic void PrintHelpExtra(const char *program) {\n  printf(\n      \"Usage:\\n\"\n      \"  %s --help | --help-extra | --help-psm | \"\n#ifndef DISABLED_LEGACY_ENGINE\n      \"--help-oem | \"\n#endif\n      \"--version\\n\"\n      \"  %s --list-langs [--tessdata-dir PATH]\\n\"\n#ifndef DISABLED_LEGACY_ENGINE\n      \"  %s --print-fonts-table [options...] [configfile...]\\n\"\n#endif  // ndef DISABLED_LEGACY_ENGINE\n      \"  %s --print-parameters [options...] [configfile...]\\n\"\n      \"  %s imagename|imagelist|stdin outputbase|stdout [options...] \"\n      \"[configfile...]\\n\"\n      \"\\n\"\n      \"OCR options:\\n\"\n      \"  --tessdata-dir PATH   Specify the location of tessdata path.\\n\"\n      \"  --user-words PATH     Specify the location of user words file.\\n\"\n      \"  --user-patterns PATH  Specify the location of user patterns file.\\n\"\n      \"  --dpi VALUE           Specify DPI for input image.\\n\"\n      \"  --loglevel LEVEL      Specify logging level. 
LEVEL can be\\n\"\n      \"                        ALL, TRACE, DEBUG, INFO, WARN, ERROR, FATAL or OFF.\\n\"\n      \"  -l LANG[+LANG]        Specify language(s) used for OCR.\\n\"\n      \"  -c VAR=VALUE          Set value for config variables.\\n\"\n      \"                        Multiple -c arguments are allowed.\\n\"\n      \"  --psm PSM|NUM         Specify page segmentation mode.\\n\"\n#ifndef DISABLED_LEGACY_ENGINE\n      \"  --oem OEM|NUM         Specify OCR Engine mode.\\n\"\n#endif\n      \"NOTE: These options must occur before any configfile.\\n\"\n      \"\\n\",\n      program, program, program, program\n#ifndef DISABLED_LEGACY_ENGINE\n      , program\n#endif  // ndef DISABLED_LEGACY_ENGINE\n  );\n\n  PrintHelpForPSM();\n#ifndef DISABLED_LEGACY_ENGINE\n  printf(\"\\n\");\n  PrintHelpForOEM();\n#endif\n\n  printf(\n      \"\\n\"\n      \"Single options:\\n\"\n      \"  -h, --help            Show minimal help message.\\n\"\n      \"  --help-extra          Show extra help for advanced users.\\n\"\n      \"  --help-psm            Show page segmentation modes.\\n\"\n#ifndef DISABLED_LEGACY_ENGINE\n      \"  --help-oem            Show OCR Engine modes.\\n\"\n#endif\n      \"  -v, --version         Show version information.\\n\"\n      \"  --list-langs          List available languages for tesseract engine.\\n\"\n#ifndef DISABLED_LEGACY_ENGINE\n      \"  --print-fonts-table   Print tesseract fonts table.\\n\"\n#endif  // ndef DISABLED_LEGACY_ENGINE\n      \"  --print-parameters    Print tesseract parameters.\\n\");\n}\n\nstatic void PrintHelpMessage(const char *program) {\n  printf(\n      \"Usage:\\n\"\n      \"  %s --help | --help-extra | --version\\n\"\n      \"  %s --list-langs\\n\"\n      \"  %s imagename outputbase [options...] 
[configfile...]\\n\"\n      \"\\n\"\n      \"OCR options:\\n\"\n      \"  -l LANG[+LANG]        Specify language(s) used for OCR.\\n\"\n      \"NOTE: These options must occur before any configfile.\\n\"\n      \"\\n\"\n      \"Single options:\\n\"\n      \"  --help                Show this help message.\\n\"\n      \"  --help-extra          Show extra help for advanced users.\\n\"\n      \"  --version             Show version information.\\n\"\n      \"  --list-langs          List available languages for tesseract \"\n      \"engine.\\n\",\n      program, program, program);\n}\n\nstatic void PrintLangsList(tesseract::TessBaseAPI &api) {\n  std::vector<std::string> languages;\n  api.GetAvailableLanguagesAsVector(&languages);\n  printf(\"List of available languages in \\\"%s\\\" (%zu):\\n\",\n         api.GetDatapath(), languages.size());\n  for (const auto &language : languages) {\n    printf(\"%s\\n\", language.c_str());\n  }\n  api.End();\n}\n\n/**\n * We have 2 possible sources of pagesegmode: a config file and\n * the command line. For backwards compatibility reasons, the\n * default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the\n * default for this program is tesseract::PSM_AUTO. We will let\n * the config file take priority, so the command-line default\n * can take priority over the tesseract default, so we use the\n * value from the command line only if the retrieved mode\n * is still tesseract::PSM_SINGLE_BLOCK, indicating no change\n * in any config file. 
Therefore the only way to force\n * tesseract::PSM_SINGLE_BLOCK is from the command line.\n * It would be simpler if we could set the value before Init,\n * but that doesn't work.\n */\nstatic void FixPageSegMode(tesseract::TessBaseAPI &api, tesseract::PageSegMode pagesegmode) {\n  if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) {\n    api.SetPageSegMode(pagesegmode);\n  }\n}\n\nstatic bool checkArgValues(int arg, const char *mode, int count) {\n  if (arg >= count || arg < 0) {\n    printf(\"Invalid %s value, please enter a symbolic %s value or a number between 0-%d\\n\", mode, mode, count - 1);\n    return false;\n  }\n  return true;\n}\n\n// Convert a symbolic or numeric string to an OEM value.\nstatic int stringToOEM(const std::string arg) {\n  std::map<std::string, int> oem_map = {\n    {\"0\", 0},\n    {\"1\", 1},\n    {\"2\", 2},\n    {\"3\", 3},\n    {\"tesseract_only\", 0},\n    {\"lstm_only\", 1},\n    {\"tesseract_lstm_combined\", 2},\n    {\"default\", 3},\n  };\n  auto it = oem_map.find(arg);\n  return it == oem_map.end() ? -1 : it->second;\n}\n\nstatic int stringToPSM(const std::string arg) {\n  std::map<std::string, int> psm_map = {\n    {\"0\", 0},\n    {\"1\", 1},\n    {\"2\", 2},\n    {\"3\", 3},\n    {\"4\", 4},\n    {\"5\", 5},\n    {\"6\", 6},\n    {\"7\", 7},\n    {\"8\", 8},\n    {\"9\", 9},\n    {\"10\", 10},\n    {\"11\", 11},\n    {\"12\", 12},\n    {\"13\", 13},\n    {\"osd_only\", 0},\n    {\"auto_osd\", 1},\n    {\"auto_only\", 2},\n    {\"auto\", 3},\n    {\"single_column\", 4},\n    {\"single_block_vert_text\", 5},\n    {\"single_block\", 6},\n    {\"single_line\", 7},\n    {\"single_word\", 8},\n    {\"circle_word\", 9},\n    {\"single_char\", 10},\n    {\"sparse_text\", 11},\n    {\"sparse_text_osd\", 12},\n    {\"raw_line\", 13},\n  };\n  auto it = psm_map.find(arg);\n  return it == psm_map.end() ? 
-1 : it->second;\n}\n\n// NOTE: arg_i is used here to avoid ugly *i so many times in this function\nstatic bool ParseArgs(int argc, char **argv, const char **lang, const char **image,\n                      const char **outputbase, const char **datapath, l_int32 *dpi,\n                      bool *list_langs, bool *print_parameters, bool *print_fonts_table,\n                      std::vector<std::string> *vars_vec, std::vector<std::string> *vars_values,\n                      l_int32 *arg_i, tesseract::PageSegMode *pagesegmode,\n                      tesseract::OcrEngineMode *enginemode) {\n  bool noocr = false;\n  int i;\n  for (i = 1; i < argc && (*outputbase == nullptr || argv[i][0] == '-'); i++) {\n    if (*image != nullptr && *outputbase == nullptr) {\n      // outputbase follows image, don't allow options at that position.\n      *outputbase = argv[i];\n    } else if ((strcmp(argv[i], \"-h\") == 0) || (strcmp(argv[i], \"--help\") == 0)) {\n      PrintHelpMessage(argv[0]);\n      noocr = true;\n    } else if (strcmp(argv[i], \"--help-extra\") == 0) {\n      PrintHelpExtra(argv[0]);\n      noocr = true;\n    } else if ((strcmp(argv[i], \"--help-psm\") == 0)) {\n      PrintHelpForPSM();\n      noocr = true;\n#ifndef DISABLED_LEGACY_ENGINE\n    } else if ((strcmp(argv[i], \"--help-oem\") == 0)) {\n      PrintHelpForOEM();\n      noocr = true;\n#endif\n    } else if ((strcmp(argv[i], \"-v\") == 0) || (strcmp(argv[i], \"--version\") == 0)) {\n      PrintVersionInfo();\n      noocr = true;\n    } else if (strcmp(argv[i], \"-l\") == 0 && i + 1 < argc) {\n      *lang = argv[i + 1];\n      ++i;\n    } else if (strcmp(argv[i], \"--tessdata-dir\") == 0 && i + 1 < argc) {\n      *datapath = argv[i + 1];\n      ++i;\n    } else if (strcmp(argv[i], \"--dpi\") == 0 && i + 1 < argc) {\n      *dpi = atoi(argv[i + 1]);\n      ++i;\n    } else if (strcmp(argv[i], \"--loglevel\") == 0 && i + 1 < argc) {\n      // Allow the log levels which are used by log4cxx.\n      const 
std::string loglevel_string = argv[++i];\n      static const std::map<const std::string, int> loglevels {\n        {\"ALL\", INT_MIN},\n        {\"TRACE\", 5000},\n        {\"DEBUG\", 10000},\n        {\"INFO\", 20000},\n        {\"WARN\", 30000},\n        {\"ERROR\", 40000},\n        {\"FATAL\", 50000},\n        {\"OFF\", INT_MAX},\n      };\n      try {\n        auto loglevel = loglevels.at(loglevel_string);\n        log_level = loglevel;\n      } catch (const std::out_of_range &) {\n        // TODO: Allow numeric argument?\n        tprintf(\"Error, unsupported --loglevel %s\\n\", loglevel_string.c_str());\n        return false;\n      }\n    } else if (strcmp(argv[i], \"--user-words\") == 0 && i + 1 < argc) {\n      vars_vec->push_back(\"user_words_file\");\n      vars_values->push_back(argv[i + 1]);\n      ++i;\n    } else if (strcmp(argv[i], \"--user-patterns\") == 0 && i + 1 < argc) {\n      vars_vec->push_back(\"user_patterns_file\");\n      vars_values->push_back(argv[i + 1]);\n      ++i;\n    } else if (strcmp(argv[i], \"--list-langs\") == 0) {\n      noocr = true;\n      *list_langs = true;\n    } else if (strcmp(argv[i], \"--psm\") == 0 && i + 1 < argc) {\n      int psm = stringToPSM(argv[i + 1]);\n      if (!checkArgValues(psm, \"PSM\", tesseract::PSM_COUNT)) {\n        return false;\n      }\n      *pagesegmode = static_cast<tesseract::PageSegMode>(psm);\n      ++i;\n    } else if (strcmp(argv[i], \"--oem\") == 0 && i + 1 < argc) {\n#ifndef DISABLED_LEGACY_ENGINE\n      int oem = stringToOEM(argv[i + 1]);\n      if (!checkArgValues(oem, \"OEM\", tesseract::OEM_COUNT)) {\n        return false;\n      }\n      *enginemode = static_cast<tesseract::OcrEngineMode>(oem);\n#endif\n      ++i;\n    } else if (strcmp(argv[i], \"--print-parameters\") == 0) {\n      noocr = true;\n      *print_parameters = true;\n#ifndef DISABLED_LEGACY_ENGINE\n    } else if (strcmp(argv[i], \"--print-fonts-table\") == 0) {\n      noocr = true;\n      *print_fonts_table = 
true;\n#endif  // ndef DISABLED_LEGACY_ENGINE\n    } else if (strcmp(argv[i], \"-c\") == 0 && i + 1 < argc) {\n      const std::string argument(argv[i + 1]);\n      const auto equal_pos = argument.find('=');\n      if (equal_pos == std::string::npos) {\n          throw std::invalid_argument(\"Missing '=' in configvar assignment\");\n      }\n      // Extract key and value\n      const std::string key = argument.substr(0, equal_pos);\n      const std::string value = argument.substr(equal_pos + 1);\n      vars_vec->push_back(key);\n      vars_values->push_back(value);\n      ++i;\n    } else if (*image == nullptr) {\n      *image = argv[i];\n    } else {\n      // Unexpected argument.\n      fprintf(stderr, \"Error, unknown command line argument '%s'\\n\", argv[i]);\n      return false;\n    }\n  }\n\n  *arg_i = i;\n\n  if (*pagesegmode == tesseract::PSM_OSD_ONLY) {\n    // OSD = orientation and script detection.\n    if (*lang != nullptr && strcmp(*lang, \"osd\")) {\n      // If the user explicitly specifies a language (other than osd)\n      // or a script, only orientation can be detected.\n      fprintf(stderr, \"Warning, detects only orientation with -l %s\\n\", *lang);\n    } else {\n      // That mode requires osd.traineddata to detect orientation and script.\n      *lang = \"osd\";\n    }\n  }\n\n  if (*outputbase == nullptr && noocr == false) {\n    PrintHelpMessage(argv[0]);\n    return false;\n  }\n\n  return true;\n}\n\nstatic void PreloadRenderers(tesseract::TessBaseAPI &api,\n                             std::vector<std::unique_ptr<TessResultRenderer>> &renderers,\n                             tesseract::PageSegMode pagesegmode, const char *outputbase) {\n  if (pagesegmode == tesseract::PSM_OSD_ONLY) {\n#ifndef DISABLED_LEGACY_ENGINE\n    renderers.push_back(std::make_unique<tesseract::TessOsdRenderer>(outputbase));\n#endif // ndef DISABLED_LEGACY_ENGINE\n  } else {\n    bool error = false;\n    bool b;\n    api.GetBoolVariable(\"tessedit_create_hocr\", 
&b);\n    if (b) {\n      bool font_info;\n      api.GetBoolVariable(\"hocr_font_info\", &font_info);\n      auto renderer = std::make_unique<tesseract::TessHOcrRenderer>(outputbase, font_info);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create hOCR output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_alto\", &b);\n    if (b) {\n      auto renderer = std::make_unique<tesseract::TessAltoRenderer>(outputbase);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create ALTO output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_page_xml\", &b);\n    if (b) {\n      auto renderer = std::make_unique<tesseract::TessPAGERenderer>(outputbase);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create PAGE output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_tsv\", &b);\n    if (b) {\n      bool font_info;\n      api.GetBoolVariable(\"hocr_font_info\", &font_info);\n      auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, font_info);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create TSV output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_pdf\", &b);\n    if (b) {\n#ifdef WIN32\n      if (_setmode(_fileno(stdout), _O_BINARY) == -1)\n        tprintf(\"ERROR: cin to binary: %s\", strerror(errno));\n#endif // WIN32\n      bool textonly;\n      api.GetBoolVariable(\"textonly_pdf\", &textonly);\n      auto renderer = 
std::make_unique<tesseract::TessPDFRenderer>(outputbase, api.GetDatapath(), textonly);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create PDF output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_write_unlv\", &b);\n    if (b) {\n      api.SetVariable(\"unlv_tilde_crunching\", \"true\");\n      auto renderer = std::make_unique<tesseract::TessUnlvRenderer>(outputbase);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create UNLV output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_lstmbox\", &b);\n    if (b) {\n      auto renderer = std::make_unique<tesseract::TessLSTMBoxRenderer>(outputbase);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create LSTM BOX output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_boxfile\", &b);\n    if (b) {\n      auto renderer = std::make_unique<tesseract::TessBoxTextRenderer>(outputbase);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create BOX output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_wordstrbox\", &b);\n    if (b) {\n      auto renderer = std::make_unique<tesseract::TessWordStrBoxRenderer>(outputbase);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create WordStr BOX output file: %s\\n\", strerror(errno));\n        error = true;\n      }\n    }\n\n    api.GetBoolVariable(\"tessedit_create_txt\", &b);\n    if (b || 
(!error && renderers.empty())) {\n      // Create text output if no other output was requested\n      // even if text output was not explicitly requested unless\n      // there was an error.\n      auto renderer = std::make_unique<tesseract::TessTextRenderer>(outputbase);\n      if (renderer->happy()) {\n        renderers.push_back(std::move(renderer));\n      } else {\n        tprintf(\"Error, could not create TXT output file: %s\\n\", strerror(errno));\n      }\n    }\n  }\n\n  // Null-out the renderers that are\n  // added to the root, and leave the root in the vector.\n  for (size_t r = 1; r < renderers.size(); ++r) {\n    renderers[0]->insert(renderers[r].get());\n    renderers[r].release(); // at the moment insert() is owning\n  }\n}\n\n/**********************************************************************\n *  main()\n *\n **********************************************************************/\n\nstatic int main1(int argc, char **argv) {\n#if defined(__USE_GNU) && defined(HAVE_FEENABLEEXCEPT)\n  // Raise SIGFPE.\n#  if defined(__clang__)\n  // clang creates code which causes some FP exceptions, so don't enable those.\n  feenableexcept(FE_DIVBYZERO);\n#  else\n  feenableexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_INVALID);\n#  endif\n#endif\n  const char *lang = nullptr;\n  const char *image = nullptr;\n  const char *outputbase = nullptr;\n  const char *datapath = nullptr;\n  bool list_langs = false;\n  bool print_parameters = false;\n  bool print_fonts_table = false;\n  l_int32 dpi = 0;\n  int arg_i = 1;\n  tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;\n#ifdef DISABLED_LEGACY_ENGINE\n  auto enginemode = tesseract::OEM_LSTM_ONLY;\n#else\n  tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT;\n#endif\n  std::vector<std::string> vars_vec;\n  std::vector<std::string> vars_values;\n\n  if (std::getenv(\"LEPT_MSG_SEVERITY\")) {\n    // Get Leptonica message level from environment variable.\n    setMsgSeverity(L_SEVERITY_EXTERNAL);\n  } else {\n    
// Disable debugging and informational messages from Leptonica.\n    setMsgSeverity(L_SEVERITY_ERROR);\n  }\n\n#if defined(HAVE_TIFFIO_H) && defined(_WIN32)\n  /* Show libtiff errors and warnings on console (not in GUI). */\n  TIFFSetErrorHandler(Win32ErrorHandler);\n  TIFFSetWarningHandler(Win32WarningHandler);\n#endif // HAVE_TIFFIO_H && _WIN32\n\n  if (!ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &dpi, &list_langs,\n                 &print_parameters, &print_fonts_table, &vars_vec, &vars_values, &arg_i,\n                 &pagesegmode, &enginemode)) {\n    return EXIT_FAILURE;\n  }\n\n  bool in_recognition_mode = !list_langs && !print_parameters && !print_fonts_table;\n\n  if (lang == nullptr && in_recognition_mode) {\n    // Set default language model if none was given and a model file is needed.\n    lang = \"eng\";\n  }\n\n  if (image == nullptr && in_recognition_mode) {\n    return EXIT_SUCCESS;\n  }\n\n  // Call GlobalDawgCache here to create the global DawgCache object before\n  // the TessBaseAPI object. 
This fixes the order of destructor calls:\n  // first TessBaseAPI must be destructed, DawgCache must be the last object.\n  tesseract::Dict::GlobalDawgCache();\n\n  TessBaseAPI api;\n\n  api.SetOutputName(outputbase);\n\n  const int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]), argc - arg_i,\n                                   &vars_vec, &vars_values, false);\n\n  // SIMD settings might be overridden by config variable.\n  tesseract::SIMDDetect::Update();\n\n  if (list_langs) {\n    PrintLangsList(api);\n    return EXIT_SUCCESS;\n  }\n\n  if (init_failed) {\n    fprintf(stderr, \"Could not initialize tesseract.\\n\");\n    return EXIT_FAILURE;\n  }\n\n  if (print_parameters) {\n    FILE *fout = stdout;\n    fprintf(stdout, \"Tesseract parameters:\\n\");\n    api.PrintVariables(fout);\n    api.End();\n    return EXIT_SUCCESS;\n  }\n\n#ifndef DISABLED_LEGACY_ENGINE\n  if (print_fonts_table) {\n    FILE *fout = stdout;\n    fprintf(stdout, \"Tesseract fonts table:\\n\");\n    api.PrintFontsTable(fout);\n    api.End();\n    return EXIT_SUCCESS;\n  }\n#endif  // ndef DISABLED_LEGACY_ENGINE\n\n  FixPageSegMode(api, pagesegmode);\n\n  if (dpi) {\n    auto dpi_string = std::to_string(dpi);\n    api.SetVariable(\"user_defined_dpi\", dpi_string.c_str());\n  }\n\n  int ret_val = EXIT_SUCCESS;\n\n  if (pagesegmode == tesseract::PSM_AUTO_ONLY) {\n    Pix *pixs = pixRead(image);\n    if (!pixs) {\n      fprintf(stderr, \"Leptonica can't process input file: %s\\n\", image);\n      return 2;\n    }\n\n    api.SetImage(pixs);\n\n    tesseract::Orientation orientation;\n    tesseract::WritingDirection direction;\n    tesseract::TextlineOrder order;\n    float deskew_angle;\n\n    const std::unique_ptr<const tesseract::PageIterator> it(api.AnalyseLayout());\n    if (it) {\n      // TODO: Implement output of page segmentation, see documentation\n      // (\"Automatic page segmentation, but no OSD, or OCR\").\n      it->Orientation(&orientation, &direction, &order, 
&deskew_angle);\n      tprintf(\n          \"Orientation: %d\\nWritingDirection: %d\\nTextlineOrder: %d\\n\"\n          \"Deskew angle: %.4f\\n\",\n          orientation, direction, order, deskew_angle);\n    } else {\n      ret_val = EXIT_FAILURE;\n    }\n\n    pixDestroy(&pixs);\n    return ret_val;\n  }\n\n  // Set in_training_mode to true when using one of these configs:\n  // ambigs.train, box.train, box.train.stderr, linebox, rebox, lstm.train.\n  // In this mode no other OCR result files are written.\n  bool b = false;\n  bool in_training_mode = (api.GetBoolVariable(\"tessedit_ambigs_training\", &b) && b) ||\n                          (api.GetBoolVariable(\"tessedit_resegment_from_boxes\", &b) && b) ||\n                          (api.GetBoolVariable(\"tessedit_make_boxes_from_boxes\", &b) && b) ||\n                          (api.GetBoolVariable(\"tessedit_train_line_recognizer\", &b) && b);\n\n  if (api.GetPageSegMode() == tesseract::PSM_OSD_ONLY) {\n    if (!api.tesseract()->AnyTessLang()) {\n      fprintf(stderr, \"Error, OSD requires a model for the legacy engine\\n\");\n      return EXIT_FAILURE;\n    }\n  }\n#ifdef DISABLED_LEGACY_ENGINE\n  auto cur_psm = api.GetPageSegMode();\n  auto osd_warning = std::string(\"\");\n  if (cur_psm == tesseract::PSM_OSD_ONLY) {\n    const char *disabled_osd_msg =\n        \"\\nERROR: The page segmentation mode 0 (OSD Only) is currently \"\n        \"disabled.\\n\\n\";\n    fprintf(stderr, \"%s\", disabled_osd_msg);\n    return EXIT_FAILURE;\n  } else if (cur_psm == tesseract::PSM_AUTO_OSD) {\n    api.SetPageSegMode(tesseract::PSM_AUTO);\n    osd_warning +=\n        \"\\nWarning: The page segmentation mode 1 (Auto+OSD) is currently \"\n        \"disabled. 
\"\n        \"Using PSM 3 (Auto) instead.\\n\\n\";\n  } else if (cur_psm == tesseract::PSM_SPARSE_TEXT_OSD) {\n    api.SetPageSegMode(tesseract::PSM_SPARSE_TEXT);\n    osd_warning +=\n        \"\\nWarning: The page segmentation mode 12 (Sparse text + OSD) is \"\n        \"currently disabled. \"\n        \"Using PSM 11 (Sparse text) instead.\\n\\n\";\n  }\n#endif // def DISABLED_LEGACY_ENGINE\n\n  std::vector<std::unique_ptr<TessResultRenderer>> renderers;\n\n  if (in_training_mode) {\n    renderers.push_back(nullptr);\n  } else if (outputbase != nullptr) {\n    PreloadRenderers(api, renderers, pagesegmode, outputbase);\n  }\n\n  if (!renderers.empty()) {\n#ifdef DISABLED_LEGACY_ENGINE\n    if (!osd_warning.empty()) {\n      fprintf(stderr, \"%s\", osd_warning.c_str());\n    }\n#endif\n    bool succeed = api.ProcessPages(image, nullptr, 0, renderers[0].get());\n    if (!succeed) {\n      fprintf(stderr, \"Error during processing.\\n\");\n      ret_val = EXIT_FAILURE;\n    }\n  }\n\n  return ret_val;\n}\n\nint main(int argc, char **argv) {\n  try {\n    return main1(argc, argv);\n  } catch (std::exception &e) {\n    std::cerr << \"exception: \" << e.what() << \"\\n\";\n  } catch (...) {\n    std::cerr << \"unknown exception\\n\";\n  }\n  return 1;\n}\n"
  },
  {
    "path": "src/textord/alignedblob.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        alignedblob.cpp\n// Description: Subclass of BBGrid to find vertically aligned blobs.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"alignedblob.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\nINT_VAR(textord_debug_tabfind, 0, \"Debug tab finding\");\nINT_VAR(textord_debug_bugs, 0, \"Turn on output related to bugs in tab finding\");\nstatic INT_VAR(textord_testregion_left, -1,\n               \"Left edge of debug reporting rectangle in Leptonica coords \"\n               \"(bottom=0/top=height), with horizontal lines x/y-flipped\");\nstatic INT_VAR(textord_testregion_top, INT32_MAX,\n               \"Top edge of debug reporting rectangle in Leptonica coords \"\n               \"(bottom=0/top=height), with horizontal lines x/y-flipped\");\nstatic INT_VAR(textord_testregion_right, INT32_MAX,\n               \"Right edge of debug rectangle in Leptonica coords \"\n               \"(bottom=0/top=height), with horizontal lines x/y-flipped\");\nstatic INT_VAR(textord_testregion_bottom, -1,\n               \"Bottom edge of debug rectangle in Leptonica coords \"\n               \"(bottom=0/top=height), with horizontal lines 
x/y-flipped\");\nBOOL_VAR(textord_debug_printable, false, \"Make debug windows printable\");\n\n// Fraction of resolution used as alignment tolerance for aligned tabs.\nconst double kAlignedFraction = 0.03125;\n// Fraction of resolution used as alignment tolerance for ragged tabs.\nconst double kRaggedFraction = 2.5;\n// Fraction of height used as a minimum gutter gap for aligned blobs.\nconst double kAlignedGapFraction = 0.75;\n// Fraction of height used as a minimum gutter gap for ragged tabs.\nconst double kRaggedGapFraction = 1.0;\n// Constant number of pixels used as alignment tolerance for line finding.\nconst int kVLineAlignment = 3;\n// Constant number of pixels used as gutter gap tolerance for line finding.\nconst int kVLineGutter = 1;\n// Constant number of pixels used as the search size for line finding.\nconst int kVLineSearchSize = 150;\n// Min number of points to accept for a ragged tab stop.\nconst int kMinRaggedTabs = 5;\n// Min number of points to accept for an aligned tab stop.\nconst int kMinAlignedTabs = 4;\n// Constant number of pixels minimum height of a vertical line.\nconst int kVLineMinLength = 300;\n// Minimum gradient for a vertical tab vector. Used to prune away junk\n// tab vectors with what would be a ridiculously large skew angle.\n// Value corresponds to tan(90 - max allowed skew angle)\nconst double kMinTabGradient = 4.0;\n// Tolerance to skew on top of current estimate of skew. Divide x or y length\n// by kMaxSkewFactor to get the y or x skew distance.\n// If the angle is small, the angle in degrees is roughly 60/kMaxSkewFactor.\nconst int kMaxSkewFactor = 15;\n\n// Constructor to set the parameters for finding aligned and ragged tabs.\n// Vertical_x and vertical_y are the current estimates of the true vertical\n// direction (up) in the image. 
Height is the height of the starter blob.\n// v_gap_multiple is the multiple of height that will be used as a limit\n// on vertical gap before giving up and calling the line ended.\n// resolution is the original image resolution, and align0 indicates the\n// type of tab stop to be found.\nAlignedBlobParams::AlignedBlobParams(int vertical_x, int vertical_y, int height, int v_gap_multiple,\n                                     int min_gutter_width, int resolution, TabAlignment align0)\n    : right_tab(align0 == TA_RIGHT_RAGGED || align0 == TA_RIGHT_ALIGNED)\n    , ragged(align0 == TA_LEFT_RAGGED || align0 == TA_RIGHT_RAGGED)\n    , alignment(align0)\n    , confirmed_type(TT_CONFIRMED)\n    , min_length(0) {\n  // Set the tolerances according to the type of line sought.\n  // For tab search, these are based on the image resolution for most, or\n  // the height of the starting blob for the maximum vertical gap.\n  max_v_gap = height * v_gap_multiple;\n  if (ragged) {\n    // In the case of a ragged edge, we are much more generous with the\n    // inside alignment fraction, but also require a much bigger gutter.\n    gutter_fraction = kRaggedGapFraction;\n    if (alignment == TA_RIGHT_RAGGED) {\n      l_align_tolerance = static_cast<int>(resolution * kRaggedFraction + 0.5);\n      r_align_tolerance = static_cast<int>(resolution * kAlignedFraction + 0.5);\n    } else {\n      l_align_tolerance = static_cast<int>(resolution * kAlignedFraction + 0.5);\n      r_align_tolerance = static_cast<int>(resolution * kRaggedFraction + 0.5);\n    }\n    min_points = kMinRaggedTabs;\n  } else {\n    gutter_fraction = kAlignedGapFraction;\n    l_align_tolerance = static_cast<int>(resolution * kAlignedFraction + 0.5);\n    r_align_tolerance = static_cast<int>(resolution * kAlignedFraction + 0.5);\n    min_points = kMinAlignedTabs;\n  }\n  min_gutter = static_cast<int>(height * gutter_fraction + 0.5);\n  if (min_gutter < min_gutter_width) {\n    min_gutter = min_gutter_width;\n  }\n  // 
Fit the vertical vector into an ICOORD, which is 16 bit.\n  set_vertical(vertical_x, vertical_y);\n}\n\n// Constructor to set the parameters for finding vertical lines.\n// Vertical_x and vertical_y are the current estimates of the true vertical\n// direction (up) in the image. Width is the width of the starter blob.\nAlignedBlobParams::AlignedBlobParams(int vertical_x, int vertical_y, int width)\n    : gutter_fraction(0.0)\n    , right_tab(false)\n    , ragged(false)\n    , alignment(TA_SEPARATOR)\n    , confirmed_type(TT_VLINE)\n    , max_v_gap(kVLineSearchSize)\n    , min_gutter(kVLineGutter)\n    , min_points(1)\n    , min_length(kVLineMinLength) {\n  // Compute threshold for left and right alignment.\n  l_align_tolerance = std::max(kVLineAlignment, width);\n  r_align_tolerance = std::max(kVLineAlignment, width);\n\n  // Fit the vertical vector into an ICOORD, which is 16 bit.\n  set_vertical(vertical_x, vertical_y);\n}\n\n// Fit the vertical vector into an ICOORD, which is 16 bit.\nvoid AlignedBlobParams::set_vertical(int vertical_x, int vertical_y) {\n  int factor = 1;\n  if (vertical_y > INT16_MAX) {\n    factor = vertical_y / INT16_MAX + 1;\n  }\n  vertical.set_x(vertical_x / factor);\n  vertical.set_y(vertical_y / factor);\n}\n\nAlignedBlob::AlignedBlob(int gridsize, const ICOORD &bleft, const ICOORD &tright)\n    : BlobGrid(gridsize, bleft, tright) {}\n\n// Return true if the given coordinates are within the test rectangle\n// and the debug level is at least the given detail level.\nbool AlignedBlob::WithinTestRegion(int detail_level, int x, int y) {\n  if (textord_debug_tabfind < detail_level) {\n    return false;\n  }\n  return x >= textord_testregion_left && x <= textord_testregion_right &&\n         y <= textord_testregion_top && y >= textord_testregion_bottom;\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Display the tab codes of the BLOBNBOXes in this grid.\nScrollView *AlignedBlob::DisplayTabs(const char *window_name, ScrollView *tab_win) {\n  if (tab_win 
== nullptr) {\n    tab_win = MakeWindow(0, 50, window_name);\n  }\n  // For every tab in the grid, display it.\n  BlobGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  BLOBNBOX *bbox;\n  while ((bbox = gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &box = bbox->bounding_box();\n    int left_x = box.left();\n    int right_x = box.right();\n    int top_y = box.top();\n    int bottom_y = box.bottom();\n    TabType tabtype = bbox->left_tab_type();\n    if (tabtype != TT_NONE) {\n      if (tabtype == TT_MAYBE_ALIGNED) {\n        tab_win->Pen(ScrollView::BLUE);\n      } else if (tabtype == TT_MAYBE_RAGGED) {\n        tab_win->Pen(ScrollView::YELLOW);\n      } else if (tabtype == TT_CONFIRMED) {\n        tab_win->Pen(ScrollView::GREEN);\n      } else {\n        tab_win->Pen(ScrollView::GREY);\n      }\n      tab_win->Line(left_x, top_y, left_x, bottom_y);\n    }\n    tabtype = bbox->right_tab_type();\n    if (tabtype != TT_NONE) {\n      if (tabtype == TT_MAYBE_ALIGNED) {\n        tab_win->Pen(ScrollView::MAGENTA);\n      } else if (tabtype == TT_MAYBE_RAGGED) {\n        tab_win->Pen(ScrollView::ORANGE);\n      } else if (tabtype == TT_CONFIRMED) {\n        tab_win->Pen(ScrollView::RED);\n      } else {\n        tab_win->Pen(ScrollView::GREY);\n      }\n      tab_win->Line(right_x, top_y, right_x, bottom_y);\n    }\n  }\n  tab_win->Update();\n  return tab_win;\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Helper returns true if the total number of line_crossings of all the blobs\n// in the list is at least 2.\nstatic bool AtLeast2LineCrossings(BLOBNBOX_CLIST *blobs) {\n  BLOBNBOX_C_IT it(blobs);\n  int total_crossings = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    total_crossings += it.data()->line_crossings();\n  }\n  return total_crossings >= 2;\n}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nAlignedBlob::~AlignedBlob() = default;\n\n// 
Finds a vector corresponding to a set of vertically aligned blob edges\n// running through the given box. The type of vector returned and the\n// search parameters are determined by the AlignedBlobParams.\n// vertical_x and y are updated with an estimate of the real\n// vertical direction. (skew finding.)\n// Returns nullptr if no decent vector can be found.\nTabVector *AlignedBlob::FindVerticalAlignment(AlignedBlobParams align_params, BLOBNBOX *bbox,\n                                              int *vertical_x, int *vertical_y) {\n  int ext_start_y, ext_end_y;\n  BLOBNBOX_CLIST good_points;\n  // Search up and then down from the starting bbox.\n  TBOX box = bbox->bounding_box();\n  bool debug = WithinTestRegion(2, box.left(), box.bottom());\n  int pt_count = AlignTabs(align_params, false, bbox, &good_points, &ext_end_y);\n  pt_count += AlignTabs(align_params, true, bbox, &good_points, &ext_start_y);\n  BLOBNBOX_C_IT it(&good_points);\n  it.move_to_last();\n  box = it.data()->bounding_box();\n  int end_y = box.top();\n  int end_x = align_params.right_tab ? box.right() : box.left();\n  it.move_to_first();\n  box = it.data()->bounding_box();\n  int start_x = align_params.right_tab ? 
box.right() : box.left();\n  int start_y = box.bottom();\n  // Acceptable tab vectors must have a minimum number of points,\n  // have a minimum acceptable length, and have a minimum gradient.\n  // The gradient corresponds to the skew angle.\n  // Ragged tabs don't need to satisfy the gradient condition, as they\n  // will always end up parallel to the vertical direction.\n  bool at_least_2_crossings = AtLeast2LineCrossings(&good_points);\n  if ((pt_count >= align_params.min_points && end_y - start_y >= align_params.min_length &&\n       (align_params.ragged || end_y - start_y >= abs(end_x - start_x) * kMinTabGradient)) ||\n      at_least_2_crossings) {\n    int confirmed_points = 0;\n    // Count existing confirmed points to see if vector is acceptable.\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      bbox = it.data();\n      if (align_params.right_tab) {\n        if (bbox->right_tab_type() == align_params.confirmed_type) {\n          ++confirmed_points;\n        }\n      } else {\n        if (bbox->left_tab_type() == align_params.confirmed_type) {\n          ++confirmed_points;\n        }\n      }\n    }\n    // Ragged vectors are not allowed to use too many already used points.\n    if (!align_params.ragged || confirmed_points + confirmed_points < pt_count) {\n      const TBOX &box = bbox->bounding_box();\n      if (debug) {\n        tprintf(\"Confirming tab vector of %d pts starting at %d,%d\\n\", pt_count, box.left(),\n                box.bottom());\n      }\n      // Flag all the aligned neighbours as confirmed .\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        bbox = it.data();\n        if (align_params.right_tab) {\n          bbox->set_right_tab_type(align_params.confirmed_type);\n        } else {\n          bbox->set_left_tab_type(align_params.confirmed_type);\n        }\n        if (debug) {\n          bbox->bounding_box().print();\n        }\n      }\n      // Now make the vector and return it.\n      
TabVector *result =\n          TabVector::FitVector(align_params.alignment, align_params.vertical, ext_start_y,\n                               ext_end_y, &good_points, vertical_x, vertical_y);\n      result->set_intersects_other_lines(at_least_2_crossings);\n      if (debug) {\n        tprintf(\"Box was %d, %d\\n\", box.left(), box.bottom());\n        result->Print(\"After fitting\");\n      }\n      return result;\n    } else if (debug) {\n      tprintf(\"Ragged tab used too many used points: %d out of %d\\n\", confirmed_points, pt_count);\n    }\n  } else if (debug) {\n    tprintf(\n        \"Tab vector failed basic tests: pt count %d vs min %d, \"\n        \"length %d vs min %d, min grad %g\\n\",\n        pt_count, align_params.min_points, end_y - start_y, align_params.min_length,\n        abs(end_x - start_x) * kMinTabGradient);\n  }\n  return nullptr;\n}\n\n// Find a set of blobs that are aligned in the given vertical\n// direction with the given blob. Returns a list of aligned\n// blobs and the number in the list.\n// For other parameters see FindAlignedBlob below.\nint AlignedBlob::AlignTabs(const AlignedBlobParams &params, bool top_to_bottom, BLOBNBOX *bbox,\n                           BLOBNBOX_CLIST *good_points, int *end_y) {\n  int ptcount = 0;\n  BLOBNBOX_C_IT it(good_points);\n\n  TBOX box = bbox->bounding_box();\n  bool debug = WithinTestRegion(2, box.left(), box.bottom());\n  if (debug) {\n    tprintf(\"Starting alignment run at blob:\");\n    box.print();\n  }\n  int x_start = params.right_tab ? box.right() : box.left();\n  while (bbox != nullptr) {\n    // Add the blob to the list if the appropriate side is a tab candidate,\n    // or if we are working on a ragged tab.\n    TabType type = params.right_tab ? 
bbox->right_tab_type() : bbox->left_tab_type();\n    if (((type != TT_NONE && type != TT_MAYBE_RAGGED) || params.ragged) &&\n        (it.empty() || it.data() != bbox)) {\n      if (top_to_bottom) {\n        it.add_before_then_move(bbox);\n      } else {\n        it.add_after_then_move(bbox);\n      }\n      ++ptcount;\n    }\n    // Find the next blob that is aligned with the current one.\n    // FindAlignedBlob guarantees that forward progress will be made in the\n    // top_to_bottom direction, and therefore eventually it will return nullptr,\n    // making this while (bbox != nullptr) loop safe.\n    bbox = FindAlignedBlob(params, top_to_bottom, bbox, x_start, end_y);\n    if (bbox != nullptr) {\n      box = bbox->bounding_box();\n      if (!params.ragged) {\n        x_start = params.right_tab ? box.right() : box.left();\n      }\n    }\n  }\n  if (debug) {\n    tprintf(\"Alignment run ended with %d pts at blob:\", ptcount);\n    box.print();\n  }\n  return ptcount;\n}\n\n// Search vertically for a blob that is aligned with the input bbox.\n// The search parameters are determined by AlignedBlobParams.\n// top_to_bottom tells whether to search down or up.\n// The return value is nullptr if nothing was found in the search box\n// or if a blob was found in the gutter. On a nullptr return, end_y\n// is set to the edge of the search box or the leading edge of the\n// gutter blob if one was found.\nBLOBNBOX *AlignedBlob::FindAlignedBlob(const AlignedBlobParams &p, bool top_to_bottom,\n                                       BLOBNBOX *bbox, int x_start, int *end_y) {\n  TBOX box = bbox->bounding_box();\n  // If there are separator lines, get the column edges.\n  int left_column_edge = bbox->left_rule();\n  int right_column_edge = bbox->right_rule();\n  // start_y is used to guarantee that forward progress is made and the\n  // search does not go into an infinite loop. New blobs must extend the\n  // line beyond start_y.\n  int start_y = top_to_bottom ? 
box.bottom() : box.top();\n  if (WithinTestRegion(2, x_start, start_y)) {\n    tprintf(\"Column edges for blob at (%d,%d)->(%d,%d) are [%d, %d]\\n\", box.left(), box.top(),\n            box.right(), box.bottom(), left_column_edge, right_column_edge);\n  }\n  // Compute skew tolerance.\n  int skew_tolerance = p.max_v_gap / kMaxSkewFactor;\n  // Calculate xmin and xmax of the search box so that it contains\n  // all possibly relevant boxes up to p.max_v_gap above or below according\n  // to top_to_bottom.\n  // Start with a notion of vertical with the current estimate.\n  int x2 = (p.max_v_gap * p.vertical.x() + p.vertical.y() / 2) / p.vertical.y();\n  if (top_to_bottom) {\n    x2 = x_start - x2;\n    *end_y = start_y - p.max_v_gap;\n  } else {\n    x2 = x_start + x2;\n    *end_y = start_y + p.max_v_gap;\n  }\n  // Expand the box by an additional skew tolerance\n  int xmin = std::min(x_start, x2) - skew_tolerance;\n  int xmax = std::max(x_start, x2) + skew_tolerance;\n  // Now add direction-specific tolerances.\n  if (p.right_tab) {\n    xmax += p.min_gutter;\n    xmin -= p.l_align_tolerance;\n  } else {\n    xmax += p.r_align_tolerance;\n    xmin -= p.min_gutter;\n  }\n  // Setup a vertical search for an aligned blob.\n  BlobGridSearch vsearch(this);\n  if (WithinTestRegion(2, x_start, start_y)) {\n    tprintf(\"Starting %s %s search at %d-%d,%d, search_size=%d, gutter=%d\\n\",\n            p.ragged ? \"Ragged\" : \"Aligned\", p.right_tab ? 
\"Right\" : \"Left\", xmin, xmax, start_y,\n            p.max_v_gap, p.min_gutter);\n  }\n  vsearch.StartVerticalSearch(xmin, xmax, start_y);\n  // result stores the best real return value.\n  BLOBNBOX *result = nullptr;\n  // The backup_result is not a tab candidate and can be used if no\n  // real tab candidate result is found.\n  BLOBNBOX *backup_result = nullptr;\n  // neighbour is the blob that is currently being investigated.\n  BLOBNBOX *neighbour = nullptr;\n  while ((neighbour = vsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {\n    if (neighbour == bbox) {\n      continue;\n    }\n    TBOX nbox = neighbour->bounding_box();\n    int n_y = (nbox.top() + nbox.bottom()) / 2;\n    if ((!top_to_bottom && n_y > start_y + p.max_v_gap) ||\n        (top_to_bottom && n_y < start_y - p.max_v_gap)) {\n      if (WithinTestRegion(2, x_start, start_y)) {\n        tprintf(\"Neighbour too far at (%d,%d)->(%d,%d)\\n\", nbox.left(), nbox.bottom(), nbox.right(),\n                nbox.top());\n      }\n      break; // Gone far enough.\n    }\n    // It is CRITICAL to ensure that forward progress is made, (strictly\n    // in/decreasing n_y) or the caller could loop infinitely, while\n    // waiting for a sequence of blobs in a line to end.\n    // NextVerticalSearch alone does not guarantee this, as there may be\n    // more than one blob in a grid cell. 
See comment in AlignTabs.\n    if ((n_y < start_y) != top_to_bottom || nbox.y_overlap(box)) {\n      continue; // Only look in the required direction.\n    }\n    if (result != nullptr && result->bounding_box().y_gap(nbox) > gridsize()) {\n      return result; // This result is clear.\n    }\n    if (backup_result != nullptr && p.ragged && result == nullptr &&\n        backup_result->bounding_box().y_gap(nbox) > gridsize()) {\n      return backup_result; // This result is clear.\n    }\n\n    // If the neighbouring blob is the wrong side of a separator line, then it\n    // \"doesn't exist\" as far as we are concerned.\n    int x_at_n_y = x_start + (n_y - start_y) * p.vertical.x() / p.vertical.y();\n    if (x_at_n_y < neighbour->left_crossing_rule() || x_at_n_y > neighbour->right_crossing_rule()) {\n      continue; // Separator line in the way.\n    }\n    int n_left = nbox.left();\n    int n_right = nbox.right();\n    int n_x = p.right_tab ? n_right : n_left;\n    if (WithinTestRegion(2, x_start, start_y)) {\n      tprintf(\"neighbour at (%d,%d)->(%d,%d), n_x=%d, n_y=%d, xatn=%d\\n\", nbox.left(),\n              nbox.bottom(), nbox.right(), nbox.top(), n_x, n_y, x_at_n_y);\n    }\n    if (p.right_tab && n_left < x_at_n_y + p.min_gutter &&\n        n_right > x_at_n_y + p.r_align_tolerance &&\n        (p.ragged || n_left < x_at_n_y + p.gutter_fraction * nbox.height())) {\n      // In the gutter so end of line.\n      if (bbox->right_tab_type() >= TT_MAYBE_ALIGNED) {\n        bbox->set_right_tab_type(TT_DELETED);\n      }\n      *end_y = top_to_bottom ? 
nbox.top() : nbox.bottom();\n      if (WithinTestRegion(2, x_start, start_y)) {\n        tprintf(\"gutter\\n\");\n      }\n      return nullptr;\n    }\n    if (!p.right_tab && n_left < x_at_n_y - p.l_align_tolerance &&\n        n_right > x_at_n_y - p.min_gutter &&\n        (p.ragged || n_right > x_at_n_y - p.gutter_fraction * nbox.height())) {\n      // In the gutter so end of line.\n      if (bbox->left_tab_type() >= TT_MAYBE_ALIGNED) {\n        bbox->set_left_tab_type(TT_DELETED);\n      }\n      *end_y = top_to_bottom ? nbox.top() : nbox.bottom();\n      if (WithinTestRegion(2, x_start, start_y)) {\n        tprintf(\"gutter\\n\");\n      }\n      return nullptr;\n    }\n    if ((p.right_tab && neighbour->leader_on_right()) ||\n        (!p.right_tab && neighbour->leader_on_left())) {\n      continue; // Neighbours of leaders are not allowed to be used.\n    }\n    if (n_x <= x_at_n_y + p.r_align_tolerance && n_x >= x_at_n_y - p.l_align_tolerance) {\n      // Aligned so keep it. If it is a marked tab save it as result,\n      // otherwise keep it as backup_result to return in case of later failure.\n      if (WithinTestRegion(2, x_start, start_y)) {\n        tprintf(\"aligned, seeking%d, l=%d, r=%d\\n\", p.right_tab, neighbour->left_tab_type(),\n                neighbour->right_tab_type());\n      }\n      TabType n_type = p.right_tab ? neighbour->right_tab_type() : neighbour->left_tab_type();\n      if (n_type != TT_NONE && (p.ragged || n_type != TT_MAYBE_RAGGED)) {\n        if (result == nullptr) {\n          result = neighbour;\n        } else {\n          // Keep the closest neighbour by Euclidean distance.\n          // This prevents it from picking a tab blob in another column.\n          const TBOX &old_box = result->bounding_box();\n          int x_diff = p.right_tab ? 
old_box.right() : old_box.left();\n          x_diff -= x_at_n_y;\n          int y_diff = (old_box.top() + old_box.bottom()) / 2 - start_y;\n          int old_dist = x_diff * x_diff + y_diff * y_diff;\n          x_diff = n_x - x_at_n_y;\n          y_diff = n_y - start_y;\n          int new_dist = x_diff * x_diff + y_diff * y_diff;\n          if (new_dist < old_dist) {\n            result = neighbour;\n          }\n        }\n      } else if (backup_result == nullptr) {\n        if (WithinTestRegion(2, x_start, start_y)) {\n          tprintf(\"Backup\\n\");\n        }\n        backup_result = neighbour;\n      } else {\n        TBOX backup_box = backup_result->bounding_box();\n        if ((p.right_tab && backup_box.right() < nbox.right()) ||\n            (!p.right_tab && backup_box.left() > nbox.left())) {\n          if (WithinTestRegion(2, x_start, start_y)) {\n            tprintf(\"Better backup\\n\");\n          }\n          backup_result = neighbour;\n        }\n      }\n    }\n  }\n  return result != nullptr ? result : backup_result;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/alignedblob.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        alignedblob.h\n// Description: A class to find vertically aligned blobs in a BBGrid,\n//              and a struct to hold control parameters.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_ALIGNEDBLOB_H_\n#define TESSERACT_TEXTORD_ALIGNEDBLOB_H_\n\n#include \"bbgrid.h\"\n#include \"blobbox.h\"\n#include \"tabvector.h\"\n\nnamespace tesseract {\n\nextern INT_VAR_H(textord_debug_bugs);\nextern INT_VAR_H(textord_debug_tabfind);\nextern BOOL_VAR_H(textord_debug_printable);\n\n// Simple structure to hold the search parameters for AlignedBlob.\n// The members are mostly derived from constants, which are\n// conditioned on the alignment parameter.\n// For finding vertical lines, a different set of constants are\n// used, conditioned on the different constructor.\nstruct AlignedBlobParams {\n  // Constructor to set the parameters for finding aligned and ragged tabs.\n  // Vertical_x and vertical_y are the current estimates of the true vertical\n  // direction (up) in the image. 
Height is the height of the starter blob.\n  // v_gap_multiple is the multiple of height that will be used as a limit\n  // on vertical gap before giving up and calling the line ended.\n  // resolution is the original image resolution, and align0 indicates the\n  // type of tab stop to be found.\n  AlignedBlobParams(int vertical_x, int vertical_y, int height, int v_gap_multiple,\n                    int min_gutter_width, int resolution, TabAlignment alignment0);\n  // Constructor to set the parameters for finding vertical lines.\n  // Vertical_x and vertical_y are the current estimates of the true vertical\n  // direction (up) in the image. Width is the width of the starter blob.\n  AlignedBlobParams(int vertical_x, int vertical_y, int width);\n\n  // Fit the vertical vector into an ICOORD, which is 16 bit.\n  void set_vertical(int vertical_x, int vertical_y);\n\n  double gutter_fraction; // Multiple of height used for min_gutter.\n  bool right_tab;         // We are looking at right edges.\n  bool ragged;            // We are looking for a ragged (vs aligned) edge.\n  TabAlignment alignment; // The type we are trying to produce.\n  TabType confirmed_type; // Type to flag blobs if accepted.\n  int max_v_gap;          // Max vertical gap to be tolerated.\n  int min_gutter;         // Minimum gutter between columns.\n  // Tolerances allowed on horizontal alignment of aligned edges.\n  int l_align_tolerance; // Left edges.\n  int r_align_tolerance; // Right edges.\n  // Conditions for accepting a line.\n  int min_points; // Minimum number of points to be OK.\n  int min_length; // Min length of completed line.\n\n  ICOORD vertical; // Current estimate of logical vertical.\n};\n\n// The AlignedBlob class contains code to find vertically aligned blobs.\n// This is factored out into a separate class, so it can be used by both\n// vertical line finding (LineFind) and tabstop finding (TabFind).\nclass TESS_API AlignedBlob : public BlobGrid {\npublic:\n  AlignedBlob(int 
gridsize, const ICOORD &bleft, const ICOORD &tright);\n  ~AlignedBlob() override;\n\n  // Return true if the given coordinates are within the test rectangle\n  // and the debug level is at least the given detail level.\n  static bool WithinTestRegion(int detail_level, int x, int y);\n\n  // Display the tab codes of the BLOBNBOXes in this grid.\n  ScrollView *DisplayTabs(const char *window_name, ScrollView *tab_win);\n\n  // Finds a vector corresponding to a set of vertically aligned blob edges\n  // running through the given box. The type of vector returned and the\n  // search parameters are determined by the AlignedBlobParams.\n  // vertical_x and y are updated with an estimate of the real\n  // vertical direction. (skew finding.)\n  // Returns nullptr if no decent vector can be found.\n  TabVector *FindVerticalAlignment(AlignedBlobParams align_params, BLOBNBOX *bbox, int *vertical_x,\n                                   int *vertical_y);\n\nprivate:\n  // Find a set of blobs that are aligned in the given vertical\n  // direction with the given blob. Returns a list of aligned\n  // blobs and the number in the list.\n  // For other parameters see FindAlignedBlob below.\n  int AlignTabs(const AlignedBlobParams &params, bool top_to_bottom, BLOBNBOX *bbox,\n                BLOBNBOX_CLIST *good_points, int *end_y);\n\n  // Search vertically for a blob that is aligned with the input bbox.\n  // The search parameters are determined by AlignedBlobParams.\n  // top_to_bottom tells whether to search down or up.\n  // The return value is nullptr if nothing was found in the search box\n  // or if a blob was found in the gutter. 
On a nullptr return, end_y\n  // is set to the edge of the search box or the leading edge of the\n  // gutter blob if one was found.\n  BLOBNBOX *FindAlignedBlob(const AlignedBlobParams &p, bool top_to_bottom, BLOBNBOX *bbox,\n                            int x_start, int *end_y);\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_ALIGNEDBLOB_H_\n"
  },
  {
    "path": "src/textord/baselinedetect.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        baselinedetect.cpp\n// Description: Initial Baseline Determination.\n// Copyright 2012 Google Inc. All Rights Reserved.\n// Author:      rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#define _USE_MATH_DEFINES // for M_PI\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"baselinedetect.h\"\n\n#include <allheaders.h>\n#include <algorithm>\n#include <cfloat> // for FLT_MAX\n#include <cmath>  // for M_PI\n#include \"blobbox.h\"\n#include \"detlinefit.h\"\n#include \"drawtord.h\"\n#include \"helpers.h\"\n#include \"linlsq.h\"\n#include \"makerow.h\"\n#include \"tesserrstream.h\"  // for tesserr\n#include \"textord.h\"\n#include \"tprintf.h\"\n#include \"underlin.h\"\n\n// Number of displacement modes kept in displacement_modes_;\nconst int kMaxDisplacementsModes = 3;\n// Number of points to skip when retrying initial fit.\nconst int kNumSkipPoints = 3;\n// Max angle deviation (in radians) allowed to keep the independent baseline.\nconst double kMaxSkewDeviation = 1.0 / 64;\n// Fraction of line spacing estimate for quantization of blob displacements.\nconst double kOffsetQuantizationFactor = 3.0 / 64;\n// Fraction of line spacing estimate for computing blob fit error.\nconst double kFitHalfrangeFactor = 6.0 / 64;\n// Max fraction 
of line spacing allowed before a baseline counts as badly\n// fitting.\nconst double kMaxBaselineError = 3.0 / 64;\n// Multiple of linespacing that sets max_blob_size in TO_BLOCK.\n// Copied from textord_excess_blobsize.\nconst double kMaxBlobSizeMultiple = 1.3;\n// Min fraction of linespacing gaps that should be close to the model before\n// we will force the linespacing model on all the lines.\nconst double kMinFittingLinespacings = 0.25;\n// A y-coordinate within a textline that is to be debugged.\n//#define kDebugYCoord 1525\n\nnamespace tesseract {\n\nBaselineRow::BaselineRow(double line_spacing, TO_ROW *to_row)\n    : blobs_(to_row->blob_list()),\n      baseline_pt1_(0.0f, 0.0f),\n      baseline_pt2_(0.0f, 0.0f),\n      baseline_error_(0.0),\n      good_baseline_(false) {\n  ComputeBoundingBox();\n  // Compute a scale factor for rounding to ints.\n  disp_quant_factor_ = kOffsetQuantizationFactor * line_spacing;\n  fit_halfrange_ = kFitHalfrangeFactor * line_spacing;\n  max_baseline_error_ = kMaxBaselineError * line_spacing;\n}\n\n// Sets the TO_ROW with the output straight line.\nvoid BaselineRow::SetupOldLineParameters(TO_ROW *row) const {\n  // TODO(rays) get rid of this when m and c are no longer used.\n  double gradient = tan(BaselineAngle());\n  // para_c is the actual intercept of the baseline on the y-axis.\n  float para_c = StraightYAtX(0.0);\n  row->set_line(gradient, para_c, baseline_error_);\n  row->set_parallel_line(gradient, para_c, baseline_error_);\n}\n\n// Outputs diagnostic information.\nvoid BaselineRow::Print() const {\n  tprintf(\"Baseline (%g,%g)->(%g,%g), angle=%g, intercept=%g\\n\",\n          baseline_pt1_.x(), baseline_pt1_.y(), baseline_pt2_.x(),\n          baseline_pt2_.y(), BaselineAngle(), StraightYAtX(0.0));\n  tprintf(\"Quant factor=%g, error=%g, good=%d, box:\", disp_quant_factor_,\n          baseline_error_, good_baseline_);\n  bounding_box_.print();\n}\n\n// Returns the skew angle (in radians) of the current baseline in 
[-pi,pi].\ndouble BaselineRow::BaselineAngle() const {\n  FCOORD baseline_dir(baseline_pt2_ - baseline_pt1_);\n  double angle = baseline_dir.angle();\n  // Baseline directions are only unique in a range of pi so constrain to\n  // [-pi/2, pi/2].\n  return fmod(angle + M_PI * 1.5, M_PI) - M_PI * 0.5;\n}\n\n// Computes and returns the linespacing at the middle of the overlap\n// between this and other.\ndouble BaselineRow::SpaceBetween(const BaselineRow &other) const {\n  // Find the x-centre of overlap of the lines.\n  float x = (std::max(bounding_box_.left(), other.bounding_box_.left()) +\n             std::min(bounding_box_.right(), other.bounding_box_.right())) /\n            2.0f;\n  // Find the vertical centre between them.\n  float y = (StraightYAtX(x) + other.StraightYAtX(x)) / 2.0f;\n  // Find the perpendicular distance of (x,y) from each line.\n  FCOORD pt(x, y);\n  return PerpDistanceFromBaseline(pt) + other.PerpDistanceFromBaseline(pt);\n}\n\n// Computes and returns the displacement of the center of the line\n// perpendicular to the given direction.\ndouble BaselineRow::PerpDisp(const FCOORD &direction) const {\n  float middle_x = (bounding_box_.left() + bounding_box_.right()) / 2.0f;\n  FCOORD middle_pos(middle_x, StraightYAtX(middle_x));\n  return direction * middle_pos / direction.length();\n}\n\n// Computes the y coordinate at the given x using the straight baseline\n// defined by baseline_pt1_ and baseline_pt2__.\ndouble BaselineRow::StraightYAtX(double x) const {\n  double denominator = baseline_pt2_.x() - baseline_pt1_.x();\n  if (denominator == 0.0) {\n    return (baseline_pt1_.y() + baseline_pt2_.y()) / 2.0;\n  }\n  return baseline_pt1_.y() + (x - baseline_pt1_.x()) *\n                                 (baseline_pt2_.y() - baseline_pt1_.y()) /\n                                 denominator;\n}\n\n// Fits a straight baseline to the points. 
Returns true if it had enough\n// points to be reasonably sure of the fitted baseline.\n// If use_box_bottoms is false, baselines positions are formed by\n// considering the outlines of the blobs.\nbool BaselineRow::FitBaseline(bool use_box_bottoms) {\n  // Deterministic fitting is used wherever possible.\n  fitter_.Clear();\n  // Linear least squares is a backup if the DetLineFit produces a bad line.\n  LLSQ llsq;\n  BLOBNBOX_IT blob_it(blobs_);\n\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    if (!use_box_bottoms) {\n      blob->EstimateBaselinePosition();\n    }\n    const TBOX &box = blob->bounding_box();\n    int x_middle = (box.left() + box.right()) / 2;\n#ifdef kDebugYCoord\n    if (box.bottom() < kDebugYCoord && box.top() > kDebugYCoord) {\n      tprintf(\"Box bottom = %d, baseline pos=%d for box at:\", box.bottom(),\n              blob->baseline_position());\n      box.print();\n    }\n#endif\n    fitter_.Add(ICOORD(x_middle, blob->baseline_position()), box.width() / 2);\n    llsq.add(x_middle, blob->baseline_position());\n  }\n  // Fit the line.\n  ICOORD pt1, pt2;\n  baseline_error_ = fitter_.Fit(&pt1, &pt2);\n  baseline_pt1_ = pt1;\n  baseline_pt2_ = pt2;\n  if (baseline_error_ > max_baseline_error_ &&\n      fitter_.SufficientPointsForIndependentFit()) {\n    // The fit was bad but there were plenty of points, so try skipping\n    // the first and last few, and use the new line if it dramatically improves\n    // the error of fit.\n    double error = fitter_.Fit(kNumSkipPoints, kNumSkipPoints, &pt1, &pt2);\n    if (error < baseline_error_ / 2.0) {\n      baseline_error_ = error;\n      baseline_pt1_ = pt1;\n      baseline_pt2_ = pt2;\n    }\n  }\n  int debug = 0;\n#ifdef kDebugYCoord\n  Print();\n  debug = bounding_box_.bottom() < kDebugYCoord &&\n                  bounding_box_.top() > kDebugYCoord\n              ? 
3\n              : 2;\n#endif\n  // Now we obtained a direction from that fit, see if we can improve the\n  // fit using the same direction and some other start point.\n  FCOORD direction(pt2 - pt1);\n  double target_offset = direction * pt1;\n  good_baseline_ = false;\n  FitConstrainedIfBetter(debug, direction, 0.0, target_offset);\n  // Wild lines can be produced because DetLineFit allows vertical lines, but\n  // vertical text has been rotated so angles over pi/4 should be disallowed.\n  // Near vertical lines can still be produced by vertically aligned components\n  // on very short lines.\n  double angle = BaselineAngle();\n  if (fabs(angle) > M_PI * 0.25) {\n    // Use the llsq fit as a backup.\n    baseline_pt1_ = llsq.mean_point();\n    baseline_pt2_ = baseline_pt1_ + FCOORD(1.0f, llsq.m());\n    // TODO(rays) get rid of this when m and c are no longer used.\n    double m = llsq.m();\n    double c = llsq.c(m);\n    baseline_error_ = llsq.rms(m, c);\n    good_baseline_ = false;\n  }\n  return good_baseline_;\n}\n\n// Modifies an existing result of FitBaseline to be parallel to the given\n// direction vector if that produces a better result.\nvoid BaselineRow::AdjustBaselineToParallel(int debug, const FCOORD &direction) {\n  SetupBlobDisplacements(direction);\n  if (displacement_modes_.empty()) {\n    return;\n  }\n#ifdef kDebugYCoord\n  if (bounding_box_.bottom() < kDebugYCoord &&\n      bounding_box_.top() > kDebugYCoord && debug < 3)\n    debug = 3;\n#endif\n  FitConstrainedIfBetter(debug, direction, 0.0, displacement_modes_[0]);\n}\n\n// Modifies the baseline to snap to the textline grid if the existing\n// result is not good enough.\ndouble BaselineRow::AdjustBaselineToGrid(int debug, const FCOORD &direction,\n                                         double line_spacing,\n                                         double line_offset) {\n  if (blobs_->empty()) {\n    if (debug > 1) {\n      tprintf(\"Row empty at:\");\n      bounding_box_.print();\n    }\n  
  return line_offset;\n  }\n  // Find the displacement_modes_ entry nearest to the grid.\n  double best_error = 0.0;\n  int best_index = -1;\n  for (unsigned i = 0; i < displacement_modes_.size(); ++i) {\n    double blob_y = displacement_modes_[i];\n    double error =\n        BaselineBlock::SpacingModelError(blob_y, line_spacing, line_offset);\n    if (debug > 1) {\n      tprintf(\"Mode at %g has error %g from model \\n\", blob_y, error);\n    }\n    if (best_index < 0 || error < best_error) {\n      best_error = error;\n      best_index = i;\n    }\n  }\n  // We will move the baseline only if the chosen mode is close enough to the\n  // model.\n  double model_margin = max_baseline_error_ - best_error;\n  if (best_index >= 0 && model_margin > 0.0) {\n    // But if the current baseline is already close to the mode there is no\n    // point, and only the potential to damage accuracy by changing its angle.\n    double perp_disp = PerpDisp(direction);\n    double shift = displacement_modes_[best_index] - perp_disp;\n    if (fabs(shift) > max_baseline_error_) {\n      if (debug > 1) {\n        tprintf(\"Attempting linespacing model fit with mode %g to row at:\",\n                displacement_modes_[best_index]);\n        bounding_box_.print();\n      }\n      FitConstrainedIfBetter(debug, direction, model_margin,\n                             displacement_modes_[best_index]);\n    } else if (debug > 1) {\n      tprintf(\"Linespacing model only moves current line by %g for row at:\",\n              shift);\n      bounding_box_.print();\n    }\n  } else if (debug > 1) {\n    tprintf(\"Linespacing model not close enough to any mode for row at:\");\n    bounding_box_.print();\n  }\n  return fmod(PerpDisp(direction), line_spacing);\n}\n\n// Sets up displacement_modes_ with the top few modes of the perpendicular\n// distance of each blob from the given direction vector, after rounding.\nvoid BaselineRow::SetupBlobDisplacements(const FCOORD &direction) {\n  // Set of 
perpendicular displacements of the blob bottoms from the required\n  // baseline direction.\n  std::vector<double> perp_blob_dists;\n  displacement_modes_.clear();\n  // Gather the skew-corrected position of every blob.\n  double min_dist = FLT_MAX;\n  double max_dist = -FLT_MAX;\n  BLOBNBOX_IT blob_it(blobs_);\n#ifdef kDebugYCoord\n  bool debug = false;\n#endif\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    const TBOX &box = blob->bounding_box();\n#ifdef kDebugYCoord\n    if (box.bottom() < kDebugYCoord && box.top() > kDebugYCoord)\n      debug = true;\n#endif\n    FCOORD blob_pos((box.left() + box.right()) / 2.0f,\n                    blob->baseline_position());\n    double offset = direction * blob_pos;\n    perp_blob_dists.push_back(offset);\n#ifdef kDebugYCoord\n    if (debug) {\n      tprintf(\"Displacement %g for blob at:\", offset);\n      box.print();\n    }\n#endif\n    UpdateRange(offset, &min_dist, &max_dist);\n  }\n  // Set up a histogram using disp_quant_factor_ as the bucket size.\n  STATS dist_stats(IntCastRounded(min_dist / disp_quant_factor_),\n                   IntCastRounded(max_dist / disp_quant_factor_));\n  for (double perp_blob_dist : perp_blob_dists) {\n    dist_stats.add(IntCastRounded(perp_blob_dist / disp_quant_factor_), 1);\n  }\n  std::vector<KDPairInc<float, int>> scaled_modes;\n  dist_stats.top_n_modes(kMaxDisplacementsModes, scaled_modes);\n#ifdef kDebugYCoord\n  if (debug) {\n    for (int i = 0; i < scaled_modes.size(); ++i) {\n      tprintf(\"Top mode = %g * %d\\n\", scaled_modes[i].key * disp_quant_factor_,\n              scaled_modes[i].data());\n    }\n  }\n#endif\n  for (auto &scaled_mode : scaled_modes) {\n    displacement_modes_.push_back(disp_quant_factor_ * scaled_mode.key());\n  }\n}\n\n// Fits a line in the given direction to blobs that are close to the given\n// target_offset perpendicular displacement from the direction. 
The fit\n// error is allowed to be cheat_allowance worse than the existing fit, and\n// will still be used.\n// If cheat_allowance > 0, the new fit will be good and replace the current\n// fit if it has better fit (with cheat) OR its error is below\n// max_baseline_error_ and the old fit is marked bad.\n// Otherwise the new fit will only replace the old if it is really better,\n// or the old fit is marked bad and the new fit has sufficient points, as\n// well as being within the max_baseline_error_.\nvoid BaselineRow::FitConstrainedIfBetter(int debug, const FCOORD &direction,\n                                         double cheat_allowance,\n                                         double target_offset) {\n  double halfrange = fit_halfrange_ * direction.length();\n  double min_dist = target_offset - halfrange;\n  double max_dist = target_offset + halfrange;\n  ICOORD line_pt;\n  double new_error = fitter_.ConstrainedFit(direction, min_dist, max_dist,\n                                            debug > 2, &line_pt);\n  // Allow cheat_allowance off the new error\n  new_error -= cheat_allowance;\n  double old_angle = BaselineAngle();\n  double new_angle = direction.angle();\n  if (debug > 1) {\n    tprintf(\"Constrained error = %g, original = %g\", new_error,\n            baseline_error_);\n    tprintf(\" angles = %g, %g, delta=%g vs threshold %g\\n\", old_angle,\n            new_angle, new_angle - old_angle, kMaxSkewDeviation);\n  }\n  bool new_good_baseline =\n      new_error <= max_baseline_error_ &&\n      (cheat_allowance > 0.0 || fitter_.SufficientPointsForIndependentFit());\n  // The new will replace the old if any are true:\n  // 1. the new error is better\n  // 2. the old is NOT good, but the new is\n  // 3. 
there is a wild angular difference between them (assuming that the new\n  //    is a better guess at the angle.)\n  if (new_error <= baseline_error_ || (!good_baseline_ && new_good_baseline) ||\n      fabs(new_angle - old_angle) > kMaxSkewDeviation) {\n    baseline_error_ = new_error;\n    baseline_pt1_ = line_pt;\n    baseline_pt2_ = baseline_pt1_ + direction;\n    good_baseline_ = new_good_baseline;\n    if (debug > 1) {\n      tprintf(\"Replacing with constrained baseline, good = %d\\n\",\n              good_baseline_);\n    }\n  } else if (debug > 1) {\n    tprintf(\"Keeping old baseline\\n\");\n  }\n}\n\n// Returns the perpendicular distance of the point from the straight\n// baseline.\nfloat BaselineRow::PerpDistanceFromBaseline(const FCOORD &pt) const {\n  FCOORD baseline_vector(baseline_pt2_ - baseline_pt1_);\n  FCOORD offset_vector(pt - baseline_pt1_);\n  float distance = baseline_vector * offset_vector;\n  float sqlength = baseline_vector.sqlength();\n  if (sqlength == 0.0f) {\n    tprintf(\"unexpected baseline vector (0,0)\\n\");\n    return 0.0f;\n  }\n  return std::sqrt(distance * distance / sqlength);\n}\n\n// Computes the bounding box of the row.\nvoid BaselineRow::ComputeBoundingBox() {\n  BLOBNBOX_IT it(blobs_);\n  TBOX box;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    box += it.data()->bounding_box();\n  }\n  bounding_box_ = box;\n}\n\nBaselineBlock::BaselineBlock(int debug_level, bool non_text, TO_BLOCK *block)\n    : block_(block),\n      debug_level_(debug_level),\n      non_text_block_(non_text),\n      good_skew_angle_(false),\n      skew_angle_(0.0),\n      line_spacing_(block->line_spacing),\n      line_offset_(0.0),\n      model_error_(0.0) {\n  TO_ROW_IT row_it(block_->get_rows());\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    // Sort the blobs on the rows.\n    row_it.data()->blob_list()->sort(blob_x_order);\n    rows_.push_back(new BaselineRow(block->line_spacing, 
row_it.data()));\n  }\n}\n\n// Computes and returns the absolute error of the given perp_disp from the\n// given linespacing model.\ndouble BaselineBlock::SpacingModelError(double perp_disp, double line_spacing,\n                                        double line_offset) {\n  // Round to the nearest multiple of line_spacing + line offset.\n  int multiple = IntCastRounded((perp_disp - line_offset) / line_spacing);\n  double model_y = line_spacing * multiple + line_offset;\n  return fabs(perp_disp - model_y);\n}\n\n// Fits straight line baselines and computes the skew angle from the\n// median angle. Returns true if a good angle is found.\n// If use_box_bottoms is false, baseline positions are formed by\n// considering the outlines of the blobs.\nbool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) {\n  if (non_text_block_) {\n    return false;\n  }\n  std::vector<double> angles;\n  for (auto row : rows_) {\n    if (row->FitBaseline(use_box_bottoms)) {\n      double angle = row->BaselineAngle();\n      angles.push_back(angle);\n    }\n    if (debug_level_ > 1) {\n      row->Print();\n    }\n  }\n\n  if (!angles.empty()) {\n    skew_angle_ = MedianOfCircularValues(M_PI, angles);\n    good_skew_angle_ = true;\n  } else {\n    skew_angle_ = 0.0f;\n    good_skew_angle_ = false;\n  }\n  if (debug_level_ > 0) {\n    tprintf(\"Initial block skew angle = %g, good = %d\\n\", skew_angle_,\n            good_skew_angle_);\n  }\n  return good_skew_angle_;\n}\n\n// Refits the baseline to a constrained angle, using the stored block\n// skew if good enough, otherwise the supplied default skew.\nvoid BaselineBlock::ParallelizeBaselines(double default_block_skew) {\n  if (non_text_block_) {\n    return;\n  }\n  if (!good_skew_angle_) {\n    skew_angle_ = default_block_skew;\n  }\n  if (debug_level_ > 0) {\n    tprintf(\"Adjusting block to skew angle %g\\n\", skew_angle_);\n  }\n  FCOORD direction(cos(skew_angle_), sin(skew_angle_));\n  for (auto row : rows_) {\n    
row->AdjustBaselineToParallel(debug_level_, direction);\n    if (debug_level_ > 1) {\n      row->Print();\n    }\n  }\n  if (rows_.size() < 3 || !ComputeLineSpacing()) {\n    return;\n  }\n  // Enforce the line spacing model on all lines that don't yet have a good\n  // baseline.\n  // Start by finding the row that is best fitted to the model.\n  unsigned best_row = 0;\n  double best_error = SpacingModelError(rows_[0]->PerpDisp(direction),\n                                        line_spacing_, line_offset_);\n  for (unsigned r = 1; r < rows_.size(); ++r) {\n    double error = SpacingModelError(rows_[r]->PerpDisp(direction),\n                                     line_spacing_, line_offset_);\n    if (error < best_error) {\n      best_error = error;\n      best_row = r;\n    }\n  }\n  // Starting at the best fitting row, work outwards, syncing the offset.\n  double offset = line_offset_;\n  for (auto r = best_row + 1; r < rows_.size(); ++r) {\n    offset = rows_[r]->AdjustBaselineToGrid(debug_level_, direction,\n                                            line_spacing_, offset);\n  }\n  offset = line_offset_;\n  for (int r = best_row - 1; r >= 0; --r) {\n    offset = rows_[r]->AdjustBaselineToGrid(debug_level_, direction,\n                                            line_spacing_, offset);\n  }\n}\n\n// Sets the parameters in TO_BLOCK that are needed by subsequent processes.\nvoid BaselineBlock::SetupBlockParameters() const {\n  if (line_spacing_ > 0.0) {\n    // Where was block_line_spacing set before?\n    float min_spacing =\n        std::min(block_->line_spacing, static_cast<float>(line_spacing_));\n    if (min_spacing < block_->line_size) {\n      block_->line_size = min_spacing;\n    }\n    block_->line_spacing = line_spacing_;\n    block_->baseline_offset = line_offset_;\n    block_->max_blob_size = line_spacing_ * kMaxBlobSizeMultiple;\n  }\n  // Setup the parameters on all the rows.\n  TO_ROW_IT row_it(block_->get_rows());\n  for (unsigned r = 0; r < 
rows_.size(); ++r, row_it.forward()) {\n    BaselineRow *row = rows_[r];\n    TO_ROW *to_row = row_it.data();\n    row->SetupOldLineParameters(to_row);\n  }\n}\n\n// Processing that is required before fitting baseline splines, but requires\n// linear baselines in order to be successful:\n//   Removes noise if required\n//   Separates out underlines\n//   Pre-associates blob fragments.\n// TODO(rays/joeliu) This entire section of code is inherited from the past\n// and could be improved/eliminated.\n// page_tr is used to size a debug window.\nvoid BaselineBlock::PrepareForSplineFitting(ICOORD page_tr, bool remove_noise) {\n  if (non_text_block_) {\n    return;\n  }\n  if (remove_noise) {\n    vigorous_noise_removal(block_);\n  }\n  FCOORD rotation(1.0f, 0.0f);\n  double gradient = tan(skew_angle_);\n  separate_underlines(block_, gradient, rotation, true);\n  pre_associate_blobs(page_tr, block_, rotation, true);\n}\n\n// Fits splines to the textlines, or creates fake QSPLINES from the straight\n// baselines that are already on the TO_ROWs.\n// As a side-effect, computes the xheights of the rows and the block.\n// Although x-height estimation is conceptually separate, it is part of\n// detecting perspective distortion and therefore baseline fitting.\nvoid BaselineBlock::FitBaselineSplines(bool enable_splines,\n                                       bool show_final_rows, Textord *textord) {\n  double gradient = tan(skew_angle_);\n  FCOORD rotation(1.0f, 0.0f);\n\n  if (enable_splines) {\n    textord->make_spline_rows(block_, gradient, show_final_rows);\n  } else {\n    // Make a fake spline from the existing line.\n    TBOX block_box = block_->block->pdblk.bounding_box();\n    TO_ROW_IT row_it = block_->get_rows();\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      TO_ROW *row = row_it.data();\n      int32_t xstarts[2] = {block_box.left(), block_box.right()};\n      double coeffs[3] = {0.0, row->line_m(), row->line_c()};\n      
row->baseline = QSPLINE(1, xstarts, coeffs);\n      textord->compute_row_xheight(row, block_->block->classify_rotation(),\n                                   row->line_m(), block_->line_size);\n    }\n  }\n  textord->compute_block_xheight(block_, gradient);\n  block_->block->set_xheight(block_->xheight);\n  if (textord_restore_underlines) { // fix underlines\n    restore_underlined_blobs(block_);\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Draws the (straight) baselines and final blobs colored according to\n// what was discarded as noise and what is associated with each row.\nvoid BaselineBlock::DrawFinalRows(const ICOORD &page_tr) {\n  if (non_text_block_) {\n    return;\n  }\n  double gradient = tan(skew_angle_);\n  FCOORD rotation(1.0f, 0.0f);\n  int left_edge = block_->block->pdblk.bounding_box().left();\n  ScrollView *win = create_to_win(page_tr);\n  ScrollView::Color colour = ScrollView::RED;\n  TO_ROW_IT row_it = block_->get_rows();\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    plot_parallel_row(row_it.data(), gradient, left_edge, colour, rotation);\n    colour = static_cast<ScrollView::Color>(colour + 1);\n    if (colour > ScrollView::MAGENTA) {\n      colour = ScrollView::RED;\n    }\n  }\n  plot_blob_list(win, &block_->blobs, ScrollView::MAGENTA, ScrollView::WHITE);\n  // Show discarded blobs.\n  plot_blob_list(win, &block_->underlines, ScrollView::YELLOW,\n                 ScrollView::CORAL);\n  if (block_->blobs.length() > 0) {\n    tprintf(\"%d blobs discarded as noise\\n\", block_->blobs.length());\n  }\n  draw_meanlines(block_, gradient, left_edge, ScrollView::WHITE, rotation);\n}\n\n#endif // !GRAPHICS_DISABLED\n\nvoid BaselineBlock::DrawPixSpline(Image pix_in) {\n  if (non_text_block_) {\n    return;\n  }\n  TO_ROW_IT row_it = block_->get_rows();\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row_it.data()->baseline.plot(pix_in);\n  }\n}\n\n// Top-level line-spacing calculation. 
Computes an estimate of the line-\n// spacing, using the current baselines in the TO_ROWS of the block, and\n// then refines it by fitting a regression line to the baseline positions\n// as a function of their integer index.\n// Returns true if it seems that the model is a reasonable fit to the\n// observations.\nbool BaselineBlock::ComputeLineSpacing() {\n  FCOORD direction(cos(skew_angle_), sin(skew_angle_));\n  std::vector<double> row_positions;\n  ComputeBaselinePositions(direction, &row_positions);\n  if (row_positions.size() < 2) {\n    return false;\n  }\n  EstimateLineSpacing();\n  RefineLineSpacing(row_positions);\n  // Verify that the model is reasonable.\n  double max_baseline_error = kMaxBaselineError * line_spacing_;\n  int non_trivial_gaps = 0;\n  int fitting_gaps = 0;\n  for (unsigned i = 1; i < row_positions.size(); ++i) {\n    double row_gap = fabs(row_positions[i - 1] - row_positions[i]);\n    if (row_gap > max_baseline_error) {\n      ++non_trivial_gaps;\n      if (fabs(row_gap - line_spacing_) <= max_baseline_error) {\n        ++fitting_gaps;\n      }\n    }\n  }\n  if (debug_level_ > 0) {\n    tesserr << \"Spacing \" << line_spacing_ << \", in \"\n            << row_positions.size() << \" rows, \"\n            << fitting_gaps << \" gaps fitted out of \"\n            << non_trivial_gaps << \" non-trivial\\n\";\n  }\n  return fitting_gaps > non_trivial_gaps * kMinFittingLinespacings;\n}\n\n// Computes the deskewed vertical position of each baseline in the block and\n// stores them in the given vector.\n// This is calculated as the perpendicular distance of the middle of each\n// baseline (in case it has a different skew angle) from the line passing\n// through the origin parallel to the block baseline angle.\n// NOTE that \"distance\" above is a signed quantity so we can tell which side\n// of the block baseline a line sits, hence the function and argument name\n// positions not distances.\nvoid BaselineBlock::ComputeBaselinePositions(const 
FCOORD &direction,\n                                             std::vector<double> *positions) {\n  positions->clear();\n  for (auto row : rows_) {\n    const TBOX &row_box = row->bounding_box();\n    float x_middle = (row_box.left() + row_box.right()) / 2.0f;\n    FCOORD row_pos(x_middle, static_cast<float>(row->StraightYAtX(x_middle)));\n    float offset = direction * row_pos;\n    positions->push_back(offset);\n  }\n}\n\n// Computes an estimate of the line spacing of the block from the median\n// of the spacings between adjacent overlapping textlines.\nvoid BaselineBlock::EstimateLineSpacing() {\n  std::vector<float> spacings;\n  for (unsigned r = 0; r < rows_.size(); ++r) {\n    BaselineRow *row = rows_[r];\n    // Exclude silly lines.\n    if (fabs(row->BaselineAngle()) > M_PI * 0.25) {\n      continue;\n    }\n    // Find the first row after row that overlaps it significantly.\n    const TBOX &row_box = row->bounding_box();\n    unsigned r2;\n    for (r2 = r + 1; r2 < rows_.size() &&\n                     !row_box.major_x_overlap(rows_[r2]->bounding_box());\n         ++r2) {\n      ;\n    }\n    if (r2 < rows_.size()) {\n      BaselineRow *row2 = rows_[r2];\n      // Exclude silly lines.\n      if (fabs(row2->BaselineAngle()) > M_PI * 0.25) {\n        continue;\n      }\n      float spacing = row->SpaceBetween(*row2);\n      spacings.push_back(spacing);\n    }\n  }\n  // If we have at least one value, use it, otherwise leave the previous\n  // value unchanged.\n  if (!spacings.empty()) {\n    std::nth_element(spacings.begin(), spacings.begin() + spacings.size() / 2,\n                     spacings.end());\n    line_spacing_ = spacings[spacings.size() / 2];\n    if (debug_level_ > 1) {\n      tprintf(\"Estimate of linespacing = %g\\n\", line_spacing_);\n    }\n  }\n}\n\n// Refines the line spacing of the block by fitting a regression\n// line to the deskewed y-position of each baseline as a function of its\n// estimated line index, allowing for a small error 
in the initial linespacing\n// and choosing the best available model.\nvoid BaselineBlock::RefineLineSpacing(const std::vector<double> &positions) {\n  double spacings[3], offsets[3], errors[3];\n  int index_range;\n  errors[0] = FitLineSpacingModel(positions, line_spacing_, &spacings[0],\n                                  &offsets[0], &index_range);\n  if (index_range > 1) {\n    double spacing_plus = line_spacing_ / (1.0 + 1.0 / index_range);\n    // Try the hypotheses that there might be index_range +/- 1 line spaces.\n    errors[1] = FitLineSpacingModel(positions, spacing_plus, &spacings[1],\n                                    &offsets[1], nullptr);\n    double spacing_minus = line_spacing_ / (1.0 - 1.0 / index_range);\n    errors[2] = FitLineSpacingModel(positions, spacing_minus, &spacings[2],\n                                    &offsets[2], nullptr);\n    for (int i = 1; i <= 2; ++i) {\n      if (errors[i] < errors[0]) {\n        spacings[0] = spacings[i];\n        offsets[0] = offsets[i];\n        errors[0] = errors[i];\n      }\n    }\n  }\n  if (spacings[0] > 0.0) {\n    line_spacing_ = spacings[0];\n    line_offset_ = offsets[0];\n    model_error_ = errors[0];\n    if (debug_level_ > 0) {\n      tprintf(\"Final linespacing model = %g + offset %g, error %g\\n\",\n              line_spacing_, line_offset_, model_error_);\n    }\n  }\n}\n\n// Given an initial estimate of line spacing (m_in) and the positions of each\n// baseline, computes the line spacing of the block more accurately in m_out,\n// and the corresponding intercept in c_out, and the number of spacings seen\n// in index_delta. 
Returns the error of fit to the line spacing model.\n// Uses a simple linear regression, but optimized the offset using the median.\ndouble BaselineBlock::FitLineSpacingModel(const std::vector<double> &positions,\n                                          double m_in, double *m_out,\n                                          double *c_out, int *index_delta) {\n  if (m_in == 0.0f || positions.size() < 2) {\n    *m_out = m_in;\n    *c_out = 0.0;\n    if (index_delta != nullptr) {\n      *index_delta = 0;\n    }\n    return 0.0;\n  }\n  std::vector<double> offsets;\n  // Get the offset (remainder) linespacing for each line and choose the median.\n  offsets.reserve(positions.size());\n  for (double position : positions) {\n    offsets.push_back(fmod(position, m_in));\n  }\n  // Get the median offset.\n  double median_offset = MedianOfCircularValues(m_in, offsets);\n  // Now fit a line to quantized line number and offset.\n  LLSQ llsq;\n  int min_index = INT32_MAX;\n  int max_index = -INT32_MAX;\n  for (double y_pos : positions) {\n    int row_index = IntCastRounded((y_pos - median_offset) / m_in);\n    UpdateRange(row_index, &min_index, &max_index);\n    llsq.add(row_index, y_pos);\n  }\n  // Get the refined line spacing.\n  *m_out = llsq.m();\n  // Use the median offset rather than the mean.\n  offsets.clear();\n  if (*m_out != 0.0) {\n    for (double position : positions) {\n      offsets.push_back(fmod(position, *m_out));\n    }\n    // Get the median offset.\n    if (debug_level_ > 2) {\n      for (unsigned i = 0; i < offsets.size(); ++i) {\n        tprintf(\"%u: %g\\n\", i, offsets[i]);\n      }\n    }\n    *c_out = MedianOfCircularValues(*m_out, offsets);\n  } else {\n    *c_out = 0.0;\n  }\n  if (debug_level_ > 1) {\n    tprintf(\"Median offset = %g, compared to mean of %g.\\n\", *c_out,\n            llsq.c(*m_out));\n  }\n  // Index_delta is the number of hypothesized line gaps present.\n  if (index_delta != nullptr) {\n    *index_delta = max_index - 
min_index;\n  }\n  // Use the regression model's intercept to compute the error, as it may be\n  // a full line-spacing in disagreement with the median.\n  double rms_error = llsq.rms(*m_out, llsq.c(*m_out));\n  if (debug_level_ > 1) {\n    tprintf(\"Linespacing of y=%g x + %g improved to %g x + %g, rms=%g\\n\", m_in,\n            median_offset, *m_out, *c_out, rms_error);\n  }\n  return rms_error;\n}\n\nBaselineDetect::BaselineDetect(int debug_level, const FCOORD &page_skew,\n                               TO_BLOCK_LIST *blocks)\n    : page_skew_(page_skew), debug_level_(debug_level) {\n  TO_BLOCK_IT it(blocks);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TO_BLOCK *to_block = it.data();\n    BLOCK *block = to_block->block;\n    POLY_BLOCK *pb = block->pdblk.poly_block();\n    // A note about non-text blocks.\n    // On output, non-text blocks are supposed to contain a single empty word\n    // in each incoming text line. These mark out the polygonal bounds of the\n    // block. 
Ideally no baselines should be required, but currently\n    // make_words crashes if a baseline and xheight are not provided, so we\n    // include non-text blocks here, but flag them for special treatment.\n    bool non_text = pb != nullptr && !pb->IsText();\n    blocks_.push_back(new BaselineBlock(debug_level_, non_text, to_block));\n  }\n}\n\n// Finds the initial baselines for each TO_ROW in each TO_BLOCK, gathers\n// block-wise and page-wise data to smooth small blocks/rows, and applies\n// smoothing based on block/page-level skew and block-level linespacing.\nvoid BaselineDetect::ComputeStraightBaselines(bool use_box_bottoms) {\n  std::vector<double> block_skew_angles;\n  for (auto bl_block : blocks_) {\n    if (debug_level_ > 0) {\n      tprintf(\"Fitting initial baselines...\\n\");\n    }\n    if (bl_block->FitBaselinesAndFindSkew(use_box_bottoms)) {\n      block_skew_angles.push_back(bl_block->skew_angle());\n    }\n  }\n  // Compute a page-wide default skew for blocks with too little information.\n  double default_block_skew = page_skew_.angle();\n  if (!block_skew_angles.empty()) {\n    default_block_skew = MedianOfCircularValues(M_PI, block_skew_angles);\n  }\n  if (debug_level_ > 0) {\n    tprintf(\"Page skew angle = %g\\n\", default_block_skew);\n  }\n  // Set bad lines in each block to the default block skew and then force fit\n  // a linespacing model where it makes sense to do so.\n  for (auto bl_block : blocks_) {\n    bl_block->ParallelizeBaselines(default_block_skew);\n    bl_block->SetupBlockParameters(); // This replaced compute_row_stats.\n  }\n}\n\n// Computes the baseline splines for each TO_ROW in each TO_BLOCK and\n// other associated side-effects, including pre-associating blobs, computing\n// x-heights and displaying debug information.\n// NOTE that ComputeStraightBaselines must have been called first as this\n// sets up data in the TO_ROWs upon which this function depends.\nvoid BaselineDetect::ComputeBaselineSplinesAndXheights(const 
ICOORD &page_tr,\n                                                       bool enable_splines,\n                                                       bool remove_noise,\n                                                       bool show_final_rows,\n                                                       Textord *textord) {\n  for (auto bl_block : blocks_) {\n    if (enable_splines) {\n      bl_block->PrepareForSplineFitting(page_tr, remove_noise);\n    }\n    bl_block->FitBaselineSplines(enable_splines, show_final_rows, textord);\n#ifndef GRAPHICS_DISABLED\n    if (show_final_rows) {\n      bl_block->DrawFinalRows(page_tr);\n    }\n#endif\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/baselinedetect.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        baselinedetect.h\n// Description: Initial Baseline Determination.\n// Copyright 2012 Google Inc. All Rights Reserved.\n// Author:      rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_BASELINEDETECT_H_\n#define TESSERACT_TEXTORD_BASELINEDETECT_H_\n\n#include \"detlinefit.h\"\n#include \"points.h\"\n#include \"rect.h\"\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass Textord;\nclass BLOBNBOX_LIST;\nclass TO_BLOCK;\nclass TO_BLOCK_LIST;\nclass TO_ROW;\n\n// Class to compute and hold baseline data for a TO_ROW.\nclass BaselineRow {\npublic:\n  BaselineRow(double line_size, TO_ROW *to_row);\n\n  const TBOX &bounding_box() const {\n    return bounding_box_;\n  }\n  // Sets the TO_ROW with the output straight line.\n  void SetupOldLineParameters(TO_ROW *row) const;\n\n  // Outputs diagnostic information.\n  void Print() const;\n\n  // Returns the skew angle (in radians) of the current baseline in [-pi,pi].\n  double BaselineAngle() const;\n  // Computes and returns the linespacing at the middle of the overlap\n  // between this and other.\n  double SpaceBetween(const BaselineRow &other) const;\n  // Computes and returns the displacement of the center of the line\n  // perpendicular to the given direction.\n  double PerpDisp(const 
FCOORD &direction) const;\n  // Computes the y coordinate at the given x using the straight baseline\n  // defined by baseline1_ and baseline2_.\n  double StraightYAtX(double x) const;\n\n  // Fits a straight baseline to the points. Returns true if it had enough\n  // points to be reasonably sure of the fitted baseline.\n  // If use_box_bottoms is false, baselines positions are formed by\n  // considering the outlines of the blobs.\n  bool FitBaseline(bool use_box_bottoms);\n  // Modifies an existing result of FitBaseline to be parallel to the given\n  // vector if that produces a better result.\n  void AdjustBaselineToParallel(int debug, const FCOORD &direction);\n  // Modifies the baseline to snap to the textline grid if the existing\n  // result is not good enough.\n  double AdjustBaselineToGrid(int debug, const FCOORD &direction, double line_spacing,\n                              double line_offset);\n\nprivate:\n  // Sets up displacement_modes_ with the top few modes of the perpendicular\n  // distance of each blob from the given direction vector, after rounding.\n  void SetupBlobDisplacements(const FCOORD &direction);\n\n  // Fits a line in the given direction to blobs that are close to the given\n  // target_offset perpendicular displacement from the direction. 
The fit\n  // error is allowed to be cheat_allowance worse than the existing fit, and\n  // will still be used.\n  // If cheat_allowance > 0, the new fit will be good and replace the current\n  // fit if it has better fit (with cheat) OR its error is below\n  // max_baseline_error_ and the old fit is marked bad.\n  // Otherwise the new fit will only replace the old if it is really better,\n  // or the old fit is marked bad and the new fit has sufficient points, as\n  // well as being within the max_baseline_error_.\n  void FitConstrainedIfBetter(int debug, const FCOORD &direction, double cheat_allowance,\n                              double target_offset);\n  // Returns the perpendicular distance of the point from the straight\n  // baseline.\n  float PerpDistanceFromBaseline(const FCOORD &pt) const;\n  // Computes the bounding box of the row.\n  void ComputeBoundingBox();\n\n  // The blobs of the row to which this BaselineRow adds extra information\n  // during baseline fitting. Note that blobs_ could easily come from either\n  // a TO_ROW or a ColPartition.\n  BLOBNBOX_LIST *blobs_;\n  // Bounding box of all the blobs.\n  TBOX bounding_box_;\n  // Fitter used to fit lines to the blobs.\n  DetLineFit fitter_;\n  // 2 points on the straight baseline.\n  FCOORD baseline_pt1_;\n  FCOORD baseline_pt2_;\n  // Set of modes of displacements. 
They indicate preferable baseline positions.\n  std::vector<double> displacement_modes_;\n  // Quantization factor used for displacement_modes_.\n  double disp_quant_factor_;\n  // Half the acceptance range of blob displacements for computing the\n  // error during a constrained fit.\n  double fit_halfrange_;\n  // Max baseline error before a line is regarded as fitting badly.\n  double max_baseline_error_;\n  // The error of fit of the baseline.\n  double baseline_error_;\n  // True if this row seems to have a good baseline.\n  bool good_baseline_;\n};\n\n// Class to compute and hold baseline data for a TO_BLOCK.\nclass BaselineBlock {\npublic:\n  BaselineBlock(int debug_level, bool non_text, TO_BLOCK *block);\n\n  ~BaselineBlock() {\n    for (auto row : rows_) {\n      delete row;\n    }\n  }\n\n  TO_BLOCK *block() const {\n    return block_;\n  }\n  double skew_angle() const {\n    return skew_angle_;\n  }\n\n  // Computes and returns the absolute error of the given perp_disp from the\n  // given linespacing model.\n  static double SpacingModelError(double perp_disp, double line_spacing, double line_offset);\n\n  // Fits straight line baselines and computes the skew angle from the\n  // median angle. 
Returns true if a good angle is found.\n  // If use_box_bottoms is false, baseline positions are formed by\n  // considering the outlines of the blobs.\n  bool FitBaselinesAndFindSkew(bool use_box_bottoms);\n\n  // Refits the baseline to a constrained angle, using the stored block\n  // skew if good enough, otherwise the supplied default skew.\n  void ParallelizeBaselines(double default_block_skew);\n\n  // Sets the parameters in TO_BLOCK that are needed by subsequent processes.\n  void SetupBlockParameters() const;\n\n  // Processing that is required before fitting baseline splines, but requires\n  // linear baselines in order to be successful:\n  //   Removes noise if required\n  //   Separates out underlines\n  //   Pre-associates blob fragments.\n  // TODO(rays/joeliu) This entire section of code is inherited from the past\n  // and could be improved/eliminated.\n  // page_tr is used to size a debug window.\n  void PrepareForSplineFitting(ICOORD page_tr, bool remove_noise);\n\n  // Fits splines to the textlines, or creates fake QSPLINES from the straight\n  // baselines that are already on the TO_ROWs.\n  // As a side-effect, computes the xheights of the rows and the block.\n  // Although x-height estimation is conceptually separate, it is part of\n  // detecting perspective distortion and therefore baseline fitting.\n  void FitBaselineSplines(bool enable_splines, bool show_final_rows, Textord *textord);\n\n  // Draws the (straight) baselines and final blobs colored according to\n  // what was discarded as noise and what is associated with each row.\n  void DrawFinalRows(const ICOORD &page_tr);\n\n  // Render the generated spline baselines for this block on pix_in.\n  void DrawPixSpline(Image pix_in);\n\nprivate:\n  // Top-level line-spacing calculation. 
Computes an estimate of the line-\n  // spacing, using the current baselines in the TO_ROWS of the block, and\n  // then refines it by fitting a regression line to the baseline positions\n  // as a function of their integer index.\n  // Returns true if it seems that the model is a reasonable fit to the\n  // observations.\n  bool ComputeLineSpacing();\n\n  // Computes the deskewed vertical position of each baseline in the block and\n  // stores them in the given vector.\n  void ComputeBaselinePositions(const FCOORD &direction, std::vector<double> *positions);\n\n  // Computes an estimate of the line spacing of the block from the median\n  // of the spacings between adjacent overlapping textlines.\n  void EstimateLineSpacing();\n\n  // Refines the line spacing of the block by fitting a regression\n  // line to the deskewed y-position of each baseline as a function of its\n  // estimated line index, allowing for a small error in the initial linespacing\n  // and choosing the best available model.\n  void RefineLineSpacing(const std::vector<double> &positions);\n\n  // Given an initial estimate of line spacing (m_in) and the positions of each\n  // baseline, computes the line spacing of the block more accurately in m_out,\n  // and the corresponding intercept in c_out, and the number of spacings seen\n  // in index_delta. 
Returns the error of fit to the line spacing model.\n  double FitLineSpacingModel(const std::vector<double> &positions, double m_in, double *m_out,\n                             double *c_out, int *index_delta);\n\n  // The block to which this class adds extra information used during baseline\n  // calculation.\n  TO_BLOCK *block_;\n  // The rows in the block that we will be working with.\n  std::vector<BaselineRow *> rows_;\n  // Amount of debugging output to provide.\n  int debug_level_;\n  // True if the block is non-text (graphic).\n  bool non_text_block_;\n  // True if the block has at least one good enough baseline to compute the\n  // skew angle and therefore skew_angle_ is valid.\n  bool good_skew_angle_;\n  // Angle of skew in radians using the conventional anticlockwise from x-axis.\n  double skew_angle_;\n  // Current best estimate line spacing in pixels perpendicular to skew_angle_.\n  double line_spacing_;\n  // Offset for baseline positions, in pixels. Each baseline is at\n  // line_spacing_ * n + line_offset_ for integer n, which represents\n  // [textline] line number in a line numbering system that has line 0 on or\n  // at least near the x-axis. 
Not equal to the actual line number of a line\n  // within a block as most blocks are not near the x-axis.\n  double line_offset_;\n  // The error of the line spacing model.\n  double model_error_;\n};\n\nclass BaselineDetect {\npublic:\n  BaselineDetect(int debug_level, const FCOORD &page_skew, TO_BLOCK_LIST *blocks);\n\n  ~BaselineDetect() {\n    for (auto block : blocks_) {\n      delete block;\n    }\n  }\n\n  // Finds the initial baselines for each TO_ROW in each TO_BLOCK, gathers\n  // block-wise and page-wise data to smooth small blocks/rows, and applies\n  // smoothing based on block/page-level skew and block-level linespacing.\n  void ComputeStraightBaselines(bool use_box_bottoms);\n\n  // Computes the baseline splines for each TO_ROW in each TO_BLOCK and\n  // other associated side-effects, including pre-associating blobs, computing\n  // x-heights and displaying debug information.\n  // NOTE that ComputeStraightBaselines must have been called first as this\n  // sets up data in the TO_ROWs upon which this function depends.\n  void ComputeBaselineSplinesAndXheights(const ICOORD &page_tr, bool enable_splines,\n                                         bool remove_noise, bool show_final_rows, Textord *textord);\n\nprivate:\n  // Average (median) skew of the blocks on the page among those that have\n  // a good angle of their own.\n  FCOORD page_skew_;\n  // Amount of debug output to produce.\n  int debug_level_;\n  // The blocks that we are working with.\n  std::vector<BaselineBlock *> blocks_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TEXTORD_BASELINEDETECT_H_\n"
  },
  {
    "path": "src/textord/bbgrid.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        bbgrid.cpp\n// Description: Class to hold BLOBNBOXs in a grid for fast access\n//              to neighbours.\n// Author:      Ray Smith\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"bbgrid.h\"\n#include \"helpers.h\"\n#include \"ocrblock.h\"\n\nnamespace tesseract {\n\n///////////////////////////////////////////////////////////////////////\n// BBGrid IMPLEMENTATION.\n///////////////////////////////////////////////////////////////////////\nGridBase::GridBase(int gridsize, const ICOORD &bleft, const ICOORD &tright) {\n  Init(gridsize, bleft, tright);\n}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nGridBase::~GridBase() = default;\n\n// (Re)Initialize the grid. The gridsize is the size in pixels of each cell,\n// and bleft, tright are the bounding box of everything to go in it.\nvoid GridBase::Init(int gridsize, const ICOORD &bleft, const ICOORD &tright) {\n  gridsize_ = gridsize;\n  bleft_ = bleft;\n  tright_ = tright;\n  if (gridsize_ == 0) {\n    gridsize_ = 1;\n  }\n  gridwidth_ = (tright.x() - bleft.x() + gridsize_ - 1) / gridsize_;\n  gridheight_ = (tright.y() - bleft.y() + gridsize_ - 1) / gridsize_;\n  gridbuckets_ = gridwidth_ * gridheight_;\n}\n\n// Compute the given grid coordinates from image coords.\nvoid GridBase::GridCoords(int x, int y, int *grid_x, int *grid_y) const {\n  *grid_x = (x - bleft_.x()) / gridsize_;\n  *grid_y = (y - bleft_.y()) / gridsize_;\n  ClipGridCoords(grid_x, grid_y);\n}\n\n// Clip the given grid coordinates to fit within the grid.\nvoid GridBase::ClipGridCoords(int *x, int *y) const {\n  *x = ClipToRange(*x, 0, gridwidth_ - 1);\n  *y = ClipToRange(*y, 0, gridheight_ - 1);\n}\n\nIntGrid::IntGrid() {\n  grid_ = nullptr;\n}\n\nIntGrid::IntGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright) : grid_(nullptr) {\n  Init(gridsize, bleft, tright);\n}\n\nIntGrid::~IntGrid() {\n  delete[] grid_;\n}\n\n// (Re)Initialize the grid. The gridsize is the size in pixels of each cell,\n// and bleft, tright are the bounding box of everything to go in it.\nvoid IntGrid::Init(int gridsize, const ICOORD &bleft, const ICOORD &tright) {\n  GridBase::Init(gridsize, bleft, tright);\n  delete[] grid_;\n  grid_ = new int[gridbuckets_];\n  Clear();\n}\n\n// Clear all the ints in the grid to zero.\nvoid IntGrid::Clear() {\n  for (int i = 0; i < gridbuckets_; ++i) {\n    grid_[i] = 0;\n  }\n}\n\n// Rotate the grid by rotation, keeping cell contents.\n// rotation must be a multiple of 90 degrees.\n// NOTE: due to partial cells, cell coverage in the rotated grid will be\n// inexact. This is why there is no Rotate for the generic BBGrid.\n// TODO(rays) investigate fixing this inaccuracy by moving the origin after\n// rotation.\nvoid IntGrid::Rotate(const FCOORD &rotation) {\n  ASSERT_HOST(rotation.x() == 0.0f || rotation.y() == 0.0f);\n  ICOORD old_bleft(bleft());\n  // ICOORD old_tright(tright());\n  int old_width = gridwidth();\n  int old_height = gridheight();\n  TBOX box(bleft(), tright());\n  box.rotate(rotation);\n  int *old_grid = grid_;\n  grid_ = nullptr;\n  Init(gridsize(), box.botleft(), box.topright());\n  // Iterate over the old grid, copying data to the rotated position in the new.\n  int oldi = 0;\n  FCOORD x_step(rotation);\n  x_step *= gridsize();\n  for (int oldy = 0; oldy < old_height; ++oldy) {\n    FCOORD line_pos(old_bleft.x(), old_bleft.y() + gridsize() * oldy);\n    line_pos.rotate(rotation);\n    for (int oldx = 0; oldx < old_width; ++oldx, line_pos += x_step, ++oldi) {\n      int grid_x, grid_y;\n      GridCoords(static_cast<int>(line_pos.x() + 0.5), static_cast<int>(line_pos.y() + 0.5),\n                 &grid_x, &grid_y);\n      grid_[grid_y * gridwidth() + grid_x] = old_grid[oldi];\n    }\n  }\n  delete[] old_grid;\n}\n\n// Returns a new IntGrid containing values equal to the sum of all the\n// neighbouring cells. The returned grid must be deleted after use.\n// For ease of implementation, edge cells are double counted, to make them\n// have the same range as the non-edge cells.\nIntGrid *IntGrid::NeighbourhoodSum() const {\n  auto *sumgrid = new IntGrid(gridsize(), bleft(), tright());\n  for (int y = 0; y < gridheight(); ++y) {\n    for (int x = 0; x < gridwidth(); ++x) {\n      int cell_count = 0;\n      for (int yoffset = -1; yoffset <= 1; ++yoffset) {\n        for (int xoffset = -1; xoffset <= 1; ++xoffset) {\n          int grid_x = x + xoffset;\n          int grid_y = y + yoffset;\n          ClipGridCoords(&grid_x, &grid_y);\n          cell_count += GridCellValue(grid_x, grid_y);\n        }\n      }\n      if (GridCellValue(x, y) > 1) {\n        sumgrid->SetGridCell(x, y, cell_count);\n      }\n    }\n  }\n  return sumgrid;\n}\n\n// Returns true if more than half the area of the rect is covered by grid\n// cells that are over the threshold.\nbool IntGrid::RectMostlyOverThreshold(const TBOX &rect, int threshold) const {\n  int min_x, min_y, max_x, max_y;\n  GridCoords(rect.left(), rect.bottom(), &min_x, &min_y);\n  GridCoords(rect.right(), rect.top(), &max_x, &max_y);\n  int total_area = 0;\n  for (int y = min_y; y <= max_y; ++y) {\n    for (int x = min_x; x <= max_x; ++x) {\n      int value = GridCellValue(x, y);\n      if (value > threshold) {\n        TBOX cell_box(x * gridsize_, y * gridsize_, (x + 1) * gridsize_, (y + 1) * gridsize_);\n        cell_box &= rect; // This is in-place box intersection.\n        total_area += cell_box.area();\n      }\n    }\n  }\n  return total_area * 2 > rect.area();\n}\n\n// Returns true if any cell value in the given rectangle is zero.\nbool IntGrid::AnyZeroInRect(const TBOX &rect) const {\n  int min_x, min_y, max_x, max_y;\n  GridCoords(rect.left(), rect.bottom(), &min_x, &min_y);\n  GridCoords(rect.right(), rect.top(), &max_x, &max_y);\n  for (int y = min_y; y <= max_y; ++y) {\n    for (int x = min_x; x <= max_x; ++x) {\n      if (GridCellValue(x, y) == 0) {\n        return true;\n      }\n    }\n  }\n  return false;\n}\n\n// Returns a full-resolution binary pix in which each cell over the given\n// threshold is filled as a black square. pixDestroy after use.\n// Edge cells, which have a zero 4-neighbour, are not marked.\nImage IntGrid::ThresholdToPix(int threshold) const {\n  Image pix = pixCreate(tright().x() - bleft().x(), tright().y() - bleft().y(), 1);\n  int cellsize = gridsize();\n  for (int y = 0; y < gridheight(); ++y) {\n    for (int x = 0; x < gridwidth(); ++x) {\n      if (GridCellValue(x, y) > threshold && GridCellValue(x - 1, y) > 0 &&\n          GridCellValue(x + 1, y) > 0 && GridCellValue(x, y - 1) > 0 &&\n          GridCellValue(x, y + 1) > 0) {\n        pixRasterop(pix, x * cellsize, tright().y() - ((y + 1) * cellsize), cellsize, cellsize,\n                    PIX_SET, nullptr, 0, 0);\n      }\n    }\n  }\n  return pix;\n}\n\n// Make a Pix of the correct scaled size for the TraceOutline functions.\nstatic Image GridReducedPix(const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom) {\n  // Compute grid bounds of the outline and pad all round by 1.\n  int grid_left = (box.left() - bleft.x()) / gridsize - 1;\n  int grid_bottom = (box.bottom() - bleft.y()) / gridsize - 1;\n  int grid_right = (box.right() - bleft.x()) / gridsize + 1;\n  int grid_top = (box.top() - bleft.y()) / gridsize + 1;\n  *left = grid_left;\n  *bottom = grid_bottom;\n  return pixCreate(grid_right - grid_left + 1, grid_top - grid_bottom + 1, 1);\n}\n\n// Helper function to return a scaled Pix with one pixel per grid cell,\n// set (black) where the given outline enters the corresponding grid cell,\n// and clear where the outline does not touch the grid cell.\n// Also returns the grid coords of the bottom-left of the Pix, in *left\n// and *bottom, which corresponds to (0, 0) on the Pix.\n// Note that the Pix is used upside-down, with (0, 0) being the bottom-left.\nImage TraceOutlineOnReducedPix(C_OUTLINE *outline, int gridsize, ICOORD bleft, int *left,\n                              int *bottom) {\n  const TBOX &box = outline->bounding_box();\n  Image pix = GridReducedPix(box, gridsize, bleft, left, bottom);\n  int wpl = pixGetWpl(pix);\n  l_uint32 *data = pixGetData(pix);\n  int length = outline->pathlength();\n  ICOORD pos = outline->start_pos();\n  for (int i = 0; i < length; ++i) {\n    int grid_x = (pos.x() - bleft.x()) / gridsize - *left;\n    int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;\n    SET_DATA_BIT(data + grid_y * wpl, grid_x);\n    pos += outline->step(i);\n  }\n  return pix;\n}\n#if 0 // Example code shows how to use TraceOutlineOnReducedPix.\n  C_OUTLINE_IT ol_it(blob->cblob()->out_list());\n  int grid_left, grid_bottom;\n  Pix* pix = TraceOutlineOnReducedPix(ol_it.data(), gridsize_, bleft_,\n                                      &grid_left, &grid_bottom);\n  grid->InsertPixPtBBox(grid_left, grid_bottom, pix, blob);\n  pix.destroy();\n#endif\n\n// As TraceOutlineOnReducedPix above, but on a BLOCK instead of a C_OUTLINE.\nImage TraceBlockOnReducedPix(BLOCK *block, int gridsize, ICOORD bleft, int *left, int *bottom) {\n  const TBOX &box = block->pdblk.bounding_box();\n  Image pix = GridReducedPix(box, gridsize, bleft, left, bottom);\n  int wpl = pixGetWpl(pix);\n  l_uint32 *data = pixGetData(pix);\n  ICOORDELT_IT it(block->pdblk.poly_block()->points());\n  for (it.mark_cycle_pt(); !it.cycled_list();) {\n    ICOORD pos = *it.data();\n    it.forward();\n    ICOORD next_pos = *it.data();\n    ICOORD line_vector = next_pos - pos;\n    int major, minor;\n    ICOORD major_step, minor_step;\n    line_vector.setup_render(&major_step, &minor_step, &major, &minor);\n    int accumulator = major / 2;\n    while (pos != next_pos) {\n      int grid_x = (pos.x() - bleft.x()) / gridsize - *left;\n      int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;\n      SET_DATA_BIT(data + grid_y * wpl, grid_x);\n      pos += major_step;\n      accumulator += minor;\n      if (accumulator >= major) {\n        accumulator -= major;\n        pos += minor_step;\n      }\n    }\n  }\n  return pix;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/bbgrid.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        bbgrid.h\n// Description: Class to hold BLOBNBOXs in a grid for fast access\n//              to neighbours.\n// Author:      Ray Smith\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_BBGRID_H_\n#define TESSERACT_TEXTORD_BBGRID_H_\n\n#include <unordered_set>\n\n#include \"clst.h\"\n#include \"coutln.h\"\n#include \"rect.h\"\n#include \"scrollview.h\"\n\n#include <allheaders.h>\n\nclass BLOCK;\n\nnamespace tesseract {\n\n// Helper function to return a scaled Pix with one pixel per grid cell,\n// set (black) where the given outline enters the corresponding grid cell,\n// and clear where the outline does not touch the grid cell.\n// Also returns the grid coords of the bottom-left of the Pix, in *left\n// and *bottom, which corresponds to (0, 0) on the Pix.\n// Note that the Pix is used upside-down, with (0, 0) being the bottom-left.\nImage TraceOutlineOnReducedPix(C_OUTLINE *outline, int gridsize, ICOORD bleft, int *left,\n                              int *bottom);\n// As TraceOutlineOnReducedPix above, but on a BLOCK instead of a C_OUTLINE.\nImage TraceBlockOnReducedPix(BLOCK *block, int gridsize, ICOORD bleft, int *left, int *bottom);\n\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nclass GridSearch;\n\n// The 
GridBase class is the base class for BBGrid and IntGrid.\n// It holds the geometry and scale of the grid.\nclass TESS_API GridBase {\npublic:\n  GridBase() = default;\n  GridBase(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n  virtual ~GridBase();\n\n  // (Re)Initialize the grid. The gridsize is the size in pixels of each cell,\n  // and bleft, tright are the bounding box of everything to go in it.\n  void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n\n  // Simple accessors.\n  int gridsize() const {\n    return gridsize_;\n  }\n  int gridwidth() const {\n    return gridwidth_;\n  }\n  int gridheight() const {\n    return gridheight_;\n  }\n  const ICOORD &bleft() const {\n    return bleft_;\n  }\n  const ICOORD &tright() const {\n    return tright_;\n  }\n  // Compute the given grid coordinates from image coords.\n  void GridCoords(int x, int y, int *grid_x, int *grid_y) const;\n\n  // Clip the given grid coordinates to fit within the grid.\n  void ClipGridCoords(int *x, int *y) const;\n\nprotected:\n  // TODO(rays) Make these private and migrate to the accessors in subclasses.\n  int gridsize_;  // Pixel size of each grid cell.\n  int gridwidth_; // Size of the grid in cells.\n  int gridheight_;\n  int gridbuckets_; // Total cells in grid.\n  ICOORD bleft_;    // Pixel coords of bottom-left of grid.\n  ICOORD tright_;   // Pixel coords of top-right of grid.\n\nprivate:\n};\n\n// The IntGrid maintains a single int for each cell in a grid.\nclass IntGrid : public GridBase {\npublic:\n  IntGrid();\n  IntGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n  ~IntGrid() override;\n\n  // (Re)Initialize the grid. 
The gridsize is the size in pixels of each cell,\n  // and bleft, tright are the bounding box of everything to go in it.\n  void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n\n  // Clear all the ints in the grid to zero.\n  void Clear();\n\n  // Rotate the grid by rotation, keeping cell contents.\n  // rotation must be a multiple of 90 degrees.\n  // NOTE: due to partial cells, cell coverage in the rotated grid will be\n  // inexact. This is why there is no Rotate for the generic BBGrid.\n  void Rotate(const FCOORD &rotation);\n\n  // Returns a new IntGrid containing values equal to the sum of all the\n  // neighbouring cells. The returned grid must be deleted after use.\n  IntGrid *NeighbourhoodSum() const;\n\n  int GridCellValue(int grid_x, int grid_y) const {\n    ClipGridCoords(&grid_x, &grid_y);\n    return grid_[grid_y * gridwidth_ + grid_x];\n  }\n  void SetGridCell(int grid_x, int grid_y, int value) {\n    ASSERT_HOST(grid_x >= 0 && grid_x < gridwidth());\n    ASSERT_HOST(grid_y >= 0 && grid_y < gridheight());\n    grid_[grid_y * gridwidth_ + grid_x] = value;\n  }\n  // Returns true if more than half the area of the rect is covered by grid\n  // cells that are over the threshold.\n  bool RectMostlyOverThreshold(const TBOX &rect, int threshold) const;\n\n  // Returns true if any cell value in the given rectangle is zero.\n  bool AnyZeroInRect(const TBOX &rect) const;\n\n  // Returns a full-resolution binary pix in which each cell over the given\n  // threshold is filled as a black square. 
pixDestroy after use.\n  Image ThresholdToPix(int threshold) const;\n\nprivate:\n  int *grid_; // 2-d array of ints.\n};\n\n// The BBGrid class holds C_LISTs of template classes BBC (bounding box class)\n// in a grid for fast neighbour access.\n// The BBC class must have a member const TBOX& bounding_box() const.\n// The BBC class must have been CLISTIZEH'ed elsewhere to make the\n// list class BBC_CLIST and the iterator BBC_C_IT.\n// Use of C_LISTs enables BBCs to exist in multiple cells simultaneously.\n// As a consequence, ownership of BBCs is assumed to be elsewhere and\n// persistent for at least the life of the BBGrid, or at least until Clear is\n// called which removes all references to inserted objects without actually\n// deleting them.\n// Most uses derive a class from a specific instantiation of BBGrid,\n// thereby making most of the ugly template notation go away.\n// The friend class GridSearch, with the same template arguments, is\n// used to search a grid efficiently in one of several search patterns.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nclass BBGrid : public GridBase {\n  friend class GridSearch<BBC, BBC_CLIST, BBC_C_IT>;\n\npublic:\n  BBGrid();\n  BBGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n  ~BBGrid() override;\n\n  // (Re)Initialize the grid. The gridsize is the size in pixels of each cell,\n  // and bleft, tright are the bounding box of everything to go in it.\n  void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n\n  // Empty all the lists but leave the grid itself intact.\n  void Clear();\n  // Deallocate the data in the lists but otherwise leave the lists and the grid\n  // intact.\n  void ClearGridData(void (*free_method)(BBC *));\n\n  // Insert a bbox into the appropriate place in the grid.\n  // If h_spread, then all cells covered horizontally by the box are\n  // used, otherwise, just the bottom-left. 
Similarly for v_spread.\n  // WARNING: InsertBBox may invalidate an active GridSearch. Call\n  // RepositionIterator() on any GridSearches that are active on this grid.\n  void InsertBBox(bool h_spread, bool v_spread, BBC *bbox);\n\n  // Using a pix from TraceOutlineOnReducedPix or TraceBlockOnReducedPix, in\n  // which each pixel corresponds to a grid cell, insert a bbox into every\n  // place in the grid where the corresponding pixel is 1. The Pix is handled\n  // upside-down to match the Tesseract coordinate system. (As created by\n  // TraceOutlineOnReducedPix or TraceBlockOnReducedPix.)\n  // (0, 0) in the pix corresponds to (left, bottom) in the\n  // grid (in grid coords), and the pix works up the grid from there.\n  // WARNING: InsertPixPtBBox may invalidate an active GridSearch. Call\n  // RepositionIterator() on any GridSearches that are active on this grid.\n  void InsertPixPtBBox(int left, int bottom, Image pix, BBC *bbox);\n\n  // Remove the bbox from the grid.\n  // WARNING: Any GridSearch operating on this grid could be invalidated!\n  // If a GridSearch is operating, call GridSearch::RemoveBBox() instead.\n  void RemoveBBox(BBC *bbox);\n\n  // Returns true if the given rectangle has no overlapping elements.\n  bool RectangleEmpty(const TBOX &rect);\n\n  // Returns an IntGrid showing the number of elements in each cell.\n  // Returned IntGrid must be deleted after use.\n  IntGrid *CountCellElements();\n\n#ifndef GRAPHICS_DISABLED\n\n  // Make a window of an appropriate size to display things in the grid.\n  ScrollView *MakeWindow(int x, int y, const char *window_name);\n\n  // Display the bounding boxes of the BLOBNBOXes in this grid.\n  // Use of this function requires an additional member of the BBC class:\n  // ScrollView::Color BBC::BoxColor() const.\n  void DisplayBoxes(ScrollView *window);\n\n#endif // !GRAPHICS_DISABLED\n\n  // ASSERT_HOST that every cell contains no more than one copy of each entry.\n  void AssertNoDuplicates();\n\n  // 
Handle a click event in a display window.\n  virtual void HandleClick(int x, int y);\n\nprotected:\n  BBC_CLIST *grid_; // 2-d array of CLISTS of BBC elements.\n\nprivate:\n};\n\n// The GridSearch class enables neighbourhood searching on a BBGrid.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nclass GridSearch {\npublic:\n  GridSearch(BBGrid<BBC, BBC_CLIST, BBC_C_IT> *grid) : grid_(grid) {}\n\n  // Get the grid x, y coords of the most recently returned BBC.\n  int GridX() const {\n    return x_;\n  }\n  int GridY() const {\n    return y_;\n  }\n\n  // Sets the search mode to return a box only once.\n  // Efficiency warning: Implementation currently uses a squared-order\n  // search in the number of returned elements. Use only where a small\n  // number of elements are spread over a wide area, eg ColPartitions.\n  void SetUniqueMode(bool mode) {\n    unique_mode_ = mode;\n  }\n  // TODO(rays) Replace calls to ReturnedSeedElement with SetUniqueMode.\n  // It only works if the search includes the bottom-left corner.\n  // Apart from full search, all other searches return a box several\n  // times if the box is inserted with h_spread or v_spread.\n  // This method will return true for only one occurrence of each box\n  // that was inserted with both h_spread and v_spread as true.\n  // It will usually return false for boxes that were not inserted with\n  // both h_spread=true and v_spread=true\n  bool ReturnedSeedElement() const {\n    TBOX box = previous_return_->bounding_box();\n    int x_center = (box.left() + box.right()) / 2;\n    int y_center = (box.top() + box.bottom()) / 2;\n    int grid_x, grid_y;\n    grid_->GridCoords(x_center, y_center, &grid_x, &grid_y);\n    return (x_ == grid_x) && (y_ == grid_y);\n  }\n\n  // Various searching iterations... 
Note that these iterations\n  // all share data members, so you can't run more than one iteration\n  // in parallel in a single GridSearch instance, but multiple instances\n  // can search the same BBGrid in parallel.\n  // Note that all the searches can return blobs that may not exactly\n  // match the search conditions, since they return everything in the\n  // covered grid cells. It is up to the caller to check for\n  // appropriateness.\n  // TODO(rays) NextRectSearch only returns valid elements. Make the other\n  // searches test before return also and remove the tests from code\n  // that uses GridSearch.\n\n  // Start a new full search. Will iterate all stored blobs, from the top.\n  // If the blobs have been inserted using InsertBBox, (not InsertPixPtBBox)\n  // then the full search guarantees to return each blob in the grid once.\n  // Other searches may return a blob more than once if they have been\n  // inserted using h_spread or v_spread.\n  void StartFullSearch();\n  // Return the next bbox in the search or nullptr if done.\n  BBC *NextFullSearch();\n\n  // Start a new radius search. Will search in a spiral up to a\n  // given maximum radius in grid cells from the given center in pixels.\n  void StartRadSearch(int x, int y, int max_radius);\n  // Return the next bbox in the radius search or nullptr if the\n  // maximum radius has been reached.\n  BBC *NextRadSearch();\n\n  // Start a new left or right-looking search. Will search to the side\n  // for a box that vertically overlaps the given vertical line segment.\n  // CAVEAT: This search returns all blobs from the cells to the side\n  // of the start, and somewhat below, since there is no guarantee\n  // that there may not be a taller object in a lower cell. 
The\n  // blobs returned will include all those that vertically overlap and\n  // are no more than twice as high, but may also include some that do\n  // not overlap and some that are more than twice as high.\n  void StartSideSearch(int x, int ymin, int ymax);\n  // Return the next bbox in the side search or nullptr if the\n  // edge has been reached. Searches left to right or right to left\n  // according to the flag.\n  BBC *NextSideSearch(bool right_to_left);\n\n  // Start a vertical-looking search. Will search up or down\n  // for a box that horizontally overlaps the given line segment.\n  void StartVerticalSearch(int xmin, int xmax, int y);\n  // Return the next bbox in the vertical search or nullptr if the\n  // edge has been reached. Searches top to bottom or bottom to top\n  // according to the flag.\n  BBC *NextVerticalSearch(bool top_to_bottom);\n\n  // Start a rectangular search. Will search for a box that overlaps the\n  // given rectangle.\n  void StartRectSearch(const TBOX &rect);\n  // Return the next bbox in the rectangular search or nullptr if complete.\n  BBC *NextRectSearch();\n\n  // Remove the last returned BBC. Will not invalidate this. May invalidate\n  // any other concurrent GridSearch on the same grid. If any others are\n  // in use, call RepositionIterator on those, to continue without harm.\n  void RemoveBBox();\n  void RepositionIterator();\n\nprivate:\n  // Factored out helper to start a search.\n  void CommonStart(int x, int y);\n  // Factored out helper to complete a next search.\n  BBC *CommonNext();\n  // Factored out final return when search is exhausted.\n  BBC *CommonEnd();\n  // Factored out function to set the iterator to the current x_, y_\n  // grid coords and mark the cycle pt.\n  void SetIterator();\n\nprivate:\n  // The grid we are searching.\n  BBGrid<BBC, BBC_CLIST, BBC_C_IT> *grid_ = nullptr;\n  // For executing a search. 
The different search algorithms use these in\n  // different ways, but most use x_origin_ and y_origin_ as the start position.\n  int x_origin_ = 0;\n  int y_origin_ = 0;\n  int max_radius_ = 0;\n  int radius_ = 0;\n  int rad_index_ = 0;\n  int rad_dir_ = 0;\n  TBOX rect_;\n  int x_ = 0; // The current location in grid coords, of the current search.\n  int y_ = 0;\n  bool unique_mode_ = false;\n  BBC *previous_return_ = nullptr; // Previous return from Next*.\n  BBC *next_return_ = nullptr;     // Current value of it_.data() used for repositioning.\n  // An iterator over the list at (x_, y_) in the grid_.\n  BBC_C_IT it_;\n  // Set of unique returned elements used when unique_mode_ is true.\n  std::unordered_set<BBC *> returns_;\n};\n\n// Sort function to sort a BBC by bounding_box().left().\ntemplate <class BBC>\nint SortByBoxLeft(const BBC *p1, const BBC *p2) {\n  int result = p1->bounding_box().left() - p2->bounding_box().left();\n  if (result != 0) {\n    return result;\n  }\n  result = p1->bounding_box().right() - p2->bounding_box().right();\n  if (result != 0) {\n    return result;\n  }\n  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();\n  if (result != 0) {\n    return result;\n  }\n  return p1->bounding_box().top() - p2->bounding_box().top();\n}\n\ntemplate <class BBC>\nbool StdSortByBoxLeft(const BBC *p1, const BBC *p2) {\n  int result = p1->bounding_box().left() - p2->bounding_box().left();\n  if (result != 0) {\n    return result < 0;\n  }\n  result = p1->bounding_box().right() - p2->bounding_box().right();\n  if (result != 0) {\n    return result < 0;\n  }\n  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();\n  if (result != 0) {\n    return result < 0;\n  }\n  return p1->bounding_box().top() < p2->bounding_box().top();\n}\n\n// Sort function to sort a BBC by bounding_box().right() in right-to-left order.\ntemplate <class BBC>\nint SortRightToLeft(const BBC *p1, const BBC *p2) {\n  int result = 
p2->bounding_box().right() - p1->bounding_box().right();\n  if (result != 0) {\n    return result;\n  }\n  result = p2->bounding_box().left() - p1->bounding_box().left();\n  if (result != 0) {\n    return result;\n  }\n  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();\n  if (result != 0) {\n    return result;\n  }\n  return p1->bounding_box().top() - p2->bounding_box().top();\n}\n\ntemplate <class BBC>\nbool StdSortRightToLeft(const BBC *p1, const BBC *p2) {\n  int result = p2->bounding_box().right() - p1->bounding_box().right();\n  if (result != 0) {\n    return result < 0;\n  }\n  result = p2->bounding_box().left() - p1->bounding_box().left();\n  if (result != 0) {\n    return result < 0;\n  }\n  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();\n  if (result != 0) {\n    return result < 0;\n  }\n  return p1->bounding_box().top() < p2->bounding_box().top();\n}\n\n// Sort function to sort a BBC by bounding_box().bottom().\ntemplate <class BBC>\nint SortByBoxBottom(const BBC *p1, const BBC *p2) {\n  int result = p1->bounding_box().bottom() - p2->bounding_box().bottom();\n  if (result != 0) {\n    return result;\n  }\n  result = p1->bounding_box().top() - p2->bounding_box().top();\n  if (result != 0) {\n    return result;\n  }\n  result = p1->bounding_box().left() - p2->bounding_box().left();\n  if (result != 0) {\n    return result;\n  }\n  return p1->bounding_box().right() - p2->bounding_box().right();\n}\n\n///////////////////////////////////////////////////////////////////////\n// BBGrid IMPLEMENTATION.\n///////////////////////////////////////////////////////////////////////\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBGrid<BBC, BBC_CLIST, BBC_C_IT>::BBGrid() : grid_(nullptr) {}\n\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBGrid<BBC, BBC_CLIST, BBC_C_IT>::BBGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)\n    : grid_(nullptr) {\n  Init(gridsize, bleft, tright);\n}\n\ntemplate <class 
BBC, class BBC_CLIST, class BBC_C_IT>\nBBGrid<BBC, BBC_CLIST, BBC_C_IT>::~BBGrid() {\n  delete[] grid_;\n}\n\n// (Re)Initialize the grid. The gridsize is the size in pixels of each cell,\n// and bleft, tright are the bounding box of everything to go in it.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::Init(int gridsize, const ICOORD &bleft,\n                                            const ICOORD &tright) {\n  GridBase::Init(gridsize, bleft, tright);\n  delete[] grid_;\n  grid_ = new BBC_CLIST[gridbuckets_];\n}\n\n// Clear all lists, but leave the array of lists present.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::Clear() {\n  for (int i = 0; i < gridbuckets_; ++i) {\n    grid_[i].shallow_clear();\n  }\n}\n\n// Deallocate the data in the lists but otherwise leave the lists and the grid\n// intact.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::ClearGridData(void (*free_method)(BBC *)) {\n  if (grid_ == nullptr) {\n    return;\n  }\n  GridSearch<BBC, BBC_CLIST, BBC_C_IT> search(this);\n  search.StartFullSearch();\n  BBC *bb;\n  BBC_CLIST bb_list;\n  BBC_C_IT it(&bb_list);\n  while ((bb = search.NextFullSearch()) != nullptr) {\n    it.add_after_then_move(bb);\n  }\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    free_method(it.data());\n  }\n}\n\n// Insert a bbox into the appropriate place in the grid.\n// If h_spread, then all cells covered horizontally by the box are\n// used, otherwise, just the bottom-left. Similarly for v_spread.\n// WARNING: InsertBBox may invalidate an active GridSearch. 
Call\n// RepositionIterator() on any GridSearches that are active on this grid.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::InsertBBox(bool h_spread, bool v_spread, BBC *bbox) {\n  TBOX box = bbox->bounding_box();\n  int start_x, start_y, end_x, end_y;\n  GridCoords(box.left(), box.bottom(), &start_x, &start_y);\n  GridCoords(box.right(), box.top(), &end_x, &end_y);\n  if (!h_spread) {\n    end_x = start_x;\n  }\n  if (!v_spread) {\n    end_y = start_y;\n  }\n  int grid_index = start_y * gridwidth_;\n  for (int y = start_y; y <= end_y; ++y, grid_index += gridwidth_) {\n    for (int x = start_x; x <= end_x; ++x) {\n      grid_[grid_index + x].add_sorted(SortByBoxLeft<BBC>, true, bbox);\n    }\n  }\n}\n\n// Using a pix from TraceOutlineOnReducedPix or TraceBlockOnReducedPix, in\n// which each pixel corresponds to a grid cell, insert a bbox into every\n// place in the grid where the corresponding pixel is 1. The Pix is handled\n// upside-down to match the Tesseract coordinate system. (As created by\n// TraceOutlineOnReducedPix or TraceBlockOnReducedPix.)\n// (0, 0) in the pix corresponds to (left, bottom) in the\n// grid (in grid coords), and the pix works up the grid from there.\n// WARNING: InsertPixPtBBox may invalidate an active GridSearch. 
Call\n// RepositionIterator() on any GridSearches that are active on this grid.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::InsertPixPtBBox(int left, int bottom, Image pix, BBC *bbox) {\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  for (int y = 0; y < height; ++y) {\n    l_uint32 *data = pixGetData(pix) + y * pixGetWpl(pix);\n    for (int x = 0; x < width; ++x) {\n      if (GET_DATA_BIT(data, x)) {\n        grid_[(bottom + y) * gridwidth_ + x + left].add_sorted(SortByBoxLeft<BBC>, true, bbox);\n      }\n    }\n  }\n}\n\n// Remove the bbox from the grid.\n// WARNING: Any GridSearch operating on this grid could be invalidated!\n// If a GridSearch is operating, call GridSearch::RemoveBBox() instead.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::RemoveBBox(BBC *bbox) {\n  TBOX box = bbox->bounding_box();\n  int start_x, start_y, end_x, end_y;\n  GridCoords(box.left(), box.bottom(), &start_x, &start_y);\n  GridCoords(box.right(), box.top(), &end_x, &end_y);\n  int grid_index = start_y * gridwidth_;\n  for (int y = start_y; y <= end_y; ++y, grid_index += gridwidth_) {\n    for (int x = start_x; x <= end_x; ++x) {\n      BBC_C_IT it(&grid_[grid_index + x]);\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        if (it.data() == bbox) {\n          it.extract();\n        }\n      }\n    }\n  }\n}\n\n// Returns true if the given rectangle has no overlapping elements.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nbool BBGrid<BBC, BBC_CLIST, BBC_C_IT>::RectangleEmpty(const TBOX &rect) {\n  GridSearch<BBC, BBC_CLIST, BBC_C_IT> rsearch(this);\n  rsearch.StartRectSearch(rect);\n  return rsearch.NextRectSearch() == nullptr;\n}\n\n// Returns an IntGrid showing the number of elements in each cell.\n// Returned IntGrid must be deleted after use.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nIntGrid *BBGrid<BBC, 
BBC_CLIST, BBC_C_IT>::CountCellElements() {\n  auto *intgrid = new IntGrid(gridsize(), bleft(), tright());\n  for (int y = 0; y < gridheight(); ++y) {\n    for (int x = 0; x < gridwidth(); ++x) {\n      int cell_count = grid_[y * gridwidth() + x].length();\n      intgrid->SetGridCell(x, y, cell_count);\n    }\n  }\n  return intgrid;\n}\n\n#ifndef GRAPHICS_DISABLED\ntemplate <class G>\nclass TabEventHandler : public SVEventHandler {\npublic:\n  explicit TabEventHandler(G *grid) : grid_(grid) {}\n  void Notify(const SVEvent *sv_event) override {\n    if (sv_event->type == SVET_CLICK) {\n      grid_->HandleClick(sv_event->x, sv_event->y);\n    }\n  }\n\nprivate:\n  G *grid_;\n};\n\n// Make a window of an appropriate size to display things in the grid.\n// Position the window at the given x,y.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nScrollView *BBGrid<BBC, BBC_CLIST, BBC_C_IT>::MakeWindow(int x, int y, const char *window_name) {\n  auto tab_win =\n      new ScrollView(window_name, x, y, tright_.x() - bleft_.x(), tright_.y() - bleft_.y(),\n                     tright_.x() - bleft_.x(), tright_.y() - bleft_.y(), true);\n  auto *handler = new TabEventHandler<BBGrid<BBC, BBC_CLIST, BBC_C_IT>>(this);\n  tab_win->AddEventHandler(handler);\n  tab_win->Pen(ScrollView::GREY);\n  tab_win->Rectangle(0, 0, tright_.x() - bleft_.x(), tright_.y() - bleft_.y());\n  return tab_win;\n}\n\n// Create a window at (x,y) and display the bounding boxes of the\n// BLOBNBOXes in this grid.\n// Use of this function requires an additional member of the BBC class:\n// ScrollView::Color BBC::BoxColor() const.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::DisplayBoxes(ScrollView *tab_win) {\n  tab_win->Pen(ScrollView::BLUE);\n  tab_win->Brush(ScrollView::NONE);\n\n  // For every bbox in the grid, display it.\n  GridSearch<BBC, BBC_CLIST, BBC_C_IT> gsearch(this);\n  gsearch.StartFullSearch();\n  BBC *bbox;\n  while ((bbox = 
gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &box = bbox->bounding_box();\n    int left_x = box.left();\n    int right_x = box.right();\n    int top_y = box.top();\n    int bottom_y = box.bottom();\n    ScrollView::Color box_color = bbox->BoxColor();\n    tab_win->Pen(box_color);\n    tab_win->Rectangle(left_x, bottom_y, right_x, top_y);\n  }\n  tab_win->Update();\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// ASSERT_HOST that every cell contains no more than one copy of each entry.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::AssertNoDuplicates() {\n  // Process all grid cells.\n  for (int i = gridwidth_ * gridheight_ - 1; i >= 0; --i) {\n    // Iterate over all elements excent the last.\n    for (BBC_C_IT it(&grid_[i]); !it.at_last(); it.forward()) {\n      BBC *ptr = it.data();\n      BBC_C_IT it2(it);\n      // None of the rest of the elements in the list should equal ptr.\n      for (it2.forward(); !it2.at_first(); it2.forward()) {\n        ASSERT_HOST(it2.data() != ptr);\n      }\n    }\n  }\n}\n\n// Handle a click event in a display window.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid BBGrid<BBC, BBC_CLIST, BBC_C_IT>::HandleClick(int x, int y) {\n  tprintf(\"Click at (%d, %d)\\n\", x, y);\n}\n\n///////////////////////////////////////////////////////////////////////\n// GridSearch IMPLEMENTATION.\n///////////////////////////////////////////////////////////////////////\n\n// Start a new full search. 
Will iterate all stored blobs.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::StartFullSearch() {\n  // Full search uses x_ and y_ as the current grid\n  // cell being searched.\n  CommonStart(grid_->bleft_.x(), grid_->tright_.y());\n}\n\n// Return the next bbox in the search or nullptr if done.\n// The other searches will return a box that overlaps the grid cell\n// thereby duplicating boxes, but NextFullSearch only returns each box once.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBC *GridSearch<BBC, BBC_CLIST, BBC_C_IT>::NextFullSearch() {\n  int x;\n  int y;\n  do {\n    while (it_.cycled_list()) {\n      ++x_;\n      if (x_ >= grid_->gridwidth_) {\n        --y_;\n        if (y_ < 0) {\n          return CommonEnd();\n        }\n        x_ = 0;\n      }\n      SetIterator();\n    }\n    CommonNext();\n    TBOX box = previous_return_->bounding_box();\n    grid_->GridCoords(box.left(), box.bottom(), &x, &y);\n  } while (x != x_ || y != y_);\n  return previous_return_;\n}\n\n// Start a new radius search.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::StartRadSearch(int x, int y, int max_radius) {\n  // Rad search uses x_origin_ and y_origin_ as the center of the circle.\n  // The radius_ is the radius of the (diamond-shaped) circle and\n  // rad_index/rad_dir_ combine to determine the position around it.\n  max_radius_ = max_radius;\n  radius_ = 0;\n  rad_index_ = 0;\n  rad_dir_ = 3;\n  CommonStart(x, y);\n}\n\n// Return the next bbox in the radius search or nullptr if the\n// maximum radius has been reached.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBC *GridSearch<BBC, BBC_CLIST, BBC_C_IT>::NextRadSearch() {\n  for (;;) {\n    while (it_.cycled_list()) {\n      ++rad_index_;\n      if (rad_index_ >= radius_) {\n        ++rad_dir_;\n        rad_index_ = 0;\n        if (rad_dir_ >= 4) {\n          ++radius_;\n          if (radius_ > 
max_radius_) {\n            return CommonEnd();\n          }\n          rad_dir_ = 0;\n        }\n      }\n      ICOORD offset = C_OUTLINE::chain_step(rad_dir_);\n      offset *= radius_ - rad_index_;\n      offset += C_OUTLINE::chain_step(rad_dir_ + 1) * rad_index_;\n      x_ = x_origin_ + offset.x();\n      y_ = y_origin_ + offset.y();\n      if (x_ >= 0 && x_ < grid_->gridwidth_ && y_ >= 0 && y_ < grid_->gridheight_) {\n        SetIterator();\n      }\n    }\n    CommonNext();\n    if (!unique_mode_) {\n      break;\n    }\n    auto inserted = returns_.insert(previous_return_);\n    if (inserted.second) {\n      break;\n    }\n  }\n  return previous_return_;\n}\n\n// Start a new left or right-looking search. Will search to the side\n// for a box that vertically overlaps the given vertical line segment.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::StartSideSearch(int x, int ymin, int ymax) {\n  // Right search records the x in x_origin_, the ymax in y_origin_\n  // and the size of the vertical strip to search in radius_.\n  // To guarantee finding overlapping objects of up to twice the\n  // given size, double the height.\n  radius_ = ((ymax - ymin) * 2 + grid_->gridsize_ - 1) / grid_->gridsize_;\n  rad_index_ = 0;\n  CommonStart(x, ymax);\n}\n\n// Return the next bbox in the side search or nullptr if the\n// edge has been reached. 
Searches left to right or right to left\n// according to the flag.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBC *GridSearch<BBC, BBC_CLIST, BBC_C_IT>::NextSideSearch(bool right_to_left) {\n  for (;;) {\n    while (it_.cycled_list()) {\n      ++rad_index_;\n      if (rad_index_ > radius_) {\n        if (right_to_left) {\n          --x_;\n        } else {\n          ++x_;\n        }\n        rad_index_ = 0;\n        if (x_ < 0 || x_ >= grid_->gridwidth_) {\n          return CommonEnd();\n        }\n      }\n      y_ = y_origin_ - rad_index_;\n      if (y_ >= 0 && y_ < grid_->gridheight_) {\n        SetIterator();\n      }\n    }\n    CommonNext();\n    if (!unique_mode_) {\n      break;\n    }\n    auto inserted = returns_.insert(previous_return_);\n    if (inserted.second) {\n      break;\n    }\n  }\n  return previous_return_;\n}\n\n// Start a vertical-looking search. Will search up or down\n// for a box that horizontally overlaps the given line segment.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::StartVerticalSearch(int xmin, int xmax, int y) {\n  // Right search records the xmin in x_origin_, the y in y_origin_\n  // and the size of the horizontal strip to search in radius_.\n  radius_ = (xmax - xmin + grid_->gridsize_ - 1) / grid_->gridsize_;\n  rad_index_ = 0;\n  CommonStart(xmin, y);\n}\n\n// Return the next bbox in the vertical search or nullptr if the\n// edge has been reached. 
Searches top to bottom or bottom to top\n// according to the flag.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBC *GridSearch<BBC, BBC_CLIST, BBC_C_IT>::NextVerticalSearch(bool top_to_bottom) {\n  for (;;) {\n    while (it_.cycled_list()) {\n      ++rad_index_;\n      if (rad_index_ > radius_) {\n        if (top_to_bottom) {\n          --y_;\n        } else {\n          ++y_;\n        }\n        rad_index_ = 0;\n        if (y_ < 0 || y_ >= grid_->gridheight_) {\n          return CommonEnd();\n        }\n      }\n      x_ = x_origin_ + rad_index_;\n      if (x_ >= 0 && x_ < grid_->gridwidth_) {\n        SetIterator();\n      }\n    }\n    CommonNext();\n    if (!unique_mode_) {\n      break;\n    }\n    auto inserted = returns_.insert(previous_return_);\n    if (inserted.second) {\n      break;\n    }\n  }\n  return previous_return_;\n}\n\n// Start a rectangular search. Will search for a box that overlaps the\n// given rectangle.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::StartRectSearch(const TBOX &rect) {\n  // Rect search records the xmin in x_origin_, the ymin in y_origin_\n  // and the xmax in max_radius_.\n  // The search proceeds left to right, top to bottom.\n  rect_ = rect;\n  CommonStart(rect.left(), rect.top());\n  grid_->GridCoords(rect.right(), rect.bottom(), // - rect.height(),\n                    &max_radius_, &y_origin_);\n}\n\n// Return the next bbox in the rectangular search or nullptr if complete.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBC *GridSearch<BBC, BBC_CLIST, BBC_C_IT>::NextRectSearch() {\n  for (;;) {\n    while (it_.cycled_list()) {\n      ++x_;\n      if (x_ > max_radius_) {\n        --y_;\n        x_ = x_origin_;\n        if (y_ < y_origin_) {\n          return CommonEnd();\n        }\n      }\n      SetIterator();\n    }\n    CommonNext();\n    if (!rect_.overlap(previous_return_->bounding_box())) {\n      continue;\n    }\n    if (!unique_mode_) 
{\n      break;\n    }\n    auto inserted = returns_.insert(previous_return_);\n    if (inserted.second) {\n      break;\n    }\n  }\n  return previous_return_;\n}\n\n// Remove the last returned BBC. Will not invalidate this. May invalidate\n// any other concurrent GridSearch on the same grid. If any others are\n// in use, call RepositionIterator on those, to continue without harm.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::RemoveBBox() {\n  if (previous_return_ != nullptr) {\n    // Remove all instances of previous_return_ from the list, so the iterator\n    // remains valid after removal from the rest of the grid cells.\n    // if previous_return_ is not on the list, then it has been removed already.\n    BBC *prev_data = nullptr;\n    BBC *new_previous_return = nullptr;\n    it_.move_to_first();\n    for (it_.mark_cycle_pt(); !it_.cycled_list();) {\n      if (it_.data() == previous_return_) {\n        new_previous_return = prev_data;\n        it_.extract();\n        it_.forward();\n        next_return_ = it_.cycled_list() ? nullptr : it_.data();\n      } else {\n        prev_data = it_.data();\n        it_.forward();\n      }\n    }\n    grid_->RemoveBBox(previous_return_);\n    previous_return_ = new_previous_return;\n    RepositionIterator();\n  }\n}\n\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::RepositionIterator() {\n  // Something was deleted, so we have little choice but to clear the\n  // returns list.\n  returns_.clear();\n  // Reset the iterator back to one past the previous return.\n  // If the previous_return_ is no longer in the list, then\n  // next_return_ serves as a backup.\n  it_.move_to_first();\n  // Special case, the first element was removed and reposition\n  // iterator was called. In this case, the data is fine, but the\n  // cycle point is not. 
Detect it and return.\n  if (!it_.empty() && it_.data() == next_return_) {\n    it_.mark_cycle_pt();\n    return;\n  }\n  for (it_.mark_cycle_pt(); !it_.cycled_list(); it_.forward()) {\n    if (it_.data() == previous_return_ || it_.data_relative(1) == next_return_) {\n      CommonNext();\n      return;\n    }\n  }\n  // We ran off the end of the list. Move to a new cell next time.\n  previous_return_ = nullptr;\n  next_return_ = nullptr;\n}\n\n// Factored out helper to start a search.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::CommonStart(int x, int y) {\n  grid_->GridCoords(x, y, &x_origin_, &y_origin_);\n  x_ = x_origin_;\n  y_ = y_origin_;\n  SetIterator();\n  previous_return_ = nullptr;\n  next_return_ = it_.empty() ? nullptr : it_.data();\n  returns_.clear();\n}\n\n// Factored out helper to complete a next search.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBC *GridSearch<BBC, BBC_CLIST, BBC_C_IT>::CommonNext() {\n  previous_return_ = it_.data();\n  it_.forward();\n  next_return_ = it_.cycled_list() ? nullptr : it_.data();\n  return previous_return_;\n}\n\n// Factored out final return when search is exhausted.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nBBC *GridSearch<BBC, BBC_CLIST, BBC_C_IT>::CommonEnd() {\n  previous_return_ = nullptr;\n  next_return_ = nullptr;\n  return nullptr;\n}\n\n// Factored out function to set the iterator to the current x_, y_\n// grid coords and mark the cycle pt.\ntemplate <class BBC, class BBC_CLIST, class BBC_C_IT>\nvoid GridSearch<BBC, BBC_CLIST, BBC_C_IT>::SetIterator() {\n  it_ = &(grid_->grid_[y_ * grid_->gridwidth_ + x_]);\n  it_.mark_cycle_pt();\n}\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_BBGRID_H_\n"
  },
  {
    "path": "src/textord/blkocc.cpp",
    "content": "/*****************************************************************************\n *\n * File:         blkocc.cpp  (Formerly blockocc.c)\n * Description:  Block Occupancy routines\n * Author:       Chris Newton\n *\n * (c) Copyright 1991, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n ******************************************************************************/\n\n#include \"blkocc.h\"\n\n#include \"drawtord.h\"\n#include \"errcode.h\"\n\n#include <cctype>\n#include <cmath>\n\n#include \"helpers.h\"\n\nnamespace tesseract {\n\ndouble_VAR(textord_underline_threshold, 0.5, \"Fraction of width occupied\");\n\n// Forward declarations of static functions\nstatic void horizontal_cblob_projection(C_BLOB *blob,  // blob to project\n                                        STATS *stats); // output\nstatic void horizontal_coutline_projection(C_OUTLINE *outline,\n                                           STATS *stats); // output\n\n/**\n * test_underline\n *\n * Check to see if the blob is an underline.\n * Return true if it is.\n */\n\nbool test_underline(  // look for underlines\n    bool testing_on,  ///< drawing blob\n    C_BLOB *blob,     ///< blob to test\n    int16_t baseline, ///< coords of baseline\n    int16_t xheight   ///< height of line\n) {\n  TDimension occ;\n  STATS projection;\n\n  auto blob_box = blob->bounding_box();\n  auto blob_width = blob->bounding_box().width();\n  
projection.set_range(blob_box.bottom(), blob_box.top());\n  if (testing_on) {\n    //              blob->plot(to_win,GOLDENROD,GOLDENROD);\n    //              line_color_index(to_win,GOLDENROD);\n    //              move2d(to_win,blob_box.left(),baseline);\n    //              draw2d(to_win,blob_box.right(),baseline);\n    //              move2d(to_win,blob_box.left(),baseline+xheight);\n    //              draw2d(to_win,blob_box.right(),baseline+xheight);\n    tprintf(\"Testing underline on blob at (%d,%d)->(%d,%d), base=%d\\nOccs:\",\n            blob->bounding_box().left(), blob->bounding_box().bottom(),\n            blob->bounding_box().right(), blob->bounding_box().top(), baseline);\n  }\n  horizontal_cblob_projection(blob, &projection);\n  int32_t desc_occ = 0;\n  for (occ = blob_box.bottom(); occ < baseline; occ++) {\n    if (occ <= blob_box.top() && projection.pile_count(occ) > desc_occ) {\n      // max in region\n      desc_occ = projection.pile_count(occ);\n    }\n  }\n  int32_t x_occ = 0;\n  for (occ = baseline; occ <= baseline + xheight; occ++) {\n    if (occ >= blob_box.bottom() && occ <= blob_box.top() && projection.pile_count(occ) > x_occ) {\n      // max in region\n      x_occ = projection.pile_count(occ);\n    }\n  }\n  int32_t asc_occ = 0;\n  for (occ = baseline + xheight + 1; occ <= blob_box.top(); occ++) {\n    if (occ >= blob_box.bottom() && projection.pile_count(occ) > asc_occ) {\n      asc_occ = projection.pile_count(occ);\n    }\n  }\n  if (testing_on) {\n    tprintf(\"%d %d %d\\n\", desc_occ, x_occ, asc_occ);\n  }\n  if (desc_occ == 0 && x_occ == 0 && asc_occ == 0) {\n    tprintf(\"Bottom=%d, top=%d, base=%d, x=%d\\n\", blob_box.bottom(), blob_box.top(), baseline,\n            xheight);\n    projection.print();\n  }\n  if (desc_occ > x_occ + x_occ && desc_occ > blob_width * textord_underline_threshold) {\n    return true; // real underline\n  }\n  return asc_occ > x_occ + x_occ && asc_occ > blob_width * textord_underline_threshold; // 
overline\n                                                                                        // neither\n}\n\n/**\n * horizontal_cblob_projection\n *\n * Compute the horizontal projection of a cblob from its outlines\n * and add to the given STATS.\n */\n\nstatic void horizontal_cblob_projection( // project outlines\n    C_BLOB *blob,                        ///< blob to project\n    STATS *stats                         ///< output\n) {\n  // outlines of blob\n  C_OUTLINE_IT out_it = blob->out_list();\n\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    horizontal_coutline_projection(out_it.data(), stats);\n  }\n}\n\n/**\n * horizontal_coutline_projection\n *\n * Compute the horizontal projection of an outline from its outlines\n * and add to the given STATS.\n */\n\nstatic void horizontal_coutline_projection( // project outlines\n    C_OUTLINE *outline,                     ///< outline to project\n    STATS *stats                            ///< output\n) {\n  ICOORD pos;        // current point\n  ICOORD step;       // edge step\n  int32_t length;    // of outline\n  int16_t stepindex; // current step\n  C_OUTLINE_IT out_it = outline->child();\n\n  pos = outline->start_pos();\n  length = outline->pathlength();\n  for (stepindex = 0; stepindex < length; stepindex++) {\n    step = outline->step(stepindex);\n    if (step.y() > 0) {\n      stats->add(pos.y(), pos.x());\n    } else if (step.y() < 0) {\n      stats->add(pos.y() - 1, -pos.x());\n    }\n    pos += step;\n  }\n\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    horizontal_coutline_projection(out_it.data(), stats);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/blkocc.h",
    "content": "/******************************************************************************\n *\n * File:         blkocc.h  (Formerly blockocc.h)\n * Description:  Block Occupancy routines\n * Author:       Chris Newton\n *\n * (c) Copyright 1991, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n ******************************************************************************/\n\n#ifndef BLKOCC_H\n#define BLKOCC_H\n\n#include \"elst.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\nclass C_BLOB;\n\n/***************************************************************************\nCLASS REGION_OCC\n\n  The class REGION_OCC defines a section of outline which exists entirely\n  within a single region. The only data held is the min and max x limits of\n  the outline within the region.\n\n  REGION_OCCs are held on lists, one list for each region.  The lists are\n  built in sorted order of min x. Overlapping REGION_OCCs are not permitted on\n  a single list. An overlapping region to be added causes the existing region\n  to be extended. This extension may result in the following REGION_OCC on the\n  list overlapping the amended one. 
In this case the amended REGION_OCC is\n  further extended to include the range of the following one, so that the\n  following one can be deleted.\n\n****************************************************************************/\n\nclass REGION_OCC : public ELIST<REGION_OCC>::LINK {\npublic:\n  float min_x;         // Lowest x in region\n  float max_x;         // Highest x in region\n  int16_t region_type; // Type of crossing\n\n  REGION_OCC() = default; // constructor used\n  // only in COPIER etc\n  REGION_OCC( // constructor\n      float min, float max, int16_t region) {\n    min_x = min;\n    max_x = max;\n    region_type = region;\n  }\n};\n\nELISTIZEH(REGION_OCC)\n#define RANGE_IN_BAND(band_max, band_min, range_max, range_min) \\\n  (((range_min) >= (band_min)) && ((range_max) < (band_max)))\n/************************************************************************\nAdapted from the following procedure so that it can be used in the bands\nclass in an include file...\n\nbool    range_in_band[\n              range within band?\nint16_t band_max,\nint16_t band_min,\nint16_t range_max,\nint16_t range_min]\n{\n  if ((range_min >= band_min) && (range_max < band_max))\n    return true;\n  else\n    return false;\n}\n***********************************************************************/\n#define RANGE_OVERLAPS_BAND(band_max, band_min, range_max, range_min) \\\n  (((range_max) >= (band_min)) && ((range_min) < (band_max)))\n/************************************************************************\nAdapted from the following procedure so that it can be used in the bands\nclass in an include file...\n\nbool    range_overlaps_band[\n              range crosses band?\nint16_t band_max,\nint16_t band_min,\nint16_t range_max,\nint16_t range_min]\n{\n  if ((range_max >= band_min) && (range_min < band_max))\n    return true;\n  else\n    return 
false;\n}\n***********************************************************************/\n/**********************************************************************\n  Bands\n  -----\n\n  BAND 4\n--------------------------------\n  BAND 3\n--------------------------------\n\n  BAND 2\n\n--------------------------------\n\n  BAND 1\n\nBand 0 is the dot band\n\nEach band has an error margin above and below. An outline is not considered to\nhave significantly changed bands until it has moved out of the error margin.\n*************************************************************************/\nclass BAND {\npublic:\n  int16_t max_max; // upper max\n  int16_t max;     // nominal max\n  int16_t min_max; // lower max\n  int16_t max_min; // upper min\n  int16_t min;     // nominal min\n  int16_t min_min; // lower min\n\n  BAND() = default; // constructor\n\n  void set(                  // initialise a band\n      int16_t new_max_max,   // upper max\n      int16_t new_max,       // new nominal max\n      int16_t new_min_max,   // new lower max\n      int16_t new_max_min,   // new upper min\n      int16_t new_min,       // new nominal min\n      int16_t new_min_min) { // new lower min\n    max_max = new_max_max;\n    max = new_max;\n    min_max = new_min_max;\n    max_min = new_max_min;\n    min = new_min;\n    min_min = new_min_min;\n  }\n\n  bool in_minimal( // in minimal limits?\n      float y) {   // y value\n    return (y >= max_min) && (y < min_max);\n  }\n\n  bool in_nominal( // in nominal limits?\n      float y) {   // y value\n    return (y >= min) && (y < max);\n  }\n\n  bool in_maximal( // in maximal limits?\n      float y) {   // y value\n    return (y >= min_min) && (y < max_max);\n  }\n\n  // overlaps min limits?\n  bool range_overlaps_minimal(float y1,   // one range limit\n                              float y2) { // other range limit\n    if (y1 > y2) {\n      return RANGE_OVERLAPS_BAND(min_max, max_min, y1, y2);\n    } else {\n      return 
RANGE_OVERLAPS_BAND(min_max, max_min, y2, y1);\n    }\n  }\n\n  // overlaps nom limits?\n  bool range_overlaps_nominal(float y1,   // one range limit\n                              float y2) { // other range limit\n    if (y1 > y2) {\n      return RANGE_OVERLAPS_BAND(max, min, y1, y2);\n    } else {\n      return RANGE_OVERLAPS_BAND(max, min, y2, y1);\n    }\n  }\n\n  // overlaps max limits?\n  bool range_overlaps_maximal(float y1,   // one range limit\n                              float y2) { // other range limit\n    if (y1 > y2) {\n      return RANGE_OVERLAPS_BAND(max_max, min_min, y1, y2);\n    } else {\n      return RANGE_OVERLAPS_BAND(max_max, min_min, y2, y1);\n    }\n  }\n\n  bool range_in_minimal( // within min limits?\n      float y1,          // one range limit\n      float y2) {        // other range limit\n    if (y1 > y2) {\n      return RANGE_IN_BAND(min_max, max_min, y1, y2);\n    } else {\n      return RANGE_IN_BAND(min_max, max_min, y2, y1);\n    }\n  }\n\n  bool range_in_nominal( // within nom limits?\n      float y1,          // one range limit\n      float y2) {        // other range limit\n    if (y1 > y2) {\n      return RANGE_IN_BAND(max, min, y1, y2);\n    } else {\n      return RANGE_IN_BAND(max, min, y2, y1);\n    }\n  }\n\n  bool range_in_maximal( // within max limits?\n      float y1,          // one range limit\n      float y2) {        // other range limit\n    if (y1 > y2) {\n      return RANGE_IN_BAND(max_max, min_min, y1, y2);\n    } else {\n      return RANGE_IN_BAND(max_max, min_min, y2, y1);\n    }\n  }\n};\n\n/* Standard positions */\n\n#define MAX_NUM_BANDS 5\n#define UNDEFINED_BAND 99\n#define NO_LOWER_LIMIT -9999\n#define NO_UPPER_LIMIT 9999\n\n#define DOT_BAND 0\n\n/* Special occupancy code emitted for the 0 region at the end of a word */\n\n#define END_OF_WERD_CODE 255\n\nextern double_VAR_H(textord_underline_threshold);\n\nbool test_underline(  // look for underlines\n    bool testing_on,  // drawing blob\n    C_BLOB 
*blob,     // blob to test\n    int16_t baseline, // coords of baseline\n    int16_t xheight   // height of line\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/blobgrid.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        blobgrid.cpp\n// Description: BBGrid of BLOBNBOX with useful BLOBNBOX-specific methods.\n// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n// Created:     Sat Jun 11 10:30:01 PST 2011\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"blobgrid.h\"\n\nnamespace tesseract {\n\nBlobGrid::BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)\n    : BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>(gridsize, bleft, tright) {}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nBlobGrid::~BlobGrid() = default;\n\n// Inserts all the blobs from the given list, with x and y spreading,\n// without removing from the source list, so ownership remains with the\n// source list.\nvoid BlobGrid::InsertBlobList(BLOBNBOX_LIST *blobs) {\n  BLOBNBOX_IT blob_it(blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    if (!blob->joined_to_prev()) {\n      InsertBBox(true, true, blob);\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/blobgrid.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        blobgrid.h\n// Description: BBGrid of BLOBNBOX with useful BLOBNBOX-specific methods.\n// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n// Created:     Sat Jun 11 10:26:01 PST 2011\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_BLOBGRID_H_\n#define TESSERACT_TEXTORD_BLOBGRID_H_\n\n#include \"bbgrid.h\"\n#include \"blobbox.h\"\n\nnamespace tesseract {\n\nCLISTIZEH(BLOBNBOX)\n\nusing BlobGridSearch = GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>;\n\nclass TESS_API BlobGrid : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {\npublic:\n  BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n  ~BlobGrid() override;\n\n  // Inserts all the blobs from the given list, with x and y spreading,\n  // without removing from the source list, so ownership remains with the\n  // source list.\n  void InsertBlobList(BLOBNBOX_LIST *blobs);\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_BLOBGRID_H_\n"
  },
  {
    "path": "src/textord/ccnontextdetect.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ccnontextdetect.cpp\n// Description: Connected-Component-based photo (non-text) detection.\n// Author:      rays@google.com (Ray Smith)\n//\n// Copyright 2011 Google Inc. All Rights Reserved.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"ccnontextdetect.h\"\n#include \"helpers.h\"         // for IntCastRounded\n#include \"imagefind.h\"\n#include \"strokewidth.h\"\n\nnamespace tesseract {\n\n// Max number of neighbour small objects per squared gridsize before a grid\n// cell becomes image.\nconst double kMaxSmallNeighboursPerPix = 1.0 / 32;\n// Max number of small blobs a large blob may overlap before it is rejected\n// and determined to be image.\nconst int kMaxLargeOverlapsWithSmall = 3;\n// Max number of small blobs a medium blob may overlap before it is rejected\n// and determined to be image. Larger than for large blobs as medium blobs\n// may be complex Chinese characters. Very large Chinese characters are going\n// to overlap more medium blobs than small.\nconst int kMaxMediumOverlapsWithSmall = 12;\n// Max number of normal blobs a large blob may overlap before it is rejected\n// and determined to be image. 
This is set higher to allow for drop caps, which\n// may overlap a lot of good text blobs.\nconst int kMaxLargeOverlapsWithMedium = 12;\n// Multiplier of original noise_count used to test for the case of spreading\n// noise beyond where it should really be.\nconst int kOriginalNoiseMultiple = 8;\n// Pixel padding for noise blobs when rendering on the image\n// mask to encourage them to join together. Make it too big and images\n// will fatten out too much and have to be clipped to text.\nconst int kNoisePadding = 4;\n// Fraction of max_noise_count_ to be added to the noise count if there is\n// photo mask in the background.\nconst double kPhotoOffsetFraction = 0.375;\n// Min ratio of perimeter^2/16area for a \"good\" blob in estimating noise\n// density. Good blobs are supposed to be highly likely real text.\n// We consider a square to have unit ratio, where A=(p/4)^2, hence the factor\n// of 16. Digital circles are weird and have a minimum ratio of pi/64, not\n// the 1/(4pi) that you would expect.\nconst double kMinGoodTextPARatio = 1.5;\n\nCCNonTextDetect::CCNonTextDetect(int gridsize, const ICOORD &bleft, const ICOORD &tright)\n    : BlobGrid(gridsize, bleft, tright)\n    , max_noise_count_(static_cast<int>(kMaxSmallNeighboursPerPix * gridsize * gridsize))\n    , noise_density_(nullptr) {\n  // TODO(rays) break max_noise_count_ out into an area-proportional\n  // value, as now plus an additive constant for the number of text blobs\n  // in the 3x3 neighbourhood - maybe 9.\n}\n\nCCNonTextDetect::~CCNonTextDetect() {\n  delete noise_density_;\n}\n\n// Creates and returns a Pix with the same resolution as the original\n// in which 1 (black) pixels represent likely non text (photo, line drawing)\n// areas of the page, deleting from the blob_block the blobs that were\n// determined to be non-text.\n// The photo_map is used to bias the decision towards non-text, rather than\n// supplying definite decision.\n// The blob_block is the usual result of connected component 
analysis,\n// holding the detected blobs.\n// The returned Pix should be PixDestroyed after use.\nImage CCNonTextDetect::ComputeNonTextMask(bool debug, Image photo_map, TO_BLOCK *blob_block) {\n  // Insert the smallest blobs into the grid.\n  InsertBlobList(&blob_block->small_blobs);\n  InsertBlobList(&blob_block->noise_blobs);\n  // Add the medium blobs that don't have a good strokewidth neighbour.\n  // Those that do go into good_grid as an antidote to spreading beyond the\n  // real reaches of a noise region.\n  BlobGrid good_grid(gridsize(), bleft(), tright());\n  BLOBNBOX_IT blob_it(&blob_block->blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    double perimeter_area_ratio = blob->cblob()->perimeter() / 4.0;\n    perimeter_area_ratio *= perimeter_area_ratio / blob->enclosed_area();\n    if (blob->GoodTextBlob() == 0 || perimeter_area_ratio < kMinGoodTextPARatio) {\n      InsertBBox(true, true, blob);\n    } else {\n      good_grid.InsertBBox(true, true, blob);\n    }\n  }\n  noise_density_ = ComputeNoiseDensity(debug, photo_map, &good_grid);\n  good_grid.Clear(); // Not needed any more.\n  Image pix = noise_density_->ThresholdToPix(max_noise_count_);\n  if (debug) {\n    pixWrite(\"junknoisemask.png\", pix, IFF_PNG);\n  }\n  ScrollView *win = nullptr;\n#ifndef GRAPHICS_DISABLED\n  if (debug) {\n    win = MakeWindow(0, 400, \"Photo Mask Blobs\");\n  }\n#endif // !GRAPHICS_DISABLED\n  // Large and medium blobs are not text if they overlap with \"a lot\" of small\n  // blobs.\n  MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithSmall, win,\n                            ScrollView::DARK_GREEN, pix);\n  MarkAndDeleteNonTextBlobs(&blob_block->blobs, kMaxMediumOverlapsWithSmall, win, ScrollView::WHITE,\n                            pix);\n  // Clear the grid of small blobs and insert the medium blobs.\n  Clear();\n  InsertBlobList(&blob_block->blobs);\n  
MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithMedium, win,\n                            ScrollView::DARK_GREEN, pix);\n  // Clear again before we start deleting the blobs in the grid.\n  Clear();\n  MarkAndDeleteNonTextBlobs(&blob_block->noise_blobs, -1, win, ScrollView::CORAL, pix);\n  MarkAndDeleteNonTextBlobs(&blob_block->small_blobs, -1, win, ScrollView::GOLDENROD, pix);\n  MarkAndDeleteNonTextBlobs(&blob_block->blobs, -1, win, ScrollView::WHITE, pix);\n  if (debug) {\n#ifndef GRAPHICS_DISABLED\n    win->Update();\n#endif // !GRAPHICS_DISABLED\n    pixWrite(\"junkccphotomask.png\", pix, IFF_PNG);\n#ifndef GRAPHICS_DISABLED\n    win->AwaitEvent(SVET_DESTROY);\n    delete win;\n#endif // !GRAPHICS_DISABLED\n  }\n  return pix;\n}\n\n// Computes and returns the noise_density IntGrid, at the same gridsize as\n// this by summing the number of small elements in a 3x3 neighbourhood of\n// each grid cell. good_grid is filled with blobs that are considered most\n// likely good text, and this is filled with small and medium blobs that are\n// more likely non-text.\n// The photo_map is used to bias the decision towards non-text, rather than\n// supplying definite decision.\nIntGrid *CCNonTextDetect::ComputeNoiseDensity(bool debug, Image photo_map, BlobGrid *good_grid) {\n  IntGrid *noise_counts = CountCellElements();\n  IntGrid *noise_density = noise_counts->NeighbourhoodSum();\n  IntGrid *good_counts = good_grid->CountCellElements();\n  // Now increase noise density in photo areas, to bias the decision and\n  // minimize hallucinated text on image, but trim the noise_density where\n  // there are good blobs and the original count is low in non-photo areas,\n  // indicating that most of the result came from neighbouring cells.\n  int height = pixGetHeight(photo_map);\n  int photo_offset = IntCastRounded(max_noise_count_ * kPhotoOffsetFraction);\n  for (int y = 0; y < gridheight(); ++y) {\n    for (int x = 0; x < gridwidth(); ++x) {\n      int 
noise = noise_density->GridCellValue(x, y);\n      if (max_noise_count_ < noise + photo_offset && noise <= max_noise_count_) {\n        // Test for photo.\n        int left = x * gridsize();\n        int right = left + gridsize();\n        int bottom = height - y * gridsize();\n        int top = bottom - gridsize();\n        if (ImageFind::BoundsWithinRect(photo_map, &left, &top, &right, &bottom)) {\n          noise_density->SetGridCell(x, y, noise + photo_offset);\n        }\n      }\n      if (debug && noise > max_noise_count_ && good_counts->GridCellValue(x, y) > 0) {\n        tprintf(\"At %d, %d, noise = %d, good=%d, orig=%d, thr=%d\\n\", x * gridsize(), y * gridsize(),\n                noise_density->GridCellValue(x, y), good_counts->GridCellValue(x, y),\n                noise_counts->GridCellValue(x, y), max_noise_count_);\n      }\n      if (noise > max_noise_count_ && good_counts->GridCellValue(x, y) > 0 &&\n          noise_counts->GridCellValue(x, y) * kOriginalNoiseMultiple <= max_noise_count_) {\n        noise_density->SetGridCell(x, y, 0);\n      }\n    }\n  }\n  delete noise_counts;\n  delete good_counts;\n  return noise_density;\n}\n\n// Helper to expand a box in one of the 4 directions by the given pad,\n// provided it does not expand into any cell with a zero noise density.\n// If that is not possible, try expanding all round by a small constant.\nstatic TBOX AttemptBoxExpansion(const TBOX &box, const IntGrid &noise_density, int pad) {\n  TBOX expanded_box(box);\n  expanded_box.set_right(box.right() + pad);\n  if (!noise_density.AnyZeroInRect(expanded_box)) {\n    return expanded_box;\n  }\n  expanded_box = box;\n  expanded_box.set_left(box.left() - pad);\n  if (!noise_density.AnyZeroInRect(expanded_box)) {\n    return expanded_box;\n  }\n  expanded_box = box;\n  expanded_box.set_top(box.top() + pad);\n  if (!noise_density.AnyZeroInRect(expanded_box)) {\n    return expanded_box;\n  }\n  expanded_box = box;\n  expanded_box.set_bottom(box.bottom() + 
pad);\n  if (!noise_density.AnyZeroInRect(expanded_box)) {\n    return expanded_box;\n  }\n  expanded_box = box;\n  expanded_box.pad(kNoisePadding, kNoisePadding);\n  if (!noise_density.AnyZeroInRect(expanded_box)) {\n    return expanded_box;\n  }\n  return box;\n}\n\n// Tests each blob in the list to see if it is certain non-text using 2\n// conditions:\n// 1. blob overlaps a cell with high value in noise_density_ (previously set\n// by ComputeNoiseDensity).\n// OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This\n// condition is disabled with max_blob_overlaps == -1.\n// If it does, the blob is declared non-text, and is used to mark up the\n// nontext_mask. Such blobs are fully deleted, and non-noise blobs have their\n// neighbours reset, as they may now point to deleted data.\n// WARNING: The blobs list blobs may be in the *this grid, but they are\n// not removed. If any deleted blobs might be in *this, then this must be\n// Clear()ed immediately after MarkAndDeleteNonTextBlobs is called.\n// If the win is not nullptr, deleted blobs are drawn on it in red, and kept\n// blobs are drawn on it in ok_color.\nvoid CCNonTextDetect::MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST *blobs, int max_blob_overlaps,\n                                                ScrollView *win, ScrollView::Color ok_color,\n                                                Image nontext_mask) {\n  int imageheight = tright().y() - bleft().x();\n  BLOBNBOX_IT blob_it(blobs);\n  BLOBNBOX_LIST dead_blobs;\n  BLOBNBOX_IT dead_it(&dead_blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    TBOX box = blob->bounding_box();\n    if (!noise_density_->RectMostlyOverThreshold(box, max_noise_count_) &&\n        (max_blob_overlaps < 0 || !BlobOverlapsTooMuch(blob, max_blob_overlaps))) {\n      blob->ClearNeighbours();\n#ifndef GRAPHICS_DISABLED\n      if (win != nullptr) {\n        blob->plot(win, ok_color, ok_color);\n 
     }\n#endif // !GRAPHICS_DISABLED\n    } else {\n      if (noise_density_->AnyZeroInRect(box)) {\n        // There is a danger that the bounding box may overlap real text, so\n        // we need to render the outline.\n        Image blob_pix = blob->cblob()->render_outline();\n        pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(),\n                    PIX_SRC | PIX_DST, blob_pix, 0, 0);\n        blob_pix.destroy();\n      } else {\n        if (box.area() < gridsize() * gridsize()) {\n          // It is a really bad idea to make lots of small components in the\n          // photo mask, so try to join it to a bigger area by expanding the\n          // box in a way that does not touch any zero noise density cell.\n          box = AttemptBoxExpansion(box, *noise_density_, gridsize());\n        }\n        // All overlapped cells are non-zero, so just mark the rectangle.\n        pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(),\n                    PIX_SET, nullptr, 0, 0);\n      }\n#ifndef GRAPHICS_DISABLED\n      if (win != nullptr) {\n        blob->plot(win, ScrollView::RED, ScrollView::RED);\n      }\n#endif // !GRAPHICS_DISABLED\n      // It is safe to delete the cblob now, as it isn't used by the grid\n      // or BlobOverlapsTooMuch, and the BLOBNBOXes will go away with the\n      // dead_blobs list.\n      // TODO: remove next line, currently still needed for resultiterator_test.\n      delete blob->remove_cblob();\n      dead_it.add_to_end(blob_it.extract());\n    }\n  }\n}\n\n// Returns true if the given blob overlaps more than max_overlaps blobs\n// in the current grid.\nbool CCNonTextDetect::BlobOverlapsTooMuch(BLOBNBOX *blob, int max_overlaps) {\n  // Search the grid to see what intersects it.\n  // Setup a Rectangle search for overlapping this blob.\n  BlobGridSearch rsearch(this);\n  const TBOX &box = blob->bounding_box();\n  rsearch.StartRectSearch(box);\n  
rsearch.SetUniqueMode(true);\n  BLOBNBOX *neighbour;\n  int overlap_count = 0;\n  while (overlap_count <= max_overlaps && (neighbour = rsearch.NextRectSearch()) != nullptr) {\n    if (box.major_overlap(neighbour->bounding_box())) {\n      ++overlap_count;\n      if (overlap_count > max_overlaps) {\n        return true;\n      }\n    }\n  }\n  return false;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/ccnontextdetect.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ccnontextdetect.h\n// Description: Connected-Component-based non-text detection.\n// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n// Created:     Sat Jun 11 09:52:01 PST 2011\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_CCPHOTODETECT_H_\n#define TESSERACT_TEXTORD_CCPHOTODETECT_H_\n\n#include \"blobgrid.h\"\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n// The CCNonTextDetect class contains grid-based operations on blobs to create\n// a full-resolution image mask analogous yet complementary to\n// pixGenHalftoneMask as it is better at line-drawings, graphs and charts.\nclass CCNonTextDetect : public BlobGrid {\npublic:\n  CCNonTextDetect(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n  ~CCNonTextDetect() override;\n\n  // Creates and returns a Pix with the same resolution as the original\n  // in which 1 (black) pixels represent likely non text (photo, line drawing)\n  // areas of the page, deleting from the blob_block the blobs that were\n  // determined to be non-text.\n  // The photo_map (binary image mask) is used to bias the decision towards\n  // non-text, rather than supplying a definite decision.\n  // The blob_block is the usual result of connected component analysis,\n  // 
holding the detected blobs.\n  // The returned Pix should be PixDestroyed after use.\n  Image ComputeNonTextMask(bool debug, Image photo_map, TO_BLOCK *blob_block);\n\nprivate:\n  // Computes and returns the noise_density IntGrid, at the same gridsize as\n  // this by summing the number of small elements in a 3x3 neighbourhood of\n  // each grid cell. good_grid is filled with blobs that are considered most\n  // likely good text, and this is filled with small and medium blobs that are\n  // more likely non-text.\n  // The photo_map is used to bias the decision towards non-text, rather than\n  // supplying definite decision.\n  IntGrid *ComputeNoiseDensity(bool debug, Image photo_map, BlobGrid *good_grid);\n\n  // Tests each blob in the list to see if it is certain non-text using 2\n  // conditions:\n  // 1. blob overlaps a cell with high value in noise_density_ (previously set\n  // by ComputeNoiseDensity).\n  // OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This\n  // condition is disabled with max_blob_overlaps == -1.\n  // If it does, the blob is declared non-text, and is used to mark up the\n  // nontext_mask. Such blobs are fully deleted, and non-noise blobs have their\n  // neighbours reset, as they may now point to deleted data.\n  // WARNING: The blobs list blobs may be in the *this grid, but they are\n  // not removed. 
If any deleted blobs might be in *this, then this must be\n  // Clear()ed immediately after MarkAndDeleteNonTextBlobs is called.\n  // If the win is not nullptr, deleted blobs are drawn on it in red, and kept\n  // blobs are drawn on it in ok_color.\n  void MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST *blobs, int max_blob_overlaps, ScrollView *win,\n                                 ScrollView::Color ok_color, Image nontext_mask);\n  // Returns true if the given blob overlaps more than max_overlaps blobs\n  // in the current grid.\n  bool BlobOverlapsTooMuch(BLOBNBOX *blob, int max_overlaps);\n\n  // Max entry in noise_density_ before the cell is declared noisy.\n  int max_noise_count_;\n  // Completed noise density map, which we keep around to use for secondary\n  // noise detection.\n  IntGrid *noise_density_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_CCPHOTODETECT_H_\n"
  },
  {
    "path": "src/textord/cjkpitch.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        cjkpitch.cpp\n// Description: Code to determine fixed pitchness and the pitch if fixed,\n//              for CJK text.\n// Author:      takenaka@google.com (Hiroshi Takenaka)\n//\n// Copyright 2011 Google Inc. All Rights Reserved.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"cjkpitch.h\"\n#include \"topitch.h\"\n#include \"tovars.h\"\n\n#include <algorithm> // for std::sort\n#include <cmath>\n#include <vector>    // for std::vector\n\nnamespace tesseract {\n\nstatic BOOL_VAR(textord_space_size_is_variable, false,\n                \"If true, word delimiter spaces are assumed to have \"\n                \"variable width, even though characters have fixed pitch.\");\n\n// Allow +/-10% error for character pitch / body size.\nstatic const float kFPTolerance = 0.1f;\n\n// Minimum ratio of \"good\" character pitch for a row to be considered\n// to be fixed-pitch.\nstatic const float kFixedPitchThreshold = 0.35f;\n\n// rank statistics for a small collection of float values.\nclass SimpleStats {\npublic:\n  SimpleStats() = default;\n  ~SimpleStats() = default;\n\n  void Clear() {\n    values_.clear();\n    finalized_ = false;\n  }\n\n  void Add(float value) {\n    values_.push_back(value);\n    finalized_ = false;\n  }\n\n  void Finish() {\n    std::sort(values_.begin(), 
values_.end());\n    finalized_ = true;\n  }\n\n  float ile(double frac) {\n    if (!finalized_) {\n      Finish();\n    }\n    if (values_.empty()) {\n      return 0.0f;\n    }\n    if (frac >= 1.0) {\n      return values_.back();\n    }\n    if (frac <= 0.0 || values_.size() == 1) {\n      return values_[0];\n    }\n    int index = static_cast<int>((values_.size() - 1) * frac);\n    float reminder = (values_.size() - 1) * frac - index;\n\n    return values_[index] * (1.0f - reminder) + values_[index + 1] * reminder;\n  }\n\n  float median() {\n    return ile(0.5);\n  }\n\n  float minimum() {\n    if (!finalized_) {\n      Finish();\n    }\n    if (values_.empty()) {\n      return 0.0f;\n    }\n    return values_[0];\n  }\n\n  bool empty() const {\n    return values_.empty();\n  }\n\n  int size() const {\n    return values_.size();\n  }\n\nprivate:\n  bool finalized_ = false;\n  std::vector<float> values_;\n};\n\n// statistics for a small collection of float pairs (x, y).\n// EstimateYFor(x, r) returns the estimated y at x, based on\n// existing samples between x*(1-r) ~ x*(1+r).\nclass LocalCorrelation {\npublic:\n  struct float_pair {\n    float x, y;\n    int vote;\n  };\n\n  LocalCorrelation() : finalized_(false) {}\n  ~LocalCorrelation() = default;\n\n  void Finish() {\n    std::sort(values_.begin(), values_.end(), float_pair_compare);\n    finalized_ = true;\n  }\n\n  void Clear() {\n    finalized_ = false;\n  }\n\n  void Add(float x, float y, int v) {\n    struct float_pair value;\n    value.x = x;\n    value.y = y;\n    value.vote = v;\n    values_.push_back(value);\n    finalized_ = false;\n  }\n\n  float EstimateYFor(float x, float r) {\n    ASSERT_HOST(finalized_);\n    unsigned start = 0, end = values_.size();\n    // Because the number of samples (used_) is assumed to be small,\n    // just use linear search to find values within the range.\n    while (start < values_.size() && values_[start].x < x * (1 - r)) {\n      start++;\n    }\n    while (end > 
0 && values_[end - 1].x > x * (1 + r)) {\n      end--;\n    }\n\n    // Fall back to the global average if there are no data within r\n    // of x.\n    if (start >= end) {\n      start = 0;\n      end = values_.size();\n    }\n\n    // Compute weighted average of the values.\n    float rc = 0;\n    int vote = 0;\n    for (auto i = start; i < end; i++) {\n      rc += values_[i].vote * x * values_[i].y / values_[i].x;\n      vote += values_[i].vote;\n    }\n\n    return vote == 0 ? 0.0f : rc / vote;\n  }\n\nprivate:\n  static bool float_pair_compare(const float_pair f_a, const float_pair f_b) {\n    return f_a.x < f_b.x;\n  }\n\n  bool finalized_;\n  std::vector<struct float_pair> values_;\n};\n\n// Class to represent a character on a fixed pitch row.  A FPChar may\n// consist of multiple blobs (BLOBNBOX's).\nclass FPChar {\npublic:\n  enum Alignment { ALIGN_UNKNOWN, ALIGN_GOOD, ALIGN_BAD };\n\n  FPChar()\n      : box_()\n      , real_body_()\n      , from_(nullptr)\n      , to_(nullptr)\n      , num_blobs_(0)\n      , max_gap_(0)\n      , final_(false)\n      , alignment_(ALIGN_UNKNOWN)\n      , merge_to_prev_(false)\n      , delete_flag_(false) {}\n\n  // Initialize from blob.\n  void Init(BLOBNBOX *blob) {\n    box_ = blob->bounding_box();\n    real_body_ = box_;\n    from_ = to_ = blob;\n    num_blobs_ = 1;\n  }\n\n  // Merge this character with \"next\". 
The \"next\" character should\n  // consist of succeeding blobs on the same row.\n  void Merge(const FPChar &next) {\n    int gap = real_body_.x_gap(next.real_body_);\n    if (gap > max_gap_) {\n      max_gap_ = gap;\n    }\n\n    box_ += next.box_;\n    real_body_ += next.real_body_;\n    to_ = next.to_;\n    num_blobs_ += next.num_blobs_;\n  }\n\n  // Accessors.\n  const TBOX &box() const {\n    return box_;\n  }\n  void set_box(const TBOX &box) {\n    box_ = box;\n  }\n  const TBOX &real_body() const {\n    return real_body_;\n  }\n\n  bool is_final() const {\n    return final_;\n  }\n  void set_final(bool flag) {\n    final_ = flag;\n  }\n\n  const Alignment &alignment() const {\n    return alignment_;\n  }\n  void set_alignment(Alignment alignment) {\n    alignment_ = alignment;\n  }\n\n  bool merge_to_prev() const {\n    return merge_to_prev_;\n  }\n  void set_merge_to_prev(bool flag) {\n    merge_to_prev_ = flag;\n  }\n\n  bool delete_flag() const {\n    return delete_flag_;\n  }\n  void set_delete_flag(bool flag) {\n    delete_flag_ = flag;\n  }\n\n  int max_gap() const {\n    return max_gap_;\n  }\n\n  int num_blobs() const {\n    return num_blobs_;\n  }\n\nprivate:\n  TBOX box_; // Rectangle region considered to be occupied by this\n  // character.  
It could be bigger than the bounding box.\n  TBOX real_body_; // Real bounding box of this character.\n  BLOBNBOX *from_; // The first blob of this character.\n  BLOBNBOX *to_;   // The last blob of this character.\n  int num_blobs_;  // Number of blobs that belong to this character.\n  int max_gap_;    // Maximum x gap between the blobs.\n\n  bool final_; // True if alignment/fragmentation decision for this\n  // character is finalized.\n\n  Alignment alignment_; // Alignment status.\n  bool merge_to_prev_;  // True if this is a fragmented blob that\n  // needs to be merged to the previous\n  // character.\n\n  int delete_flag_; // True if this character is merged to another\n                    // one and needs to be deleted.\n};\n\n// Class to represent a fixed pitch row, as a linear collection of\n// FPChar's.\nclass FPRow {\npublic:\n  FPRow() : all_pitches_(), all_gaps_(), good_pitches_(), good_gaps_(), heights_(), characters_() {}\n\n  ~FPRow() = default;\n\n  // Initialize from TD_ROW.\n  void Init(TO_ROW *row);\n\n  // Estimate character pitch of this row, based on current alignment\n  // status of underlying FPChar's.  The argument pass1 can be set to\n  // true if the function is called after Pass1Analyze(), to eliminate\n  // some redundant computation.\n  void EstimatePitch(bool pass1);\n\n  // Check each character if it has good character pitches between its\n  // predecessor and its successor and set its alignment status.  If\n  // we already calculated the estimated pitch for this row, the value\n  // is used.  If we didn't, a character is considered to be good, if\n  // the pitches between its predecessor and its successor are almost\n  // equal.\n  void Pass1Analyze();\n\n  // Find characters that fit nicely into one imaginary body next to a\n  // character which is already finalized. 
Then mark them as character\n  // fragments.\n  bool Pass2Analyze();\n\n  // Merge FPChars marked as character fragments into one.\n  void MergeFragments();\n\n  // Finalize characters that are already large enough and cannot be\n  // merged with others any more.\n  void FinalizeLargeChars();\n\n  // Output pitch estimation results to attributes of TD_ROW.\n  void OutputEstimations();\n\n  void DebugOutputResult(int row_index);\n\n  int good_pitches() {\n    return good_pitches_.size();\n  }\n\n  float pitch() {\n    return pitch_;\n  }\n\n  float estimated_pitch() {\n    return estimated_pitch_;\n  }\n\n  void set_estimated_pitch(float v) {\n    estimated_pitch_ = v;\n  }\n\n  float height() {\n    return height_;\n  }\n\n  float height_pitch_ratio() {\n    if (good_pitches_.size() < 2) {\n      return -1.0;\n    }\n    return height_ / good_pitches_.median();\n  }\n\n  float gap() {\n    return gap_;\n  }\n\n  size_t num_chars() {\n    return characters_.size();\n  }\n  FPChar *character(int i) {\n    return &characters_[i];\n  }\n\n  const TBOX &box(int i) {\n    return characters_[i].box();\n  }\n\n  const TBOX &real_body(int i) {\n    return characters_[i].real_body();\n  }\n\n  bool is_box_modified(int i) {\n    return !(characters_[i].box() == characters_[i].real_body());\n  }\n\n  float center_x(int i) {\n    return (characters_[i].box().left() + characters_[i].box().right()) / 2.0;\n  }\n\n  bool is_final(int i) {\n    return characters_[i].is_final();\n  }\n\n  void finalize(int i) {\n    characters_[i].set_final(true);\n  }\n\n  bool is_good(int i) {\n    return characters_[i].alignment() == FPChar::ALIGN_GOOD;\n  }\n\n  void mark_good(int i) {\n    characters_[i].set_alignment(FPChar::ALIGN_GOOD);\n  }\n\n  void mark_bad(int i) {\n    characters_[i].set_alignment(FPChar::ALIGN_BAD);\n  }\n\n  void clear_alignment(int i) {\n    characters_[i].set_alignment(FPChar::ALIGN_UNKNOWN);\n  }\n\nprivate:\n  static float x_overlap_fraction(const TBOX &box1, const 
TBOX &box2) {\n    if (std::min(box1.width(), box2.width()) == 0) {\n      return 0.0;\n    }\n    return -box1.x_gap(box2) / static_cast<float>(std::min(box1.width(), box2.width()));\n  }\n\n  static bool mostly_overlap(const TBOX &box1, const TBOX &box2) {\n    return x_overlap_fraction(box1, box2) > 0.9;\n  }\n\n  static bool significant_overlap(const TBOX &box1, const TBOX &box2) {\n    if (std::min(box1.width(), box2.width()) == 0) {\n      return false;\n    }\n    int overlap = -box1.x_gap(box2);\n    return overlap > 1 || x_overlap_fraction(box1, box2) > 0.1;\n  }\n\n  static float box_pitch(const TBOX &ref, const TBOX &box) {\n    return abs(ref.left() + ref.right() - box.left() - box.right()) / 2.0;\n  }\n\n  // Check if two neighboring characters satisfy the fixed pitch model.\n  static bool is_good_pitch(float pitch, const TBOX &box1, const TBOX &box2) {\n    // Character box shouldn't exceed pitch.\n    if (box1.width() >= pitch * (1.0 + kFPTolerance) ||\n        box2.width() >= pitch * (1.0 + kFPTolerance) ||\n        box1.height() >= pitch * (1.0 + kFPTolerance) ||\n        box2.height() >= pitch * (1.0 + kFPTolerance)) {\n      return false;\n    }\n\n    const float real_pitch = box_pitch(box1, box2);\n    if (std::fabs(real_pitch - pitch) < pitch * kFPTolerance) {\n      return true;\n    }\n\n    if (textord_space_size_is_variable) {\n      // Hangul characters usually have fixed pitch, but words are\n      // delimited by space which can be narrower than characters.\n      if (real_pitch > pitch && real_pitch < pitch * 2.0 && real_pitch - box1.x_gap(box2) < pitch) {\n        return true;\n      }\n    }\n    return false;\n  }\n\n  static bool is_interesting_blob(const BLOBNBOX *blob) {\n    return !blob->joined_to_prev() && blob->flow() != BTFT_LEADER;\n  }\n\n  // Cleanup chars that are already merged to others.\n  void DeleteChars() {\n    unsigned index = 0;\n    for (unsigned i = 0; i < characters_.size(); ++i) {\n      if 
(!characters_[i].delete_flag()) {\n        if (index != i) {\n          characters_[index] = characters_[i];\n        }\n        index++;\n      }\n    }\n    characters_.resize(index);\n  }\n\n  float pitch_ = 0.0f;           // Character pitch.\n  float estimated_pitch_ = 0.0f; // equal to pitch_ if pitch_ is considered\n  // to be good enough.\n  float height_ = 0.0f; // Character height.\n  float gap_ = 0.0f;    // Minimum gap between characters.\n\n  // Pitches between any two successive characters.\n  SimpleStats all_pitches_;\n  // Gaps between any two successive characters.\n  SimpleStats all_gaps_;\n  // Pitches between any two successive characters that are consistent\n  // with the fixed pitch model.\n  SimpleStats good_pitches_;\n  // Gaps between any two successive characters that are consistent\n  // with the fixed pitch model.\n  SimpleStats good_gaps_;\n\n  SimpleStats heights_;\n\n  std::vector<FPChar> characters_;\n  TO_ROW *real_row_ = nullptr; // Underlying TD_ROW for this row.\n};\n\nvoid FPRow::Init(TO_ROW *row) {\n  ASSERT_HOST(row != nullptr);\n  ASSERT_HOST(row->xheight > 0);\n  real_row_ = row;\n  real_row_->pitch_decision = PITCH_CORR_PROP; // Default decision.\n\n  BLOBNBOX_IT blob_it = row->blob_list();\n  // Initialize characters_ and compute the initial estimation of\n  // character height.\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    if (is_interesting_blob(blob_it.data())) {\n      FPChar fp_char;\n      fp_char.Init(blob_it.data());\n      // Merge unconditionally if two blobs overlap.\n      if (!characters_.empty() && significant_overlap(fp_char.box(), characters_.back().box())) {\n        characters_.back().Merge(fp_char);\n      } else {\n        characters_.push_back(fp_char);\n      }\n      TBOX bound = blob_it.data()->bounding_box();\n      if (bound.height() * 3.0 > bound.width()) {\n        heights_.Add(bound.height());\n      }\n    }\n  }\n  heights_.Finish();\n  height_ = 
heights_.ile(0.875);\n}\n\nvoid FPRow::OutputEstimations() {\n  if (good_pitches_.empty()) {\n    pitch_ = 0.0f;\n    real_row_->pitch_decision = PITCH_CORR_PROP;\n    return;\n  }\n\n  pitch_ = good_pitches_.median();\n  real_row_->fixed_pitch = pitch_;\n  // good_gaps_.ile(0.125) can be large if most characters on the row\n  // are skinny. Use pitch_ - height_ instead if it's smaller, but\n  // positive.\n  real_row_->kern_size = real_row_->pr_nonsp =\n      std::min(good_gaps_.ile(0.125), std::max(pitch_ - height_, 0.0f));\n  real_row_->body_size = pitch_ - real_row_->kern_size;\n\n  if (good_pitches_.size() < all_pitches_.size() * kFixedPitchThreshold) {\n    // If more than half of the characters of a line don't fit to the\n    // fixed pitch model, consider the line to be proportional. 50%\n    // seems to be a good threshold in practice as well.\n    // Anyway we store estimated values (fixed_pitch, kern_size, etc.) in\n    // real_row_ as a partial estimation result and try to use them in the\n    // normalization process.\n    real_row_->pitch_decision = PITCH_CORR_PROP;\n    return;\n  } else if (good_pitches_.size() > all_pitches_.size() * 0.75) {\n    real_row_->pitch_decision = PITCH_DEF_FIXED;\n  } else {\n    real_row_->pitch_decision = PITCH_CORR_FIXED;\n  }\n\n  real_row_->space_size = real_row_->pr_space = pitch_;\n  // Set min_space to 50% of character pitch so that we can break CJK\n  // text at a half-width space after punctuation.\n  real_row_->min_space = (pitch_ + good_gaps_.minimum()) * 0.5;\n\n  // Don't consider a quarter space as a real space, because it's used\n  // for line justification in traditional Japanese books.\n  real_row_->max_nonspace =\n      std::max(pitch_ * 0.25 + good_gaps_.minimum(), static_cast<double>(good_gaps_.ile(0.875)));\n\n  int space_threshold = std::min((real_row_->max_nonspace + real_row_->min_space) / 2,\n                                 static_cast<int>(real_row_->xheight));\n\n  // Make max_nonspace larger 
than any intra-character gap so that\n  // make_prop_words() won't break a row at the middle of a character.\n  for (size_t i = 0; i < num_chars(); ++i) {\n    if (characters_[i].max_gap() > real_row_->max_nonspace) {\n      real_row_->max_nonspace = characters_[i].max_gap();\n    }\n  }\n  real_row_->space_threshold = std::min((real_row_->max_nonspace + real_row_->min_space) / 2,\n                                        static_cast<int>(real_row_->xheight));\n  real_row_->used_dm_model = false;\n\n  // Setup char_cells.\n  ICOORDELT_IT cell_it = &real_row_->char_cells;\n  auto *cell = new ICOORDELT(real_body(0).left(), 0);\n  cell_it.add_after_then_move(cell);\n\n  int right = real_body(0).right();\n  for (size_t i = 1; i < num_chars(); ++i) {\n    // Put a word break if gap between two characters is bigger than\n    // space_threshold.  Don't break if none of two characters\n    // couldn't be \"finalized\", because maybe they need to be merged\n    // to one character.\n    if ((is_final(i - 1) || is_final(i)) &&\n        real_body(i - 1).x_gap(real_body(i)) > space_threshold) {\n      cell = new ICOORDELT(right + 1, 0);\n      cell_it.add_after_then_move(cell);\n      while (right + pitch_ < box(i).left()) {\n        right += pitch_;\n        cell = new ICOORDELT(right + 1, 0);\n        cell_it.add_after_then_move(cell);\n      }\n      right = box(i).left();\n    }\n    cell = new ICOORDELT((right + real_body(i).left()) / 2, 0);\n    cell_it.add_after_then_move(cell);\n    right = real_body(i).right();\n  }\n\n  cell = new ICOORDELT(right + 1, 0);\n  cell_it.add_after_then_move(cell);\n\n  // TODO(takenaka): add code to store alignment/fragmentation\n  // information to blobs so that it can be reused later, e.g. 
in\n  // recognition phase.\n}\n\nvoid FPRow::EstimatePitch(bool pass1) {\n  good_pitches_.Clear();\n  all_pitches_.Clear();\n  good_gaps_.Clear();\n  all_gaps_.Clear();\n  heights_.Clear();\n  if (num_chars() == 0) {\n    return;\n  }\n\n  int32_t cx0, cx1;\n  bool prev_was_good = is_good(0);\n  cx0 = center_x(0);\n\n  heights_.Add(box(0).height());\n  for (size_t i = 1; i < num_chars(); i++) {\n    cx1 = center_x(i);\n    int32_t pitch = cx1 - cx0;\n    int32_t gap = std::max(0, real_body(i - 1).x_gap(real_body(i)));\n\n    heights_.Add(box(i).height());\n    // Ignore if the pitch is too close.  But don't ignore wide pitch\n    // may be the result of large tracking.\n    if (pitch > height_ * 0.5) {\n      all_pitches_.Add(pitch);\n      all_gaps_.Add(gap);\n      if (is_good(i)) {\n        // In pass1 (after Pass1Analyze()), all characters marked as\n        // \"good\" have a good consistent pitch with their previous\n        // characters.  However, it's not true in pass2 and a good\n        // character may have a good pitch only between its successor.\n        // So we collect only pitch values between two good\n        // characters. 
and within tolerance in pass2.\n        if (pass1 ||\n            (prev_was_good && std::fabs(estimated_pitch_ - pitch) < kFPTolerance * estimated_pitch_)) {\n          good_pitches_.Add(pitch);\n          if (!is_box_modified(i - 1) && !is_box_modified(i)) {\n            good_gaps_.Add(gap);\n          }\n        }\n        prev_was_good = true;\n      } else {\n        prev_was_good = false;\n      }\n    }\n    cx0 = cx1;\n  }\n\n  good_pitches_.Finish();\n  all_pitches_.Finish();\n  good_gaps_.Finish();\n  all_gaps_.Finish();\n  heights_.Finish();\n\n  height_ = heights_.ile(0.875);\n  if (all_pitches_.empty()) {\n    pitch_ = 0.0f;\n    gap_ = 0.0f;\n  } else if (good_pitches_.size() < 2) {\n    // We don't have enough data to estimate the pitch of this row yet.\n    // Use median of all pitches as the initial guess.\n    pitch_ = all_pitches_.median();\n    ASSERT_HOST(pitch_ > 0.0f);\n    gap_ = all_gaps_.ile(0.125);\n  } else {\n    pitch_ = good_pitches_.median();\n    ASSERT_HOST(pitch_ > 0.0f);\n    gap_ = good_gaps_.ile(0.125);\n  }\n}\n\nvoid FPRow::DebugOutputResult(int row_index) {\n  if (num_chars() > 0) {\n    tprintf(\n        \"Row %d: pitch_decision=%d, fixed_pitch=%f, max_nonspace=%d, \"\n        \"space_size=%f, space_threshold=%d, xheight=%f\\n\",\n        row_index, static_cast<int>(real_row_->pitch_decision), real_row_->fixed_pitch,\n        real_row_->max_nonspace, real_row_->space_size, real_row_->space_threshold,\n        real_row_->xheight);\n\n    for (unsigned i = 0; i < num_chars(); i++) {\n      tprintf(\"Char %u: is_final=%d is_good=%d num_blobs=%d: \", i, is_final(i), is_good(i),\n              character(i)->num_blobs());\n      box(i).print();\n    }\n  }\n}\n\nvoid FPRow::Pass1Analyze() {\n  if (num_chars() < 2) {\n    return;\n  }\n\n  if (estimated_pitch_ > 0.0f) {\n    for (size_t i = 2; i < num_chars(); i++) {\n      if (is_good_pitch(estimated_pitch_, box(i - 2), box(i - 1)) &&\n          is_good_pitch(estimated_pitch_, 
box(i - 1), box(i))) {\n        mark_good(i - 1);\n      }\n    }\n  } else {\n    for (size_t i = 2; i < num_chars(); i++) {\n      if (is_good_pitch(box_pitch(box(i - 2), box(i - 1)), box(i - 1), box(i))) {\n        mark_good(i - 1);\n      }\n    }\n  }\n  character(0)->set_alignment(character(1)->alignment());\n  character(num_chars() - 1)->set_alignment(character(num_chars() - 2)->alignment());\n}\n\nbool FPRow::Pass2Analyze() {\n  bool changed = false;\n  if (num_chars() <= 1 || estimated_pitch_ == 0.0f) {\n    return false;\n  }\n  for (size_t i = 0; i < num_chars(); i++) {\n    if (is_final(i)) {\n      continue;\n    }\n\n    FPChar::Alignment alignment = character(i)->alignment();\n    bool intersecting = false;\n    bool not_intersecting = false;\n\n    if (i < num_chars() - 1 && is_final(i + 1)) {\n      // Next character is already finalized. Estimate the imaginary\n      // body including this character based on the character. Skip\n      // whitespace if necessary.\n      bool skipped_whitespaces = false;\n      float c1 = center_x(i + 1) - 1.5 * estimated_pitch_;\n      while (c1 > box(i).right()) {\n        skipped_whitespaces = true;\n        c1 -= estimated_pitch_;\n      }\n      TBOX ibody(c1, box(i).bottom(), c1 + estimated_pitch_, box(i).top());\n\n      // Collect all characters that mostly fit in the region.\n      // Also, their union height shouldn't be too big.\n      int j = i;\n      TBOX merged;\n      while (j >= 0 && !is_final(j) && mostly_overlap(ibody, box(j)) &&\n             merged.bounding_union(box(j)).height() < estimated_pitch_ * (1 + kFPTolerance)) {\n        merged += box(j);\n        j--;\n      }\n\n      if (j >= 0 && significant_overlap(ibody, box(j))) {\n        // character(j) lies on the character boundary and doesn't fit\n        // well into the imaginary body.\n        if (!is_final(j)) {\n          intersecting = true;\n        }\n      } else {\n        not_intersecting = true;\n        if (i - j > 0) {\n       
   // Merge character(j+1) ... character(i) because they fit\n          // into the body nicely.\n          if (i - j == 1) {\n            // Only one char in the imaginary body.\n            if (!skipped_whitespaces) {\n              mark_good(i);\n            }\n            // set ibody as bounding box of this character to get\n            // better pitch analysis result for halfwidth glyphs\n            // followed by a halfwidth space.\n            if (box(i).width() <= estimated_pitch_ * 0.5) {\n              ibody += box(i);\n              character(i)->set_box(ibody);\n            }\n            character(i)->set_merge_to_prev(false);\n            finalize(i);\n          } else {\n            for (int k = i; k > j + 1; k--) {\n              character(k)->set_merge_to_prev(true);\n            }\n          }\n        }\n      }\n    }\n    if (i > 0 && is_final(i - 1)) {\n      // Now we repeat everything from the opposite side.  Previous\n      // character is already finalized. Estimate the imaginary body\n      // including this character based on the character.\n      bool skipped_whitespaces = false;\n      float c1 = center_x(i - 1) + 1.5 * estimated_pitch_;\n      while (c1 < box(i).left()) {\n        skipped_whitespaces = true;\n        c1 += estimated_pitch_;\n      }\n      TBOX ibody(c1 - estimated_pitch_, box(i).bottom(), c1, box(i).top());\n\n      size_t j = i;\n      TBOX merged;\n      while (j < num_chars() && !is_final(j) && mostly_overlap(ibody, box(j)) &&\n             merged.bounding_union(box(j)).height() < estimated_pitch_ * (1 + kFPTolerance)) {\n        merged += box(j);\n        j++;\n      }\n\n      if (j < num_chars() && significant_overlap(ibody, box(j))) {\n        if (!is_final(j)) {\n          intersecting = true;\n        }\n      } else {\n        not_intersecting = true;\n        if (j - i > 0) {\n          if (j - i == 1) {\n            if (!skipped_whitespaces) {\n              mark_good(i);\n            }\n            if 
(box(i).width() <= estimated_pitch_ * 0.5) {\n              ibody += box(i);\n              character(i)->set_box(ibody);\n            }\n            character(i)->set_merge_to_prev(false);\n            finalize(i);\n          } else {\n            for (size_t k = i + 1; k < j; k++) {\n              character(k)->set_merge_to_prev(true);\n            }\n          }\n        }\n      }\n    }\n\n    // This character doesn't fit well into the estimated imaginary\n    // bodies. Mark it as bad.\n    if (intersecting && !not_intersecting) {\n      mark_bad(i);\n    }\n    if (character(i)->alignment() != alignment || character(i)->merge_to_prev()) {\n      changed = true;\n    }\n  }\n\n  return changed;\n}\n\nvoid FPRow::MergeFragments() {\n  int last_char = 0;\n\n  for (size_t j = 0; j < num_chars(); ++j) {\n    if (character(j)->merge_to_prev()) {\n      character(last_char)->Merge(*character(j));\n      character(j)->set_delete_flag(true);\n      clear_alignment(last_char);\n      character(j - 1)->set_merge_to_prev(false);\n    } else {\n      last_char = j;\n    }\n  }\n  DeleteChars();\n}\n\nvoid FPRow::FinalizeLargeChars() {\n  float row_pitch = estimated_pitch();\n  for (size_t i = 0; i < num_chars(); i++) {\n    if (is_final(i)) {\n      continue;\n    }\n\n    // Finalize if both neighbors are finalized. We have no other choice.\n    if (i > 0 && is_final(i - 1) && i < num_chars() - 1 && is_final(i + 1)) {\n      finalize(i);\n      continue;\n    }\n\n    float cx = center_x(i);\n    TBOX ibody(cx - 0.5 * row_pitch, 0, cx + 0.5 * row_pitch, 1);\n    if (i > 0) {\n      // The preceding character significantly intersects with the\n      // imaginary body of this character. 
Let Pass2Analyze() handle\n      // this case.\n      if (x_overlap_fraction(ibody, box(i - 1)) > 0.1) {\n        continue;\n      }\n      if (!is_final(i - 1)) {\n        TBOX merged = box(i);\n        merged += box(i - 1);\n        if (merged.width() < row_pitch) {\n          continue;\n        }\n        // This character cannot be finalized yet because it can be\n        // merged with the previous one.  Again, let Pass2Analyze()\n        // handle this case.\n      }\n    }\n    if (i < num_chars() - 1) {\n      if (x_overlap_fraction(ibody, box(i + 1)) > 0.1) {\n        continue;\n      }\n      if (!is_final(i + 1)) {\n        TBOX merged = box(i);\n        merged += box(i + 1);\n        if (merged.width() < row_pitch) {\n          continue;\n        }\n      }\n    }\n    finalize(i);\n  }\n\n  // Update alignment decision.  We only consider finalized characters\n  // in pass2.  E.g. if a finalized character C has another finalized\n  // character L on its left and a not-finalized character R on its\n  // right, we mark C as good if the pitch between C and L is good,\n  // regardless of the pitch between C and R.\n  for (size_t i = 0; i < num_chars(); i++) {\n    if (!is_final(i)) {\n      continue;\n    }\n    bool good_pitch = false;\n    bool bad_pitch = false;\n    if (i > 0 && is_final(i - 1)) {\n      if (is_good_pitch(row_pitch, box(i - 1), box(i))) {\n        good_pitch = true;\n      } else {\n        bad_pitch = true;\n      }\n    }\n    if (i < num_chars() - 1 && is_final(i + 1)) {\n      if (is_good_pitch(row_pitch, box(i), box(i + 1))) {\n        good_pitch = true;\n      } else {\n        bad_pitch = true;\n      }\n    }\n    if (good_pitch && !bad_pitch) {\n      mark_good(i);\n    } else if (!good_pitch && bad_pitch) {\n      mark_bad(i);\n    }\n  }\n}\n\nclass FPAnalyzer {\npublic:\n  FPAnalyzer(ICOORD page_tr, TO_BLOCK_LIST *port_blocks);\n  ~FPAnalyzer() = default;\n\n  void Pass1Analyze() {\n    for (auto &row : rows_) {\n      
row.Pass1Analyze();\n    }\n  }\n\n  // Estimate character pitch for each row.  The argument pass1 can be\n  // set to true if the function is called after Pass1Analyze(), to\n  // eliminate some redundant computation.\n  void EstimatePitch(bool pass1);\n\n  bool maybe_fixed_pitch() {\n    if (rows_.empty() || rows_.size() <= num_bad_rows_ + num_tall_rows_ + 1) {\n      return false;\n    }\n    return true;\n  }\n\n  void MergeFragments() {\n    for (auto &row : rows_) {\n      row.MergeFragments();\n    }\n  }\n\n  void FinalizeLargeChars() {\n    for (auto &row : rows_) {\n      row.FinalizeLargeChars();\n    }\n  }\n\n  bool Pass2Analyze() {\n    bool changed = false;\n    for (auto &row : rows_) {\n      if (row.Pass2Analyze()) {\n        changed = true;\n      }\n    }\n    return changed;\n  }\n\n  void OutputEstimations() {\n    for (auto &row : rows_) {\n      row.OutputEstimations();\n    }\n    // Don't we need page-level estimation of gaps/spaces?\n  }\n\n  void DebugOutputResult() {\n    tprintf(\"FPAnalyzer: final result\\n\");\n    for (size_t i = 0; i < rows_.size(); i++) {\n      rows_[i].DebugOutputResult(i);\n    }\n  }\n\n  size_t num_rows() {\n    return rows_.size();\n  }\n\n  // Returns the upper limit for pass2 loop iteration.\n  unsigned max_iteration() {\n    // We're fixing at least one character per iteration. 
So basically\n    // we shouldn't require more than max_chars_per_row_ iterations.\n    return max_chars_per_row_ + 100;\n  }\n\nprivate:\n  ICOORD page_tr_;\n  std::vector<FPRow> rows_;\n  unsigned num_tall_rows_;\n  unsigned num_bad_rows_;\n  // TODO: num_empty_rows_ is incremented, but never used otherwise.\n  unsigned num_empty_rows_;\n  unsigned max_chars_per_row_;\n};\n\nFPAnalyzer::FPAnalyzer(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)\n    : page_tr_(page_tr)\n    , num_tall_rows_(0)\n    , num_bad_rows_(0)\n    , num_empty_rows_(0)\n    , max_chars_per_row_(0) {\n  TO_BLOCK_IT block_it(port_blocks);\n\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    TO_BLOCK *block = block_it.data();\n    if (!block->get_rows()->empty()) {\n      ASSERT_HOST(block->xheight > 0);\n      find_repeated_chars(block, false);\n    }\n  }\n\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    TO_ROW_IT row_it = block_it.data()->get_rows();\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      FPRow row;\n      row.Init(row_it.data());\n      rows_.push_back(row);\n      size_t num_chars = rows_.back().num_chars();\n      if (num_chars <= 1) {\n        num_empty_rows_++;\n      }\n      if (num_chars > max_chars_per_row_) {\n        max_chars_per_row_ = num_chars;\n      }\n    }\n  }\n}\n\nvoid FPAnalyzer::EstimatePitch(bool pass1) {\n  LocalCorrelation pitch_height_stats;\n\n  num_tall_rows_ = 0;\n  num_bad_rows_ = 0;\n  pitch_height_stats.Clear();\n  for (auto &row : rows_) {\n    row.EstimatePitch(pass1);\n    if (row.good_pitches()) {\n      pitch_height_stats.Add(row.height() + row.gap(), row.pitch(), row.good_pitches());\n      if (row.height_pitch_ratio() > 1.1) {\n        num_tall_rows_++;\n      }\n    } else {\n      num_bad_rows_++;\n    }\n  }\n\n  pitch_height_stats.Finish();\n  for (auto &row : rows_) {\n    if (row.good_pitches() >= 5) {\n      // We have enough 
evidence. Just use the pitch estimation\n      // from this row.\n      row.set_estimated_pitch(row.pitch());\n    } else if (row.num_chars() > 1) {\n      float estimated_pitch = pitch_height_stats.EstimateYFor(row.height() + row.gap(), 0.1f);\n      // CJK characters are more likely to be fragmented than poorly\n      // chopped. So trust the page-level estimation of character\n      // pitch only if it's larger than row-level estimation or\n      // row-level estimation is too large (2x bigger than row height).\n      if (estimated_pitch > row.pitch() || row.pitch() > row.height() * 2.0) {\n        row.set_estimated_pitch(estimated_pitch);\n      } else {\n        row.set_estimated_pitch(row.pitch());\n      }\n    }\n  }\n}\n\nvoid compute_fixed_pitch_cjk(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) {\n  FPAnalyzer analyzer(page_tr, port_blocks);\n  if (analyzer.num_rows() == 0) {\n    return;\n  }\n\n  analyzer.Pass1Analyze();\n  analyzer.EstimatePitch(true);\n\n  // Perform pass1 analysis again with the initial estimation of row\n  // pitches, for better estimation.\n  analyzer.Pass1Analyze();\n  analyzer.EstimatePitch(true);\n\n  // Early exit if the page doesn't seem to contain fixed pitch rows.\n  if (!analyzer.maybe_fixed_pitch()) {\n    if (textord_debug_pitch_test) {\n      tprintf(\"Page doesn't seem to contain fixed pitch rows\\n\");\n    }\n    return;\n  }\n\n  unsigned iteration = 0;\n  do {\n    analyzer.MergeFragments();\n    analyzer.FinalizeLargeChars();\n    analyzer.EstimatePitch(false);\n    iteration++;\n  } while (analyzer.Pass2Analyze() && iteration < analyzer.max_iteration());\n\n  if (textord_debug_pitch_test) {\n    tprintf(\"compute_fixed_pitch_cjk finished after %u iteration (limit=%u)\\n\", iteration,\n            analyzer.max_iteration());\n  }\n\n  analyzer.OutputEstimations();\n  if (textord_debug_pitch_test) {\n    analyzer.DebugOutputResult();\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/cjkpitch.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        cjkpitch.h\n// Description: Code to determine fixed pitchness and the pitch if fixed,\n//              for CJK text.\n// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: takenaka@google.com (Hiroshi Takenaka)\n// Created:     Mon Jun 27 12:48:35 JST 2011\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n#ifndef CJKPITCH_H_\n#define CJKPITCH_H_\n\n#include \"blobbox.h\"\n\nnamespace tesseract {\n\n// Function to test \"fixed-pitchness\" of the input text and estimating\n// character pitch parameters for it, based on CJK fixed-pitch layout\n// model.\n//\n// This function assumes that a fixed-pitch CJK text has following\n// characteristics:\n//\n// - Most glyphs are designed to fit within the same sized square\n//   (imaginary body). Also they are aligned to the center of their\n//   imaginary bodies.\n// - The imaginary body is always a regular rectangle.\n// - There may be some extra space between character bodies\n//   (tracking).\n// - There may be some extra space after punctuations.\n// - The text is *not* space-delimited. Thus spaces are rare.\n// - Character may consists of multiple unconnected blobs.\n//\n// And the function works in two passes.  
On pass 1, it looks for such\n// \"good\" blobs that have the same pitch on both sides and\n// looks like a complete CJK character. Then estimates the character\n// pitch for every row, based on those good blobs. If we couldn't find\n// enough good blobs for a row, then the pitch is estimated from other\n// rows with similar character height instead.\n//\n// Pass 2 is an iterative process to fit the blobs into fixed-pitch\n// character cells. Once we have estimated the character pitch, blobs\n// that are almost as large as the pitch can be considered to be\n// complete characters. And once we know that some characters are\n// complete characters, we can estimate the region occupied by its\n// neighbors. And so on.\n//\n// We repeat the process until all ambiguities are resolved. Then make\n// the final decision about fixed-pitchness of each row and compute\n// pitch and spacing parameters.\n//\n// (If a row is considered to be proportional, pitch_decision for the\n// row is set to PITCH_CORR_PROP and the later phase\n// (i.e. Textord::to_spacing()) should determine its spacing\n// parameters)\n//\n// This function doesn't provide all information required by\n// fixed_pitch_words() and the rows need to be processed with\n// make_prop_words() even if they are fixed pitched.\nvoid compute_fixed_pitch_cjk(ICOORD page_tr,              // top right\n                             TO_BLOCK_LIST *port_blocks); // input list\n\n} // namespace tesseract\n\n#endif // CJKPITCH_H_\n"
  },
  {
    "path": "src/textord/colfind.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colfind.cpp\n// Description: Class to hold BLOBNBOXs in a grid for fast access\n//              to neighbours.\n// Author:      Ray Smith\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"colfind.h\"\n\n#include \"ccnontextdetect.h\"\n#include \"colpartition.h\"\n#include \"colpartitionset.h\"\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"equationdetectbase.h\"\n#endif\n#include \"blobbox.h\"\n#include \"linefind.h\"\n#include \"normalis.h\"\n#include \"params.h\"\n#include \"scrollview.h\"\n#include \"strokewidth.h\"\n#include \"tablefind.h\"\n#include \"workingpartset.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// When assigning columns, the max number of misfit grid rows/ColPartitionSets\n// that can be ignored.\nconst int kMaxIncompatibleColumnCount = 2;\n// Max fraction of mean_column_gap_ for the gap between two partitions within a\n// column to allow them to merge.\nconst double kHorizontalGapMergeFraction = 0.5;\n// Minimum gutter width as a fraction of gridsize\nconst double kMinGutterWidthGrid = 0.5;\n// Max multiple of a partition's median size as a distance threshold for\n// 
adding noise blobs.\nconst double kMaxDistToPartSizeRatio = 1.5;\n\n#ifndef GRAPHICS_DISABLED\nstatic BOOL_VAR(textord_tabfind_show_initial_partitions, false, \"Show partition bounds\");\nstatic BOOL_VAR(textord_tabfind_show_reject_blobs, false, \"Show blobs rejected as noise\");\nstatic INT_VAR(textord_tabfind_show_partitions, 0,\n               \"Show partition bounds, waiting if >1 (ScrollView)\");\nstatic BOOL_VAR(textord_tabfind_show_columns, false, \"Show column bounds (ScrollView)\");\nstatic BOOL_VAR(textord_tabfind_show_blocks, false, \"Show final block bounds (ScrollView)\");\n#endif\nstatic BOOL_VAR(textord_tabfind_find_tables, true, \"run table detection\");\n\n#ifndef GRAPHICS_DISABLED\nScrollView *ColumnFinder::blocks_win_ = nullptr;\n#endif\n\n// Gridsize is an estimate of the text size in the image. A suitable value\n// is in TO_BLOCK::line_size after find_components has been used to make\n// the blobs.\n// bleft and tright are the bounds of the image (or rectangle) being processed.\n// vlines is a (possibly empty) list of TabVector and vertical_x and y are\n// the sum logical vertical vector produced by LineFinder::FindVerticalLines.\nColumnFinder::ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &tright, int resolution,\n                           bool cjk_script, double aligned_gap_fraction, TabVector_LIST *vlines,\n                           TabVector_LIST *hlines, int vertical_x, int vertical_y)\n    : TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y, resolution)\n    , cjk_script_(cjk_script)\n    , min_gutter_width_(static_cast<int>(kMinGutterWidthGrid * gridsize))\n    , mean_column_gap_(tright.x() - bleft.x())\n    , tabfind_aligned_gap_fraction_(aligned_gap_fraction)\n    , deskew_(0.0f, 0.0f)\n    , reskew_(1.0f, 0.0f)\n    , rotation_(1.0f, 0.0f)\n    , rerotate_(1.0f, 0.0f)\n    , text_rotation_(0.0f, 0.0f)\n    , best_columns_(nullptr)\n    , stroke_width_(nullptr)\n    , part_grid_(gridsize, bleft, tright)\n 
   , nontext_map_(nullptr)\n    , projection_(resolution)\n    , denorm_(nullptr)\n    , equation_detect_(nullptr) {\n  TabVector_IT h_it(&horizontal_lines_);\n  h_it.add_list_after(hlines);\n}\n\nColumnFinder::~ColumnFinder() {\n  for (auto set : column_sets_) {\n    delete set;\n  }\n  delete[] best_columns_;\n  delete stroke_width_;\n#ifndef GRAPHICS_DISABLED\n  delete input_blobs_win_;\n#endif\n  nontext_map_.destroy();\n  while (denorm_ != nullptr) {\n    DENORM *dead_denorm = denorm_;\n    denorm_ = const_cast<DENORM *>(denorm_->predecessor());\n    delete dead_denorm;\n  }\n\n  // The ColPartitions are destroyed automatically, but any boxes in\n  // the noise_parts_ list are owned and need to be deleted explicitly.\n  ColPartition_IT part_it(&noise_parts_);\n  for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {\n    ColPartition *part = part_it.data();\n    part->DeleteBoxes();\n  }\n  // Likewise any boxes in the good_parts_ list need to be deleted.\n  // These are just the image parts. Text parts have already given their\n  // boxes on to the TO_BLOCK, and have empty lists.\n  part_it.set_to_list(&good_parts_);\n  for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {\n    ColPartition *part = part_it.data();\n    part->DeleteBoxes();\n  }\n  // Also, any blobs on the image_bblobs_ list need to have their cblobs\n  // deleted. 
This only happens if there has been an early return from\n  // FindColumns, as in a normal return, the blobs go into the grid and\n  // end up in noise_parts_, good_parts_ or the output blocks.\n  BLOBNBOX_IT bb_it(&image_bblobs_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    BLOBNBOX *bblob = bb_it.data();\n    delete bblob->cblob();\n  }\n}\n\n// Performs initial processing on the blobs in the input_block:\n// Setup the part_grid_, stroke_width_, nontext_map.\n// Obvious noise blobs are filtered out and used to mark the nontext_map_.\n// Initial stroke-width analysis is used to get local text alignment\n// direction, so the textline projection_ map can be setup.\n// On return, IsVerticallyAlignedText may be called (now optionally) to\n// determine the gross textline alignment of the page.\nvoid ColumnFinder::SetupAndFilterNoise(PageSegMode pageseg_mode, Image photo_mask_pix,\n                                       TO_BLOCK *input_block) {\n  part_grid_.Init(gridsize(), bleft(), tright());\n  delete stroke_width_;\n  stroke_width_ = new StrokeWidth(gridsize(), bleft(), tright());\n  min_gutter_width_ = static_cast<int>(kMinGutterWidthGrid * gridsize());\n  input_block->ReSetAndReFilterBlobs();\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_blocks) {\n    input_blobs_win_ = MakeWindow(0, 0, \"Filtered Input Blobs\");\n    input_block->plot_graded_blobs(input_blobs_win_);\n  }\n#endif // !GRAPHICS_DISABLED\n  SetBlockRuleEdges(input_block);\n  nontext_map_.destroy();\n  // Run a preliminary strokewidth neighbour detection on the medium blobs.\n  stroke_width_->SetNeighboursOnMediumBlobs(input_block);\n  CCNonTextDetect nontext_detect(gridsize(), bleft(), tright());\n  // Remove obvious noise and make the initial non-text map.\n  nontext_map_ =\n      nontext_detect.ComputeNonTextMask(textord_debug_tabfind, photo_mask_pix, input_block);\n  stroke_width_->FindTextlineDirectionAndFixBrokenCJK(pageseg_mode, cjk_script_, 
input_block);\n  // Clear the strokewidth grid ready for rotation or leader finding.\n  stroke_width_->Clear();\n}\n\n// Tests for vertical alignment of text (returning true if so), and generates\n// a list of blobs of moderate aspect ratio, in the most frequent writing\n// direction (in osd_blobs) for orientation and script detection to test\n// the character orientation.\n// block is the single block for the whole page or rectangle to be OCRed.\n// Note that the vertical alignment may be due to text whose writing direction\n// is vertical, like say Japanese, or due to text whose writing direction is\n// horizontal but whose text appears vertically aligned because the image is\n// not the right way up.\nbool ColumnFinder::IsVerticallyAlignedText(double find_vertical_text_ratio, TO_BLOCK *block,\n                                           BLOBNBOX_CLIST *osd_blobs) {\n  return stroke_width_->TestVerticalTextDirection(find_vertical_text_ratio, block, osd_blobs);\n}\n\n// Rotates the blobs and the TabVectors so that the gross writing direction\n// (text lines) are horizontal and lines are read down the page.\n// Applied rotation stored in rotation_.\n// A second rotation is calculated for application during recognition to\n// make the rotated blobs upright for recognition.\n// Subsequent rotation stored in text_rotation_.\n//\n// Arguments:\n//   vertical_text_lines true if the text lines are vertical.\n//   recognition_rotation [0..3] is the number of anti-clockwise 90 degree\n//   rotations from osd required for the text to be upright and readable.\nvoid ColumnFinder::CorrectOrientation(TO_BLOCK *block, bool vertical_text_lines,\n                                      int recognition_rotation) {\n  const FCOORD anticlockwise90(0.0f, 1.0f);\n  const FCOORD clockwise90(0.0f, -1.0f);\n  const FCOORD rotation180(-1.0f, 0.0f);\n  const FCOORD norotation(1.0f, 0.0f);\n\n  text_rotation_ = norotation;\n  // Rotate the page to make the text upright, as implied by\n  // 
recognition_rotation.\n  rotation_ = norotation;\n  if (recognition_rotation == 1) {\n    rotation_ = anticlockwise90;\n  } else if (recognition_rotation == 2) {\n    rotation_ = rotation180;\n  } else if (recognition_rotation == 3) {\n    rotation_ = clockwise90;\n  }\n  // We infer text writing direction to be vertical if there are several\n  // vertical text lines detected, and horizontal if not. But if the page\n  // orientation was determined to be 90 or 270 degrees, the true writing\n  // direction is the opposite of what we inferred.\n  if (recognition_rotation & 1) {\n    vertical_text_lines = !vertical_text_lines;\n  }\n  // If we still believe the writing direction is vertical, we use the\n  // convention of rotating the page ccw 90 degrees to make the text lines\n  // horizontal, and mark the blobs for rotation cw 90 degrees for\n  // classification so that the text order is correct after recognition.\n  if (vertical_text_lines) {\n    rotation_.rotate(anticlockwise90);\n    text_rotation_.rotate(clockwise90);\n  }\n  // Set rerotate_ to the inverse of rotation_.\n  rerotate_ = FCOORD(rotation_.x(), -rotation_.y());\n  if (rotation_.x() != 1.0f || rotation_.y() != 0.0f) {\n    // Rotate all the blobs and tab vectors.\n    RotateBlobList(rotation_, &block->large_blobs);\n    RotateBlobList(rotation_, &block->blobs);\n    RotateBlobList(rotation_, &block->small_blobs);\n    RotateBlobList(rotation_, &block->noise_blobs);\n    TabFind::ResetForVerticalText(rotation_, rerotate_, &horizontal_lines_, &min_gutter_width_);\n    part_grid_.Init(gridsize(), bleft(), tright());\n    // Reset all blobs to initial state and filter by size.\n    // Since they have rotated, the list they belong on could have changed.\n    block->ReSetAndReFilterBlobs();\n    SetBlockRuleEdges(block);\n    stroke_width_->CorrectForRotation(rerotate_, &part_grid_);\n  }\n  if (textord_debug_tabfind) {\n    tprintf(\"Vertical=%d, orientation=%d, final rotation=(%f, %f)+(%f,%f)\\n\", 
vertical_text_lines,\n            recognition_rotation, rotation_.x(), rotation_.y(), text_rotation_.x(),\n            text_rotation_.y());\n  }\n  // Setup the denormalization.\n  ASSERT_HOST(denorm_ == nullptr);\n  denorm_ = new DENORM;\n  denorm_->SetupNormalization(nullptr, &rotation_, nullptr, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f);\n}\n\n// Finds blocks of text, image, rule line, table etc, returning them in the\n// blocks and to_blocks\n// (Each TO_BLOCK points to the basic BLOCK and adds more information.)\n// Image blocks are generated by a combination of photo_mask_pix (which may\n// NOT be nullptr) and the rejected text found during preliminary textline\n// finding.\n// The input_block is the result of a call to find_components, and contains\n// the blobs found in the image or rectangle to be OCRed. These blobs will be\n// removed and placed in the output blocks, while unused ones will be deleted.\n// If single_column is true, the input is treated as single column, but\n// it is still divided into blocks of equal line spacing/text size.\n// scaled_color is scaled down by scaled_factor from the input color image,\n// and may be nullptr if the input was not color.\n// grey_pix is optional, but if present must match the photo_mask_pix in size,\n// and must be a *real* grey image instead of binary_pix * 255.\n// thresholds_pix is expected to be present iff grey_pix is present and\n// can be an integer factor reduction of the grey_pix. 
// It represents the
// thresholds that were used to create the binary_pix from the grey_pix.
// If diacritic_blobs is non-null, then diacritics/noise blobs, that would
// confuse layout analysis by causing textline overlap, are placed there,
// with the expectation that they will be reassigned to words later and
// noise/diacriticness determined via classification.
// Returns -1 if the user hits the 'd' key in the blocks window while running
// in debug mode, which requests a retry with more debug info.
int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Image scaled_color, int scaled_factor,
                             TO_BLOCK *input_block, Image photo_mask_pix, Image thresholds_pix,
                             Image grey_pix, DebugPixa *pixa_debug, BLOCK_LIST *blocks,
                             BLOBNBOX_LIST *diacritic_blobs, TO_BLOCK_LIST *to_blocks) {
  // NOTE(review): scaled_color and scaled_factor are not referenced anywhere
  // in this body; presumably kept for interface compatibility — TODO confirm.
  // Fold the detected non-text regions into the photo (image) mask.
  photo_mask_pix |= nontext_map_;
  stroke_width_->FindLeaderPartitions(input_block, &part_grid_);
  stroke_width_->RemoveLineResidue(&big_parts_);
  FindInitialTabVectors(nullptr, min_gutter_width_, tabfind_aligned_gap_fraction_, input_block);
  SetBlockRuleEdges(input_block);
  stroke_width_->GradeBlobsIntoPartitions(pageseg_mode, rerotate_, input_block, nontext_map_,
                                          denorm_, cjk_script_, &projection_, diacritic_blobs,
                                          &part_grid_, &big_parts_);
  if (!PSM_SPARSE(pageseg_mode)) {
    // Image partition finding is run twice: once before and once after the
    // image parts found so far are transferred back into the mask.
    ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this,
                                   pixa_debug, &part_grid_, &big_parts_);
    ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_, photo_mask_pix);
    ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this,
                                   pixa_debug, &part_grid_, &big_parts_);
  }
  part_grid_.ReTypeBlobs(&image_bblobs_);
  TidyBlobs(input_block);
  Reset();
  // TODO(rays) need to properly handle big_parts_.
  ColPartition_IT p_it(&big_parts_);
  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
    p_it.data()->DisownBoxesNoAssert();
  }
  big_parts_.clear();
  // The stroke width detector is no longer needed beyond this point.
  delete stroke_width_;
  stroke_width_ = nullptr;
  // Compute the edge offsets whether or not there is a grey_pix. It is done
  // here as the c_blobs haven't been touched by rotation or anything yet,
  // so no denorm is required, yet the text has been separated from image, so
  // no time is wasted running it on image blobs.
  input_block->ComputeEdgeOffsets(thresholds_pix, grey_pix);

  // A note about handling right-to-left scripts (Hebrew/Arabic):
  // The columns must be reversed and come out in right-to-left instead of
  // the normal left-to-right order. Because the left-to-right ordering
  // is implicit in many data structures, it is simpler to fool the algorithms
  // into thinking they are dealing with left-to-right text.
  // To do this, we reflect the needed data in the y-axis and then reflect
  // the blocks back after they have been created. This is a temporary
  // arrangement that is confined to this function only, so the reflection
  // is completely invisible in the output blocks.
  // The only objects reflected are:
  // The vertical separator lines that have already been found;
  // The bounding boxes of all BLOBNBOXES on all lists on the input_block
  // plus the image_bblobs. The outlines are not touched, since they are
  // not looked at.
  bool input_is_rtl = input_block->block->right_to_left();
  if (input_is_rtl) {
    // Reflect the vertical separator lines (member of TabFind).
    ReflectInYAxis();
    // Reflect the blob boxes.
    ReflectForRtl(input_block, &image_bblobs_);
    part_grid_.ReflectInYAxis();
  }

  if (!PSM_SPARSE(pageseg_mode)) {
    if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
      // No tab stops needed. Just the grid that FindTabVectors makes.
      DontFindTabVectors(&image_bblobs_, input_block, &deskew_, &reskew_);
    } else {
      SetBlockRuleEdges(input_block);
      // Find the tab stops, estimate skew, and deskew the tabs, blobs and
      // part_grid_.
      FindTabVectors(&horizontal_lines_, &image_bblobs_, input_block, min_gutter_width_,
                     tabfind_aligned_gap_fraction_, &part_grid_, &deskew_, &reskew_);
      // Add the deskew to the denorm_.
      auto *new_denorm = new DENORM;
      new_denorm->SetupNormalization(nullptr, &deskew_, denorm_, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f,
                                     0.0f);
      denorm_ = new_denorm;
    }
    SetBlockRuleEdges(input_block);
    part_grid_.SetTabStops(this);

    // Make the column_sets_.
    if (!MakeColumns(false)) {
      tprintf("Empty page!!\n");
      part_grid_.DeleteParts();
      return 0; // This is an empty page.
    }

    // Refill the grid using rectangular spreading, and get the benefit
    // of the completed tab vectors marking the rule edges of each blob.
    Clear();
#ifndef GRAPHICS_DISABLED
    if (textord_tabfind_show_reject_blobs) {
      ScrollView *rej_win = MakeWindow(500, 300, "Rejected blobs");
      input_block->plot_graded_blobs(rej_win);
    }
#endif // !GRAPHICS_DISABLED
    InsertBlobsToGrid(false, false, &image_bblobs_, this);
    InsertBlobsToGrid(true, true, &input_block->blobs, this);

    part_grid_.GridFindMargins(best_columns_);
    // Split and merge the partitions by looking at local neighbours.
    GridSplitPartitions();
    // Resolve unknown partitions by adding to an existing partition, fixing
    // the type, or declaring them noise.
    part_grid_.GridFindMargins(best_columns_);
    GridMergePartitions();
    // Insert any unused noise blobs that are close enough to an appropriate
    // partition.
    InsertRemainingNoise(input_block);
    // Add horizontal line separators as partitions.
    GridInsertHLinePartitions();
    GridInsertVLinePartitions();
    // Recompute margins based on a local neighbourhood search.
    part_grid_.GridFindMargins(best_columns_);
    SetPartitionTypes();
  }
#ifndef GRAPHICS_DISABLED
  if (textord_tabfind_show_initial_partitions) {
    ScrollView *part_win = MakeWindow(100, 300, "InitialPartitions");
    part_grid_.DisplayBoxes(part_win);
    DisplayTabVectors(part_win);
  }
#endif
  if (!PSM_SPARSE(pageseg_mode)) {
#ifndef DISABLED_LEGACY_ENGINE
    if (equation_detect_) {
      equation_detect_->FindEquationParts(&part_grid_, best_columns_);
    }
#endif
    if (textord_tabfind_find_tables) {
      TableFinder table_finder;
      table_finder.Init(gridsize(), bleft(), tright());
      table_finder.set_resolution(resolution_);
      table_finder.set_left_to_right_language(!input_block->block->right_to_left());
      // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
      // insert dot-like noise into period_grid_
      table_finder.InsertCleanPartitions(&part_grid_, input_block);
      // Get Table Regions
      table_finder.LocateTables(&part_grid_, best_columns_, WidthCB(), reskew_);
    }
    GridRemoveUnderlinePartitions();
    part_grid_.DeleteUnknownParts(input_block);

    // Build the partitions into chains that belong in the same block and
    // refine into one-to-one links, then smooth the types within each chain.
    part_grid_.FindPartitionPartners();
    part_grid_.FindFigureCaptions();
    part_grid_.RefinePartitionPartners(true);
    SmoothPartnerRuns();

#ifndef GRAPHICS_DISABLED
    if (textord_tabfind_show_partitions) {
      ScrollView *window = MakeWindow(400, 300, "Partitions");
      if (window != nullptr) {
        part_grid_.DisplayBoxes(window);
        if (!textord_debug_printable) {
          DisplayTabVectors(window);
        }
        if (window != nullptr && textord_tabfind_show_partitions > 1) {
          window->AwaitEvent(SVET_DESTROY);
        }
      }
    }
#endif // !GRAPHICS_DISABLED
    part_grid_.AssertNoDuplicates();
  }
  // Ownership of the ColPartitions moves from part_sets_ to part_grid_ here,
  // and ownership of the BLOBNBOXes moves to the ColPartitions.
  // (They were previously owned by the block or the image_bblobs list.)
  ReleaseBlobsAndCleanupUnused(input_block);
  // Ownership of the ColPartitions moves from part_grid_ to good_parts_ and
  // noise_parts_ here. In text blocks, ownership of the BLOBNBOXes moves
  // from the ColPartitions to the output TO_BLOCK. In non-text, the
  // BLOBNBOXes stay with the ColPartitions and get deleted in the destructor.
  if (PSM_SPARSE(pageseg_mode)) {
    part_grid_.ExtractPartitionsAsBlocks(blocks, to_blocks);
  } else {
    TransformToBlocks(blocks, to_blocks);
  }
  if (textord_debug_tabfind) {
    tprintf("Found %d blocks, %d to_blocks\n", blocks->length(), to_blocks->length());
  }

#ifndef GRAPHICS_DISABLED
  if (textord_tabfind_show_blocks) {
    DisplayBlocks(blocks);
  }
#endif
  RotateAndReskewBlocks(input_is_rtl, to_blocks);
  int result = 0;
#ifndef GRAPHICS_DISABLED
  // Debug-window interaction: 'd' requests a retry with more debug info
  // (result becomes -1); any other key clears the blocks; closing the
  // window just ends the wait.
  if (blocks_win_ != nullptr) {
    bool waiting = false;
    do {
      waiting = false;
      auto event = blocks_win_->AwaitEvent(SVET_ANY);
      if (event->type == SVET_INPUT && event->parameter != nullptr) {
        if (*event->parameter == 'd') {
          result = -1;
        } else {
          blocks->clear();
        }
      } else if (event->type == SVET_DESTROY) {
        blocks_win_ = nullptr;
      } else {
        waiting = true;
      }
    } while (waiting);
  }
#endif // !GRAPHICS_DISABLED
  return result;
}

// Get the rotation required to deskew, and its inverse rotation.
void ColumnFinder::GetDeskewVectors(FCOORD *deskew, FCOORD *reskew) {
  *reskew = reskew_;
  *deskew = reskew_;
  // The deskew is reskew_ with its y component negated (the inverse rotation).
  deskew->set_y(-deskew->y());
}

#ifndef 
DISABLED_LEGACY_ENGINE\nvoid ColumnFinder::SetEquationDetect(EquationDetectBase *detect) {\n  equation_detect_ = detect;\n}\n#endif\n\n//////////////// PRIVATE CODE /////////////////////////\n\n#ifndef GRAPHICS_DISABLED\n\n// Displays the blob and block bounding boxes in a window called Blocks.\nvoid ColumnFinder::DisplayBlocks(BLOCK_LIST *blocks) {\n  if (blocks_win_ == nullptr) {\n    blocks_win_ = MakeWindow(700, 300, \"Blocks\");\n  } else {\n    blocks_win_->Clear();\n  }\n  DisplayBoxes(blocks_win_);\n  BLOCK_IT block_it(blocks);\n  int serial = 1;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    BLOCK *block = block_it.data();\n    block->pdblk.plot(blocks_win_, serial++,\n                      textord_debug_printable ? ScrollView::BLUE : ScrollView::GREEN);\n  }\n  blocks_win_->Update();\n}\n\n// Displays the column edges at each grid y coordinate defined by\n// best_columns_.\nvoid ColumnFinder::DisplayColumnBounds(PartSetVector *sets) {\n  ScrollView *col_win = MakeWindow(50, 300, \"Columns\");\n  DisplayBoxes(col_win);\n  col_win->Pen(textord_debug_printable ? ScrollView::BLUE : ScrollView::GREEN);\n  for (int i = 0; i < gridheight_; ++i) {\n    ColPartitionSet *columns = best_columns_[i];\n    if (columns != nullptr) {\n      columns->DisplayColumnEdges(i * gridsize_, (i + 1) * gridsize_, col_win);\n    }\n  }\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Sets up column_sets_ (the determined column layout at each horizontal\n// slice). 
// Returns false if the page is empty.
bool ColumnFinder::MakeColumns(bool single_column) {
  // The part_sets_ are a temporary structure used during column creation,
  // and is a vector of ColPartitionSets, representing ColPartitions found
  // at horizontal slices through the page.
  PartSetVector part_sets;
  if (!single_column) {
    if (!part_grid_.MakeColPartSets(&part_sets)) {
      return false; // Empty page.
    }
    ASSERT_HOST(part_grid_.gridheight() == gridheight_);
    // Try using only the good parts first.
    // If the good-only pass produces no candidate, the do/while runs a
    // second pass with good_only = false using all parts.
    bool good_only = true;
    do {
      for (int i = 0; i < gridheight_; ++i) {
        ColPartitionSet *line_set = part_sets.at(i);
        if (line_set != nullptr && line_set->LegalColumnCandidate()) {
          ColPartitionSet *column_candidate = line_set->Copy(good_only);
          if (column_candidate != nullptr) {
            column_candidate->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
          }
        }
      }
      good_only = !good_only;
    } while (column_sets_.empty() && !good_only);
    if (textord_debug_tabfind) {
      PrintColumnCandidates("Column candidates");
    }
    // Improve the column candidates against themselves.
    ImproveColumnCandidates(&column_sets_, &column_sets_);
    if (textord_debug_tabfind) {
      PrintColumnCandidates("Improved columns");
    }
    // Improve the column candidates using the part_sets_.
    ImproveColumnCandidates(&part_sets, &column_sets_);
  }
  ColPartitionSet *single_column_set = part_grid_.MakeSingleColumnSet(WidthCB());
  if (single_column_set != nullptr) {
    // Always add the single column set as a backup even if not in
    // single column mode.
    single_column_set->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
  }
  if (textord_debug_tabfind) {
    PrintColumnCandidates("Final Columns");
  }
  bool has_columns = !column_sets_.empty();
  if (has_columns) {
    // Divide the page into sections of uniform column layout.
    bool any_multi_column = AssignColumns(part_sets);
#ifndef GRAPHICS_DISABLED
    if (textord_tabfind_show_columns) {
      DisplayColumnBounds(&part_sets);
    }
#endif
    ComputeMeanColumnGap(any_multi_column);
  }
  // Cleanup: each line set gives up its ColPartitions before being deleted,
  // so only the set objects are freed here (the parts presumably stay owned
  // elsewhere — confirm in ColPartitionSet::RelinquishParts).
  for (auto line_set : part_sets) {
    if (line_set != nullptr) {
      line_set->RelinquishParts();
      delete line_set;
    }
  }
  return has_columns;
}

// Attempt to improve the column_candidates by expanding the columns
// and adding new partitions from the partition sets in src_sets.
// Src_sets may be equal to column_candidates, in which case it will
// use them as a source to improve themselves.
void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets) {
  // TODO: optimize.
  PartSetVector temp_cols = *column_sets;
  column_sets->clear();
  // When improving in place, read from the saved copy to avoid aliasing the
  // output vector being rebuilt below.
  if (src_sets == column_sets) {
    src_sets = &temp_cols;
  }
  int set_size = temp_cols.size();
  // Try using only the good parts first.
  bool good_only = true;
  do {
    for (int i = 0; i < set_size; ++i) {
      ColPartitionSet *column_candidate = temp_cols.at(i);
      ASSERT_HOST(column_candidate != nullptr);
      ColPartitionSet *improved = column_candidate->Copy(good_only);
      if (improved != nullptr) {
        improved->ImproveColumnCandidate(WidthCB(), src_sets);
        improved->AddToColumnSetsIfUnique(column_sets, WidthCB());
      }
    }
    good_only = !good_only;
  } while (column_sets->empty() && !good_only);
  // If no improvement survived, restore the original sets; otherwise the
  // saved originals are obsolete and are deleted.
  if (column_sets->empty()) {
    // TODO: optimize.
    *column_sets = temp_cols;
    temp_cols.clear();
  } else {
    for (auto data : temp_cols) {
      delete data;
    }
  }
}

// Prints debug information on the column candidates.
void ColumnFinder::PrintColumnCandidates(const char *title) {
  int set_size = column_sets_.size();
  tprintf("Found %d %s:\n", set_size, title);
  if (textord_debug_tabfind >= 3) {
    for (int i = 0; i < set_size; ++i) {
      
ColPartitionSet *column_set = column_sets_.at(i);
      column_set->Print();
    }
  }
}

// Finds the optimal set of columns that cover the entire image with as
// few changes in column partition as possible.
// NOTE: this could be thought of as an optimization problem, but a simple
// greedy algorithm is used instead. The algorithm repeatedly finds the modal
// compatible column in an unassigned region and uses that with the extra
// tweak of extending the modal region over small breaks in compatibility.
// Where modal regions overlap, the boundary is chosen so as to minimize
// the cost in terms of ColPartitions not fitting an approved column.
// Returns true if any part of the page is multi-column.
bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
  int set_count = part_sets.size();
  ASSERT_HOST(set_count == gridheight());
  // Allocate and init the best_columns_.
  best_columns_ = new ColPartitionSet *[set_count];
  for (int y = 0; y < set_count; ++y) {
    best_columns_[y] = nullptr;
  }
  int column_count = column_sets_.size();
  // column_set_costs[part_sets_ index][column_sets_ index] is
  // < INT32_MAX if the partition set is compatible with the column set,
  // in which case its value is the cost for that set used in deciding
  // which competing set to assign.
  // any_columns_possible[part_sets_ index] is true if any of
  // possible_column_sets[part_sets_ index][*] is < INT32_MAX.
  // assigned_costs[part_sets_ index] is set to the column_set_costs
  // of the assigned column_sets_ index or INT32_MAX if none is set.
  // On return the best_columns_ member is set.
  // All three arrays below are freed at the end of this function;
  // best_columns_ is a member and is kept.
  bool *any_columns_possible = new bool[set_count];
  int *assigned_costs = new int[set_count];
  int **column_set_costs = new int *[set_count];
  // Set possible column_sets to indicate whether each set is compatible
  // with each column.
  for (int part_i = 0; part_i < set_count; ++part_i) {
    ColPartitionSet *line_set = part_sets.at(part_i);
    bool debug = line_set != nullptr && WithinTestRegion(2, line_set->bounding_box().left(),
                                                         line_set->bounding_box().bottom());
    column_set_costs[part_i] = new int[column_count];
    any_columns_possible[part_i] = false;
    assigned_costs[part_i] = INT32_MAX;
    for (int col_i = 0; col_i < column_count; ++col_i) {
      if (line_set != nullptr &&
          column_sets_.at(col_i)->CompatibleColumns(debug, line_set, WidthCB())) {
        column_set_costs[part_i][col_i] = column_sets_.at(col_i)->UnmatchedWidth(line_set);
        any_columns_possible[part_i] = true;
      } else {
        column_set_costs[part_i][col_i] = INT32_MAX;
        if (debug) {
          tprintf("Set id %d did not match at y=%d, lineset =%p\n",
                  col_i, part_i, static_cast<void *>(line_set));
        }
      }
    }
  }
  bool any_multi_column = false;
  // Assign a column set to each vertical grid position.
  // While there is an unassigned range, find its mode.
  int start, end;
  while (BiggestUnassignedRange(set_count, any_columns_possible, &start, &end)) {
    if (textord_debug_tabfind >= 2) {
      tprintf("Biggest unassigned range = %d- %d\n", start, end);
    }
    // Find the modal column_set_id in the range.
    int column_set_id = RangeModalColumnSet(column_set_costs, assigned_costs, start, end);
    if (textord_debug_tabfind >= 2) {
      tprintf("Range modal column id = %d\n", column_set_id);
      column_sets_.at(column_set_id)->Print();
    }
    // Now find the longest run of the column_set_id in the range.
    ShrinkRangeToLongestRun(column_set_costs, assigned_costs, any_columns_possible, column_set_id,
                            &start, &end);
    if (textord_debug_tabfind >= 2) {
      tprintf("Shrunk range = %d- %d\n", start, end);
    }
    // Extend the start and end past the longest run, while there are
    // only small gaps in compatibility that can be overcome by larger
    // regions of compatibility beyond.
    ExtendRangePastSmallGaps(column_set_costs, assigned_costs, any_columns_possible, column_set_id,
                             -1, -1, &start);
    // end is exclusive; convert to inclusive for the forward extension,
    // then back again.
    --end;
    ExtendRangePastSmallGaps(column_set_costs, assigned_costs, any_columns_possible, column_set_id,
                             1, set_count, &end);
    ++end;
    if (textord_debug_tabfind) {
      tprintf("Column id %d applies to range = %d - %d\n", column_set_id, start, end);
    }
    // Assign the column to the range, which now may overlap with other ranges.
    AssignColumnToRange(column_set_id, start, end, column_set_costs, assigned_costs);
    if (column_sets_.at(column_set_id)->GoodColumnCount() > 1) {
      any_multi_column = true;
    }
  }
  // If anything remains unassigned, the whole lot is unassigned, so
  // arbitrarily assign id 0.
  if (best_columns_[0] == nullptr) {
    AssignColumnToRange(0, 0, gridheight_, column_set_costs, assigned_costs);
  }
  // Free memory.
  for (int i = 0; i < set_count; ++i) {
    delete[] column_set_costs[i];
  }
  delete[] assigned_costs;
  delete[] any_columns_possible;
  delete[] column_set_costs;
  return any_multi_column;
}

// Finds the biggest range in part_sets_ that has no assigned column, but
// column assignment is possible.
bool ColumnFinder::BiggestUnassignedRange(int set_count, const bool *any_columns_possible,
                                          int *best_start, int *best_end) {
  int best_range_size = 0;
  *best_start = set_count;
  *best_end = set_count;
  int end = set_count;
  for (int start = 0; start < gridheight_; start = end) {
    // Find the first unassigned index in start.
    while (start < set_count) {
      if (best_columns_[start] == nullptr && any_columns_possible[start]) {
        break;
      }
      ++start;
    }
    // Find the first past the end and count the good ones in between.
    int range_size = 1; // Number of non-null, but unassigned line sets.
    end = start + 1;
    while (end < set_count) {
      if (best_columns_[end] != nullptr) {
        break;
      }
      if (any_columns_possible[end]) {
        ++range_size;
      }
      ++end;
    }
    if (start < set_count && range_size > best_range_size) {
      best_range_size = range_size;
      *best_start = start;
      *best_end = end;
    }
  }
  // Succeeds iff some unassigned-but-assignable range was found.
  return *best_start < *best_end;
}

// Finds the modal compatible column_set_ index within the given range.
int ColumnFinder::RangeModalColumnSet(int **column_set_costs, const int *assigned_costs, int start,
                                      int end) {
  int column_count = column_sets_.size();
  STATS column_stats(0, column_count - 1);
  // Vote for every column set that beats the currently-assigned cost at
  // each position; the mode wins.
  for (int part_i = start; part_i < end; ++part_i) {
    for (int col_j = 0; col_j < column_count; ++col_j) {
      if (column_set_costs[part_i][col_j] < assigned_costs[part_i]) {
        column_stats.add(col_j, 1);
      }
    }
  }
  ASSERT_HOST(column_stats.get_total() > 0);
  return column_stats.mode();
}

// Given that there are many column_set_id compatible columns in the range,
// shrinks the range to the longest contiguous run of compatibility, allowing
// gaps where no columns are possible, but not where competing columns are
// possible.
void ColumnFinder::ShrinkRangeToLongestRun(int **column_set_costs, const int *assigned_costs,
                                           const bool *any_columns_possible, int column_set_id,
                                           int *best_start, int *best_end) {
  // orig_start and orig_end are the maximum range we will look at.
  int orig_start = *best_start;
  int orig_end = *best_end;
  int best_range_size = 0;
  *best_start = orig_end;
  *best_end = orig_end;
  int end = orig_end;
  for (int start = orig_start; start < orig_end; start = end) {
    // Find the first possible
    while (start 
< orig_end) {
      if (column_set_costs[start][column_set_id] < assigned_costs[start] ||
          !any_columns_possible[start]) {
        break;
      }
      ++start;
    }
    // Find the first past the end.
    // NOTE(review): this compares against assigned_costs[start] rather than
    // assigned_costs[end], unlike the other loops in this family — looks
    // suspicious; confirm against upstream history before changing.
    end = start + 1;
    while (end < orig_end) {
      if (column_set_costs[end][column_set_id] >= assigned_costs[start] &&
          any_columns_possible[end]) {
        break;
      }
      ++end;
    }
    if (start < orig_end && end - start > best_range_size) {
      best_range_size = end - start;
      *best_start = start;
      *best_end = end;
    }
  }
}

// Moves start in the direction of step, up to, but not including end while
// the only incompatible regions are no more than kMaxIncompatibleColumnCount
// in size, and the compatible regions beyond are bigger.
void ColumnFinder::ExtendRangePastSmallGaps(int **column_set_costs, const int *assigned_costs,
                                            const bool *any_columns_possible, int column_set_id,
                                            int step, int end, int *start) {
  if (textord_debug_tabfind > 2) {
    tprintf("Starting expansion at %d, step=%d, limit=%d\n", *start, step, end);
  }
  if (*start == end) {
    return; // Cannot be expanded.
  }

  int barrier_size = 0;
  int good_size = 0;
  do {
    // Find the size of the incompatible barrier.
    barrier_size = 0;
    int i;
    for (i = *start + step; i != end; i += step) {
      if (column_set_costs[i][column_set_id] < assigned_costs[i]) {
        break; // We are back on.
      }
      // Locations where none are possible don't count.
      if (any_columns_possible[i]) {
        ++barrier_size;
      }
    }
    if (textord_debug_tabfind > 2) {
      tprintf("At %d, Barrier size=%d\n", i, barrier_size);
    }
    if (barrier_size > kMaxIncompatibleColumnCount) {
      return; // Barrier too big.
    }
    if (i == end) {
      // We can't go any further, but the barrier was small, so go to the end.
      // i overshot by one step; step back to the last position examined.
      *start = i - step;
      return;
    }
    // Now find the size of the good region on the other side.
    good_size = 1;
    for (i += step; i != end; i += step) {
      if (column_set_costs[i][column_set_id] < assigned_costs[i]) {
        ++good_size;
      } else if (any_columns_possible[i]) {
        break;
      }
    }
    if (textord_debug_tabfind > 2) {
      tprintf("At %d, good size = %d\n", i, good_size);
    }
    // If we had enough good ones we can extend the start and keep looking.
    if (good_size >= barrier_size) {
      *start = i - step;
    }
  } while (good_size >= barrier_size);
}

// Assigns the given column_set_id to the given range.
void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
                                       int **column_set_costs, int *assigned_costs) {
  ColPartitionSet *column_set = column_sets_.at(column_set_id);
  // Record both the winning cost and the winning set at every position.
  for (int i = start; i < end; ++i) {
    assigned_costs[i] = column_set_costs[i][column_set_id];
    best_columns_[i] = column_set;
  }
}

// Computes the mean_column_gap_.
void ColumnFinder::ComputeMeanColumnGap(bool any_multi_column) {
  int total_gap = 0;
  int total_width = 0;
  int gap_samples = 0;
  int width_samples = 0;
  for (int i = 0; i < gridheight_; ++i) {
    ASSERT_HOST(best_columns_[i] != nullptr);
    best_columns_[i]->AccumulateColumnWidthsAndGaps(&total_width, &width_samples, &total_gap,
                                                    &gap_samples);
  }
  // Multi-column pages use the mean gap; single-column pages fall back to
  // the mean column width; 0 when there are no samples at all.
  mean_column_gap_ = any_multi_column && gap_samples > 0
                         ? total_gap / gap_samples
                         : width_samples > 0 ? total_width / width_samples : 0;
}

//////// Functions that manipulate ColPartitions in the part_grid_ /////
//////// to split, merge, find margins, and find types.  //////////////

// Helper to delete all the deletable blobs on the list. 
// Owned blobs are
// extracted from the list, but not deleted, leaving them owned by the owner().
static void ReleaseAllBlobsAndDeleteUnused(BLOBNBOX_LIST *blobs) {
  for (BLOBNBOX_IT blob_it(blobs); !blob_it.empty(); blob_it.forward()) {
    BLOBNBOX *blob = blob_it.extract();
    if (blob->owner() == nullptr) {
      delete blob;
    }
  }
}

// Hoovers up all un-owned blobs and deletes them.
// The rest get released from the block so the ColPartitions can pass
// ownership to the output blocks.
void ColumnFinder::ReleaseBlobsAndCleanupUnused(TO_BLOCK *block) {
  ReleaseAllBlobsAndDeleteUnused(&block->blobs);
  ReleaseAllBlobsAndDeleteUnused(&block->small_blobs);
  ReleaseAllBlobsAndDeleteUnused(&block->noise_blobs);
  ReleaseAllBlobsAndDeleteUnused(&block->large_blobs);
  ReleaseAllBlobsAndDeleteUnused(&image_bblobs_);
}

// Splits partitions that cross columns where they have nothing in the gap.
void ColumnFinder::GridSplitPartitions() {
  // Iterate the ColPartitions in the grid.
  ColPartitionGridSearch gsearch(&part_grid_);
  gsearch.StartFullSearch();
  // Partition whose split had no effect last time; skipped to avoid looping.
  ColPartition *dont_repeat = nullptr;
  ColPartition *part;
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (part->blob_type() < BRT_UNKNOWN || part == dont_repeat) {
      continue; // Only applies to text partitions.
    }
    ColPartitionSet *column_set = best_columns_[gsearch.GridY()];
    int first_col = -1;
    int last_col = -1;
    // Find which columns the partition spans.
    part->ColumnRange(resolution_, column_set, &first_col, &last_col);
    if (first_col > 0) {
      --first_col;
    }
    // Convert output column indices to physical column indices.
    first_col /= 2;
    last_col /= 2;
    // We will only consider cases where a partition spans two columns,
    // since a heading that spans more columns than that is most likely
    // genuine.
    if (last_col != first_col + 1) {
      continue;
    }
    // Set up a rectangle search x-bounded by the column gap and y by the part.
    int y = part->MidY();
    TBOX margin_box = part->bounding_box();
    bool debug = AlignedBlob::WithinTestRegion(2, margin_box.left(), margin_box.bottom());
    if (debug) {
      tprintf("Considering partition for GridSplit:");
      part->Print();
    }
    ColPartition *column = column_set->GetColumnByIndex(first_col);
    if (column == nullptr) {
      continue;
    }
    margin_box.set_left(column->RightAtY(y) + 2);
    column = column_set->GetColumnByIndex(last_col);
    if (column == nullptr) {
      continue;
    }
    margin_box.set_right(column->LeftAtY(y) - 2);
    // TODO(rays) Decide whether to keep rectangular filling or not in the
    // main grid and therefore whether we need a fancier search here.
    // Now run the rect search on the main blob grid.
    GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(this);
    if (debug) {
      tprintf("Searching box (%d,%d)->(%d,%d)\n", margin_box.left(), margin_box.bottom(),
              margin_box.right(), margin_box.top());
      part->Print();
    }
    rectsearch.StartRectSearch(margin_box);
    // Scan for any blob overlapping the gap; bbox stays nullptr if the gap
    // is empty.
    BLOBNBOX *bbox;
    while ((bbox = rectsearch.NextRectSearch()) != nullptr) {
      if (bbox->bounding_box().overlap(margin_box)) {
        break;
      }
    }
    if (bbox == nullptr) {
      // There seems to be nothing in the hole, so split the partition.
      // Remove part from the grid first, since its box is about to change.
      gsearch.RemoveBBox();
      int x_middle = (margin_box.left() + margin_box.right()) / 2;
      if (debug) {
        tprintf("Splitting part at %d:", x_middle);
        part->Print();
      }
      ColPartition *split_part = part->SplitAt(x_middle);
      if (split_part != nullptr) {
        if (debug) {
          tprintf("Split result:");
          part->Print();
          split_part->Print();
        }
        part_grid_.InsertBBox(true, true, split_part);
      } else {
        // Split had no effect
        if (debug) {
          tprintf("Split had no effect\n");
        }
        // Remember this partition so we don't retry the same no-op split.
        dont_repeat = part;
      }
      part_grid_.InsertBBox(true, true, part);
      gsearch.RepositionIterator();
    } else if (debug) {
      tprintf("Part cannot be split: blob (%d,%d)->(%d,%d) in column gap\n",
              bbox->bounding_box().left(), bbox->bounding_box().bottom(),
              bbox->bounding_box().right(), bbox->bounding_box().top());
    }
  }
}

// Merges partitions where there is vertical overlap, within a single column,
// and the horizontal gap is small enough.
void ColumnFinder::GridMergePartitions() {
  // Iterate the ColPartitions in the grid.
  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(&part_grid_);
  gsearch.StartFullSearch();
  ColPartition *part;
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (part->IsUnMergeableType()) {
      continue;
    }
    // Set up a rectangle search x-bounded by the column and y by the part.
    ColPartitionSet *columns = best_columns_[gsearch.GridY()];
    TBOX box = part->bounding_box();
    bool debug = AlignedBlob::WithinTestRegion(1, box.left(), box.bottom());
    if (debug) {
      tprintf("Considering part for merge at:");
      part->Print();
    }
    int y = part->MidY();
    ColPartition *left_column = columns->ColumnContaining(box.left(), y);
    ColPartition *right_column = columns->ColumnContaining(box.right(), y);
    if (left_column == nullptr || right_column != left_column) {
      if (debug) {
        tprintf("In different columns\n");
      }
      continue;
    }
    box.set_left(left_column->LeftAtY(y));
    box.set_right(right_column->RightAtY(y));
    // Now run the rect search.
    bool modified_box = false;
    GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rsearch(&part_grid_);
    rsearch.SetUniqueMode(true);
    rsearch.StartRectSearch(box);
    ColPartition *neighbour;

    while 
((neighbour = rsearch.NextRectSearch()) != nullptr) {\n      if (neighbour == part || neighbour->IsUnMergeableType()) {\n        continue;\n      }\n      const TBOX &neighbour_box = neighbour->bounding_box();\n      if (debug) {\n        tprintf(\"Considering merge with neighbour at:\");\n        neighbour->Print();\n      }\n      if (neighbour_box.right() < box.left() || neighbour_box.left() > box.right()) {\n        continue; // Not within the same column.\n      }\n      if (part->VSignificantCoreOverlap(*neighbour) && part->TypesMatch(*neighbour)) {\n        // There is vertical overlap and the gross types match, but only\n        // merge if the horizontal gap is small enough, as one of the\n        // partitions may be a figure caption within a column.\n        // If there is only one column, then the mean_column_gap_ is large\n        // enough to allow almost any merge, by being the mean column width.\n        const TBOX &part_box = part->bounding_box();\n        // Don't merge if there is something else in the way. 
Use the margin\n        // to decide, and check both to allow a bit of overlap.\n        if (neighbour_box.left() > part->right_margin() &&\n            part_box.right() < neighbour->left_margin()) {\n          continue; // Neighbour is too far to the right.\n        }\n        if (neighbour_box.right() < part->left_margin() &&\n            part_box.left() > neighbour->right_margin()) {\n          continue; // Neighbour is too far to the left.\n        }\n        int h_gap = std::max(part_box.left(), neighbour_box.left()) -\n                    std::min(part_box.right(), neighbour_box.right());\n        if (h_gap < mean_column_gap_ * kHorizontalGapMergeFraction ||\n            part_box.width() < mean_column_gap_ || neighbour_box.width() < mean_column_gap_) {\n          if (debug) {\n            tprintf(\"Running grid-based merge between:\\n\");\n            part->Print();\n            neighbour->Print();\n          }\n          rsearch.RemoveBBox();\n          if (!modified_box) {\n            // We are going to modify part, so remove it and re-insert it after.\n            gsearch.RemoveBBox();\n            rsearch.RepositionIterator();\n            modified_box = true;\n          }\n          part->Absorb(neighbour, WidthCB());\n        } else if (debug) {\n          tprintf(\"Neighbour failed hgap test\\n\");\n        }\n      } else if (debug) {\n        tprintf(\"Neighbour failed overlap or typesmatch test\\n\");\n      }\n    }\n    if (modified_box) {\n      // We modified the box of part, so re-insert it into the grid.\n      // This does no harm in the current cell, as it already exists there,\n      // but it needs to exist in all the cells covered by its bounding box,\n      // or it will never be found by a full search.\n      // Because the box has changed, it has to be removed first, otherwise\n      // add_sorted may fail to keep a single copy of the pointer.\n      part_grid_.InsertBBox(true, true, part);\n      gsearch.RepositionIterator();\n    
}\n  }\n}\n\n// Inserts remaining noise blobs into the most applicable partition if any.\n// If there is no applicable partition, then the blobs are deleted.\nvoid ColumnFinder::InsertRemainingNoise(TO_BLOCK *block) {\n  BLOBNBOX_IT blob_it(&block->noise_blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    if (blob->owner() != nullptr) {\n      continue;\n    }\n    TBOX search_box(blob->bounding_box());\n    bool debug = WithinTestRegion(2, search_box.left(), search_box.bottom());\n    search_box.pad(gridsize(), gridsize());\n    // Setup a rectangle search to find the best partition to merge with.\n    ColPartitionGridSearch rsearch(&part_grid_);\n    rsearch.SetUniqueMode(true);\n    rsearch.StartRectSearch(search_box);\n    ColPartition *part;\n    ColPartition *best_part = nullptr;\n    int best_distance = 0;\n    while ((part = rsearch.NextRectSearch()) != nullptr) {\n      if (part->IsUnMergeableType()) {\n        continue;\n      }\n      int distance =\n          projection_.DistanceOfBoxFromPartition(blob->bounding_box(), *part, denorm_, debug);\n      if (best_part == nullptr || distance < best_distance) {\n        best_part = part;\n        best_distance = distance;\n      }\n    }\n    if (best_part != nullptr &&\n        best_distance < kMaxDistToPartSizeRatio * best_part->median_height()) {\n      // Close enough to merge.\n      if (debug) {\n        tprintf(\"Adding noise blob with distance %d, thr=%g:box:\", best_distance,\n                kMaxDistToPartSizeRatio * best_part->median_height());\n        blob->bounding_box().print();\n        tprintf(\"To partition:\");\n        best_part->Print();\n      }\n      part_grid_.RemoveBBox(best_part);\n      best_part->AddBox(blob);\n      part_grid_.InsertBBox(true, true, best_part);\n      blob->set_owner(best_part);\n      blob->set_flow(best_part->flow());\n      blob->set_region_type(best_part->blob_type());\n    } else {\n 
     // Mark the blob for deletion.\n      blob->set_region_type(BRT_NOISE);\n    }\n  }\n  // Delete the marked blobs, clearing neighbour references.\n  block->DeleteUnownedNoise();\n}\n\n// Helper makes a box from a horizontal line.\nstatic TBOX BoxFromHLine(const TabVector *hline) {\n  int top = std::max(hline->startpt().y(), hline->endpt().y());\n  int bottom = std::min(hline->startpt().y(), hline->endpt().y());\n  top += hline->mean_width();\n  if (top == bottom) {\n    if (bottom > 0) {\n      --bottom;\n    } else {\n      ++top;\n    }\n  }\n  return TBOX(hline->startpt().x(), bottom, hline->endpt().x(), top);\n}\n\n// Remove partitions that come from horizontal lines that look like\n// underlines, but are not part of a table.\nvoid ColumnFinder::GridRemoveUnderlinePartitions() {\n  TabVector_IT hline_it(&horizontal_lines_);\n  for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {\n    TabVector *hline = hline_it.data();\n    if (hline->intersects_other_lines()) {\n      continue;\n    }\n    TBOX line_box = BoxFromHLine(hline);\n    TBOX search_box = line_box;\n    search_box.pad(0, line_box.height());\n    ColPartitionGridSearch part_search(&part_grid_);\n    part_search.SetUniqueMode(true);\n    part_search.StartRectSearch(search_box);\n    ColPartition *covered;\n    bool touched_table = false;\n    bool touched_text = false;\n    ColPartition *line_part = nullptr;\n    while ((covered = part_search.NextRectSearch()) != nullptr) {\n      if (covered->type() == PT_TABLE) {\n        touched_table = true;\n        break;\n      } else if (covered->IsTextType()) {\n        // TODO(rays) Add a list of underline sections to ColPartition.\n        int text_bottom = covered->median_bottom();\n        if (line_box.bottom() <= text_bottom && text_bottom <= search_box.top()) {\n          touched_text = true;\n        }\n      } else if (covered->blob_type() == BRT_HLINE && line_box.contains(covered->bounding_box()) &&\n                 // 
not if same instance (identical to hline)\n                 !TBOX(covered->bounding_box()).contains(line_box)) {\n        line_part = covered;\n      }\n    }\n    if (line_part != nullptr && !touched_table && touched_text) {\n      part_grid_.RemoveBBox(line_part);\n      delete line_part;\n    }\n  }\n}\n\n// Add horizontal line separators as partitions.\nvoid ColumnFinder::GridInsertHLinePartitions() {\n  TabVector_IT hline_it(&horizontal_lines_);\n  for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {\n    TabVector *hline = hline_it.data();\n    TBOX line_box = BoxFromHLine(hline);\n    ColPartition *part =\n        ColPartition::MakeLinePartition(BRT_HLINE, vertical_skew_, line_box.left(),\n                                        line_box.bottom(), line_box.right(), line_box.top());\n    part->set_type(PT_HORZ_LINE);\n    bool any_image = false;\n    ColPartitionGridSearch part_search(&part_grid_);\n    part_search.SetUniqueMode(true);\n    part_search.StartRectSearch(line_box);\n    ColPartition *covered;\n    while ((covered = part_search.NextRectSearch()) != nullptr) {\n      if (covered->IsImageType()) {\n        any_image = true;\n        break;\n      }\n    }\n    if (!any_image) {\n      part_grid_.InsertBBox(true, true, part);\n    } else {\n      delete part;\n    }\n  }\n}\n\n// Add horizontal line separators as partitions.\nvoid ColumnFinder::GridInsertVLinePartitions() {\n  TabVector_IT vline_it(dead_vectors());\n  for (vline_it.mark_cycle_pt(); !vline_it.cycled_list(); vline_it.forward()) {\n    TabVector *vline = vline_it.data();\n    if (!vline->IsSeparator()) {\n      continue;\n    }\n    int left = std::min(vline->startpt().x(), vline->endpt().x());\n    int right = std::max(vline->startpt().x(), vline->endpt().x());\n    right += vline->mean_width();\n    if (left == right) {\n      if (left > 0) {\n        --left;\n      } else {\n        ++right;\n      }\n    }\n    ColPartition *part = 
ColPartition::MakeLinePartition(\n        BRT_VLINE, vertical_skew_, left, vline->startpt().y(), right, vline->endpt().y());\n    part->set_type(PT_VERT_LINE);\n    bool any_image = false;\n    ColPartitionGridSearch part_search(&part_grid_);\n    part_search.SetUniqueMode(true);\n    part_search.StartRectSearch(part->bounding_box());\n    ColPartition *covered;\n    while ((covered = part_search.NextRectSearch()) != nullptr) {\n      if (covered->IsImageType()) {\n        any_image = true;\n        break;\n      }\n    }\n    if (!any_image) {\n      part_grid_.InsertBBox(true, true, part);\n    } else {\n      delete part;\n    }\n  }\n}\n\n// For every ColPartition in the grid, sets its type based on position\n// in the columns.\nvoid ColumnFinder::SetPartitionTypes() {\n  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(&part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    part->SetPartitionType(resolution_, best_columns_[gsearch.GridY()]);\n  }\n}\n\n// Only images remain with multiple types in a run of partners.\n// Sets the type of all in the group to the maximum of the group.\nvoid ColumnFinder::SmoothPartnerRuns() {\n  // Iterate the ColPartitions in the grid.\n  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(&part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    ColPartition *partner = part->SingletonPartner(true);\n    if (partner != nullptr) {\n      if (partner->SingletonPartner(false) != part) {\n        tprintf(\"Ooops! 
Partition:(%d partners)\", part->upper_partners()->length());\n        part->Print();\n        tprintf(\"has singleton partner:(%d partners\", partner->lower_partners()->length());\n        partner->Print();\n        tprintf(\"but its singleton partner is:\");\n        if (partner->SingletonPartner(false) == nullptr) {\n          tprintf(\"NULL\\n\");\n        } else {\n          partner->SingletonPartner(false)->Print();\n        }\n      }\n      ASSERT_HOST(partner->SingletonPartner(false) == part);\n    } else if (part->SingletonPartner(false) != nullptr) {\n      ColPartitionSet *column_set = best_columns_[gsearch.GridY()];\n      int column_count = column_set->ColumnCount();\n      part->SmoothPartnerRun(column_count * 2 + 1);\n    }\n  }\n}\n\n// Helper functions for TransformToBlocks.\n// Add the part to the temp list in the correct order.\nvoid ColumnFinder::AddToTempPartList(ColPartition *part, ColPartition_CLIST *temp_list) {\n  int mid_y = part->MidY();\n  ColPartition_C_IT it(temp_list);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *test_part = it.data();\n    if (part->type() == PT_NOISE || test_part->type() == PT_NOISE) {\n      continue; // Noise stays in sequence.\n    }\n    if (test_part == part->SingletonPartner(false)) {\n      break; // Insert before its lower partner.\n    }\n    int neighbour_bottom = test_part->median_bottom();\n    int neighbour_top = test_part->median_top();\n    int neighbour_y = (neighbour_bottom + neighbour_top) / 2;\n    if (neighbour_y < mid_y) {\n      break; // part is above test_part so insert it.\n    }\n    if (!part->HOverlaps(*test_part) && !part->WithinSameMargins(*test_part)) {\n      continue; // Incompatibles stay in order\n    }\n  }\n  if (it.cycled_list()) {\n    it.add_to_end(part);\n  } else {\n    it.add_before_stay_put(part);\n  }\n}\n\n// Add everything from the temp list to the work_set assuming correct order.\nvoid 
ColumnFinder::EmptyTempPartList(ColPartition_CLIST *temp_list, WorkingPartSet_LIST *work_set) {\n  ColPartition_C_IT it(temp_list);\n  while (!it.empty()) {\n    it.extract()->AddToWorkingSet(bleft_, tright_, resolution_, &good_parts_, work_set);\n    it.forward();\n  }\n}\n\n// Transform the grid of partitions to the output blocks.\nvoid ColumnFinder::TransformToBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {\n  WorkingPartSet_LIST work_set;\n  ColPartitionSet *column_set = nullptr;\n  ColPartition_IT noise_it(&noise_parts_);\n  // The temp_part_list holds a list of parts at the same grid y coord\n  // so they can be added in the correct order. This prevents thin objects\n  // like horizontal lines going before the text lines above them.\n  ColPartition_CLIST temp_part_list;\n  // Iterate the ColPartitions in the grid. It starts at the top\n  ColPartitionGridSearch gsearch(&part_grid_);\n  gsearch.StartFullSearch();\n  int prev_grid_y = -1;\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    int grid_y = gsearch.GridY();\n    if (grid_y != prev_grid_y) {\n      EmptyTempPartList(&temp_part_list, &work_set);\n      prev_grid_y = grid_y;\n    }\n    if (best_columns_[grid_y] != column_set) {\n      column_set = best_columns_[grid_y];\n      // Every line should have a non-null best column.\n      ASSERT_HOST(column_set != nullptr);\n      column_set->ChangeWorkColumns(bleft_, tright_, resolution_, &good_parts_, &work_set);\n      if (textord_debug_tabfind) {\n        tprintf(\"Changed column groups at grid index %d, y=%d\\n\", gsearch.GridY(),\n                gsearch.GridY() * gridsize());\n      }\n    }\n    if (part->type() == PT_NOISE) {\n      noise_it.add_to_end(part);\n    } else {\n      AddToTempPartList(part, &temp_part_list);\n    }\n  }\n  EmptyTempPartList(&temp_part_list, &work_set);\n  // Now finish all working sets and transfer ColPartitionSets to block_sets.\n  WorkingPartSet_IT work_it(&work_set);\n  while 
(!work_it.empty()) {\n    WorkingPartSet *working_set = work_it.extract();\n    working_set->ExtractCompletedBlocks(bleft_, tright_, resolution_, &good_parts_, blocks,\n                                        to_blocks);\n    delete working_set;\n    work_it.forward();\n  }\n}\n\n// Helper reflects a list of blobs in the y-axis.\n// Only reflects the BLOBNBOX bounding box. Not the blobs or outlines below.\nstatic void ReflectBlobList(BLOBNBOX_LIST *bblobs) {\n  BLOBNBOX_IT it(bblobs);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->reflect_box_in_y_axis();\n  }\n}\n\n// Reflect the blob boxes (but not the outlines) in the y-axis so that\n// the blocks get created in the correct RTL order. Reflects the blobs\n// in the input_block and the bblobs list.\n// The reflection is undone in RotateAndReskewBlocks by\n// reflecting the blocks themselves, and then recomputing the blob bounding\n// boxes.\nvoid ColumnFinder::ReflectForRtl(TO_BLOCK *input_block, BLOBNBOX_LIST *bblobs) {\n  ReflectBlobList(bblobs);\n  ReflectBlobList(&input_block->blobs);\n  ReflectBlobList(&input_block->small_blobs);\n  ReflectBlobList(&input_block->noise_blobs);\n  ReflectBlobList(&input_block->large_blobs);\n  // Update the denorm with the reflection.\n  auto *new_denorm = new DENORM;\n  new_denorm->SetupNormalization(nullptr, nullptr, denorm_, 0.0f, 0.0f, -1.0f, 1.0f, 0.0f, 0.0f);\n  denorm_ = new_denorm;\n}\n\n// Helper fixes up blobs and cblobs to match the desired rotation,\n// exploding multi-outline blobs back to single blobs and accumulating\n// the bounding box widths and heights.\nstatic void RotateAndExplodeBlobList(const FCOORD &blob_rotation, BLOBNBOX_LIST *bblobs,\n                                     STATS *widths, STATS *heights) {\n  BLOBNBOX_IT it(bblobs);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    C_BLOB *cblob = blob->cblob();\n    C_OUTLINE_LIST *outlines = cblob->out_list();\n    
C_OUTLINE_IT ol_it(outlines);\n    if (!outlines->singleton()) {\n      // This blob has multiple outlines from CJK repair.\n      // Explode the blob back into individual outlines.\n      for (; !ol_it.empty(); ol_it.forward()) {\n        C_OUTLINE *outline = ol_it.extract();\n        BLOBNBOX *new_blob = BLOBNBOX::RealBlob(outline);\n        // This blob will be revisited later since we add_after_stay_put here.\n        // This means it will get rotated and have its width/height added to\n        // the stats below.\n        it.add_after_stay_put(new_blob);\n      }\n      it.extract();\n      delete blob;\n    } else {\n      if (blob_rotation.x() != 1.0f || blob_rotation.y() != 0.0f) {\n        cblob->rotate(blob_rotation);\n      }\n      blob->compute_bounding_box();\n      widths->add(blob->bounding_box().width(), 1);\n      heights->add(blob->bounding_box().height(), 1);\n    }\n  }\n}\n\n// Undo the deskew that was done in FindTabVectors, as recognition is done\n// without correcting blobs or blob outlines for skew.\n// Reskew the completed blocks to put them back to the original rotated coords\n// that were created by CorrectOrientation.\n// If the input_is_rtl, then reflect the blocks in the y-axis to undo the\n// reflection that was done before FindTabVectors.\n// Blocks that were identified as vertical text (relative to the rotated\n// coordinates) are further rotated so the text lines are horizontal.\n// blob polygonal outlines are rotated to match the position of the blocks\n// that they are in, and their bounding boxes are recalculated to be accurate.\n// Record appropriate inverse transformations and required\n// classifier transformation in the blocks.\nvoid ColumnFinder::RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST *blocks) {\n  if (input_is_rtl) {\n    // The skew is backwards because of the reflection.\n    FCOORD tmp = deskew_;\n    deskew_ = reskew_;\n    reskew_ = tmp;\n  }\n  TO_BLOCK_IT it(blocks);\n  int block_index = 1;\n  for 
(it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TO_BLOCK *to_block = it.data();\n    BLOCK *block = to_block->block;\n    // Blocks are created on the deskewed blob outlines in TransformToBlocks()\n    // so we need to reskew them back to page coordinates.\n    if (input_is_rtl) {\n      block->reflect_polygon_in_y_axis();\n    }\n    block->rotate(reskew_);\n    // Copy the right_to_left flag to the created block.\n    block->set_right_to_left(input_is_rtl);\n    // Save the skew angle in the block for baseline computations.\n    block->set_skew(reskew_);\n    block->pdblk.set_index(block_index++);\n    FCOORD blob_rotation = ComputeBlockAndClassifyRotation(block);\n    // Rotate all the blobs if needed and recompute the bounding boxes.\n    // Compute the block median blob width and height as we go.\n    STATS widths(0, block->pdblk.bounding_box().width() - 1);\n    STATS heights(0, block->pdblk.bounding_box().height() - 1);\n    RotateAndExplodeBlobList(blob_rotation, &to_block->blobs, &widths, &heights);\n    TO_ROW_IT row_it(to_block->get_rows());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      TO_ROW *row = row_it.data();\n      RotateAndExplodeBlobList(blob_rotation, row->blob_list(), &widths, &heights);\n    }\n    block->set_median_size(static_cast<int>(widths.median() + 0.5),\n                           static_cast<int>(heights.median() + 0.5));\n    if (textord_debug_tabfind >= 2) {\n      tprintf(\"Block median size = (%d, %d)\\n\", block->median_size().x(), block->median_size().y());\n    }\n  }\n}\n\n// Computes the rotations for the block (to make textlines horizontal) and\n// for the blobs (for classification) and sets the appropriate members\n// of the given block.\n// Returns the rotation that needs to be applied to the blobs to make\n// them sit in the rotated block.\nFCOORD ColumnFinder::ComputeBlockAndClassifyRotation(BLOCK *block) {\n  // The text_rotation_ tells us the gross page text rotation 
that needs\n  // to be applied for classification\n  // TODO(rays) find block-level classify rotation by orientation detection.\n  // In the mean time, assume that \"up\" for text printed in the minority\n  // direction (PT_VERTICAL_TEXT) is perpendicular to the line of reading.\n  // Accomplish this by zero-ing out the text rotation.  This covers the\n  // common cases of image credits in documents written in Latin scripts\n  // and page headings for predominantly vertically written CJK books.\n  FCOORD classify_rotation(text_rotation_);\n  FCOORD block_rotation(1.0f, 0.0f);\n  if (block->pdblk.poly_block()->isA() == PT_VERTICAL_TEXT) {\n    // Vertical text needs to be 90 degrees rotated relative to the rest.\n    // If the rest has a 90 degree rotation already, use the inverse, making\n    // the vertical text the original way up. Otherwise use 90 degrees\n    // clockwise.\n    if (rerotate_.x() == 0.0f) {\n      block_rotation = rerotate_;\n    } else {\n      block_rotation = FCOORD(0.0f, -1.0f);\n    }\n    block->rotate(block_rotation);\n    classify_rotation = FCOORD(1.0f, 0.0f);\n  }\n  block_rotation.rotate(rotation_);\n  // block_rotation is now what we have done to the blocks. Now do the same\n  // thing to the blobs, but save the inverse rotation in the block, as that\n  // is what we need to DENORM back to the image coordinates.\n  FCOORD blob_rotation(block_rotation);\n  block_rotation.set_y(-block_rotation.y());\n  block->set_re_rotation(block_rotation);\n  block->set_classify_rotation(classify_rotation);\n  if (textord_debug_tabfind) {\n    tprintf(\"Blk %d, type %d rerotation(%.2f, %.2f), char(%.2f,%.2f), box:\", block->pdblk.index(),\n            block->pdblk.poly_block()->isA(), block->re_rotation().x(), block->re_rotation().y(),\n            classify_rotation.x(), classify_rotation.y());\n    block->pdblk.bounding_box().print();\n  }\n  return blob_rotation;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/colfind.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colfind.h\n// Description: Class to find columns in the grid of BLOBNBOXes.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_COLFIND_H_\n#define TESSERACT_TEXTORD_COLFIND_H_\n\n#include \"colpartitiongrid.h\"\n#include \"colpartitionset.h\"\n#include \"debugpixa.h\"\n#include \"imagefind.h\"\n#include \"ocrblock.h\"\n#include \"tabfind.h\"\n#include \"textlineprojection.h\"\n\nclass BLOCK_LIST;\nstruct Boxa;\nstruct Pixa;\nclass DENORM;\nclass ScrollView;\nclass STATS;\nclass TO_BLOCK;\n\nnamespace tesseract {\n\nclass ColPartitionSet;\nclass ColPartitionSet_LIST;\nclass ColSegment_LIST;\nclass ColumnGroup_LIST;\nclass LineSpacing;\nclass StrokeWidth;\nclass TempColumn_LIST;\nclass EquationDetectBase;\n\n// The ColumnFinder class finds columns in the grid.\nclass TESS_API ColumnFinder : public TabFind {\npublic:\n  // Gridsize is an estimate of the text size in the image. 
A suitable value\n  // is in TO_BLOCK::line_size after find_components has been used to make\n  // the blobs.\n  // bleft and tright are the bounds of the image (rectangle) being processed.\n  // vlines is a (possibly empty) list of TabVector and vertical_x and y are\n  // the sum logical vertical vector produced by LineFinder::FindVerticalLines.\n  // If cjk_script is true, then broken CJK characters are fixed during\n  // layout analysis to assist in detecting horizontal vs vertically written\n  // textlines.\n  ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &tright, int resolution,\n               bool cjk_script, double aligned_gap_fraction, TabVector_LIST *vlines,\n               TabVector_LIST *hlines, int vertical_x, int vertical_y);\n  ~ColumnFinder() override;\n\n  // Accessors for testing\n  const DENORM *denorm() const {\n    return denorm_;\n  }\n  const TextlineProjection *projection() const {\n    return &projection_;\n  }\n  void set_cjk_script(bool is_cjk) {\n    cjk_script_ = is_cjk;\n  }\n\n  // ======================================================================\n  // The main function of ColumnFinder is broken into pieces to facilitate\n  // optional insertion of orientation and script detection in an efficient\n  // way. The calling sequence IS MANDATORY however, whether or not\n  // OSD is being used:\n  // 1. Construction.\n  // 2. SetupAndFilterNoise.\n  // 3. IsVerticallyAlignedText.\n  // 4. CorrectOrientation.\n  // 5. FindBlocks.\n  // 6. Destruction. Use of a single column finder for multiple images does not\n  //    make sense.\n  // Throughout these steps, the ColPartitions are owned by part_grid_, which\n  // means that it must be kept correct. Exception: big_parts_ owns its\n  // own ColPartitions.\n  // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except\n  // for a phase in FindBlocks before TransformToBlocks, when they become\n  // owned by the ColPartitions. 
The owner() ColPartition of a BLOBNBOX\n  // indicates more of a betrothal for the majority of layout analysis, ie\n  // which ColPartition will take ownership when the blobs are release from\n  // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that\n  // are part of the image regions, as they are not on any TO_BLOCK list.\n  // TODO(rays) break up column finder further into smaller classes, as\n  // there is a lot more to it than column finding now.\n  // ======================================================================\n\n  // Performs initial processing on the blobs in the input_block:\n  // Setup the part_grid, stroke_width_, nontext_map_.\n  // Obvious noise blobs are filtered out and used to mark the nontext_map_.\n  // Initial stroke-width analysis is used to get local text alignment\n  // direction, so the textline projection_ map can be setup.\n  // On return, IsVerticallyAlignedText may be called (now optionally) to\n  // determine the gross textline alignment of the page.\n  void SetupAndFilterNoise(PageSegMode pageseg_mode, Image photo_mask_pix, TO_BLOCK *input_block);\n\n  // Tests for vertical alignment of text (returning true if so), and generates\n  // a list of blobs (in osd_blobs) for orientation and script detection.\n  // block is the single block for the whole page or rectangle to be OCRed.\n  // Note that the vertical alignment may be due to text whose writing direction\n  // is vertical, like say Japanese, or due to text whose writing direction is\n  // horizontal but whose text appears vertically aligned because the image is\n  // not the right way up.\n  // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.\n  bool IsVerticallyAlignedText(double find_vertical_text_ratio, TO_BLOCK *block,\n                               BLOBNBOX_CLIST *osd_blobs);\n\n  // Rotates the blobs and the TabVectors so that the gross writing direction\n  // (text lines) are horizontal and lines are read down the page.\n  
// Applied rotation stored in rotation_.\n  // A second rotation is calculated for application during recognition to\n  // make the rotated blobs upright for recognition.\n  // Subsequent rotation stored in text_rotation_.\n  //\n  // Arguments:\n  //   vertical_text_lines is true if the text lines are vertical.\n  //   recognition_rotation [0..3] is the number of anti-clockwise 90 degree\n  //   rotations from osd required for the text to be upright and readable.\n  void CorrectOrientation(TO_BLOCK *block, bool vertical_text_lines, int recognition_rotation);\n\n  // Finds blocks of text, image, rule line, table etc, returning them in the\n  // blocks and to_blocks\n  // (Each TO_BLOCK points to the basic BLOCK and adds more information.)\n  // Image blocks are generated by a combination of photo_mask_pix (which may\n  // NOT be nullptr) and the rejected text found during preliminary textline\n  // finding.\n  // The input_block is the result of a call to find_components, and contains\n  // the blobs found in the image or rectangle to be OCRed. These blobs will be\n  // removed and placed in the output blocks, while unused ones will be deleted.\n  // If single_column is true, the input is treated as single column, but\n  // it is still divided into blocks of equal line spacing/text size.\n  // scaled_color is scaled down by scaled_factor from the input color image,\n  // and may be nullptr if the input was not color.\n  // grey_pix is optional, but if present must match the photo_mask_pix in size,\n  // and must be a *real* grey image instead of binary_pix * 255.\n  // thresholds_pix is expected to be present iff grey_pix is present and\n  // can be an integer factor reduction of the grey_pix. 
It represents the\n  // thresholds that were used to create the binary_pix from the grey_pix.\n  // Small blobs that confuse the segmentation into lines are placed into\n  // diacritic_blobs, with the intention that they be put into the most\n  // appropriate word after the rest of layout analysis.\n  // Returns -1 if the user hits the 'd' key in the blocks window while running\n  // in debug mode, which requests a retry with more debug info.\n  int FindBlocks(PageSegMode pageseg_mode, Image scaled_color, int scaled_factor, TO_BLOCK *block,\n                 Image photo_mask_pix, Image thresholds_pix, Image grey_pix, DebugPixa *pixa_debug,\n                 BLOCK_LIST *blocks, BLOBNBOX_LIST *diacritic_blobs, TO_BLOCK_LIST *to_blocks);\n\n  // Get the rotation required to deskew, and its inverse rotation.\n  void GetDeskewVectors(FCOORD *deskew, FCOORD *reskew);\n\n  // Set the equation detection pointer.\n  void SetEquationDetect(EquationDetectBase *detect);\n\nprivate:\n  // Displays the blob and block bounding boxes in a window called Blocks.\n  void DisplayBlocks(BLOCK_LIST *blocks);\n  // Displays the column edges at each grid y coordinate defined by\n  // best_columns_.\n  void DisplayColumnBounds(PartSetVector *sets);\n\n  ////// Functions involved in determining the columns used on the page. /////\n\n  // Sets up column_sets_ (the determined column layout at each horizontal\n  // slice). 
Returns false if the page is empty.\n  bool MakeColumns(bool single_column);\n  // Attempt to improve the column_candidates by expanding the columns\n  // and adding new partitions from the partition sets in src_sets.\n  // Src_sets may be equal to column_candidates, in which case it will\n  // use them as a source to improve themselves.\n  void ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets);\n  // Prints debug information on the column candidates.\n  void PrintColumnCandidates(const char *title);\n  // Finds the optimal set of columns that cover the entire image with as\n  // few changes in column partition as possible.\n  // Returns true if any part of the page is multi-column.\n  bool AssignColumns(const PartSetVector &part_sets);\n  // Finds the biggest range in part_sets_ that has no assigned column, but\n  // column assignment is possible.\n  bool BiggestUnassignedRange(int set_count, const bool *any_columns_possible, int *start,\n                              int *end);\n  // Finds the modal compatible column_set_ index within the given range.\n  int RangeModalColumnSet(int **column_set_costs, const int *assigned_costs, int start, int end);\n  // Given that there are many column_set_id compatible columns in the range,\n  // shrinks the range to the longest contiguous run of compatibility, allowing\n  // gaps where no columns are possible, but not where competing columns are\n  // possible.\n  void ShrinkRangeToLongestRun(int **column_set_costs, const int *assigned_costs,\n                               const bool *any_columns_possible, int column_set_id, int *best_start,\n                               int *best_end);\n  // Moves start in the direction of step, up to, but not including end while\n  // the only incompatible regions are no more than kMaxIncompatibleColumnCount\n  // in size, and the compatible regions beyond are bigger.\n  void ExtendRangePastSmallGaps(int **column_set_costs, const int *assigned_costs,\n           
                     const bool *any_columns_possible, int column_set_id, int step,\n                                int end, int *start);\n  // Assigns the given column_set_id to the part_sets_ in the given range.\n  void AssignColumnToRange(int column_set_id, int start, int end, int **column_set_costs,\n                           int *assigned_costs);\n\n  // Computes the mean_column_gap_.\n  void ComputeMeanColumnGap(bool any_multi_column);\n\n  //////// Functions that manipulate ColPartitions in the part_grid_ /////\n  //////// to split, merge, find margins, and find types.  //////////////\n\n  // Hoovers up all un-owned blobs and deletes them.\n  // The rest get released from the block so the ColPartitions can pass\n  // ownership to the output blocks.\n  void ReleaseBlobsAndCleanupUnused(TO_BLOCK *block);\n  // Splits partitions that cross columns where they have nothing in the gap.\n  void GridSplitPartitions();\n  // Merges partitions where there is vertical overlap, within a single column,\n  // and the horizontal gap is small enough.\n  void GridMergePartitions();\n  // Inserts remaining noise blobs into the most applicable partition if any.\n  // If there is no applicable partition, then the blobs are deleted.\n  void InsertRemainingNoise(TO_BLOCK *block);\n  // Remove partitions that come from horizontal lines that look like\n  // underlines, but are not part of a table.\n  void GridRemoveUnderlinePartitions();\n  // Add horizontal line separators as partitions.\n  void GridInsertHLinePartitions();\n  // Add vertical line separators as partitions.\n  void GridInsertVLinePartitions();\n  // For every ColPartition in the grid, sets its type based on position\n  // in the columns.\n  void SetPartitionTypes();\n  // Only images remain with multiple types in a run of partners.\n  // Sets the type of all in the group to the maximum of the group.\n  void SmoothPartnerRuns();\n\n  //////// Functions that make the final output blocks             ///////\n\n  // 
Helper functions for TransformToBlocks.\n  // Add the part to the temp list in the correct order.\n  void AddToTempPartList(ColPartition *part, ColPartition_CLIST *temp_list);\n  // Add everything from the temp list to the work_set assuming correct order.\n  void EmptyTempPartList(ColPartition_CLIST *temp_list, WorkingPartSet_LIST *work_set);\n\n  // Transform the grid of partitions to the output blocks.\n  void TransformToBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);\n\n  // Reflect the blob boxes (but not the outlines) in the y-axis so that\n  // the blocks get created in the correct RTL order. Rotates the blobs\n  // in the input_block and the bblobs list.\n  // The reflection is undone in RotateAndReskewBlocks by\n  // reflecting the blocks themselves, and then recomputing the blob bounding\n  //  boxes.\n  void ReflectForRtl(TO_BLOCK *input_block, BLOBNBOX_LIST *bblobs);\n\n  // Undo the deskew that was done in FindTabVectors, as recognition is done\n  // without correcting blobs or blob outlines for skew.\n  // Reskew the completed blocks to put them back to the original rotated coords\n  // that were created by CorrectOrientation.\n  // If the input_is_rtl, then reflect the blocks in the y-axis to undo the\n  // reflection that was done before FindTabVectors.\n  // Blocks that were identified as vertical text (relative to the rotated\n  // coordinates) are further rotated so the text lines are horizontal.\n  // blob polygonal outlines are rotated to match the position of the blocks\n  // that they are in, and their bounding boxes are recalculated to be accurate.\n  // Record appropriate inverse transformations and required\n  // classifier transformation in the blocks.\n  void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST *to_blocks);\n\n  // Computes the rotations for the block (to make textlines horizontal) and\n  // for the blobs (for classification) and sets the appropriate members\n  // of the given block.\n  // Returns the rotation 
that needs to be applied to the blobs to make\n  // them sit in the rotated block.\n  FCOORD ComputeBlockAndClassifyRotation(BLOCK *block);\n\n  // If true then the page language is cjk, so it is safe to perform\n  // FixBrokenCJK.\n  bool cjk_script_;\n  // The minimum gutter width to apply for finding columns.\n  // Modified when vertical text is detected to prevent detection of\n  // vertical text lines as columns.\n  int min_gutter_width_;\n  // The mean gap between columns over the page.\n  int mean_column_gap_;\n  // Config param saved at construction time. Modifies min_gutter_width_ with\n  // vertical text to prevent detection of vertical text as columns.\n  double tabfind_aligned_gap_fraction_;\n  // The rotation vector needed to convert original coords to deskewed.\n  FCOORD deskew_;\n  // The rotation vector needed to convert deskewed back to original coords.\n  FCOORD reskew_;\n  // The rotation vector used to rotate vertically oriented pages.\n  FCOORD rotation_;\n  // The rotation vector needed to convert the rotated back to original coords.\n  FCOORD rerotate_;\n  // The additional rotation vector needed to rotate text for recognition.\n  FCOORD text_rotation_;\n  // The column_sets_ contain the ordered candidate ColPartitionSets that\n  // define the possible divisions of the page into columns.\n  PartSetVector column_sets_;\n  // A simple array of pointers to the best assigned column division at\n  // each grid y coordinate.\n  ColPartitionSet **best_columns_;\n  // The grid used for creating initial partitions with strokewidth.\n  StrokeWidth *stroke_width_;\n  // The grid used to hold ColPartitions after the columns have been determined.\n  ColPartitionGrid part_grid_;\n  // List of ColPartitions that are no longer needed after they have been\n  // turned into regions, but are kept around because they are referenced\n  // by the part_grid_.\n  ColPartition_LIST good_parts_;\n  // List of ColPartitions that are big and might be dropcap or 
vertically\n  // joined.\n  ColPartition_LIST big_parts_;\n  // List of ColPartitions that have been declared noise.\n  ColPartition_LIST noise_parts_;\n  // The fake blobs that are made from the images.\n  BLOBNBOX_LIST image_bblobs_;\n  // Horizontal line separators.\n  TabVector_LIST horizontal_lines_;\n  // Image map of photo/noise areas on the page.\n  Image nontext_map_;\n  // Textline projection map.\n  TextlineProjection projection_;\n  // Sequence of DENORMS that indicate how to get back to the original image\n  // coordinate space. The destructor must delete all the DENORMs in the chain.\n  DENORM *denorm_;\n\n  // The equation region detector pointer. Note: This pointer is passed in by\n  // member function SetEquationDetect, and releasing it is NOT owned by this\n  // class.\n  EquationDetectBase *equation_detect_;\n\n#ifndef GRAPHICS_DISABLED\n  // Various debug windows that automatically go away on completion.\n  ScrollView *input_blobs_win_ = nullptr;\n\n  // Allow a subsequent instance to reuse the blocks window.\n  // Not thread-safe, but multiple threads shouldn't be using windows anyway.\n  static ScrollView *blocks_win_;\n#endif\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_COLFIND_H_\n"
  },
  {
    "path": "src/textord/colpartition.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colpartition.cpp\n// Description: Class to hold partitions of the page that correspond\n//              roughly to text lines.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"colpartition.h\"\n#include \"colpartitiongrid.h\"\n#include \"colpartitionset.h\"\n#include \"detlinefit.h\"\n#include \"dppoint.h\"\n#include \"helpers.h\" // for UpdateRange\n#include \"host.h\"    // for NearlyEqual\n#include \"imagefind.h\"\n#include \"workingpartset.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n//////////////// ColPartition Implementation ////////////////\n\n// enum to refer to the entries in a neighbourhood of lines.\n// Used by SmoothSpacings to test for blips with OKSpacingBlip.\nenum SpacingNeighbourhood {\n  PN_ABOVE2,\n  PN_ABOVE1,\n  PN_UPPER,\n  PN_LOWER,\n  PN_BELOW1,\n  PN_BELOW2,\n  PN_COUNT\n};\n\n// Maximum change in spacing (in inches) to ignore.\nconst double kMaxSpacingDrift = 1.0 / 72; // 1/72 is one point.\n// Maximum fraction of line height used as an additional allowance\n// for top spacing.\nconst double kMaxTopSpacingFraction = 0.25;\n// What multiple of the largest line height should be used as an upper bound\n// for 
whether lines are in the same text block?\nconst double kMaxSameBlockLineSpacing = 3;\n// Maximum ratio of sizes for lines to be considered the same size.\nconst double kMaxSizeRatio = 1.5;\n// Fraction of max of leader width and gap for max IQR of gaps.\nconst double kMaxLeaderGapFractionOfMax = 0.25;\n// Fraction of min of leader width and gap for max IQR of gaps.\nconst double kMaxLeaderGapFractionOfMin = 0.5;\n// Minimum number of blobs to be considered a leader.\nconst int kMinLeaderCount = 5;\n// Minimum score for a STRONG_CHAIN textline.\nconst int kMinStrongTextValue = 6;\n// Minimum score for a CHAIN textline.\nconst int kMinChainTextValue = 3;\n// Minimum number of blobs for strong horizontal text lines.\nconst int kHorzStrongTextlineCount = 8;\n// Minimum height (in image pixels) for strong horizontal text lines.\nconst int kHorzStrongTextlineHeight = 10;\n// Minimum aspect ratio for strong horizontal text lines.\nconst int kHorzStrongTextlineAspect = 5;\n// Maximum upper quartile error allowed on a baseline fit as a fraction\n// of height.\nconst double kMaxBaselineError = 0.4375;\n// Min coverage for a good baseline between vectors\nconst double kMinBaselineCoverage = 0.5;\n// Max RMS color noise to compare colors.\nconst int kMaxRMSColorNoise = 128;\n// Maximum distance to allow a partition color to be to use that partition\n// in smoothing neighbouring types. 
This is a squared distance.\nconst int kMaxColorDistance = 900;\n\n// blob_type is the blob_region_type_ of the blobs in this partition.\n// Vertical is the direction of logical vertical on the possibly skewed image.\nColPartition::ColPartition(BlobRegionType blob_type, const ICOORD &vertical)\n    : left_margin_(-INT32_MAX),\n      right_margin_(INT32_MAX),\n      median_bottom_(INT32_MAX),\n      median_top_(-INT32_MAX),\n      median_left_(INT32_MAX),\n      median_right_(-INT32_MAX),\n      blob_type_(blob_type),\n      vertical_(vertical) {\n  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));\n}\n\n// Constructs a fake ColPartition with a single fake BLOBNBOX, all made\n// from a single TBOX.\n// WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and\n// the ColPartition owns the BLOBNBOX!!!\n// Call DeleteBoxes before deleting the ColPartition.\nColPartition *ColPartition::FakePartition(const TBOX &box,\n                                          PolyBlockType block_type,\n                                          BlobRegionType blob_type,\n                                          BlobTextFlowType flow) {\n  auto *part = new ColPartition(blob_type, ICOORD(0, 1));\n  part->set_type(block_type);\n  part->set_flow(flow);\n  part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));\n  part->set_left_margin(box.left());\n  part->set_right_margin(box.right());\n  part->SetBlobTypes();\n  part->ComputeLimits();\n  part->ClaimBoxes();\n  return part;\n}\n\n// Constructs and returns a ColPartition with the given real BLOBNBOX,\n// and sets it up to be a \"big\" partition (single-blob partition bigger\n// than the surrounding text that may be a dropcap, two or more vertically\n// touching characters, or some graphic element.\n// If the given list is not nullptr, the partition is also added to the list.\nColPartition *ColPartition::MakeBigPartition(BLOBNBOX *box,\n                                             ColPartition_LIST *big_part_list) 
{\n  box->set_owner(nullptr);\n  auto *single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));\n  single->set_flow(BTFT_NONE);\n  single->AddBox(box);\n  single->ComputeLimits();\n  single->ClaimBoxes();\n  single->SetBlobTypes();\n  single->set_block_owned(true);\n  if (big_part_list != nullptr) {\n    ColPartition_IT part_it(big_part_list);\n    part_it.add_to_end(single);\n  }\n  return single;\n}\n\nColPartition::~ColPartition() {\n  // Remove this as a partner of all partners, as we don't want them\n  // referring to a deleted object.\n  ColPartition_C_IT it(&upper_partners_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->RemovePartner(false, this);\n  }\n  it.set_to_list(&lower_partners_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->RemovePartner(true, this);\n  }\n}\n\n// Constructs a fake ColPartition with no BLOBNBOXes to represent a\n// horizontal or vertical line, given a type and a bounding box.\nColPartition *ColPartition::MakeLinePartition(BlobRegionType blob_type,\n                                              const ICOORD &vertical, int left,\n                                              int bottom, int right, int top) {\n  auto *part = new ColPartition(blob_type, vertical);\n  part->bounding_box_ = TBOX(left, bottom, right, top);\n  part->median_bottom_ = bottom;\n  part->median_top_ = top;\n  part->median_height_ = top - bottom;\n  part->median_left_ = left;\n  part->median_right_ = right;\n  part->median_width_ = right - left;\n  part->left_key_ = part->BoxLeftKey();\n  part->right_key_ = part->BoxRightKey();\n  return part;\n}\n\n// Adds the given box to the partition, updating the partition bounds.\n// The list of boxes in the partition is updated, ensuring that no box is\n// recorded twice, and the boxes are kept in increasing left position.\nvoid ColPartition::AddBox(BLOBNBOX *bbox) {\n  TBOX box = bbox->bounding_box();\n  // Update the partition limits.\n  if (boxes_.empty()) 
{\n    bounding_box_ = box;\n  } else {\n    bounding_box_ += box;\n  }\n\n  if (IsVerticalType()) {\n    if (!last_add_was_vertical_) {\n      boxes_.sort(SortByBoxBottom<BLOBNBOX>);\n      last_add_was_vertical_ = true;\n    }\n    boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);\n  } else {\n    if (last_add_was_vertical_) {\n      boxes_.sort(SortByBoxLeft<BLOBNBOX>);\n      last_add_was_vertical_ = false;\n    }\n    boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);\n  }\n  if (!left_key_tab_) {\n    left_key_ = BoxLeftKey();\n  }\n  if (!right_key_tab_) {\n    right_key_ = BoxRightKey();\n  }\n  if (TabFind::WithinTestRegion(2, box.left(), box.bottom())) {\n    tprintf(\"Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\\n\",\n            box.left(), box.bottom(), box.right(), box.top(),\n            bounding_box_.left(), bounding_box_.right());\n  }\n}\n\n// Removes the given box from the partition, updating the bounds.\nvoid ColPartition::RemoveBox(BLOBNBOX *box) {\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    if (box == bb_it.data()) {\n      bb_it.extract();\n      ComputeLimits();\n      return;\n    }\n  }\n}\n\n// Returns the tallest box in the partition, as measured perpendicular to the\n// presumed flow of text.\nBLOBNBOX *ColPartition::BiggestBox() {\n  BLOBNBOX *biggest = nullptr;\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    BLOBNBOX *bbox = bb_it.data();\n    if (IsVerticalType()) {\n      if (biggest == nullptr ||\n          bbox->bounding_box().width() > biggest->bounding_box().width()) {\n        biggest = bbox;\n      }\n    } else {\n      if (biggest == nullptr ||\n          bbox->bounding_box().height() > biggest->bounding_box().height()) {\n        biggest = bbox;\n      }\n    }\n  }\n  return biggest;\n}\n\n// Returns the bounding box excluding the given box.\nTBOX 
ColPartition::BoundsWithoutBox(BLOBNBOX *box) {\n  TBOX result;\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    if (box != bb_it.data()) {\n      result += bb_it.data()->bounding_box();\n    }\n  }\n  return result;\n}\n\n// Claims the boxes in the boxes_list by marking them with a this owner\n// pointer. If a box is already owned, then it must be owned by this.\nvoid ColPartition::ClaimBoxes() {\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    BLOBNBOX *bblob = bb_it.data();\n    ColPartition *other = bblob->owner();\n    if (other == nullptr) {\n      // Normal case: ownership is available.\n      bblob->set_owner(this);\n    } else {\n      ASSERT_HOST(other == this);\n    }\n  }\n}\n\n// nullptr the owner of the blobs in this partition, so they can be deleted\n// independently of the ColPartition.\nvoid ColPartition::DisownBoxes() {\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    BLOBNBOX *bblob = bb_it.data();\n    ASSERT_HOST(bblob->owner() == this || bblob->owner() == nullptr);\n    bblob->set_owner(nullptr);\n  }\n}\n\n// nullptr the owner of the blobs in this partition that are owned by this\n// partition, so they can be deleted independently of the ColPartition.\n// Any blobs that are not owned by this partition get to keep their owner\n// without an assert failure.\nvoid ColPartition::DisownBoxesNoAssert() {\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    BLOBNBOX *bblob = bb_it.data();\n    if (bblob->owner() == this) {\n      bblob->set_owner(nullptr);\n    }\n  }\n}\n\n// Nulls the owner of the blobs in this partition that are owned by this\n// partition and not leader blobs, removing them from the boxes_ list, thus\n// turning this partition back to a leader partition if it contains a leader,\n// or 
otherwise leaving it empty. Returns true if any boxes remain.\nbool ColPartition::ReleaseNonLeaderBoxes() {\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    BLOBNBOX *bblob = bb_it.data();\n    if (bblob->flow() != BTFT_LEADER) {\n      if (bblob->owner() == this) {\n        bblob->set_owner(nullptr);\n      }\n      bb_it.extract();\n    }\n  }\n  if (bb_it.empty()) {\n    return false;\n  }\n  flow_ = BTFT_LEADER;\n  ComputeLimits();\n  return true;\n}\n\n// Delete the boxes that this partition owns.\nvoid ColPartition::DeleteBoxes() {\n  // Although the boxes_ list is a C_LIST, in some cases it owns the\n  // BLOBNBOXes, as the ColPartition takes ownership from the grid,\n  // and the BLOBNBOXes own the underlying C_BLOBs.\n  for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {\n    BLOBNBOX *bblob = bb_it.extract();\n    // TODO: remove next line, currently still needed for resultiterator_test.\n    delete bblob->remove_cblob();\n    delete bblob;\n  }\n}\n\n// Reflects the partition in the y-axis, assuming that its blobs have\n// already been done. 
Corrects only a limited part of the members, since\n// this function is assumed to be used shortly after initial creation, which\n// is before a lot of the members are used.\nvoid ColPartition::ReflectInYAxis() {\n  BLOBNBOX_CLIST reversed_boxes;\n  BLOBNBOX_C_IT reversed_it(&reversed_boxes);\n  // Reverse the order of the boxes_.\n  BLOBNBOX_C_IT bb_it(&boxes_);\n  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {\n    reversed_it.add_before_then_move(bb_it.extract());\n  }\n  bb_it.add_list_after(&reversed_boxes);\n  ASSERT_HOST(!left_key_tab_ && !right_key_tab_);\n  int tmp = left_margin_;\n  left_margin_ = -right_margin_;\n  right_margin_ = -tmp;\n  ComputeLimits();\n}\n\n// Returns true if this is a legal partition - meaning that the conditions\n// left_margin <= bounding_box left\n// left_key <= bounding box left key\n// bounding box left <= bounding box right\n// and likewise for right margin and key\n// are all met.\nbool ColPartition::IsLegal() {\n  if (bounding_box_.left() > bounding_box_.right()) {\n    if (textord_debug_bugs) {\n      tprintf(\"Bounding box invalid\\n\");\n      Print();\n    }\n    return false; // Bounding box invalid.\n  }\n  if (left_margin_ > bounding_box_.left() ||\n      right_margin_ < bounding_box_.right()) {\n    if (textord_debug_bugs) {\n      tprintf(\"Margins invalid\\n\");\n      Print();\n    }\n    return false; // Margins invalid.\n  }\n  if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {\n    if (textord_debug_bugs) {\n      tprintf(\"Key inside box: %d v %d or %d v %d\\n\", left_key_, BoxLeftKey(),\n              right_key_, BoxRightKey());\n      Print();\n    }\n    return false; // Keys inside the box.\n  }\n  return true;\n}\n\n// Returns true if the left and right edges are approximately equal.\nbool ColPartition::MatchingColumns(const ColPartition &other) const {\n  int y = (MidY() + other.MidY()) / 2;\n  if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,\n                   
LeftAtY(y) / kColumnWidthFactor, 1)) {\n    return false;\n  }\n  if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,\n                   RightAtY(y) / kColumnWidthFactor, 1)) {\n    return false;\n  }\n  return true;\n}\n\n// Returns true if the colors match for two text partitions.\nbool ColPartition::MatchingTextColor(const ColPartition &other) const {\n  if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&\n      other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise) {\n    return false; // Too noisy.\n  }\n\n  // Colors must match for other to count.\n  double d_this1_o =\n      ImageFind::ColorDistanceFromLine(other.color1_, other.color2_, color1_);\n  double d_this2_o =\n      ImageFind::ColorDistanceFromLine(other.color1_, other.color2_, color2_);\n  double d_o1_this =\n      ImageFind::ColorDistanceFromLine(color1_, color2_, other.color1_);\n  double d_o2_this =\n      ImageFind::ColorDistanceFromLine(color1_, color2_, other.color2_);\n  // All 4 distances must be small enough.\n  return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&\n         d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;\n}\n\n// Returns true if the sizes match for two text partitions,\n// taking orientation into account. 
See also SizesSimilar.\nbool ColPartition::MatchingSizes(const ColPartition &other) const {\n  if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT) {\n    return !TabFind::DifferentSizes(median_width_, other.median_width_);\n  } else {\n    return !TabFind::DifferentSizes(median_height_, other.median_height_);\n  }\n}\n\n// Returns true if there is no tabstop violation in merging this and other.\nbool ColPartition::ConfirmNoTabViolation(const ColPartition &other) const {\n  if (bounding_box_.right() < other.bounding_box_.left() &&\n      bounding_box_.right() < other.LeftBlobRule()) {\n    return false;\n  }\n  if (other.bounding_box_.right() < bounding_box_.left() &&\n      other.bounding_box_.right() < LeftBlobRule()) {\n    return false;\n  }\n  if (bounding_box_.left() > other.bounding_box_.right() &&\n      bounding_box_.left() > other.RightBlobRule()) {\n    return false;\n  }\n  if (other.bounding_box_.left() > bounding_box_.right() &&\n      other.bounding_box_.left() > RightBlobRule()) {\n    return false;\n  }\n  return true;\n}\n\n// Returns true if other has a similar stroke width to this.\nbool ColPartition::MatchingStrokeWidth(const ColPartition &other,\n                                       double fractional_tolerance,\n                                       double constant_tolerance) const {\n  int match_count = 0;\n  int nonmatch_count = 0;\n  BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST *>(&boxes_));\n  BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST *>(&other.boxes_));\n  box_it.mark_cycle_pt();\n  other_it.mark_cycle_pt();\n  while (!box_it.cycled_list() && !other_it.cycled_list()) {\n    if (box_it.data()->MatchingStrokeWidth(\n            *other_it.data(), fractional_tolerance, constant_tolerance)) {\n      ++match_count;\n    } else {\n      ++nonmatch_count;\n    }\n    box_it.forward();\n    other_it.forward();\n  }\n  return match_count > nonmatch_count;\n}\n\n// Returns true if base is an acceptable diacritic base 
// char merge
// with this as the diacritic.
// Returns true if:
// (1) this is a ColPartition containing only diacritics, and
// (2) the base characters indicated on the diacritics all believably lie
// within the text line of the candidate ColPartition.
bool ColPartition::OKDiacriticMerge(const ColPartition &candidate,
                                    bool debug) const {
  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
  // min_top/max_bottom accumulate the intersection of the vertical ranges
  // of all the base characters referenced by the diacritic blobs.
  int min_top = INT32_MAX;
  int max_bottom = -INT32_MAX;
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOBNBOX *blob = it.data();
    if (!blob->IsDiacritic()) {
      if (debug) {
        tprintf("Blob is not a diacritic:");
        blob->bounding_box().print();
      }
      return false; // All blobs must have diacritic bases.
    }
    if (blob->base_char_top() < min_top) {
      min_top = blob->base_char_top();
    }
    if (blob->base_char_bottom() > max_bottom) {
      max_bottom = blob->base_char_bottom();
    }
  }
  // If the intersection of all vertical ranges of all base characters
  // overlaps the median range of this, then it is OK.
  bool result =
      min_top > candidate.median_bottom_ && max_bottom < candidate.median_top_;
  if (debug) {
    if (result) {
      tprintf("OKDiacritic!\n");
    } else {
      tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n", max_bottom, min_top,
              median_bottom_, median_top_);
    }
  }
  return result;
}

// Sets the sort key using either the tab vector, or the bounding box if
// the tab vector is nullptr.
If the tab_vector lies inside the bounding_box,\n// use the edge of the box as a key any way.\nvoid ColPartition::SetLeftTab(const TabVector *tab_vector) {\n  if (tab_vector != nullptr) {\n    left_key_ = tab_vector->sort_key();\n    left_key_tab_ = left_key_ <= BoxLeftKey();\n  } else {\n    left_key_tab_ = false;\n  }\n  if (!left_key_tab_) {\n    left_key_ = BoxLeftKey();\n  }\n}\n\n// As SetLeftTab, but with the right.\nvoid ColPartition::SetRightTab(const TabVector *tab_vector) {\n  if (tab_vector != nullptr) {\n    right_key_ = tab_vector->sort_key();\n    right_key_tab_ = right_key_ >= BoxRightKey();\n  } else {\n    right_key_tab_ = false;\n  }\n  if (!right_key_tab_) {\n    right_key_ = BoxRightKey();\n  }\n}\n\n// Copies the left/right tab from the src partition, but if take_box is\n// true, copies the box instead and uses that as a key.\nvoid ColPartition::CopyLeftTab(const ColPartition &src, bool take_box) {\n  left_key_tab_ = take_box ? false : src.left_key_tab_;\n  if (left_key_tab_) {\n    left_key_ = src.left_key_;\n  } else {\n    bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));\n    left_key_ = BoxLeftKey();\n  }\n  if (left_margin_ > bounding_box_.left()) {\n    left_margin_ = src.left_margin_;\n  }\n}\n\n// As CopyLeftTab, but with the right.\nvoid ColPartition::CopyRightTab(const ColPartition &src, bool take_box) {\n  right_key_tab_ = take_box ? 
false : src.right_key_tab_;\n  if (right_key_tab_) {\n    right_key_ = src.right_key_;\n  } else {\n    bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));\n    right_key_ = BoxRightKey();\n  }\n  if (right_margin_ < bounding_box_.right()) {\n    right_margin_ = src.right_margin_;\n  }\n}\n\n// Returns the left rule line x coord of the leftmost blob.\nint ColPartition::LeftBlobRule() const {\n  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));\n  return it.data()->left_rule();\n}\n// Returns the right rule line x coord of the rightmost blob.\nint ColPartition::RightBlobRule() const {\n  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));\n  it.move_to_last();\n  return it.data()->right_rule();\n}\n\nfloat ColPartition::SpecialBlobsDensity(const BlobSpecialTextType type) const {\n  ASSERT_HOST(type < BSTT_COUNT);\n  return special_blobs_densities_[type];\n}\n\nint ColPartition::SpecialBlobsCount(const BlobSpecialTextType type) {\n  ASSERT_HOST(type < BSTT_COUNT);\n  BLOBNBOX_C_IT blob_it(&boxes_);\n  int count = 0;\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    BlobSpecialTextType blob_type = blob->special_text_type();\n    if (blob_type == type) {\n      count++;\n    }\n  }\n\n  return count;\n}\n\nvoid ColPartition::SetSpecialBlobsDensity(const BlobSpecialTextType type,\n                                          const float density) {\n  ASSERT_HOST(type < BSTT_COUNT);\n  special_blobs_densities_[type] = density;\n}\n\nvoid ColPartition::ComputeSpecialBlobsDensity() {\n  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));\n  if (boxes_.empty()) {\n    return;\n  }\n\n  BLOBNBOX_C_IT blob_it(&boxes_);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    BlobSpecialTextType type = blob->special_text_type();\n    special_blobs_densities_[type]++;\n  }\n\n  for (float &special_blobs_density 
: special_blobs_densities_) {\n    special_blobs_density /= boxes_.length();\n  }\n}\n\n// Add a partner above if upper, otherwise below.\n// Add them uniquely and keep the list sorted by box left.\n// Partnerships are added symmetrically to partner and this.\nvoid ColPartition::AddPartner(bool upper, ColPartition *partner) {\n  if (upper) {\n    partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true,\n                                        this);\n    upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);\n  } else {\n    partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true,\n                                        this);\n    lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);\n  }\n}\n\n// Removes the partner from this, but does not remove this from partner.\n// This asymmetric removal is so as not to mess up the iterator that is\n// working on partner's partner list.\nvoid ColPartition::RemovePartner(bool upper, ColPartition *partner) {\n  ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    if (it.data() == partner) {\n      it.extract();\n      break;\n    }\n  }\n}\n\n// Returns the partner if the given partner is a singleton, otherwise nullptr.\nColPartition *ColPartition::SingletonPartner(bool upper) {\n  ColPartition_CLIST *partners = upper ? &upper_partners_ : &lower_partners_;\n  if (!partners->singleton()) {\n    return nullptr;\n  }\n  ColPartition_C_IT it(partners);\n  return it.data();\n}\n\n// Merge with the other partition and delete it.\nvoid ColPartition::Absorb(ColPartition *other, const WidthCallback &cb) {\n  // The result has to either own all of the blobs or none of them.\n  // Verify the flag is consistent.\n  ASSERT_HOST(owns_blobs() == other->owns_blobs());\n  // TODO(nbeato): check owns_blobs better. Right now owns_blobs\n  // should always be true when this is called. 
So there is no issues.\n  if (TabFind::WithinTestRegion(2, bounding_box_.left(),\n                                bounding_box_.bottom()) ||\n      TabFind::WithinTestRegion(2, other->bounding_box_.left(),\n                                other->bounding_box_.bottom())) {\n    tprintf(\"Merging:\");\n    Print();\n    other->Print();\n  }\n\n  // Update the special_blobs_densities_.\n  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));\n  for (int type = 0; type < BSTT_COUNT; ++type) {\n    unsigned w1 = boxes_.length();\n    unsigned w2 = other->boxes_.length();\n    float new_val = special_blobs_densities_[type] * w1 +\n                    other->special_blobs_densities_[type] * w2;\n    if (!w1 || !w2) {\n      ASSERT_HOST((w1 + w2) > 0);\n      special_blobs_densities_[type] = new_val / (w1 + w2);\n    }\n  }\n\n  // Merge the two sorted lists.\n  BLOBNBOX_C_IT it(&boxes_);\n  BLOBNBOX_C_IT it2(&other->boxes_);\n  for (; !it2.empty(); it2.forward()) {\n    BLOBNBOX *bbox2 = it2.extract();\n    ColPartition *prev_owner = bbox2->owner();\n    if (prev_owner != other && prev_owner != nullptr) {\n      // A blob on other's list is owned by someone else; let them have it.\n      continue;\n    }\n    ASSERT_HOST(prev_owner == other || prev_owner == nullptr);\n    if (prev_owner == other) {\n      bbox2->set_owner(this);\n    }\n    it.add_to_end(bbox2);\n  }\n  left_margin_ = std::min(left_margin_, other->left_margin_);\n  right_margin_ = std::max(right_margin_, other->right_margin_);\n  if (other->left_key_ < left_key_) {\n    left_key_ = other->left_key_;\n    left_key_tab_ = other->left_key_tab_;\n  }\n  if (other->right_key_ > right_key_) {\n    right_key_ = other->right_key_;\n    right_key_tab_ = other->right_key_tab_;\n  }\n  // Combine the flow and blob_type in a sensible way.\n  // Dominant flows stay.\n  if (!DominatesInMerge(flow_, other->flow_)) {\n    flow_ = other->flow_;\n    blob_type_ = other->blob_type_;\n  }\n  
SetBlobTypes();\n  if (IsVerticalType()) {\n    boxes_.sort(SortByBoxBottom<BLOBNBOX>);\n    last_add_was_vertical_ = true;\n  } else {\n    boxes_.sort(SortByBoxLeft<BLOBNBOX>);\n    last_add_was_vertical_ = false;\n  }\n  ComputeLimits();\n  // Fix partner lists. other is going away, so remove it as a\n  // partner of all its partners and add this in its place.\n  for (int upper = 0; upper < 2; ++upper) {\n    ColPartition_CLIST partners;\n    ColPartition_C_IT part_it(&partners);\n    part_it.add_list_after(upper ? &other->upper_partners_\n                                 : &other->lower_partners_);\n    for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {\n      ColPartition *partner = part_it.extract();\n      partner->RemovePartner(!upper, other);\n      partner->RemovePartner(!upper, this);\n      partner->AddPartner(!upper, this);\n    }\n  }\n  delete other;\n  if (cb != nullptr) {\n    SetColumnGoodness(cb);\n  }\n}\n\n// Merge1 and merge2 are candidates to be merged, yet their combined box\n// overlaps this. 
Is that allowed?\n// Returns true if the overlap between this and the merged pair of\n// merge candidates is sufficiently trivial to be allowed.\n// The merged box can graze the edge of this by the ok_box_overlap\n// if that exceeds the margin to the median top and bottom.\n// ok_box_overlap should be set by the caller appropriate to the sizes of\n// the text involved, and is usually a fraction of the median size of merge1\n// and/or merge2, or this.\n// TODO(rays) Determine whether vertical text needs to be considered.\nbool ColPartition::OKMergeOverlap(const ColPartition &merge1,\n                                  const ColPartition &merge2,\n                                  int ok_box_overlap, bool debug) {\n  // Vertical partitions are not allowed to be involved.\n  if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {\n    if (debug) {\n      tprintf(\"Vertical partition\\n\");\n    }\n    return false;\n  }\n  // The merging partitions must strongly overlap each other.\n  if (!merge1.VSignificantCoreOverlap(merge2)) {\n    if (debug) {\n      tprintf(\"Voverlap %d (%d)\\n\", merge1.VCoreOverlap(merge2),\n              merge1.VSignificantCoreOverlap(merge2));\n    }\n    return false;\n  }\n  // The merged box must not overlap the median bounds of this.\n  TBOX merged_box(merge1.bounding_box());\n  merged_box += merge2.bounding_box();\n  if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&\n      merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&\n      merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {\n    if (debug) {\n      tprintf(\"Excessive box overlap\\n\");\n    }\n    return false;\n  }\n  // Looks OK!\n  return true;\n}\n\n// Find the blob at which to split this to minimize the overlap with the\n// given box. 
// Returns the first blob to go in the second partition.
BLOBNBOX *ColPartition::OverlapSplitBlob(const TBOX &box) {
  // Need at least two blobs for a split to make sense.
  if (boxes_.empty() || boxes_.singleton()) {
    return nullptr;
  }
  BLOBNBOX_C_IT it(&boxes_);
  TBOX left_box(it.data()->bounding_box());
  // Grow left_box blob by blob; the first blob whose inclusion makes the
  // accumulated box overlap the given box is where the split goes.
  for (it.forward(); !it.at_first(); it.forward()) {
    BLOBNBOX *bbox = it.data();
    left_box += bbox->bounding_box();
    if (left_box.overlap(box)) {
      return bbox;
    }
  }
  return nullptr;
}

// Split this partition keeping the first half in this and returning
// the second half.
// Splits by putting the split_blob and the blobs that follow
// in the second half, and the rest in the first half.
ColPartition *ColPartition::SplitAtBlob(BLOBNBOX *split_blob) {
  ColPartition *split_part = ShallowCopy();
  split_part->set_owns_blobs(owns_blobs());
  BLOBNBOX_C_IT it(&boxes_);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOBNBOX *bbox = it.data();
    ColPartition *prev_owner = bbox->owner();
    ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
    // Once split_blob is seen, every subsequent blob moves to split_part.
    if (bbox == split_blob || !split_part->boxes_.empty()) {
      split_part->AddBox(it.extract());
      if (owns_blobs() && prev_owner != nullptr) {
        bbox->set_owner(split_part);
      }
    }
  }
  ASSERT_HOST(!it.empty());
  if (split_part->IsEmpty()) {
    // Split part ended up with nothing. Possible if split_blob is not
    // in the list of blobs.
    delete split_part;
    return nullptr;
  }
  // The cut edge is no longer backed by a tab vector on either side.
  right_key_tab_ = false;
  split_part->left_key_tab_ = false;
  ComputeLimits();
  // TODO(nbeato) Merge Ray's CL like this:
  // if (owns_blobs())
  //  SetBlobTextlineGoodness();
  split_part->ComputeLimits();
  // TODO(nbeato) Merge Ray's CL like this:
  // if (split_part->owns_blobs())
  //   split_part->SetBlobTextlineGoodness();
  return split_part;
}

// Split this partition at the given x coordinate, returning the right
// half and keeping the left half in this.
ColPartition *ColPartition::SplitAt(int split_x) {
  if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right()) {
    return nullptr; // There will be no change.
  }
  ColPartition *split_part = ShallowCopy();
  split_part->set_owns_blobs(owns_blobs());
  BLOBNBOX_C_IT it(&boxes_);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOBNBOX *bbox = it.data();
    ColPartition *prev_owner = bbox->owner();
    ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
    const TBOX &box = bbox->bounding_box();
    // Blobs entirely at or right of split_x go to the new partition.
    if (box.left() >= split_x) {
      split_part->AddBox(it.extract());
      if (owns_blobs() && prev_owner != nullptr) {
        bbox->set_owner(split_part);
      }
    }
  }
  if (it.empty()) {
    // Possible if split-x passes through the first blob.
    // Everything moved; take it all back so this stays non-empty.
    it.add_list_after(&split_part->boxes_);
  }
  ASSERT_HOST(!it.empty());
  if (split_part->IsEmpty()) {
    // Split part ended up with nothing. Possible if split_x passes
    // through the last blob.
    delete split_part;
    return nullptr;
  }
  right_key_tab_ = false;
  split_part->left_key_tab_ = false;
  right_margin_ = split_x;
  split_part->left_margin_ = split_x;
  ComputeLimits();
  split_part->ComputeLimits();
  return split_part;
}

// Recalculates all the coordinate limits of the partition.
void ColPartition::ComputeLimits() {
  bounding_box_ = TBOX(); // Clear it
  BLOBNBOX_C_IT it(&boxes_);
  BLOBNBOX *bbox = nullptr;
  int non_leader_count = 0;
  if (it.empty()) {
    // No blobs: fall back to the margins for the horizontal extent.
    bounding_box_.set_left(left_margin_);
    bounding_box_.set_right(right_margin_);
    bounding_box_.set_bottom(0);
    bounding_box_.set_top(0);
  } else {
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      bbox = it.data();
      bounding_box_ += bbox->bounding_box();
      if (bbox->flow() != BTFT_LEADER) {
        ++non_leader_count;
      }
    }
  }
  // Refresh the keys from the new box unless backed by a tab vector.
  if (!left_key_tab_) {
    left_key_ = BoxLeftKey();
  }
  if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
    // TODO(rays) investigate the causes of these error messages, to find
    // out if they are genuinely harmful, or just indicative of junk input.
    tprintf("Computed left-illegal partition\n");
    Print();
  }
  if (!right_key_tab_) {
    right_key_ = BoxRightKey();
  }
  if (right_key_ < BoxRightKey() && textord_debug_bugs) {
    tprintf("Computed right-illegal partition\n");
    Print();
  }
  if (it.empty()) {
    return;
  }
  if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
      blob_type() == BRT_POLYIMAGE) {
    // Images: medians are just the bounding box edges.
    median_top_ = bounding_box_.top();
    median_bottom_ = bounding_box_.bottom();
    median_height_ = bounding_box_.height();
    median_left_ = bounding_box_.left();
    median_right_ = bounding_box_.right();
    median_width_ = bounding_box_.width();
  } else {
    // Text: compute area-weighted medians over the blobs, ignoring leader
    // blobs whenever any non-leader blob is present.
    STATS top_stats(bounding_box_.bottom(), bounding_box_.top());
    STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top());
    STATS height_stats(0, bounding_box_.height());
    STATS left_stats(bounding_box_.left(), bounding_box_.right());
    STATS right_stats(bounding_box_.left(), bounding_box_.right());
    STATS width_stats(0, bounding_box_.width());
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      bbox = it.data();
      if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
        const TBOX &box = bbox->bounding_box();
        int area = box.area();
        top_stats.add(box.top(), area);
        bottom_stats.add(box.bottom(), area);
        height_stats.add(box.height(), area);
        left_stats.add(box.left(), area);
        right_stats.add(box.right(), area);
        width_stats.add(box.width(), area);
      }
    }
    median_top_ = static_cast<int>(top_stats.median() + 0.5);
    median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
    median_height_ = static_cast<int>(height_stats.median() + 0.5);
    median_left_ = static_cast<int>(left_stats.median() + 0.5);
    median_right_ = static_cast<int>(right_stats.median() + 0.5);
    median_width_ = static_cast<int>(width_stats.median() + 0.5);
  }

  if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
    tprintf("Made partition with bad right coords, %d < %d\n", right_margin_,
            bounding_box_.right());
    Print();
  }
  if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
    tprintf("Made partition with bad left coords, %d > %d\n", left_margin_,
            bounding_box_.left());
    Print();
  }
  // Fix partner lists. The bounding box has changed and partners are stored
  // in bounding box order, so remove and reinsert this as a partner
  // of all its partners.
  for (int upper = 0; upper < 2; ++upper) {
    ColPartition_CLIST partners;
    ColPartition_C_IT part_it(&partners);
    part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
    for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
      ColPartition *partner = part_it.extract();
      partner->RemovePartner(!upper, this);
      partner->AddPartner(!upper, this);
    }
  }
  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
                                bounding_box_.bottom())) {
    tprintf("Recomputed box for partition %p\n", static_cast<void *>(this));
    Print();
  }
}

// Returns the number of boxes that overlap the given box.
int ColPartition::CountOverlappingBoxes(const TBOX &box) {
  BLOBNBOX_C_IT it(&boxes_);
  int overlap_count = 0;
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOBNBOX *bbox = it.data();
    if (box.overlap(bbox->bounding_box())) {
      ++overlap_count;
    }
  }
  return overlap_count;
}

// Computes and sets the type_ and first_column_, last_column_ and column_set_.
// resolution refers to the ppi resolution of the image.
void ColPartition::SetPartitionType(int resolution, ColPartitionSet *columns) {
  int first_spanned_col = -1;
  ColumnSpanningType span_type = columns->SpanningType(
      resolution, bounding_box_.left(), bounding_box_.right(),
      std::min(bounding_box_.height(), bounding_box_.width()), MidY(),
      left_margin_, right_margin_, &first_column_, &last_column_,
      &first_spanned_col);
  column_set_ = columns;
  if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
      !IsLineType()) {
    // Unequal columns may indicate that the pullout spans one of the columns
    // it lies in, so force it to be allocated to just that column.
    if (first_spanned_col >= 0) {
      first_column_ = first_spanned_col;
      last_column_ = first_spanned_col;
    } else {
      // No spanned column known: prefer an even (gap) column index, else
      // settle on the midpoint of the range.
      if ((first_column_ & 1) == 0) {
        last_column_ = first_column_;
      } else if ((last_column_ & 1) == 0) {
        first_column_ = last_column_;
      } else {
        first_column_ = last_column_ = (first_column_ + last_column_) / 2;
      }
    }
  }
  type_ = PartitionType(span_type);
}

// Returns the PartitionType from the current BlobRegionType and a column
// flow spanning type ColumnSpanningType, generated by
// ColPartitionSet::SpanningType, that indicates how the partition sits
// in the columns.
PolyBlockType ColPartition::PartitionType(ColumnSpanningType flow) const {
  if (flow == CST_NOISE) {
    // Noise-spanning content is noise unless it is a line/image/vertical
    // text type, which is reclassified as flowing instead.
    if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
        blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT) {
      return PT_NOISE;
    }
    flow = CST_FLOWING;
  }

  switch (blob_type_) {
    case BRT_NOISE:
      return PT_NOISE;
    case BRT_HLINE:
      return PT_HORZ_LINE;
    case BRT_VLINE:
      return PT_VERT_LINE;
    case BRT_RECTIMAGE:
    case BRT_POLYIMAGE:
      switch (flow) {
        case CST_FLOWING:
          return PT_FLOWING_IMAGE;
        case CST_HEADING:
          return PT_HEADING_IMAGE;
        case CST_PULLOUT:
          return PT_PULLOUT_IMAGE;
        default:
          ASSERT_HOST(!"Undefined flow type for image!");
      }
      break;
    case BRT_VERT_TEXT:
      return PT_VERTICAL_TEXT;
    case BRT_TEXT:
    case BRT_UNKNOWN:
    default:
      switch (flow) {
        case CST_FLOWING:
          return PT_FLOWING_TEXT;
        case CST_HEADING:
          return PT_HEADING_TEXT;
        case CST_PULLOUT:
          return PT_PULLOUT_TEXT;
        default:
          ASSERT_HOST(!"Undefined flow type for text!");
      }
  }
  ASSERT_HOST(!"Should never get here!");
  return PT_NOISE;
}

// Returns the first and last column touched by this partition.
// resolution refers to the ppi resolution of the image.
void ColPartition::ColumnRange(int resolution, ColPartitionSet *columns,
                               int *first_col, int *last_col) {
  int first_spanned_col = -1;
  ColumnSpanningType span_type = columns->SpanningType(
      resolution, bounding_box_.left(), bounding_box_.right(),
      std::min(bounding_box_.height(), bounding_box_.width()), MidY(),
      left_margin_, right_margin_, first_col, last_col, &first_spanned_col);
  type_ = PartitionType(span_type);
}

// Sets the internal flags good_width_ and good_column_.
void ColPartition::SetColumnGoodness(const WidthCallback &cb) {
  int y = MidY();
  int width = RightAtY(y) - LeftAtY(y);
  good_width_ = cb(width);
  // A good column is text bounded by tab vectors on both sides.
  good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
}

// Determines whether the blobs in this partition mostly represent
// a leader (fixed pitch sequence) and sets the member blobs accordingly.
// Note that height is assumed to have been tested elsewhere, and that this
// function will find most fixed-pitch text as leader without a height filter.
// Leader detection is limited to sequences of identical width objects,
// such as .... or ----, so patterns, such as .-.-.-.-.
will not be found.\nbool ColPartition::MarkAsLeaderIfMonospaced() {\n  bool result = false;\n  // Gather statistics on the gaps between blobs and the widths of the blobs.\n  int part_width = bounding_box_.width();\n  STATS gap_stats(0, part_width - 1);\n  STATS width_stats(0, part_width - 1);\n  BLOBNBOX_C_IT it(&boxes_);\n  BLOBNBOX *prev_blob = it.data();\n  prev_blob->set_flow(BTFT_NEIGHBOURS);\n  width_stats.add(prev_blob->bounding_box().width(), 1);\n  int blob_count = 1;\n  for (it.forward(); !it.at_first(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    int left = blob->bounding_box().left();\n    int right = blob->bounding_box().right();\n    gap_stats.add(left - prev_blob->bounding_box().right(), 1);\n    width_stats.add(right - left, 1);\n    blob->set_flow(BTFT_NEIGHBOURS);\n    prev_blob = blob;\n    ++blob_count;\n  }\n  double median_gap = gap_stats.median();\n  double median_width = width_stats.median();\n  double max_width = std::max(median_gap, median_width);\n  double min_width = std::min(median_gap, median_width);\n  double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);\n  if (textord_debug_tabfind >= 4) {\n    tprintf(\"gap iqr = %g, blob_count=%d, limits=%g,%g\\n\", gap_iqr, blob_count,\n            max_width * kMaxLeaderGapFractionOfMax,\n            min_width * kMaxLeaderGapFractionOfMin);\n  }\n  if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&\n      gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&\n      blob_count >= kMinLeaderCount) {\n    // This is stable enough to be called a leader, so check the widths.\n    // Since leader dashes can join, run a dp cutting algorithm and go\n    // on the cost.\n    int offset = static_cast<int>(ceil(gap_iqr * 2));\n    int min_step = static_cast<int>(median_gap + median_width + 0.5);\n    int max_step = min_step + offset;\n    min_step -= offset;\n    // Pad the buffer with min_step/2 on each end.\n    int part_left = bounding_box_.left() - min_step / 2;\n    part_width += 
min_step;\n    auto *projection = new DPPoint[part_width];\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      BLOBNBOX *blob = it.data();\n      int left = blob->bounding_box().left();\n      int right = blob->bounding_box().right();\n      int height = blob->bounding_box().height();\n      for (int x = left; x < right; ++x) {\n        projection[left - part_left].AddLocalCost(height);\n      }\n    }\n    DPPoint *best_end =\n        DPPoint::Solve(min_step, max_step, false, &DPPoint::CostWithVariance,\n                       part_width, projection);\n    if (best_end != nullptr && best_end->total_cost() < blob_count) {\n      // Good enough. Call it a leader.\n      result = true;\n      bool modified_blob_list = false;\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        BLOBNBOX *blob = it.data();\n        // If the first or last blob is spaced too much, don't mark it.\n        if (it.at_first()) {\n          int gap = it.data_relative(1)->bounding_box().left() -\n                    blob->bounding_box().right();\n          if (blob->bounding_box().width() + gap > max_step) {\n            it.extract();\n            modified_blob_list = true;\n            continue;\n          }\n        }\n        if (it.at_last()) {\n          int gap = blob->bounding_box().left() -\n                    it.data_relative(-1)->bounding_box().right();\n          if (blob->bounding_box().width() + gap > max_step) {\n            it.extract();\n            modified_blob_list = true;\n            break;\n          }\n        }\n        blob->set_region_type(BRT_TEXT);\n        blob->set_flow(BTFT_LEADER);\n      }\n      if (modified_blob_list) {\n        ComputeLimits();\n      }\n      blob_type_ = BRT_TEXT;\n      flow_ = BTFT_LEADER;\n    } else if (textord_debug_tabfind) {\n      if (best_end == nullptr) {\n        tprintf(\"No path\\n\");\n      } else {\n        tprintf(\"Total cost = %d vs allowed %d\\n\", best_end->total_cost(),\n   
             blob_count);\n      }\n    }\n    delete[] projection;\n  }\n  return result;\n}\n\n// Given the result of TextlineProjection::EvaluateColPartition, (positive for\n// horizontal text, negative for vertical text, and near zero for non-text),\n// sets the blob_type_ and flow_ for this partition to indicate whether it\n// is strongly or weakly vertical or horizontal text, or non-text.\n// The function assumes that the blob neighbours are valid (from\n// StrokeWidth::SetNeighbours) and that those neighbours have their\n// region_type() set.\nvoid ColPartition::SetRegionAndFlowTypesFromProjectionValue(int value) {\n  int blob_count = 0;       // Total # blobs.\n  int good_blob_score_ = 0; // Total # good strokewidth neighbours.\n  int noisy_count = 0;      // Total # neighbours marked as noise.\n  int hline_count = 0;\n  int vline_count = 0;\n  BLOBNBOX_C_IT it(&boxes_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    ++blob_count;\n    noisy_count += blob->NoisyNeighbours();\n    good_blob_score_ += blob->GoodTextBlob();\n    if (blob->region_type() == BRT_HLINE) {\n      ++hline_count;\n    }\n    if (blob->region_type() == BRT_VLINE) {\n      ++vline_count;\n    }\n  }\n  flow_ = BTFT_NEIGHBOURS;\n  blob_type_ = BRT_UNKNOWN;\n  if (hline_count > vline_count) {\n    flow_ = BTFT_NONE;\n    blob_type_ = BRT_HLINE;\n  } else if (vline_count > hline_count) {\n    flow_ = BTFT_NONE;\n    blob_type_ = BRT_VLINE;\n  } else if (value < -1 || 1 < value) {\n    int long_side;\n    int short_side;\n    if (value > 0) {\n      long_side = bounding_box_.width();\n      short_side = bounding_box_.height();\n      blob_type_ = BRT_TEXT;\n    } else {\n      long_side = bounding_box_.height();\n      short_side = bounding_box_.width();\n      blob_type_ = BRT_VERT_TEXT;\n    }\n    // We will combine the old metrics using aspect ratio and blob counts\n    // with the input value by allowing a strong indication to flip 
the\n    // STRONG_CHAIN/CHAIN flow values.\n    int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;\n    if (short_side > kHorzStrongTextlineHeight) {\n      ++strong_score;\n    }\n    if (short_side * kHorzStrongTextlineAspect < long_side) {\n      ++strong_score;\n    }\n    if (abs(value) >= kMinStrongTextValue) {\n      flow_ = BTFT_STRONG_CHAIN;\n    } else if (abs(value) >= kMinChainTextValue) {\n      flow_ = BTFT_CHAIN;\n    } else {\n      flow_ = BTFT_NEIGHBOURS;\n    }\n    // Upgrade chain to strong chain if the other indicators are good\n    if (flow_ == BTFT_CHAIN && strong_score == 3) {\n      flow_ = BTFT_STRONG_CHAIN;\n    }\n    // Downgrade strong vertical text to chain if the indicators are bad.\n    if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2) {\n      flow_ = BTFT_CHAIN;\n    }\n  }\n  if (flow_ == BTFT_NEIGHBOURS) {\n    // Check for noisy neighbours.\n    if (noisy_count >= blob_count) {\n      flow_ = BTFT_NONTEXT;\n      blob_type_ = BRT_NOISE;\n    }\n  }\n  if (TabFind::WithinTestRegion(2, bounding_box_.left(),\n                                bounding_box_.bottom())) {\n    tprintf(\"RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,\",\n            blob_count, noisy_count, good_blob_score_);\n    tprintf(\" Projection value=%d, flow=%d, blob_type=%d\\n\", value, flow_,\n            blob_type_);\n    Print();\n  }\n  SetBlobTypes();\n}\n\n// Sets all blobs with the partition blob type and flow, but never overwrite\n// leader blobs, as we need to be able to identify them later.\nvoid ColPartition::SetBlobTypes() {\n  if (!owns_blobs()) {\n    return;\n  }\n  BLOBNBOX_C_IT it(&boxes_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    if (blob->flow() != BTFT_LEADER) {\n      blob->set_flow(flow_);\n    }\n    blob->set_region_type(blob_type_);\n    ASSERT_HOST(blob->owner() == nullptr || blob->owner() == this);\n  }\n}\n\n// Returns true 
// if a decent baseline can be fitted through the blobs.
// Works for both horizontal and vertical text.
bool ColPartition::HasGoodBaseline() {
  // Approximation of the baseline.
  DetLineFit linepoints;
  // Calculation of the mean height on this line segment. Note that these
  // variable names apply to the context of a horizontal line, and work
  // analogously, rather than literally in the case of a vertical line.
  int total_height = 0;
  int coverage = 0;
  int height_count = 0;
  int width = 0;
  BLOBNBOX_C_IT it(&boxes_);
  TBOX box(it.data()->bounding_box());
  // Accumulate points representing the baseline at the middle of each blob,
  // but add an additional point for each end of the line. This makes it
  // harder to fit a severe skew angle, as it is most likely not right.
  if (IsVerticalType()) {
    // For a vertical line, use the right side as the baseline.
    ICOORD first_pt(box.right(), box.bottom());
    // Use the bottom-right of the first (bottom) box, the top-right of the
    // last, and the middle-right of all others.
    linepoints.Add(first_pt);
    for (it.forward(); !it.at_last(); it.forward()) {
      BLOBNBOX *blob = it.data();
      box = blob->bounding_box();
      ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
      linepoints.Add(box_pt);
      // Width/height swap roles for vertical text.
      total_height += box.width();
      coverage += box.height();
      ++height_count;
    }
    box = it.data()->bounding_box();
    ICOORD last_pt(box.right(), box.top());
    linepoints.Add(last_pt);
    width = last_pt.y() - first_pt.y();

  } else {
    // Horizontal lines use the bottom as the baseline.
    // NOTE(review): this TBOX shadows the function-scope box with the same
    // initial value; harmless but redundant.
    TBOX box(it.data()->bounding_box());
    // Use the bottom-left of the first box, the bottom-right of the last,
    // and the middle of all others.
    ICOORD first_pt(box.left(), box.bottom());
    linepoints.Add(first_pt);
    for (it.forward(); !it.at_last(); it.forward()) {
      BLOBNBOX *blob = it.data();
      box = blob->bounding_box();
      ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
      linepoints.Add(box_pt);
      total_height += box.height();
      coverage += box.width();
      ++height_count;
    }
    box = it.data()->bounding_box();
    ICOORD last_pt(box.right(), box.bottom());
    linepoints.Add(last_pt);
    width = last_pt.x() - first_pt.x();
  }
  // Maximum median error allowed to be a good text line.
  if (height_count == 0) {
    return false;
  }
  double max_error = kMaxBaselineError * total_height / height_count;
  ICOORD start_pt, end_pt;
  double error = linepoints.Fit(&start_pt, &end_pt);
  // The fit must be tight AND the blobs must cover enough of the line.
  return error < max_error && coverage >= kMinBaselineCoverage * width;
}

// Adds this ColPartition to a matching WorkingPartSet if one can be found,
// otherwise starts a new one in the appropriate column, ending the previous.
void ColPartition::AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright,
                                   int resolution,
                                   ColPartition_LIST *used_parts,
                                   WorkingPartSet_LIST *working_sets) {
  if (block_owned_) {
    return; // Done it already.
  }
  block_owned_ = true;
  WorkingPartSet_IT it(working_sets);
  // If there is an upper partner use its working_set_ directly.
  ColPartition *partner = SingletonPartner(true);
  if (partner != nullptr && partner->working_set_ != nullptr) {
    working_set_ = partner->working_set_;
    working_set_->AddPartition(this);
    return;
  }
  if (partner != nullptr && textord_debug_bugs) {
    tprintf("Partition with partner has no working set!:");
    Print();
    partner->Print();
  }
  // Search for the column that the left edge fits in.
  // Advance the iterator first_column_ steps into the working set list.
  WorkingPartSet *work_set = nullptr;
  it.move_to_first();
  int col_index = 0;
  for (it.mark_cycle_pt(); !it.cycled_list() && col_index != first_column_;
       it.forward(), ++col_index) {
    ;
  }
  if (textord_debug_tabfind >= 2) {
    tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
    Print();
  }
  if (it.cycled_list() && textord_debug_bugs) {
    tprintf("Target column=%d, only had %d\n", first_column_, col_index);
  }
  ASSERT_HOST(!it.cycled_list());
  work_set = it.data();
  // If last_column_ != first_column, then we need to scoop up all blocks
  // between here and the last_column_ and put back in work_set.
  if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
    // Find the column that the right edge falls in.
    BLOCK_LIST completed_blocks;
    TO_BLOCK_LIST to_blocks;
    for (; !it.cycled_list() && col_index <= last_column_;
         it.forward(), ++col_index) {
      WorkingPartSet *end_set = it.data();
      end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
                                      &completed_blocks, &to_blocks);
    }
    work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
  }
  working_set_ = work_set;
  work_set->AddPartition(this);
}

// From the given block_parts list, builds one or more BLOCKs and
// corresponding TO_BLOCKs, such that the line spacing is uniform in each.
// Created blocks are appended to the end of completed_blocks and to_blocks.
// The used partitions are put onto used_parts, as they may still be referred
// to in the partition grid.
bleft, tright and resolution are the bounds\n// and resolution of the original image.\nvoid ColPartition::LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright,\n                                     int resolution,\n                                     ColPartition_LIST *block_parts,\n                                     ColPartition_LIST *used_parts,\n                                     BLOCK_LIST *completed_blocks,\n                                     TO_BLOCK_LIST *to_blocks) {\n  int page_height = tright.y() - bleft.y();\n  // Compute the initial spacing stats.\n  ColPartition_IT it(block_parts);\n  int part_count = 0;\n  int max_line_height = 0;\n\n  // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type\n  // because their line spacing with their neighbors maybe smaller and their\n  // height may be slightly larger.\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    ASSERT_HOST(!part->boxes()->empty());\n    STATS side_steps(0, part->bounding_box().height() - 1);\n    if (part->bounding_box().height() > max_line_height) {\n      max_line_height = part->bounding_box().height();\n    }\n    BLOBNBOX_C_IT blob_it(part->boxes());\n    int prev_bottom = blob_it.data()->bounding_box().bottom();\n    for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {\n      BLOBNBOX *blob = blob_it.data();\n      int bottom = blob->bounding_box().bottom();\n      int step = bottom - prev_bottom;\n      if (step < 0) {\n        step = -step;\n      }\n      side_steps.add(step, 1);\n      prev_bottom = bottom;\n    }\n    part->set_side_step(static_cast<int>(side_steps.median() + 0.5));\n    if (!it.at_last()) {\n      ColPartition *next_part = it.data_relative(1);\n      part->set_bottom_spacing(part->median_bottom() -\n                               next_part->median_bottom());\n      part->set_top_spacing(part->median_top() - next_part->median_top());\n    } else {\n      
part->set_bottom_spacing(page_height);\n      part->set_top_spacing(page_height);\n    }\n    if (textord_debug_tabfind) {\n      part->Print();\n      tprintf(\"side step = %.2f, top spacing = %d, bottom spacing=%d\\n\",\n              side_steps.median(), part->top_spacing(), part->bottom_spacing());\n    }\n    ++part_count;\n  }\n  if (part_count == 0) {\n    return;\n  }\n\n  SmoothSpacings(resolution, page_height, block_parts);\n\n  // Move the partitions into individual block lists and make the blocks.\n  BLOCK_IT block_it(completed_blocks);\n  TO_BLOCK_IT to_block_it(to_blocks);\n  ColPartition_LIST spacing_parts;\n  ColPartition_IT sp_block_it(&spacing_parts);\n  int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;\n  for (it.mark_cycle_pt(); !it.empty();) {\n    ColPartition *part = it.extract();\n    sp_block_it.add_to_end(part);\n    it.forward();\n    if (it.empty() || part->bottom_spacing() > same_block_threshold ||\n        !part->SpacingsEqual(*it.data(), resolution)) {\n      // There is a spacing boundary. Check to see if it.data() belongs\n      // better in the current block or the next one.\n      if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {\n        ColPartition *next_part = it.data();\n        // If there is a size match one-way, then the middle line goes with\n        // its matched size, otherwise it goes with the smallest spacing.\n        ColPartition *third_part = it.at_last() ? nullptr : it.data_relative(1);\n        if (textord_debug_tabfind) {\n          tprintf(\n              \"Spacings unequal: upper:%d/%d, lower:%d/%d,\"\n              \" sizes %d %d %d\\n\",\n              part->top_spacing(), part->bottom_spacing(),\n              next_part->top_spacing(), next_part->bottom_spacing(),\n              part->median_height(), next_part->median_height(),\n              third_part != nullptr ? 
third_part->median_height() : 0);\n        }\n        // We can only consider adding the next line to the block if the sizes\n        // match and the lines are close enough for their size.\n        if (part->SizesSimilar(*next_part) &&\n            next_part->median_height() * kMaxSameBlockLineSpacing >\n                part->bottom_spacing() &&\n            part->median_height() * kMaxSameBlockLineSpacing >\n                part->top_spacing()) {\n          // Even now, we can only add it as long as the third line doesn't\n          // match in the same way and have a smaller bottom spacing.\n          if (third_part == nullptr || !next_part->SizesSimilar(*third_part) ||\n              third_part->median_height() * kMaxSameBlockLineSpacing <=\n                  next_part->bottom_spacing() ||\n              next_part->median_height() * kMaxSameBlockLineSpacing <=\n                  next_part->top_spacing() ||\n              next_part->bottom_spacing() > part->bottom_spacing()) {\n            // Add to the current block.\n            sp_block_it.add_to_end(it.extract());\n            it.forward();\n            if (textord_debug_tabfind) {\n              tprintf(\"Added line to current block.\\n\");\n            }\n          }\n        }\n      }\n      TO_BLOCK *to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);\n      if (to_block != nullptr) {\n        to_block_it.add_to_end(to_block);\n        block_it.add_to_end(to_block->block);\n      }\n      sp_block_it.set_to_list(&spacing_parts);\n    } else {\n      if (textord_debug_tabfind && !it.empty()) {\n        ColPartition *next_part = it.data();\n        tprintf(\"Spacings equal: upper:%d/%d, lower:%d/%d, median:%d/%d\\n\",\n                part->top_spacing(), part->bottom_spacing(),\n                next_part->top_spacing(), next_part->bottom_spacing(),\n                part->median_height(), next_part->median_height());\n      }\n    }\n  }\n}\n\n// Helper function to clip the input pos to the 
given bleft, tright bounds.\nstatic void ClipCoord(const ICOORD &bleft, const ICOORD &tright, ICOORD *pos) {\n  if (pos->x() < bleft.x()) {\n    pos->set_x(bleft.x());\n  }\n  if (pos->x() > tright.x()) {\n    pos->set_x(tright.x());\n  }\n  if (pos->y() < bleft.y()) {\n    pos->set_y(bleft.y());\n  }\n  if (pos->y() > tright.y()) {\n    pos->set_y(tright.y());\n  }\n}\n\n// Helper moves the blobs from the given list of block_parts into the block\n// itself. Sets up the block for (old) textline formation correctly for\n// vertical and horizontal text. The partitions are moved to used_parts\n// afterwards, as they cannot be deleted yet.\nstatic TO_BLOCK *MoveBlobsToBlock(bool vertical_text, int line_spacing,\n                                  BLOCK *block, ColPartition_LIST *block_parts,\n                                  ColPartition_LIST *used_parts) {\n  // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.\n  // Move all the parts to a done list as they are no longer needed, except\n  // that have to continue to exist until the part grid is deleted.\n  // Compute the median blob size as we go, as the block needs to know.\n  TBOX block_box(block->pdblk.bounding_box());\n  STATS sizes(0, std::max(block_box.width(), block_box.height()) - 1);\n  bool text_type = block->pdblk.poly_block()->IsText();\n  ColPartition_IT it(block_parts);\n  auto *to_block = new TO_BLOCK(block);\n  BLOBNBOX_IT blob_it(&to_block->blobs);\n  ColPartition_IT used_it(used_parts);\n  for (it.move_to_first(); !it.empty(); it.forward()) {\n    ColPartition *part = it.extract();\n    // Transfer blobs from all regions to the output blocks.\n    // Blobs for non-text regions will be used to define the polygonal\n    // bounds of the region.\n    for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty(); bb_it.forward()) {\n      BLOBNBOX *bblob = bb_it.extract();\n      if (bblob->owner() != part) {\n        tprintf(\"Ownership incorrect for blob:\");\n        
bblob->bounding_box().print();\n        tprintf(\"Part=\");\n        part->Print();\n        if (bblob->owner() == nullptr) {\n          tprintf(\"Not owned\\n\");\n        } else {\n          tprintf(\"Owner part:\");\n          bblob->owner()->Print();\n        }\n      }\n      ASSERT_HOST(bblob->owner() == part);\n      // Assert failure here is caused by arbitrarily changing the partition\n      // type without also changing the blob type, such as in\n      // InsertSmallBlobsAsUnknowns.\n      ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);\n      C_OUTLINE_LIST *outlines = bblob->cblob()->out_list();\n      C_OUTLINE_IT ol_it(outlines);\n      ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);\n      if (vertical_text) {\n        sizes.add(bblob->bounding_box().width(), 1);\n      } else {\n        sizes.add(bblob->bounding_box().height(), 1);\n      }\n      blob_it.add_after_then_move(bblob);\n    }\n    used_it.add_to_end(part);\n  }\n  if (text_type && blob_it.empty()) {\n    delete block;\n    delete to_block;\n    return nullptr;\n  }\n  to_block->line_size = sizes.median();\n  if (vertical_text) {\n    int block_width = block->pdblk.bounding_box().width();\n    if (block_width < line_spacing) {\n      line_spacing = block_width;\n    }\n    to_block->line_spacing = static_cast<float>(line_spacing);\n    to_block->max_blob_size = static_cast<float>(block_width + 1);\n  } else {\n    int block_height = block->pdblk.bounding_box().height();\n    if (block_height < line_spacing) {\n      line_spacing = block_height;\n    }\n    to_block->line_spacing = static_cast<float>(line_spacing);\n    to_block->max_blob_size = static_cast<float>(block_height + 1);\n  }\n  return to_block;\n}\n\n// Constructs a block from the given list of partitions.\n// Arguments are as LineSpacingBlocks above.\nTO_BLOCK *ColPartition::MakeBlock(const ICOORD &bleft, const ICOORD &tright,\n                                  ColPartition_LIST *block_parts,\n    
                              ColPartition_LIST *used_parts) {\n  if (block_parts->empty()) {\n    return nullptr; // Nothing to do.\n  }\n  // If the block_parts are not in reading order, then it will make an invalid\n  // block polygon and bounding_box, so sort by bounding box now just to make\n  // sure.\n  block_parts->sort(&ColPartition::SortByBBox);\n  ColPartition_IT it(block_parts);\n  ColPartition *part = it.data();\n  PolyBlockType type = part->type();\n  if (type == PT_VERTICAL_TEXT) {\n    return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);\n  }\n  // LineSpacingBlocks has handed us a collection of evenly spaced lines and\n  // put the average spacing in each partition, so we can just take the\n  // linespacing from the first partition.\n  int line_spacing = part->bottom_spacing();\n  if (line_spacing < part->median_height()) {\n    line_spacing = part->bounding_box().height();\n  }\n  ICOORDELT_LIST vertices;\n  ICOORDELT_IT vert_it(&vertices);\n  ICOORD start, end;\n  int min_x = INT32_MAX;\n  int max_x = -INT32_MAX;\n  int min_y = INT32_MAX;\n  int max_y = -INT32_MAX;\n  int iteration = 0;\n  do {\n    if (iteration == 0) {\n      ColPartition::LeftEdgeRun(&it, &start, &end);\n    } else {\n      ColPartition::RightEdgeRun(&it, &start, &end);\n    }\n    ClipCoord(bleft, tright, &start);\n    ClipCoord(bleft, tright, &end);\n    vert_it.add_after_then_move(new ICOORDELT(start));\n    vert_it.add_after_then_move(new ICOORDELT(end));\n    UpdateRange(start.x(), &min_x, &max_x);\n    UpdateRange(end.x(), &min_x, &max_x);\n    UpdateRange(start.y(), &min_y, &max_y);\n    UpdateRange(end.y(), &min_y, &max_y);\n    if ((iteration == 0 && it.at_first()) || (iteration == 1 && it.at_last())) {\n      ++iteration;\n      it.move_to_last();\n    }\n  } while (iteration < 2);\n  if (textord_debug_tabfind) {\n    tprintf(\"Making block at (%d,%d)->(%d,%d)\\n\", min_x, min_y, max_x, max_y);\n  }\n  auto *block = new BLOCK(\"\", true, 0, 0, min_x, 
min_y, max_x, max_y);\n  block->pdblk.set_poly_block(new POLY_BLOCK(&vertices, type));\n  return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);\n}\n\n// Constructs a block from the given list of vertical text partitions.\n// Currently only creates rectangular blocks.\nTO_BLOCK *ColPartition::MakeVerticalTextBlock(const ICOORD &bleft,\n                                              const ICOORD &tright,\n                                              ColPartition_LIST *block_parts,\n                                              ColPartition_LIST *used_parts) {\n  if (block_parts->empty()) {\n    return nullptr; // Nothing to do.\n  }\n  ColPartition_IT it(block_parts);\n  ColPartition *part = it.data();\n  TBOX block_box = part->bounding_box();\n  int line_spacing = block_box.width();\n  PolyBlockType type = it.data()->type();\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    block_box += it.data()->bounding_box();\n  }\n  if (textord_debug_tabfind) {\n    tprintf(\"Making block at:\");\n    block_box.print();\n  }\n  auto *block = new BLOCK(\"\", true, 0, 0, block_box.left(), block_box.bottom(),\n                          block_box.right(), block_box.top());\n  block->pdblk.set_poly_block(new POLY_BLOCK(block_box, type));\n  return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);\n}\n\n// Makes a TO_ROW matching this and moves all the blobs to it, transferring\n// ownership to returned TO_ROW.\nTO_ROW *ColPartition::MakeToRow() {\n  BLOBNBOX_C_IT blob_it(&boxes_);\n  TO_ROW *row = nullptr;\n  int line_size = IsVerticalType() ? 
median_width_ : median_height_;\n  // Add all the blobs to a single TO_ROW.\n  for (; !blob_it.empty(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.extract();\n    //    blob->compute_bounding_box();\n    int top = blob->bounding_box().top();\n    int bottom = blob->bounding_box().bottom();\n    if (row == nullptr) {\n      row =\n          new TO_ROW(blob, static_cast<float>(top), static_cast<float>(bottom),\n                     static_cast<float>(line_size));\n    } else {\n      row->add_blob(blob, static_cast<float>(top), static_cast<float>(bottom),\n                    static_cast<float>(line_size));\n    }\n  }\n  return row;\n}\n\n// Returns a copy of everything except the list of boxes. The resulting\n// ColPartition is only suitable for keeping in a column candidate list.\nColPartition *ColPartition::ShallowCopy() const {\n  auto *part = new ColPartition(blob_type_, vertical_);\n  part->left_margin_ = left_margin_;\n  part->right_margin_ = right_margin_;\n  part->bounding_box_ = bounding_box_;\n  memcpy(part->special_blobs_densities_, special_blobs_densities_,\n         sizeof(special_blobs_densities_));\n  part->median_bottom_ = median_bottom_;\n  part->median_top_ = median_top_;\n  part->median_height_ = median_height_;\n  part->median_left_ = median_left_;\n  part->median_right_ = median_right_;\n  part->median_width_ = median_width_;\n  part->good_width_ = good_width_;\n  part->good_column_ = good_column_;\n  part->left_key_tab_ = left_key_tab_;\n  part->right_key_tab_ = right_key_tab_;\n  part->type_ = type_;\n  part->flow_ = flow_;\n  part->left_key_ = left_key_;\n  part->right_key_ = right_key_;\n  part->first_column_ = first_column_;\n  part->last_column_ = last_column_;\n  part->owns_blobs_ = false;\n  return part;\n}\n\nColPartition *ColPartition::CopyButDontOwnBlobs() {\n  ColPartition *copy = ShallowCopy();\n  copy->set_owns_blobs(false);\n  BLOBNBOX_C_IT inserter(copy->boxes());\n  BLOBNBOX_C_IT traverser(boxes());\n  for 
(traverser.mark_cycle_pt(); !traverser.cycled_list();\n       traverser.forward()) {\n    inserter.add_after_then_move(traverser.data());\n  }\n  return copy;\n}\n\n#ifndef GRAPHICS_DISABLED\n// Provides a color for BBGrid to draw the rectangle.\n// Must be kept in sync with PolyBlockType.\nScrollView::Color ColPartition::BoxColor() const {\n  if (type_ == PT_UNKNOWN) {\n    return BLOBNBOX::TextlineColor(blob_type_, flow_);\n  }\n  return POLY_BLOCK::ColorForPolyBlockType(type_);\n}\n#endif // !GRAPHICS_DISABLED\n\n// Keep in sync with BlobRegionType.\nstatic char kBlobTypes[BRT_COUNT + 1] = \"NHSRIUVT\";\n\n// Prints debug information on this.\nvoid ColPartition::Print() const {\n  int y = MidY();\n  tprintf(\n      \"ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)\"\n      \" w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d\"\n      \" ts=%d bs=%d ls=%d rs=%d\\n\",\n      boxes_.empty() ? 'E' : ' ', left_margin_, left_key_tab_ ? 'T' : 'B',\n      LeftAtY(y), bounding_box_.left(), median_left_, bounding_box_.bottom(),\n      median_bottom_, bounding_box_.right(), RightAtY(y),\n      right_key_tab_ ? 
'T' : 'B', right_margin_, median_right_,\n      bounding_box_.top(), median_top_, good_width_, good_column_, type_,\n      kBlobTypes[blob_type_], flow_, first_column_, last_column_,\n      boxes_.length(), space_above_, space_below_, space_to_left_,\n      space_to_right_);\n}\n\n// Prints debug information on the colors.\nvoid ColPartition::PrintColors() {\n  tprintf(\"Colors:(%d, %d, %d)%d -> (%d, %d, %d)\\n\", color1_[COLOR_RED],\n          color1_[COLOR_GREEN], color1_[COLOR_BLUE], color1_[L_ALPHA_CHANNEL],\n          color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);\n}\n\n// Sets the types of all partitions in the run to be the max of the types.\nvoid ColPartition::SmoothPartnerRun(int working_set_count) {\n  STATS left_stats(0, working_set_count - 1);\n  STATS right_stats(0, working_set_count - 1);\n  PolyBlockType max_type = type_;\n  ColPartition *partner;\n  for (partner = SingletonPartner(false); partner != nullptr;\n       partner = partner->SingletonPartner(false)) {\n    if (partner->type_ > max_type) {\n      max_type = partner->type_;\n    }\n    if (column_set_ == partner->column_set_) {\n      left_stats.add(partner->first_column_, 1);\n      right_stats.add(partner->last_column_, 1);\n    }\n  }\n  type_ = max_type;\n  // TODO(rays) Either establish that it isn't necessary to set the columns,\n  // or find a way to do it that does not cause an assert failure in\n  // AddToWorkingSet.\n#if 0\n  first_column_ = left_stats.mode();\n  last_column_ = right_stats.mode();\n  if (last_column_ < first_column_)\n    last_column_ = first_column_;\n#endif\n\n  for (partner = SingletonPartner(false); partner != nullptr;\n       partner = partner->SingletonPartner(false)) {\n    partner->type_ = max_type;\n#if 0 // See TODO above\n    if (column_set_ == partner->column_set_) {\n      partner->first_column_ = first_column_;\n      partner->last_column_ = last_column_;\n    }\n#endif\n  }\n}\n\n// ======= Scenario common to all Refine*Partners* 
functions =======\n// ColPartitions are aiming to represent textlines, or horizontal slices\n// of images, and we are trying to form bi-directional (upper/lower) chains\n// of UNIQUE partner ColPartitions that can be made into blocks.\n// The ColPartitions have previously been typed (see SetPartitionType)\n// according to a combination of the content type and\n// how they lie on the columns. We want to chain text into\n// groups of a single type, but image ColPartitions may have been typed\n// differently in different parts of the image, due to being non-rectangular.\n//\n// We previously ran a search for upper and lower partners, but there may\n// be more than one, and they may be of mixed types, so now we wish to\n// refine the partners down to at most one.\n// A heading may have multiple partners:\n// ===============================\n// ========  ==========  =========\n// ========  ==========  =========\n// but it should be a different type.\n// A regular flowing text line may have multiple partners:\n// ==================   ===================\n// =======   =================  ===========\n// This could be the start of a pull-out, or it might all be in a single\n// column and might be caused by tightly spaced text, bold words, bullets,\n// funny punctuation etc, all of which can cause textlines to be split into\n// multiple ColPartitions. Pullouts and figure captions should now be different\n// types so we can more aggressively merge groups of partners that all sit\n// in a single column.\n//\n// Cleans up the partners of the given type so that there is at most\n// one partner. 
This makes block creation simpler.\n// If get_desperate is true, goes to more desperate merge methods\n// to merge flowing text before breaking partnerships.\nvoid ColPartition::RefinePartners(PolyBlockType type, bool get_desperate,\n                                  ColPartitionGrid *grid) {\n  if (TypesSimilar(type_, type)) {\n    RefinePartnersInternal(true, get_desperate, grid);\n    RefinePartnersInternal(false, get_desperate, grid);\n  } else if (type == PT_COUNT) {\n    // This is the final pass. Make sure only the correctly typed\n    // partners surivive, however many there are.\n    RefinePartnersByType(true, &upper_partners_);\n    RefinePartnersByType(false, &lower_partners_);\n    // It is possible for a merge to have given a partition multiple\n    // partners again, so the last resort is to use overlap which is\n    // guaranteed to leave at most one partner left.\n    if (!upper_partners_.empty() && !upper_partners_.singleton()) {\n      RefinePartnersByOverlap(true, &upper_partners_);\n    }\n    if (!lower_partners_.empty() && !lower_partners_.singleton()) {\n      RefinePartnersByOverlap(false, &lower_partners_);\n    }\n  }\n}\n\n////////////////// PRIVATE CODE /////////////////////////////\n\n// Cleans up the partners above if upper is true, else below.\n// If get_desperate is true, goes to more desperate merge methods\n// to merge flowing text before breaking partnerships.\nvoid ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,\n                                          ColPartitionGrid *grid) {\n  ColPartition_CLIST *partners = upper ? 
&upper_partners_ : &lower_partners_;\n  if (!partners->empty() && !partners->singleton()) {\n    RefinePartnersByType(upper, partners);\n    if (!partners->empty() && !partners->singleton()) {\n      // Check for transitive partnerships and break the cycle.\n      RefinePartnerShortcuts(upper, partners);\n      if (!partners->empty() && !partners->singleton()) {\n        // Types didn't fix it. Flowing text keeps the one with the longest\n        // sequence of singleton matching partners. All others max overlap.\n        if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {\n          RefineTextPartnersByMerge(upper, false, partners, grid);\n          if (!partners->empty() && !partners->singleton()) {\n            RefineTextPartnersByMerge(upper, true, partners, grid);\n          }\n        }\n        // The last resort is to use overlap.\n        if (!partners->empty() && !partners->singleton()) {\n          RefinePartnersByOverlap(upper, partners);\n        }\n      }\n    }\n  }\n}\n\n// Cleans up the partners above if upper is true, else below.\n// Restricts the partners to only desirable types. For text and BRT_HLINE this\n// means the same type_ , and for image types it means any image type.\nvoid ColPartition::RefinePartnersByType(bool upper,\n                                        ColPartition_CLIST *partners) {\n  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),\n                                         bounding_box_.bottom());\n  if (debug) {\n    tprintf(\"Refining %d %s partners by type for:\\n\", partners->length(),\n            upper ? 
\"Upper\" : \"Lower\");\n    Print();\n  }\n  ColPartition_C_IT it(partners);\n  // Purify text by type.\n  if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {\n    // Keep only partners matching type_.\n    // Exception: PT_VERTICAL_TEXT is allowed to stay with the other\n    // text types if it is the only partner.\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      ColPartition *partner = it.data();\n      if (!TypesSimilar(type_, partner->type_)) {\n        if (debug) {\n          tprintf(\"Removing partner:\");\n          partner->Print();\n        }\n        partner->RemovePartner(!upper, this);\n        it.extract();\n      } else if (debug) {\n        tprintf(\"Keeping partner:\");\n        partner->Print();\n      }\n    }\n  } else {\n    // Only polyimages are allowed to have partners of any kind!\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      ColPartition *partner = it.data();\n      if (partner->blob_type() != BRT_POLYIMAGE ||\n          blob_type() != BRT_POLYIMAGE) {\n        if (debug) {\n          tprintf(\"Removing partner:\");\n          partner->Print();\n        }\n        partner->RemovePartner(!upper, this);\n        it.extract();\n      } else if (debug) {\n        tprintf(\"Keeping partner:\");\n        partner->Print();\n      }\n    }\n  }\n}\n\n// Cleans up the partners above if upper is true, else below.\n// Remove transitive partnerships: this<->a, and a<->b and this<->b.\n// Gets rid of this<->b, leaving a clean chain.\n// Also if we have this<->a and a<->this, then gets rid of this<->a, as\n// this has multiple partners.\nvoid ColPartition::RefinePartnerShortcuts(bool upper,\n                                          ColPartition_CLIST *partners) {\n  bool done_any = false;\n  do {\n    done_any = false;\n    ColPartition_C_IT it(partners);\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      ColPartition *a = it.data();\n      // Check for a match between all 
of a's partners (it1/b1) and all\n      // of this's partners (it2/b2).\n      ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);\n      for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {\n        ColPartition *b1 = it1.data();\n        if (b1 == this) {\n          done_any = true;\n          it.extract();\n          a->RemovePartner(!upper, this);\n          break;\n        }\n        ColPartition_C_IT it2(partners);\n        for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {\n          ColPartition *b2 = it2.data();\n          if (b1 == b2) {\n            // Jackpot! b2 should not be a partner of this.\n            it2.extract();\n            b2->RemovePartner(!upper, this);\n            done_any = true;\n            // That potentially invalidated all the iterators, so break out\n            // and start again.\n            break;\n          }\n        }\n        if (done_any) {\n          break;\n        }\n      }\n      if (done_any) {\n        break;\n      }\n    }\n  } while (done_any && !partners->empty() && !partners->singleton());\n}\n\n// Cleans up the partners above if upper is true, else below.\n// If multiple text partners can be merged, (with each other, NOT with this),\n// then do so.\n// If desperate is true, then an increase in overlap with the merge is\n// allowed. 
If the overlap increases, then the desperately_merged_ flag\n// is set, indicating that the textlines probably need to be regenerated\n// by aggressive line fitting/splitting, as there are probably vertically\n// joined blobs that cross textlines.\nvoid ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,\n                                             ColPartition_CLIST *partners,\n                                             ColPartitionGrid *grid) {\n  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),\n                                         bounding_box_.bottom());\n  if (debug) {\n    tprintf(\"Refining %d %s partners by merge for:\\n\", partners->length(),\n            upper ? \"Upper\" : \"Lower\");\n    Print();\n  }\n  while (!partners->empty() && !partners->singleton()) {\n    // Absorb will mess up the iterators, so we have to merge one partition\n    // at a time and rebuild the iterators each time.\n    ColPartition_C_IT it(partners);\n    ColPartition *part = it.data();\n    // Gather a list of merge candidates, from the list of partners, that\n    // are all in the same single column. 
See general scenario comment above.\n    ColPartition_CLIST candidates;\n    ColPartition_C_IT cand_it(&candidates);\n    for (it.forward(); !it.at_first(); it.forward()) {\n      ColPartition *candidate = it.data();\n      if (part->first_column_ == candidate->last_column_ &&\n          part->last_column_ == candidate->first_column_) {\n        cand_it.add_after_then_move(it.data());\n      }\n    }\n    int overlap_increase;\n    ColPartition *candidate = grid->BestMergeCandidate(\n        part, &candidates, debug, nullptr, &overlap_increase);\n    if (candidate != nullptr && (overlap_increase <= 0 || desperate)) {\n      if (debug) {\n        tprintf(\"Merging:hoverlap=%d, voverlap=%d, OLI=%d\\n\",\n                part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),\n                overlap_increase);\n      }\n      // Remove before merge and re-insert to keep the integrity of the grid.\n      grid->RemoveBBox(candidate);\n      grid->RemoveBBox(part);\n      part->Absorb(candidate, nullptr);\n      // We modified the box of part, so re-insert it into the grid.\n      grid->InsertBBox(true, true, part);\n      if (overlap_increase > 0) {\n        part->desperately_merged_ = true;\n      }\n    } else {\n      break; // Can't merge.\n    }\n  }\n}\n\n// Cleans up the partners above if upper is true, else below.\n// Keep the partner with the biggest overlap.\nvoid ColPartition::RefinePartnersByOverlap(bool upper,\n                                           ColPartition_CLIST *partners) {\n  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),\n                                         bounding_box_.bottom());\n  if (debug) {\n    tprintf(\"Refining %d %s partners by overlap for:\\n\", partners->length(),\n            upper ? 
\"Upper\" : \"Lower\");\n    Print();\n  }\n  ColPartition_C_IT it(partners);\n  ColPartition *best_partner = it.data();\n  // Find the partner with the best overlap.\n  int best_overlap = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *partner = it.data();\n    int overlap =\n        std::min(bounding_box_.right(), partner->bounding_box_.right()) -\n        std::max(bounding_box_.left(), partner->bounding_box_.left());\n    if (overlap > best_overlap) {\n      best_overlap = overlap;\n      best_partner = partner;\n    }\n  }\n  // Keep only the best partner.\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *partner = it.data();\n    if (partner != best_partner) {\n      if (debug) {\n        tprintf(\"Removing partner:\");\n        partner->Print();\n      }\n      partner->RemovePartner(!upper, this);\n      it.extract();\n    }\n  }\n}\n\n// Return true if bbox belongs better in this than other.\nbool ColPartition::ThisPartitionBetter(BLOBNBOX *bbox,\n                                       const ColPartition &other) {\n  const TBOX &box = bbox->bounding_box();\n  // Margins take priority.\n  int left = box.left();\n  int right = box.right();\n  if (left < left_margin_ || right > right_margin_) {\n    return false;\n  }\n  if (left < other.left_margin_ || right > other.right_margin_) {\n    return true;\n  }\n  int top = box.top();\n  int bottom = box.bottom();\n  int this_overlap =\n      std::min(top, median_top_) - std::max(bottom, median_bottom_);\n  int other_overlap =\n      std::min(top, other.median_top_) - std::max(bottom, other.median_bottom_);\n  int this_miss = median_top_ - median_bottom_ - this_overlap;\n  int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;\n  if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {\n    tprintf(\"Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\\n\",\n            box.left(), box.bottom(), box.right(), 
box.top(), this_overlap,\n            other_overlap, this_miss, other_miss, median_top_,\n            other.median_top_);\n  }\n  if (this_miss < other_miss) {\n    return true;\n  }\n  if (this_miss > other_miss) {\n    return false;\n  }\n  if (this_overlap > other_overlap) {\n    return true;\n  }\n  if (this_overlap < other_overlap) {\n    return false;\n  }\n  return median_top_ >= other.median_top_;\n}\n\n// Returns the median line-spacing between the current position and the end\n// of the list.\n// The iterator is passed by value so the iteration does not modify the\n// caller's iterator.\nstatic int MedianSpacing(int page_height, ColPartition_IT it) {\n  STATS stats(0, page_height - 1);\n  while (!it.cycled_list()) {\n    ColPartition *part = it.data();\n    it.forward();\n    stats.add(part->bottom_spacing(), 1);\n    stats.add(part->top_spacing(), 1);\n  }\n  return static_cast<int>(stats.median() + 0.5);\n}\n\n// Returns true if this column partition is in the same column as\n// part. This function will only work after the SetPartitionType function\n// has been called on both column partitions. This is useful for\n// doing a SideSearch when you want things in the same page column.\n//\n// Currently called by the table detection code to identify if potential table\n// partitions exist in the same column.\nbool ColPartition::IsInSameColumnAs(const ColPartition &part) const {\n  // Overlap does not occur when last < part.first or first > part.last.\n  // In other words, one is completely to the side of the other.\n  // This is just DeMorgan's law applied to that so the function returns true.\n  return (last_column_ >= part.first_column_) &&\n         (first_column_ <= part.last_column_);\n}\n\n// Smoothes the spacings in the list into groups of equal linespacing.\n// resolution is the resolution of the original image, used as a basis\n// for thresholds in change of spacing. 
page_height is in pixels.\nvoid ColPartition::SmoothSpacings(int resolution, int page_height,\n                                  ColPartition_LIST *parts) {\n  // The task would be trivial if we didn't have to allow for blips -\n  // occasional offsets in spacing caused by anomalous text, such as all\n  // caps, groups of descenders, joined words, Arabic etc.\n  // The neighbourhood stores a consecutive group of partitions so that\n  // blips can be detected correctly, yet conservatively enough to not\n  // mistake genuine spacing changes for blips. See example below.\n  ColPartition *neighbourhood[PN_COUNT];\n  ColPartition_IT it(parts);\n  it.mark_cycle_pt();\n  // Although we know nothing about the spacings is this list, the median is\n  // used as an approximation to allow blips.\n  // If parts of this block aren't spaced to the median, then we can't\n  // accept blips in those parts, but we'll recalculate it each time we\n  // split the block, so the median becomes more likely to match all the text.\n  int median_space = MedianSpacing(page_height, it);\n  ColPartition_IT start_it(it);\n  ColPartition_IT end_it(it);\n  for (int i = 0; i < PN_COUNT; ++i) {\n    if (i < PN_UPPER || it.cycled_list()) {\n      neighbourhood[i] = nullptr;\n    } else {\n      if (i == PN_LOWER) {\n        end_it = it;\n      }\n      neighbourhood[i] = it.data();\n      it.forward();\n    }\n  }\n  while (neighbourhood[PN_UPPER] != nullptr) {\n    // Test for end of a group. Normally SpacingsEqual is true within a group,\n    // but in the case of a blip, it will be false. 
Here is an example:\n    // Line enum   Spacing below (spacing between tops of lines)\n    //  1   ABOVE2    20\n    //  2   ABOVE1    20\n    //  3   UPPER     15\n    //  4   LOWER     25\n    //  5   BELOW1    20\n    //  6   BELOW2    20\n    // Line 4 is all in caps (regular caps), so the spacing between line 3\n    // and line 4 (looking at the tops) is smaller than normal, and the\n    // spacing between line 4 and line 5 is larger than normal, but the\n    // two of them add to twice the normal spacing.\n    // The following if has to accept unequal spacings 3 times to pass the\n    // blip (20/15, 15/25 and 25/20)\n    // When the blip is in the middle, OKSpacingBlip tests that one of\n    // ABOVE1 and BELOW1 matches the median.\n    // The first time, everything is shifted down 1, so we present\n    // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.\n    // The last time, everything is shifted up 1, so we present OKSpacingBlip\n    // with neighbourhood-1 and check that PN_LOWER matches the median.\n    if (neighbourhood[PN_LOWER] == nullptr ||\n        (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],\n                                                 resolution) &&\n         (neighbourhood[PN_UPPER] == nullptr ||\n          neighbourhood[PN_LOWER] == nullptr ||\n          !OKSpacingBlip(resolution, median_space, neighbourhood, 0)) &&\n         (neighbourhood[PN_UPPER - 1] == nullptr ||\n          neighbourhood[PN_LOWER - 1] == nullptr ||\n          !OKSpacingBlip(resolution, median_space, neighbourhood, -1) ||\n          !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&\n         (neighbourhood[PN_UPPER + 1] == nullptr ||\n          neighbourhood[PN_LOWER + 1] == nullptr ||\n          !OKSpacingBlip(resolution, median_space, neighbourhood, 1) ||\n          !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {\n      // The group has ended. 
PN_UPPER is the last member.\n      // Compute the mean spacing over the group.\n      ColPartition_IT sum_it(start_it);\n      ColPartition *last_part = neighbourhood[PN_UPPER];\n      double total_bottom = 0.0;\n      double total_top = 0.0;\n      int total_count = 0;\n      ColPartition *upper = sum_it.data();\n      // We do not process last_part, as its spacing is different.\n      while (upper != last_part) {\n        total_bottom += upper->bottom_spacing();\n        total_top += upper->top_spacing();\n        ++total_count;\n        sum_it.forward();\n        upper = sum_it.data();\n      }\n      if (total_count > 0) {\n        // There were at least 2 lines, so set them all to the mean.\n        int top_spacing = static_cast<int>(total_top / total_count + 0.5);\n        int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);\n        if (textord_debug_tabfind) {\n          tprintf(\"Spacing run ended. Cause:\");\n          if (neighbourhood[PN_LOWER] == nullptr) {\n            tprintf(\"No more lines\\n\");\n          } else {\n            tprintf(\"Spacing change. 
Spacings:\\n\");\n            for (int i = 0; i < PN_COUNT; ++i) {\n              if (neighbourhood[i] == nullptr) {\n                tprintf(\"NULL\");\n                if (i > 0 && neighbourhood[i - 1] != nullptr) {\n                  if (neighbourhood[i - 1]->SingletonPartner(false) !=\n                      nullptr) {\n                    tprintf(\" Lower partner:\");\n                    neighbourhood[i - 1]->SingletonPartner(false)->Print();\n                  } else {\n                    tprintf(\" nullptr lower partner:\\n\");\n                  }\n                } else {\n                  tprintf(\"\\n\");\n                }\n              } else {\n                tprintf(\"Top = %d, bottom = %d\\n\",\n                        neighbourhood[i]->top_spacing(),\n                        neighbourhood[i]->bottom_spacing());\n              }\n            }\n          }\n          tprintf(\"Mean spacing = %d/%d\\n\", top_spacing, bottom_spacing);\n        }\n        sum_it = start_it;\n        upper = sum_it.data();\n        while (upper != last_part) {\n          upper->set_top_spacing(top_spacing);\n          upper->set_bottom_spacing(bottom_spacing);\n          if (textord_debug_tabfind) {\n            tprintf(\"Setting mean on:\");\n            upper->Print();\n          }\n          sum_it.forward();\n          upper = sum_it.data();\n        }\n      }\n      // PN_LOWER starts the next group and end_it is the next start_it.\n      start_it = end_it;\n      // Recalculate the median spacing to maximize the chances of detecting\n      // spacing blips.\n      median_space = MedianSpacing(page_height, end_it);\n    }\n    // Shuffle pointers.\n    for (int j = 1; j < PN_COUNT; ++j) {\n      neighbourhood[j - 1] = neighbourhood[j];\n    }\n    if (it.cycled_list()) {\n      neighbourhood[PN_COUNT - 1] = nullptr;\n    } else {\n      neighbourhood[PN_COUNT - 1] = it.data();\n      it.forward();\n    }\n    end_it.forward();\n  }\n}\n\n// Returns true if the 
parts array of pointers to partitions matches the\n// condition for a spacing blip. See SmoothSpacings for what this means\n// and how it is used.\nbool ColPartition::OKSpacingBlip(int resolution, int median_spacing,\n                                 ColPartition **parts, int offset) {\n  // The blip is OK if upper and lower sum to an OK value and at least\n  // one of above1 and below1 is equal to the median.\n  parts += offset;\n  return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER], median_spacing,\n                                          resolution) &&\n         ((parts[PN_ABOVE1] != nullptr &&\n           parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||\n          (parts[PN_BELOW1] != nullptr &&\n           parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));\n}\n\n// Returns true if both the top and bottom spacings of this match the given\n// spacing to within suitable margins dictated by the image resolution.\nbool ColPartition::SpacingEqual(int spacing, int resolution) const {\n  int bottom_error = BottomSpacingMargin(resolution);\n  int top_error = TopSpacingMargin(resolution);\n  return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&\n         NearlyEqual(top_spacing_, spacing, top_error);\n}\n\n// Returns true if both the top and bottom spacings of this and other\n// match to within suitable margins dictated by the image resolution.\nbool ColPartition::SpacingsEqual(const ColPartition &other,\n                                 int resolution) const {\n  int bottom_error = std::max(BottomSpacingMargin(resolution),\n                              other.BottomSpacingMargin(resolution));\n  int top_error = std::max(TopSpacingMargin(resolution),\n                           other.TopSpacingMargin(resolution));\n  return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&\n         (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||\n          NearlyEqual(top_spacing_ + other.top_spacing_, 
bottom_spacing_ * 2,\n                      bottom_error));\n}\n\n// Returns true if the sum spacing of this and other match the given\n// spacing (or twice the given spacing) to within a suitable margin dictated\n// by the image resolution.\nbool ColPartition::SummedSpacingOK(const ColPartition &other, int spacing,\n                                   int resolution) const {\n  int bottom_error = std::max(BottomSpacingMargin(resolution),\n                              other.BottomSpacingMargin(resolution));\n  int top_error = std::max(TopSpacingMargin(resolution),\n                           other.TopSpacingMargin(resolution));\n  int bottom_total = bottom_spacing_ + other.bottom_spacing_;\n  int top_total = top_spacing_ + other.top_spacing_;\n  return (NearlyEqual(spacing, bottom_total, bottom_error) &&\n          NearlyEqual(spacing, top_total, top_error)) ||\n         (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&\n          NearlyEqual(spacing * 2, top_total, top_error));\n}\n\n// Returns a suitable spacing margin that can be applied to bottoms of\n// text lines, based on the resolution and the stored side_step_.\nint ColPartition::BottomSpacingMargin(int resolution) const {\n  return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;\n}\n\n// Returns a suitable spacing margin that can be applied to tops of\n// text lines, based on the resolution and the stored side_step_.\nint ColPartition::TopSpacingMargin(int resolution) const {\n  return static_cast<int>(kMaxTopSpacingFraction * median_height_ + 0.5) +\n         BottomSpacingMargin(resolution);\n}\n\n// Returns true if the median text sizes of this and other agree to within\n// a reasonable multiplicative factor.\nbool ColPartition::SizesSimilar(const ColPartition &other) const {\n  return median_height_ <= other.median_height_ * kMaxSizeRatio &&\n         other.median_height_ <= median_height_ * kMaxSizeRatio;\n}\n\n// Helper updates margin_left and margin_right, being the bounds 
of the left\n// margin of part of a block. Returns false and does not update the bounds if\n// this partition has a disjoint margin with the established margin.\nstatic bool UpdateLeftMargin(const ColPartition &part, int *margin_left,\n                             int *margin_right) {\n  const TBOX &part_box = part.bounding_box();\n  int top = part_box.top();\n  int bottom = part_box.bottom();\n  int tl_key = part.SortKey(part.left_margin(), top);\n  int tr_key = part.SortKey(part_box.left(), top);\n  int bl_key = part.SortKey(part.left_margin(), bottom);\n  int br_key = part.SortKey(part_box.left(), bottom);\n  int left_key = std::max(tl_key, bl_key);\n  int right_key = std::min(tr_key, br_key);\n  if (left_key <= *margin_right && right_key >= *margin_left) {\n    // This part is good - let's keep it.\n    *margin_right = std::min(*margin_right, right_key);\n    *margin_left = std::max(*margin_left, left_key);\n    return true;\n  }\n  return false;\n}\n\n// Computes and returns in start, end a line segment formed from a\n// forwards-iterated group of left edges of partitions that satisfy the\n// condition that the intersection of the left margins is non-empty, ie the\n// rightmost left margin is to the left of the leftmost left bounding box edge.\n// On return the iterator is set to the start of the next run.\nvoid ColPartition::LeftEdgeRun(ColPartition_IT *part_it, ICOORD *start,\n                               ICOORD *end) {\n  ColPartition *part = part_it->data();\n  ColPartition *start_part = part;\n  int start_y = part->bounding_box_.top();\n  if (!part_it->at_first()) {\n    int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();\n    if (prev_bottom < start_y) {\n      start_y = prev_bottom;\n    } else if (prev_bottom > start_y) {\n      start_y = (start_y + prev_bottom) / 2;\n    }\n  }\n  int end_y = part->bounding_box_.bottom();\n  int margin_right = INT32_MAX;\n  int margin_left = -INT32_MAX;\n  UpdateLeftMargin(*part, &margin_left, 
&margin_right);\n  do {\n    part_it->forward();\n    part = part_it->data();\n  } while (!part_it->at_first() &&\n           UpdateLeftMargin(*part, &margin_left, &margin_right));\n  // The run ended. If we were pushed inwards, compute the next run and\n  // extend it backwards into the run we just calculated to find the end of\n  // this run that provides a tight box.\n  int next_margin_right = INT32_MAX;\n  int next_margin_left = -INT32_MAX;\n  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);\n  if (next_margin_left > margin_right) {\n    ColPartition_IT next_it(*part_it);\n    do {\n      next_it.forward();\n      part = next_it.data();\n    } while (!next_it.at_first() &&\n             UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));\n    // Now extend the next run backwards into the original run to get the\n    // tightest fit.\n    do {\n      part_it->backward();\n      part = part_it->data();\n    } while (part != start_part &&\n             UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));\n    part_it->forward();\n  }\n  // Now calculate the end_y.\n  part = part_it->data_relative(-1);\n  end_y = part->bounding_box_.bottom();\n  if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y) {\n    end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;\n  }\n  start->set_y(start_y);\n  start->set_x(part->XAtY(margin_right, start_y));\n  end->set_y(end_y);\n  end->set_x(part->XAtY(margin_right, end_y));\n  if (textord_debug_tabfind && !part_it->at_first()) {\n    tprintf(\"Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\\n\",\n            start_y, end_y, part->XAtY(margin_left, end_y), end->x(),\n            part->left_margin_, part->bounding_box_.left());\n  }\n}\n\n// Helper updates margin_left and margin_right, being the bounds of the right\n// margin of part of a block. 
Returns false and does not update the bounds if\n// this partition has a disjoint margin with the established margin.\nstatic bool UpdateRightMargin(const ColPartition &part, int *margin_left,\n                              int *margin_right) {\n  const TBOX &part_box = part.bounding_box();\n  int top = part_box.top();\n  int bottom = part_box.bottom();\n  int tl_key = part.SortKey(part_box.right(), top);\n  int tr_key = part.SortKey(part.right_margin(), top);\n  int bl_key = part.SortKey(part_box.right(), bottom);\n  int br_key = part.SortKey(part.right_margin(), bottom);\n  int left_key = std::max(tl_key, bl_key);\n  int right_key = std::min(tr_key, br_key);\n  if (left_key <= *margin_right && right_key >= *margin_left) {\n    // This part is good - let's keep it.\n    *margin_right = std::min(*margin_right, right_key);\n    *margin_left = std::max(*margin_left, left_key);\n    return true;\n  }\n  return false;\n}\n\n// Computes and returns in start, end a line segment formed from a\n// backwards-iterated group of right edges of partitions that satisfy the\n// condition that the intersection of the right margins is non-empty, ie the\n// leftmost right margin is to the right of the rightmost right bounding box\n// edge.\n// On return the iterator is set to the start of the next run.\nvoid ColPartition::RightEdgeRun(ColPartition_IT *part_it, ICOORD *start,\n                                ICOORD *end) {\n  ColPartition *part = part_it->data();\n  ColPartition *start_part = part;\n  int start_y = part->bounding_box_.bottom();\n  if (!part_it->at_last()) {\n    int next_y = part_it->data_relative(1)->bounding_box_.top();\n    if (next_y > start_y) {\n      start_y = next_y;\n    } else if (next_y < start_y) {\n      start_y = (start_y + next_y) / 2;\n    }\n  }\n  int end_y = part->bounding_box_.top();\n  int margin_right = INT32_MAX;\n  int margin_left = -INT32_MAX;\n  UpdateRightMargin(*part, &margin_left, &margin_right);\n  do {\n    part_it->backward();\n    
part = part_it->data();\n  } while (!part_it->at_last() &&\n           UpdateRightMargin(*part, &margin_left, &margin_right));\n  // The run ended. If we were pushed inwards, compute the next run and\n  // extend it backwards to find the end of this run for a tight box.\n  int next_margin_right = INT32_MAX;\n  int next_margin_left = -INT32_MAX;\n  UpdateRightMargin(*part, &next_margin_left, &next_margin_right);\n  if (next_margin_right < margin_left) {\n    ColPartition_IT next_it(*part_it);\n    do {\n      next_it.backward();\n      part = next_it.data();\n    } while (!next_it.at_last() &&\n             UpdateRightMargin(*part, &next_margin_left, &next_margin_right));\n    // Now extend the next run forwards into the original run to get the\n    // tightest fit.\n    do {\n      part_it->forward();\n      part = part_it->data();\n    } while (part != start_part &&\n             UpdateRightMargin(*part, &next_margin_left, &next_margin_right));\n    part_it->backward();\n  }\n  // Now calculate the end_y.\n  part = part_it->data_relative(1);\n  end_y = part->bounding_box().top();\n  if (!part_it->at_last() && part_it->data()->bounding_box_.bottom() > end_y) {\n    end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;\n  }\n  start->set_y(start_y);\n  start->set_x(part->XAtY(margin_left, start_y));\n  end->set_y(end_y);\n  end->set_x(part->XAtY(margin_left, end_y));\n  if (textord_debug_tabfind && !part_it->at_last()) {\n    tprintf(\"Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\\n\",\n            start_y, end_y, end->x(), part->XAtY(margin_right, end_y),\n            part->bounding_box_.right(), part->right_margin_);\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/colpartition.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colpartition.h\n// Description: Class to hold partitions of the page that correspond\n//              roughly to text lines.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_COLPARTITION_H_\n#define TESSERACT_TEXTORD_COLPARTITION_H_\n\n#include \"bbgrid.h\"\n#include \"blobbox.h\" // For BlobRegionType.\n#include \"ocrblock.h\"\n#include \"rect.h\" // For TBOX.\n#include \"scrollview.h\"\n#include \"tabfind.h\"   // For WidthCallback.\n#include \"tabvector.h\" // For BLOBNBOX_CLIST.\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// Number of colors in the color1, color2 arrays.\nconst int kRGBRMSColors = 4;\n\nclass ColPartition;\nclass ColPartitionSet;\nclass ColPartitionGrid;\nclass WorkingPartSet;\nclass WorkingPartSet_LIST;\n\n// An enum to indicate how a partition sits on the columns.\n// The order of flowing/heading/pullout must be kept consistent with\n// PolyBlockType.\nenum ColumnSpanningType {\n  CST_NOISE,   // Strictly between columns.\n  CST_FLOWING, // Strictly within a single column.\n  CST_HEADING, // Spans multiple columns.\n  CST_PULLOUT, // Touches multiple columns, but doesn't span them.\n  CST_COUNT    // Number of 
entries.\n};\n\nELIST2IZEH(ColPartition)\nCLISTIZEH(ColPartition)\n\n/**\n * ColPartition is a partition of a horizontal slice of the page.\n * It starts out as a collection of blobs at a particular y-coord in the grid,\n * but ends up (after merging and uniquing) as an approximate text line.\n * ColPartitions are also used to hold a partitioning of the page into\n * columns, each representing one column. Although a ColPartition applies\n * to a given y-coordinate range, eventually, a ColPartitionSet of ColPartitions\n * emerges, which represents the columns over a wide y-coordinate range.\n */\nclass TESS_API ColPartition : public ELIST2<ColPartition>::LINK {\npublic:\n  // This empty constructor is here only so that the class can be ELISTIZED.\n  // TODO(rays) change deep_copy in elst.h line 955 to take a callback copier\n  // and eliminate CLASSNAME##_copier.\n  ColPartition() = default;\n\n  /**\n   * @param blob_type is the blob_region_type_ of the blobs in this partition.\n   * @param vertical is the direction of logical vertical on the possibly skewed\n   * image.\n   */\n  ColPartition(BlobRegionType blob_type, const ICOORD &vertical);\n  /**\n   * Constructs a fake ColPartition with no BLOBNBOXes to represent a\n   * horizontal or vertical line, given a type and a bounding box.\n   */\n  static ColPartition *MakeLinePartition(BlobRegionType blob_type,\n                                         const ICOORD &vertical, int left,\n                                         int bottom, int right, int top);\n\n  // Constructs and returns a fake ColPartition with a single fake BLOBNBOX,\n  // all made from a single TBOX.\n  // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and\n  // the ColPartition owns the BLOBNBOX!!!\n  // Call DeleteBoxes before deleting the ColPartition.\n  static ColPartition *FakePartition(const TBOX &box, PolyBlockType block_type,\n                                     BlobRegionType blob_type,\n                              
       BlobTextFlowType flow);\n\n  // Constructs and returns a ColPartition with the given real BLOBNBOX,\n  // and sets it up to be a \"big\" partition (single-blob partition bigger\n  // than the surrounding text that may be a dropcap, two or more vertically\n  // touching characters, or some graphic element.\n  // If the given list is not nullptr, the partition is also added to the list.\n  static ColPartition *MakeBigPartition(BLOBNBOX *box,\n                                        ColPartition_LIST *big_part_list);\n\n  ~ColPartition();\n\n  // Simple accessors.\n  const TBOX &bounding_box() const {\n    return bounding_box_;\n  }\n  int left_margin() const {\n    return left_margin_;\n  }\n  void set_left_margin(int margin) {\n    left_margin_ = margin;\n  }\n  int right_margin() const {\n    return right_margin_;\n  }\n  void set_right_margin(int margin) {\n    right_margin_ = margin;\n  }\n  int median_top() const {\n    return median_top_;\n  }\n  int median_bottom() const {\n    return median_bottom_;\n  }\n  int median_left() const {\n    return median_left_;\n  }\n  int median_right() const {\n    return median_right_;\n  }\n  int median_height() const {\n    return median_height_;\n  }\n  void set_median_height(int height) {\n    median_height_ = height;\n  }\n  int median_width() const {\n    return median_width_;\n  }\n  void set_median_width(int width) {\n    median_width_ = width;\n  }\n  BlobRegionType blob_type() const {\n    return blob_type_;\n  }\n  void set_blob_type(BlobRegionType t) {\n    blob_type_ = t;\n  }\n  BlobTextFlowType flow() const {\n    return flow_;\n  }\n  void set_flow(BlobTextFlowType f) {\n    flow_ = f;\n  }\n  int good_blob_score() const {\n    return good_blob_score_;\n  }\n  bool good_width() const {\n    return good_width_;\n  }\n  bool good_column() const {\n    return good_column_;\n  }\n  bool left_key_tab() const {\n    return left_key_tab_;\n  }\n  int left_key() const {\n    return left_key_;\n  }\n  bool 
right_key_tab() const {\n    return right_key_tab_;\n  }\n  int right_key() const {\n    return right_key_;\n  }\n  PolyBlockType type() const {\n    return type_;\n  }\n  void set_type(PolyBlockType t) {\n    type_ = t;\n  }\n  BLOBNBOX_CLIST *boxes() {\n    return &boxes_;\n  }\n  int boxes_count() const {\n    return boxes_.length();\n  }\n  void set_vertical(const ICOORD &v) {\n    vertical_ = v;\n  }\n  ColPartition_CLIST *upper_partners() {\n    return &upper_partners_;\n  }\n  ColPartition_CLIST *lower_partners() {\n    return &lower_partners_;\n  }\n  void set_working_set(WorkingPartSet *working_set) {\n    working_set_ = working_set;\n  }\n  bool block_owned() const {\n    return block_owned_;\n  }\n  void set_block_owned(bool owned) {\n    block_owned_ = owned;\n  }\n  bool desperately_merged() const {\n    return desperately_merged_;\n  }\n  ColPartitionSet *column_set() const {\n    return column_set_;\n  }\n  void set_side_step(int step) {\n    side_step_ = step;\n  }\n  int bottom_spacing() const {\n    return bottom_spacing_;\n  }\n  void set_bottom_spacing(int spacing) {\n    bottom_spacing_ = spacing;\n  }\n  int top_spacing() const {\n    return top_spacing_;\n  }\n  void set_top_spacing(int spacing) {\n    top_spacing_ = spacing;\n  }\n\n  void set_table_type() {\n    if (type_ != PT_TABLE) {\n      type_before_table_ = type_;\n      type_ = PT_TABLE;\n    }\n  }\n  void clear_table_type() {\n    if (type_ == PT_TABLE) {\n      type_ = type_before_table_;\n    }\n  }\n  bool inside_table_column() {\n    return inside_table_column_;\n  }\n  void set_inside_table_column(bool val) {\n    inside_table_column_ = val;\n  }\n  ColPartition *nearest_neighbor_above() const {\n    return nearest_neighbor_above_;\n  }\n  void set_nearest_neighbor_above(ColPartition *part) {\n    nearest_neighbor_above_ = part;\n  }\n  ColPartition *nearest_neighbor_below() const {\n    return nearest_neighbor_below_;\n  }\n  void set_nearest_neighbor_below(ColPartition 
*part) {\n    nearest_neighbor_below_ = part;\n  }\n  int space_above() const {\n    return space_above_;\n  }\n  void set_space_above(int space) {\n    space_above_ = space;\n  }\n  int space_below() const {\n    return space_below_;\n  }\n  void set_space_below(int space) {\n    space_below_ = space;\n  }\n  int space_to_left() const {\n    return space_to_left_;\n  }\n  void set_space_to_left(int space) {\n    space_to_left_ = space;\n  }\n  int space_to_right() const {\n    return space_to_right_;\n  }\n  void set_space_to_right(int space) {\n    space_to_right_ = space;\n  }\n  uint8_t *color1() {\n    return color1_;\n  }\n  uint8_t *color2() {\n    return color2_;\n  }\n  bool owns_blobs() const {\n    return owns_blobs_;\n  }\n  void set_owns_blobs(bool owns_blobs) {\n    // Do NOT change ownership flag when there are blobs in the list.\n    // Immediately set the ownership flag when creating copies.\n    ASSERT_HOST(boxes_.empty());\n    owns_blobs_ = owns_blobs;\n  }\n\n  // Inline quasi-accessors that require some computation.\n\n  // Returns the middle y-coord of the bounding box.\n  int MidY() const {\n    return (bounding_box_.top() + bounding_box_.bottom()) / 2;\n  }\n  // Returns the middle y-coord of the median top and bottom.\n  int MedianY() const {\n    return (median_top_ + median_bottom_) / 2;\n  }\n  // Returns the middle x-coord of the bounding box.\n  int MidX() const {\n    return (bounding_box_.left() + bounding_box_.right()) / 2;\n  }\n  // Returns the sort key at any given x,y.\n  int SortKey(int x, int y) const {\n    return TabVector::SortKey(vertical_, x, y);\n  }\n  // Returns the x corresponding to the sortkey, y pair.\n  int XAtY(int sort_key, int y) const {\n    return TabVector::XAtY(vertical_, sort_key, y);\n  }\n  // Returns the x difference between the two sort keys.\n  int KeyWidth(int left_key, int right_key) const {\n    return (right_key - left_key) / vertical_.y();\n  }\n  // Returns the column width between the left and 
right keys.\n  int ColumnWidth() const {\n    return KeyWidth(left_key_, right_key_);\n  }\n  // Returns the sort key of the box left edge.\n  int BoxLeftKey() const {\n    return SortKey(bounding_box_.left(), MidY());\n  }\n  // Returns the sort key of the box right edge.\n  int BoxRightKey() const {\n    return SortKey(bounding_box_.right(), MidY());\n  }\n  // Returns the left edge at the given y, using the sort key.\n  int LeftAtY(int y) const {\n    return XAtY(left_key_, y);\n  }\n  // Returns the right edge at the given y, using the sort key.\n  int RightAtY(int y) const {\n    return XAtY(right_key_, y);\n  }\n  // Returns true if the right edge of this is to the left of the right\n  // edge of other.\n  bool IsLeftOf(const ColPartition &other) const {\n    return bounding_box_.right() < other.bounding_box_.right();\n  }\n  // Returns true if the partition contains the given x coordinate at the y.\n  bool ColumnContains(int x, int y) const {\n    return LeftAtY(y) - 1 <= x && x <= RightAtY(y) + 1;\n  }\n  // Returns true if there are no blobs in the list.\n  bool IsEmpty() const {\n    return boxes_.empty();\n  }\n  // Returns true if there is a single blob in the list.\n  bool IsSingleton() const {\n    return boxes_.singleton();\n  }\n  // Returns true if this and other overlap horizontally by bounding box.\n  bool HOverlaps(const ColPartition &other) const {\n    return bounding_box_.x_overlap(other.bounding_box_);\n  }\n  // Returns true if this and other's bounding boxes overlap vertically.\n  // TODO(rays) Make HOverlaps and VOverlaps truly symmetric.\n  bool VOverlaps(const ColPartition &other) const {\n    return bounding_box_.y_gap(other.bounding_box_) < 0;\n  }\n  // Returns the vertical overlap (by median) of this and other.\n  // WARNING! 
Only makes sense on horizontal partitions!\n  int VCoreOverlap(const ColPartition &other) const {\n    if (median_bottom_ == INT32_MAX || other.median_bottom_ == INT32_MAX) {\n      return 0;\n    }\n    return std::min(median_top_, other.median_top_) -\n           std::max(median_bottom_, other.median_bottom_);\n  }\n  // Returns the horizontal overlap (by median) of this and other.\n  // WARNING! Only makes sense on vertical partitions!\n  int HCoreOverlap(const ColPartition &other) const {\n    return std::min(median_right_, other.median_right_) -\n           std::max(median_left_, other.median_left_);\n  }\n  // Returns true if this and other overlap significantly vertically.\n  // WARNING! Only makes sense on horizontal partitions!\n  bool VSignificantCoreOverlap(const ColPartition &other) const {\n    if (median_bottom_ == INT32_MAX || other.median_bottom_ == INT32_MAX) {\n      return false;\n    }\n    int overlap = VCoreOverlap(other);\n    int height = std::min(median_top_ - median_bottom_,\n                          other.median_top_ - other.median_bottom_);\n    return overlap * 3 > height;\n  }\n  // Returns true if this and other can be combined without putting a\n  // horizontal step in either left or right edge of the resulting block.\n  bool WithinSameMargins(const ColPartition &other) const {\n    return left_margin_ <= other.bounding_box_.left() &&\n           bounding_box_.left() >= other.left_margin_ &&\n           bounding_box_.right() <= other.right_margin_ &&\n           right_margin_ >= other.bounding_box_.right();\n  }\n  // Returns true if the region types (aligned_text_) match.\n  // Lines never match anything, as they should never be merged or chained.\n  bool TypesMatch(const ColPartition &other) const {\n    return TypesMatch(blob_type_, other.blob_type_);\n  }\n  static bool TypesMatch(BlobRegionType type1, BlobRegionType type2) {\n    return (type1 == type2 || type1 == BRT_UNKNOWN || type2 == BRT_UNKNOWN) &&\n           
!BLOBNBOX::IsLineType(type1) && !BLOBNBOX::IsLineType(type2);\n  }\n\n  // Returns true if the types are similar to each other.\n  static bool TypesSimilar(PolyBlockType type1, PolyBlockType type2) {\n    return (type1 == type2 ||\n            (type1 == PT_FLOWING_TEXT && type2 == PT_INLINE_EQUATION) ||\n            (type2 == PT_FLOWING_TEXT && type1 == PT_INLINE_EQUATION));\n  }\n\n  // Returns true if partitions is of horizontal line type\n  bool IsLineType() const {\n    return PTIsLineType(type_);\n  }\n  // Returns true if partitions is of image type\n  bool IsImageType() const {\n    return PTIsImageType(type_);\n  }\n  // Returns true if partitions is of text type\n  bool IsTextType() const {\n    return PTIsTextType(type_);\n  }\n  // Returns true if partitions is of pullout(inter-column) type\n  bool IsPulloutType() const {\n    return PTIsPulloutType(type_);\n  }\n  // Returns true if the partition is of an exclusively vertical type.\n  bool IsVerticalType() const {\n    return blob_type_ == BRT_VERT_TEXT || blob_type_ == BRT_VLINE;\n  }\n  // Returns true if the partition is of a definite horizontal type.\n  bool IsHorizontalType() const {\n    return blob_type_ == BRT_TEXT || blob_type_ == BRT_HLINE;\n  }\n  // Returns true is the partition is of a type that cannot be merged.\n  bool IsUnMergeableType() const {\n    return BLOBNBOX::UnMergeableType(blob_type_) || type_ == PT_NOISE;\n  }\n  // Returns true if this partition is a vertical line\n  // TODO(nbeato): Use PartitionType enum when Ray's code is submitted.\n  bool IsVerticalLine() const {\n    return IsVerticalType() && IsLineType();\n  }\n  // Returns true if this partition is a horizontal line\n  // TODO(nbeato): Use PartitionType enum when Ray's code is submitted.\n  bool IsHorizontalLine() const {\n    return IsHorizontalType() && IsLineType();\n  }\n\n  // Adds the given box to the partition, updating the partition bounds.\n  // The list of boxes in the partition is updated, ensuring that no 
box is\n  // recorded twice, and the boxes are kept in increasing left position.\n  void AddBox(BLOBNBOX *box);\n\n  // Removes the given box from the partition, updating the bounds.\n  void RemoveBox(BLOBNBOX *box);\n\n  // Returns the tallest box in the partition, as measured perpendicular to the\n  // presumed flow of text.\n  BLOBNBOX *BiggestBox();\n\n  // Returns the bounding box excluding the given box.\n  TBOX BoundsWithoutBox(BLOBNBOX *box);\n\n  // Claims the boxes in the boxes_list by marking them with a this owner\n  // pointer.\n  void ClaimBoxes();\n\n  // nullptr the owner of the blobs in this partition, so they can be deleted\n  // independently of the ColPartition.\n  void DisownBoxes();\n  // nullptr the owner of the blobs in this partition that are owned by this\n  // partition, so they can be deleted independently of the ColPartition.\n  // Any blobs that are not owned by this partition get to keep their owner\n  // without an assert failure.\n  void DisownBoxesNoAssert();\n  // Nulls the owner of the blobs in this partition that are owned by this\n  // partition and not leader blobs, removing them from the boxes_ list, thus\n  // turning this partition back to a leader partition if it contains a leader,\n  // or otherwise leaving it empty. Returns true if any boxes remain.\n  bool ReleaseNonLeaderBoxes();\n\n  // Delete the boxes that this partition owns.\n  void DeleteBoxes();\n\n  // Reflects the partition in the y-axis, assuming that its blobs have\n  // already been done. 
Corrects only a limited part of the members, since\n  // this function is assumed to be used shortly after initial creation, which\n  // is before a lot of the members are used.\n  void ReflectInYAxis();\n\n  // Returns true if this is a legal partition - meaning that the conditions\n  // left_margin <= bounding_box left\n  // left_key <= bounding box left key\n  // bounding box left <= bounding box right\n  // and likewise for right margin and key\n  // are all met.\n  bool IsLegal();\n\n  // Returns true if the left and right edges are approximately equal.\n  bool MatchingColumns(const ColPartition &other) const;\n\n  // Returns true if the colors match for two text partitions.\n  bool MatchingTextColor(const ColPartition &other) const;\n\n  // Returns true if the sizes match for two text partitions,\n  // taking orientation into account\n  bool MatchingSizes(const ColPartition &other) const;\n\n  // Returns true if there is no tabstop violation in merging this and other.\n  bool ConfirmNoTabViolation(const ColPartition &other) const;\n\n  // Returns true if other has a similar stroke width to this.\n  bool MatchingStrokeWidth(const ColPartition &other,\n                           double fractional_tolerance,\n                           double constant_tolerance) const;\n  // Returns true if candidate is an acceptable diacritic base char merge\n  // with this as the diacritic.\n  bool OKDiacriticMerge(const ColPartition &candidate, bool debug) const;\n\n  // Sets the sort key using either the tab vector, or the bounding box if\n  // the tab vector is nullptr. 
If the tab_vector lies inside the bounding_box,\n  // use the edge of the box as a key any way.\n  void SetLeftTab(const TabVector *tab_vector);\n  void SetRightTab(const TabVector *tab_vector);\n\n  // Copies the left/right tab from the src partition, but if take_box is\n  // true, copies the box instead and uses that as a key.\n  void CopyLeftTab(const ColPartition &src, bool take_box);\n  void CopyRightTab(const ColPartition &src, bool take_box);\n\n  // Returns the left rule line x coord of the leftmost blob.\n  int LeftBlobRule() const;\n  // Returns the right rule line x coord of the rightmost blob.\n  int RightBlobRule() const;\n\n  // Returns the density value for a particular BlobSpecialTextType.\n  float SpecialBlobsDensity(const BlobSpecialTextType type) const;\n  // Returns the number of blobs for a  particular BlobSpecialTextType.\n  int SpecialBlobsCount(const BlobSpecialTextType type);\n  // Set the density value for a particular BlobSpecialTextType, should ONLY be\n  // used for debugging or testing. 
In production code, use\n  // ComputeSpecialBlobsDensity instead.\n  void SetSpecialBlobsDensity(const BlobSpecialTextType type,\n                              const float density);\n  // Compute the SpecialTextType density of blobs, where we assume\n  // that the SpecialTextType in the boxes_ has been set.\n  void ComputeSpecialBlobsDensity();\n\n  // Add a partner above if upper, otherwise below.\n  // Add them uniquely and keep the list sorted by box left.\n  // Partnerships are added symmetrically to partner and this.\n  void AddPartner(bool upper, ColPartition *partner);\n  // Removes the partner from this, but does not remove this from partner.\n  // This asymmetric removal is so as not to mess up the iterator that is\n  // working on partner's partner list.\n  void RemovePartner(bool upper, ColPartition *partner);\n  // Returns the partner if the given partner is a singleton, otherwise nullptr.\n  ColPartition *SingletonPartner(bool upper);\n\n  // Merge with the other partition and delete it.\n  void Absorb(ColPartition *other, const WidthCallback &cb);\n\n  // Returns true if the overlap between this and the merged pair of\n  // merge candidates is sufficiently trivial to be allowed.\n  // The merged box can graze the edge of this by the ok_box_overlap\n  // if that exceeds the margin to the median top and bottom.\n  bool OKMergeOverlap(const ColPartition &merge1, const ColPartition &merge2,\n                      int ok_box_overlap, bool debug);\n\n  // Find the blob at which to split this to minimize the overlap with the\n  // given box. 
Returns the first blob to go in the second partition.\n  BLOBNBOX *OverlapSplitBlob(const TBOX &box);\n\n  // Split this partition keeping the first half in this and returning\n  // the second half.\n  // Splits by putting the split_blob and the blobs that follow\n  // in the second half, and the rest in the first half.\n  ColPartition *SplitAtBlob(BLOBNBOX *split_blob);\n\n  // Splits this partition at the given x coordinate, returning the right\n  // half and keeping the left half in this.\n  ColPartition *SplitAt(int split_x);\n\n  // Recalculates all the coordinate limits of the partition.\n  void ComputeLimits();\n\n  // Returns the number of boxes that overlap the given box.\n  int CountOverlappingBoxes(const TBOX &box);\n\n  // Computes and sets the type_, first_column_, last_column_ and column_set_.\n  // resolution refers to the ppi resolution of the image.\n  void SetPartitionType(int resolution, ColPartitionSet *columns);\n\n  // Returns the PartitionType from the current BlobRegionType and a column\n  // flow spanning type ColumnSpanningType, generated by\n  // ColPartitionSet::SpanningType, that indicates how the partition sits\n  // in the columns.\n  PolyBlockType PartitionType(ColumnSpanningType flow) const;\n\n  // Returns the first and last column touched by this partition.\n  // resolution refers to the ppi resolution of the image.\n  void ColumnRange(int resolution, ColPartitionSet *columns, int *first_col,\n                   int *last_col);\n\n  // Sets the internal flags good_width_ and good_column_.\n  void SetColumnGoodness(const WidthCallback &cb);\n\n  // Determines whether the blobs in this partition mostly represent\n  // a leader (fixed pitch sequence) and sets the member blobs accordingly.\n  // Note that height is assumed to have been tested elsewhere, and that this\n  // function will find most fixed-pitch text as leader without a height filter.\n  // Leader detection is limited to sequences of identical width objects,\n  // such as 
.... or ----, so patterns, such as .-.-.-.-. will not be found.\n  bool MarkAsLeaderIfMonospaced();\n  // Given the result of TextlineProjection::EvaluateColPartition, (positive for\n  // horizontal text, negative for vertical text, and near zero for non-text),\n  // sets the blob_type_ and flow_ for this partition to indicate whether it\n  // is strongly or weakly vertical or horizontal text, or non-text.\n  void SetRegionAndFlowTypesFromProjectionValue(int value);\n\n  // Sets all blobs with the partition blob type and flow, but never overwrite\n  // leader blobs, as we need to be able to identify them later.\n  void SetBlobTypes();\n\n  // Returns true if a decent baseline can be fitted through the blobs.\n  // Works for both horizontal and vertical text.\n  bool HasGoodBaseline();\n\n  // Adds this ColPartition to a matching WorkingPartSet if one can be found,\n  // otherwise starts a new one in the appropriate column, ending the previous.\n  void AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright,\n                       int resolution, ColPartition_LIST *used_parts,\n                       WorkingPartSet_LIST *working_set);\n\n  // From the given block_parts list, builds one or more BLOCKs and\n  // corresponding TO_BLOCKs, such that the line spacing is uniform in each.\n  // Created blocks are appended to the end of completed_blocks and to_blocks.\n  // The used partitions are put onto used_parts, as they may still be referred\n  // to in the partition grid. 
bleft, tright and resolution are the bounds\n  // and resolution of the original image.\n  static void LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright,\n                                int resolution, ColPartition_LIST *block_parts,\n                                ColPartition_LIST *used_parts,\n                                BLOCK_LIST *completed_blocks,\n                                TO_BLOCK_LIST *to_blocks);\n  // Constructs a block from the given list of partitions.\n  // Arguments are as LineSpacingBlocks above.\n  static TO_BLOCK *MakeBlock(const ICOORD &bleft, const ICOORD &tright,\n                             ColPartition_LIST *block_parts,\n                             ColPartition_LIST *used_parts);\n\n  // Constructs a block from the given list of vertical text partitions.\n  // Currently only creates rectangular blocks.\n  static TO_BLOCK *MakeVerticalTextBlock(const ICOORD &bleft,\n                                         const ICOORD &tright,\n                                         ColPartition_LIST *block_parts,\n                                         ColPartition_LIST *used_parts);\n\n  // Makes a TO_ROW matching this and moves all the blobs to it, transferring\n  // ownership to returned TO_ROW.\n  TO_ROW *MakeToRow();\n\n  // Returns a copy of everything except the list of boxes. 
The resulting\n  // ColPartition is only suitable for keeping in a column candidate list.\n  ColPartition *ShallowCopy() const;\n  // Returns a copy of everything with a shallow copy of the blobs.\n  // The blobs are still owned by their original parent, so they are\n  // treated as read-only.\n  ColPartition *CopyButDontOwnBlobs();\n\n#ifndef GRAPHICS_DISABLED\n  // Provides a color for BBGrid to draw the rectangle.\n  ScrollView::Color BoxColor() const;\n#endif // !GRAPHICS_DISABLED\n\n  // Prints debug information on this.\n  void Print() const;\n  // Prints debug information on the colors.\n  void PrintColors();\n\n  // Sets the types of all partitions in the run to be the max of the types.\n  void SmoothPartnerRun(int working_set_count);\n\n  // Cleans up the partners of the given type so that there is at most\n  // one partner. This makes block creation simpler.\n  // If get_desperate is true, goes to more desperate merge methods\n  // to merge flowing text before breaking partnerships.\n  void RefinePartners(PolyBlockType type, bool get_desperate,\n                      ColPartitionGrid *grid);\n\n  // Returns true if this column partition is in the same column as\n  // part. This function will only work after the SetPartitionType function\n  // has been called on both column partitions. 
This is useful for\n  // doing a SideSearch when you want things in the same page column.\n  bool IsInSameColumnAs(const ColPartition &part) const;\n\n  // Sort function to sort by bounding box.\n  static int SortByBBox(const ColPartition *part1, const ColPartition *part2) {\n    int mid_y1 = part1->bounding_box_.y_middle();\n    int mid_y2 = part2->bounding_box_.y_middle();\n    if ((part2->bounding_box_.bottom() <= mid_y1 &&\n         mid_y1 <= part2->bounding_box_.top()) ||\n        (part1->bounding_box_.bottom() <= mid_y2 &&\n         mid_y2 <= part1->bounding_box_.top())) {\n      // Sort by increasing x.\n      return part1->bounding_box_.x_middle() - part2->bounding_box_.x_middle();\n    }\n    // Sort by decreasing y.\n    return mid_y2 - mid_y1;\n  }\n\n  // Sets the column bounds. Primarily used in testing.\n  void set_first_column(int column) {\n    first_column_ = column;\n  }\n  void set_last_column(int column) {\n    last_column_ = column;\n  }\n\nprivate:\n  // Cleans up the partners above if upper is true, else below.\n  // If get_desperate is true, goes to more desperate merge methods\n  // to merge flowing text before breaking partnerships.\n  void RefinePartnersInternal(bool upper, bool get_desperate,\n                              ColPartitionGrid *grid);\n  // Restricts the partners to only desirable types. For text and BRT_HLINE this\n  // means the same type_ , and for image types it means any image type.\n  void RefinePartnersByType(bool upper, ColPartition_CLIST *partners);\n  // Remove transitive partnerships: this<->a, and a<->b and this<->b.\n  // Gets rid of this<->b, leaving a clean chain.\n  // Also if we have this<->a and a<->this, then gets rid of this<->a, as\n  // this has multiple partners.\n  void RefinePartnerShortcuts(bool upper, ColPartition_CLIST *partners);\n  // If multiple text partners can be merged, then do so.\n  // If desperate is true, then an increase in overlap with the merge is\n  // allowed. 
If the overlap increases, then the desperately_merged_ flag\n  // is set, indicating that the textlines probably need to be regenerated\n  // by aggressive line fitting/splitting, as there are probably vertically\n  // joined blobs that cross textlines.\n  void RefineTextPartnersByMerge(bool upper, bool desperate,\n                                 ColPartition_CLIST *partners,\n                                 ColPartitionGrid *grid);\n  // Keep the partner with the biggest overlap.\n  void RefinePartnersByOverlap(bool upper, ColPartition_CLIST *partners);\n\n  // Return true if bbox belongs better in this than other.\n  bool ThisPartitionBetter(BLOBNBOX *bbox, const ColPartition &other);\n\n  // Smoothes the spacings in the list into groups of equal linespacing.\n  // resolution is the resolution of the original image, used as a basis\n  // for thresholds in change of spacing. page_height is in pixels.\n  static void SmoothSpacings(int resolution, int page_height,\n                             ColPartition_LIST *parts);\n\n  // Returns true if the parts array of pointers to partitions matches the\n  // condition for a spacing blip. 
See SmoothSpacings for what this means\n  // and how it is used.\n  static bool OKSpacingBlip(int resolution, int median_spacing,\n                            ColPartition **parts, int offset);\n\n  // Returns true if both the top and bottom spacings of this match the given\n  // spacing to within suitable margins dictated by the image resolution.\n  bool SpacingEqual(int spacing, int resolution) const;\n\n  // Returns true if both the top and bottom spacings of this and other\n  // match to within suitable margins dictated by the image resolution.\n  bool SpacingsEqual(const ColPartition &other, int resolution) const;\n\n  // Returns true if the sum spacing of this and other match the given\n  // spacing (or twice the given spacing) to within a suitable margin dictated\n  // by the image resolution.\n  bool SummedSpacingOK(const ColPartition &other, int spacing,\n                       int resolution) const;\n\n  // Returns a suitable spacing margin that can be applied to bottoms of\n  // text lines, based on the resolution and the stored side_step_.\n  int BottomSpacingMargin(int resolution) const;\n\n  // Returns a suitable spacing margin that can be applied to tops of\n  // text lines, based on the resolution and the stored side_step_.\n  int TopSpacingMargin(int resolution) const;\n\n  // Returns true if the median text sizes of this and other agree to within\n  // a reasonable multiplicative factor.\n  bool SizesSimilar(const ColPartition &other) const;\n\n  // Computes and returns in start, end a line segment formed from a\n  // forwards-iterated group of left edges of partitions that satisfy the\n  // condition that the rightmost left margin is to the left of the\n  // leftmost left bounding box edge.\n  // TODO(rays) Not good enough. 
Needs improving to tightly wrap text in both\n  // directions, and to loosely wrap images.\n  static void LeftEdgeRun(ColPartition_IT *part_it, ICOORD *start, ICOORD *end);\n  // Computes and returns in start, end a line segment formed from a\n  // backwards-iterated group of right edges of partitions that satisfy the\n  // condition that the leftmost right margin is to the right of the\n  // rightmost right bounding box edge.\n  // TODO(rays) Not good enough. Needs improving to tightly wrap text in both\n  // directions, and to loosely wrap images.\n  static void RightEdgeRun(ColPartition_IT *part_it, ICOORD *start,\n                           ICOORD *end);\n\n  // The margins are determined by the position of the nearest vertically\n  // overlapping neighbour to the side. They indicate the maximum extent\n  // that the block/column may be extended without touching something else.\n  // Leftmost coordinate that the region may occupy over the y limits.\n  int left_margin_ = 0;\n  // Rightmost coordinate that the region may occupy over the y limits.\n  int right_margin_ = 0;\n  // Bounding box of all blobs in the partition.\n  TBOX bounding_box_;\n  // Median top and bottom of blobs in this partition.\n  int median_bottom_ = 0;\n  int median_top_ = 0;\n  // Median height of blobs in this partition.\n  int median_height_ = 0;\n  // Median left and right of blobs in this partition.\n  int median_left_ = 0;\n  int median_right_ = 0;\n  // Median width of blobs in this partition.\n  int median_width_ = 0;\n  // blob_region_type_ for the blobs in this partition.\n  BlobRegionType blob_type_ = BRT_UNKNOWN;\n  BlobTextFlowType flow_ = BTFT_NONE; // Quality of text flow.\n  // Total of GoodTextBlob results for all blobs in the partition.\n  int good_blob_score_ = 0;\n  // True if this partition has a common width.\n  bool good_width_ = false;\n  // True if this is a good column candidate.\n  bool good_column_ = false;\n  // True if the left_key_ is from a tab vector.\n  
bool left_key_tab_ = false;\n  // True if the right_key_ is from a tab vector.\n  bool right_key_tab_ = false;\n  // Left and right sort keys for the edges of the partition.\n  // If the respective *_key_tab_ is true then this key came from a tab vector.\n  // If not, then the class promises to keep the key equal to the sort key\n  // for the respective edge of the bounding box at the MidY, so that\n  // LeftAtY and RightAtY always returns an x coordinate on the line parallel\n  // to vertical_ through the bounding box edge at MidY.\n  int left_key_ = 0;\n  int right_key_ = 0;\n  // Type of this partition after looking at its relation to the columns.\n  PolyBlockType type_ = PT_UNKNOWN;\n  // The global vertical skew direction.\n  ICOORD vertical_;\n  // All boxes in the partition stored in increasing left edge coordinate.\n  BLOBNBOX_CLIST boxes_;\n  // The partitions above that matched this.\n  ColPartition_CLIST upper_partners_;\n  // The partitions below that matched this.\n  ColPartition_CLIST lower_partners_;\n  // The WorkingPartSet it lives in while blocks are being made.\n  WorkingPartSet *working_set_ = nullptr;\n  // Column_set_ is the column layout applicable to this ColPartition.\n  ColPartitionSet *column_set_ = nullptr;\n  // Flag is true when AddBox is sorting vertically, false otherwise.\n  bool last_add_was_vertical_ = false;\n  // True when the partition's ownership has been taken from the grid and\n  // placed in a working set, or, after that, in the good_parts_ list.\n  bool block_owned_ = false;\n  // Flag to indicate that this partition was subjected to a desperate merge,\n  // and therefore the textlines need rebuilding.\n  bool desperately_merged_ = false;\n  bool owns_blobs_ = true; // Does the partition own its blobs?\n  // The first and last column that this partition applies to.\n  // Flowing partitions (see type_) will have an equal first and last value\n  // of the form 2n + 1, where n is the zero-based index into the partitions\n  // 
in column_set_. (See ColPartitionSet::GetColumnByIndex).\n  // Heading partitions will have unequal values of the same form.\n  // Pullout partitions will have equal values, but may have even values,\n  // indicating placement between columns.\n  int first_column_ = -1;\n  int last_column_ = -1;\n  // Linespacing data.\n  int side_step_ = 0;      // Median y-shift to next blob on same line.\n  int top_spacing_ = 0;    // Line spacing from median_top_.\n  int bottom_spacing_ = 0; // Line spacing from median_bottom_.\n\n  // Nearest neighbor above with major x-overlap\n  ColPartition *nearest_neighbor_above_ = nullptr;\n  // Nearest neighbor below with major x-overlap\n  ColPartition *nearest_neighbor_below_ = nullptr;\n  int space_above_ = 0;    // Distance from nearest_neighbor_above\n  int space_below_ = 0;    // Distance from nearest_neighbor_below\n  int space_to_left_ = 0;  // Distance from the left edge of the column\n  int space_to_right_ = 0; // Distance from the right edge of the column\n  // Color foreground/background data.\n  uint8_t color1_[kRGBRMSColors];\n  uint8_t color2_[kRGBRMSColors];\n  // The density of special blobs.\n  float special_blobs_densities_[BSTT_COUNT];\n  // Type of this partition before considering it as a table cell. This is\n  // used to revert the type if a partition is first marked as a table cell but\n  // later filtering steps decide it does not belong to a table\n  PolyBlockType type_before_table_ = PT_UNKNOWN;\n  // Check whether the current partition has been assigned to a table column.\n  bool inside_table_column_ = false;\n};\n\n// Typedef it now in case it becomes a class later.\nusing ColPartitionGridSearch =\n    GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>;\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_COLPARTITION_H_\n"
  },
  {
    "path": "src/textord/colpartitiongrid.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colpartitiongrid.cpp\n// Description: Class collecting code that acts on a BBGrid of ColPartitions.\n// Author:      Ray Smith\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"colpartitiongrid.h\"\n#include \"colpartitionset.h\"\n#include \"imagefind.h\"\n\n#include <algorithm>\n#include <utility>\n\nnamespace tesseract {\n\n// Max pad factor used to search the neighbourhood of a partition to smooth\n// partition types.\nconst int kMaxPadFactor = 6;\n// Max multiple of size (min(height, width)) for the distance of the nearest\n// neighbour for the change of type to be used.\nconst int kMaxNeighbourDistFactor = 4;\n// Maximum number of lines in a credible figure caption.\nconst int kMaxCaptionLines = 7;\n// Min ratio between biggest and smallest gap to bound a caption.\nconst double kMinCaptionGapRatio = 2.0;\n// Min ratio between biggest gap and mean line height to bound a caption.\nconst double kMinCaptionGapHeightRatio = 0.5;\n// Min fraction of ColPartition height to be overlapping for margin purposes.\nconst double kMarginOverlapFraction = 0.25;\n// Size ratio required to consider an unmerged overlapping partition to be big.\nconst double kBigPartSizeRatio = 1.75;\n// 
Fraction of gridsize to allow arbitrary overlap between partitions.\nconst double kTinyEnoughTextlineOverlapFraction = 0.25;\n// Max vertical distance of neighbouring ColPartition as a multiple of\n// partition height for it to be a partner.\n// TODO(rays) fix the problem that causes a larger number to not work well.\n// The value needs to be larger as sparse text blocks in a page that gets\n// marked as single column will not find adjacent lines as partners, and\n// will merge horizontally distant, but aligned lines. See rep.4B3 p5.\n// The value needs to be small because double-spaced legal docs written\n// in a single column, but justified courier have widely spaced lines\n// that need to get merged before they partner-up with the lines above\n// and below. See legal.3B5 p13/17. Neither of these should depend on\n// the value of kMaxPartitionSpacing to be successful, and ColPartition\n// merging needs attention to fix this problem.\nconst double kMaxPartitionSpacing = 1.75;\n// Margin by which text has to beat image or vice-versa to make a firm\n// decision in GridSmoothNeighbour.\nconst int kSmoothDecisionMargin = 4;\n\nColPartitionGrid::ColPartitionGrid(int gridsize, const ICOORD &bleft,\n                                   const ICOORD &tright)\n    : BBGrid<ColPartition, ColPartition_CLIST, ColPartition_C_IT>(\n          gridsize, bleft, tright) {}\n\n// Handles a click event in a display window.\nvoid ColPartitionGrid::HandleClick(int x, int y) {\n  BBGrid<ColPartition, ColPartition_CLIST, ColPartition_C_IT>::HandleClick(x,\n                                                                           y);\n  // Run a radial search for partitions that overlap.\n  ColPartitionGridSearch radsearch(this);\n  radsearch.SetUniqueMode(true);\n  radsearch.StartRadSearch(x, y, 1);\n  ColPartition *neighbour;\n  FCOORD click(x, y);\n  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {\n    const TBOX &nbox = neighbour->bounding_box();\n    if 
(nbox.contains(click)) {\n      tprintf(\"Block box:\");\n      neighbour->bounding_box().print();\n      neighbour->Print();\n    }\n  }\n}\n\n// Merges ColPartitions in the grid that look like they belong in the same\n// textline.\n// For all partitions in the grid, calls the box_cb permanent callback\n// to compute the search box, searches the box, and if a candidate is found,\n// calls the confirm_cb to check any more rules. If the confirm_cb returns\n// true, then the partitions are merged.\n// Both callbacks are deleted before returning.\nvoid ColPartitionGrid::Merges(\n    const std::function<bool(ColPartition *, TBOX *)> &box_cb,\n    const std::function<bool(const ColPartition *, const ColPartition *)>\n        &confirm_cb) {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (MergePart(box_cb, confirm_cb, part)) {\n      gsearch.RepositionIterator();\n    }\n  }\n}\n\n// For the given partition, calls the box_cb permanent callback\n// to compute the search box, searches the box, and if a candidate is found,\n// calls the confirm_cb to check any more rules. 
If the confirm_cb returns\n// true, then the partitions are merged.\n// Returns true if the partition is consumed by one or more merges.\nbool ColPartitionGrid::MergePart(\n    const std::function<bool(ColPartition *, TBOX *)> &box_cb,\n    const std::function<bool(const ColPartition *, const ColPartition *)>\n        &confirm_cb,\n    ColPartition *part) {\n  if (part->IsUnMergeableType()) {\n    return false;\n  }\n  bool any_done = false;\n  // Repeatedly merge part while we find a best merge candidate that works.\n  bool merge_done = false;\n  do {\n    merge_done = false;\n    TBOX box = part->bounding_box();\n    bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());\n    if (debug) {\n      tprintf(\"Merge candidate:\");\n      box.print();\n    }\n    // Set up a rectangle search bounded by the part.\n    if (!box_cb(part, &box)) {\n      continue;\n    }\n    // Create a list of merge candidates.\n    ColPartition_CLIST merge_candidates;\n    FindMergeCandidates(part, box, debug, &merge_candidates);\n    // Find the best merge candidate based on minimal overlap increase.\n    int overlap_increase;\n    ColPartition *neighbour = BestMergeCandidate(part, &merge_candidates, debug,\n                                                 confirm_cb, &overlap_increase);\n    if (neighbour != nullptr && overlap_increase <= 0) {\n      if (debug) {\n        tprintf(\"Merging:hoverlap=%d, voverlap=%d, OLI=%d\\n\",\n                part->HCoreOverlap(*neighbour), part->VCoreOverlap(*neighbour),\n                overlap_increase);\n      }\n      // Looks like a good candidate so merge it.\n      RemoveBBox(neighbour);\n      // We will modify the box of part, so remove it from the grid, merge\n      // it and then re-insert it into the grid.\n      RemoveBBox(part);\n      part->Absorb(neighbour, nullptr);\n      InsertBBox(true, true, part);\n      merge_done = true;\n      any_done = true;\n    } else if (neighbour != nullptr) {\n      if (debug) {\n   
     tprintf(\"Overlapped when merged with increase %d: \", overlap_increase);\n        neighbour->bounding_box().print();\n      }\n    } else if (debug) {\n      tprintf(\"No candidate neighbour returned\\n\");\n    }\n  } while (merge_done);\n  return any_done;\n}\n\n// Returns true if the given part and merge candidate might believably\n// be part of a single text line according to the default rules.\n// In general we only want to merge partitions that look like they\n// are on the same text line, ie their median limits overlap, but we have\n// to make exceptions for diacritics and stray punctuation.\nstatic bool OKMergeCandidate(const ColPartition *part,\n                             const ColPartition *candidate, bool debug) {\n  const TBOX &part_box = part->bounding_box();\n  if (candidate == part) {\n    return false; // Ignore itself.\n  }\n  if (!part->TypesMatch(*candidate) || candidate->IsUnMergeableType()) {\n    return false; // Don't mix inappropriate types.\n  }\n\n  const TBOX &c_box = candidate->bounding_box();\n  if (debug) {\n    tprintf(\"Examining merge candidate:\");\n    c_box.print();\n  }\n  // Candidates must be within a reasonable distance.\n  if (candidate->IsVerticalType() || part->IsVerticalType()) {\n    int h_dist = -part->HCoreOverlap(*candidate);\n    if (h_dist >= std::max(part_box.width(), c_box.width()) / 2) {\n      if (debug) {\n        tprintf(\"Too far away: h_dist = %d\\n\", h_dist);\n      }\n      return false;\n    }\n  } else {\n    // Coarse filter by vertical distance between partitions.\n    int v_dist = -part->VCoreOverlap(*candidate);\n    if (v_dist >= std::max(part_box.height(), c_box.height()) / 2) {\n      if (debug) {\n        tprintf(\"Too far away: v_dist = %d\\n\", v_dist);\n      }\n      return false;\n    }\n    // Candidates must either overlap in median y,\n    // or part or candidate must be an acceptable diacritic.\n    if (!part->VSignificantCoreOverlap(*candidate) &&\n        
!part->OKDiacriticMerge(*candidate, debug) &&\n        !candidate->OKDiacriticMerge(*part, debug)) {\n      if (debug) {\n        tprintf(\"Candidate fails overlap and diacritic tests!\\n\");\n      }\n      return false;\n    }\n  }\n  return true;\n}\n\n// Helper function to compute the increase in overlap of the parts list of\n// Colpartitions with the combination of merge1 and merge2, compared to\n// the overlap with them uncombined.\n// An overlap is not counted if passes the OKMergeOverlap test with ok_overlap\n// as the pixel overlap limit. merge1 and merge2 must both be non-nullptr.\nstatic int IncreaseInOverlap(const ColPartition *merge1,\n                             const ColPartition *merge2, int ok_overlap,\n                             ColPartition_CLIST *parts) {\n  ASSERT_HOST(merge1 != nullptr && merge2 != nullptr);\n  int total_area = 0;\n  ColPartition_C_IT it(parts);\n  TBOX merged_box(merge1->bounding_box());\n  merged_box += merge2->bounding_box();\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    if (part == merge1 || part == merge2) {\n      continue;\n    }\n    TBOX part_box = part->bounding_box();\n    // Compute the overlap of the merged box with part.\n    int overlap_area = part_box.intersection(merged_box).area();\n    if (overlap_area > 0 &&\n        !part->OKMergeOverlap(*merge1, *merge2, ok_overlap, false)) {\n      total_area += overlap_area;\n      // Subtract the overlap of merge1 and merge2 individually.\n      overlap_area = part_box.intersection(merge1->bounding_box()).area();\n      if (overlap_area > 0) {\n        total_area -= overlap_area;\n      }\n      TBOX intersection_box = part_box.intersection(merge2->bounding_box());\n      overlap_area = intersection_box.area();\n      if (overlap_area > 0) {\n        total_area -= overlap_area;\n        // Add back the 3-way area.\n        intersection_box &= merge1->bounding_box(); // In-place intersection.\n        
overlap_area = intersection_box.area();\n        if (overlap_area > 0) {\n          total_area += overlap_area;\n        }\n      }\n    }\n  }\n  return total_area;\n}\n\n// Helper function to test that each partition in candidates is either a\n// good diacritic merge with part or an OK merge candidate with all others\n// in the candidates list.\n// ASCII Art Scenario:\n// We sometimes get text such as \"join-this\" where the - is actually a long\n// dash culled from a standard set of extra characters that don't match the\n// font of the text. This makes its strokewidth not match and forms a broken\n// set of 3 partitions for \"join\", \"-\" and \"this\" and the dash may slightly\n// overlap BOTH words.\n// -------  -------\n// |     ====     |\n// -------  -------\n// The standard merge rule: \"you can merge 2 partitions as long as there is\n// no increase in overlap elsewhere\" fails miserably here. Merge any pair\n// of partitions and the combined box overlaps more with the third than\n// before. To allow the merge, we need to consider whether it is safe to\n// merge everything, without merging separate text lines. 
For that we need\n// everything to be an OKMergeCandidate (which is supposed to prevent\n// separate text lines merging), but this is hard for diacritics to satisfy,\n// so an alternative to being OKMergeCandidate with everything is to be an\n// OKDiacriticMerge with part as the base character.\nstatic bool TestCompatibleCandidates(const ColPartition &part, bool debug,\n                                     ColPartition_CLIST *candidates) {\n  ColPartition_C_IT it(candidates);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *candidate = it.data();\n    if (!candidate->OKDiacriticMerge(part, false)) {\n      ColPartition_C_IT it2(it);\n      for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {\n        ColPartition *candidate2 = it2.data();\n        if (candidate2 != candidate &&\n            !OKMergeCandidate(candidate, candidate2, false)) {\n          if (debug) {\n            tprintf(\"NC overlap failed:Candidate:\");\n            candidate2->bounding_box().print();\n            tprintf(\"fails to be a good merge with:\");\n            candidate->bounding_box().print();\n          }\n          return false;\n        }\n      }\n    }\n  }\n  return true;\n}\n\n// Computes and returns the total overlap of all partitions in the grid.\n// If overlap_grid is non-null, it is filled with a grid that holds empty\n// partitions representing the union of all overlapped partitions.\nint ColPartitionGrid::ComputeTotalOverlap(ColPartitionGrid **overlap_grid) {\n  int total_overlap = 0;\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    ColPartition_CLIST neighbors;\n    const TBOX &part_box = part->bounding_box();\n    FindOverlappingPartitions(part_box, part, &neighbors);\n    ColPartition_C_IT n_it(&neighbors);\n    bool any_part_overlap = false;\n    for (n_it.mark_cycle_pt(); 
!n_it.cycled_list(); n_it.forward()) {\n      const TBOX &n_box = n_it.data()->bounding_box();\n      int overlap = n_box.intersection(part_box).area();\n      if (overlap > 0 && overlap_grid != nullptr) {\n        if (*overlap_grid == nullptr) {\n          *overlap_grid = new ColPartitionGrid(gridsize(), bleft(), tright());\n        }\n        (*overlap_grid)->InsertBBox(true, true, n_it.data()->ShallowCopy());\n        if (!any_part_overlap) {\n          (*overlap_grid)->InsertBBox(true, true, part->ShallowCopy());\n        }\n      }\n      any_part_overlap = true;\n      total_overlap += overlap;\n    }\n  }\n  return total_overlap;\n}\n\n// Finds all the ColPartitions in the grid that overlap with the given\n// box and returns them SortByBoxLeft(ed) and uniqued in the given list.\n// Any partition equal to not_this (may be nullptr) is excluded.\nvoid ColPartitionGrid::FindOverlappingPartitions(const TBOX &box,\n                                                 const ColPartition *not_this,\n                                                 ColPartition_CLIST *parts) {\n  ColPartitionGridSearch rsearch(this);\n  rsearch.StartRectSearch(box);\n  ColPartition *part;\n  while ((part = rsearch.NextRectSearch()) != nullptr) {\n    if (part != not_this) {\n      parts->add_sorted(SortByBoxLeft<ColPartition>, true, part);\n    }\n  }\n}\n\n// Finds and returns the best candidate ColPartition to merge with part,\n// selected from the candidates list, based on the minimum increase in\n// pairwise overlap among all the partitions overlapped by the combined box.\n// If overlap_increase is not nullptr then it returns the increase in overlap\n// that would result from the merge.\n// confirm_cb is a permanent callback that (if non-null) will be used to\n// confirm the validity of a proposed merge candidate before selecting it.\n//\n// ======HOW MERGING WORKS======\n// The problem:\n// We want to merge all the parts of a textline together, but avoid merging\n// separate 
textlines. Diacritics, i dots, punctuation, and broken characters\n// are examples of small bits that need merging with the main textline.\n// Drop-caps and descenders in one line that touch ascenders in the one below\n// are examples of cases where we don't want to merge.\n//\n// The solution:\n// Merges that increase overlap among other partitions are generally bad.\n// Those that don't increase overlap (much) and minimize the total area\n// seem to be good.\n//\n// Ascii art example:\n// The text:\n// groggy descenders\n// minimum ascenders\n// The boxes: The === represents a small box near or overlapping the lower box.\n// -----------------\n// |               |\n// -----------------\n// -===-------------\n// |               |\n// -----------------\n// In considering what to do with the small === box, we find the 2 larger\n// boxes as neighbours and possible merge candidates, but merging with the\n// upper box increases overlap with the lower box, whereas merging with the\n// lower box does not increase overlap.\n// If the small === box didn't overlap either to start with, total area\n// would be minimized by merging with the nearer (lower) box.\n//\n// This is a simple example. 
In reality, we have to allow some increase\n// in overlap, or tightly spaced text would end up in bits.\nColPartition *ColPartitionGrid::BestMergeCandidate(\n    const ColPartition *part, ColPartition_CLIST *candidates, bool debug,\n    const std::function<bool(const ColPartition *, const ColPartition *)>\n        &confirm_cb,\n    int *overlap_increase) {\n  if (overlap_increase != nullptr) {\n    *overlap_increase = 0;\n  }\n  if (candidates->empty()) {\n    return nullptr;\n  }\n  int ok_overlap =\n      static_cast<int>(kTinyEnoughTextlineOverlapFraction * gridsize() + 0.5);\n  // The best neighbour to merge with is the one that causes least\n  // total pairwise overlap among all the neighbours.\n  // If more than one offers the same total overlap, choose the one\n  // with the least total area.\n  const TBOX &part_box = part->bounding_box();\n  ColPartition_C_IT it(candidates);\n  ColPartition *best_candidate = nullptr;\n  // Find the total combined box of all candidates and the original.\n  TBOX full_box(part_box);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *candidate = it.data();\n    full_box += candidate->bounding_box();\n  }\n  // Keep valid neighbours in a list.\n  ColPartition_CLIST neighbours;\n  // Now run a rect search of the merged box for overlapping neighbours, as\n  // we need anything that might be overlapped by the merged box.\n  FindOverlappingPartitions(full_box, part, &neighbours);\n  if (debug) {\n    tprintf(\"Finding best merge candidate from %d, %d neighbours for box:\",\n            candidates->length(), neighbours.length());\n    part_box.print();\n  }\n  // If the best increase in overlap is positive, then we also check the\n  // worst non-candidate overlap. This catches the case of multiple good\n  // candidates that overlap each other when merged. 
If the worst\n  // non-candidate overlap is better than the best overlap, then return\n  // the worst non-candidate overlap instead.\n  ColPartition_CLIST non_candidate_neighbours;\n  non_candidate_neighbours.set_subtract(SortByBoxLeft<ColPartition>, true,\n                                        &neighbours, candidates);\n  int worst_nc_increase = 0;\n  int best_increase = INT32_MAX;\n  int best_area = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *candidate = it.data();\n    if (confirm_cb != nullptr && !confirm_cb(part, candidate)) {\n      if (debug) {\n        tprintf(\"Candidate not confirmed:\");\n        candidate->bounding_box().print();\n      }\n      continue;\n    }\n    int increase = IncreaseInOverlap(part, candidate, ok_overlap, &neighbours);\n    const TBOX &cand_box = candidate->bounding_box();\n    if (best_candidate == nullptr || increase < best_increase) {\n      best_candidate = candidate;\n      best_increase = increase;\n      best_area = cand_box.bounding_union(part_box).area() - cand_box.area();\n      if (debug) {\n        tprintf(\"New best merge candidate has increase %d, area %d, over box:\",\n                increase, best_area);\n        full_box.print();\n        candidate->Print();\n      }\n    } else if (increase == best_increase) {\n      int area = cand_box.bounding_union(part_box).area() - cand_box.area();\n      if (area < best_area) {\n        best_area = area;\n        best_candidate = candidate;\n      }\n    }\n    increase = IncreaseInOverlap(part, candidate, ok_overlap,\n                                 &non_candidate_neighbours);\n    if (increase > worst_nc_increase) {\n      worst_nc_increase = increase;\n    }\n  }\n  if (best_increase > 0) {\n    // If the worst non-candidate increase is less than the best increase\n    // including the candidates, then all the candidates can merge together\n    // and the increase in outside overlap would be less, so use that result,\n    // 
but only if each candidate is either a good diacritic merge with part,\n    // or an ok merge candidate with all the others.\n    // See TestCompatibleCandidates for more explanation and a picture.\n    if (worst_nc_increase < best_increase &&\n        TestCompatibleCandidates(*part, debug, candidates)) {\n      best_increase = worst_nc_increase;\n    }\n  }\n  if (overlap_increase != nullptr) {\n    *overlap_increase = best_increase;\n  }\n  return best_candidate;\n}\n\n// Helper to remove the given box from the given partition, put it in its\n// own partition, and add to the partition list.\nstatic void RemoveBadBox(BLOBNBOX *box, ColPartition *part,\n                         ColPartition_LIST *part_list) {\n  part->RemoveBox(box);\n  ColPartition::MakeBigPartition(box, part_list);\n}\n\n// Split partitions where it reduces overlap between their bounding boxes.\n// ColPartitions are after all supposed to be a partitioning of the blobs\n// AND of the space on the page!\n// Blobs that cause overlaps get removed, put in individual partitions\n// and added to the big_parts list. 
They are most likely characters on\n// 2 textlines that touch, or something big like a dropcap.\nvoid ColPartitionGrid::SplitOverlappingPartitions(\n    ColPartition_LIST *big_parts) {\n  int ok_overlap =\n      static_cast<int>(kTinyEnoughTextlineOverlapFraction * gridsize() + 0.5);\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    // Set up a rectangle search bounded by the part.\n    const TBOX &box = part->bounding_box();\n    ColPartitionGridSearch rsearch(this);\n    rsearch.SetUniqueMode(true);\n    rsearch.StartRectSearch(box);\n    int unresolved_overlaps = 0;\n\n    ColPartition *neighbour;\n    while ((neighbour = rsearch.NextRectSearch()) != nullptr) {\n      if (neighbour == part) {\n        continue;\n      }\n      const TBOX &neighbour_box = neighbour->bounding_box();\n      if (neighbour->OKMergeOverlap(*part, *part, ok_overlap, false) &&\n          part->OKMergeOverlap(*neighbour, *neighbour, ok_overlap, false)) {\n        continue; // The overlap is OK both ways.\n      }\n\n      // If removal of the biggest box from either partition eliminates the\n      // overlap, and it is much bigger than the box left behind, then\n      // it is either a drop-cap, an inter-line join, or some junk that\n      // we don't want anyway, so put it in the big_parts list.\n      if (!part->IsSingleton()) {\n        BLOBNBOX *excluded = part->BiggestBox();\n        TBOX shrunken = part->BoundsWithoutBox(excluded);\n        if (!shrunken.overlap(neighbour_box) &&\n            excluded->bounding_box().height() >\n                kBigPartSizeRatio * shrunken.height()) {\n          // Removing the biggest box fixes the overlap, so do it!\n          gsearch.RemoveBBox();\n          RemoveBadBox(excluded, part, big_parts);\n          InsertBBox(true, true, part);\n          gsearch.RepositionIterator();\n          
break;\n        }\n      } else if (box.contains(neighbour_box)) {\n        ++unresolved_overlaps;\n        continue; // No amount of splitting will fix it.\n      }\n      if (!neighbour->IsSingleton()) {\n        BLOBNBOX *excluded = neighbour->BiggestBox();\n        TBOX shrunken = neighbour->BoundsWithoutBox(excluded);\n        if (!shrunken.overlap(box) &&\n            excluded->bounding_box().height() >\n                kBigPartSizeRatio * shrunken.height()) {\n          // Removing the biggest box fixes the overlap, so do it!\n          rsearch.RemoveBBox();\n          RemoveBadBox(excluded, neighbour, big_parts);\n          InsertBBox(true, true, neighbour);\n          gsearch.RepositionIterator();\n          break;\n        }\n      }\n      int part_overlap_count = part->CountOverlappingBoxes(neighbour_box);\n      int neighbour_overlap_count = neighbour->CountOverlappingBoxes(box);\n      ColPartition *right_part = nullptr;\n      if (neighbour_overlap_count <= part_overlap_count ||\n          part->IsSingleton()) {\n        // Try to split the neighbour to reduce overlap.\n        BLOBNBOX *split_blob = neighbour->OverlapSplitBlob(box);\n        if (split_blob != nullptr) {\n          rsearch.RemoveBBox();\n          right_part = neighbour->SplitAtBlob(split_blob);\n          InsertBBox(true, true, neighbour);\n          ASSERT_HOST(right_part != nullptr);\n        }\n      } else {\n        // Try to split part to reduce overlap.\n        BLOBNBOX *split_blob = part->OverlapSplitBlob(neighbour_box);\n        if (split_blob != nullptr) {\n          gsearch.RemoveBBox();\n          right_part = part->SplitAtBlob(split_blob);\n          InsertBBox(true, true, part);\n          ASSERT_HOST(right_part != nullptr);\n        }\n      }\n      if (right_part != nullptr) {\n        InsertBBox(true, true, right_part);\n        gsearch.RepositionIterator();\n        rsearch.RepositionIterator();\n        break;\n      }\n    }\n    if (unresolved_overlaps > 2 && 
part->IsSingleton()) {\n      // This part is no good so just add to big_parts.\n      RemoveBBox(part);\n      ColPartition_IT big_it(big_parts);\n      part->set_block_owned(true);\n      big_it.add_to_end(part);\n      gsearch.RepositionIterator();\n    }\n  }\n}\n\n// Filters partitions of source_type by looking at local neighbours.\n// Where a majority of neighbours have a text type, the partitions are\n// changed to text, where the neighbours have image type, they are changed\n// to image, and partitions that have no definite neighbourhood type are\n// left unchanged.\n// im_box and rerotation are used to map blob coordinates onto the\n// nontext_map, which is used to prevent the spread of text neighbourhoods\n// into images.\n// Returns true if anything was changed.\nbool ColPartitionGrid::GridSmoothNeighbours(BlobTextFlowType source_type,\n                                            Image nontext_map,\n                                            const TBOX &im_box,\n                                            const FCOORD &rotation) {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  bool any_changed = false;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->flow() != source_type ||\n        BLOBNBOX::IsLineType(part->blob_type())) {\n      continue;\n    }\n    const TBOX &box = part->bounding_box();\n    bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());\n    if (SmoothRegionType(nontext_map, im_box, rotation, debug, part)) {\n      any_changed = true;\n    }\n  }\n  return any_changed;\n}\n\n// Reflects the grid and its colpartitions in the y-axis, assuming that\n// all blob boxes have already been done.\nvoid ColPartitionGrid::ReflectInYAxis() {\n  ColPartition_LIST parts;\n  ColPartition_IT part_it(&parts);\n  // Iterate the ColPartitions in the grid to extract them.\n  ColPartitionGridSearch gsearch(this);\n  
gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    part_it.add_after_then_move(part);\n  }\n  ICOORD bot_left(-tright().x(), bleft().y());\n  ICOORD top_right(-bleft().x(), tright().y());\n  // Reinitializing the grid with reflected coords also clears all the\n  // pointers, so parts will now own the ColPartitions. (Briefly).\n  Init(gridsize(), bot_left, top_right);\n  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {\n    part = part_it.extract();\n    part->ReflectInYAxis();\n    InsertBBox(true, true, part);\n  }\n}\n\n// Transforms the grid of partitions to the output blocks, putting each\n// partition into a separate block. We don't really care about the order,\n// as we just want to get as much text as possible without trying to organize\n// it into proper blocks or columns.\n// TODO(rays) some kind of sort function would be useful and probably better\n// than the default here, which is to sort by order of the grid search.\nvoid ColPartitionGrid::ExtractPartitionsAsBlocks(BLOCK_LIST *blocks,\n                                                 TO_BLOCK_LIST *to_blocks) {\n  TO_BLOCK_IT to_block_it(to_blocks);\n  BLOCK_IT block_it(blocks);\n  // All partitions will be put on this list and deleted on return.\n  ColPartition_LIST parts;\n  ColPartition_IT part_it(&parts);\n  // Iterate the ColPartitions in the grid to extract them.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    part_it.add_after_then_move(part);\n    // The partition has to be at least vaguely like text.\n    BlobRegionType blob_type = part->blob_type();\n    if (BLOBNBOX::IsTextType(blob_type) ||\n        (blob_type == BRT_UNKNOWN && part->boxes_count() > 1)) {\n      PolyBlockType type =\n          blob_type == BRT_VERT_TEXT ? 
PT_VERTICAL_TEXT : PT_FLOWING_TEXT;\n      // Get metrics from the row that will be used for the block.\n      TBOX box = part->bounding_box();\n      int median_width = part->median_width();\n      int median_height = part->median_height();\n      // Turn the partition into a TO_ROW.\n      TO_ROW *row = part->MakeToRow();\n      if (row == nullptr) {\n        // This partition is dead.\n        part->DeleteBoxes();\n        continue;\n      }\n      auto *block = new BLOCK(\"\", true, 0, 0, box.left(), box.bottom(),\n                              box.right(), box.top());\n      block->pdblk.set_poly_block(new POLY_BLOCK(box, type));\n      auto *to_block = new TO_BLOCK(block);\n      TO_ROW_IT row_it(to_block->get_rows());\n      row_it.add_after_then_move(row);\n      // We haven't differentially rotated vertical and horizontal text at\n      // this point, so use width or height as appropriate.\n      if (blob_type == BRT_VERT_TEXT) {\n        to_block->line_size = static_cast<float>(median_width);\n        to_block->line_spacing = static_cast<float>(box.width());\n        to_block->max_blob_size = static_cast<float>(box.width() + 1);\n      } else {\n        to_block->line_size = static_cast<float>(median_height);\n        to_block->line_spacing = static_cast<float>(box.height());\n        to_block->max_blob_size = static_cast<float>(box.height() + 1);\n      }\n      if (to_block->line_size == 0) {\n        to_block->line_size = 1;\n      }\n      block_it.add_to_end(block);\n      to_block_it.add_to_end(to_block);\n    } else {\n      // This partition is dead.\n      part->DeleteBoxes();\n    }\n  }\n  Clear();\n  // Now it is safe to delete the ColPartitions as parts goes out of scope.\n}\n\n// Rotates the grid and its colpartitions by the given angle, assuming that\n// all blob boxes have already been done.\nvoid ColPartitionGrid::Deskew(const FCOORD &deskew) {\n  ColPartition_LIST parts;\n  ColPartition_IT part_it(&parts);\n  // Iterate the ColPartitions 
in the grid to extract them.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    part_it.add_after_then_move(part);\n  }\n  // Rebuild the grid to the new size.\n  TBOX grid_box(bleft_, tright_);\n  grid_box.rotate_large(deskew);\n  Init(gridsize(), grid_box.botleft(), grid_box.topright());\n  // Reinitializing the grid with rotated coords also clears all the\n  // pointers, so parts will now own the ColPartitions. (Briefly).\n  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {\n    part = part_it.extract();\n    part->ComputeLimits();\n    InsertBBox(true, true, part);\n  }\n}\n\n// Sets the left and right tabs of the partitions in the grid.\nvoid ColPartitionGrid::SetTabStops(TabFind *tabgrid) {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &part_box = part->bounding_box();\n    TabVector *left_line = tabgrid->LeftTabForBox(part_box, true, false);\n    // If the overlapping line is not a left tab, try for non-overlapping.\n    if (left_line != nullptr && !left_line->IsLeftTab()) {\n      left_line = tabgrid->LeftTabForBox(part_box, false, false);\n    }\n    if (left_line != nullptr && left_line->IsLeftTab()) {\n      part->SetLeftTab(left_line);\n    }\n    TabVector *right_line = tabgrid->RightTabForBox(part_box, true, false);\n    if (right_line != nullptr && !right_line->IsRightTab()) {\n      right_line = tabgrid->RightTabForBox(part_box, false, false);\n    }\n    if (right_line != nullptr && right_line->IsRightTab()) {\n      part->SetRightTab(right_line);\n    }\n    part->SetColumnGoodness(tabgrid->WidthCB());\n  }\n}\n\n// Makes the ColPartSets and puts them in the PartSetVector ready\n// for finding column bounds. 
Returns false if no partitions were found.\nbool ColPartitionGrid::MakeColPartSets(PartSetVector *part_sets) {\n  auto *part_lists = new ColPartition_LIST[gridheight()];\n  part_sets->reserve(gridheight());\n  // Iterate the ColPartitions in the grid to get parts onto lists for the\n  // y bottom of each.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  bool any_parts_found = false;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    BlobRegionType blob_type = part->blob_type();\n    if (blob_type != BRT_NOISE &&\n        (blob_type != BRT_UNKNOWN || !part->boxes()->singleton())) {\n      int grid_x, grid_y;\n      const TBOX &part_box = part->bounding_box();\n      GridCoords(part_box.left(), part_box.bottom(), &grid_x, &grid_y);\n      ColPartition_IT part_it(&part_lists[grid_y]);\n      part_it.add_to_end(part);\n      any_parts_found = true;\n    }\n  }\n  if (any_parts_found) {\n    for (int grid_y = 0; grid_y < gridheight(); ++grid_y) {\n      ColPartitionSet *line_set = nullptr;\n      if (!part_lists[grid_y].empty()) {\n        line_set = new ColPartitionSet(&part_lists[grid_y]);\n      }\n      part_sets->push_back(line_set);\n    }\n  }\n  delete[] part_lists;\n  return any_parts_found;\n}\n\n// Makes a single ColPartitionSet consisting of a single ColPartition that\n// represents the total horizontal extent of the significant content on the\n// page. 
Used for the single column setting in place of automatic detection.\n// Returns nullptr if the page is empty of significant content.\nColPartitionSet *ColPartitionGrid::MakeSingleColumnSet(WidthCallback cb) {\n  ColPartition *single_column_part = nullptr;\n  // Iterate the ColPartitions in the grid to get parts onto lists for the\n  // y bottom of each.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    BlobRegionType blob_type = part->blob_type();\n    if (blob_type != BRT_NOISE &&\n        (blob_type != BRT_UNKNOWN || !part->boxes()->singleton())) {\n      // Consider for single column.\n      BlobTextFlowType flow = part->flow();\n      if ((blob_type == BRT_TEXT &&\n           (flow == BTFT_STRONG_CHAIN || flow == BTFT_CHAIN ||\n            flow == BTFT_LEADER || flow == BTFT_TEXT_ON_IMAGE)) ||\n          blob_type == BRT_RECTIMAGE || blob_type == BRT_POLYIMAGE) {\n        if (single_column_part == nullptr) {\n          single_column_part = part->ShallowCopy();\n          single_column_part->set_blob_type(BRT_TEXT);\n          // Copy the tabs from itself to properly setup the margins.\n          single_column_part->CopyLeftTab(*single_column_part, false);\n          single_column_part->CopyRightTab(*single_column_part, false);\n        } else {\n          if (part->left_key() < single_column_part->left_key()) {\n            single_column_part->CopyLeftTab(*part, false);\n          }\n          if (part->right_key() > single_column_part->right_key()) {\n            single_column_part->CopyRightTab(*part, false);\n          }\n        }\n      }\n    }\n  }\n  if (single_column_part != nullptr) {\n    // Make a ColPartitionSet out of the single_column_part as a candidate\n    // for the single column case.\n    single_column_part->SetColumnGoodness(cb);\n    return new ColPartitionSet(single_column_part);\n  }\n  return nullptr;\n}\n\n// Mark the BLOBNBOXes 
in each partition as being owned by that partition.\nvoid ColPartitionGrid::ClaimBoxes() {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    part->ClaimBoxes();\n  }\n}\n\n// Retypes all the blobs referenced by the partitions in the grid.\n// Image blobs are found and returned in the im_blobs list, as they are not\n// owned by the block.\nvoid ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST *im_blobs) {\n  BLOBNBOX_IT im_blob_it(im_blobs);\n  ColPartition_LIST dead_parts;\n  ColPartition_IT dead_part_it(&dead_parts);\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    BlobRegionType blob_type = part->blob_type();\n    BlobTextFlowType flow = part->flow();\n    bool any_blobs_moved = false;\n    if (blob_type == BRT_POLYIMAGE || blob_type == BRT_RECTIMAGE) {\n      BLOBNBOX_C_IT blob_it(part->boxes());\n      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n        BLOBNBOX *blob = blob_it.data();\n        im_blob_it.add_after_then_move(blob);\n      }\n    } else if (blob_type != BRT_NOISE) {\n      // Make sure the blobs are marked with the correct type and flow.\n      BLOBNBOX_C_IT blob_it(part->boxes());\n      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n        BLOBNBOX *blob = blob_it.data();\n        if (blob->region_type() == BRT_NOISE) {\n          // TODO(rays) Deprecated. 
Change this section to an assert to verify\n          // and then delete.\n          ASSERT_HOST(blob->cblob()->area() != 0);\n          blob->set_owner(nullptr);\n          blob_it.extract();\n          any_blobs_moved = true;\n        } else {\n          blob->set_region_type(blob_type);\n          if (blob->flow() != BTFT_LEADER) {\n            blob->set_flow(flow);\n          }\n        }\n      }\n    }\n    if (blob_type == BRT_NOISE || part->boxes()->empty()) {\n      BLOBNBOX_C_IT blob_it(part->boxes());\n      part->DisownBoxes();\n      dead_part_it.add_to_end(part);\n      gsearch.RemoveBBox();\n      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n        BLOBNBOX *blob = blob_it.data();\n        if (blob->cblob()->area() == 0) {\n          // Any blob with zero area is a fake image blob and should be deleted.\n          delete blob->cblob();\n          delete blob;\n        }\n      }\n    } else if (any_blobs_moved) {\n      gsearch.RemoveBBox();\n      part->ComputeLimits();\n      InsertBBox(true, true, part);\n      gsearch.RepositionIterator();\n    }\n  }\n}\n\n// The boxes within the partitions have changed (by deskew) so recompute\n// the bounds of all the partitions and reinsert them into the grid.\nvoid ColPartitionGrid::RecomputeBounds(int gridsize, const ICOORD &bleft,\n                                       const ICOORD &tright,\n                                       const ICOORD &vertical) {\n  ColPartition_LIST saved_parts;\n  ColPartition_IT part_it(&saved_parts);\n  // Iterate the ColPartitions in the grid to get parts onto a list.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    part_it.add_to_end(part);\n  }\n  // Reinitialize grid to the new size.\n  Init(gridsize, bleft, tright);\n  // Recompute the bounds of the parts and put them back in the new grid.\n  for (part_it.move_to_first(); 
!part_it.empty(); part_it.forward()) {\n    part = part_it.extract();\n    part->set_vertical(vertical);\n    part->ComputeLimits();\n    InsertBBox(true, true, part);\n  }\n}\n\n// Improves the margins of the ColPartitions in the grid by calling\n// FindPartitionMargins on each.\n// best_columns, which may be nullptr, is an array of pointers indicating the\n// column set at each y-coordinate in the grid.\n// best_columns is usually the best_columns_ member of ColumnFinder.\nvoid ColPartitionGrid::GridFindMargins(ColPartitionSet **best_columns) {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    // Set up a rectangle search x-bounded by the column and y by the part.\n    ColPartitionSet *columns =\n        best_columns != nullptr ? best_columns[gsearch.GridY()] : nullptr;\n    FindPartitionMargins(columns, part);\n    const TBOX &box = part->bounding_box();\n    if (AlignedBlob::WithinTestRegion(2, box.left(), box.bottom())) {\n      tprintf(\"Computed margins for part:\");\n      part->Print();\n    }\n  }\n}\n\n// Improves the margins of the ColPartitions in the list by calling\n// FindPartitionMargins on each.\n// best_columns, which may be nullptr, is an array of pointers indicating the\n// column set at each y-coordinate in the grid.\n// best_columns is usually the best_columns_ member of ColumnFinder.\nvoid ColPartitionGrid::ListFindMargins(ColPartitionSet **best_columns,\n                                       ColPartition_LIST *parts) {\n  ColPartition_IT part_it(parts);\n  for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {\n    ColPartition *part = part_it.data();\n    ColPartitionSet *columns = nullptr;\n    if (best_columns != nullptr) {\n      const TBOX &part_box = part->bounding_box();\n      // Get the columns from the y grid coord.\n      int grid_x, grid_y;\n      
GridCoords(part_box.left(), part_box.bottom(), &grid_x, &grid_y);\n      columns = best_columns[grid_y];\n    }\n    FindPartitionMargins(columns, part);\n  }\n}\n\n// Deletes all the partitions in the grid after disowning all the blobs.\nvoid ColPartitionGrid::DeleteParts() {\n  ColPartition_LIST dead_parts;\n  ColPartition_IT dead_it(&dead_parts);\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    part->DisownBoxes();\n    dead_it.add_to_end(part); // Parts will be deleted on return.\n  }\n  Clear();\n}\n\n// Deletes all the partitions in the grid that are of type BRT_UNKNOWN and\n// all the blobs in them.\nvoid ColPartitionGrid::DeleteUnknownParts(TO_BLOCK *block) {\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->blob_type() == BRT_UNKNOWN) {\n      gsearch.RemoveBBox();\n      // Once marked, the blobs will be swept up by DeleteUnownedNoise.\n      part->set_flow(BTFT_NONTEXT);\n      part->set_blob_type(BRT_NOISE);\n      part->SetBlobTypes();\n      part->DisownBoxes();\n      delete part;\n    }\n  }\n  block->DeleteUnownedNoise();\n}\n\n// Deletes all the partitions in the grid that are NOT of flow type BTFT_LEADER.\nvoid ColPartitionGrid::DeleteNonLeaderParts() {\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->flow() != BTFT_LEADER) {\n      gsearch.RemoveBBox();\n      if (part->ReleaseNonLeaderBoxes()) {\n        InsertBBox(true, true, part);\n        gsearch.RepositionIterator();\n      } else {\n        delete part;\n      }\n    }\n  }\n}\n\n// Finds and marks text partitions that represent figure captions.\nvoid ColPartitionGrid::FindFigureCaptions() {\n  // For each image region find its best candidate text 
caption region,\n  // if any and mark it as such.\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->IsImageType()) {\n      const TBOX &part_box = part->bounding_box();\n      bool debug =\n          AlignedBlob::WithinTestRegion(2, part_box.left(), part_box.bottom());\n      ColPartition *best_caption = nullptr;\n      int best_dist = 0;  // Distance to best_caption.\n      int best_upper = 0; // Direction of best_caption.\n      // Handle both lower and upper directions.\n      for (int upper = 0; upper < 2; ++upper) {\n        ColPartition_C_IT partner_it(upper ? part->upper_partners()\n                                           : part->lower_partners());\n        // If there are no image partners, then this direction is ok.\n        for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();\n             partner_it.forward()) {\n          ColPartition *partner = partner_it.data();\n          if (partner->IsImageType()) {\n            break;\n          }\n        }\n        if (!partner_it.cycled_list()) {\n          continue;\n        }\n        // Find the nearest totally overlapping text partner.\n        for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();\n             partner_it.forward()) {\n          ColPartition *partner = partner_it.data();\n          if (!partner->IsTextType() || partner->type() == PT_TABLE) {\n            continue;\n          }\n          const TBOX &partner_box = partner->bounding_box();\n          if (debug) {\n            tprintf(\"Finding figure captions for image part:\");\n            part_box.print();\n            tprintf(\"Considering partner:\");\n            partner_box.print();\n          }\n          if (partner_box.left() >= part_box.left() &&\n              partner_box.right() <= part_box.right()) {\n            int dist = partner_box.y_gap(part_box);\n            if (best_caption == nullptr || 
dist < best_dist) {\n              best_dist = dist;\n              best_caption = partner;\n              best_upper = upper;\n            }\n          }\n        }\n      }\n      if (best_caption != nullptr) {\n        if (debug) {\n          tprintf(\"Best caption candidate:\");\n          best_caption->bounding_box().print();\n        }\n        // We have a candidate caption. Qualify it as being separable from\n        // any body text. We are looking for either a small number of lines\n        // or a big gap that indicates a separation from the body text.\n        int line_count = 0;\n        int biggest_gap = 0;\n        int smallest_gap = INT16_MAX;\n        int total_height = 0;\n        int mean_height = 0;\n        ColPartition *end_partner = nullptr;\n        ColPartition *next_partner = nullptr;\n        for (ColPartition *partner = best_caption;\n             partner != nullptr && line_count <= kMaxCaptionLines;\n             partner = next_partner) {\n          if (!partner->IsTextType()) {\n            end_partner = partner;\n            break;\n          }\n          ++line_count;\n          total_height += partner->bounding_box().height();\n          next_partner = partner->SingletonPartner(best_upper);\n          if (next_partner != nullptr) {\n            int gap =\n                partner->bounding_box().y_gap(next_partner->bounding_box());\n            if (gap > biggest_gap) {\n              biggest_gap = gap;\n              end_partner = next_partner;\n              mean_height = total_height / line_count;\n            } else if (gap < smallest_gap) {\n              smallest_gap = gap;\n            }\n            // If the gap looks big compared to the text size and the smallest\n            // gap seen so far, then we can stop.\n            if (biggest_gap > mean_height * kMinCaptionGapHeightRatio &&\n                biggest_gap > smallest_gap * kMinCaptionGapRatio) {\n              break;\n            }\n          }\n        }\n        if 
(debug) {\n          tprintf(\"Line count=%d, biggest gap %d, smallest%d, mean height %d\\n\",\n                  line_count, biggest_gap, smallest_gap, mean_height);\n          if (end_partner != nullptr) {\n            tprintf(\"End partner:\");\n            end_partner->bounding_box().print();\n          }\n        }\n        if (next_partner == nullptr && line_count <= kMaxCaptionLines) {\n          end_partner = nullptr; // No gap, but line count is small.\n        }\n        if (line_count <= kMaxCaptionLines) {\n          // This is a qualified caption. Mark the text as caption.\n          for (ColPartition *partner = best_caption;\n               partner != nullptr && partner != end_partner;\n               partner = next_partner) {\n            partner->set_type(PT_CAPTION_TEXT);\n            partner->SetBlobTypes();\n            if (debug) {\n              tprintf(\"Set caption type for partition:\");\n              partner->bounding_box().print();\n            }\n            next_partner = partner->SingletonPartner(best_upper);\n          }\n        }\n      }\n    }\n  }\n}\n\n//////// Functions that manipulate ColPartitions in the part_grid_ /////\n//////// to find chains of partner partitions of the same type.  
///////\n\n// For every ColPartition in the grid, finds its upper and lower neighbours.\nvoid ColPartitionGrid::FindPartitionPartners() {\n  ColPartitionGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->IsVerticalType()) {\n      FindVPartitionPartners(true, part);\n      FindVPartitionPartners(false, part);\n    } else {\n      FindPartitionPartners(true, part);\n      FindPartitionPartners(false, part);\n    }\n  }\n}\n\n// Finds the best partner in the given direction for the given partition.\n// Stores the result with AddPartner.\nvoid ColPartitionGrid::FindPartitionPartners(bool upper, ColPartition *part) {\n  if (part->type() == PT_NOISE) {\n    return; // Noise is not allowed to partner anything.\n  }\n  const TBOX &box = part->bounding_box();\n  int top = part->median_top();\n  int bottom = part->median_bottom();\n  int height = top - bottom;\n  int mid_y = (bottom + top) / 2;\n  ColPartitionGridSearch vsearch(this);\n  // Search down for neighbour below\n  vsearch.StartVerticalSearch(box.left(), box.right(), part->MidY());\n  ColPartition *neighbour;\n  ColPartition *best_neighbour = nullptr;\n  int best_dist = INT32_MAX;\n  while ((neighbour = vsearch.NextVerticalSearch(!upper)) != nullptr) {\n    if (neighbour == part || neighbour->type() == PT_NOISE) {\n      continue; // Noise is not allowed to partner anything.\n    }\n    int neighbour_bottom = neighbour->median_bottom();\n    int neighbour_top = neighbour->median_top();\n    int neighbour_y = (neighbour_bottom + neighbour_top) / 2;\n    if (upper != (neighbour_y > mid_y)) {\n      continue;\n    }\n    if (!part->HOverlaps(*neighbour) && !part->WithinSameMargins(*neighbour)) {\n      continue;\n    }\n    if (!part->TypesMatch(*neighbour)) {\n      if (best_neighbour == nullptr) {\n        best_neighbour = neighbour;\n      }\n      continue;\n    }\n    int dist = upper ? 
neighbour_bottom - top : bottom - neighbour_top;\n    if (dist <= kMaxPartitionSpacing * height) {\n      if (dist < best_dist) {\n        best_dist = dist;\n        best_neighbour = neighbour;\n      }\n    } else {\n      break;\n    }\n  }\n  if (best_neighbour != nullptr) {\n    part->AddPartner(upper, best_neighbour);\n  }\n}\n\n// Finds the best partner in the given direction for the given partition.\n// Stores the result with AddPartner.\nvoid ColPartitionGrid::FindVPartitionPartners(bool to_the_left,\n                                              ColPartition *part) {\n  if (part->type() == PT_NOISE) {\n    return; // Noise is not allowed to partner anything.\n  }\n  const TBOX &box = part->bounding_box();\n  int left = part->median_left();\n  int right = part->median_right();\n  int width = right >= left ? right - left : -1;\n  int mid_x = (left + right) / 2;\n  ColPartitionGridSearch hsearch(this);\n  // Search left for neighbour to_the_left\n  hsearch.StartSideSearch(mid_x, box.bottom(), box.top());\n  ColPartition *neighbour;\n  ColPartition *best_neighbour = nullptr;\n  int best_dist = INT32_MAX;\n  while ((neighbour = hsearch.NextSideSearch(to_the_left)) != nullptr) {\n    if (neighbour == part || neighbour->type() == PT_NOISE) {\n      continue; // Noise is not allowed to partner anything.\n    }\n    int neighbour_left = neighbour->median_left();\n    int neighbour_right = neighbour->median_right();\n    int neighbour_x = (neighbour_left + neighbour_right) / 2;\n    if (to_the_left != (neighbour_x < mid_x)) {\n      continue;\n    }\n    if (!part->VOverlaps(*neighbour)) {\n      continue;\n    }\n    if (!part->TypesMatch(*neighbour)) {\n      continue; // Only match to other vertical text.\n    }\n    int dist = to_the_left ? 
left - neighbour_right : neighbour_left - right;\n    if (dist <= kMaxPartitionSpacing * width) {\n      if (dist < best_dist || best_neighbour == nullptr) {\n        best_dist = dist;\n        best_neighbour = neighbour;\n      }\n    } else {\n      break;\n    }\n  }\n  // For vertical partitions, the upper partner is to the left, and lower is\n  // to the right.\n  if (best_neighbour != nullptr) {\n    part->AddPartner(to_the_left, best_neighbour);\n  }\n}\n\n// For every ColPartition with multiple partners in the grid, reduces the\n// number of partners to 0 or 1. If get_desperate is true, goes to more\n// desperate merge methods to merge flowing text before breaking partnerships.\nvoid ColPartitionGrid::RefinePartitionPartners(bool get_desperate) {\n  ColPartitionGridSearch gsearch(this);\n  // Refine in type order so that chasing multiple partners can be done\n  // before eliminating type mis-matching partners.\n  for (int type = PT_UNKNOWN + 1; type <= PT_COUNT; type++) {\n    // Iterate the ColPartitions in the grid.\n    gsearch.StartFullSearch();\n    ColPartition *part;\n    while ((part = gsearch.NextFullSearch()) != nullptr) {\n      part->RefinePartners(static_cast<PolyBlockType>(type), get_desperate,\n                           this);\n      // Iterator may have been messed up by a merge.\n      gsearch.RepositionIterator();\n    }\n  }\n}\n\n// ========================== PRIVATE CODE ========================\n\n// Finds and returns a list of candidate ColPartitions to merge with part.\n// The candidates must overlap search_box, and when merged must not\n// overlap any other partitions that are not overlapped by each individually.\nvoid ColPartitionGrid::FindMergeCandidates(const ColPartition *part,\n                                           const TBOX &search_box, bool debug,\n                                           ColPartition_CLIST *candidates) {\n  int ok_overlap =\n      static_cast<int>(kTinyEnoughTextlineOverlapFraction * gridsize() + 
0.5);\n  const TBOX &part_box = part->bounding_box();\n  // Now run the rect search.\n  ColPartitionGridSearch rsearch(this);\n  rsearch.SetUniqueMode(true);\n  rsearch.StartRectSearch(search_box);\n  ColPartition *candidate;\n  while ((candidate = rsearch.NextRectSearch()) != nullptr) {\n    if (!OKMergeCandidate(part, candidate, debug)) {\n      continue;\n    }\n    const TBOX &c_box = candidate->bounding_box();\n    // Candidate seems to be a potential merge with part. If one contains\n    // the other, then the merge is a no-brainer. Otherwise, search the\n    // combined box to see if anything else is inappropriately overlapped.\n    if (!part_box.contains(c_box) && !c_box.contains(part_box)) {\n      // Search the combined rectangle to see if anything new is overlapped.\n      // This is a preliminary test designed to quickly weed-out poor\n      // merge candidates that would create a big list of overlapped objects\n      // for the squared-order overlap analysis. Eg. vertical and horizontal\n      // line-like objects that overlap real text when merged:\n      // || ==========================\n      // ||\n      // ||  r e a l  t e x t\n      // ||\n      // ||\n      TBOX merged_box(part_box);\n      merged_box += c_box;\n      ColPartitionGridSearch msearch(this);\n      msearch.SetUniqueMode(true);\n      msearch.StartRectSearch(merged_box);\n      ColPartition *neighbour;\n      while ((neighbour = msearch.NextRectSearch()) != nullptr) {\n        if (neighbour == part || neighbour == candidate) {\n          continue; // Ignore itself.\n        }\n        if (neighbour->OKMergeOverlap(*part, *candidate, ok_overlap, false)) {\n          continue; // This kind of merge overlap is OK.\n        }\n        TBOX n_box = neighbour->bounding_box();\n        // The overlap is OK if:\n        // * the n_box already overlapped the part or the candidate OR\n        // * the n_box is a suitable merge with either part or candidate\n        if 
(!n_box.overlap(part_box) && !n_box.overlap(c_box) &&\n            !OKMergeCandidate(part, neighbour, false) &&\n            !OKMergeCandidate(candidate, neighbour, false)) {\n          break;\n        }\n      }\n      if (neighbour != nullptr) {\n        if (debug) {\n          tprintf(\n              \"Combined box overlaps another that is not OK despite\"\n              \" allowance of %d:\",\n              ok_overlap);\n          neighbour->bounding_box().print();\n          tprintf(\"Reason:\");\n          OKMergeCandidate(part, neighbour, true);\n          tprintf(\"...and:\");\n          OKMergeCandidate(candidate, neighbour, true);\n          tprintf(\"Overlap:\");\n          neighbour->OKMergeOverlap(*part, *candidate, ok_overlap, true);\n        }\n        continue;\n      }\n    }\n    if (debug) {\n      tprintf(\"Adding candidate:\");\n      candidate->bounding_box().print();\n    }\n    // Unique elements as they arrive.\n    candidates->add_sorted(SortByBoxLeft<ColPartition>, true, candidate);\n  }\n}\n\n// Smoothes the region type/flow type of the given part by looking at local\n// neighbours and the given image mask. Searches a padded rectangle with the\n// padding truncated on one size of the part's box in turn for each side,\n// using the result (if any) that has the least distance to all neighbours\n// that contribute to the decision. 
This biases in favor of rectangular\n// regions without completely enforcing them.\n// If a good decision cannot be reached, the part is left unchanged.\n// im_box and rerotation are used to map blob coordinates onto the\n// nontext_map, which is used to prevent the spread of text neighbourhoods\n// into images.\n// Returns true if the partition was changed.\nbool ColPartitionGrid::SmoothRegionType(Image nontext_map, const TBOX &im_box,\n                                        const FCOORD &rerotation, bool debug,\n                                        ColPartition *part) {\n  const TBOX &part_box = part->bounding_box();\n  if (debug) {\n    tprintf(\"Smooothing part at:\");\n    part_box.print();\n  }\n  BlobRegionType best_type = BRT_UNKNOWN;\n  int best_dist = INT32_MAX;\n  int max_dist = std::min(part_box.width(), part_box.height());\n  max_dist = std::max(max_dist * kMaxNeighbourDistFactor, gridsize() * 2);\n  // Search with the pad truncated on each side of the box in turn.\n  bool any_image = false;\n  bool all_image = true;\n  for (int d = 0; d < BND_COUNT; ++d) {\n    int dist;\n    auto dir = static_cast<BlobNeighbourDir>(d);\n    BlobRegionType type = SmoothInOneDirection(dir, nontext_map, im_box,\n                                               rerotation, debug, *part, &dist);\n    if (debug) {\n      tprintf(\"Result in dir %d = %d at dist %d\\n\", dir, type, dist);\n    }\n    if (type != BRT_UNKNOWN && dist < best_dist) {\n      best_dist = dist;\n      best_type = type;\n    }\n    if (type == BRT_POLYIMAGE) {\n      any_image = true;\n    } else {\n      all_image = false;\n    }\n  }\n  if (best_dist > max_dist) {\n    return false; // Too far away to set the type with it.\n  }\n  if (part->flow() == BTFT_STRONG_CHAIN && !all_image) {\n    return false; // We are not modifying it.\n  }\n  BlobRegionType new_type = part->blob_type();\n  BlobTextFlowType new_flow = part->flow();\n  if (best_type == BRT_TEXT && !any_image) {\n    new_flow = 
BTFT_STRONG_CHAIN;\n    new_type = BRT_TEXT;\n  } else if (best_type == BRT_VERT_TEXT && !any_image) {\n    new_flow = BTFT_STRONG_CHAIN;\n    new_type = BRT_VERT_TEXT;\n  } else if (best_type == BRT_POLYIMAGE) {\n    new_flow = BTFT_NONTEXT;\n    new_type = BRT_UNKNOWN;\n  }\n  if (new_type != part->blob_type() || new_flow != part->flow()) {\n    part->set_flow(new_flow);\n    part->set_blob_type(new_type);\n    part->SetBlobTypes();\n    if (debug) {\n      tprintf(\"Modified part:\");\n      part->Print();\n    }\n    return true;\n  } else {\n    return false;\n  }\n}\n\n// Sets up a search box based on the part_box, padded in all directions\n// except direction. Also setup dist_scaling to weight x,y distances according\n// to the given direction.\nstatic void ComputeSearchBoxAndScaling(BlobNeighbourDir direction,\n                                       const TBOX &part_box, int min_padding,\n                                       TBOX *search_box, ICOORD *dist_scaling) {\n  *search_box = part_box;\n  // Generate a pad value based on the min dimension of part_box, but at least\n  // min_padding and then scaled by kMaxPadFactor.\n  int padding = std::min(part_box.height(), part_box.width());\n  padding = std::max(padding, min_padding);\n  padding *= kMaxPadFactor;\n  search_box->pad(padding, padding);\n  // Truncate the box in the appropriate direction and make the distance\n  // metric slightly biased in the truncated direction.\n  switch (direction) {\n    case BND_LEFT:\n      search_box->set_left(part_box.left());\n      *dist_scaling = ICOORD(2, 1);\n      break;\n    case BND_BELOW:\n      search_box->set_bottom(part_box.bottom());\n      *dist_scaling = ICOORD(1, 2);\n      break;\n    case BND_RIGHT:\n      search_box->set_right(part_box.right());\n      *dist_scaling = ICOORD(2, 1);\n      break;\n    case BND_ABOVE:\n      search_box->set_top(part_box.top());\n      *dist_scaling = ICOORD(1, 2);\n      break;\n    default:\n      ASSERT_HOST(false);\n  
}\n}\n\n// Local enum used by SmoothInOneDirection and AccumulatePartDistances\n// for the different types of partition neighbour.\nenum NeighbourPartitionType {\n  NPT_HTEXT,      // Definite horizontal text.\n  NPT_VTEXT,      // Definite vertical text.\n  NPT_WEAK_HTEXT, // Weakly horizontal text. Counts as HTEXT for HTEXT, but\n                  // image for image and VTEXT.\n  NPT_WEAK_VTEXT, // Weakly vertical text. Counts as VTEXT for VTEXT, but\n                  // image for image and HTEXT.\n  NPT_IMAGE,      // Defininte non-text.\n  NPT_COUNT       // Number of array elements.\n};\n\n// Executes the search for SmoothRegionType in a single direction.\n// Creates a bounding box that is padded in all directions except direction,\n// and searches it for other partitions. Finds the nearest collection of\n// partitions that makes a decisive result (if any) and returns the type\n// and the distance of the collection. If there are any pixels in the\n// nontext_map, then the decision is biased towards image.\nBlobRegionType ColPartitionGrid::SmoothInOneDirection(\n    BlobNeighbourDir direction, Image nontext_map, const TBOX &im_box,\n    const FCOORD &rerotation, bool debug, const ColPartition &part,\n    int *best_distance) {\n  // Set up a rectangle search bounded by the part.\n  const TBOX &part_box = part.bounding_box();\n  TBOX search_box;\n  ICOORD dist_scaling;\n  ComputeSearchBoxAndScaling(direction, part_box, gridsize(), &search_box,\n                             &dist_scaling);\n  bool image_region = ImageFind::CountPixelsInRotatedBox(\n                          search_box, im_box, rerotation, nontext_map) > 0;\n  std::vector<int> dists[NPT_COUNT];\n  AccumulatePartDistances(part, dist_scaling, search_box, nontext_map, im_box,\n                          rerotation, debug, dists);\n  // By iteratively including the next smallest distance across the vectors,\n  // (as in a merge sort) we can use the vector indices as counts of each type\n  // and find 
the nearest set of objects that give us a definite decision.\n  unsigned counts[NPT_COUNT];\n  memset(counts, 0, sizeof(counts));\n  // If there is image in the search box, tip the balance in image's favor.\n  int image_bias = image_region ? kSmoothDecisionMargin / 2 : 0;\n  BlobRegionType text_dir = part.blob_type();\n  BlobTextFlowType flow_type = part.flow();\n  int min_dist = 0;\n  do {\n    // Find the minimum new entry across the vectors\n    min_dist = INT32_MAX;\n    for (int i = 0; i < NPT_COUNT; ++i) {\n      if (counts[i] < dists[i].size() && dists[i][counts[i]] < min_dist) {\n        min_dist = dists[i][counts[i]];\n      }\n    }\n    // Step all the indices/counts forward to include min_dist.\n    for (int i = 0; i < NPT_COUNT; ++i) {\n      while (counts[i] < dists[i].size() && dists[i][counts[i]] <= min_dist) {\n        ++counts[i];\n      }\n    }\n    *best_distance = min_dist;\n    if (debug) {\n      tprintf(\"Totals: htext=%u+%u, vtext=%u+%u, image=%u+%u, at dist=%d\\n\",\n              counts[NPT_HTEXT], counts[NPT_WEAK_HTEXT], counts[NPT_VTEXT],\n              counts[NPT_WEAK_VTEXT], counts[NPT_IMAGE], image_bias, min_dist);\n    }\n    // See if we have a decision yet.\n    auto image_count = counts[NPT_IMAGE];\n    int htext_score = counts[NPT_HTEXT] + counts[NPT_WEAK_HTEXT] -\n                      (image_count + counts[NPT_WEAK_VTEXT]);\n    int vtext_score = counts[NPT_VTEXT] + counts[NPT_WEAK_VTEXT] -\n                      (image_count + counts[NPT_WEAK_HTEXT]);\n    if (image_count > 0 && image_bias - htext_score >= kSmoothDecisionMargin &&\n        image_bias - vtext_score >= kSmoothDecisionMargin) {\n      *best_distance = dists[NPT_IMAGE][0];\n      if (!dists[NPT_WEAK_VTEXT].empty() &&\n          *best_distance > dists[NPT_WEAK_VTEXT][0]) {\n        *best_distance = dists[NPT_WEAK_VTEXT][0];\n      }\n      if (!dists[NPT_WEAK_HTEXT].empty() &&\n          *best_distance > dists[NPT_WEAK_HTEXT][0]) {\n        *best_distance = 
dists[NPT_WEAK_HTEXT][0];\n      }\n      return BRT_POLYIMAGE;\n    }\n    if ((text_dir != BRT_VERT_TEXT || flow_type != BTFT_CHAIN) &&\n        counts[NPT_HTEXT] > 0 && htext_score >= kSmoothDecisionMargin) {\n      *best_distance = dists[NPT_HTEXT][0];\n      return BRT_TEXT;\n    } else if ((text_dir != BRT_TEXT || flow_type != BTFT_CHAIN) &&\n               counts[NPT_VTEXT] > 0 && vtext_score >= kSmoothDecisionMargin) {\n      *best_distance = dists[NPT_VTEXT][0];\n      return BRT_VERT_TEXT;\n    }\n  } while (min_dist < INT32_MAX);\n  return BRT_UNKNOWN;\n}\n\n// Counts the partitions in the given search_box by appending the gap\n// distance (scaled by dist_scaling) of the part from the base_part to the\n// vector of the appropriate type for the partition. Prior to return, the\n// vectors in the dists array are sorted in increasing order.\n// The nontext_map (+im_box, rerotation) is used to make text invisible if\n// there is non-text in between.\n// dists must be an array of vectors of size NPT_COUNT.\nvoid ColPartitionGrid::AccumulatePartDistances(\n    const ColPartition &base_part, const ICOORD &dist_scaling,\n    const TBOX &search_box, Image nontext_map, const TBOX &im_box,\n    const FCOORD &rerotation, bool debug, std::vector<int> *dists) {\n  const TBOX &part_box = base_part.bounding_box();\n  ColPartitionGridSearch rsearch(this);\n  rsearch.SetUniqueMode(true);\n  rsearch.StartRectSearch(search_box);\n  ColPartition *neighbour;\n  // Search for compatible neighbours with a similar strokewidth, but not\n  // on the other side of a tab vector.\n  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {\n    if (neighbour->IsUnMergeableType() ||\n        !base_part.ConfirmNoTabViolation(*neighbour) ||\n        neighbour == &base_part) {\n      continue;\n    }\n    TBOX nbox = neighbour->bounding_box();\n    BlobRegionType n_type = neighbour->blob_type();\n    if ((n_type == BRT_TEXT || n_type == BRT_VERT_TEXT) &&\n        
!ImageFind::BlankImageInBetween(part_box, nbox, im_box, rerotation,\n                                        nontext_map)) {\n      continue; // Text not visible the other side of image.\n    }\n    if (BLOBNBOX::IsLineType(n_type)) {\n      continue; // Don't use horizontal lines as neighbours.\n    }\n    int x_gap = std::max(part_box.x_gap(nbox), 0);\n    int y_gap = std::max(part_box.y_gap(nbox), 0);\n    int n_dist = x_gap * dist_scaling.x() + y_gap * dist_scaling.y();\n    if (debug) {\n      tprintf(\"Part has x-gap=%d, y=%d, dist=%d at:\", x_gap, y_gap, n_dist);\n      nbox.print();\n    }\n    // Truncate the number of boxes, so text doesn't get too much advantage.\n    int n_boxes = std::min(neighbour->boxes_count(), kSmoothDecisionMargin);\n    BlobTextFlowType n_flow = neighbour->flow();\n    std::vector<int> *count_vector = nullptr;\n    if (n_flow == BTFT_STRONG_CHAIN) {\n      if (n_type == BRT_TEXT) {\n        count_vector = &dists[NPT_HTEXT];\n      } else {\n        count_vector = &dists[NPT_VTEXT];\n      }\n      if (debug) {\n        tprintf(\"%s %d\\n\", n_type == BRT_TEXT ? 
\"Htext\" : \"Vtext\", n_boxes);\n      }\n    } else if ((n_type == BRT_TEXT || n_type == BRT_VERT_TEXT) &&\n               (n_flow == BTFT_CHAIN || n_flow == BTFT_NEIGHBOURS)) {\n      // Medium text counts as weak, and all else counts as image.\n      if (n_type == BRT_TEXT) {\n        count_vector = &dists[NPT_WEAK_HTEXT];\n      } else {\n        count_vector = &dists[NPT_WEAK_VTEXT];\n      }\n      if (debug) {\n        tprintf(\"Weak %d\\n\", n_boxes);\n      }\n    } else {\n      count_vector = &dists[NPT_IMAGE];\n      if (debug) {\n        tprintf(\"Image %d\\n\", n_boxes);\n      }\n    }\n    if (count_vector != nullptr) {\n      for (int i = 0; i < n_boxes; ++i) {\n        count_vector->push_back(n_dist);\n      }\n    }\n    if (debug) {\n      neighbour->Print();\n    }\n  }\n  for (int i = 0; i < NPT_COUNT; ++i) {\n    std::sort(dists[i].begin(), dists[i].end());\n  }\n}\n\n// Improves the margins of the part ColPartition by searching for\n// neighbours that vertically overlap significantly.\n// columns may be nullptr, and indicates the assigned column structure this\n// is applicable to part.\nvoid ColPartitionGrid::FindPartitionMargins(ColPartitionSet *columns,\n                                            ColPartition *part) {\n  // Set up a rectangle search x-bounded by the column and y by the part.\n  TBOX box = part->bounding_box();\n  int y = part->MidY();\n  // Initial left margin is based on the column, if there is one.\n  int left_margin = bleft().x();\n  int right_margin = tright().x();\n  if (columns != nullptr) {\n    ColPartition *column = columns->ColumnContaining(box.left(), y);\n    if (column != nullptr) {\n      left_margin = column->LeftAtY(y);\n    }\n    column = columns->ColumnContaining(box.right(), y);\n    if (column != nullptr) {\n      right_margin = column->RightAtY(y);\n    }\n  }\n  left_margin -= kColumnWidthFactor;\n  right_margin += kColumnWidthFactor;\n  // Search for ColPartitions that reduce the margin.\n  
left_margin = FindMargin(box.left() + box.height(), true, left_margin,\n                           box.bottom(), box.top(), part);\n  part->set_left_margin(left_margin);\n  // Search for ColPartitions that reduce the margin.\n  right_margin = FindMargin(box.right() - box.height(), false, right_margin,\n                            box.bottom(), box.top(), part);\n  part->set_right_margin(right_margin);\n}\n\n// Starting at x, and going in the specified direction, up to x_limit, finds\n// the margin for the given y range by searching sideways,\n// and ignoring not_this.\nint ColPartitionGrid::FindMargin(int x, bool right_to_left, int x_limit,\n                                 int y_bottom, int y_top,\n                                 const ColPartition *not_this) {\n  int height = y_top - y_bottom;\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch side_search(this);\n  side_search.SetUniqueMode(true);\n  side_search.StartSideSearch(x, y_bottom, y_top);\n  ColPartition *part;\n  while ((part = side_search.NextSideSearch(right_to_left)) != nullptr) {\n    // Ignore itself.\n    if (part == not_this) { // || part->IsLineType())\n      continue;\n    }\n    // Must overlap by enough, based on the min of the heights, so\n    // large partitions can't smash through small ones.\n    TBOX box = part->bounding_box();\n    int min_overlap = std::min(height, static_cast<int>(box.height()));\n    min_overlap = static_cast<int>(min_overlap * kMarginOverlapFraction + 0.5);\n    int y_overlap = std::min(y_top, static_cast<int>(box.top())) -\n                    std::max(y_bottom, static_cast<int>(box.bottom()));\n    if (y_overlap < min_overlap) {\n      continue;\n    }\n    // Must be going the right way.\n    int x_edge = right_to_left ? 
box.right() : box.left();\n    if ((x_edge < x) != right_to_left) {\n      continue;\n    }\n    // If we have gone past x_limit, then x_limit will do.\n    if ((x_edge < x_limit) == right_to_left) {\n      break;\n    }\n    // It reduces x limit, so save the new one.\n    x_limit = x_edge;\n  }\n  return x_limit;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/colpartitiongrid.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colpartitiongrid.h\n// Description: Class collecting code that acts on a BBGrid of ColPartitions.\n// Author:      Ray Smith\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_COLPARTITIONGRID_H_\n#define TESSERACT_TEXTORD_COLPARTITIONGRID_H_\n\n#include \"bbgrid.h\"\n#include \"colpartition.h\"\n#include \"colpartitionset.h\"\n\nnamespace tesseract {\n\nclass TabFind;\n\n// ColPartitionGrid is a BBGrid of ColPartition.\n// It collects functions that work on the grid.\nclass TESS_API ColPartitionGrid\n    : public BBGrid<ColPartition, ColPartition_CLIST, ColPartition_C_IT> {\npublic:\n  ColPartitionGrid() = default;\n  ColPartitionGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n\n  ~ColPartitionGrid() override = default;\n\n  // Handles a click event in a display window.\n  void HandleClick(int x, int y) override;\n\n  // Merges ColPartitions in the grid that look like they belong in the same\n  // textline.\n  // For all partitions in the grid, calls the box_cb permanent callback\n  // to compute the search box, searches the box, and if a candidate is found,\n  // calls the confirm_cb to check any more rules. 
If the confirm_cb returns\n  // true, then the partitions are merged.\n  // Both callbacks are deleted before returning.\n  void Merges(const std::function<bool(ColPartition *, TBOX *)> &box_cb,\n              const std::function<bool(const ColPartition *,\n                                       const ColPartition *)> &confirm_cb);\n\n  // For the given partition, calls the box_cb permanent callback\n  // to compute the search box, searches the box, and if a candidate is found,\n  // calls the confirm_cb to check any more rules. If the confirm_cb returns\n  // true, then the partitions are merged.\n  // Returns true if the partition is consumed by one or more merges.\n  bool MergePart(const std::function<bool(ColPartition *, TBOX *)> &box_cb,\n                 const std::function<bool(const ColPartition *,\n                                          const ColPartition *)> &confirm_cb,\n                 ColPartition *part);\n\n  // Computes and returns the total overlap of all partitions in the grid.\n  // If overlap_grid is non-null, it is filled with a grid that holds empty\n  // partitions representing the union of all overlapped partitions.\n  int ComputeTotalOverlap(ColPartitionGrid **overlap_grid);\n\n  // Finds all the ColPartitions in the grid that overlap with the given\n  // box and returns them SortByBoxLeft(ed) and uniqued in the given list.\n  // Any partition equal to not_this (may be nullptr) is excluded.\n  void FindOverlappingPartitions(const TBOX &box, const ColPartition *not_this,\n                                 ColPartition_CLIST *parts);\n\n  // Finds and returns the best candidate ColPartition to merge with part,\n  // selected from the candidates list, based on the minimum increase in\n  // pairwise overlap among all the partitions overlapped by the combined box.\n  // If overlap_increase is not nullptr then it returns the increase in overlap\n  // that would result from the merge.\n  // See colpartitiongrid.cpp for a diagram.\n  ColPartition 
*BestMergeCandidate(\n      const ColPartition *part, ColPartition_CLIST *candidates, bool debug,\n      const std::function<bool(const ColPartition *, const ColPartition *)>\n          &confirm_cb,\n      int *overlap_increase);\n\n  // Split partitions where it reduces overlap between their bounding boxes.\n  // ColPartitions are after all supposed to be a partitioning of the blobs\n  // AND of the space on the page!\n  // Blobs that cause overlaps get removed, put in individual partitions\n  // and added to the big_parts list. They are most likely characters on\n  // 2 textlines that touch, or something big like a dropcap.\n  void SplitOverlappingPartitions(ColPartition_LIST *big_parts);\n\n  // Filters partitions of source_type by looking at local neighbours.\n  // Where a majority of neighbours have a text type, the partitions are\n  // changed to text, where the neighbours have image type, they are changed\n  // to image, and partitions that have no definite neighbourhood type are\n  // left unchanged.\n  // im_box and rerotation are used to map blob coordinates onto the\n  // nontext_map, which is used to prevent the spread of text neighbourhoods\n  // into images.\n  // Returns true if anything was changed.\n  bool GridSmoothNeighbours(BlobTextFlowType source_type, Image nontext_map,\n                            const TBOX &im_box, const FCOORD &rerotation);\n\n  // Reflects the grid and its colpartitions in the y-axis, assuming that\n  // all blob boxes have already been done.\n  void ReflectInYAxis();\n\n  // Rotates the grid and its colpartitions by the given angle, assuming that\n  // all blob boxes have already been done.\n  void Deskew(const FCOORD &deskew);\n\n  // Transforms the grid of partitions to the output blocks, putting each\n  // partition into a separate block. 
We don't really care about the order,\n  // as we just want to get as much text as possible without trying to organize\n  // it into proper blocks or columns.\n  void ExtractPartitionsAsBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);\n\n  // Sets the left and right tabs of the partitions in the grid.\n  void SetTabStops(TabFind *tabgrid);\n\n  // Makes the ColPartSets and puts them in the PartSetVector ready\n  // for finding column bounds. Returns false if no partitions were found.\n  // Each ColPartition in the grid is placed in a single ColPartSet based\n  // on the bottom-left of its bounding box.\n  bool MakeColPartSets(PartSetVector *part_sets);\n\n  // Makes a single ColPartitionSet consisting of a single ColPartition that\n  // represents the total horizontal extent of the significant content on the\n  // page. Used for the single column setting in place of automatic detection.\n  // Returns nullptr if the page is empty of significant content.\n  ColPartitionSet *MakeSingleColumnSet(WidthCallback cb);\n\n  // Mark the BLOBNBOXes in each partition as being owned by that partition.\n  void ClaimBoxes();\n\n  // Retypes all the blobs referenced by the partitions in the grid.\n  // Image blobs are sliced on the grid boundaries to give the tab finder\n  // a better handle on the edges of the images, and the actual blobs are\n  // returned in the im_blobs list, as they are not owned by the block.\n  void ReTypeBlobs(BLOBNBOX_LIST *im_blobs);\n\n  // The boxes within the partitions have changed (by deskew) so recompute\n  // the bounds of all the partitions and reinsert them into the grid.\n  void RecomputeBounds(int gridsize, const ICOORD &bleft, const ICOORD &tright,\n                       const ICOORD &vertical);\n\n  // Improves the margins of the ColPartitions in the grid by calling\n  // FindPartitionMargins on each.\n  void GridFindMargins(ColPartitionSet **best_columns);\n\n  // Improves the margins of the ColPartitions in the list by calling\n  // 
FindPartitionMargins on each.\n  void ListFindMargins(ColPartitionSet **best_columns,\n                       ColPartition_LIST *parts);\n\n  // Deletes all the partitions in the grid after disowning all the blobs.\n  void DeleteParts();\n\n  // Deletes all the partitions in the grid that are of type BRT_UNKNOWN and\n  // all the blobs in them.\n  void DeleteUnknownParts(TO_BLOCK *block);\n\n  // Deletes all the partitions in the grid that are NOT of flow type\n  // BTFT_LEADER.\n  void DeleteNonLeaderParts();\n\n  // Finds and marks text partitions that represent figure captions.\n  void FindFigureCaptions();\n\n  //////// Functions that manipulate ColPartitions in the grid     ///////\n  //////// to find chains of partner partitions of the same type.  ///////\n  // For every ColPartition in the grid, finds its upper and lower neighbours.\n  void FindPartitionPartners();\n  // Finds the best partner in the given direction for the given partition.\n  // Stores the result with AddPartner.\n  void FindPartitionPartners(bool upper, ColPartition *part);\n  // Finds the best partner in the given direction for the given partition.\n  // Stores the result with AddPartner.\n  void FindVPartitionPartners(bool to_the_left, ColPartition *part);\n  // For every ColPartition with multiple partners in the grid, reduces the\n  // number of partners to 0 or 1. 
If get_desperate is true, goes to more\n  // desperate merge methods to merge flowing text before breaking partnerships.\n  void RefinePartitionPartners(bool get_desperate);\n\nprivate:\n  // Finds and returns a list of candidate ColPartitions to merge with part.\n  // The candidates must overlap search_box, and when merged must not\n  // overlap any other partitions that are not overlapped by each individually.\n  void FindMergeCandidates(const ColPartition *part, const TBOX &search_box,\n                           bool debug, ColPartition_CLIST *candidates);\n\n  // Smoothes the region type/flow type of the given part by looking at local\n  // neighbours and the given image mask. Searches a padded rectangle with the\n  // padding truncated on one size of the part's box in turn for each side,\n  // using the result (if any) that has the least distance to all neighbours\n  // that contribute to the decision. This biases in favor of rectangular\n  // regions without completely enforcing them.\n  // If a good decision cannot be reached, the part is left unchanged.\n  // im_box and rerotation are used to map blob coordinates onto the\n  // nontext_map, which is used to prevent the spread of text neighbourhoods\n  // into images.\n  // Returns true if the partition was changed.\n  bool SmoothRegionType(Image nontext_map, const TBOX &im_box,\n                        const FCOORD &rerotation, bool debug,\n                        ColPartition *part);\n  // Executes the search for SmoothRegionType in a single direction.\n  // Creates a bounding box that is padded in all directions except direction,\n  // and searches it for other partitions. Finds the nearest collection of\n  // partitions that makes a decisive result (if any) and returns the type\n  // and the distance of the collection. 
If there are any pixels in the\n  // nontext_map, then the decision is biased towards image.\n  BlobRegionType SmoothInOneDirection(BlobNeighbourDir direction,\n                                      Image nontext_map, const TBOX &im_box,\n                                      const FCOORD &rerotation, bool debug,\n                                      const ColPartition &part,\n                                      int *best_distance);\n  // Counts the partitions in the given search_box by appending the gap\n  // distance (scaled by dist_scaling) of the part from the base_part to the\n  // vector of the appropriate type for the partition. Prior to return, the\n  // vectors in the dists array are sorted in increasing order.\n  // dists must be an array of vectors of size NPT_COUNT.\n  void AccumulatePartDistances(const ColPartition &base_part,\n                               const ICOORD &dist_scaling,\n                               const TBOX &search_box, Image nontext_map,\n                               const TBOX &im_box, const FCOORD &rerotation,\n                               bool debug, std::vector<int> *dists);\n\n  // Improves the margins of the ColPartition by searching for\n  // neighbours that vertically overlap significantly.\n  void FindPartitionMargins(ColPartitionSet *columns, ColPartition *part);\n\n  // Starting at x, and going in the specified direction, up to x_limit, finds\n  // the margin for the given y range by searching sideways,\n  // and ignoring not_this.\n  int FindMargin(int x, bool right_to_left, int x_limit, int y_bottom,\n                 int y_top, const ColPartition *not_this);\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_COLPARTITIONGRID_H_\n"
  },
  {
    "path": "src/textord/colpartitionset.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colpartitionset.cpp\n// Description: Class to hold a list of ColPartitions of the page that\n//              correspond roughly to columns.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"colpartitionset.h\"\n#include \"tablefind.h\"\n#include \"workingpartset.h\"\n\nnamespace tesseract {\n\n// Minimum width of a column to be interesting as a multiple of resolution.\nconst double kMinColumnWidth = 2.0 / 3;\n\nColPartitionSet::ColPartitionSet(ColPartition_LIST *partitions) {\n  ColPartition_IT it(&parts_);\n  it.add_list_after(partitions);\n  ComputeCoverage();\n}\n\nColPartitionSet::ColPartitionSet(ColPartition *part) {\n  ColPartition_IT it(&parts_);\n  it.add_after_then_move(part);\n  ComputeCoverage();\n}\n\n// Returns the number of columns of good width.\nint ColPartitionSet::GoodColumnCount() const {\n  int num_good_cols = 0;\n  // This is a read-only iteration of the list.\n  ColPartition_IT it(const_cast<ColPartition_LIST *>(&parts_));\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    if (it.data()->good_width()) {\n      ++num_good_cols;\n    }\n  }\n  return num_good_cols;\n}\n\n// Return an element of the parts_ list 
from its index.\nColPartition *ColPartitionSet::GetColumnByIndex(int index) {\n  ColPartition_IT it(&parts_);\n  it.mark_cycle_pt();\n  for (int i = 0; i < index && !it.cycled_list(); ++i, it.forward()) {\n    ;\n  }\n  if (it.cycled_list()) {\n    return nullptr;\n  }\n  return it.data();\n}\n\n// Return the ColPartition that contains the given coords, if any, else nullptr.\nColPartition *ColPartitionSet::ColumnContaining(int x, int y) {\n  ColPartition_IT it(&parts_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    if (part->ColumnContains(x, y)) {\n      return part;\n    }\n  }\n  return nullptr;\n}\n\n// Extract all the parts from the list, relinquishing ownership.\nvoid ColPartitionSet::RelinquishParts() {\n  ColPartition_IT it(&parts_);\n  while (!it.empty()) {\n    it.extract();\n    it.forward();\n  }\n}\n\n// Attempt to improve this by adding partitions or expanding partitions.\nvoid ColPartitionSet::ImproveColumnCandidate(const WidthCallback &cb,\n                                             PartSetVector *src_sets) {\n  int set_size = src_sets->size();\n  // Iterate over the provided column sets, as each one may have something\n  // to improve this.\n  for (int i = 0; i < set_size; ++i) {\n    ColPartitionSet *column_set = src_sets->at(i);\n    if (column_set == nullptr) {\n      continue;\n    }\n    // Iterate over the parts in this and column_set, adding bigger or\n    // new parts in column_set to this.\n    ColPartition_IT part_it(&parts_);\n    ASSERT_HOST(!part_it.empty());\n    int prev_right = INT32_MIN;\n    part_it.mark_cycle_pt();\n    ColPartition_IT col_it(&column_set->parts_);\n    for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) {\n      ColPartition *col_part = col_it.data();\n      if (col_part->blob_type() < BRT_UNKNOWN) {\n        continue; // Ignore image partitions.\n      }\n      int col_left = col_part->left_key();\n      int col_right = 
col_part->right_key();\n      // Sync-up part_it (in this) so it matches the col_part in column_set.\n      ColPartition *part = part_it.data();\n      while (!part_it.at_last() && part->right_key() < col_left) {\n        prev_right = part->right_key();\n        part_it.forward();\n        part = part_it.data();\n      }\n      int part_left = part->left_key();\n      int part_right = part->right_key();\n      if (part_right < col_left || col_right < part_left) {\n        // There is no overlap so this is a new partition.\n        AddPartition(col_part->ShallowCopy(), &part_it);\n        continue;\n      }\n      // Check the edges of col_part to see if they can improve part.\n      bool part_width_ok = cb(part->KeyWidth(part_left, part_right));\n      if (col_left < part_left && col_left > prev_right) {\n        // The left edge of the column is better and it doesn't overlap,\n        // so we can potentially expand it.\n        int col_box_left = col_part->BoxLeftKey();\n        bool tab_width_ok = cb(part->KeyWidth(col_left, part_right));\n        bool box_width_ok = cb(part->KeyWidth(col_box_left, part_right));\n        if (tab_width_ok || (!part_width_ok)) {\n          // The tab is leaving the good column metric at least as good as\n          // it was before, so use the tab.\n          part->CopyLeftTab(*col_part, false);\n          part->SetColumnGoodness(cb);\n        } else if (col_box_left < part_left &&\n                   (box_width_ok || !part_width_ok)) {\n          // The box is leaving the good column metric at least as good as\n          // it was before, so use the box.\n          part->CopyLeftTab(*col_part, true);\n          part->SetColumnGoodness(cb);\n        }\n        part_left = part->left_key();\n      }\n      if (col_right > part_right &&\n          (part_it.at_last() ||\n           part_it.data_relative(1)->left_key() > col_right)) {\n        // The right edge is better, so we can possibly expand it.\n        int col_box_right = 
col_part->BoxRightKey();\n        bool tab_width_ok = cb(part->KeyWidth(part_left, col_right));\n        bool box_width_ok = cb(part->KeyWidth(part_left, col_box_right));\n        if (tab_width_ok || (!part_width_ok)) {\n          // The tab is leaving the good column metric at least as good as\n          // it was before, so use the tab.\n          part->CopyRightTab(*col_part, false);\n          part->SetColumnGoodness(cb);\n        } else if (col_box_right > part_right &&\n                   (box_width_ok || !part_width_ok)) {\n          // The box is leaving the good column metric at least as good as\n          // it was before, so use the box.\n          part->CopyRightTab(*col_part, true);\n          part->SetColumnGoodness(cb);\n        }\n      }\n    }\n  }\n  ComputeCoverage();\n}\n\n// If this set is good enough to represent a new partitioning into columns,\n// add it to the vector of sets, otherwise delete it.\nvoid ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets,\n                                              const WidthCallback &cb) {\n  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),\n                                         bounding_box_.bottom());\n  if (debug) {\n    tprintf(\"Considering new column candidate:\\n\");\n    Print();\n  }\n  if (!LegalColumnCandidate()) {\n    if (debug) {\n      tprintf(\"Not a legal column candidate:\\n\");\n      Print();\n    }\n    delete this;\n    return;\n  }\n  for (unsigned i = 0; i < column_sets->size(); ++i) {\n    ColPartitionSet *columns = column_sets->at(i);\n    // In ordering the column set candidates, good_coverage_ is king,\n    // followed by good_column_count_ and then bad_coverage_.\n    bool better = good_coverage_ > columns->good_coverage_;\n    if (good_coverage_ == columns->good_coverage_) {\n      better = good_column_count_ > columns->good_column_count_;\n      if (good_column_count_ == columns->good_column_count_) {\n        better = bad_coverage_ > 
columns->bad_coverage_;\n      }\n    }\n    if (better) {\n      // The new one is better so add it.\n      if (debug) {\n        tprintf(\"Good one\\n\");\n      }\n      column_sets->insert(column_sets->begin() + i, this);\n      return;\n    }\n    if (columns->CompatibleColumns(false, this, cb)) {\n      if (debug) {\n        tprintf(\"Duplicate\\n\");\n      }\n      delete this;\n      return; // It is not unique.\n    }\n  }\n  if (debug) {\n    tprintf(\"Added to end\\n\");\n  }\n  column_sets->push_back(this);\n}\n\n// Return true if the partitions in other are all compatible with the columns\n// in this.\nbool ColPartitionSet::CompatibleColumns(bool debug, ColPartitionSet *other,\n                                        const WidthCallback &cb) {\n  if (debug) {\n    tprintf(\"CompatibleColumns testing compatibility\\n\");\n    Print();\n    other->Print();\n  }\n  if (other->parts_.empty()) {\n    if (debug) {\n      tprintf(\"CompatibleColumns true due to empty other\\n\");\n    }\n    return true;\n  }\n  ColPartition_IT it(&other->parts_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    if (part->blob_type() < BRT_UNKNOWN) {\n      if (debug) {\n        tprintf(\"CompatibleColumns ignoring image partition\\n\");\n        part->Print();\n      }\n      continue; // Image partitions are irrelevant to column compatibility.\n    }\n    int y = part->MidY();\n    int left = part->bounding_box().left();\n    int right = part->bounding_box().right();\n    ColPartition *left_col = ColumnContaining(left, y);\n    ColPartition *right_col = ColumnContaining(right, y);\n    if (right_col == nullptr || left_col == nullptr) {\n      if (debug) {\n        tprintf(\"CompatibleColumns false due to partition edge outside\\n\");\n        part->Print();\n      }\n      return false; // A partition edge lies outside of all columns\n    }\n    if (right_col != left_col && cb(right - left)) {\n      if (debug) {\n    
    tprintf(\"CompatibleColumns false due to good width in multiple cols\\n\");\n        part->Print();\n      }\n      return false; // Partition with a good width must be in a single column.\n    }\n\n    ColPartition_IT it2 = it;\n    while (!it2.at_last()) {\n      it2.forward();\n      ColPartition *next_part = it2.data();\n      if (!BLOBNBOX::IsTextType(next_part->blob_type())) {\n        continue; // Non-text partitions are irrelevant.\n      }\n      int next_left = next_part->bounding_box().left();\n      if (next_left == right) {\n        break; // They share the same edge, so one must be a pull-out.\n      }\n      // Search to see if right and next_left fall within a single column.\n      ColPartition *next_left_col = ColumnContaining(next_left, y);\n      if (right_col == next_left_col) {\n        // There is a column break in this column.\n        // This can be due to a figure caption within a column, a pull-out\n        // block, or a simple broken textline that remains to be merged:\n        // all allowed, or a change in column layout: not allowed.\n        // If both partitions are of good width, then it is likely\n        // a change in column layout, otherwise probably an allowed situation.\n        if (part->good_width() && next_part->good_width()) {\n          if (debug) {\n            int next_right = next_part->bounding_box().right();\n            tprintf(\"CompatibleColumns false due to 2 parts of good width\\n\");\n            tprintf(\"part1 %d-%d, part2 %d-%d\\n\", left, right, next_left,\n                    next_right);\n            right_col->Print();\n          }\n          return false;\n        }\n      }\n      break;\n    }\n  }\n  if (debug) {\n    tprintf(\"CompatibleColumns true!\\n\");\n  }\n  return true;\n}\n\n// Returns the total width of all blobs in the part_set that do not lie\n// within an approved column. 
Used as a cost measure for using this\n// column set over another that might be compatible.\nint ColPartitionSet::UnmatchedWidth(ColPartitionSet *part_set) {\n  int total_width = 0;\n  ColPartition_IT it(&part_set->parts_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    if (!BLOBNBOX::IsTextType(part->blob_type())) {\n      continue; // Non-text partitions are irrelevant to column compatibility.\n    }\n    int y = part->MidY();\n    BLOBNBOX_C_IT box_it(part->boxes());\n    for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {\n      const TBOX &box = it.data()->bounding_box();\n      // Assume that the whole blob is outside any column iff its x-middle\n      // is outside.\n      int x = (box.left() + box.right()) / 2;\n      ColPartition *col = ColumnContaining(x, y);\n      if (col == nullptr) {\n        total_width += box.width();\n      }\n    }\n  }\n  return total_width;\n}\n\n// Return true if this ColPartitionSet makes a legal column candidate by\n// having legal individual partitions and non-overlapping adjacent pairs.\nbool ColPartitionSet::LegalColumnCandidate() {\n  ColPartition_IT it(&parts_);\n  if (it.empty()) {\n    return false;\n  }\n  bool any_text_parts = false;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    if (BLOBNBOX::IsTextType(part->blob_type())) {\n      if (!part->IsLegal()) {\n        return false; // Individual partition is illegal.\n      }\n      any_text_parts = true;\n    }\n    if (!it.at_last()) {\n      ColPartition *next_part = it.data_relative(1);\n      if (next_part->left_key() < part->right_key()) {\n        return false;\n      }\n    }\n  }\n  return any_text_parts;\n}\n\n// Return a copy of this. 
If good_only will only copy the Good ColPartitions.\nColPartitionSet *ColPartitionSet::Copy(bool good_only) {\n  ColPartition_LIST copy_parts;\n  ColPartition_IT src_it(&parts_);\n  ColPartition_IT dest_it(&copy_parts);\n  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {\n    ColPartition *part = src_it.data();\n    if (BLOBNBOX::IsTextType(part->blob_type()) &&\n        (!good_only || part->good_width() || part->good_column())) {\n      dest_it.add_after_then_move(part->ShallowCopy());\n    }\n  }\n  if (dest_it.empty()) {\n    return nullptr;\n  }\n  return new ColPartitionSet(&copy_parts);\n}\n\n// Return the bounding boxes of columns at the given y-range\nvoid ColPartitionSet::GetColumnBoxes(int y_bottom, int y_top,\n                                     ColSegment_LIST *segments) {\n  ColPartition_IT it(&parts_);\n  ColSegment_IT col_it(segments);\n  col_it.move_to_last();\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    ICOORD bot_left(part->LeftAtY(y_top), y_bottom);\n    ICOORD top_right(part->RightAtY(y_bottom), y_top);\n    auto *col_seg = new ColSegment();\n    col_seg->InsertBox(TBOX(bot_left, top_right));\n    col_it.add_after_then_move(col_seg);\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Display the edges of the columns at the given y coords.\nvoid ColPartitionSet::DisplayColumnEdges(int y_bottom, int y_top,\n                                         ScrollView *win) {\n  ColPartition_IT it(&parts_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    win->Line(part->LeftAtY(y_top), y_top, part->LeftAtY(y_bottom), y_bottom);\n    win->Line(part->RightAtY(y_top), y_top, part->RightAtY(y_bottom), y_bottom);\n  }\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Return the ColumnSpanningType that best explains the columns overlapped\n// by the given coords(left,right,y), with the given margins.\n// Also return the first and last column index touched by the coords and\n// the leftmost spanned column.\n// Column indices are 2n + 1 for real columns (0 based) and even values\n// represent the gaps in between columns, with 0 being left of the leftmost.\n// resolution refers to the ppi resolution of the image.\nColumnSpanningType ColPartitionSet::SpanningType(\n    int resolution, int left, int right, int height, int y, int left_margin,\n    int right_margin, int *first_col, int *last_col, int *first_spanned_col) {\n  *first_col = -1;\n  *last_col = -1;\n  *first_spanned_col = -1;\n  int margin_columns = 0;\n  ColPartition_IT it(&parts_);\n  int col_index = 1;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), col_index += 2) {\n    ColPartition *part = it.data();\n    if (part->ColumnContains(left, y) ||\n        (it.at_first() && part->ColumnContains(left + height, y))) {\n      // In the default case, first_col is set, but columns_spanned remains\n      // zero, so first_col will get reset in the first column genuinely\n      // spanned, but we can tell the difference from a noise partition\n      // that touches no column.\n      *first_col = col_index;\n      if (part->ColumnContains(right, y) ||\n          (it.at_last() && part->ColumnContains(right - height, y))) {\n        // Both within a single column.\n        *last_col = col_index;\n        return CST_FLOWING;\n      }\n      if (left_margin <= part->LeftAtY(y)) {\n        // It completely spans this column.\n        *first_spanned_col = col_index;\n        margin_columns = 1;\n      }\n    } else if (part->ColumnContains(right, y) ||\n               (it.at_last() && part->ColumnContains(right - height, y))) {\n      if (*first_col < 0) {\n        // It started in-between.\n        *first_col = col_index - 1;\n      }\n      if (right_margin >= part->RightAtY(y)) {\n        // It completely spans this column.\n        if (margin_columns == 0) {\n          *first_spanned_col = col_index;\n        }\n        ++margin_columns;\n      }\n      *last_col = col_index;\n      break;\n    } else if (left < part->LeftAtY(y) && right > part->RightAtY(y)) {\n      // Neither left nor right are contained within, so it spans this\n      // column.\n      if (*first_col < 0) {\n        // It started in between the previous column and the current column.\n        *first_col = col_index - 1;\n      }\n      if (margin_columns == 0) {\n        *first_spanned_col = col_index;\n      }\n      *last_col = col_index;\n    } else if (right < part->LeftAtY(y)) {\n      // We have gone past the end.\n      *last_col = col_index - 1;\n      if (*first_col < 0) {\n        // It must lie completely between columns =>noise.\n        *first_col = col_index - 1;\n      }\n      break;\n    }\n  }\n  if (*first_col < 0) {\n    *first_col = col_index - 1; // The last in-between.\n  }\n  if (*last_col < 0) {\n    *last_col = col_index - 1; // The last in-between.\n  }\n  ASSERT_HOST(*first_col >= 0 && *last_col >= 0);\n  ASSERT_HOST(*first_col <= *last_col);\n  if (*first_col == *last_col && right - left < kMinColumnWidth * resolution) {\n    // Neither end was in a column, and it didn't span any, so it lies\n    // entirely between columns, therefore noise.\n    return CST_NOISE;\n  } else if (margin_columns <= 1) {\n    // An exception for headings that stick outside of single-column text.\n    if (margin_columns == 1 && parts_.singleton()) {\n      return CST_HEADING;\n    }\n    // It is a pullout, as left and right were not in the same column, but\n    // it doesn't go to the edge of its start and end.\n    return CST_PULLOUT;\n  }\n  // Its margins went to the edges of first and last columns => heading.\n  return CST_HEADING;\n}\n\n// The column_set has changed. Close down all in-progress WorkingPartSets in\n// columns that do not match and start new ones for the new columns in this.\n// As ColPartitions are turned into BLOCKs, the used ones are put in\n// used_parts, as they still need to be referenced in the grid.\nvoid ColPartitionSet::ChangeWorkColumns(const ICOORD &bleft,\n                                        const ICOORD &tright, int resolution,\n                                        ColPartition_LIST *used_parts,\n                                        WorkingPartSet_LIST *working_set_list) {\n  // Move the input list to a temporary location so we can delete its elements\n  // as we add them to the output working_set.\n  WorkingPartSet_LIST work_src;\n  WorkingPartSet_IT src_it(&work_src);\n  src_it.add_list_after(working_set_list);\n  src_it.move_to_first();\n  WorkingPartSet_IT dest_it(working_set_list);\n  // Completed blocks and to_blocks are accumulated and given to the first new\n  // one  whenever we keep a column, or at the end.\n  BLOCK_LIST completed_blocks;\n  TO_BLOCK_LIST to_blocks;\n  WorkingPartSet *first_new_set = nullptr;\n  WorkingPartSet *working_set = nullptr;\n  ColPartition_IT col_it(&parts_);\n  for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) {\n    ColPartition *column = col_it.data();\n    // Any existing column to the left of column is completed.\n    while (!src_it.empty() &&\n           ((working_set = src_it.data())->column() == nullptr ||\n            working_set->column()->right_key() <= column->left_key())) {\n      src_it.extract();\n      working_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,\n                                          &completed_blocks, &to_blocks);\n      delete working_set;\n      src_it.forward();\n    }\n    // Make a new between-column WorkingSet for before the current column.\n    working_set = new WorkingPartSet(nullptr);\n    dest_it.add_after_then_move(working_set);\n    if (first_new_set == nullptr) {\n      first_new_set = working_set;\n    }\n    // A matching column gets to stay, and first_new_set gets all the\n    // completed_sets.\n    working_set = src_it.empty() ? nullptr : src_it.data();\n    if (working_set != nullptr &&\n        working_set->column()->MatchingColumns(*column)) {\n      working_set->set_column(column);\n      dest_it.add_after_then_move(src_it.extract());\n      src_it.forward();\n      first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);\n      first_new_set = nullptr;\n    } else {\n      // Just make a new working set for the current column.\n      working_set = new WorkingPartSet(column);\n      dest_it.add_after_then_move(working_set);\n    }\n  }\n  // Complete any remaining src working sets.\n  while (!src_it.empty()) {\n    working_set = src_it.extract();\n    working_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,\n                                        &completed_blocks, &to_blocks);\n    delete working_set;\n    src_it.forward();\n  }\n  // Make a new between-column WorkingSet for after the last column.\n  working_set = new WorkingPartSet(nullptr);\n  dest_it.add_after_then_move(working_set);\n  if (first_new_set == nullptr) {\n    first_new_set = working_set;\n  }\n  // The first_new_set now gets any accumulated completed_parts/blocks.\n  first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);\n}\n\n// Accumulate the widths and gaps into the given variables.\nvoid ColPartitionSet::AccumulateColumnWidthsAndGaps(int *total_width,\n                                                    int *width_samples,\n                                                    int *total_gap,\n                                                    int *gap_samples) {\n  ColPartition_IT it(&parts_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    *total_width += part->ColumnWidth();\n    ++*width_samples;\n    if (!it.at_last()) {\n      ColPartition *next_part = it.data_relative(1);\n      int part_left = part->right_key();\n      int part_right = next_part->left_key();\n      int gap = part->KeyWidth(part_left, part_right);\n      *total_gap += gap;\n      ++*gap_samples;\n    }\n  }\n}\n\n// Provide debug output for this ColPartitionSet and all the ColPartitions.\nvoid ColPartitionSet::Print() {\n  ColPartition_IT it(&parts_);\n  tprintf(\n      \"Partition set of %d parts, %d good, coverage=%d+%d\"\n      \" (%d,%d)->(%d,%d)\\n\",\n      it.length(), good_column_count_, good_coverage_, bad_coverage_,\n      bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(),\n      bounding_box_.top());\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    part->Print();\n  }\n}\n\n// PRIVATE CODE.\n\n// Add the given partition to the list in the appropriate place.\nvoid ColPartitionSet::AddPartition(ColPartition *new_part,\n                                   ColPartition_IT *it) {\n  AddPartitionCoverageAndBox(*new_part);\n  int new_right = new_part->right_key();\n  if (it->data()->left_key() >= new_right) {\n    it->add_before_stay_put(new_part);\n  } else {\n    it->add_after_stay_put(new_part);\n  }\n}\n\n// Compute the coverage and good column count. Coverage is the amount of the\n// width of the page (in pixels) that is covered by ColPartitions, which are\n// used to provide candidate column layouts.\n// Coverage is split into good and bad. Good coverage is provided by\n// ColPartitions of a frequent width (according to the callback function\n// provided by TabFinder::WidthCB, which accesses stored statistics on the\n// widths of ColPartitions) and bad coverage is provided by all other\n// ColPartitions, even if they have tab vectors at both sides. Thus:\n// |-----------------------------------------------------------------|\n// |        Double     width    heading                              |\n// |-----------------------------------------------------------------|\n// |-------------------------------| |-------------------------------|\n// |   Common width ColPartition   | |  Common width ColPartition    |\n// |-------------------------------| |-------------------------------|\n// the layout with two common-width columns has better coverage than the\n// double width heading, because the coverage is \"good,\" even though less in\n// total coverage than the heading, because the heading coverage is \"bad.\"\nvoid ColPartitionSet::ComputeCoverage() {\n  // Count the number of good columns and sum their width.\n  ColPartition_IT it(&parts_);\n  good_column_count_ = 0;\n  good_coverage_ = 0;\n  bad_coverage_ = 0;\n  bounding_box_ = TBOX();\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColPartition *part = it.data();\n    AddPartitionCoverageAndBox(*part);\n  }\n}\n\n// Adds the coverage, column count and box for a single partition,\n// without adding it to the list. (Helper factored from ComputeCoverage.)\nvoid ColPartitionSet::AddPartitionCoverageAndBox(const ColPartition &part) {\n  bounding_box_ += part.bounding_box();\n  int coverage = part.ColumnWidth();\n  if (part.good_width()) {\n    good_coverage_ += coverage;\n    good_column_count_ += 2;\n  } else {\n    if (part.blob_type() < BRT_UNKNOWN) {\n      coverage /= 2;\n    }\n    if (part.good_column()) {\n      ++good_column_count_;\n    }\n    bad_coverage_ += coverage;\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/colpartitionset.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        colpartitionset.h\n// Description: Class to hold a list of ColPartitions of the page that\n//              correspond roughly to columns.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H_\n#define TESSERACT_TEXTORD_COLPARTITIONSET_H_\n\n#include \"colpartition.h\" // For ColPartition_LIST.\n#include \"rect.h\"         // For TBOX.\n#include \"tabvector.h\"    // For BLOBNBOX_CLIST.\n\nnamespace tesseract {\n\nclass WorkingPartSet_LIST;\nclass ColSegment_LIST;\nclass ColPartitionSet;\nusing PartSetVector = std::vector<ColPartitionSet *>;\n\n// ColPartitionSet is a class that holds a list of ColPartitions.\n// Its main use is in holding a candidate partitioning of the width of the\n// image into columns, where each member ColPartition is a single column.\n// ColPartitionSets are used in building the column layout of a page.\nclass ColPartitionSet : public ELIST<ColPartitionSet>::LINK {\npublic:\n  ColPartitionSet() = default;\n  explicit ColPartitionSet(ColPartition_LIST *partitions);\n  explicit ColPartitionSet(ColPartition *partition);\n\n  ~ColPartitionSet() = default;\n\n  // Simple accessors.\n  const TBOX &bounding_box() const {\n    return bounding_box_;\n  }\n  bool 
Empty() const {\n    return parts_.empty();\n  }\n  int ColumnCount() const {\n    return parts_.length();\n  }\n\n  // Returns the number of columns of good width.\n  int GoodColumnCount() const;\n\n  // Return an element of the parts_ list from its index.\n  ColPartition *GetColumnByIndex(int index);\n\n  // Return the ColPartition that contains the given coords, if any, else\n  // nullptr.\n  ColPartition *ColumnContaining(int x, int y);\n\n  // Return the bounding boxes of columns at the given y-range\n  void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments);\n\n  // Extract all the parts from the list, relinquishing ownership.\n  void RelinquishParts();\n\n  // Attempt to improve this by adding partitions or expanding partitions.\n  void ImproveColumnCandidate(const WidthCallback &cb, PartSetVector *src_sets);\n\n  // If this set is good enough to represent a new partitioning into columns,\n  // add it to the vector of sets, otherwise delete it.\n  void AddToColumnSetsIfUnique(PartSetVector *column_sets,\n                               const WidthCallback &cb);\n\n  // Return true if the partitions in other are all compatible with the columns\n  // in this.\n  bool CompatibleColumns(bool debug, ColPartitionSet *other,\n                         const WidthCallback &cb);\n\n  // Returns the total width of all blobs in the part_set that do not lie\n  // within an approved column. Used as a cost measure for using this\n  // column set over another that might be compatible.\n  int UnmatchedWidth(ColPartitionSet *part_set);\n\n  // Return true if this ColPartitionSet makes a legal column candidate by\n  // having legal individual partitions and non-overlapping adjacent pairs.\n  bool LegalColumnCandidate();\n\n  // Return a copy of this. 
If good_only will only copy the Good ColPartitions.\n  ColPartitionSet *Copy(bool good_only);\n\n  // Display the edges of the columns at the given y coords.\n  void DisplayColumnEdges(int y_bottom, int y_top, ScrollView *win);\n\n  // Return the ColumnSpanningType that best explains the columns overlapped\n  // by the given coords(left,right,y), with the given margins.\n  // Also return the first and last column index touched by the coords and\n  // the leftmost spanned column.\n  // Column indices are 2n + 1 for real columns (0 based) and even values\n  // represent the gaps in between columns, with 0 being left of the leftmost.\n  // resolution refers to the ppi resolution of the image. It may be 0 if only\n  // the first_col and last_col are required.\n  ColumnSpanningType SpanningType(int resolution, int left, int right,\n                                  int height, int y, int left_margin,\n                                  int right_margin, int *first_col,\n                                  int *last_col, int *first_spanned_col);\n\n  // The column_set has changed. 
Close down all in-progress WorkingPartSets in\n  // columns that do not match and start new ones for the new columns in this.\n  // As ColPartitions are turned into BLOCKs, the used ones are put in\n  // used_parts, as they still need to be referenced in the grid.\n  void ChangeWorkColumns(const ICOORD &bleft, const ICOORD &tright,\n                         int resolution, ColPartition_LIST *used_parts,\n                         WorkingPartSet_LIST *working_set);\n\n  // Accumulate the widths and gaps into the given variables.\n  void AccumulateColumnWidthsAndGaps(int *total_width, int *width_samples,\n                                     int *total_gap, int *gap_samples);\n\n  // Provide debug output for this ColPartitionSet and all the ColPartitions.\n  void Print();\n\nprivate:\n  // Add the given partition to the list in the appropriate place.\n  void AddPartition(ColPartition *new_part, ColPartition_IT *it);\n\n  // Compute the coverage and good column count. Coverage is the amount of the\n  // width of the page (in pixels) that is covered by ColPartitions, which are\n  // used to provide candidate column layouts.\n  // Coverage is split into good and bad. Good coverage is provided by\n  // ColPartitions of a frequent width (according to the callback function\n  // provided by TabFinder::WidthCB, which accesses stored statistics on the\n  // widths of ColPartitions) and bad coverage is provided by all other\n  // ColPartitions, even if they have tab vectors at both sides. 
Thus:\n  // |-----------------------------------------------------------------|\n  // |        Double     width    heading                              |\n  // |-----------------------------------------------------------------|\n  // |-------------------------------| |-------------------------------|\n  // |   Common width ColPartition   | |  Common width ColPartition    |\n  // |-------------------------------| |-------------------------------|\n  // the layout with two common-width columns has better coverage than the\n  // double width heading, because the coverage is \"good,\" even though less in\n  // total coverage than the heading, because the heading coverage is \"bad.\"\n  void ComputeCoverage();\n\n  // Adds the coverage, column count and box for a single partition,\n  // without adding it to the list. (Helper factored from ComputeCoverage.)\n  void AddPartitionCoverageAndBox(const ColPartition &part);\n\n  // The partitions in this column candidate.\n  ColPartition_LIST parts_;\n  // The number of partitions that have a frequent column width.\n  int good_column_count_;\n  // Total width of all the good ColPartitions.\n  int good_coverage_;\n  // Total width of all the bad ColPartitions.\n  int bad_coverage_;\n  // Bounding box of all partitions in the set.\n  TBOX bounding_box_;\n};\n\nELISTIZEH(ColPartitionSet)\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_COLPARTITION_H_\n"
  },
  {
    "path": "src/textord/devanagari_processing.cpp",
    "content": "/**********************************************************************\n * File:        devanagari_processing.cpp\n * Description: Methods to process images containing devanagari symbols,\n *              prior to classification.\n * Author:      Shobhit Saxena\n *\n * (C) Copyright 2008, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"devanagari_processing.h\"\n\n#include \"debugpixa.h\"\n#include \"statistc.h\"\n#include \"tordmain.h\"\n\n#include <allheaders.h>\n\nnamespace tesseract {\n\n// Flags controlling the debugging information for shiro-rekha splitting\n// strategies.\nINT_VAR(devanagari_split_debuglevel, 0, \"Debug level for split shiro-rekha process.\");\n\nBOOL_VAR(devanagari_split_debugimage, 0,\n         \"Whether to create a debug image for split shiro-rekha process.\");\n\nShiroRekhaSplitter::ShiroRekhaSplitter() :\n  orig_pix_(nullptr),\n  splitted_image_(nullptr),\n  pageseg_split_strategy_(NO_SPLIT),\n  ocr_split_strategy_(NO_SPLIT),\n  debug_image_(nullptr),\n  segmentation_block_list_(nullptr),\n  global_xheight_(kUnspecifiedXheight),\n  perform_close_(false)\n{\n}\n\nShiroRekhaSplitter::~ShiroRekhaSplitter() {\n  Clear();\n}\n\nvoid ShiroRekhaSplitter::Clear() {\n  orig_pix_.destroy();\n  splitted_image_.destroy();\n  pageseg_split_strategy_ = 
NO_SPLIT;\n  ocr_split_strategy_ = NO_SPLIT;\n  debug_image_.destroy();\n  segmentation_block_list_ = nullptr;\n  global_xheight_ = kUnspecifiedXheight;\n  perform_close_ = false;\n}\n\n// On setting the input image, a clone of it is owned by this class.\nvoid ShiroRekhaSplitter::set_orig_pix(Image pix) {\n  if (orig_pix_) {\n    orig_pix_.destroy();\n  }\n  orig_pix_ = pix.clone();\n}\n\n// Top-level method to perform splitting based on current settings.\n// Returns true if a split was actually performed.\n// split_for_pageseg should be true if the splitting is being done prior to\n// page segmentation. This mode uses the flag\n// pageseg_devanagari_split_strategy to determine the splitting strategy.\nbool ShiroRekhaSplitter::Split(bool split_for_pageseg, DebugPixa *pixa_debug) {\n  SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ : ocr_split_strategy_;\n  if (split_strategy == NO_SPLIT) {\n    return false; // Nothing to do.\n  }\n  ASSERT_HOST(split_strategy == MINIMAL_SPLIT || split_strategy == MAXIMAL_SPLIT);\n  ASSERT_HOST(orig_pix_);\n  if (devanagari_split_debuglevel > 0) {\n    tprintf(\"Splitting shiro-rekha ...\\n\");\n    tprintf(\"Split strategy = %s\\n\", split_strategy == MINIMAL_SPLIT ? \"Minimal\" : \"Maximal\");\n    tprintf(\"Initial pageseg available = %s\\n\", segmentation_block_list_ ? \"yes\" : \"no\");\n  }\n  // Create a copy of original image to store the splitting output.\n  splitted_image_.destroy();\n  splitted_image_ = orig_pix_.copy();\n\n  // Initialize debug image if required.\n  if (devanagari_split_debugimage) {\n    debug_image_.destroy();\n    debug_image_ = pixConvertTo32(orig_pix_);\n  }\n\n  // Determine all connected components in the input image. 
A close operation\n  // may be required prior to this, depending on the current settings.\n  Image pix_for_ccs = orig_pix_.clone();\n  if (perform_close_ && global_xheight_ != kUnspecifiedXheight && !segmentation_block_list_) {\n    if (devanagari_split_debuglevel > 0) {\n      tprintf(\"Performing a global close operation..\\n\");\n    }\n    // A global measure is available for xheight, but no local information\n    // exists.\n    pix_for_ccs.destroy();\n    pix_for_ccs = orig_pix_.copy();\n    PerformClose(pix_for_ccs, global_xheight_);\n  }\n  Pixa *ccs;\n  Boxa *tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);\n  boxaDestroy(&tmp_boxa);\n  pix_for_ccs.destroy();\n\n  // Iterate over all connected components. Get their bounding boxes and clip\n  // out the image regions corresponding to these boxes from the original image.\n  // Conditionally run splitting on each of them.\n  Boxa *regions_to_clear = boxaCreate(0);\n  int num_ccs = 0;\n  if (ccs != nullptr) {\n    num_ccs = pixaGetCount(ccs);\n  }\n  for (int i = 0; i < num_ccs; ++i) {\n    Box *box = pixaGetBox(ccs, i, L_CLONE);\n    Image word_pix = pixClipRectangle(orig_pix_, box, nullptr);\n    ASSERT_HOST(word_pix);\n    int xheight = GetXheightForCC(box);\n    if (xheight == kUnspecifiedXheight && segmentation_block_list_ && devanagari_split_debugimage) {\n      pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);\n    }\n    // If some xheight measure is available, attempt to pre-eliminate small\n    // blobs from the shiro-rekha process. 
This is primarily to save the CCs\n    // corresponding to punctuation marks/small dots etc which are part of\n    // larger graphemes.\n    l_int32 x, y, w, h;\n    boxGetGeometry(box, &x, &y, &w, &h);\n    if (xheight == kUnspecifiedXheight || (w > xheight / 3 && h > xheight / 2)) {\n      SplitWordShiroRekha(split_strategy, word_pix, xheight, x, y, regions_to_clear);\n    } else if (devanagari_split_debuglevel > 0) {\n      tprintf(\"CC dropped from splitting: %d,%d (%d, %d)\\n\", x, y, w, h);\n    }\n    word_pix.destroy();\n    boxDestroy(&box);\n  }\n  // Actually clear the boxes now.\n  for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {\n    Box *box = boxaGetBox(regions_to_clear, i, L_CLONE);\n    pixClearInRect(splitted_image_, box);\n    boxDestroy(&box);\n  }\n  boxaDestroy(&regions_to_clear);\n  pixaDestroy(&ccs);\n  if (devanagari_split_debugimage && pixa_debug != nullptr) {\n    pixa_debug->AddPix(debug_image_, split_for_pageseg ? \"pageseg_split\" : \"ocr_split\");\n  }\n  return true;\n}\n\n// Method to perform a close operation on the input image. 
The xheight\n// estimate decides the size of sel used.\nvoid ShiroRekhaSplitter::PerformClose(Image pix, int xheight_estimate) {\n  pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);\n}\n\n// This method resolves the cc bbox to a particular row and returns the row's\n// xheight.\nint ShiroRekhaSplitter::GetXheightForCC(Box *cc_bbox) {\n  if (!segmentation_block_list_) {\n    return global_xheight_;\n  }\n  // Compute the box coordinates in Tesseract's coordinate system.\n  l_int32 x, y, w, h;\n  boxGetGeometry(cc_bbox, &x, &y, &w, &h);\n  TBOX bbox(x, pixGetHeight(orig_pix_) - y - h - 1,\n            x + w, pixGetHeight(orig_pix_) - y - 1);\n  // Iterate over all blocks.\n  BLOCK_IT block_it(segmentation_block_list_);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    BLOCK *block = block_it.data();\n    // Iterate over all rows in the block.\n    ROW_IT row_it(block->row_list());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      ROW *row = row_it.data();\n      if (!row->bounding_box().major_overlap(bbox)) {\n        continue;\n      }\n      // Row could be skewed, warped, etc. Use the position of the box to\n      // determine the baseline position of the row for that x-coordinate.\n      // Create a square TBOX whose baseline's mid-point lies at this point\n      // and side is row's xheight. Take the overlap of this box with the input\n      // box and check if it is a 'major overlap'. If so, this box lies in this\n      // row. In that case, return the xheight for this row.\n      float box_middle = 0.5 * (bbox.left() + bbox.right());\n      int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);\n      TBOX test_box(box_middle - row->x_height() / 2, baseline, box_middle + row->x_height() / 2,\n                    static_cast<int>(baseline + row->x_height()));\n      // Compute overlap. 
If it is a major overlap, this is the right row.\n      if (bbox.major_overlap(test_box)) {\n        return row->x_height();\n      }\n    }\n  }\n  // No row found for this bbox.\n  return kUnspecifiedXheight;\n}\n\n// Returns a list of regions (boxes) which should be cleared in the original\n// image so as to perform shiro-rekha splitting. Pix is assumed to carry one\n// (or less) word only. Xheight measure could be the global estimate, the row\n// estimate, or unspecified. If unspecified, over splitting may occur, since a\n// conservative estimate of stroke width along with an associated multiplier\n// is used in its place. It is advisable to have a specified xheight when\n// splitting for classification/training.\n// A vertical projection histogram of all the on-pixels in the input pix is\n// computed. The maxima of this histogram is regarded as an approximate location\n// of the shiro-rekha. By descending on the maxima's peak on both sides,\n// stroke width of shiro-rekha is estimated.\n// A horizontal projection histogram is computed for a sub-image of the input\n// image, which extends from just below the shiro-rekha down to a certain\n// leeway. 
The leeway depends on the input xheight, if provided, else a\n// conservative multiplier on approximate stroke width is used (which may lead\n// to over-splitting).\nvoid ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy, Image pix, int xheight,\n                                             int word_left, int word_top, Boxa *regions_to_clear) {\n  if (split_strategy == NO_SPLIT) {\n    return;\n  }\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  // Statistically determine the yextents of the shiro-rekha.\n  int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;\n  GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom, &shirorekha_ylevel);\n  // Since the shiro rekha is also a stroke, its width is equal to the stroke\n  // width.\n  int stroke_width = shirorekha_bottom - shirorekha_top + 1;\n\n  // Some safeguards to protect CCs we do not want to be split.\n  // These are particularly useful when the word wasn't eliminated earlier\n  // because xheight information was unavailable.\n  if (shirorekha_ylevel > height / 2) {\n    // Shirorekha shouldn't be in the bottom half of the word.\n    if (devanagari_split_debuglevel > 0) {\n      tprintf(\"Skipping splitting CC at (%d, %d): shirorekha in lower half..\\n\", word_left,\n              word_top);\n    }\n    return;\n  }\n  if (stroke_width > height / 3) {\n    // Even the boldest of fonts shouldn't do this.\n    if (devanagari_split_debuglevel > 0) {\n      tprintf(\"Skipping splitting CC at (%d, %d): stroke width too huge..\\n\", word_left, word_top);\n    }\n    return;\n  }\n\n  // Clear the ascender and descender regions of the word.\n  // Obtain a vertical projection histogram for the resulting image.\n  Box *box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3, width, 5 * stroke_width / 3);\n  Image word_in_xheight = pix.copy();\n  pixClearInRect(word_in_xheight, box_to_clear);\n  // Also clear any pixels which are below shirorekha_bottom + some 
leeway.\n  // The leeway is set to xheight if the information is available, else it is a\n  // multiplier applied to the stroke width.\n  int leeway_to_keep = stroke_width * 3;\n  if (xheight != kUnspecifiedXheight) {\n    // This is because the xheight-region typically includes the shiro-rekha\n    // inside it, i.e., the top of the xheight range corresponds to the top of\n    // shiro-rekha.\n    leeway_to_keep = xheight - stroke_width;\n  }\n  auto y = shirorekha_bottom + leeway_to_keep;\n  boxSetGeometry(box_to_clear, -1, y, -1, height - y);\n  pixClearInRect(word_in_xheight, box_to_clear);\n  boxDestroy(&box_to_clear);\n\n  PixelHistogram vert_hist;\n  vert_hist.ConstructVerticalCountHist(word_in_xheight);\n  word_in_xheight.destroy();\n\n  // If the number of black pixel in any column of the image is less than a\n  // fraction of the stroke width, treat it as noise / a stray mark. Perform\n  // these changes inside the vert_hist data itself, as that is used later on as\n  // a bit vector for the final split decision at every column.\n  for (int i = 0; i < width; ++i) {\n    if (vert_hist.hist()[i] <= stroke_width / 4) {\n      vert_hist.hist()[i] = 0;\n    } else {\n      vert_hist.hist()[i] = 1;\n    }\n  }\n  // In order to split the line at any point, we make sure that the width of the\n  // gap is at least half the stroke width.\n  int i = 0;\n  int cur_component_width = 0;\n  while (i < width) {\n    if (!vert_hist.hist()[i]) {\n      int j = 0;\n      while (i + j < width && !vert_hist.hist()[i + j]) {\n        ++j;\n      }\n      if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {\n        // Perform a shiro-rekha split. 
The intervening region lies from i to\n        // i+j-1.\n        // A minimal single-pixel split makes the estimation of intra- and\n        // inter-word spacing easier during page layout analysis,\n        // whereas a maximal split may be needed for OCR, depending on\n        // how the engine was trained.\n        bool minimal_split = (split_strategy == MINIMAL_SPLIT);\n        int split_width = minimal_split ? 1 : j;\n        int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;\n        if (!minimal_split || (i != 0 && i + j != width)) {\n          Box *box_to_clear =\n              boxCreate(word_left + split_left, word_top + shirorekha_top - stroke_width / 3,\n                        split_width, 5 * stroke_width / 3);\n          if (box_to_clear) {\n            boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);\n            // Mark this in the debug image if needed.\n            if (devanagari_split_debugimage) {\n              pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);\n            }\n            boxDestroy(&box_to_clear);\n            cur_component_width = 0;\n          }\n        }\n      }\n      i += j;\n    } else {\n      ++i;\n      ++cur_component_width;\n    }\n  }\n}\n\n// Refreshes the words in the segmentation block list by using blobs in the\n// input block list.\n// The segmentation block list must be set.\nvoid ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs) {\n  // The segmentation block list must have been specified.\n  ASSERT_HOST(segmentation_block_list_);\n  if (devanagari_split_debuglevel > 0) {\n    tprintf(\"Before refreshing blobs:\\n\");\n    PrintSegmentationStats(segmentation_block_list_);\n    tprintf(\"New Blobs found: %d\\n\", new_blobs->length());\n  }\n\n  C_BLOB_LIST not_found_blobs;\n  RefreshWordBlobsFromNewBlobs(\n      segmentation_block_list_, new_blobs,\n      ((devanagari_split_debugimage && debug_image_) ? 
&not_found_blobs : nullptr));\n\n  if (devanagari_split_debuglevel > 0) {\n    tprintf(\"After refreshing blobs:\\n\");\n    PrintSegmentationStats(segmentation_block_list_);\n  }\n  if (devanagari_split_debugimage && debug_image_) {\n    // Plot out the original blobs for which no match was found in the new\n    // all_blobs list.\n    C_BLOB_IT not_found_it(&not_found_blobs);\n    for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {\n      C_BLOB *not_found = not_found_it.data();\n      TBOX not_found_box = not_found->bounding_box();\n      Box *box_to_plot = GetBoxForTBOX(not_found_box);\n      pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);\n      boxDestroy(&box_to_plot);\n    }\n\n    // Plot out the blobs unused from all blobs.\n    C_BLOB_IT all_blobs_it(new_blobs);\n    for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {\n      C_BLOB *a_blob = all_blobs_it.data();\n      Box *box_to_plot = GetBoxForTBOX(a_blob->bounding_box());\n      pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);\n      boxDestroy(&box_to_plot);\n    }\n  }\n}\n\n// Returns a new box object for the corresponding TBOX, based on the original\n// image's coordinate system.\nBox *ShiroRekhaSplitter::GetBoxForTBOX(const TBOX &tbox) const {\n  return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1, tbox.width(),\n                   tbox.height());\n}\n\n// This method returns the computed mode-height of blobs in the pix.\n// It also prunes very small blobs from calculation.\nint ShiroRekhaSplitter::GetModeHeight(Image pix) {\n  Boxa *boxa = pixConnComp(pix, nullptr, 8);\n  STATS heights(0, pixGetHeight(pix) - 1);\n  heights.clear();\n  for (int i = 0; i < boxaGetCount(boxa); ++i) {\n    Box *box = boxaGetBox(boxa, i, L_CLONE);\n    l_int32 x, y, w, h;\n    boxGetGeometry(box, &x, &y, &w, &h);\n    if (h >= 3 || w >= 3) {\n      heights.add(h, 1);\n    }\n    boxDestroy(&box);\n  
}\n  boxaDestroy(&boxa);\n  return heights.mode();\n}\n\n// This method returns y-extents of the shiro-rekha computed from the input\n// word image.\nvoid ShiroRekhaSplitter::GetShiroRekhaYExtents(Image word_pix, int *shirorekha_top,\n                                               int *shirorekha_bottom, int *shirorekha_ylevel) {\n  // Compute a histogram from projecting the word on a vertical line.\n  PixelHistogram hist_horiz;\n  hist_horiz.ConstructHorizontalCountHist(word_pix);\n  // Get the ylevel where the top-line exists. This is basically the global\n  // maxima in the horizontal histogram.\n  int topline_onpixel_count = 0;\n  int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);\n\n  // Get the upper and lower extents of the shiro rekha.\n  int thresh = (topline_onpixel_count * 70) / 100;\n  int ulimit = topline_ylevel;\n  int llimit = topline_ylevel;\n  while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh) {\n    --ulimit;\n  }\n  while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh) {\n    ++llimit;\n  }\n\n  if (shirorekha_top) {\n    *shirorekha_top = ulimit;\n  }\n  if (shirorekha_bottom) {\n    *shirorekha_bottom = llimit;\n  }\n  if (shirorekha_ylevel) {\n    *shirorekha_ylevel = topline_ylevel;\n  }\n}\n\n// This method returns the global-maxima for the histogram. 
The frequency of\n// the global maxima is returned in count, if specified.\nint PixelHistogram::GetHistogramMaximum(int *count) const {\n  int best_value = 0;\n  for (int i = 0; i < length_; ++i) {\n    if (hist_[i] > hist_[best_value]) {\n      best_value = i;\n    }\n  }\n  if (count) {\n    *count = hist_[best_value];\n  }\n  return best_value;\n}\n\n// Methods to construct histograms from images.\nvoid PixelHistogram::ConstructVerticalCountHist(Image pix) {\n  Clear();\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  hist_ = new int[width];\n  length_ = width;\n  int wpl = pixGetWpl(pix);\n  l_uint32 *data = pixGetData(pix);\n  for (int i = 0; i < width; ++i) {\n    hist_[i] = 0;\n  }\n  for (int i = 0; i < height; ++i) {\n    l_uint32 *line = data + i * wpl;\n    for (int j = 0; j < width; ++j) {\n      if (GET_DATA_BIT(line, j)) {\n        ++(hist_[j]);\n      }\n    }\n  }\n}\n\nvoid PixelHistogram::ConstructHorizontalCountHist(Image pix) {\n  Clear();\n  Numa *counts = pixCountPixelsByRow(pix, nullptr);\n  length_ = numaGetCount(counts);\n  hist_ = new int[length_];\n  for (int i = 0; i < length_; ++i) {\n    l_int32 val = 0;\n    numaGetIValue(counts, i, &val);\n    hist_[i] = val;\n  }\n  numaDestroy(&counts);\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/devanagari_processing.h",
    "content": "// Copyright 2008 Google Inc. All Rights Reserved.\n// Author: shobhitsaxena@google.com (Shobhit Saxena)\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_TEXTORD_DEVNAGARI_PROCESSING_H_\n#define TESSERACT_TEXTORD_DEVNAGARI_PROCESSING_H_\n\n#include <allheaders.h>\n#include \"ocrblock.h\"\n#include \"params.h\"\n\nstruct Pix;\nstruct Box;\nstruct Boxa;\n\nnamespace tesseract {\n\nextern INT_VAR_H(devanagari_split_debuglevel);\n\nextern BOOL_VAR_H(devanagari_split_debugimage);\n\nclass TBOX;\nclass DebugPixa;\n\nclass PixelHistogram {\npublic:\n  PixelHistogram() {\n    hist_ = nullptr;\n    length_ = 0;\n  }\n\n  ~PixelHistogram() {\n    Clear();\n  }\n\n  void Clear() {\n    delete[] hist_;\n    length_ = 0;\n  }\n\n  int *hist() const {\n    return hist_;\n  }\n\n  int length() const {\n    return length_;\n  }\n\n  // Methods to construct histograms from images. These clear any existing data.\n  void ConstructVerticalCountHist(Image pix);\n  void ConstructHorizontalCountHist(Image pix);\n\n  // This method returns the global-maxima for the histogram. 
The frequency of\n  // the global maxima is returned in count, if specified.\n  int GetHistogramMaximum(int *count) const;\n\nprivate:\n  int *hist_;\n  int length_;\n};\n\nclass ShiroRekhaSplitter {\npublic:\n  enum SplitStrategy {\n    NO_SPLIT = 0,  // No splitting is performed for the phase.\n    MINIMAL_SPLIT, // Blobs are split minimally.\n    MAXIMAL_SPLIT  // Blobs are split maximally.\n  };\n\n  ShiroRekhaSplitter();\n  virtual ~ShiroRekhaSplitter();\n\n  // Top-level method to perform splitting based on current settings.\n  // Returns true if a split was actually performed.\n  // If split_for_pageseg is true, the pageseg_split_strategy_ is used for\n  // splitting. If false, the ocr_split_strategy_ is used.\n  bool Split(bool split_for_pageseg, DebugPixa *pixa_debug);\n\n  // Clears the memory held by this object.\n  void Clear();\n\n  // Refreshes the words in the segmentation block list by using blobs in the\n  // input blob list.\n  // The segmentation block list must be set.\n  void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs);\n\n  // Returns true if the split strategies for pageseg and ocr are different.\n  bool HasDifferentSplitStrategies() const {\n    return pageseg_split_strategy_ != ocr_split_strategy_;\n  }\n\n  // This only keeps a copy of the block list pointer. At split call, the list\n  // object should still be alive. This block list is used as a golden\n  // segmentation when performing splitting.\n  void set_segmentation_block_list(BLOCK_LIST *block_list) {\n    segmentation_block_list_ = block_list;\n  }\n\n  static const int kUnspecifiedXheight = -1;\n\n  void set_global_xheight(int xheight) {\n    global_xheight_ = xheight;\n  }\n\n  void set_perform_close(bool perform) {\n    perform_close_ = perform;\n  }\n\n  // Returns the image obtained from shiro-rekha splitting. The returned object\n  // is owned by this class. 
Callers may want to clone the returned pix to keep\n  // it alive beyond the life of ShiroRekhaSplitter object.\n  Image splitted_image() {\n    return splitted_image_;\n  }\n\n  // On setting the input image, a clone of it is owned by this class.\n  void set_orig_pix(Image pix);\n\n  // Returns the input image provided to the object. This object is owned by\n  // this class. Callers may want to clone the returned pix to work with it.\n  Image orig_pix() {\n    return orig_pix_;\n  }\n\n  SplitStrategy ocr_split_strategy() const {\n    return ocr_split_strategy_;\n  }\n\n  void set_ocr_split_strategy(SplitStrategy strategy) {\n    ocr_split_strategy_ = strategy;\n  }\n\n  SplitStrategy pageseg_split_strategy() const {\n    return pageseg_split_strategy_;\n  }\n\n  void set_pageseg_split_strategy(SplitStrategy strategy) {\n    pageseg_split_strategy_ = strategy;\n  }\n\n  BLOCK_LIST *segmentation_block_list() {\n    return segmentation_block_list_;\n  }\n\n  // This method returns the computed mode-height of blobs in the pix.\n  // It also prunes very small blobs from calculation. Could be used to provide\n  // a global xheight estimate for images which have the same point-size text.\n  static int GetModeHeight(Image pix);\n\nprivate:\n  // Method to perform a close operation on the input image. The xheight\n  // estimate decides the size of sel used.\n  static void PerformClose(Image pix, int xheight_estimate);\n\n  // This method resolves the cc bbox to a particular row and returns the row's\n  // xheight. This uses block_list_ if available, else just returns the\n  // global_xheight_ estimate currently set in the object.\n  int GetXheightForCC(Box *cc_bbox);\n\n  // Returns a list of regions (boxes) which should be cleared in the original\n  // image so as to perform shiro-rekha splitting. Pix is assumed to carry one\n  // (or less) word only. Xheight measure could be the global estimate, the row\n  // estimate, or unspecified. 
If unspecified, over splitting may occur, since a\n  // conservative estimate of stroke width along with an associated multiplier\n  // is used in its place. It is advisable to have a specified xheight when\n  // splitting for classification/training.\n  void SplitWordShiroRekha(SplitStrategy split_strategy, Image pix, int xheight, int word_left,\n                           int word_top, Boxa *regions_to_clear);\n\n  // Returns a new box object for the corresponding TBOX, based on the original\n  // image's coordinate system.\n  Box *GetBoxForTBOX(const TBOX &tbox) const;\n\n  // This method returns y-extents of the shiro-rekha computed from the input\n  // word image.\n  static void GetShiroRekhaYExtents(Image word_pix, int *shirorekha_top, int *shirorekha_bottom,\n                                    int *shirorekha_ylevel);\n\n  Image orig_pix_;       // Just a clone of the input image passed.\n  Image splitted_image_; // Image produced after the last splitting round. The\n                        // object is owned by this class.\n  SplitStrategy pageseg_split_strategy_;\n  SplitStrategy ocr_split_strategy_;\n  Image debug_image_;\n  // This block list is used as a golden segmentation when performing splitting.\n  BLOCK_LIST *segmentation_block_list_;\n  int global_xheight_;\n  bool perform_close_; // Whether a morphological close operation should be\n                       // performed before CCs are run through splitting.\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_DEVNAGARI_PROCESSING_H_\n"
  },
  {
    "path": "src/textord/drawtord.cpp",
    "content": "/**********************************************************************\n * File:        drawtord.cpp  (Formerly drawto.c)\n * Description: Draw things to do with textord.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"drawtord.h\"\n\n#include \"pithsync.h\"\n#include \"topitch.h\"\n\nnamespace tesseract {\n\n#define TO_WIN_XPOS 0 // default window pos\n#define TO_WIN_YPOS 0\n#define TO_WIN_NAME \"Textord\"\n// title of window\n\nBOOL_VAR(textord_show_fixed_cuts, false, \"Draw fixed pitch cell boundaries\");\n\nScrollView *to_win = nullptr;\n\n#ifndef GRAPHICS_DISABLED\n\n/**********************************************************************\n * create_to_win\n *\n * Create the to window used to show the fit.\n **********************************************************************/\n\nScrollView *create_to_win(ICOORD page_tr) {\n  if (to_win != nullptr) {\n    return to_win;\n  }\n  to_win = new ScrollView(TO_WIN_NAME, TO_WIN_XPOS, TO_WIN_YPOS, page_tr.x() + 1, page_tr.y() + 1,\n                          page_tr.x(), page_tr.y(), true);\n  return to_win;\n}\n\nvoid close_to_win() {\n  // to_win is leaked, but this enables the user to view the contents.\n  if (to_win != nullptr) {\n    to_win->Update();\n  
}\n}\n\n/**********************************************************************\n * plot_box_list\n *\n * Draw a list of blobs.\n **********************************************************************/\n\nvoid plot_box_list(               // make gradients win\n    ScrollView *win,              // window to draw in\n    BLOBNBOX_LIST *list,          // blob list\n    ScrollView::Color body_colour // colour to draw\n) {\n  BLOBNBOX_IT it = list; // iterator\n\n  win->Pen(body_colour);\n  win->Brush(ScrollView::NONE);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->bounding_box().plot(win);\n  }\n}\n\n/**********************************************************************\n * plot_to_row\n *\n * Draw the blobs of a row in a given colour and draw the line fit.\n **********************************************************************/\n\nvoid plot_to_row(             // draw a row\n    TO_ROW *row,              // row to draw\n    ScrollView::Color colour, // colour to draw in\n    FCOORD rotation           // rotation for line\n) {\n  FCOORD plot_pt; // point to plot\n                  // blobs\n  BLOBNBOX_IT it = row->blob_list();\n  float left, right; // end of row\n\n  if (it.empty()) {\n    tprintf(\"No blobs in row at %g\\n\", row->parallel_c());\n    return;\n  }\n  left = it.data()->bounding_box().left();\n  it.move_to_last();\n  right = it.data()->bounding_box().right();\n  plot_blob_list(to_win, row->blob_list(), colour, ScrollView::BROWN);\n  to_win->Pen(colour);\n  plot_pt = FCOORD(left, row->line_m() * left + row->line_c());\n  plot_pt.rotate(rotation);\n  to_win->SetCursor(plot_pt.x(), plot_pt.y());\n  plot_pt = FCOORD(right, row->line_m() * right + row->line_c());\n  plot_pt.rotate(rotation);\n  to_win->DrawTo(plot_pt.x(), plot_pt.y());\n}\n\n/**********************************************************************\n * plot_parallel_row\n *\n * Draw the blobs of a row in a given colour and draw the line fit.\n 
**********************************************************************/\n\nvoid plot_parallel_row(       // draw a row\n    TO_ROW *row,              // row to draw\n    float gradient,           // gradients of lines\n    int32_t left,             // edge of block\n    ScrollView::Color colour, // colour to draw in\n    FCOORD rotation           // rotation for line\n) {\n  FCOORD plot_pt; // point to plot\n                  // blobs\n  BLOBNBOX_IT it = row->blob_list();\n  auto fleft = static_cast<float>(left); // floating version\n  float right;                           // end of row\n\n  //      left=it.data()->bounding_box().left();\n  it.move_to_last();\n  right = it.data()->bounding_box().right();\n  plot_blob_list(to_win, row->blob_list(), colour, ScrollView::BROWN);\n  to_win->Pen(colour);\n  plot_pt = FCOORD(fleft, gradient * left + row->max_y());\n  plot_pt.rotate(rotation);\n  to_win->SetCursor(plot_pt.x(), plot_pt.y());\n  plot_pt = FCOORD(fleft, gradient * left + row->min_y());\n  plot_pt.rotate(rotation);\n  to_win->DrawTo(plot_pt.x(), plot_pt.y());\n  plot_pt = FCOORD(fleft, gradient * left + row->parallel_c());\n  plot_pt.rotate(rotation);\n  to_win->SetCursor(plot_pt.x(), plot_pt.y());\n  plot_pt = FCOORD(right, gradient * right + row->parallel_c());\n  plot_pt.rotate(rotation);\n  to_win->DrawTo(plot_pt.x(), plot_pt.y());\n}\n\n/**********************************************************************\n * draw_occupation\n *\n * Draw the row occupation with points above the threshold in white\n * and points below the threshold in black.\n **********************************************************************/\n\nvoid draw_occupation(                    // draw projection\n    int32_t xleft,                       // edge of block\n    int32_t ybottom,                     // bottom of block\n    int32_t min_y,                       // coordinate limits\n    int32_t max_y, int32_t occupation[], // projection counts\n    int32_t thresholds[]            
     // for drop out\n) {\n  int32_t line_index;                     // pixel coord\n  ScrollView::Color colour;               // of histogram\n  auto fleft = static_cast<float>(xleft); // float version\n\n  colour = ScrollView::WHITE;\n  to_win->Pen(colour);\n  to_win->SetCursor(fleft, static_cast<float>(ybottom));\n  for (line_index = min_y; line_index <= max_y; line_index++) {\n    if (occupation[line_index - min_y] < thresholds[line_index - min_y]) {\n      if (colour != ScrollView::BLUE) {\n        colour = ScrollView::BLUE;\n        to_win->Pen(colour);\n      }\n    } else {\n      if (colour != ScrollView::WHITE) {\n        colour = ScrollView::WHITE;\n        to_win->Pen(colour);\n      }\n    }\n    to_win->DrawTo(fleft + occupation[line_index - min_y] / 10.0, static_cast<float>(line_index));\n  }\n  colour = ScrollView::STEEL_BLUE;\n  to_win->Pen(colour);\n  to_win->SetCursor(fleft, static_cast<float>(ybottom));\n  for (line_index = min_y; line_index <= max_y; line_index++) {\n    to_win->DrawTo(fleft + thresholds[line_index - min_y] / 10.0, static_cast<float>(line_index));\n  }\n}\n\n/**********************************************************************\n * draw_meanlines\n *\n * Draw the meanlines of the given block in the given colour.\n **********************************************************************/\n\nvoid draw_meanlines(          // draw a block\n    TO_BLOCK *block,          // block to draw\n    float gradient,           // gradients of lines\n    int32_t left,             // edge of block\n    ScrollView::Color colour, // colour to draw in\n    FCOORD rotation           // rotation for line\n) {\n  FCOORD plot_pt; // point to plot\n                  // rows\n  TO_ROW_IT row_it = block->get_rows();\n  TO_ROW *row;         // current row\n  BLOBNBOX_IT blob_it; // blobs\n  float right;         // end of row\n  to_win->Pen(colour);\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    
blob_it.set_to_list(row->blob_list());\n    blob_it.move_to_last();\n    right = blob_it.data()->bounding_box().right();\n    plot_pt = FCOORD(static_cast<float>(left), gradient * left + row->parallel_c() + row->xheight);\n    plot_pt.rotate(rotation);\n    to_win->SetCursor(plot_pt.x(), plot_pt.y());\n    plot_pt = FCOORD(right, gradient * right + row->parallel_c() + row->xheight);\n    plot_pt.rotate(rotation);\n    to_win->DrawTo(plot_pt.x(), plot_pt.y());\n  }\n}\n\n/**********************************************************************\n * plot_word_decisions\n *\n * Plot a row with words in different colours and fuzzy spaces\n * highlighted.\n **********************************************************************/\n\nvoid plot_word_decisions( // draw words\n    ScrollView *win,      // window tro draw in\n    int16_t pitch,        // of block\n    TO_ROW *row           // row to draw\n) {\n  ScrollView::Color colour = ScrollView::MAGENTA; // current colour\n  ScrollView::Color rect_colour;                  // fuzzy colour\n  int32_t prev_x;                                 // end of prev blob\n  int16_t blob_count;                             // blobs in word\n  BLOBNBOX *blob;                                 // current blob\n  TBOX blob_box;                                  // bounding box\n                                                  // iterator\n  BLOBNBOX_IT blob_it = row->blob_list();\n  BLOBNBOX_IT start_it = blob_it; // word start\n\n  rect_colour = ScrollView::BLACK;\n  prev_x = -INT16_MAX;\n  blob_count = 0;\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    blob = blob_it.data();\n    blob_box = blob->bounding_box();\n    if (!blob->joined_to_prev() && blob_box.left() - prev_x > row->max_nonspace) {\n      if ((blob_box.left() - prev_x >= row->min_space ||\n           blob_box.left() - prev_x > row->space_threshold) &&\n          blob_count > 0) {\n        if (pitch > 0 && textord_show_fixed_cuts) {\n          
plot_fp_cells(win, colour, &start_it, pitch, blob_count, &row->projection,\n                        row->projection_left, row->projection_right,\n                        row->xheight * textord_projection_scale);\n        }\n        blob_count = 0;\n        start_it = blob_it;\n      }\n      if (colour == ScrollView::MAGENTA) {\n        colour = ScrollView::RED;\n      } else {\n        colour = static_cast<ScrollView::Color>(colour + 1);\n      }\n      if (blob_box.left() - prev_x < row->min_space) {\n        if (blob_box.left() - prev_x > row->space_threshold) {\n          rect_colour = ScrollView::GOLDENROD;\n        } else {\n          rect_colour = ScrollView::CORAL;\n        }\n        // fill_color_index(win, rect_colour);\n        win->Brush(rect_colour);\n        win->Rectangle(prev_x, blob_box.bottom(), blob_box.left(), blob_box.top());\n      }\n    }\n    if (!blob->joined_to_prev()) {\n      prev_x = blob_box.right();\n    }\n    if (blob->cblob() != nullptr) {\n      blob->cblob()->plot(win, colour, colour);\n    }\n    if (!blob->joined_to_prev() && blob->cblob() != nullptr) {\n      blob_count++;\n    }\n  }\n  if (pitch > 0 && textord_show_fixed_cuts && blob_count > 0) {\n    plot_fp_cells(win, colour, &start_it, pitch, blob_count, &row->projection, row->projection_left,\n                  row->projection_right, row->xheight * textord_projection_scale);\n  }\n}\n\n/**********************************************************************\n * plot_fp_cells\n *\n * Make a list of fixed pitch cuts and draw them.\n **********************************************************************/\n\nvoid plot_fp_cells(           // draw words\n    ScrollView *win,          // window tro draw in\n    ScrollView::Color colour, // colour of lines\n    BLOBNBOX_IT *blob_it,     // blobs\n    int16_t pitch,            // of block\n    int16_t blob_count,       // no of real blobs\n    STATS *projection,        // vertical\n    int16_t projection_left,  // edges //scale 
factor\n    int16_t projection_right, float projection_scale) {\n  int16_t occupation;    // occupied cells\n  TBOX word_box;         // bounding box\n  FPSEGPT_LIST seg_list; // list of cuts\n  FPSEGPT_IT seg_it;\n  FPSEGPT *segpt; // current point\n\n  if (pitsync_linear_version) {\n    check_pitch_sync2(blob_it, blob_count, pitch, 2, projection, projection_left, projection_right,\n                      projection_scale, occupation, &seg_list, 0, 0);\n  } else {\n    check_pitch_sync(blob_it, blob_count, pitch, 2, projection, &seg_list);\n  }\n  word_box = blob_it->data()->bounding_box();\n  for (; blob_count > 0; blob_count--) {\n    word_box += box_next(blob_it);\n  }\n  seg_it.set_to_list(&seg_list);\n  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n    segpt = seg_it.data();\n    if (segpt->faked) {\n      colour = ScrollView::WHITE;\n      win->Pen(colour);\n    } else {\n      win->Pen(colour);\n    }\n    win->Line(segpt->position(), word_box.bottom(), segpt->position(), word_box.top());\n  }\n}\n\n/**********************************************************************\n * plot_fp_cells2\n *\n * Make a list of fixed pitch cuts and draw them.\n **********************************************************************/\n\nvoid plot_fp_cells2(          // draw words\n    ScrollView *win,          // window tro draw in\n    ScrollView::Color colour, // colour of lines\n    TO_ROW *row,              // for location\n    FPSEGPT_LIST *seg_list    // segments to plot\n) {\n  TBOX word_box; // bounding box\n  FPSEGPT_IT seg_it = seg_list;\n  // blobs in row\n  BLOBNBOX_IT blob_it = row->blob_list();\n  FPSEGPT *segpt; // current point\n\n  word_box = blob_it.data()->bounding_box();\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();) {\n    word_box += box_next(&blob_it);\n  }\n  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n    segpt = seg_it.data();\n    if (segpt->faked) {\n      colour = 
ScrollView::WHITE;\n      win->Pen(colour);\n    } else {\n      win->Pen(colour);\n    }\n    win->Line(segpt->position(), word_box.bottom(), segpt->position(), word_box.top());\n  }\n}\n\n/**********************************************************************\n * plot_row_cells\n *\n * Make a list of fixed pitch cuts and draw them.\n **********************************************************************/\n\nvoid plot_row_cells(          // draw words\n    ScrollView *win,          // window tro draw in\n    ScrollView::Color colour, // colour of lines\n    TO_ROW *row,              // for location\n    float xshift,             // amount of shift\n    ICOORDELT_LIST *cells     // cells to draw\n) {\n  TBOX word_box; // bounding box\n  ICOORDELT_IT cell_it = cells;\n  // blobs in row\n  BLOBNBOX_IT blob_it = row->blob_list();\n  ICOORDELT *cell; // current cell\n\n  word_box = blob_it.data()->bounding_box();\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();) {\n    word_box += box_next(&blob_it);\n  }\n  win->Pen(colour);\n  for (cell_it.mark_cycle_pt(); !cell_it.cycled_list(); cell_it.forward()) {\n    cell = cell_it.data();\n    win->Line(cell->x() + xshift, word_box.bottom(), cell->x() + xshift, word_box.top());\n  }\n}\n\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/drawtord.h",
    "content": "/**********************************************************************\n * File:        drawtord.h  (Formerly drawto.h)\n * Description: Draw things to do with textord.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef DRAWTORD_H\n#define DRAWTORD_H\n\n#include \"blobbox.h\"\n#include \"params.h\"\n#include \"pitsync1.h\"\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n#define NO_SMD \"none\"\n\nextern BOOL_VAR_H(textord_show_fixed_cuts);\nextern ScrollView *to_win;\nextern FILE *to_debug;\n// Creates a static display window for textord, and returns a pointer to it.\nScrollView *create_to_win(ICOORD page_tr);\nvoid close_to_win();              // Destroy the textord window.\nvoid create_todebug_win();        // make gradients win\nvoid plot_box_list(               // make gradients win\n    ScrollView *win,              // window to draw in\n    BLOBNBOX_LIST *list,          // blob list\n    ScrollView::Color body_colour // colour to draw\n);\nvoid plot_to_row(             // draw a row\n    TO_ROW *row,              // row to draw\n    ScrollView::Color colour, // colour to draw in\n    FCOORD rotation           // rotation for line\n);\nvoid plot_parallel_row(       // draw a row\n    TO_ROW *row,              // row to draw\n    float gradient,           // gradients 
of lines\n    int32_t left,             // edge of block\n    ScrollView::Color colour, // colour to draw in\n    FCOORD rotation           // rotation for line\n);\nvoid draw_occupation(                    // draw projection\n    int32_t xleft,                       // edge of block\n    int32_t ybottom,                     // bottom of block\n    int32_t min_y,                       // coordinate limits\n    int32_t max_y, int32_t occupation[], // projection counts\n    int32_t thresholds[]                 // for drop out\n);\nvoid draw_meanlines(          // draw a block\n    TO_BLOCK *block,          // block to draw\n    float gradient,           // gradients of lines\n    int32_t left,             // edge of block\n    ScrollView::Color colour, // colour to draw in\n    FCOORD rotation           // rotation for line\n);\nvoid plot_word_decisions( // draw words\n    ScrollView *win,      // window tro draw in\n    int16_t pitch,        // of block\n    TO_ROW *row           // row to draw\n);\nvoid plot_fp_cells(           // draw words\n    ScrollView *win,          // window tro draw in\n    ScrollView::Color colour, // colour of lines\n    BLOBNBOX_IT *blob_it,     // blobs\n    int16_t pitch,            // of block\n    int16_t blob_count,       // no of real blobs\n    STATS *projection,        // vertical\n    int16_t projection_left,  // edges //scale factor\n    int16_t projection_right, float projection_scale);\nvoid plot_fp_cells2(          // draw words\n    ScrollView *win,          // window tro draw in\n    ScrollView::Color colour, // colour of lines\n    TO_ROW *row,              // for location\n    FPSEGPT_LIST *seg_list    // segments to plot\n);\nvoid plot_row_cells(          // draw words\n    ScrollView *win,          // window tro draw in\n    ScrollView::Color colour, // colour of lines\n    TO_ROW *row,              // for location\n    float xshift,             // amount of shift\n    ICOORDELT_LIST *cells     // cells to 
draw\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/edgblob.cpp",
    "content": "/**********************************************************************\n * File:        edgblob.cpp (Formerly edgeloop.c)\n * Description: Functions to clean up an outline before approximation.\n * Author:      Ray Smith\n *\n *(C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0(the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"edgblob.h\"\n\n#include \"edgloop.h\"\n#include \"scanedg.h\"\n\n#define BUCKETSIZE 16\n\nnamespace tesseract {\n\n// Control parameters used in outline_complexity(), which rejects an outline\n// if any one of the 3 conditions is satisfied:\n//  - number of children exceeds edges_max_children_per_outline\n//  - number of nested layers exceeds edges_max_children_layers\n//  - joint complexity exceeds edges_children_count_limit(as in child_count())\nstatic BOOL_VAR(edges_use_new_outline_complexity, false,\n                \"Use the new outline complexity module\");\nstatic INT_VAR(edges_max_children_per_outline, 10,\n               \"Max number of children inside a character outline\");\nstatic INT_VAR(edges_max_children_layers, 5,\n               \"Max layers of nested children inside a character outline\");\nstatic BOOL_VAR(edges_debug, false, \"turn on debugging for this module\");\n\nstatic 
INT_VAR(edges_children_per_grandchild, 10,\n               \"Importance ratio for chucking outlines\");\nstatic INT_VAR(edges_children_count_limit, 45, \"Max holes allowed in blob\");\nstatic BOOL_VAR(edges_children_fix, false,\n                \"Remove boxy parents of char-like children\");\nstatic INT_VAR(edges_min_nonhole, 12, \"Min pixels for potential char in box\");\nstatic INT_VAR(edges_patharea_ratio, 40,\n               \"Max lensq/area for acceptable child outline\");\nstatic double_VAR(edges_childarea, 0.5, \"Min area fraction of child outline\");\nstatic double_VAR(edges_boxarea, 0.875,\n                  \"Min area fraction of grandchild for box\");\n\n/**\n * @name OL_BUCKETS::OL_BUCKETS\n *\n * Construct an array of buckets for associating outlines into blobs.\n */\n\nOL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners\n                       ICOORD tright)\n    : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1),\n      bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1),\n      buckets(bxdim * bydim),\n      bl(bleft),\n      tr(tright) {}\n\n/**\n * @name OL_BUCKETS::operator(\n *\n * Return a pointer to a list of C_OUTLINEs corresponding to the\n * given pixel coordinates.\n */\n\nC_OUTLINE_LIST *OL_BUCKETS::operator()( // array access\n    TDimension x,                       // image coords\n    TDimension y) {\n  return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim +\n                  (x - bl.x()) / BUCKETSIZE];\n}\n\nC_OUTLINE_LIST *OL_BUCKETS::start_scan() {\n  return scan_next(buckets.begin());\n}\n\nC_OUTLINE_LIST *OL_BUCKETS::scan_next() {\n  return scan_next(it);\n}\n\nC_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) {\n  it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); });\n  if (it == buckets.end())\n    return nullptr;\n  return &*it;\n}\n\n/**\n * @name OL_BUCKETS::outline_complexity\n *\n * This is the new version of count_child.\n *\n * The goal of this function is to determine if an outline 
and its\n * interiors could be part of a character blob.  This is done by\n * computing a \"complexity\" index for the outline, which is the return\n * value of this function, and checking it against a threshold.\n * The max_count is used for short-circuiting the recursion and forcing\n * a rejection that guarantees to fail the threshold test.\n * The complexity F for outline X with N children X[i] is\n *   F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild\n * so each layer of nesting increases complexity exponentially.\n * An outline can be rejected as a text blob candidate if its complexity\n * is too high, has too many children(likely a container), or has too\n * many layers of nested inner loops.  This has the side-effect of\n * flattening out boxed or reversed video text regions.\n */\n\nint32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline\n                                       int32_t max_count,  // max output\n                                       int16_t depth       // recursion depth\n) {\n  TDimension xmin, xmax;    // coord limits\n  TDimension ymin, ymax;\n  C_OUTLINE *child;         // current child\n  int32_t child_count;      // no of children\n  int32_t grandchild_count; // no of grandchildren\n  C_OUTLINE_IT child_it;    // search iterator\n\n  TBOX olbox = outline->bounding_box();\n  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;\n  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;\n  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;\n  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;\n  child_count = 0;\n  grandchild_count = 0;\n  if (++depth > edges_max_children_layers) { // nested loops are too deep\n    return max_count + depth;\n  }\n\n  for (auto yindex = ymin; yindex <= ymax; yindex++) {\n    for (auto xindex = xmin; xindex <= xmax; xindex++) {\n      child_it.set_to_list(&buckets[yindex * bxdim + xindex]);\n      if (child_it.empty()) {\n        continue;\n      }\n      for (child_it.mark_cycle_pt(); 
!child_it.cycled_list();\n           child_it.forward()) {\n        child = child_it.data();\n        if (child == outline || !(*child < *outline)) {\n          continue;\n        }\n        child_count++;\n\n        if (child_count > edges_max_children_per_outline) { // too fragmented\n          if (edges_debug) {\n            tprintf(\n                \"Discard outline on child_count=%d > \"\n                \"max_children_per_outline=%d\\n\",\n                child_count,\n                static_cast<int32_t>(edges_max_children_per_outline));\n          }\n          return max_count + child_count;\n        }\n\n        // Compute the \"complexity\" of each child recursively\n        int32_t remaining_count = max_count - child_count - grandchild_count;\n        if (remaining_count > 0) {\n          grandchild_count += edges_children_per_grandchild *\n                              outline_complexity(child, remaining_count, depth);\n        }\n        if (child_count + grandchild_count > max_count) { // too complex\n          if (edges_debug) {\n            tprintf(\n                \"Discard outline on child_count=%d + grandchild_count=%d \"\n                \"> max_count=%d\\n\",\n                child_count, grandchild_count, max_count);\n          }\n          return child_count + grandchild_count;\n        }\n      }\n    }\n  }\n  return child_count + grandchild_count;\n}\n\n/**\n * @name OL_BUCKETS::count_children\n *\n * Find number of descendants of this outline.\n */\n// TODO(rays) Merge with outline_complexity.\nint32_t OL_BUCKETS::count_children( // recursive count\n    C_OUTLINE *outline,             // parent outline\n    int32_t max_count               // max output\n) {\n  bool parent_box;    // could it be boxy\n  TDimension xmin, xmax;    // coord limits\n  TDimension ymin, ymax;\n  C_OUTLINE *child;         // current child\n  int32_t child_count;      // no of children\n  int32_t grandchild_count; // no of grandchildren\n  int32_t parent_area;   
   // potential box\n  float max_parent_area;    // potential box\n  int32_t child_area;       // current child\n  int32_t child_length;     // current child\n  TBOX olbox;\n  C_OUTLINE_IT child_it; // search iterator\n\n  olbox = outline->bounding_box();\n  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;\n  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;\n  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;\n  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;\n  child_count = 0;\n  grandchild_count = 0;\n  parent_area = 0;\n  max_parent_area = 0;\n  parent_box = true;\n  for (auto yindex = ymin; yindex <= ymax; yindex++) {\n    for (auto xindex = xmin; xindex <= xmax; xindex++) {\n      child_it.set_to_list(&buckets[yindex * bxdim + xindex]);\n      if (child_it.empty()) {\n        continue;\n      }\n      for (child_it.mark_cycle_pt(); !child_it.cycled_list();\n           child_it.forward()) {\n        child = child_it.data();\n        if (child != outline && *child < *outline) {\n          child_count++;\n          if (child_count <= max_count) {\n            int max_grand =\n                (max_count - child_count) / edges_children_per_grandchild;\n            if (max_grand > 0) {\n              grandchild_count += count_children(child, max_grand) *\n                                  edges_children_per_grandchild;\n            } else {\n              grandchild_count += count_children(child, 1);\n            }\n          }\n          if (child_count + grandchild_count > max_count) {\n            if (edges_debug) {\n              tprintf(\"Discarding parent with child count=%d, gc=%d\\n\",\n                      child_count, grandchild_count);\n            }\n            return child_count + grandchild_count;\n          }\n          if (parent_area == 0) {\n            parent_area = outline->outer_area();\n            if (parent_area < 0) {\n              parent_area = -parent_area;\n            }\n            max_parent_area = outline->bounding_box().area() * 
edges_boxarea;\n            if (parent_area < max_parent_area) {\n              parent_box = false;\n            }\n          }\n          if (parent_box &&\n              (!edges_children_fix ||\n               child->bounding_box().height() > edges_min_nonhole)) {\n            child_area = child->outer_area();\n            if (child_area < 0) {\n              child_area = -child_area;\n            }\n            if (edges_children_fix) {\n              if (parent_area - child_area < max_parent_area) {\n                parent_box = false;\n                continue;\n              }\n              if (grandchild_count > 0) {\n                if (edges_debug) {\n                  tprintf(\n                      \"Discarding parent of area %d, child area=%d, max%g \"\n                      \"with gc=%d\\n\",\n                      parent_area, child_area, max_parent_area,\n                      grandchild_count);\n                }\n                return max_count + 1;\n              }\n              child_length = child->pathlength();\n              if (child_length * child_length >\n                  child_area * edges_patharea_ratio) {\n                if (edges_debug) {\n                  tprintf(\n                      \"Discarding parent of area %d, child area=%d, max%g \"\n                      \"with child length=%d\\n\",\n                      parent_area, child_area, max_parent_area, child_length);\n                }\n                return max_count + 1;\n              }\n            }\n            if (child_area < child->bounding_box().area() * edges_childarea) {\n              if (edges_debug) {\n                tprintf(\n                    \"Discarding parent of area %d, child area=%d, max%g \"\n                    \"with child rect=%d\\n\",\n                    parent_area, child_area, max_parent_area,\n                    child->bounding_box().area());\n              }\n              return max_count + 1;\n            }\n          }\n        }\n     
 }\n    }\n  }\n  return child_count + grandchild_count;\n}\n\n/**\n * @name OL_BUCKETS::extract_children\n *\n * Find number of descendants of this outline.\n */\n\nvoid OL_BUCKETS::extract_children( // recursive count\n    C_OUTLINE *outline,            // parent outline\n    C_OUTLINE_IT *it               // destination iterator\n) {\n  TDimension xmin, xmax; // coord limits\n  TDimension ymin, ymax;\n  TBOX olbox;\n  C_OUTLINE_IT child_it; // search iterator\n\n  olbox = outline->bounding_box();\n  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;\n  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;\n  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;\n  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;\n  for (auto yindex = ymin; yindex <= ymax; yindex++) {\n    for (auto xindex = xmin; xindex <= xmax; xindex++) {\n      child_it.set_to_list(&buckets[yindex * bxdim + xindex]);\n      for (child_it.mark_cycle_pt(); !child_it.cycled_list();\n           child_it.forward()) {\n        if (*child_it.data() < *outline) {\n          it->add_after_then_move(child_it.extract());\n        }\n      }\n    }\n  }\n}\n\n/// @name extract_edges\n\nvoid extract_edges(Image pix,      // thresholded image\n                   BLOCK *block) { // block to scan\n  C_OUTLINE_LIST outlines;         // outlines in block\n  C_OUTLINE_IT out_it = &outlines;\n\n  block_edges(pix, &(block->pdblk), &out_it);\n  ICOORD bleft; // block box\n  ICOORD tright;\n  block->pdblk.bounding_box(bleft, tright);\n  // make blobs\n  outlines_to_blobs(block, bleft, tright, &outlines);\n}\n\n/// @name fill_buckets\n\nstatic void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block\n                         OL_BUCKETS *buckets       // output buckets\n) {\n  C_OUTLINE_IT out_it = outlines; // iterator\n  C_OUTLINE_IT bucket_it;         // iterator in bucket\n\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    auto outline = out_it.extract(); // take off list\n                             
        // get box\n    const TBOX &ol_box(outline->bounding_box());\n    bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom()));\n    bucket_it.add_to_end(outline);\n  }\n}\n\n/**\n * @name capture_children\n *\n * Find all neighbouring outlines that are children of this outline\n * and either move them to the output list or declare this outline\n * illegal and return false.\n */\n\nstatic bool capture_children(OL_BUCKETS *buckets,  // bucket sort class\n                             C_BLOB_IT *reject_it, // dead grandchildren\n                             C_OUTLINE_IT *blob_it // output outlines\n) {\n  // master outline\n  auto outline = blob_it->data();\n  // no of children\n  int32_t child_count;\n  if (edges_use_new_outline_complexity) {\n    child_count =\n        buckets->outline_complexity(outline, edges_children_count_limit, 0);\n  } else {\n    child_count = buckets->count_children(outline, edges_children_count_limit);\n  }\n  if (child_count > edges_children_count_limit) {\n    return false;\n  }\n\n  if (child_count > 0) {\n    buckets->extract_children(outline, blob_it);\n  }\n  return true;\n}\n\n/**\n * @name empty_buckets\n *\n * Run the edge detector over the block and return a list of blobs.\n */\n\nstatic void empty_buckets(BLOCK *block,       // block to scan\n                          OL_BUCKETS *buckets // output buckets\n) {\n  C_OUTLINE_LIST outlines; // outlines in block\n                           // iterator\n  C_OUTLINE_IT out_it = &outlines;\n  auto start_scan = buckets->start_scan();\n  if (start_scan == nullptr) {\n    return;\n  }\n  C_OUTLINE_IT bucket_it = start_scan;\n  C_BLOB_IT good_blobs = block->blob_list();\n  C_BLOB_IT junk_blobs = block->reject_blobs();\n\n  while (!bucket_it.empty()) {\n    out_it.set_to_list(&outlines);\n    C_OUTLINE_IT parent_it; // parent outline\n    do {\n      parent_it = bucket_it; // find outermost\n      do {\n        bucket_it.forward();\n      } while (!bucket_it.at_first() &&\n      
         !(*parent_it.data() < *bucket_it.data()));\n    } while (!bucket_it.at_first());\n\n    // move to new list\n    out_it.add_after_then_move(parent_it.extract());\n    // healthy blob\n    bool good_blob = capture_children(buckets, &junk_blobs, &out_it);\n    C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,\n                                       &junk_blobs);\n\n    if (auto l = buckets->scan_next())\n      bucket_it.set_to_list(l);\n    else\n      break;\n  }\n}\n\n/**\n * @name outlines_to_blobs\n *\n * Gather together outlines into blobs using the usual bucket sort.\n */\n\nvoid outlines_to_blobs( // find blobs\n    BLOCK *block,       // block to scan\n    ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) {\n  // make buckets\n  OL_BUCKETS buckets(bleft, tright);\n\n  fill_buckets(outlines, &buckets);\n  empty_buckets(block, &buckets);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/edgblob.h",
    "content": "/**********************************************************************\n * File:        edgblob.h  (Formerly edgeloop.h)\n * Description: Functions to clean up an outline before approximation.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef EDGBLOB_H\n#define EDGBLOB_H\n\n#include \"coutln.h\"   // for C_OUTLINE\n#include \"ocrblock.h\" // for BLOCK\n#include \"points.h\"   // for ICOORD\n\n#include <vector>\n\nnamespace tesseract {\n\nclass OL_BUCKETS {\npublic:\n  OL_BUCKETS(       // constructor\n      ICOORD bleft, // corners\n      ICOORD tright);\n\n  C_OUTLINE_LIST *operator()( // array access\n      TDimension x,           // image coords\n      TDimension y);\n  // first non-empty bucket\n  C_OUTLINE_LIST *start_scan();\n  // next non-empty bucket\n  C_OUTLINE_LIST *scan_next();\n  int32_t count_children(     // recursive sum\n      C_OUTLINE *outline,     // parent outline\n      int32_t max_count);     // max output\n  int32_t outline_complexity( // new version of count_children\n      C_OUTLINE *outline,     // parent outline\n      int32_t max_count,      // max output\n      int16_t depth);         // level of recursion\n  void extract_children(      // single level get\n      C_OUTLINE *outline,     // parent outline\n      C_OUTLINE_IT *it);      // destination 
iterator\n\nprivate:\n  int16_t bxdim; // size of array\n  int16_t bydim;\n  std::vector<C_OUTLINE_LIST> buckets; // array of buckets\n  ICOORD bl;                           // corners\n  ICOORD tr;\n  decltype(buckets)::iterator it; // for extraction scan\n\n  C_OUTLINE_LIST *scan_next(decltype(buckets)::iterator it);\n};\n\nvoid extract_edges(Image pix,     // thresholded image\n                   BLOCK *block); // block to scan\nvoid outlines_to_blobs(           // find blobs\n    BLOCK *block,                 // block to scan\n    ICOORD bleft,                 // block box //outlines in block\n    ICOORD tright, C_OUTLINE_LIST *outlines);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/edgloop.cpp",
    "content": "/**********************************************************************\n * File:        edgloop.cpp  (Formerly edgeloop.c)\n * Description: Functions to clean up an outline before approximation.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"scanedg.h\"\n\n#include \"edgloop.h\"\n\nnamespace tesseract {\n\n#define MINEDGELENGTH 8 // min decent length\n\n/**********************************************************************\n * complete_edge\n *\n * Complete the edge by cleaning it up.\n **********************************************************************/\n\nvoid complete_edge(CRACKEDGE *start, // start of loop\n                   C_OUTLINE_IT *outline_it) {\n  ScrollView::Color colour; // colour to draw in\n  int16_t looplength;       // steps in loop\n  ICOORD botleft;           // bounding box\n  ICOORD topright;\n  C_OUTLINE *outline; // new outline\n\n  // check length etc.\n  colour = check_path_legal(start);\n\n  if (colour == ScrollView::RED || colour == ScrollView::BLUE) {\n    looplength = loop_bounding_box(start, botleft, topright);\n    outline = new C_OUTLINE(start, botleft, topright, looplength);\n    // add to list\n    
outline_it->add_after_then_move(outline);\n  }\n}\n\n/**********************************************************************\n * check_path_legal\n *\n * Check that the outline is legal for length and for chaincode sum.\n * The return value is RED for a normal black-inside outline,\n * BLUE for a white-inside outline, MAGENTA if it is too short,\n * YELLOW if it is too long, and GREEN if it is illegal.\n * These colours are used to draw the raw outline.\n **********************************************************************/\n\nScrollView::Color check_path_legal( // certify outline\n    CRACKEDGE *start                // start of loop\n) {\n  int lastchain;     // last chain code\n  int chaindiff;     // chain code diff\n  int32_t length;    // length of loop\n  int32_t chainsum;  // sum of chain diffs\n  CRACKEDGE *edgept; // current point\n  constexpr ERRCODE ED_ILLEGAL_SUM(\"Illegal sum of chain codes\");\n\n  length = 0;\n  chainsum = 0; // sum of chain codes\n  edgept = start;\n  lastchain = edgept->prev->stepdir; // previous chain code\n  do {\n    length++;\n    if (edgept->stepdir != lastchain) {\n      // chain code difference\n      chaindiff = edgept->stepdir - lastchain;\n      if (chaindiff > 2) {\n        chaindiff -= 4;\n      } else if (chaindiff < -2) {\n        chaindiff += 4;\n      }\n      chainsum += chaindiff; // sum differences\n      lastchain = edgept->stepdir;\n    }\n    edgept = edgept->next;\n  } while (edgept != start && length < C_OUTLINE::kMaxOutlineLength);\n\n  if ((chainsum != 4 && chainsum != -4) || edgept != start || length < MINEDGELENGTH) {\n    if (edgept != start) {\n      return ScrollView::YELLOW;\n    } else if (length < MINEDGELENGTH) {\n      return ScrollView::MAGENTA;\n    } else {\n      ED_ILLEGAL_SUM.error(\"check_path_legal\", TESSLOG, \"chainsum=%d\", chainsum);\n      return ScrollView::GREEN;\n    }\n  }\n  // colour on inside\n  return chainsum < 0 ? 
ScrollView::BLUE : ScrollView::RED;\n}\n\n/**********************************************************************\n * loop_bounding_box\n *\n * Find the bounding box of the edge loop.\n **********************************************************************/\n\nint16_t loop_bounding_box( // get bounding box\n    CRACKEDGE *&start,     // edge loop\n    ICOORD &botleft,       // bounding box\n    ICOORD &topright) {\n  int16_t length;       // length of loop\n  int16_t leftmost;     // on top row\n  CRACKEDGE *edgept;    // current point\n  CRACKEDGE *realstart; // topleft start\n\n  edgept = start;\n  realstart = start;\n  botleft = topright = ICOORD(edgept->pos.x(), edgept->pos.y());\n  leftmost = edgept->pos.x();\n  length = 0; // count length\n  do {\n    edgept = edgept->next;\n    if (edgept->pos.x() < botleft.x()) {\n      // get bounding box\n      botleft.set_x(edgept->pos.x());\n    } else if (edgept->pos.x() > topright.x()) {\n      topright.set_x(edgept->pos.x());\n    }\n    if (edgept->pos.y() < botleft.y()) {\n      // get bounding box\n      botleft.set_y(edgept->pos.y());\n    } else if (edgept->pos.y() > topright.y()) {\n      realstart = edgept;\n      leftmost = edgept->pos.x();\n      topright.set_y(edgept->pos.y());\n    } else if (edgept->pos.y() == topright.y() && edgept->pos.x() < leftmost) {\n      // leftmost on line\n      leftmost = edgept->pos.x();\n      realstart = edgept;\n    }\n    length++; // count elements\n  } while (edgept != start);\n  start = realstart; // shift it to topleft\n  return length;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/edgloop.h",
    "content": "/**********************************************************************\n * File:        edgloop.h  (Formerly edgeloop.h)\n * Description: Functions to clean up an outline before approximation.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef EDGLOOP_H\n#define EDGLOOP_H\n\n#include \"coutln.h\"\n#include \"crakedge.h\"\n#include \"params.h\"\n#include \"pdblock.h\"\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n#define BUCKETSIZE 16\n\nvoid complete_edge(CRACKEDGE *start, // start of loop\n                   C_OUTLINE_IT *outline_it);\nScrollView::Color check_path_legal( // certify outline\n    CRACKEDGE *start                // start of loop\n);\nint16_t loop_bounding_box( // get bounding box\n    CRACKEDGE *&start,     // edge loop\n    ICOORD &botleft,       // bounding box\n    ICOORD &topright);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/equationdetectbase.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        equationdetectbase.cpp\n// Description: The base class equation detection class.\n// Author:      Zongyi (Joe) Liu (joeliu@google.com)\n// Created:     Fri Aug 31 11:13:01 PST 2011\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"equationdetectbase.h\"\n\n#include \"blobbox.h\"\n\n#include <allheaders.h>\n\nnamespace tesseract {\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nEquationDetectBase::~EquationDetectBase() = default;\n\nvoid EquationDetectBase::RenderSpecialText(Image pix, BLOBNBOX *blob) {\n  ASSERT_HOST(pix != nullptr && pixGetDepth(pix) == 32 && blob != nullptr);\n  const TBOX &tbox = blob->bounding_box();\n  int height = pixGetHeight(pix);\n  const int box_width = 5;\n\n  // Coordinate translation: tesseract use left bottom as the original, while\n  // leptonica uses left top as the original.\n  Box *box = boxCreate(tbox.left(), height - tbox.top(), tbox.width(), tbox.height());\n  switch (blob->special_text_type()) {\n    case BSTT_MATH: // Red box.\n      pixRenderBoxArb(pix, box, box_width, 255, 0, 0);\n      break;\n    case BSTT_DIGIT: // cyan box.\n      
pixRenderBoxArb(pix, box, box_width, 0, 255, 255);\n      break;\n    case BSTT_ITALIC: // Green box.\n      pixRenderBoxArb(pix, box, box_width, 0, 255, 0);\n      break;\n    case BSTT_UNCLEAR: // blue box.\n      pixRenderBoxArb(pix, box, box_width, 0, 0, 255);\n      break;\n    case BSTT_NONE:\n    default:\n      // yellow box.\n      pixRenderBoxArb(pix, box, box_width, 255, 255, 0);\n      break;\n  }\n  boxDestroy(&box);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/equationdetectbase.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        equationdetectbase.h\n// Description: The base class equation detection class.\n// Author:      Zongyi (Joe) Liu (joeliu@google.com)\n// Created:     Fri Aug 31 11:13:01 PST 2011\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_\n#define TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_\n\n#include \"image.h\"\n\nclass BLOBNBOX_LIST;\nclass TO_BLOCK;\nstruct Pix;\n\nnamespace tesseract {\n\nclass ColPartitionGrid;\nclass ColPartitionSet;\nclass BLOBNBOX;\n\nclass TESS_API EquationDetectBase {\npublic:\n  EquationDetectBase() = default;\n  virtual ~EquationDetectBase();\n\n  // Iterate over the blobs inside to_block, and set the blobs that we want to\n  // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function\n  // returns 0 upon success.\n  virtual int LabelSpecialText(TO_BLOCK *to_block) = 0;\n\n  // Interface to find possible equation partition grid from part_grid. 
This\n  // should be called after IdentifySpecialText function.\n  virtual int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) = 0;\n\n  // Debug function: Render a bounding box on pix based on the value of its\n  // special_text_type, specifically:\n  // BSTT_MATH: red box\n  // BSTT_DIGIT: cyan box\n  // BSTT_ITALIC: green box\n  // BSTT_UNCLEAR: blue box\n  // All others: yellow box\n  static void RenderSpecialText(Image pix, BLOBNBOX *blob);\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_\n"
  },
  {
    "path": "src/textord/fpchop.cpp",
    "content": "/**********************************************************************\n * File:        fpchop.cpp  (Formerly fp_chop.c)\n * Description: Code to chop fixed pitch text into character cells.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"fpchop.h\"\n\n#include \"blobbox.h\"\n#include \"drawtord.h\"\n#include \"statistc.h\"\n#include \"topitch.h\"\n#include \"tovars.h\"\n\nnamespace tesseract {\n\nINT_VAR(textord_fp_chop_error, 2, \"Max allowed bending of chop cells\");\n\nstatic WERD *add_repeated_word(WERD_IT *rep_it, int16_t &rep_left, int16_t &prev_chop_coord,\n                               uint8_t &blanks, float pitch, WERD_IT *word_it);\n\nstatic void fixed_chop_cblob(C_BLOB *blob, int16_t chop_coord, float pitch_error,\n                             C_OUTLINE_LIST *left_outlines, C_OUTLINE_LIST *right_outlines);\n\nstatic void fixed_split_coutline(C_OUTLINE *srcline, int16_t chop_coord, float pitch_error,\n                                 C_OUTLINE_IT *left_it, C_OUTLINE_IT *right_it);\n\nstatic bool fixed_chop_coutline(C_OUTLINE *srcline, int16_t chop_coord, float pitch_error,\n                                C_OUTLINE_FRAG_LIST 
*left_frags, C_OUTLINE_FRAG_LIST *right_frags);\n\nstatic void save_chop_cfragment(int16_t head_index, ICOORD head_pos, int16_t tail_index,\n                                ICOORD tail_pos, C_OUTLINE *srcline, C_OUTLINE_FRAG_LIST *frags);\n\nstatic void add_frag_to_list(C_OUTLINE_FRAG *frag, C_OUTLINE_FRAG_LIST *frags);\n\nstatic void close_chopped_cfragments(C_OUTLINE_FRAG_LIST *frags, C_OUTLINE_LIST *children,\n                                     float pitch_error, C_OUTLINE_IT *dest_it);\n\nstatic C_OUTLINE *join_chopped_fragments(C_OUTLINE_FRAG *bottom, C_OUTLINE_FRAG *top);\n\nstatic void join_segments(C_OUTLINE_FRAG *bottom, C_OUTLINE_FRAG *top);\n\n/**********************************************************************\n * fixed_pitch_words\n *\n * Make a ROW from a fixed pitch TO_ROW.\n **********************************************************************/\nROW *fixed_pitch_words( // find lines\n    TO_ROW *row,        // row to do\n    FCOORD rotation     // for drawing\n) {\n  bool bol;                // start of line\n  uint8_t blanks;          // in front of word\n  uint8_t new_blanks;      // blanks in empty cell\n  int16_t chop_coord;      // chop boundary\n  int16_t prev_chop_coord; // start of cell\n  int16_t rep_left;        // left edge of rep word\n  ROW *real_row;           // output row\n  C_OUTLINE_LIST left_coutlines;\n  C_OUTLINE_LIST right_coutlines;\n  C_BLOB_LIST cblobs;\n  C_BLOB_IT cblob_it = &cblobs;\n  WERD_LIST words;\n  WERD_IT word_it = &words; // new words\n                            // repeated blobs\n  WERD_IT rep_it = &row->rep_words;\n  WERD *word;         // new word\n  int32_t xstarts[2]; // row ends\n  int32_t prev_x;     // end of prev blob\n                      // iterator\n  BLOBNBOX_IT box_it = row->blob_list();\n  // boundaries\n  ICOORDELT_IT cell_it = &row->char_cells;\n\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_page_cuts && to_win != nullptr) {\n    plot_row_cells(to_win, ScrollView::RED, row, 0, 
&row->char_cells);\n  }\n#endif\n\n  prev_x = -INT16_MAX;\n  bol = true;\n  blanks = 0;\n  if (rep_it.empty()) {\n    rep_left = INT16_MAX;\n  } else {\n    rep_left = rep_it.data()->bounding_box().left();\n  }\n  if (box_it.empty()) {\n    return nullptr; // empty row\n  }\n  xstarts[0] = box_it.data()->bounding_box().left();\n  if (rep_left < xstarts[0]) {\n    xstarts[0] = rep_left;\n  }\n  if (cell_it.empty() || row->char_cells.singleton()) {\n    tprintf(\"Row without enough char cells!\\n\");\n    tprintf(\"Leftmost blob is at (%d,%d)\\n\", box_it.data()->bounding_box().left(),\n            box_it.data()->bounding_box().bottom());\n    return nullptr;\n  }\n  ASSERT_HOST(!cell_it.empty() && !row->char_cells.singleton());\n  prev_chop_coord = cell_it.data()->x();\n  word = nullptr;\n  while (rep_left < cell_it.data()->x()) {\n    word =\n        add_repeated_word(&rep_it, rep_left, prev_chop_coord, blanks, row->fixed_pitch, &word_it);\n  }\n  cell_it.mark_cycle_pt();\n  if (prev_chop_coord >= cell_it.data()->x()) {\n    cell_it.forward();\n  }\n  for (; !cell_it.cycled_list(); cell_it.forward()) {\n    chop_coord = cell_it.data()->x();\n    while (!box_it.empty() && box_it.data()->bounding_box().left() <= chop_coord) {\n      if (box_it.data()->bounding_box().right() > prev_x) {\n        prev_x = box_it.data()->bounding_box().right();\n      }\n      split_to_blob(box_it.extract(), chop_coord, textord_fp_chop_error + 0.5f, &left_coutlines,\n                    &right_coutlines);\n      box_it.forward();\n      while (!box_it.empty() && box_it.data()->cblob() == nullptr) {\n        delete box_it.extract();\n        box_it.forward();\n      }\n    }\n    if (!right_coutlines.empty() && left_coutlines.empty()) {\n      split_to_blob(nullptr, chop_coord, textord_fp_chop_error + 0.5f, &left_coutlines,\n                    &right_coutlines);\n    }\n    if (!left_coutlines.empty()) {\n      cblob_it.add_after_then_move(new C_BLOB(&left_coutlines));\n    } else {\n   
   if (rep_left < chop_coord) {\n        if (rep_left > prev_chop_coord) {\n          new_blanks =\n              static_cast<uint8_t>(floor((rep_left - prev_chop_coord) / row->fixed_pitch + 0.5));\n        } else {\n          new_blanks = 0;\n        }\n      } else {\n        if (chop_coord > prev_chop_coord) {\n          new_blanks =\n              static_cast<uint8_t>(floor((chop_coord - prev_chop_coord) / row->fixed_pitch + 0.5));\n        } else {\n          new_blanks = 0;\n        }\n      }\n      if (!cblob_it.empty()) {\n        if (blanks < 1 && word != nullptr && !word->flag(W_REP_CHAR)) {\n          blanks = 1;\n        }\n        word = new WERD(&cblobs, blanks, nullptr);\n        cblob_it.set_to_list(&cblobs);\n        word->set_flag(W_DONT_CHOP, true);\n        word_it.add_after_then_move(word);\n        if (bol) {\n          word->set_flag(W_BOL, true);\n          bol = false;\n        }\n        blanks = new_blanks;\n      } else {\n        blanks += new_blanks;\n      }\n      while (rep_left < chop_coord) {\n        word = add_repeated_word(&rep_it, rep_left, prev_chop_coord, blanks, row->fixed_pitch,\n                                 &word_it);\n      }\n    }\n    if (prev_chop_coord < chop_coord) {\n      prev_chop_coord = chop_coord;\n    }\n  }\n  if (!cblob_it.empty()) {\n    word = new WERD(&cblobs, blanks, nullptr);\n    word->set_flag(W_DONT_CHOP, true);\n    word_it.add_after_then_move(word);\n    if (bol) {\n      word->set_flag(W_BOL, true);\n    }\n  }\n  ASSERT_HOST(word != nullptr);\n  while (!rep_it.empty()) {\n    add_repeated_word(&rep_it, rep_left, prev_chop_coord, blanks, row->fixed_pitch, &word_it);\n  }\n  // at end of line\n  word_it.data()->set_flag(W_EOL, true);\n  if (prev_chop_coord > prev_x) {\n    prev_x = prev_chop_coord;\n  }\n  xstarts[1] = prev_x + 1;\n  real_row =\n      new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));\n  word_it.set_to_list(real_row->word_list());\n  
// put words in row\n  word_it.add_list_after(&words);\n  real_row->recalc_bounding_box();\n  return real_row;\n}\n\n/**********************************************************************\n * add_repeated_word\n *\n * Add repeated word into the row at the given point.\n **********************************************************************/\n\nstatic WERD *add_repeated_word( // move repeated word\n    WERD_IT *rep_it,            // repeated words\n    int16_t &rep_left,          // left edge of word\n    int16_t &prev_chop_coord,   // previous word end\n    uint8_t &blanks,            // no of blanks\n    float pitch,                // char cell size\n    WERD_IT *word_it            // list of words\n) {\n  WERD *word;         // word to move\n  int16_t new_blanks; // extra blanks\n\n  if (rep_left > prev_chop_coord) {\n    new_blanks = static_cast<uint8_t>(floor((rep_left - prev_chop_coord) / pitch + 0.5));\n    blanks += new_blanks;\n  }\n  word = rep_it->extract();\n  prev_chop_coord = word->bounding_box().right();\n  word_it->add_after_then_move(word);\n  word->set_blanks(blanks);\n  rep_it->forward();\n  if (rep_it->empty()) {\n    rep_left = INT16_MAX;\n  } else {\n    rep_left = rep_it->data()->bounding_box().left();\n  }\n  blanks = 0;\n  return word;\n}\n\n/**********************************************************************\n * split_to_blob\n *\n * Split a BLOBNBOX across a vertical chop line and put the pieces\n * into a left outline list and a right outline list.\n **********************************************************************/\n\nvoid split_to_blob(                 // split the blob\n    BLOBNBOX *blob,                 // blob to split\n    int16_t chop_coord,             // place to chop\n    float pitch_error,              // allowed deviation\n    C_OUTLINE_LIST *left_coutlines, // for cblobs\n    C_OUTLINE_LIST *right_coutlines) {\n  C_BLOB *real_cblob; // cblob to chop\n\n  if (blob != nullptr) {\n    real_cblob = 
blob->remove_cblob();\n  } else {\n    real_cblob = nullptr;\n  }\n  if (!right_coutlines->empty() || real_cblob != nullptr) {\n    fixed_chop_cblob(real_cblob, chop_coord, pitch_error, left_coutlines, right_coutlines);\n  }\n\n  delete blob;\n}\n\n/**********************************************************************\n * fixed_chop_cblob\n *\n * Chop the given cblob (if any) and the existing right outlines to\n * produce a list of outlines left of the chop point and more to the right.\n **********************************************************************/\n\nstatic void fixed_chop_cblob(      // split the blob\n    C_BLOB *blob,                  // blob to split\n    int16_t chop_coord,            // place to chop\n    float pitch_error,             // allowed deviation\n    C_OUTLINE_LIST *left_outlines, // left half of chop\n    C_OUTLINE_LIST *right_outlines // right half of chop\n) {\n  C_OUTLINE *old_right;        // already there\n  C_OUTLINE_LIST new_outlines; // new right ones\n                               // output iterator\n  C_OUTLINE_IT left_it = left_outlines;\n  // in/out iterator\n  C_OUTLINE_IT right_it = right_outlines;\n  C_OUTLINE_IT new_it = &new_outlines;\n  C_OUTLINE_IT blob_it; // outlines in blob\n\n  if (!right_it.empty()) {\n    while (!right_it.empty()) {\n      old_right = right_it.extract();\n      right_it.forward();\n      fixed_split_coutline(old_right, chop_coord, pitch_error, &left_it, &new_it);\n    }\n    right_it.add_list_before(&new_outlines);\n  }\n  if (blob != nullptr) {\n    blob_it.set_to_list(blob->out_list());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      fixed_split_coutline(blob_it.extract(), chop_coord, pitch_error, &left_it, &right_it);\n    }\n    delete blob;\n  }\n}\n\n/**********************************************************************\n * fixed_split_outline\n *\n * Chop the given outline (if necessary) placing the fragments which\n * fall either side of the chop 
line into the appropriate list.\n **********************************************************************/\n\nstatic void fixed_split_coutline( // chop the outline\n    C_OUTLINE *srcline,           // source outline\n    int16_t chop_coord,           // place to chop\n    float pitch_error,            // allowed deviation\n    C_OUTLINE_IT *left_it,        // left half of chop\n    C_OUTLINE_IT *right_it        // right half of chop\n) {\n  C_OUTLINE *child;               // child outline\n  TBOX srcbox;                    // box of outline\n  C_OUTLINE_LIST left_ch;         // left children\n  C_OUTLINE_LIST right_ch;        // right children\n  C_OUTLINE_FRAG_LIST left_frags; // chopped fragments\n  C_OUTLINE_FRAG_LIST right_frags;\n  ;\n  C_OUTLINE_IT left_ch_it = &left_ch;\n  // for whole children\n  C_OUTLINE_IT right_ch_it = &right_ch;\n  // for holes\n  C_OUTLINE_IT child_it = srcline->child();\n\n  srcbox = srcline->bounding_box();\n  if (srcbox.left() + srcbox.right() <= chop_coord * 2 &&\n      srcbox.right() < chop_coord + pitch_error) {\n    // Whole outline is in the left side or not far over the chop_coord,\n    // so put the whole thing on the left.\n    left_it->add_after_then_move(srcline);\n  } else if (srcbox.left() + srcbox.right() > chop_coord * 2 &&\n             srcbox.left() > chop_coord - pitch_error) {\n    // Whole outline is in the right side or not far over the chop_coord,\n    // so put the whole thing on the right.\n    right_it->add_before_stay_put(srcline);\n  } else {\n    // Needs real chopping.\n    if (fixed_chop_coutline(srcline, chop_coord, pitch_error, &left_frags, &right_frags)) {\n      for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) {\n        child = child_it.extract();\n        srcbox = child->bounding_box();\n        if (srcbox.right() < chop_coord) {\n          // Whole child is on the left.\n          left_ch_it.add_after_then_move(child);\n        } else if (srcbox.left() > chop_coord) {\n 
         // Whole child is on the right.\n          right_ch_it.add_after_then_move(child);\n        } else {\n          // No pitch_error is allowed when chopping children to prevent\n          // impossible outlines from being created.\n          if (fixed_chop_coutline(child, chop_coord, 0.0f, &left_frags, &right_frags)) {\n            delete child;\n          } else {\n            if (srcbox.left() + srcbox.right() <= chop_coord * 2) {\n              left_ch_it.add_after_then_move(child);\n            } else {\n              right_ch_it.add_after_then_move(child);\n            }\n          }\n        }\n      }\n      close_chopped_cfragments(&left_frags, &left_ch, pitch_error, left_it);\n      close_chopped_cfragments(&right_frags, &right_ch, pitch_error, right_it);\n      ASSERT_HOST(left_ch.empty() && right_ch.empty());\n      // No children left.\n      delete srcline; // Smashed up.\n    } else {\n      // Chop failed. Just use middle coord.\n      if (srcbox.left() + srcbox.right() <= chop_coord * 2) {\n        left_it->add_after_then_move(srcline); // Stick whole in left.\n      } else {\n        right_it->add_before_stay_put(srcline);\n      }\n    }\n  }\n}\n\n/**********************************************************************\n * fixed_chop_coutline\n *\n * Chop the given coutline (if necessary) placing the fragments which\n * fall either side of the chop line into the appropriate list.\n * If the coutline lies too heavily to one side to chop, false is returned.\n **********************************************************************/\n\nstatic bool fixed_chop_coutline(     // chop the outline\n    C_OUTLINE *srcline,              // source outline\n    int16_t chop_coord,              // place to chop\n    float pitch_error,               // allowed deviation\n    C_OUTLINE_FRAG_LIST *left_frags, // left half of chop\n    C_OUTLINE_FRAG_LIST *right_frags // right half of chop\n) {\n  bool first_frag;         // fragment\n  int16_t left_edge;      
 // of outline\n  int16_t startindex;      // in first fragment\n  int32_t length;          // of outline\n  int16_t stepindex;       // into outline\n  int16_t head_index;      // start of fragment\n  ICOORD head_pos;         // start of fragment\n  int16_t tail_index;      // end of fragment\n  ICOORD tail_pos;         // end of fragment\n  ICOORD pos;              // current point\n  int16_t first_index = 0; // first tail\n  ICOORD first_pos;        // first tail\n\n  length = srcline->pathlength();\n  pos = srcline->start_pos();\n  left_edge = pos.x();\n  tail_index = 0;\n  tail_pos = pos;\n  for (stepindex = 0; stepindex < length; stepindex++) {\n    if (pos.x() < left_edge) {\n      left_edge = pos.x();\n      tail_index = stepindex;\n      tail_pos = pos;\n    }\n    pos += srcline->step(stepindex);\n  }\n  if (left_edge >= chop_coord - pitch_error) {\n    return false; // not worth it\n  }\n\n  startindex = tail_index;\n  first_frag = true;\n  head_index = tail_index;\n  head_pos = tail_pos;\n  do {\n    do {\n      tail_pos += srcline->step(tail_index);\n      tail_index++;\n      if (tail_index == length) {\n        tail_index = 0;\n      }\n    } while (tail_pos.x() != chop_coord && tail_index != startindex);\n    if (tail_index == startindex) {\n      if (first_frag) {\n        return false; // doesn't cross line\n      } else {\n        break;\n      }\n    }\n    ASSERT_HOST(head_index != tail_index);\n    if (!first_frag) {\n      save_chop_cfragment(head_index, head_pos, tail_index, tail_pos, srcline, left_frags);\n    } else {\n      first_index = tail_index;\n      first_pos = tail_pos;\n      first_frag = false;\n    }\n    while (srcline->step(tail_index).x() == 0) {\n      tail_pos += srcline->step(tail_index);\n      tail_index++;\n      if (tail_index == length) {\n        tail_index = 0;\n      }\n    }\n    head_index = tail_index;\n    head_pos = tail_pos;\n    while (srcline->step(tail_index).x() > 0) {\n      do {\n        tail_pos += 
srcline->step(tail_index);\n        tail_index++;\n        if (tail_index == length) {\n          tail_index = 0;\n        }\n      } while (tail_pos.x() != chop_coord);\n      ASSERT_HOST(head_index != tail_index);\n      save_chop_cfragment(head_index, head_pos, tail_index, tail_pos, srcline, right_frags);\n      while (srcline->step(tail_index).x() == 0) {\n        tail_pos += srcline->step(tail_index);\n        tail_index++;\n        if (tail_index == length) {\n          tail_index = 0;\n        }\n      }\n      head_index = tail_index;\n      head_pos = tail_pos;\n    }\n  } while (tail_index != startindex);\n  save_chop_cfragment(head_index, head_pos, first_index, first_pos, srcline, left_frags);\n  return true; // did some chopping\n}\n\n/**********************************************************************\n * save_chop_cfragment\n *\n * Store the given fragment in the given fragment list.\n **********************************************************************/\n\nstatic void save_chop_cfragment( // chop the outline\n    int16_t head_index,          // head of fragment\n    ICOORD head_pos,             // head of fragment\n    int16_t tail_index,          // tail of fragment\n    ICOORD tail_pos,             // tail of fragment\n    C_OUTLINE *srcline,          // source of edgesteps\n    C_OUTLINE_FRAG_LIST *frags   // fragment list\n) {\n  int16_t jump;         // gap across end\n  int16_t stepcount;    // total steps\n  C_OUTLINE_FRAG *head; // head of fragment\n  C_OUTLINE_FRAG *tail; // tail of fragment\n  int16_t tail_y;       // ycoord of tail\n\n  ASSERT_HOST(tail_pos.x() == head_pos.x());\n  ASSERT_HOST(tail_index != head_index);\n  stepcount = tail_index - head_index;\n  if (stepcount < 0) {\n    stepcount += srcline->pathlength();\n  }\n  jump = tail_pos.y() - head_pos.y();\n  if (jump < 0) {\n    jump = -jump;\n  }\n  if (jump == stepcount) {\n    return; // its a nop\n  }\n  tail_y = tail_pos.y();\n  head = new C_OUTLINE_FRAG(head_pos, 
tail_pos, srcline, head_index, tail_index);\n  tail = new C_OUTLINE_FRAG(head, tail_y);\n  head->other_end = tail;\n  add_frag_to_list(head, frags);\n  add_frag_to_list(tail, frags);\n}\n\n/**********************************************************************\n * C_OUTLINE_FRAG::C_OUTLINE_FRAG\n *\n * Constructors for C_OUTLINE_FRAG.\n **********************************************************************/\n\nC_OUTLINE_FRAG::C_OUTLINE_FRAG( // record fragment\n    ICOORD start_pt,            // start coord\n    ICOORD end_pt,              // end coord\n    C_OUTLINE *outline,         // source of steps\n    int16_t start_index, int16_t end_index) {\n  start = start_pt;\n  end = end_pt;\n  ycoord = start_pt.y();\n  stepcount = end_index - start_index;\n  if (stepcount < 0) {\n    stepcount += outline->pathlength();\n  }\n  ASSERT_HOST(stepcount > 0);\n  steps = new DIR128[stepcount];\n  if (end_index > start_index) {\n    for (int i = start_index; i < end_index; ++i) {\n      steps[i - start_index] = outline->step_dir(i);\n    }\n  } else {\n    int len = outline->pathlength();\n    int i = start_index;\n    for (; i < len; ++i) {\n      steps[i - start_index] = outline->step_dir(i);\n    }\n    if (end_index > 0) {\n      for (; i < end_index + len; ++i) {\n        steps[i - start_index] = outline->step_dir(i - len);\n      }\n    }\n  }\n  other_end = nullptr;\n  delete close();\n}\n\nC_OUTLINE_FRAG::C_OUTLINE_FRAG( // record fragment\n    C_OUTLINE_FRAG *head,       // other end\n    int16_t tail_y) {\n  ycoord = tail_y;\n  other_end = head;\n  start = head->start;\n  end = head->end;\n  steps = nullptr;\n  stepcount = 0;\n}\n\n/**********************************************************************\n * add_frag_to_list\n *\n * Insert the fragment in the list at the appropriate place to keep\n * them in ascending ycoord order.\n **********************************************************************/\n\nstatic void add_frag_to_list(  // ordered add\n    
C_OUTLINE_FRAG *frag,      // fragment to add\n    C_OUTLINE_FRAG_LIST *frags // fragment list\n) {\n  // output list\n  C_OUTLINE_FRAG_IT frag_it = frags;\n\n  if (!frags->empty()) {\n    for (frag_it.mark_cycle_pt(); !frag_it.cycled_list(); frag_it.forward()) {\n      if (frag_it.data()->ycoord > frag->ycoord ||\n          (frag_it.data()->ycoord == frag->ycoord && frag->other_end->ycoord < frag->ycoord)) {\n        frag_it.add_before_then_move(frag);\n        return;\n      }\n    }\n  }\n  frag_it.add_to_end(frag);\n}\n\n/**********************************************************************\n * close_chopped_cfragments\n *\n * Clear the given list of fragments joining them up into outlines.\n * Each outline made soaks up any of the child outlines which it encloses.\n **********************************************************************/\n\nstatic void close_chopped_cfragments( // chop the outline\n    C_OUTLINE_FRAG_LIST *frags,       // list to clear\n    C_OUTLINE_LIST *children,         // potential children\n    float pitch_error,                // allowed shrinkage\n    C_OUTLINE_IT *dest_it             // output list\n) {\n  // iterator\n  C_OUTLINE_FRAG_IT frag_it = frags;\n  C_OUTLINE_FRAG *bottom_frag; // bottom of cut\n  C_OUTLINE_FRAG *top_frag;    // top of cut\n  C_OUTLINE *outline;          // new outline\n  C_OUTLINE *child;            // current child\n  C_OUTLINE_IT child_it = children;\n  C_OUTLINE_IT olchild_it; // children of outline\n\n  while (!frag_it.empty()) {\n    frag_it.move_to_first();\n    // get bottom one\n    bottom_frag = frag_it.extract();\n    frag_it.forward();\n    top_frag = frag_it.data(); // look at next\n    if ((bottom_frag->steps == nullptr && top_frag->steps == nullptr) ||\n        (bottom_frag->steps != nullptr && top_frag->steps != nullptr)) {\n      if (frag_it.data_relative(1)->ycoord == top_frag->ycoord) {\n        frag_it.forward();\n      }\n    }\n    top_frag = frag_it.extract();\n    if 
(top_frag->other_end != bottom_frag) {\n      outline = join_chopped_fragments(bottom_frag, top_frag);\n      ASSERT_HOST(outline == nullptr);\n    } else {\n      outline = join_chopped_fragments(bottom_frag, top_frag);\n      if (outline != nullptr) {\n        olchild_it.set_to_list(outline->child());\n        for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) {\n          child = child_it.data();\n          if (*child < *outline) {\n            olchild_it.add_to_end(child_it.extract());\n          }\n        }\n        if (outline->bounding_box().width() > pitch_error) {\n          dest_it->add_after_then_move(outline);\n        } else {\n          delete outline; // Make it disappear.\n        }\n      }\n    }\n  }\n  while (!child_it.empty()) {\n    dest_it->add_after_then_move(child_it.extract());\n    child_it.forward();\n  }\n}\n\n/**********************************************************************\n * join_chopped_fragments\n *\n * Join the two lists of POLYPTs such that neither OUTLINE_FRAG\n * operand keeps responsibility for the fragment.\n **********************************************************************/\n\nstatic C_OUTLINE *join_chopped_fragments( // join pieces\n    C_OUTLINE_FRAG *bottom,               // bottom of cut\n    C_OUTLINE_FRAG *top                   // top of cut\n) {\n  C_OUTLINE *outline; // closed loop\n\n  if (bottom->other_end == top) {\n    if (bottom->steps == nullptr) {\n      outline = top->close(); // turn to outline\n    } else {\n      outline = bottom->close();\n    }\n    delete top;\n    delete bottom;\n    return outline;\n  }\n  if (bottom->steps == nullptr) {\n    ASSERT_HOST(top->steps != nullptr);\n    join_segments(bottom->other_end, top);\n  } else {\n    ASSERT_HOST(top->steps == nullptr);\n    join_segments(top->other_end, bottom);\n  }\n  top->other_end->other_end = bottom->other_end;\n  bottom->other_end->other_end = top->other_end;\n  delete bottom;\n  delete top;\n  return 
nullptr;\n}\n\n/**********************************************************************\n * join_segments\n *\n * Join the two edgestep fragments such that the second comes after\n * the first and the gap between them is closed.\n **********************************************************************/\n\nstatic void join_segments(  // join pieces\n    C_OUTLINE_FRAG *bottom, // bottom of cut\n    C_OUTLINE_FRAG *top     // top of cut\n) {\n  DIR128 *steps;      // new steps\n  int32_t stepcount;  // no of steps\n  int16_t fake_count; // fake steps\n  DIR128 fake_step;   // step entry\n\n  ASSERT_HOST(bottom->end.x() == top->start.x());\n  fake_count = top->start.y() - bottom->end.y();\n  if (fake_count < 0) {\n    fake_count = -fake_count;\n    fake_step = 32;\n  } else {\n    fake_step = 96;\n  }\n\n  stepcount = bottom->stepcount + fake_count + top->stepcount;\n  steps = new DIR128[stepcount];\n  memmove(steps, bottom->steps, bottom->stepcount);\n  memset(steps + bottom->stepcount, fake_step.get_dir(), fake_count);\n  memmove(steps + bottom->stepcount + fake_count, top->steps, top->stepcount);\n  delete[] bottom->steps;\n  bottom->steps = steps;\n  bottom->stepcount = stepcount;\n  bottom->end = top->end;\n  bottom->other_end->end = top->end;\n}\n\n/**********************************************************************\n * C_OUTLINE_FRAG::close\n *\n * Join the ends of this fragment and turn it into an outline.\n **********************************************************************/\n\nC_OUTLINE *C_OUTLINE_FRAG::close() { // join pieces\n  DIR128 *new_steps;                 // new steps\n  int32_t new_stepcount;             // no of steps\n  int16_t fake_count;                // fake steps\n  DIR128 fake_step;                  // step entry\n\n  ASSERT_HOST(start.x() == end.x());\n  fake_count = start.y() - end.y();\n  if (fake_count < 0) {\n    fake_count = -fake_count;\n    fake_step = 32;\n  } else {\n    fake_step = 96;\n  }\n\n  new_stepcount = stepcount + 
fake_count;\n  if (new_stepcount > C_OUTLINE::kMaxOutlineLength) {\n    return nullptr; // Can't join them\n  }\n  new_steps = new DIR128[new_stepcount];\n  memmove(new_steps, steps, stepcount);\n  memset(new_steps + stepcount, fake_step.get_dir(), fake_count);\n  auto *result = new C_OUTLINE(start, new_steps, new_stepcount);\n  delete[] new_steps;\n  return result;\n}\n\n/**********************************************************************\n * C_OUTLINE_FRAG::operator=\n *\n * Copy this fragment.\n **********************************************************************/\n\n// join pieces\nC_OUTLINE_FRAG &C_OUTLINE_FRAG::operator=(const C_OUTLINE_FRAG &src // fragment to copy\n) {\n  delete[] steps;\n\n  stepcount = src.stepcount;\n  steps = new DIR128[stepcount];\n  memmove(steps, src.steps, stepcount);\n  start = src.start;\n  end = src.end;\n  ycoord = src.ycoord;\n  return *this;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/fpchop.h",
    "content": "/**********************************************************************\n * File:        fpchop.h  (Formerly fp_chop.h)\n * Description: Code to chop fixed pitch text into character cells.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef FPCHOP_H\n#define FPCHOP_H\n\n#include \"blobbox.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\nclass C_OUTLINE_FRAG : public ELIST<C_OUTLINE_FRAG>::LINK {\npublic:\n  C_OUTLINE_FRAG() { // empty constructor\n    steps = nullptr;\n    stepcount = 0;\n  }\n  ~C_OUTLINE_FRAG() {\n    delete[] steps;\n  }\n  // start coord\n  C_OUTLINE_FRAG(ICOORD start_pt,\n                 ICOORD end_pt,      // end coord\n                 C_OUTLINE *outline, // source of steps\n                 int16_t start_index, int16_t end_index);\n  // other end\n  C_OUTLINE_FRAG(C_OUTLINE_FRAG *head, int16_t tail_y);\n  C_OUTLINE *close();        // copy to outline\n  C_OUTLINE_FRAG &operator=( // assign\n      const C_OUTLINE_FRAG &src);\n\n  ICOORD start;              // start coord\n  ICOORD end;                // end coord\n  DIR128 *steps;             // step array\n  int32_t stepcount;         // no of steps\n  C_OUTLINE_FRAG *other_end; // head if a tail\n  int16_t ycoord;            // coord of cut pt\n\nprivate:\n  // Copy constructor (currently unused, 
therefore private).\n  C_OUTLINE_FRAG(const C_OUTLINE_FRAG &other) = delete;\n};\n\nELISTIZEH(C_OUTLINE_FRAG)\n\nextern INT_VAR_H(textord_fp_chop_error);\n\nROW *fixed_pitch_words( // find lines\n    TO_ROW *row,        // row to do\n    FCOORD rotation     // for drawing\n);\n\nvoid split_to_blob(                 // split the blob\n    BLOBNBOX *blob,                 // blob to split\n    int16_t chop_coord,             // place to chop\n    float pitch_error,              // allowed deviation\n    C_OUTLINE_LIST *left_coutlines, // for cblobs\n    C_OUTLINE_LIST *right_coutlines);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/gap_map.cpp",
    "content": "// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"gap_map.h\"\n\n#include \"statistc.h\"\n\nnamespace tesseract {\n\nBOOL_VAR(gapmap_debug, false, \"Say which blocks have tables\");\nBOOL_VAR(gapmap_use_ends, false, \"Use large space at start and end of rows\");\nBOOL_VAR(gapmap_no_isolated_quanta, false, \"Ensure gaps not less than 2quanta wide\");\ndouble_VAR(gapmap_big_gaps, 1.75, \"xht multiplier\");\n\n/*************************************************************************\n * A block gap map is a quantised histogram of whitespace regions in the\n * block. It is a vertical projection of wide gaps WITHIN lines\n *\n * The map is held as an array of counts of rows which have a wide gap\n * covering that region of the row. Each bucket in the map represents a width\n * of about half an xheight - (The median of the xhts in the rows is used.)\n *\n * The block is considered RECTANGULAR - delimited by the left and right\n * extremes of the rows in the block. 
However, ONLY wide gaps WITHIN a row are\n * counted.\n *\n *************************************************************************/\n\nGAPMAP::GAPMAP(     // Constructor\n    TO_BLOCK *block // block\n) {\n  TO_ROW *row;         // current row\n  BLOBNBOX_IT blob_it; // iterator\n  TBOX blob_box;\n  TBOX prev_blob_box;\n  int16_t gap_width;\n  int16_t start_of_row;\n  int16_t end_of_row;\n  STATS xht_stats(0, 127);\n  int16_t min_quantum;\n  int16_t max_quantum;\n  int16_t i;\n\n  /*\n  Find left and right extremes and bucket size\n*/\n  map = nullptr;\n  min_left = INT16_MAX;\n  max_right = -INT16_MAX;\n  total_rows = 0;\n  any_tabs = false;\n\n  // row iterator\n  TO_ROW_IT row_it(block->get_rows());\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    if (!row->blob_list()->empty()) {\n      total_rows++;\n      xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1);\n      blob_it.set_to_list(row->blob_list());\n      start_of_row = blob_it.data()->bounding_box().left();\n      end_of_row = blob_it.data_relative(-1)->bounding_box().right();\n      if (min_left > start_of_row) {\n        min_left = start_of_row;\n      }\n      if (max_right < end_of_row) {\n        max_right = end_of_row;\n      }\n    }\n  }\n  if ((total_rows < 3) || (min_left >= max_right)) {\n    bucket_size = 0;\n    map_max = 0;\n    total_rows = 0;\n    min_left = max_right = 0;\n    return;\n  }\n  bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2;\n  map_max = (max_right - min_left) / bucket_size;\n  map = new int16_t[map_max + 1];\n  for (i = 0; i <= map_max; i++) {\n    map[i] = 0;\n  }\n\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    if (!row->blob_list()->empty()) {\n      blob_it.set_to_list(row->blob_list());\n      blob_it.mark_cycle_pt();\n      blob_box = box_next(&blob_it);\n      prev_blob_box = blob_box;\n      if 
(gapmap_use_ends) {\n        /* Leading space */\n        gap_width = blob_box.left() - min_left;\n        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {\n          max_quantum = (blob_box.left() - min_left) / bucket_size;\n          if (max_quantum > map_max) {\n            max_quantum = map_max;\n          }\n          for (i = 0; i <= max_quantum; i++) {\n            map[i]++;\n          }\n        }\n      }\n      while (!blob_it.cycled_list()) {\n        blob_box = box_next(&blob_it);\n        gap_width = blob_box.left() - prev_blob_box.right();\n        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {\n          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;\n          max_quantum = (blob_box.left() - min_left) / bucket_size;\n          if (max_quantum > map_max) {\n            max_quantum = map_max;\n          }\n          for (i = min_quantum; i <= max_quantum; i++) {\n            map[i]++;\n          }\n        }\n        prev_blob_box = blob_box;\n      }\n      if (gapmap_use_ends) {\n        /* Trailing space */\n        gap_width = max_right - prev_blob_box.right();\n        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {\n          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;\n          if (min_quantum < 0) {\n            min_quantum = 0;\n          }\n          for (i = min_quantum; i <= map_max; i++) {\n            map[i]++;\n          }\n        }\n      }\n    }\n  }\n  for (i = 0; i <= map_max; i++) {\n    if (map[i] > total_rows / 2) {\n      if (gapmap_no_isolated_quanta &&\n          (((i == 0) && (map[i + 1] <= total_rows / 2)) ||\n           ((i == map_max) && (map[i - 1] <= total_rows / 2)) ||\n           ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) &&\n            (map[i + 1] <= total_rows / 2)))) {\n        map[i] = 0; // prevent isolated quantum\n      } else {\n        any_tabs = true;\n      }\n    }\n  }\n  if 
(gapmap_debug && any_tabs) {\n    tprintf(\"Table found\\n\");\n  }\n}\n\n/*************************************************************************\n * GAPMAP::table_gap()\n * Is there a bucket in the specified range where more than half the rows in the\n * block have a wide gap?\n *************************************************************************/\n\nbool GAPMAP::table_gap( // Is gap a table?\n    int16_t left,       // From here\n    int16_t right       // To here\n) {\n  int16_t min_quantum;\n  int16_t max_quantum;\n  int16_t i;\n  bool tab_found = false;\n\n  if (!any_tabs) {\n    return false;\n  }\n\n  min_quantum = (left - min_left) / bucket_size;\n  max_quantum = (right - min_left) / bucket_size;\n  // Clip to the bounds of the array. In some circumstances (big blob followed\n  // by small blob) max_quantum can exceed the map_max bounds, but we clip\n  // here instead, as it provides better long-term safety.\n  if (min_quantum < 0) {\n    min_quantum = 0;\n  }\n  if (max_quantum > map_max) {\n    max_quantum = map_max;\n  }\n  for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) {\n    if (map[i] > total_rows / 2) {\n      tab_found = true;\n    }\n  }\n  return tab_found;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/gap_map.h",
    "content": "// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef GAP_MAP_H\n#define GAP_MAP_H\n\n#include \"blobbox.h\"\n\nnamespace tesseract {\n\nclass GAPMAP {\npublic:\n  GAPMAP( // constructor\n      TO_BLOCK *block);\n\n  ~GAPMAP() { // destructor\n    delete[] map;\n  }\n\n  bool table_gap(     // Is gap a table?\n      int16_t left,   // From here\n      int16_t right); // To here\n\nprivate:\n  int16_t total_rows;  // in block\n  int16_t min_left;    // Left extreme\n  int16_t max_right;   // Right extreme\n  int16_t bucket_size; // half an x ht\n  int16_t *map;        // empty counts\n  int16_t map_max;     // map[0..max_map] defined\n  bool any_tabs;\n};\n\n/*-----------------------------*/\n\nextern BOOL_VAR_H(gapmap_debug);\nextern BOOL_VAR_H(gapmap_use_ends);\nextern BOOL_VAR_H(gapmap_no_isolated_quanta);\nextern double_VAR_H(gapmap_big_gaps);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/imagefind.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        imagefind.cpp\n// Description: Function to find image and drawing regions in an image\n//              and create a corresponding list of empty blobs.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"imagefind.h\"\n\n#include \"colpartitiongrid.h\"\n#include \"linlsq.h\"\n#include \"params.h\"\n#include \"statistc.h\"\n\n#include <allheaders.h>\n\n#include <algorithm>\n\nnamespace tesseract {\n\nstatic INT_VAR(textord_tabfind_show_images, false, \"Show image blobs\");\n\n// Fraction of width or height of on pixels that can be discarded from a\n// roughly rectangular image.\nconst double kMinRectangularFraction = 0.125;\n// Fraction of width or height to consider image completely used.\nconst double kMaxRectangularFraction = 0.75;\n// Fraction of width or height to allow transition from kMinRectangularFraction\n// to kMaxRectangularFraction, equivalent to a dy/dx skew.\nconst double kMaxRectangularGradient = 0.1; // About 6 degrees.\n// Minimum image size to be worth looking for images on.\nconst int kMinImageFindSize = 100;\n// Pixel padding for noise blobs and partitions when rendering on the image\n// mask to encourage them to join together. 
Make it too big and images\n// will fatten out too much and have to be clipped to text.\nconst int kNoisePadding = 4;\n\n// Scans horizontally on x=[x_start,x_end), starting with y=*y_start,\n// stepping y+=y_step, until y=y_end. *ystart is input/output.\n// If the number of black pixels in a row, pix_count fits this pattern:\n// 0 or more rows with pix_count < min_count then\n// <= mid_width rows with min_count <= pix_count <= max_count then\n// a row with pix_count > max_count then\n// true is returned, and *y_start = the first y with pix_count >= min_count.\nstatic bool HScanForEdge(uint32_t *data, int wpl, int x_start, int x_end, int min_count,\n                         int mid_width, int max_count, int y_end, int y_step, int *y_start) {\n  int mid_rows = 0;\n  for (int y = *y_start; y != y_end; y += y_step) {\n    // Need pixCountPixelsInRow(pix, y, &pix_count, nullptr) to count in a\n    // subset.\n    int pix_count = 0;\n    uint32_t *line = data + wpl * y;\n    for (int x = x_start; x < x_end; ++x) {\n      if (GET_DATA_BIT(line, x)) {\n        ++pix_count;\n      }\n    }\n    if (mid_rows == 0 && pix_count < min_count) {\n      continue; // In the min phase.\n    }\n    if (mid_rows == 0) {\n      *y_start = y; // Save the y_start where we came out of the min phase.\n    }\n    if (pix_count > max_count) {\n      return true; // Found the pattern.\n    }\n    ++mid_rows;\n    if (mid_rows > mid_width) {\n      break; // Middle too big.\n    }\n  }\n  return false; // Never found max_count.\n}\n\n// Scans vertically on y=[y_start,y_end), starting with x=*x_start,\n// stepping x+=x_step, until x=x_end. 
*x_start is input/output.\n// If the number of black pixels in a column, pix_count fits this pattern:\n// 0 or more cols with pix_count < min_count then\n// <= mid_width cols with min_count <= pix_count <= max_count then\n// a column with pix_count > max_count then\n// true is returned, and *x_start = the first x with pix_count >= min_count.\nstatic bool VScanForEdge(uint32_t *data, int wpl, int y_start, int y_end, int min_count,\n                         int mid_width, int max_count, int x_end, int x_step, int *x_start) {\n  int mid_cols = 0;\n  for (int x = *x_start; x != x_end; x += x_step) {\n    int pix_count = 0;\n    uint32_t *line = data + y_start * wpl;\n    for (int y = y_start; y < y_end; ++y, line += wpl) {\n      if (GET_DATA_BIT(line, x)) {\n        ++pix_count;\n      }\n    }\n    if (mid_cols == 0 && pix_count < min_count) {\n      continue; // In the min phase.\n    }\n    if (mid_cols == 0) {\n      *x_start = x; // Save the place where we came out of the min phase.\n    }\n    if (pix_count > max_count) {\n      return true; // found the pattern.\n    }\n    ++mid_cols;\n    if (mid_cols > mid_width) {\n      break; // Middle too big.\n    }\n  }\n  return false; // Never found max_count.\n}\n\n// Returns true if there is a rectangle in the source pix, such that all\n// pixel rows and column slices outside of it have less than\n// min_fraction of the pixels black, and within max_skew_gradient fraction\n// of the pixels on the inside, there are at least max_fraction of the\n// pixels black. 
In other words, the inside of the rectangle looks roughly\n// rectangular, and the outside of it looks like extra bits.\n// On return, the rectangle is defined by x_start, y_start, x_end and y_end.\n// Note: the algorithm is iterative, allowing it to slice off pixels from\n// one edge, allowing it to then slice off more pixels from another edge.\nstatic bool pixNearlyRectangular(Image pix, double min_fraction, double max_fraction,\n                                 double max_skew_gradient, int *x_start, int *y_start,\n                                 int *x_end, int *y_end) {\n  ASSERT_HOST(pix != nullptr);\n  *x_start = 0;\n  *x_end = pixGetWidth(pix);\n  *y_start = 0;\n  *y_end = pixGetHeight(pix);\n\n  uint32_t *data = pixGetData(pix);\n  int wpl = pixGetWpl(pix);\n  bool any_cut = false;\n  bool left_done = false;\n  bool right_done = false;\n  bool top_done = false;\n  bool bottom_done = false;\n  do {\n    any_cut = false;\n    // Find the top/bottom edges.\n    int width = *x_end - *x_start;\n    int min_count = static_cast<int>(width * min_fraction);\n    int max_count = static_cast<int>(width * max_fraction);\n    int edge_width = static_cast<int>(width * max_skew_gradient);\n    if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, max_count, *y_end, 1,\n                     y_start) &&\n        !top_done) {\n      top_done = true;\n      any_cut = true;\n    }\n    --(*y_end);\n    if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, max_count, *y_start, -1,\n                     y_end) &&\n        !bottom_done) {\n      bottom_done = true;\n      any_cut = true;\n    }\n    ++(*y_end);\n\n    // Find the left/right edges.\n    int height = *y_end - *y_start;\n    min_count = static_cast<int>(height * min_fraction);\n    max_count = static_cast<int>(height * max_fraction);\n    edge_width = static_cast<int>(height * max_skew_gradient);\n    if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, max_count, 
*x_end, 1,\n                     x_start) &&\n        !left_done) {\n      left_done = true;\n      any_cut = true;\n    }\n    --(*x_end);\n    if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, max_count, *x_start, -1,\n                     x_end) &&\n        !right_done) {\n      right_done = true;\n      any_cut = true;\n    }\n    ++(*x_end);\n  } while (any_cut);\n\n  // All edges must satisfy the condition of sharp gradient in pixel density\n  // in order for the full rectangle to be present.\n  return left_done && right_done && top_done && bottom_done;\n}\n\n// Generates a Boxa, Pixa pair from the input binary (image mask) pix,\n// analogous to pixConnComp, except that connected components which are nearly\n// rectangular are replaced with solid rectangles.\n// The returned boxa, pixa may be nullptr, meaning no images found.\n// If not nullptr, they must be destroyed by the caller.\n// Resolution of pix should match the source image (Tesseract::pix_binary_)\n// so the output coordinate systems match.\nstatic void ConnCompAndRectangularize(Image pix, DebugPixa *pixa_debug, Boxa **boxa,\n                                      Pixa **pixa) {\n  *boxa = nullptr;\n  *pixa = nullptr;\n\n  if (textord_tabfind_show_images && pixa_debug != nullptr) {\n    pixa_debug->AddPix(pix, \"Conncompimage\");\n  }\n  // Find the individual image regions in the mask image.\n  *boxa = pixConnComp(pix, pixa, 8);\n  // Rectangularize the individual images. 
If a sharp edge in vertical and/or\n  // horizontal occupancy can be found, it indicates a probably rectangular\n  // image with unwanted bits merged on, so clip to the approximate rectangle.\n  int npixes = 0;\n  if (*boxa != nullptr && *pixa != nullptr) {\n    npixes = pixaGetCount(*pixa);\n  }\n  for (int i = 0; i < npixes; ++i) {\n    int x_start, x_end, y_start, y_end;\n    Image img_pix = pixaGetPix(*pixa, i, L_CLONE);\n    if (textord_tabfind_show_images && pixa_debug != nullptr) {\n      pixa_debug->AddPix(img_pix, \"A component\");\n    }\n    if (pixNearlyRectangular(img_pix, kMinRectangularFraction, kMaxRectangularFraction,\n                             kMaxRectangularGradient, &x_start, &y_start, &x_end, &y_end)) {\n      Image simple_pix = pixCreate(x_end - x_start, y_end - y_start, 1);\n      pixSetAll(simple_pix);\n      img_pix.destroy();\n      // pixaReplacePix takes ownership of the simple_pix.\n      pixaReplacePix(*pixa, i, simple_pix, nullptr);\n      img_pix = pixaGetPix(*pixa, i, L_CLONE);\n      // Fix the box to match the new pix.\n      l_int32 x, y, width, height;\n      boxaGetBoxGeometry(*boxa, i, &x, &y, &width, &height);\n      Box *simple_box = boxCreate(x + x_start, y + y_start, x_end - x_start, y_end - y_start);\n      boxaReplaceBox(*boxa, i, simple_box);\n    }\n    img_pix.destroy();\n  }\n}\n\n// Finds image regions within the BINARY source pix (page image) and returns\n// the image regions as a mask image.\n// The returned pix may be nullptr, meaning no images found.\n// If not nullptr, it must be PixDestroyed by the caller.\n// If textord_tabfind_show_images, debug images are appended to pixa_debug.\nImage ImageFind::FindImages(Image pix, DebugPixa *pixa_debug) {\n  auto width = pixGetWidth(pix);\n  auto height = pixGetHeight(pix);\n  // Not worth looking at small images.\n  // Leptonica will print an error message and return nullptr if we call\n  // pixGenHalftoneMask(pixr, nullptr, ...) 
with width or height < 100\n  // for the reduced image, so we want to bypass that, too.\n  if (width / 2 < kMinImageFindSize || height / 2 < kMinImageFindSize) {\n    return pixCreate(width, height, 1);\n  }\n\n  // Reduce by factor 2.\n  Image pixr = pixReduceRankBinaryCascade(pix, 1, 0, 0, 0);\n  if (textord_tabfind_show_images && pixa_debug != nullptr) {\n    pixa_debug->AddPix(pixr, \"CascadeReduced\");\n  }\n\n  // Get the halftone mask directly from Leptonica.\n  l_int32 ht_found = 0;\n  Pixa *pixadb = (textord_tabfind_show_images && pixa_debug != nullptr) ? pixaCreate(0) : nullptr;\n  Image pixht2 = pixGenerateHalftoneMask(pixr, nullptr, &ht_found, pixadb);\n  if (pixadb) {\n    Image pixdb = pixaDisplayTiledInColumns(pixadb, 3, 1.0, 20, 2);\n    if (textord_tabfind_show_images && pixa_debug != nullptr) {\n      pixa_debug->AddPix(pixdb, \"HalftoneMask\");\n    }\n    pixdb.destroy();\n    pixaDestroy(&pixadb);\n  }\n  pixr.destroy();\n  if (!ht_found && pixht2 != nullptr) {\n    pixht2.destroy();\n  }\n  if (pixht2 == nullptr) {\n    return pixCreate(width, height, 1);\n  }\n\n  // Expand back up again.\n  Image pixht = pixExpandReplicate(pixht2, 2);\n  if (textord_tabfind_show_images && pixa_debug != nullptr) {\n    pixa_debug->AddPix(pixht, \"HalftoneReplicated\");\n  }\n  pixht2.destroy();\n\n  // Fill to capture pixels near the mask edges that were missed\n  Image pixt = pixSeedfillBinary(nullptr, pixht, pix, 8);\n  pixht |= pixt;\n  pixt.destroy();\n\n  // Eliminate lines and bars that may be joined to images.\n  Image pixfinemask = pixReduceRankBinaryCascade(pixht, 1, 1, 3, 3);\n  pixDilateBrick(pixfinemask, pixfinemask, 5, 5);\n  if (textord_tabfind_show_images && pixa_debug != nullptr) {\n    pixa_debug->AddPix(pixfinemask, \"FineMask\");\n  }\n  Image pixreduced = pixReduceRankBinaryCascade(pixht, 1, 1, 1, 1);\n  Image pixreduced2 = pixReduceRankBinaryCascade(pixreduced, 3, 3, 3, 0);\n  pixreduced.destroy();\n  pixDilateBrick(pixreduced2, 
pixreduced2, 5, 5);\n  Image pixcoarsemask = pixExpandReplicate(pixreduced2, 8);\n  pixreduced2.destroy();\n  if (textord_tabfind_show_images && pixa_debug != nullptr) {\n    pixa_debug->AddPix(pixcoarsemask, \"CoarseMask\");\n  }\n  // Combine the coarse and fine image masks.\n  pixcoarsemask &= pixfinemask;\n  pixfinemask.destroy();\n  // Dilate a bit to make sure we get everything.\n  pixDilateBrick(pixcoarsemask, pixcoarsemask, 3, 3);\n  Image pixmask = pixExpandReplicate(pixcoarsemask, 16);\n  pixcoarsemask.destroy();\n  if (textord_tabfind_show_images && pixa_debug != nullptr) {\n    pixa_debug->AddPix(pixmask, \"MaskDilated\");\n  }\n  // And the image mask with the line and bar remover.\n  pixht &= pixmask;\n  pixmask.destroy();\n  if (textord_tabfind_show_images && pixa_debug != nullptr) {\n    pixa_debug->AddPix(pixht, \"FinalMask\");\n  }\n  // Make the result image the same size as the input.\n  Image result = pixCreate(width, height, 1);\n  result |= pixht;\n  pixht.destroy();\n  return result;\n}\n\n// Given an input pix, and a bounding rectangle, the sides of the rectangle\n// are shrunk inwards until they bound any black pixels found within the\n// original rectangle. 
Returns false if the rectangle contains no black\n// pixels at all.\nbool ImageFind::BoundsWithinRect(Image pix, int *x_start, int *y_start, int *x_end, int *y_end) {\n  Box *input_box = boxCreate(*x_start, *y_start, *x_end - *x_start, *y_end - *y_start);\n  Box *output_box = nullptr;\n  pixClipBoxToForeground(pix, input_box, nullptr, &output_box);\n  bool result = output_box != nullptr;\n  if (result) {\n    l_int32 x, y, width, height;\n    boxGetGeometry(output_box, &x, &y, &width, &height);\n    *x_start = x;\n    *y_start = y;\n    *x_end = x + width;\n    *y_end = y + height;\n    boxDestroy(&output_box);\n  }\n  boxDestroy(&input_box);\n  return result;\n}\n\n// Given a point in 3-D (RGB) space, returns the squared Euclidean distance\n// of the point from the given line, defined by a pair of points in the 3-D\n// (RGB) space, line1 and line2.\ndouble ImageFind::ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2,\n                                        const uint8_t *point) {\n  int line_vector[kRGBRMSColors];\n  int point_vector[kRGBRMSColors];\n  for (int i = 0; i < kRGBRMSColors; ++i) {\n    line_vector[i] = static_cast<int>(line2[i]) - static_cast<int>(line1[i]);\n    point_vector[i] = static_cast<int>(point[i]) - static_cast<int>(line1[i]);\n  }\n  line_vector[L_ALPHA_CHANNEL] = 0;\n  // Now the cross product in 3d.\n  int cross[kRGBRMSColors];\n  cross[COLOR_RED] = line_vector[COLOR_GREEN] * point_vector[COLOR_BLUE] -\n                     line_vector[COLOR_BLUE] * point_vector[COLOR_GREEN];\n  cross[COLOR_GREEN] = line_vector[COLOR_BLUE] * point_vector[COLOR_RED] -\n                       line_vector[COLOR_RED] * point_vector[COLOR_BLUE];\n  cross[COLOR_BLUE] = line_vector[COLOR_RED] * point_vector[COLOR_GREEN] -\n                      line_vector[COLOR_GREEN] * point_vector[COLOR_RED];\n  cross[L_ALPHA_CHANNEL] = 0;\n  // Now the sums of the squares.\n  double cross_sq = 0.0;\n  double line_sq = 0.0;\n  for (int j = 0; j < 
kRGBRMSColors; ++j) {\n    cross_sq += static_cast<double>(cross[j]) * cross[j];\n    line_sq += static_cast<double>(line_vector[j]) * line_vector[j];\n  }\n  if (line_sq == 0.0) {\n    return 0.0;\n  }\n  return cross_sq / line_sq; // This is the squared distance.\n}\n\n// ================ CUTTING POLYGONAL IMAGES FROM A RECTANGLE ================\n// The following functions are responsible for cutting a polygonal image from\n// a rectangle: CountPixelsInRotatedBox, AttemptToShrinkBox, CutChunkFromParts\n// with DivideImageIntoParts as the master.\n// Problem statement:\n// We start with a single connected component from the image mask: we get\n// a Pix of the component, and its location on the page (im_box).\n// The objective of cutting a polygonal image from its rectangle is to avoid\n// interfering text, but not text that completely overlaps the image.\n//     ------------------------------      ------------------------------\n//     |   Single input partition   |      | 1 Cut up output partitions |\n//     |                            |      ------------------------------\n//   Av|oid                         |    Avoid |                        |\n//     |                            |          |________________________|\n//  Int|erfering                    |   Interfering  |                  |\n//     |                            |           _____|__________________|\n//    T|ext                         |     Text |                        |\n//     |        Text-on-image       |          |     Text-on-image      |\n//     ------------------------------          --------------------------\n// DivideImageIntoParts does this by building a ColPartition_LIST (not in the\n// grid) with each ColPartition representing one of the rectangles needed,\n// starting with a single rectangle for the whole image component, and cutting\n// bits out of it with CutChunkFromParts as needed to avoid text. 
The output\n// ColPartitions are supposed to be ordered from top to bottom.\n\n// The problem is complicated by the fact that we have rotated the coordinate\n// system to make text lines horizontal, so if we need to look at the component\n// image, we have to rotate the coordinates. Throughout the functions in this\n// section im_box is the rectangle representing the image component in the\n// rotated page coordinates (where we are building our output ColPartitions),\n// rotation is the rotation that we used to get there, and rerotation is the\n// rotation required to get back to original page image coordinates.\n// To get to coordinates in the component image, pix, we rotate the im_box,\n// the point we want to locate, and subtract the rotated point from the top-left\n// of the rotated im_box.\n// im_box is therefore essential to calculating coordinates within the pix.\n\n// Returns true if there are no black pixels in between the boxes.\n// The im_box must represent the bounding box of the pix in tesseract\n// coordinates, which may be negative, due to rotations to make the textlines\n// horizontal. 
The boxes are rotated by rotation, which should undo such\n// rotations, before mapping them onto the pix.\nbool ImageFind::BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box,\n                                    const FCOORD &rotation, Image pix) {\n  TBOX search_box(box1);\n  search_box += box2;\n  if (box1.x_gap(box2) >= box1.y_gap(box2)) {\n    if (box1.x_gap(box2) <= 0) {\n      return true;\n    }\n    search_box.set_left(std::min(box1.right(), box2.right()));\n    search_box.set_right(std::max(box1.left(), box2.left()));\n  } else {\n    if (box1.y_gap(box2) <= 0) {\n      return true;\n    }\n    search_box.set_top(std::max(box1.bottom(), box2.bottom()));\n    search_box.set_bottom(std::min(box1.top(), box2.top()));\n  }\n  return CountPixelsInRotatedBox(search_box, im_box, rotation, pix) == 0;\n}\n\n// Returns the number of pixels in box in the pix.\n// rotation, pix and im_box are defined in the large comment above.\nint ImageFind::CountPixelsInRotatedBox(TBOX box, const TBOX &im_box, const FCOORD &rotation,\n                                       Image pix) {\n  // Intersect it with the image box.\n  box &= im_box; // This is in-place box intersection.\n  if (box.null_box()) {\n    return 0;\n  }\n  box.rotate(rotation);\n  TBOX rotated_im_box(im_box);\n  rotated_im_box.rotate(rotation);\n  Image rect_pix = pixCreate(box.width(), box.height(), 1);\n  pixRasterop(rect_pix, 0, 0, box.width(), box.height(), PIX_SRC, pix,\n              box.left() - rotated_im_box.left(), rotated_im_box.top() - box.top());\n  l_int32 result;\n  pixCountPixels(rect_pix, &result, nullptr);\n  rect_pix.destroy();\n  return result;\n}\n\n// The box given by slice contains some black pixels, but not necessarily\n// over the whole box. 
Shrink the x bounds of slice, but not the y bounds\n// until there is at least one black pixel in the outermost columns.\n// rotation, rerotation, pix and im_box are defined in the large comment above.\nstatic void AttemptToShrinkBox(const FCOORD &rotation, const FCOORD &rerotation, const TBOX &im_box,\n                               Image pix, TBOX *slice) {\n  TBOX rotated_box(*slice);\n  rotated_box.rotate(rerotation);\n  TBOX rotated_im_box(im_box);\n  rotated_im_box.rotate(rerotation);\n  int left = rotated_box.left() - rotated_im_box.left();\n  int right = rotated_box.right() - rotated_im_box.left();\n  int top = rotated_im_box.top() - rotated_box.top();\n  int bottom = rotated_im_box.top() - rotated_box.bottom();\n  ImageFind::BoundsWithinRect(pix, &left, &top, &right, &bottom);\n  top = rotated_im_box.top() - top;\n  bottom = rotated_im_box.top() - bottom;\n  left += rotated_im_box.left();\n  right += rotated_im_box.left();\n  rotated_box.set_to_given_coords(left, bottom, right, top);\n  rotated_box.rotate(rotation);\n  slice->set_left(rotated_box.left());\n  slice->set_right(rotated_box.right());\n}\n\n// The meat of cutting a polygonal image around text.\n// This function covers the general case of cutting a box out of a box\n// as shown:\n// Input                               Output\n// ------------------------------      ------------------------------\n// |   Single input partition   |      | 1 Cut up output partitions |\n// |                            |      ------------------------------\n// |         ----------         |      ---------           ----------\n// |         |  box   |         |      |   2   |   box     |    3   |\n// |         |        |         |      |       |  is cut   |        |\n// |         ----------         |      ---------   out     ----------\n// |                            |      ------------------------------\n// |                            |      |   4                        |\n// ------------------------------      
------------------------------\n// In the context that this function is used, at most 3 of the above output\n// boxes will be created, as the overlapping box is never contained by the\n// input.\n// The above cutting operation is executed for each element of part_list that\n// is overlapped by the input box. Each modified ColPartition is replaced\n// in place in the list by the output of the cutting operation in the order\n// shown above, so iff no holes are ever created, the output will be in\n// top-to-bottom order, but in extreme cases, hole creation is possible.\n// In such cases, the output order may cause strange block polygons.\n// rotation, rerotation, pix and im_box are defined in the large comment above.\nstatic void CutChunkFromParts(const TBOX &box, const TBOX &im_box, const FCOORD &rotation,\n                              const FCOORD &rerotation, Image pix, ColPartition_LIST *part_list) {\n  ASSERT_HOST(!part_list->empty());\n  ColPartition_IT part_it(part_list);\n  do {\n    ColPartition *part = part_it.data();\n    TBOX part_box = part->bounding_box();\n    if (part_box.overlap(box)) {\n      // This part must be cut and replaced with the remains. There are\n      // up to 4 pieces to be made. Start with the first one and use\n      // add_before_stay_put. 
For each piece if it has no black pixels\n      // left, just don't make the box.\n      // Above box.\n      if (box.top() < part_box.top()) {\n        TBOX slice(part_box);\n        slice.set_bottom(box.top());\n        if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {\n          AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);\n          part_it.add_before_stay_put(\n              ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT));\n        }\n      }\n      // Left of box.\n      if (box.left() > part_box.left()) {\n        TBOX slice(part_box);\n        slice.set_right(box.left());\n        if (box.top() < part_box.top()) {\n          slice.set_top(box.top());\n        }\n        if (box.bottom() > part_box.bottom()) {\n          slice.set_bottom(box.bottom());\n        }\n        if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {\n          AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);\n          part_it.add_before_stay_put(\n              ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT));\n        }\n      }\n      // Right of box.\n      if (box.right() < part_box.right()) {\n        TBOX slice(part_box);\n        slice.set_left(box.right());\n        if (box.top() < part_box.top()) {\n          slice.set_top(box.top());\n        }\n        if (box.bottom() > part_box.bottom()) {\n          slice.set_bottom(box.bottom());\n        }\n        if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {\n          AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);\n          part_it.add_before_stay_put(\n              ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT));\n        }\n      }\n      // Below box.\n      if (box.bottom() > part_box.bottom()) {\n        TBOX slice(part_box);\n        slice.set_top(box.bottom());\n        if 
(ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {\n          AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);\n          part_it.add_before_stay_put(\n              ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT));\n        }\n      }\n      part->DeleteBoxes();\n      delete part_it.extract();\n    }\n    part_it.forward();\n  } while (!part_it.at_first());\n}\n\n// Starts with the bounding box of the image component and cuts it up\n// so that it doesn't intersect text where possible.\n// Strong fully contained horizontal text is marked as text on image,\n// and does not cause a division of the image.\n// For more detail see the large comment above on cutting polygonal images\n// from a rectangle.\n// rotation, rerotation, pix and im_box are defined in the large comment above.\nstatic void DivideImageIntoParts(const TBOX &im_box, const FCOORD &rotation,\n                                 const FCOORD &rerotation, Image pix,\n                                 ColPartitionGridSearch *rectsearch, ColPartition_LIST *part_list) {\n  // Add the full im_box partition to the list to begin with.\n  ColPartition *pix_part =\n      ColPartition::FakePartition(im_box, PT_UNKNOWN, BRT_RECTIMAGE, BTFT_NONTEXT);\n  ColPartition_IT part_it(part_list);\n  part_it.add_after_then_move(pix_part);\n\n  rectsearch->StartRectSearch(im_box);\n  ColPartition *part;\n  while ((part = rectsearch->NextRectSearch()) != nullptr) {\n    TBOX part_box = part->bounding_box();\n    if (part_box.contains(im_box) && part->flow() >= BTFT_CHAIN) {\n      // This image is completely covered by an existing text partition.\n      for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {\n        ColPartition *pix_part = part_it.extract();\n        pix_part->DeleteBoxes();\n        delete pix_part;\n      }\n    } else if (part->flow() == BTFT_STRONG_CHAIN) {\n      // Text intersects the box.\n      TBOX overlap_box = 
part_box.intersection(im_box);\n      // Intersect it with the image box.\n      int black_area = ImageFind::CountPixelsInRotatedBox(overlap_box, im_box, rerotation, pix);\n      if (black_area * 2 < part_box.area() || !im_box.contains(part_box)) {\n        // Eat a piece out of the image.\n        // Pad it so that pieces eaten out look decent.\n        int padding = part->blob_type() == BRT_VERT_TEXT ? part_box.width() : part_box.height();\n        part_box.set_top(part_box.top() + padding / 2);\n        part_box.set_bottom(part_box.bottom() - padding / 2);\n        CutChunkFromParts(part_box, im_box, rotation, rerotation, pix, part_list);\n      } else {\n        // Strong overlap with the black area, so call it text on image.\n        part->set_flow(BTFT_TEXT_ON_IMAGE);\n      }\n    }\n    if (part_list->empty()) {\n      break;\n    }\n  }\n}\n\n// Search for the rightmost text that overlaps vertically and is to the left\n// of the given box, but within the given left limit.\nstatic int ExpandImageLeft(const TBOX &box, int left_limit, ColPartitionGrid *part_grid) {\n  ColPartitionGridSearch search(part_grid);\n  ColPartition *part;\n  // Search right to left for any text that overlaps.\n  search.StartSideSearch(box.left(), box.bottom(), box.top());\n  while ((part = search.NextSideSearch(true)) != nullptr) {\n    if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {\n      const TBOX &part_box(part->bounding_box());\n      if (part_box.y_gap(box) < 0) {\n        if (part_box.right() > left_limit && part_box.right() < box.left()) {\n          left_limit = part_box.right();\n        }\n        break;\n      }\n    }\n  }\n  if (part != nullptr) {\n    // Search for the nearest text up to the one we already found.\n    TBOX search_box(left_limit, box.bottom(), box.left(), box.top());\n    search.StartRectSearch(search_box);\n    while ((part = search.NextRectSearch()) != nullptr) {\n      if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == 
BTFT_CHAIN) {\n        const TBOX &part_box(part->bounding_box());\n        if (part_box.y_gap(box) < 0) {\n          if (part_box.right() > left_limit && part_box.right() < box.left()) {\n            left_limit = part_box.right();\n          }\n        }\n      }\n    }\n  }\n  return left_limit;\n}\n\n// Search for the leftmost text that overlaps vertically and is to the right\n// of the given box, but within the given right limit.\nstatic int ExpandImageRight(const TBOX &box, int right_limit, ColPartitionGrid *part_grid) {\n  ColPartitionGridSearch search(part_grid);\n  ColPartition *part;\n  // Search left to right for any text that overlaps.\n  search.StartSideSearch(box.right(), box.bottom(), box.top());\n  while ((part = search.NextSideSearch(false)) != nullptr) {\n    if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {\n      const TBOX &part_box(part->bounding_box());\n      if (part_box.y_gap(box) < 0) {\n        if (part_box.left() < right_limit && part_box.left() > box.right()) {\n          right_limit = part_box.left();\n        }\n        break;\n      }\n    }\n  }\n  if (part != nullptr) {\n    // Search for the nearest text up to the one we already found.\n    TBOX search_box(box.left(), box.bottom(), right_limit, box.top());\n    search.StartRectSearch(search_box);\n    while ((part = search.NextRectSearch()) != nullptr) {\n      if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {\n        const TBOX &part_box(part->bounding_box());\n        if (part_box.y_gap(box) < 0) {\n          if (part_box.left() < right_limit && part_box.left() > box.right()) {\n            right_limit = part_box.left();\n          }\n        }\n      }\n    }\n  }\n  return right_limit;\n}\n\n// Search for the topmost text that overlaps horizontally and is below\n// the given box, but within the given bottom limit.\nstatic int ExpandImageBottom(const TBOX &box, int bottom_limit, ColPartitionGrid *part_grid) {\n  ColPartitionGridSearch 
search(part_grid);\n  ColPartition *part;\n  // Search right to left for any text that overlaps.\n  search.StartVerticalSearch(box.left(), box.right(), box.bottom());\n  while ((part = search.NextVerticalSearch(true)) != nullptr) {\n    if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {\n      const TBOX &part_box(part->bounding_box());\n      if (part_box.x_gap(box) < 0) {\n        if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) {\n          bottom_limit = part_box.top();\n        }\n        break;\n      }\n    }\n  }\n  if (part != nullptr) {\n    // Search for the nearest text up to the one we already found.\n    TBOX search_box(box.left(), bottom_limit, box.right(), box.bottom());\n    search.StartRectSearch(search_box);\n    while ((part = search.NextRectSearch()) != nullptr) {\n      if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {\n        const TBOX &part_box(part->bounding_box());\n        if (part_box.x_gap(box) < 0) {\n          if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) {\n            bottom_limit = part_box.top();\n          }\n        }\n      }\n    }\n  }\n  return bottom_limit;\n}\n\n// Search for the bottommost text that overlaps horizontally and is above\n// the given box, but within the given top limit.\nstatic int ExpandImageTop(const TBOX &box, int top_limit, ColPartitionGrid *part_grid) {\n  ColPartitionGridSearch search(part_grid);\n  ColPartition *part;\n  // Search right to left for any text that overlaps.\n  search.StartVerticalSearch(box.left(), box.right(), box.top());\n  while ((part = search.NextVerticalSearch(false)) != nullptr) {\n    if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {\n      const TBOX &part_box(part->bounding_box());\n      if (part_box.x_gap(box) < 0) {\n        if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) {\n          top_limit = part_box.bottom();\n        }\n        break;\n      
}\n    }\n  }\n  if (part != nullptr) {\n    // Search for the nearest text up to the one we already found.\n    TBOX search_box(box.left(), box.top(), box.right(), top_limit);\n    search.StartRectSearch(search_box);\n    while ((part = search.NextRectSearch()) != nullptr) {\n      if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {\n        const TBOX &part_box(part->bounding_box());\n        if (part_box.x_gap(box) < 0) {\n          if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) {\n            top_limit = part_box.bottom();\n          }\n        }\n      }\n    }\n  }\n  return top_limit;\n}\n\n// Expands the image box in the given direction until it hits text,\n// limiting the expansion to the given limit box, returning the result\n// in the expanded box, and\n// returning the increase in area resulting from the expansion.\nstatic int ExpandImageDir(BlobNeighbourDir dir, const TBOX &im_box, const TBOX &limit_box,\n                          ColPartitionGrid *part_grid, TBOX *expanded_box) {\n  *expanded_box = im_box;\n  switch (dir) {\n    case BND_LEFT:\n      expanded_box->set_left(ExpandImageLeft(im_box, limit_box.left(), part_grid));\n      break;\n    case BND_RIGHT:\n      expanded_box->set_right(ExpandImageRight(im_box, limit_box.right(), part_grid));\n      break;\n    case BND_ABOVE:\n      expanded_box->set_top(ExpandImageTop(im_box, limit_box.top(), part_grid));\n      break;\n    case BND_BELOW:\n      expanded_box->set_bottom(ExpandImageBottom(im_box, limit_box.bottom(), part_grid));\n      break;\n    default:\n      return 0;\n  }\n  return expanded_box->area() - im_box.area();\n}\n\n// Expands the image partition into any non-text until it touches text.\n// The expansion proceeds in the order of increasing increase in area\n// as a heuristic to find the best rectangle by expanding in the most\n// constrained direction first.\nstatic void MaximalImageBoundingBox(ColPartitionGrid *part_grid, TBOX *im_box) {\n 
 bool dunnit[BND_COUNT];\n  memset(dunnit, 0, sizeof(dunnit));\n  TBOX limit_box(part_grid->bleft().x(), part_grid->bleft().y(), part_grid->tright().x(),\n                 part_grid->tright().y());\n  TBOX text_box(*im_box);\n  for (int iteration = 0; iteration < BND_COUNT; ++iteration) {\n    // Find the direction with least area increase.\n    int best_delta = -1;\n    BlobNeighbourDir best_dir = BND_LEFT;\n    TBOX expanded_boxes[BND_COUNT];\n    for (int dir = 0; dir < BND_COUNT; ++dir) {\n      auto bnd = static_cast<BlobNeighbourDir>(dir);\n      if (!dunnit[bnd]) {\n        TBOX expanded_box;\n        int area_delta = ExpandImageDir(bnd, text_box, limit_box, part_grid, &expanded_boxes[bnd]);\n        if (best_delta < 0 || area_delta < best_delta) {\n          best_delta = area_delta;\n          best_dir = bnd;\n        }\n      }\n    }\n    // Run the best and remember the direction.\n    dunnit[best_dir] = true;\n    text_box = expanded_boxes[best_dir];\n  }\n  *im_box = text_box;\n}\n\n// Helper deletes the given partition but first marks up all the blobs as\n// noise, so they get deleted later, and disowns them.\n// If the initial type of the partition is image, then it actually deletes\n// the blobs, as the partition owns them in that case.\nstatic void DeletePartition(ColPartition *part) {\n  BlobRegionType type = part->blob_type();\n  if (type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) {\n    // The partition owns the boxes of these types, so just delete them.\n    part->DeleteBoxes(); // From a previous iteration.\n  } else {\n    // Once marked, the blobs will be swept up by TidyBlobs.\n    part->set_flow(BTFT_NONTEXT);\n    part->set_blob_type(BRT_NOISE);\n    part->SetBlobTypes();\n    part->DisownBoxes(); // Created before FindImagePartitions.\n  }\n  delete part;\n}\n\n// The meat of joining fragmented images and consuming ColPartitions of\n// uncertain type.\n// *part_ptr is an input/output BRT_RECTIMAGE ColPartition that is to be\n// expanded 
to consume overlapping and nearby ColPartitions of uncertain type\n// and other BRT_RECTIMAGE partitions, but NOT to be expanded beyond\n// max_image_box. *part_ptr is NOT in the part_grid.\n// rectsearch is already constructed on the part_grid, and is used for\n// searching for overlapping and nearby ColPartitions.\n// ExpandImageIntoParts is called iteratively until it returns false. Each\n// time it absorbs the nearest non-contained candidate, and everything that\n// is fully contained within part_ptr's bounding box.\n// TODO(rays) what if it just eats everything inside max_image_box in one go?\nstatic bool ExpandImageIntoParts(const TBOX &max_image_box, ColPartitionGridSearch *rectsearch,\n                                 ColPartitionGrid *part_grid, ColPartition **part_ptr) {\n  ColPartition *image_part = *part_ptr;\n  TBOX im_part_box = image_part->bounding_box();\n  if (textord_tabfind_show_images > 1) {\n    tprintf(\"Searching for merge with image part:\");\n    im_part_box.print();\n    tprintf(\"Text box=\");\n    max_image_box.print();\n  }\n  rectsearch->StartRectSearch(max_image_box);\n  ColPartition *part;\n  ColPartition *best_part = nullptr;\n  int best_dist = 0;\n  while ((part = rectsearch->NextRectSearch()) != nullptr) {\n    if (textord_tabfind_show_images > 1) {\n      tprintf(\"Considering merge with part:\");\n      part->Print();\n      if (im_part_box.contains(part->bounding_box())) {\n        tprintf(\"Fully contained\\n\");\n      } else if (!max_image_box.contains(part->bounding_box())) {\n        tprintf(\"Not within text box\\n\");\n      } else if (part->flow() == BTFT_STRONG_CHAIN) {\n        tprintf(\"Too strong text\\n\");\n      } else {\n        tprintf(\"Real candidate\\n\");\n      }\n    }\n    if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_TEXT_ON_IMAGE ||\n        part->blob_type() == BRT_POLYIMAGE) {\n      continue;\n    }\n    TBOX box = part->bounding_box();\n    if (max_image_box.contains(box) && 
part->blob_type() != BRT_NOISE) {\n      if (im_part_box.contains(box)) {\n        // Eat it completely.\n        rectsearch->RemoveBBox();\n        DeletePartition(part);\n        continue;\n      }\n      int x_dist = std::max(0, box.x_gap(im_part_box));\n      int y_dist = std::max(0, box.y_gap(im_part_box));\n      int dist = x_dist * x_dist + y_dist * y_dist;\n      if (dist > box.area() || dist > im_part_box.area()) {\n        continue; // Not close enough.\n      }\n      if (best_part == nullptr || dist < best_dist) {\n        // We keep the nearest qualifier, which is not necessarily the nearest.\n        best_part = part;\n        best_dist = dist;\n      }\n    }\n  }\n  if (best_part != nullptr) {\n    // It needs expanding. We can do it without touching text.\n    TBOX box = best_part->bounding_box();\n    if (textord_tabfind_show_images > 1) {\n      tprintf(\"Merging image part:\");\n      im_part_box.print();\n      tprintf(\"with part:\");\n      box.print();\n    }\n    im_part_box += box;\n    *part_ptr = ColPartition::FakePartition(im_part_box, PT_UNKNOWN, BRT_RECTIMAGE, BTFT_NONTEXT);\n    DeletePartition(image_part);\n    part_grid->RemoveBBox(best_part);\n    DeletePartition(best_part);\n    rectsearch->RepositionIterator();\n    return true;\n  }\n  return false;\n}\n\n// Helper function to compute the overlap area between the box and the\n// given list of partitions.\nstatic int IntersectArea(const TBOX &box, ColPartition_LIST *part_list) {\n  int intersect_area = 0;\n  ColPartition_IT part_it(part_list);\n  // Iterate the parts and subtract intersecting area.\n  for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {\n    ColPartition *image_part = part_it.data();\n    TBOX intersect = box.intersection(image_part->bounding_box());\n    intersect_area += intersect.area();\n  }\n  return intersect_area;\n}\n\n// part_list is a set of ColPartitions representing a polygonal image, and\n// im_box is the union of the bounding 
boxes of all the parts in part_list.\n// Tests whether part is to be consumed by the polygonal image.\n// Returns true if part is weak text and more than half of its area is\n// intersected by parts from the part_list, and it is contained within im_box.\nstatic bool TestWeakIntersectedPart(const TBOX &im_box, ColPartition_LIST *part_list,\n                                    ColPartition *part) {\n  if (part->flow() < BTFT_STRONG_CHAIN) {\n    // A weak partition intersects the box.\n    const TBOX &part_box = part->bounding_box();\n    if (im_box.contains(part_box)) {\n      int area = part_box.area();\n      int intersect_area = IntersectArea(part_box, part_list);\n      if (area < 2 * intersect_area) {\n        return true;\n      }\n    }\n  }\n  return false;\n}\n\n// A rectangular or polygonal image has been completed, in part_list, bounding\n// box in im_box. We want to eliminate weak text or other uncertain partitions\n// (basically anything that is not BRT_STRONG_CHAIN or better) from both the\n// part_grid and the big_parts list that are contained within im_box and\n// overlapped enough by the possibly polygonal image.\nstatic void EliminateWeakParts(const TBOX &im_box, ColPartitionGrid *part_grid,\n                               ColPartition_LIST *big_parts, ColPartition_LIST *part_list) {\n  ColPartitionGridSearch rectsearch(part_grid);\n  ColPartition *part;\n  rectsearch.StartRectSearch(im_box);\n  while ((part = rectsearch.NextRectSearch()) != nullptr) {\n    if (TestWeakIntersectedPart(im_box, part_list, part)) {\n      BlobRegionType type = part->blob_type();\n      if (type == BRT_POLYIMAGE || type == BRT_RECTIMAGE) {\n        rectsearch.RemoveBBox();\n        DeletePartition(part);\n      } else {\n        // The part is mostly covered, so mark it. 
Non-image partitions are\n        // kept hanging around to mark the image for pass2\n        part->set_flow(BTFT_NONTEXT);\n        part->set_blob_type(BRT_NOISE);\n        part->SetBlobTypes();\n      }\n    }\n  }\n  ColPartition_IT big_it(big_parts);\n  for (big_it.mark_cycle_pt(); !big_it.cycled_list(); big_it.forward()) {\n    part = big_it.data();\n    if (TestWeakIntersectedPart(im_box, part_list, part)) {\n      // Once marked, the blobs will be swept up by TidyBlobs.\n      DeletePartition(big_it.extract());\n    }\n  }\n}\n\n// Helper scans for good text partitions overlapping the given box.\n// If there are no good text partitions overlapping an expanded box, then\n// the box is expanded, otherwise, the original box is returned.\n// If good text overlaps the box, true is returned.\nstatic bool ScanForOverlappingText(ColPartitionGrid *part_grid, TBOX *box) {\n  ColPartitionGridSearch rectsearch(part_grid);\n  TBOX padded_box(*box);\n  padded_box.pad(kNoisePadding, kNoisePadding);\n  rectsearch.StartRectSearch(padded_box);\n  ColPartition *part;\n  bool any_text_in_padded_rect = false;\n  while ((part = rectsearch.NextRectSearch()) != nullptr) {\n    if (part->flow() == BTFT_CHAIN || part->flow() == BTFT_STRONG_CHAIN) {\n      // Text intersects the box.\n      any_text_in_padded_rect = true;\n      const TBOX &part_box = part->bounding_box();\n      if (box->overlap(part_box)) {\n        return true;\n      }\n    }\n  }\n  if (!any_text_in_padded_rect) {\n    *box = padded_box;\n  }\n  return false;\n}\n\n// Renders the boxes of image parts from the supplied list onto the image_pix,\n// except where they interfere with existing strong text in the part_grid,\n// and then deletes them.\n// Box coordinates are rotated by rerotate to match the image.\nstatic void MarkAndDeleteImageParts(const FCOORD &rerotate, ColPartitionGrid *part_grid,\n                                    ColPartition_LIST *image_parts, Image image_pix) {\n  if (image_pix == nullptr) {\n 
   return;\n  }\n  int imageheight = pixGetHeight(image_pix);\n  ColPartition_IT part_it(image_parts);\n  for (; !part_it.empty(); part_it.forward()) {\n    ColPartition *part = part_it.extract();\n    TBOX part_box = part->bounding_box();\n    BlobRegionType type = part->blob_type();\n    if (!ScanForOverlappingText(part_grid, &part_box) || type == BRT_RECTIMAGE ||\n        type == BRT_POLYIMAGE) {\n      // Mark the box on the image.\n      // All coords need to be rotated to match the image.\n      part_box.rotate(rerotate);\n      int left = part_box.left();\n      int top = part_box.top();\n      pixRasterop(image_pix, left, imageheight - top, part_box.width(), part_box.height(), PIX_SET,\n                  nullptr, 0, 0);\n    }\n    DeletePartition(part);\n  }\n}\n\n// Locates all the image partitions in the part_grid, that were found by a\n// previous call to FindImagePartitions, marks them in the image_mask,\n// removes them from the grid, and deletes them. This makes it possible to\n// call FindImagePartitions again to produce less broken-up and less\n// overlapping image partitions.\n// rerotation specifies how to rotate the partition coords to match\n// the image_mask, since this function is used after orientation correction.\nvoid ImageFind::TransferImagePartsToImageMask(const FCOORD &rerotation, ColPartitionGrid *part_grid,\n                                              Image image_mask) {\n  // Extract the noise parts from the grid and put them on a temporary list.\n  ColPartition_LIST parts_list;\n  ColPartition_IT part_it(&parts_list);\n  ColPartitionGridSearch gsearch(part_grid);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    BlobRegionType type = part->blob_type();\n    if (type == BRT_NOISE || type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) {\n      part_it.add_after_then_move(part);\n      gsearch.RemoveBBox();\n    }\n  }\n  // Render listed noise partitions to the image 
mask.\n  MarkAndDeleteImageParts(rerotation, part_grid, &parts_list, image_mask);\n}\n\n// Removes and deletes all image partitions that are too small to be worth\n// keeping. We have to do this as a separate phase after creating the image\n// partitions as the small images are needed to join the larger ones together.\nstatic void DeleteSmallImages(ColPartitionGrid *part_grid) {\n  if (part_grid != nullptr) {\n    return;\n  }\n  ColPartitionGridSearch gsearch(part_grid);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    // Only delete rectangular images, since if it became a poly image, it\n    // is more evidence that it is somehow important.\n    if (part->blob_type() == BRT_RECTIMAGE) {\n      const TBOX &part_box = part->bounding_box();\n      if (part_box.width() < kMinImageFindSize || part_box.height() < kMinImageFindSize) {\n        // It is too small to keep. Just make it disappear.\n        gsearch.RemoveBBox();\n        DeletePartition(part);\n      }\n    }\n  }\n}\n\n// Runs a CC analysis on the image_pix mask image, and creates\n// image partitions from them, cutting out strong text, and merging with\n// nearby image regions such that they don't interfere with text.\n// Rotation and rerotation specify how to rotate image coords to match\n// the blob and partition coords and back again.\n// The input/output part_grid owns all the created partitions, and\n// the partitions own all the fake blobs that belong in the partitions.\n// Since the other blobs in the other partitions will be owned by the block,\n// ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this\n// situation and collect the image blobs.\nvoid ImageFind::FindImagePartitions(Image image_pix, const FCOORD &rotation,\n                                    const FCOORD &rerotation, TO_BLOCK *block, TabFind *tab_grid,\n                                    DebugPixa *pixa_debug, ColPartitionGrid *part_grid,\n            
                        ColPartition_LIST *big_parts) {\n  int imageheight = pixGetHeight(image_pix);\n  Boxa *boxa;\n  Pixa *pixa;\n  ConnCompAndRectangularize(image_pix, pixa_debug, &boxa, &pixa);\n  // Iterate the connected components in the image regions mask.\n  int nboxes = 0;\n  if (boxa != nullptr && pixa != nullptr) {\n    nboxes = boxaGetCount(boxa);\n  }\n  for (int i = 0; i < nboxes; ++i) {\n    l_int32 x, y, width, height;\n    boxaGetBoxGeometry(boxa, i, &x, &y, &width, &height);\n    Image pix = pixaGetPix(pixa, i, L_CLONE);\n    TBOX im_box(x, imageheight - y - height, x + width, imageheight - y);\n    im_box.rotate(rotation); // Now matches all partitions and blobs.\n    ColPartitionGridSearch rectsearch(part_grid);\n    rectsearch.SetUniqueMode(true);\n    ColPartition_LIST part_list;\n    DivideImageIntoParts(im_box, rotation, rerotation, pix, &rectsearch, &part_list);\n    if (textord_tabfind_show_images && pixa_debug != nullptr) {\n      pixa_debug->AddPix(pix, \"ImageComponent\");\n      tprintf(\"Component has %d parts\\n\", part_list.length());\n    }\n    pix.destroy();\n    if (!part_list.empty()) {\n      ColPartition_IT part_it(&part_list);\n      if (part_list.singleton()) {\n        // We didn't have to chop it into a polygon to fit around text, so\n        // try expanding it to merge fragmented image parts, as long as it\n        // doesn't touch strong text.\n        ColPartition *part = part_it.extract();\n        TBOX text_box(im_box);\n        MaximalImageBoundingBox(part_grid, &text_box);\n        while (ExpandImageIntoParts(text_box, &rectsearch, part_grid, &part)) {\n          ;\n        }\n        part_it.set_to_list(&part_list);\n        part_it.add_after_then_move(part);\n        im_box = part->bounding_box();\n      }\n      EliminateWeakParts(im_box, part_grid, big_parts, &part_list);\n      // Iterate the part_list and put the parts into the grid.\n      for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) 
{\n        ColPartition *image_part = part_it.extract();\n        im_box = image_part->bounding_box();\n        part_grid->InsertBBox(true, true, image_part);\n        if (!part_it.at_last()) {\n          ColPartition *neighbour = part_it.data_relative(1);\n          image_part->AddPartner(false, neighbour);\n          neighbour->AddPartner(true, image_part);\n        }\n      }\n    }\n  }\n  boxaDestroy(&boxa);\n  pixaDestroy(&pixa);\n  DeleteSmallImages(part_grid);\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_images) {\n    ScrollView *images_win_ = part_grid->MakeWindow(1000, 400, \"With Images\");\n    part_grid->DisplayBoxes(images_win_);\n  }\n#endif\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/imagefind.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        imagefind.h\n// Description: Class to find image and drawing regions in an image\n//              and create a corresponding list of empty blobs.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_IMAGEFIND_H_\n#define TESSERACT_TEXTORD_IMAGEFIND_H_\n\n#include \"debugpixa.h\"\n\n#include <cstdint>\n\nnamespace tesseract {\n\nclass ColPartitionGrid;\nclass ColPartition_LIST;\nclass TabFind;\nclass TBOX;\nclass FCOORD;\nclass TO_BLOCK;\n\n// The ImageFind class is a simple static function wrapper class that\n// exposes the FindImages function and some useful helper functions.\nclass ImageFind {\npublic:\n  // Finds image regions within the BINARY source pix (page image) and returns\n  // the image regions as a mask image.\n  // The returned pix may be nullptr, meaning no images found.\n  // If not nullptr, it must be PixDestroyed by the caller.\n  // If textord_tabfind_show_images, debug images are appended to pixa_debug.\n  static Image FindImages(Image pix, DebugPixa *pixa_debug);\n\n  // Given an input pix, and a bounding rectangle, the sides of the rectangle\n  // are shrunk inwards until they bound any black pixels found within the\n  // original rectangle. 
Returns false if the rectangle contains no black\n  // pixels at all.\n  static bool BoundsWithinRect(Image pix, int *x_start, int *y_start, int *x_end, int *y_end);\n\n  // Given a point in 3-D (RGB) space, returns the squared Euclidean distance\n  // of the point from the given line, defined by a pair of points in the 3-D\n  // (RGB) space, line1 and line2.\n  static double ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2,\n                                      const uint8_t *point);\n\n  // Returns true if there are no black pixels in between the boxes.\n  // The im_box must represent the bounding box of the pix in tesseract\n  // coordinates, which may be negative, due to rotations to make the textlines\n  // horizontal. The boxes are rotated by rotation, which should undo such\n  // rotations, before mapping them onto the pix.\n  static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box,\n                                  const FCOORD &rotation, Image pix);\n\n  // Returns the number of pixels in box in the pix.\n  // The im_box must represent the bounding box of the pix in tesseract\n  // coordinates, which may be negative, due to rotations to make the textlines\n  // horizontal. The boxes are rotated by rotation, which should undo such\n  // rotations, before mapping them onto the pix.\n  static int CountPixelsInRotatedBox(TBOX box, const TBOX &im_box, const FCOORD &rotation,\n                                     Image pix);\n\n  // Locates all the image partitions in the part_grid, that were found by a\n  // previous call to FindImagePartitions, marks them in the image_mask,\n  // removes them from the grid, and deletes them. 
This makes it possible to\n  // call FindImagePartitions again to produce less broken-up and less\n  // overlapping image partitions.\n  // rerotation specifies how to rotate the partition coords to match\n  // the image_mask, since this function is used after orientation correction.\n  static void TransferImagePartsToImageMask(const FCOORD &rerotation, ColPartitionGrid *part_grid,\n                                            Image image_mask);\n\n  // Runs a CC analysis on the image_pix mask image, and creates\n  // image partitions from them, cutting out strong text, and merging with\n  // nearby image regions such that they don't interfere with text.\n  // Rotation and rerotation specify how to rotate image coords to match\n  // the blob and partition coords and back again.\n  // The input/output part_grid owns all the created partitions, and\n  // the partitions own all the fake blobs that belong in the partitions.\n  // Since the other blobs in the other partitions will be owned by the block,\n  // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this\n  // situation and collect the image blobs.\n  static void FindImagePartitions(Image image_pix, const FCOORD &rotation, const FCOORD &rerotation,\n                                  TO_BLOCK *block, TabFind *tab_grid, DebugPixa *pixa_debug,\n                                  ColPartitionGrid *part_grid, ColPartition_LIST *big_parts);\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_LINEFIND_H_\n"
  },
  {
    "path": "src/textord/linefind.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        linefind.cpp\n// Description: Class to find vertical lines in an image and create\n//              a corresponding list of empty blobs.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"alignedblob.h\"\n#include \"blobbox.h\"\n#include \"crakedge.h\" // for CRACKEDGE\n#include \"edgblob.h\"\n#include \"linefind.h\"\n#include \"tabvector.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n/// Denominator of resolution makes max pixel width to allow thin lines.\nconst int kThinLineFraction = 20;\n/// Denominator of resolution makes min pixels to demand line lengths to be.\nconst int kMinLineLengthFraction = 4;\n/// Spacing of cracks across the page to break up tall vertical lines.\nconst int kCrackSpacing = 100;\n/// Grid size used by line finder. Not very critical.\nconst int kLineFindGridSize = 50;\n// Min width of a line in pixels to be considered thick.\nconst int kMinThickLineWidth = 12;\n// Max size of line residue. 
(The pixels that fail the long thin opening, and\n// therefore don't make it to the candidate line mask, but are nevertheless\n// part of the line.)\nconst int kMaxLineResidue = 6;\n// Min length in inches of a line segment that exceeds kMinThickLineWidth in\n// thickness. (Such lines shouldn't break by simple image degradation.)\nconst double kThickLengthMultiple = 0.75;\n// Max fraction of line box area that can be occupied by non-line pixels.\nconst double kMaxNonLineDensity = 0.25;\n// Max height of a music stave in inches.\nconst double kMaxStaveHeight = 1.0;\n// Minimum fraction of pixels in a music rectangle connected to the staves.\nconst double kMinMusicPixelFraction = 0.75;\n\n// Erases the unused blobs from the line_pix image, taking into account\n// whether this was a horizontal or vertical line set.\nstatic void RemoveUnusedLineSegments(bool horizontal_lines, BLOBNBOX_LIST *line_bblobs,\n                                     Image line_pix) {\n  int height = pixGetHeight(line_pix);\n  BLOBNBOX_IT bbox_it(line_bblobs);\n  for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {\n    BLOBNBOX *blob = bbox_it.data();\n    if (blob->left_tab_type() != TT_VLINE) {\n      const TBOX &box = blob->bounding_box();\n      Box *pixbox = nullptr;\n      if (horizontal_lines) {\n        // Horizontal lines are in tess format and also have x and y flipped\n        // (to use FindVerticalAlignment) so we have to flip x and y and then\n        // convert to Leptonica by height - flipped x (ie the right edge).\n        // See GetLineBoxes for more explanation.\n        pixbox = boxCreate(box.bottom(), height - box.right(), box.height(), box.width());\n      } else {\n        // For vertical lines, just flip upside-down to convert to Leptonica.\n        // The y position of the box in Leptonica terms is the distance from\n        // the top of the image to the top of the box.\n        pixbox = boxCreate(box.left(), height - box.top(), box.width(), 
box.height());\n      }\n      pixClearInRect(line_pix, pixbox);\n      boxDestroy(&pixbox);\n    }\n  }\n}\n\n// Helper subtracts the line_pix image from the src_pix, and removes residue\n// as well by removing components that touch the line, but are not in the\n// non_line_pix mask. It is assumed that the non_line_pix mask has already\n// been prepared to required accuracy.\nstatic void SubtractLinesAndResidue(Image line_pix, Image non_line_pix,\n                                    Image src_pix) {\n  // First remove the lines themselves.\n  pixSubtract(src_pix, src_pix, line_pix);\n  // Subtract the non-lines from the image to get the residue.\n  Image residue_pix = pixSubtract(nullptr, src_pix, non_line_pix);\n  // Dilate the lines so they touch the residue.\n  Image fat_line_pix = pixDilateBrick(nullptr, line_pix, 3, 3);\n  // Seed fill the fat lines to get all the residue.\n  pixSeedfillBinary(fat_line_pix, fat_line_pix, residue_pix, 8);\n  // Subtract the residue from the original image.\n  pixSubtract(src_pix, src_pix, fat_line_pix);\n  fat_line_pix.destroy();\n  residue_pix.destroy();\n}\n\n// Returns the maximum strokewidth in the given binary image by doubling\n// the maximum of the distance function.\nstatic int MaxStrokeWidth(Image pix) {\n  Image dist_pix = pixDistanceFunction(pix, 4, 8, L_BOUNDARY_BG);\n  int width = pixGetWidth(dist_pix);\n  int height = pixGetHeight(dist_pix);\n  int wpl = pixGetWpl(dist_pix);\n  l_uint32 *data = pixGetData(dist_pix);\n  // Find the maximum value in the distance image.\n  int max_dist = 0;\n  for (int y = 0; y < height; ++y) {\n    for (int x = 0; x < width; ++x) {\n      int pixel = GET_DATA_BYTE(data, x);\n      if (pixel > max_dist) {\n        max_dist = pixel;\n      }\n    }\n    data += wpl;\n  }\n  dist_pix.destroy();\n  return max_dist * 2;\n}\n\n// Returns the number of components in the intersection_pix touched by line_box.\nstatic int NumTouchingIntersections(Box *line_box, Image intersection_pix) {\n  
if (intersection_pix == nullptr) {\n    return 0;\n  }\n  Image rect_pix = pixClipRectangle(intersection_pix, line_box, nullptr);\n  Boxa *boxa = pixConnComp(rect_pix, nullptr, 8);\n  rect_pix.destroy();\n  if (boxa == nullptr) {\n    return false;\n  }\n  int result = boxaGetCount(boxa);\n  boxaDestroy(&boxa);\n  return result;\n}\n\n// Returns the number of black pixels found in the box made by adding the line\n// width to both sides of the line bounding box. (Increasing the smallest\n// dimension of the bounding box.)\nstatic int CountPixelsAdjacentToLine(int line_width, Box *line_box, Image nonline_pix) {\n  l_int32 x, y, box_width, box_height;\n  boxGetGeometry(line_box, &x, &y, &box_width, &box_height);\n  if (box_width > box_height) {\n    // horizontal line.\n    int bottom = std::min(pixGetHeight(nonline_pix), y + box_height + line_width);\n    y = std::max(0, y - line_width);\n    box_height = bottom - y;\n  } else {\n    // Vertical line.\n    int right = std::min(pixGetWidth(nonline_pix), x + box_width + line_width);\n    x = std::max(0, x - line_width);\n    box_width = right - x;\n  }\n  Box *box = boxCreate(x, y, box_width, box_height);\n  Image rect_pix = pixClipRectangle(nonline_pix, box, nullptr);\n  boxDestroy(&box);\n  l_int32 result;\n  pixCountPixels(rect_pix, &result, nullptr);\n  rect_pix.destroy();\n  return result;\n}\n\n// Helper erases false-positive line segments from the input/output line_pix.\n// 1. Since thick lines shouldn't really break up, we can eliminate some false\n//    positives by marking segments that are at least kMinThickLineWidth\n//    thickness, yet have a length less than min_thick_length.\n// 2. 
Lines that don't have at least 2 intersections with other lines and have\n//    a lot of neighbouring non-lines are probably not lines (perhaps arabic\n//    or Hindi words, or underlines.)\n// Bad line components are erased from line_pix.\n// Returns the number of remaining connected components.\nstatic int FilterFalsePositives(int resolution, Image nonline_pix, Image intersection_pix,\n                                Image line_pix) {\n  int min_thick_length = static_cast<int>(resolution * kThickLengthMultiple);\n  Pixa *pixa = nullptr;\n  Boxa *boxa = pixConnComp(line_pix, &pixa, 8);\n  // Iterate over the boxes to remove false positives.\n  int nboxes = boxaGetCount(boxa);\n  int remaining_boxes = nboxes;\n  for (int i = 0; i < nboxes; ++i) {\n    Box *box = boxaGetBox(boxa, i, L_CLONE);\n    l_int32 x, y, box_width, box_height;\n    boxGetGeometry(box, &x, &y, &box_width, &box_height);\n    Image comp_pix = pixaGetPix(pixa, i, L_CLONE);\n    int max_width = MaxStrokeWidth(comp_pix);\n    comp_pix.destroy();\n    bool bad_line = false;\n    // If the length is too short to stand-alone as a line, and the box width\n    // is thick enough, and the stroke width is thick enough it is bad.\n    if (box_width >= kMinThickLineWidth && box_height >= kMinThickLineWidth &&\n        box_width < min_thick_length && box_height < min_thick_length &&\n        max_width > kMinThickLineWidth) {\n      // Too thick for the length.\n      bad_line = true;\n    }\n    if (!bad_line && (NumTouchingIntersections(box, intersection_pix) < 2)) {\n      // Test non-line density near the line.\n      int nonline_count = CountPixelsAdjacentToLine(max_width, box, nonline_pix);\n      if (nonline_count > box_height * box_width * kMaxNonLineDensity) {\n        bad_line = true;\n      }\n    }\n    if (bad_line) {\n      // Not a good line.\n      pixClearInRect(line_pix, box);\n      --remaining_boxes;\n    }\n    boxDestroy(&box);\n  }\n  pixaDestroy(&pixa);\n  boxaDestroy(&boxa);\n  return 
remaining_boxes;\n}\n\n// Converts the Boxa array to a list of C_BLOB, getting rid of severely\n// overlapping outlines and those that are children of a bigger one.\n// The output is a list of C_BLOBs that are owned by the list.\n// The C_OUTLINEs in the C_BLOBs contain no outline data - just empty\n// bounding boxes. The Boxa is consumed and destroyed.\nstatic void ConvertBoxaToBlobs(int image_width, int image_height, Boxa **boxes,\n                               C_BLOB_LIST *blobs) {\n  C_OUTLINE_LIST outlines;\n  C_OUTLINE_IT ol_it = &outlines;\n  // Iterate the boxes to convert to outlines.\n  int nboxes = boxaGetCount(*boxes);\n  for (int i = 0; i < nboxes; ++i) {\n    l_int32 x, y, width, height;\n    boxaGetBoxGeometry(*boxes, i, &x, &y, &width, &height);\n    // Make a C_OUTLINE from the leptonica box. This is a bit of a hack,\n    // as there is no outline, just a bounding box, but with some very\n    // small changes to coutln.cpp, it works nicely.\n    ICOORD top_left(x, y);\n    ICOORD bot_right(x + width, y + height);\n    CRACKEDGE startpt;\n    startpt.pos = top_left;\n    auto *outline = new C_OUTLINE(&startpt, top_left, bot_right, 0);\n    ol_it.add_after_then_move(outline);\n  }\n  // Use outlines_to_blobs to convert the outlines to blobs and find\n  // overlapping and contained objects. The output list of blobs in the block\n  // has all the bad ones filtered out and deleted.\n  BLOCK block;\n  ICOORD page_tl(0, 0);\n  ICOORD page_br(image_width, image_height);\n  outlines_to_blobs(&block, page_tl, page_br, &outlines);\n  // Transfer the created blobs to the output list.\n  C_BLOB_IT blob_it(blobs);\n  blob_it.add_list_after(block.blob_list());\n  // The boxes aren't needed any more.\n  boxaDestroy(boxes);\n}\n\n// Returns a list of boxes corresponding to the candidate line segments. 
Sets\n// the line_crossings member of the boxes so we can later determine the number\n// of intersections touched by a full line.\nstatic void GetLineBoxes(bool horizontal_lines, Image pix_lines, Image pix_intersections,\n                         C_BLOB_LIST *line_cblobs, BLOBNBOX_LIST *line_bblobs) {\n  // Put a single pixel crack in every line at an arbitrary spacing,\n  // so they break up and the bounding boxes can be used to get the\n  // direction accurately enough without needing outlines.\n  int wpl = pixGetWpl(pix_lines);\n  int width = pixGetWidth(pix_lines);\n  int height = pixGetHeight(pix_lines);\n  l_uint32 *data = pixGetData(pix_lines);\n  if (horizontal_lines) {\n    for (int y = 0; y < height; ++y, data += wpl) {\n      for (int x = kCrackSpacing; x < width; x += kCrackSpacing) {\n        CLEAR_DATA_BIT(data, x);\n      }\n    }\n  } else {\n    for (int y = kCrackSpacing; y < height; y += kCrackSpacing) {\n      memset(data + wpl * y, 0, wpl * sizeof(*data));\n    }\n  }\n  // Get the individual connected components\n  Boxa *boxa = pixConnComp(pix_lines, nullptr, 8);\n  ConvertBoxaToBlobs(width, height, &boxa, line_cblobs);\n  // Make the BLOBNBOXes from the C_BLOBs.\n  C_BLOB_IT blob_it(line_cblobs);\n  BLOBNBOX_IT bbox_it(line_bblobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    C_BLOB *cblob = blob_it.data();\n    auto *bblob = new BLOBNBOX(cblob);\n    bbox_it.add_to_end(bblob);\n    // Determine whether the line segment touches two intersections.\n    const TBOX &bbox = bblob->bounding_box();\n    Box *box = boxCreate(bbox.left(), bbox.bottom(), bbox.width(), bbox.height());\n    bblob->set_line_crossings(NumTouchingIntersections(box, pix_intersections));\n    boxDestroy(&box);\n    // Transform the bounding box prior to finding lines. To save writing\n    // two line finders, flip x and y for horizontal lines and re-use the\n    // tab-stop detection code. 
For vertical lines we still have to flip the\n    // y-coordinates to switch from leptonica coords to tesseract coords.\n    if (horizontal_lines) {\n      // Note that we have Leptonica coords stored in a Tesseract box, so that\n      // bbox.bottom(), being the MIN y coord, is actually the top, so to get\n      // back to Leptonica coords in RemoveUnusedLineSegments, we have to\n      // use height - box.right() as the top, which looks very odd.\n      TBOX new_box(height - bbox.top(), bbox.left(), height - bbox.bottom(), bbox.right());\n      bblob->set_bounding_box(new_box);\n    } else {\n      TBOX new_box(bbox.left(), height - bbox.top(), bbox.right(), height - bbox.bottom());\n      bblob->set_bounding_box(new_box);\n    }\n  }\n}\n\n// Finds vertical lines in the given list of BLOBNBOXes. bleft and tright\n// are the bounds of the image on which the input line_bblobs were found.\n// The input line_bblobs list is const really.\n// The output vertical_x and vertical_y are the total of all the vectors.\n// The output list of TabVector makes no reference to the input BLOBNBOXes.\nstatic void FindLineVectors(const ICOORD &bleft, const ICOORD &tright,\n                            BLOBNBOX_LIST *line_bblobs, int *vertical_x, int *vertical_y,\n                            TabVector_LIST *vectors) {\n  BLOBNBOX_IT bbox_it(line_bblobs);\n  int b_count = 0;\n  // Put all the blobs into the grid to find the lines, and move the blobs\n  // to the output lists.\n  AlignedBlob blob_grid(kLineFindGridSize, bleft, tright);\n  for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {\n    BLOBNBOX *bblob = bbox_it.data();\n    bblob->set_left_tab_type(TT_MAYBE_ALIGNED);\n    bblob->set_left_rule(bleft.x());\n    bblob->set_right_rule(tright.x());\n    bblob->set_left_crossing_rule(bleft.x());\n    bblob->set_right_crossing_rule(tright.x());\n    blob_grid.InsertBBox(false, true, bblob);\n    ++b_count;\n  }\n  if (b_count == 0) {\n    return;\n  }\n\n  // 
Search the entire grid, looking for vertical line vectors.\n  BlobGridSearch lsearch(&blob_grid);\n  BLOBNBOX *bbox;\n  TabVector_IT vector_it(vectors);\n  *vertical_x = 0;\n  *vertical_y = 1;\n  lsearch.StartFullSearch();\n  while ((bbox = lsearch.NextFullSearch()) != nullptr) {\n    if (bbox->left_tab_type() == TT_MAYBE_ALIGNED) {\n      const TBOX &box = bbox->bounding_box();\n      if (AlignedBlob::WithinTestRegion(2, box.left(), box.bottom())) {\n        tprintf(\"Finding line vector starting at bbox (%d,%d)\\n\", box.left(), box.bottom());\n      }\n      AlignedBlobParams align_params(*vertical_x, *vertical_y, box.width());\n      TabVector *vector =\n          blob_grid.FindVerticalAlignment(align_params, bbox, vertical_x, vertical_y);\n      if (vector != nullptr) {\n        vector->Freeze();\n        vector_it.add_to_end(vector);\n      }\n    }\n  }\n}\n\n// Returns a Pix music mask if music is detected.\n// Any vertical line that has at least 5 intersections in sufficient density\n// is taken to be a bar. 
Bars are used as a seed and the entire touching\n// component is added to the output music mask and subtracted from the lines.\n// Returns nullptr and does minimal work if no music is found.\nstatic Image FilterMusic(int resolution, Image pix_closed, Image pix_vline, Image pix_hline,\n                        bool &v_empty, bool &h_empty) {\n  int max_stave_height = static_cast<int>(resolution * kMaxStaveHeight);\n  Image intersection_pix = pix_vline & pix_hline;\n  Boxa *boxa = pixConnComp(pix_vline, nullptr, 8);\n  // Iterate over the boxes to find music bars.\n  int nboxes = boxaGetCount(boxa);\n  Image music_mask = nullptr;\n  for (int i = 0; i < nboxes; ++i) {\n    Box *box = boxaGetBox(boxa, i, L_CLONE);\n    l_int32 x, y, box_width, box_height;\n    boxGetGeometry(box, &x, &y, &box_width, &box_height);\n    int joins = NumTouchingIntersections(box, intersection_pix);\n    // Test for the join density being at least 5 per max_stave_height,\n    // ie (joins-1)/box_height >= (5-1)/max_stave_height.\n    if (joins >= 5 && (joins - 1) * max_stave_height >= 4 * box_height) {\n      // This is a music bar. Add to the mask.\n      if (music_mask == nullptr) {\n        music_mask = pixCreate(pixGetWidth(pix_vline), pixGetHeight(pix_vline), 1);\n      }\n      pixSetInRect(music_mask, box);\n    }\n    boxDestroy(&box);\n  }\n  boxaDestroy(&boxa);\n  intersection_pix.destroy();\n  if (music_mask != nullptr) {\n    // The mask currently contains just the bars. Use the mask as a seed\n    // and the pix_closed as the mask for a seedfill to get all the\n    // intersecting staves.\n    pixSeedfillBinary(music_mask, music_mask, pix_closed, 8);\n    // Filter out false positives. 
CCs in the music_mask should be the vast\n    // majority of the pixels in their bounding boxes, as we expect just a\n    // tiny amount of text, a few phrase marks, and crescendo etc left.\n    Boxa *boxa = pixConnComp(music_mask, nullptr, 8);\n    // Iterate over the boxes to find music components.\n    int nboxes = boxaGetCount(boxa);\n    for (int i = 0; i < nboxes; ++i) {\n      Box *box = boxaGetBox(boxa, i, L_CLONE);\n      Image rect_pix = pixClipRectangle(music_mask, box, nullptr);\n      l_int32 music_pixels;\n      pixCountPixels(rect_pix, &music_pixels, nullptr);\n      rect_pix.destroy();\n      rect_pix = pixClipRectangle(pix_closed, box, nullptr);\n      l_int32 all_pixels;\n      pixCountPixels(rect_pix, &all_pixels, nullptr);\n      rect_pix.destroy();\n      if (music_pixels < kMinMusicPixelFraction * all_pixels) {\n        // False positive. Delete from the music mask.\n        pixClearInRect(music_mask, box);\n      }\n      boxDestroy(&box);\n    }\n    boxaDestroy(&boxa);\n    if (music_mask.isZero()) {\n      music_mask.destroy();\n    } else {\n      pixSubtract(pix_vline, pix_vline, music_mask);\n      pixSubtract(pix_hline, pix_hline, music_mask);\n      // We may have deleted all the lines\n      v_empty = pix_vline.isZero();\n      h_empty = pix_hline.isZero();\n    }\n  }\n  return music_mask;\n}\n\n// Most of the heavy lifting of line finding. 
Given src_pix and its separate\n// resolution, returns image masks:\n// pix_vline           candidate vertical lines.\n// pix_non_vline       pixels that didn't look like vertical lines.\n// pix_hline           candidate horizontal lines.\n// pix_non_hline       pixels that didn't look like horizontal lines.\n// pix_intersections   pixels where vertical and horizontal lines meet.\n// pix_music_mask      candidate music staves.\n// This function promises to initialize all the output (2nd level) pointers,\n// but any of the returns that are empty will be nullptr on output.\n// None of the input (1st level) pointers may be nullptr except\n// pix_music_mask, which will disable music detection, and pixa_display, which\n// is for debug.\nstatic void GetLineMasks(int resolution, Image src_pix, Image *pix_vline, Image *pix_non_vline,\n                         Image *pix_hline, Image *pix_non_hline, Image *pix_intersections,\n                         Image *pix_music_mask, Pixa *pixa_display) {\n  Image pix_closed = nullptr;\n  Image pix_hollow = nullptr;\n\n  int max_line_width = resolution / kThinLineFraction;\n  int min_line_length = resolution / kMinLineLengthFraction;\n  if (pixa_display != nullptr) {\n    tprintf(\"Image resolution = %d, max line width = %d, min length=%d\\n\", resolution,\n            max_line_width, min_line_length);\n  }\n  int closing_brick = max_line_width / 3;\n\n  // Close up small holes, making it less likely that false alarms are found\n  // in thickened text (as it will become more solid) and also smoothing over\n  // some line breaks and nicks in the edges of the lines.\n  pix_closed = pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);\n  if (pixa_display != nullptr) {\n    pixaAddPix(pixa_display, pix_closed, L_CLONE);\n  }\n  // Open up with a big box to detect solid areas, which can then be\n  // subtracted. 
This is very generous and will leave in even quite wide\n  // lines.\n  Image pix_solid = pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);\n  if (pixa_display != nullptr) {\n    pixaAddPix(pixa_display, pix_solid, L_CLONE);\n  }\n  pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);\n\n  pix_solid.destroy();\n\n  // Now open up in both directions independently to find lines of at least\n  // 1 inch/kMinLineLengthFraction in length.\n  if (pixa_display != nullptr) {\n    pixaAddPix(pixa_display, pix_hollow, L_CLONE);\n  }\n  *pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);\n  *pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);\n\n  pix_hollow.destroy();\n\n  // Lines are sufficiently rare, that it is worth checking for a zero image.\n  bool v_empty = pix_vline->isZero();\n  bool h_empty = pix_hline->isZero();\n  if (pix_music_mask != nullptr) {\n    if (!v_empty && !h_empty) {\n      *pix_music_mask =\n          FilterMusic(resolution, pix_closed, *pix_vline, *pix_hline, v_empty, h_empty);\n    } else {\n      *pix_music_mask = nullptr;\n    }\n  }\n  pix_closed.destroy();\n  Image pix_nonlines = nullptr;\n  *pix_intersections = nullptr;\n  Image extra_non_hlines = nullptr;\n  if (!v_empty) {\n    // Subtract both line candidates from the source to get definite non-lines.\n    pix_nonlines = pixSubtract(nullptr, src_pix, *pix_vline);\n    if (!h_empty) {\n      pixSubtract(pix_nonlines, pix_nonlines, *pix_hline);\n      // Intersections are a useful indicator for likelihood of being a line.\n      *pix_intersections = *pix_vline & *pix_hline;\n      // Candidate vlines are not hlines (apart from the intersections)\n      // and vice versa.\n      extra_non_hlines = pixSubtract(nullptr, *pix_vline, *pix_intersections);\n    }\n    *pix_non_vline = pixErodeBrick(nullptr, pix_nonlines, kMaxLineResidue, 1);\n    pixSeedfillBinary(*pix_non_vline, *pix_non_vline, pix_nonlines, 8);\n    if (!h_empty) {\n      // 
Candidate hlines are not vlines.\n      *pix_non_vline |= *pix_hline;\n      pixSubtract(*pix_non_vline, *pix_non_vline, *pix_intersections);\n    }\n    if (!FilterFalsePositives(resolution, *pix_non_vline, *pix_intersections, *pix_vline)) {\n      pix_vline->destroy(); // No candidates left.\n    }\n  } else {\n    // No vertical lines.\n    pix_vline->destroy();\n    *pix_non_vline = nullptr;\n    if (!h_empty) {\n      pix_nonlines = pixSubtract(nullptr, src_pix, *pix_hline);\n    }\n  }\n  if (h_empty) {\n    pix_hline->destroy();\n    *pix_non_hline = nullptr;\n    if (v_empty) {\n      return;\n    }\n  } else {\n    *pix_non_hline = pixErodeBrick(nullptr, pix_nonlines, 1, kMaxLineResidue);\n    pixSeedfillBinary(*pix_non_hline, *pix_non_hline, pix_nonlines, 8);\n    if (extra_non_hlines != nullptr) {\n      *pix_non_hline |= extra_non_hlines;\n      extra_non_hlines.destroy();\n    }\n    if (!FilterFalsePositives(resolution, *pix_non_hline, *pix_intersections, *pix_hline)) {\n      pix_hline->destroy(); // No candidates left.\n    }\n  }\n  if (pixa_display != nullptr) {\n    if (*pix_vline != nullptr) {\n      pixaAddPix(pixa_display, *pix_vline, L_CLONE);\n    }\n    if (*pix_hline != nullptr) {\n      pixaAddPix(pixa_display, *pix_hline, L_CLONE);\n    }\n    if (pix_nonlines != nullptr) {\n      pixaAddPix(pixa_display, pix_nonlines, L_CLONE);\n    }\n    if (*pix_non_vline != nullptr) {\n      pixaAddPix(pixa_display, *pix_non_vline, L_CLONE);\n    }\n    if (*pix_non_hline != nullptr) {\n      pixaAddPix(pixa_display, *pix_non_hline, L_CLONE);\n    }\n    if (*pix_intersections != nullptr) {\n      pixaAddPix(pixa_display, *pix_intersections, L_CLONE);\n    }\n    if (pix_music_mask != nullptr && *pix_music_mask != nullptr) {\n      pixaAddPix(pixa_display, *pix_music_mask, L_CLONE);\n    }\n  }\n  pix_nonlines.destroy();\n}\n\n// Finds vertical line objects in pix_vline and removes them from src_pix.\n// Uses the given resolution to determine size 
thresholds instead of any\n// that may be present in the pix.\n// The output vertical_x and vertical_y contain a sum of the output vectors,\n// thereby giving the mean vertical direction.\n// The output vectors are owned by the list and Frozen (cannot refit) by\n// having no boxes, as there is no need to refit or merge separator lines.\n// If no good lines are found, pix_vline is destroyed.\n// None of the input pointers may be nullptr, and if *pix_vline is nullptr then\n// the function does nothing.\nstatic void FindAndRemoveVLines(Image pix_intersections, int *vertical_x,\n                                int *vertical_y, Image *pix_vline, Image pix_non_vline,\n                                Image src_pix, TabVector_LIST *vectors) {\n  if (pix_vline == nullptr || *pix_vline == nullptr) {\n    return;\n  }\n  C_BLOB_LIST line_cblobs;\n  BLOBNBOX_LIST line_bblobs;\n  GetLineBoxes(false, *pix_vline, pix_intersections, &line_cblobs, &line_bblobs);\n  int width = pixGetWidth(src_pix);\n  int height = pixGetHeight(src_pix);\n  ICOORD bleft(0, 0);\n  ICOORD tright(width, height);\n  FindLineVectors(bleft, tright, &line_bblobs, vertical_x, vertical_y, vectors);\n  if (!vectors->empty()) {\n    RemoveUnusedLineSegments(false, &line_bblobs, *pix_vline);\n    SubtractLinesAndResidue(*pix_vline, pix_non_vline, src_pix);\n    ICOORD vertical;\n    vertical.set_with_shrink(*vertical_x, *vertical_y);\n    TabVector::MergeSimilarTabVectors(vertical, vectors, nullptr);\n  } else {\n    pix_vline->destroy();\n  }\n}\n\n// Finds horizontal line objects in pix_hline and removes them from src_pix.\n// Uses the given resolution to determine size thresholds instead of any\n// that may be present in the pix.\n// The output vertical_x and vertical_y contain a sum of the output vectors,\n// thereby giving the mean vertical direction.\n// The output vectors are owned by the list and Frozen (cannot refit) by\n// having no boxes, as there is no need to refit or merge separator lines.\n// If 
no good lines are found, pix_hline is destroyed.\n// None of the input pointers may be nullptr, and if *pix_hline is nullptr then\n// the function does nothing.\nstatic void FindAndRemoveHLines(Image pix_intersections, int vertical_x,\n                                int vertical_y, Image *pix_hline, Image pix_non_hline,\n                                Image src_pix, TabVector_LIST *vectors) {\n  if (pix_hline == nullptr || *pix_hline == nullptr) {\n    return;\n  }\n  C_BLOB_LIST line_cblobs;\n  BLOBNBOX_LIST line_bblobs;\n  GetLineBoxes(true, *pix_hline, pix_intersections, &line_cblobs, &line_bblobs);\n  int width = pixGetWidth(src_pix);\n  int height = pixGetHeight(src_pix);\n  ICOORD bleft(0, 0);\n  ICOORD tright(height, width);\n  FindLineVectors(bleft, tright, &line_bblobs, &vertical_x, &vertical_y, vectors);\n  if (!vectors->empty()) {\n    RemoveUnusedLineSegments(true, &line_bblobs, *pix_hline);\n    SubtractLinesAndResidue(*pix_hline, pix_non_hline, src_pix);\n    ICOORD vertical;\n    vertical.set_with_shrink(vertical_x, vertical_y);\n    TabVector::MergeSimilarTabVectors(vertical, vectors, nullptr);\n    // Iterate the vectors to flip them. x and y were flipped for horizontal\n    // lines, so FindLineVectors can work just with the vertical case.\n    // See GetLineBoxes for more on the flip.\n    TabVector_IT h_it(vectors);\n    for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) {\n      h_it.data()->XYFlip();\n    }\n  } else {\n    pix_hline->destroy();\n  }\n}\n\n// Finds vertical and horizontal line objects in the given pix.\n// Uses the given resolution to determine size thresholds instead of any\n// that may be present in the pix.\n// The output vertical_x and vertical_y contain a sum of the output vectors,\n// thereby giving the mean vertical direction.\n// If pix_music_mask != nullptr, and music is detected, a mask of the staves\n// and anything that is connected (bars, notes etc.) 
will be returned in\n// pix_music_mask, the mask subtracted from pix, and the lines will not\n// appear in v_lines or h_lines.\n// The output vectors are owned by the list and Frozen (cannot refit) by\n// having no boxes, as there is no need to refit or merge separator lines.\n// The detected lines are removed from the pix.\nvoid LineFinder::FindAndRemoveLines(int resolution, bool debug, Image pix, int *vertical_x,\n                                    int *vertical_y, Image *pix_music_mask, TabVector_LIST *v_lines,\n                                    TabVector_LIST *h_lines) {\n  if (pix == nullptr || vertical_x == nullptr || vertical_y == nullptr) {\n    tprintf(\"Error in parameters for LineFinder::FindAndRemoveLines\\n\");\n    return;\n  }\n  Image pix_vline = nullptr;\n  Image pix_non_vline = nullptr;\n  Image pix_hline = nullptr;\n  Image pix_non_hline = nullptr;\n  Image pix_intersections = nullptr;\n  Pixa *pixa_display = debug ? pixaCreate(0) : nullptr;\n  GetLineMasks(resolution, pix, &pix_vline, &pix_non_vline, &pix_hline, &pix_non_hline,\n               &pix_intersections, pix_music_mask, pixa_display);\n  // Find lines, convert to TabVector_LIST and remove those that are used.\n  FindAndRemoveVLines(pix_intersections, vertical_x, vertical_y, &pix_vline,\n                      pix_non_vline, pix, v_lines);\n  pix_intersections.destroy();\n  if (pix_hline != nullptr) {\n    // Recompute intersections and re-filter false positive h-lines.\n    if (pix_vline != nullptr) {\n      pix_intersections = pix_vline & pix_hline;\n    }\n    if (!FilterFalsePositives(resolution, pix_non_hline, pix_intersections, pix_hline)) {\n      pix_hline.destroy();\n    }\n  }\n  FindAndRemoveHLines(pix_intersections, *vertical_x, *vertical_y, &pix_hline,\n                      pix_non_hline, pix, h_lines);\n  if (pixa_display != nullptr && pix_vline != nullptr) {\n    pixaAddPix(pixa_display, pix_vline, L_CLONE);\n  }\n  if (pixa_display != nullptr && pix_hline != nullptr) 
{\n    pixaAddPix(pixa_display, pix_hline, L_CLONE);\n  }\n  pix_intersections.destroy();\n  if (pix_vline != nullptr && pix_hline != nullptr) {\n    // Remove joins (intersections) where lines cross, and the residue.\n    // Recalculate the intersections, since some lines have been deleted.\n    pix_intersections = pix_vline & pix_hline;\n    // Fatten up the intersections and seed-fill to get the intersection\n    // residue.\n    Image pix_join_residue = pixDilateBrick(nullptr, pix_intersections, 5, 5);\n    pixSeedfillBinary(pix_join_residue, pix_join_residue, pix, 8);\n    // Now remove the intersection residue.\n    pixSubtract(pix, pix, pix_join_residue);\n    pix_join_residue.destroy();\n  }\n  // Remove any detected music.\n  if (pix_music_mask != nullptr && *pix_music_mask != nullptr) {\n    if (pixa_display != nullptr) {\n      pixaAddPix(pixa_display, *pix_music_mask, L_CLONE);\n    }\n    pixSubtract(pix, pix, *pix_music_mask);\n  }\n  if (pixa_display != nullptr) {\n    pixaAddPix(pixa_display, pix, L_CLONE);\n  }\n\n  pix_vline.destroy();\n  pix_non_vline.destroy();\n  pix_hline.destroy();\n  pix_non_hline.destroy();\n  pix_intersections.destroy();\n  if (pixa_display != nullptr) {\n    pixaConvertToPdf(pixa_display, resolution, 1.0f, 0, 0, \"LineFinding\", \"vhlinefinding.pdf\");\n    pixaDestroy(&pixa_display);\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/linefind.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        linefind.h\n// Description: Class to find vertical lines in an image and create\n//              a corresponding list of empty blobs.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_LINEFIND_H_\n#define TESSERACT_TEXTORD_LINEFIND_H_\n\nnamespace tesseract {\n\nclass TabVector_LIST;\n\n/**\n * The LineFinder class is a simple static function wrapper class that mainly\n * exposes the FindVerticalLines function.\n */\nclass LineFinder {\npublic:\n  /**\n   * Finds vertical and horizontal line objects in the given pix and removes\n   * them.\n   *\n   * Uses the given resolution to determine size thresholds instead of any\n   * that may be present in the pix.\n   *\n   * The output vertical_x and vertical_y contain a sum of the output vectors,\n   * thereby giving the mean vertical direction.\n   *\n   * If pix_music_mask != nullptr, and music is detected, a mask of the staves\n   * and anything that is connected (bars, notes etc.) 
will be returned in\n   * pix_music_mask, the mask subtracted from pix, and the lines will not\n   * appear in v_lines or h_lines.\n   *\n   * The output vectors are owned by the list and Frozen (cannot refit) by\n   * having no boxes, as there is no need to refit or merge separator lines.\n   *\n   * The detected lines are removed from the pix.\n   */\n  static void FindAndRemoveLines(int resolution, bool debug, Image pix, int *vertical_x,\n                                 int *vertical_y, Image *pix_music_mask, TabVector_LIST *v_lines,\n                                 TabVector_LIST *h_lines);\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_LINEFIND_H_\n"
  },
  {
    "path": "src/textord/makerow.cpp",
    "content": "/**********************************************************************\n * File:        makerow.cpp  (Formerly makerows.c)\n * Description: Code to arrange blobs into rows of text.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"makerow.h\"\n\n#include \"blkocc.h\"\n#include \"blobbox.h\"\n#include \"ccstruct.h\"\n#include \"detlinefit.h\"\n#include \"drawtord.h\"\n#include \"oldbasel.h\"\n#include \"sortflts.h\"\n#include \"statistc.h\"\n#include \"textord.h\"\n#include \"tordmain.h\"\n#include \"tovars.h\"\n#include \"tprintf.h\"\n#include \"underlin.h\"\n\n#include <algorithm>\n#include <cmath>\n#include <vector> // for std::vector\n\nnamespace tesseract {\n\nBOOL_VAR(textord_heavy_nr, false, \"Vigorously remove noise\");\nBOOL_VAR(textord_show_initial_rows, false, \"Display row accumulation\");\nBOOL_VAR(textord_show_parallel_rows, false, \"Display page correlated rows\");\nBOOL_VAR(textord_show_expanded_rows, false, \"Display rows after expanding\");\nBOOL_VAR(textord_show_final_rows, false, \"Display rows after final fitting\");\nBOOL_VAR(textord_show_final_blobs, false, \"Display blob bounds after 
pre-ass\");\nBOOL_VAR(textord_test_landscape, false, \"Tests refer to land/port\");\nBOOL_VAR(textord_parallel_baselines, true, \"Force parallel baselines\");\nBOOL_VAR(textord_straight_baselines, false, \"Force straight baselines\");\nBOOL_VAR(textord_old_baselines, true, \"Use old baseline algorithm\");\nBOOL_VAR(textord_old_xheight, false, \"Use old xheight algorithm\");\nBOOL_VAR(textord_fix_xheight_bug, true, \"Use spline baseline\");\nBOOL_VAR(textord_fix_makerow_bug, true, \"Prevent multiple baselines\");\nBOOL_VAR(textord_debug_xheights, false, \"Test xheight algorithms\");\nstatic BOOL_VAR(textord_biased_skewcalc, true, \"Bias skew estimates with line length\");\nstatic BOOL_VAR(textord_interpolating_skew, true, \"Interpolate across gaps\");\nstatic INT_VAR(textord_skewsmooth_offset, 4, \"For smooth factor\");\nstatic INT_VAR(textord_skewsmooth_offset2, 1, \"For smooth factor\");\nINT_VAR(textord_test_x, -INT32_MAX, \"coord of test pt\");\nINT_VAR(textord_test_y, -INT32_MAX, \"coord of test pt\");\nINT_VAR(textord_min_blobs_in_row, 4, \"Min blobs before gradient counted\");\nINT_VAR(textord_spline_minblobs, 8, \"Min blobs in each spline segment\");\nINT_VAR(textord_spline_medianwin, 6, \"Size of window for spline segmentation\");\nstatic INT_VAR(textord_max_blob_overlaps, 4, \"Max number of blobs a big blob can overlap\");\nINT_VAR(textord_min_xheight, 10, \"Min credible pixel xheight\");\ndouble_VAR(textord_spline_shift_fraction, 0.02, \"Fraction of line spacing for quad\");\ndouble_VAR(textord_skew_ile, 0.5, \"Ile of gradients for page skew\");\ndouble_VAR(textord_skew_lag, 0.02, \"Lag for skew on row accumulation\");\ndouble_VAR(textord_linespace_iqrlimit, 0.2, \"Max iqr/median for linespace\");\ndouble_VAR(textord_width_limit, 8, \"Max width of blobs to make rows\");\ndouble_VAR(textord_chop_width, 1.5, \"Max width before chopping\");\nstatic double_VAR(textord_expansion_factor, 1.0, \"Factor to expand rows by in expand_rows\");\nstatic 
double_VAR(textord_overlap_x, 0.375, \"Fraction of linespace for good overlap\");\ndouble_VAR(textord_minxh, 0.25, \"fraction of linesize for min xheight\");\ndouble_VAR(textord_min_linesize, 1.25, \"* blob height for initial linesize\");\ndouble_VAR(textord_excess_blobsize, 1.3, \"New row made if blob makes row this big\");\ndouble_VAR(textord_occupancy_threshold, 0.4, \"Fraction of neighbourhood\");\ndouble_VAR(textord_underline_width, 2.0, \"Multiple of line_size for underline\");\ndouble_VAR(textord_min_blob_height_fraction, 0.75,\n           \"Min blob height/top to include blob top into xheight stats\");\ndouble_VAR(textord_xheight_mode_fraction, 0.4, \"Min pile height to make xheight\");\ndouble_VAR(textord_ascheight_mode_fraction, 0.08, \"Min pile height to make ascheight\");\nstatic double_VAR(textord_descheight_mode_fraction, 0.08, \"Min pile height to make descheight\");\ndouble_VAR(textord_ascx_ratio_min, 1.25, \"Min cap/xheight\");\ndouble_VAR(textord_ascx_ratio_max, 1.8, \"Max cap/xheight\");\ndouble_VAR(textord_descx_ratio_min, 0.25, \"Min desc/xheight\");\ndouble_VAR(textord_descx_ratio_max, 0.6, \"Max desc/xheight\");\ndouble_VAR(textord_xheight_error_margin, 0.1, \"Accepted variation\");\nINT_VAR(textord_lms_line_trials, 12, \"Number of linew fits to do\");\nBOOL_VAR(textord_new_initial_xheight, true, \"Use test xheight mechanism\");\nBOOL_VAR(textord_debug_blob, false, \"Print test blob information\");\n\n#define MAX_HEIGHT_MODES 12\n\nconst int kMinLeaderCount = 5;\n\n/**\n * @name row_y_order\n *\n * Sort function to sort rows in y from page top.\n */\nstatic int row_y_order(       // sort function\n    const TO_ROW *row1, // items to compare\n    const TO_ROW *row2) {\n  if (row1->parallel_c() > row2->parallel_c()) {\n    return -1;\n  } else if (row1->parallel_c() < row2->parallel_c()) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\n/**\n * @name row_spacing_order\n *\n * Qsort style function to compare 2 TO_ROWS based on their spacing 
value.\n */\nstatic int row_spacing_order( // sort function\n    const TO_ROW *row1, // items to compare\n    const TO_ROW *row2) {\n  return row1->spacing < row2->spacing;\n}\n\n// Factored-out helper to build a single row from a list of blobs.\n// Returns the mean blob size.\nstatic float MakeRowFromBlobs(float line_size, BLOBNBOX_IT *blob_it, TO_ROW_IT *row_it) {\n  blob_it->sort(blob_x_order);\n  blob_it->move_to_first();\n  TO_ROW *row = nullptr;\n  float total_size = 0.0f;\n  int blob_count = 0;\n  // Add all the blobs to a single TO_ROW.\n  for (; !blob_it->empty(); blob_it->forward()) {\n    BLOBNBOX *blob = blob_it->extract();\n    int top = blob->bounding_box().top();\n    int bottom = blob->bounding_box().bottom();\n    if (row == nullptr) {\n      row = new TO_ROW(blob, top, bottom, line_size);\n      row_it->add_before_then_move(row);\n    } else {\n      row->add_blob(blob, top, bottom, line_size);\n    }\n    total_size += top - bottom;\n    ++blob_count;\n  }\n  return blob_count > 0 ? 
total_size / blob_count : total_size;\n}\n\n// Helper to make a row using the children of a single blob.\n// Returns the mean size of the blobs created.\nstatic float MakeRowFromSubBlobs(TO_BLOCK *block, C_BLOB *blob, TO_ROW_IT *row_it) {\n  // The blobs made from the children will go in the small_blobs list.\n  BLOBNBOX_IT bb_it(&block->small_blobs);\n  C_OUTLINE_IT ol_it(blob->out_list());\n  // Get the children.\n  ol_it.set_to_list(ol_it.data()->child());\n  if (ol_it.empty()) {\n    return 0.0f;\n  }\n  for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {\n    // Deep copy the child outline and use that to make a blob.\n    blob = new C_BLOB(C_OUTLINE::deep_copy(ol_it.data()));\n    // Correct direction as needed.\n    blob->CheckInverseFlagAndDirection();\n    auto *bbox = new BLOBNBOX(blob);\n    bb_it.add_after_then_move(bbox);\n  }\n  // Now we can make a row from the blobs.\n  return MakeRowFromBlobs(block->line_size, &bb_it, row_it);\n}\n\n/**\n * @name make_single_row\n *\n * Arrange the blobs into a single row... 
well actually, if there is\n * only a single blob, it makes 2 rows, in case the top-level blob\n * is a container of the real blobs to recognize.\n */\nfloat make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block,\n                      TO_BLOCK_LIST *blocks) {\n  BLOBNBOX_IT blob_it = &block->blobs;\n  TO_ROW_IT row_it = block->get_rows();\n\n  // Include all the small blobs and large blobs.\n  blob_it.add_list_after(&block->small_blobs);\n  blob_it.add_list_after(&block->noise_blobs);\n  blob_it.add_list_after(&block->large_blobs);\n  if (block->blobs.singleton() && allow_sub_blobs) {\n    blob_it.move_to_first();\n    float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);\n    if (size > block->line_size) {\n      block->line_size = size;\n    }\n  } else if (block->blobs.empty()) {\n    // Make a fake blob.\n    C_BLOB *blob = C_BLOB::FakeBlob(block->block->pdblk.bounding_box());\n    // The blobnbox owns the blob.\n    auto *bblob = new BLOBNBOX(blob);\n    blob_it.add_after_then_move(bblob);\n  }\n  MakeRowFromBlobs(block->line_size, &blob_it, &row_it);\n  // Fit an LMS line to the rows.\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    fit_lms_line(row_it.data());\n  }\n  float gradient;\n  float fit_error;\n  // Compute the skew based on the fitted line.\n  compute_page_skew(blocks, gradient, fit_error);\n  return gradient;\n}\n\n/**\n * @name make_rows\n *\n * Arrange the blobs into rows.\n */\nfloat make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) {\n  float port_m;         // global skew\n  float port_err;       // global noise\n  TO_BLOCK_IT block_it; // iterator\n\n  block_it.set_to_list(port_blocks);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f), !textord_test_landscape);\n  }\n  // compute globally\n  compute_page_skew(port_blocks, port_m, port_err);\n  
block_it.set_to_list(port_blocks);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f),\n                        block_it.data()->block->pdblk.bounding_box().left(),\n                        !textord_test_landscape);\n  }\n  return port_m; // global skew\n}\n\n/**\n * @name make_initial_textrows\n *\n * Arrange the good blobs into rows of text.\n */\nvoid make_initial_textrows( // find lines\n    ICOORD page_tr,\n    TO_BLOCK *block, // block to do\n    FCOORD rotation, // for drawing\n    bool testing_on  // correct orientation\n) {\n  TO_ROW_IT row_it = block->get_rows();\n\n#ifndef GRAPHICS_DISABLED\n  ScrollView::Color colour; // of row\n\n  if (textord_show_initial_rows && testing_on) {\n    if (to_win == nullptr) {\n      create_to_win(page_tr);\n    }\n  }\n#endif\n  // guess skew\n  assign_blobs_to_rows(block, nullptr, 0, true, true, textord_show_initial_rows && testing_on);\n  row_it.move_to_first();\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    fit_lms_line(row_it.data());\n  }\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_initial_rows && testing_on) {\n    colour = ScrollView::RED;\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      plot_to_row(row_it.data(), colour, rotation);\n      colour = static_cast<ScrollView::Color>(colour + 1);\n      if (colour > ScrollView::MAGENTA) {\n        colour = ScrollView::RED;\n      }\n    }\n  }\n#endif\n}\n\n/**\n * @name fit_lms_line\n *\n * Fit an LMS line to a row.\n */\nvoid fit_lms_line(TO_ROW *row) {\n  float m, c; // fitted line\n  tesseract::DetLineFit lms;\n  BLOBNBOX_IT blob_it = row->blob_list();\n\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    const TBOX &box = blob_it.data()->bounding_box();\n    lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));\n  }\n  double error = lms.Fit(&m, 
&c);\n  row->set_line(m, c, error);\n}\n\n/**\n * @name compute_page_skew\n *\n * Compute the skew over a full page by averaging the gradients over\n * all the lines. Get the error of the same row.\n */\nvoid compute_page_skew(    // get average gradient\n    TO_BLOCK_LIST *blocks, // list of blocks\n    float &page_m,         // average gradient\n    float &page_err        // average error\n) {\n  int32_t row_count;             // total rows\n  int32_t blob_count;            // total_blobs\n  int32_t row_err;               // integer error\n  int32_t row_index;             // of total\n  TO_ROW *row;                   // current row\n  TO_BLOCK_IT block_it = blocks; // iterator\n\n  row_count = 0;\n  blob_count = 0;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();\n    if (pb != nullptr && !pb->IsText()) {\n      continue; // Pretend non-text blocks don't exist.\n    }\n    row_count += block_it.data()->get_rows()->length();\n    // count up rows\n    TO_ROW_IT row_it(block_it.data()->get_rows());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      blob_count += row_it.data()->blob_list()->length();\n    }\n  }\n  if (row_count == 0) {\n    page_m = 0.0f;\n    page_err = 0.0f;\n    return;\n  }\n  // of rows\n  std::vector<float> gradients(blob_count);\n  // of rows\n  std::vector<float> errors(blob_count);\n\n  row_index = 0;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();\n    if (pb != nullptr && !pb->IsText()) {\n      continue; // Pretend non-text blocks don't exist.\n    }\n    TO_ROW_IT row_it(block_it.data()->get_rows());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      row = row_it.data();\n      blob_count = row->blob_list()->length();\n      row_err = 
static_cast<int32_t>(std::ceil(row->line_error()));\n      if (row_err <= 0) {\n        row_err = 1;\n      }\n      if (textord_biased_skewcalc) {\n        blob_count /= row_err;\n        for (blob_count /= row_err; blob_count > 0; blob_count--) {\n          gradients[row_index] = row->line_m();\n          errors[row_index] = row->line_error();\n          row_index++;\n        }\n      } else if (blob_count >= textord_min_blobs_in_row) {\n        // get gradient\n        gradients[row_index] = row->line_m();\n        errors[row_index] = row->line_error();\n        row_index++;\n      }\n    }\n  }\n  if (row_index == 0) {\n    // desperate\n    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n      POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();\n      if (pb != nullptr && !pb->IsText()) {\n        continue; // Pretend non-text blocks don't exist.\n      }\n      TO_ROW_IT row_it(block_it.data()->get_rows());\n      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n        row = row_it.data();\n        gradients[row_index] = row->line_m();\n        errors[row_index] = row->line_error();\n        row_index++;\n      }\n    }\n  }\n  row_count = row_index;\n  row_index = static_cast<int32_t>(row_count * textord_skew_ile);\n  gradients.resize(row_count);\n  std::nth_element(gradients.begin(), gradients.begin() + row_index, gradients.end());\n  page_m = gradients[row_index];\n  row_index = static_cast<int32_t>(row_count * textord_skew_ile);\n  errors.resize(row_count);\n  std::nth_element(errors.begin(), errors.begin() + row_index, errors.end());\n  page_err = errors[row_index];\n}\n\nconst double kNoiseSize = 0.5; // Fraction of xheight.\nconst int kMinSize = 8;        // Min pixels to be xheight.\n\n/**\n * Return true if the dot looks like it is part of the i.\n * Doesn't work for any other diacritical.\n */\nstatic bool dot_of_i(BLOBNBOX *dot, BLOBNBOX *i, TO_ROW *row) {\n  const TBOX &ibox = 
i->bounding_box();\n  const TBOX &dotbox = dot->bounding_box();\n\n  // Must overlap horizontally by enough and be high enough.\n  int overlap = std::min(dotbox.right(), ibox.right()) - std::max(dotbox.left(), ibox.left());\n  if (ibox.height() <= 2 * dotbox.height() ||\n      (overlap * 2 < ibox.width() && overlap < dotbox.width())) {\n    return false;\n  }\n\n  // If the i is tall and thin then it is good.\n  if (ibox.height() > ibox.width() * 2) {\n    return true; // The i or ! must be tall and thin.\n  }\n\n  // It might still be tall and thin, but it might be joined to something.\n  // So search the outline for a piece of large height close to the edges\n  // of the dot.\n  const double kHeightFraction = 0.6;\n  double target_height = std::min(dotbox.bottom(), ibox.top());\n  target_height -= row->line_m() * dotbox.left() + row->line_c();\n  target_height *= kHeightFraction;\n  int left_min = dotbox.left() - dotbox.width();\n  int middle = (dotbox.left() + dotbox.right()) / 2;\n  int right_max = dotbox.right() + dotbox.width();\n  int left_miny = 0;\n  int left_maxy = 0;\n  int right_miny = 0;\n  int right_maxy = 0;\n  bool found_left = false;\n  bool found_right = false;\n  bool in_left = false;\n  bool in_right = false;\n  C_BLOB *blob = i->cblob();\n  C_OUTLINE_IT o_it = blob->out_list();\n  for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {\n    C_OUTLINE *outline = o_it.data();\n    int length = outline->pathlength();\n    ICOORD pos = outline->start_pos();\n    for (int step = 0; step < length; pos += outline->step(step++)) {\n      int x = pos.x();\n      int y = pos.y();\n      if (x >= left_min && x < middle && !found_left) {\n        // We are in the left part so find min and max y.\n        if (in_left) {\n          if (y > left_maxy) {\n            left_maxy = y;\n          }\n          if (y < left_miny) {\n            left_miny = y;\n          }\n        } else {\n          left_maxy = left_miny = y;\n          in_left = true;\n  
      }\n      } else if (in_left) {\n        // We just left the left so look for size.\n        if (left_maxy - left_miny > target_height) {\n          if (found_right) {\n            return true;\n          }\n          found_left = true;\n        }\n        in_left = false;\n      }\n      if (x <= right_max && x > middle && !found_right) {\n        // We are in the right part so find min and max y.\n        if (in_right) {\n          if (y > right_maxy) {\n            right_maxy = y;\n          }\n          if (y < right_miny) {\n            right_miny = y;\n          }\n        } else {\n          right_maxy = right_miny = y;\n          in_right = true;\n        }\n      } else if (in_right) {\n        // We just left the right so look for size.\n        if (right_maxy - right_miny > target_height) {\n          if (found_left) {\n            return true;\n          }\n          found_right = true;\n        }\n        in_right = false;\n      }\n    }\n  }\n  return false;\n}\n\nvoid vigorous_noise_removal(TO_BLOCK *block) {\n  TO_ROW_IT row_it = block->get_rows();\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    TO_ROW *row = row_it.data();\n    BLOBNBOX_IT b_it = row->blob_list();\n    // Estimate the xheight on the row.\n    int max_height = 0;\n    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n      BLOBNBOX *blob = b_it.data();\n      if (blob->bounding_box().height() > max_height) {\n        max_height = blob->bounding_box().height();\n      }\n    }\n    STATS hstats(0, max_height);\n    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n      BLOBNBOX *blob = b_it.data();\n      int height = blob->bounding_box().height();\n      if (height >= kMinSize) {\n        hstats.add(blob->bounding_box().height(), 1);\n      }\n    }\n    float xheight = hstats.median();\n    // Delete small objects.\n    BLOBNBOX *prev = nullptr;\n    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); 
b_it.forward()) {\n      BLOBNBOX *blob = b_it.data();\n      const TBOX &box = blob->bounding_box();\n      if (box.height() < kNoiseSize * xheight) {\n        // Small so delete unless it looks like an i dot.\n        if (prev != nullptr) {\n          if (dot_of_i(blob, prev, row)) {\n            continue; // Looks OK.\n          }\n        }\n        if (!b_it.at_last()) {\n          BLOBNBOX *next = b_it.data_relative(1);\n          if (dot_of_i(blob, next, row)) {\n            continue; // Looks OK.\n          }\n        }\n        // It might be noise so get rid of it.\n        delete blob->remove_cblob();\n        delete b_it.extract();\n      } else {\n        prev = blob;\n      }\n    }\n  }\n}\n\n/**\n * cleanup_rows_making\n *\n * Remove overlapping rows and fit all the blobs to what's left.\n */\nvoid cleanup_rows_making( // find lines\n    ICOORD page_tr,       // top right\n    TO_BLOCK *block,      // block to do\n    float gradient,       // gradient to fit\n    FCOORD rotation,      // for drawing\n    int32_t block_edge,   // edge of block\n    bool testing_on       // correct orientation\n) {\n  // iterators\n  BLOBNBOX_IT blob_it = &block->blobs;\n  TO_ROW_IT row_it = block->get_rows();\n\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_parallel_rows && testing_on) {\n    if (to_win == nullptr) {\n      create_to_win(page_tr);\n    }\n  }\n#endif\n  // get row coords\n  fit_parallel_rows(block, gradient, rotation, block_edge,\n                    textord_show_parallel_rows && testing_on);\n  delete_non_dropout_rows(block, gradient, rotation, block_edge,\n                          textord_show_parallel_rows && testing_on);\n  expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);\n  blob_it.set_to_list(&block->blobs);\n  row_it.set_to_list(block->get_rows());\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    blob_it.add_list_after(row_it.data()->blob_list());\n  }\n  // give blobs back\n  
assign_blobs_to_rows(block, &gradient, 1, false, false, false);\n  // now new rows must be genuine\n  blob_it.set_to_list(&block->blobs);\n  blob_it.add_list_after(&block->large_blobs);\n  assign_blobs_to_rows(block, &gradient, 2, true, true, false);\n  // safe to use big ones now\n  blob_it.set_to_list(&block->blobs);\n  // throw all blobs in\n  blob_it.add_list_after(&block->noise_blobs);\n  blob_it.add_list_after(&block->small_blobs);\n  assign_blobs_to_rows(block, &gradient, 3, false, false, false);\n}\n\n/**\n * delete_non_dropout_rows\n *\n * Compute the linespacing and offset.\n */\nvoid delete_non_dropout_rows( // find lines\n    TO_BLOCK *block,          // block to do\n    float gradient,           // global skew\n    FCOORD rotation,          // deskew vector\n    int32_t block_edge,       // left edge\n    bool testing_on           // correct orientation\n) {\n  TBOX block_box; // deskewed block\n  int32_t max_y;  // in block\n  int32_t min_y;\n  int32_t line_index; // of scan line\n  int32_t line_count; // no of scan lines\n  int32_t distance;   // to drop-out\n  int32_t xleft;      // of block\n  int32_t ybottom;    // of block\n  TO_ROW *row;        // current row\n  TO_ROW_IT row_it = block->get_rows();\n  BLOBNBOX_IT blob_it = &block->blobs;\n\n  if (row_it.empty()) {\n    return; // empty block\n  }\n  block_box = deskew_block_coords(block, gradient);\n  xleft = block->block->pdblk.bounding_box().left();\n  ybottom = block->block->pdblk.bounding_box().bottom();\n  min_y = block_box.bottom() - 1;\n  max_y = block_box.top() + 1;\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    line_index = static_cast<int32_t>(std::floor(row_it.data()->intercept()));\n    if (line_index <= min_y) {\n      min_y = line_index - 1;\n    }\n    if (line_index >= max_y) {\n      max_y = line_index + 1;\n    }\n  }\n  line_count = max_y - min_y + 1;\n  if (line_count <= 0) {\n    return; // empty block\n  }\n  // change in occupation\n  
std::vector<int32_t> deltas(line_count);
  // of pixel coords
  std::vector<int32_t> occupation(line_count);

  compute_line_occupation(block, gradient, min_y, max_y, &occupation[0], &deltas[0]);
  compute_occupation_threshold(
      static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kDescenderFraction +
                                                       tesseract::CCStruct::kAscenderFraction))),
      static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kXHeightFraction +
                                                       tesseract::CCStruct::kAscenderFraction))),
      max_y - min_y + 1, &occupation[0], &deltas[0]);
#ifndef GRAPHICS_DISABLED
  if (testing_on) {
    draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]);
  }
#endif
  compute_dropout_distances(&occupation[0], &deltas[0], line_count);
  // Rows rejected by find_best_dropout_row are deleted and their blobs
  // handed back to blob_it's list.
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    line_index = static_cast<int32_t>(std::floor(row->intercept()));
    distance = deltas[line_index - min_y];
    if (find_best_dropout_row(row, distance, block->line_spacing / 2, line_index, &row_it,
                              testing_on)) {
#ifndef GRAPHICS_DISABLED
      if (testing_on) {
        plot_parallel_row(row, gradient, block_edge, ScrollView::WHITE, rotation);
      }
#endif
      blob_it.add_list_after(row_it.data()->blob_list());
      delete row_it.extract(); // too far away
    }
  }
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    blob_it.add_list_after(row_it.data()->blob_list());
  }
}

/**
 * @name find_best_dropout_row
 *
 * Delete this row if it has a neighbour with better dropout characteristics.
 * true is returned if the row should be deleted.
 */
bool find_best_dropout_row( // find neighbours
    TO_ROW *row,            // row to test
    int32_t distance,       // dropout dist
    float dist_limit,       // threshold distance
    int32_t line_index,     // index of row
    TO_ROW_IT *row_it,      // current position
    bool testing_on         // correct orientation
) {
  int32_t next_index; // of neighbouring row
  int32_t row_offset; // from current row
  int32_t abs_dist;   // absolute distance
  int8_t row_inc;     // increment to row_index
  TO_ROW *next_row;   // neighbouring row

  if (testing_on) {
    tprintf("Row at %g(%g), dropout dist=%d,", row->intercept(), row->parallel_c(), distance);
  }
  // The sign of the dropout distance selects the direction in which
  // neighbouring rows are scanned.
  if (distance < 0) {
    row_inc = 1;
    abs_dist = -distance;
  } else {
    row_inc = -1;
    abs_dist = distance;
  }
  if (abs_dist > dist_limit) {
    if (testing_on) {
      tprintf(" too far - deleting\n");
    }
    return true;
  }
  if ((distance < 0 && !row_it->at_last()) || (distance >= 0 && !row_it->at_first())) {
    row_offset = row_inc;
    // Scan successive neighbours while they sit either at line_index or at
    // the mirrored position line_index + 2*distance.
    do {
      next_row = row_it->data_relative(row_offset);
      next_index = static_cast<int32_t>(std::floor(next_row->intercept()));
      if ((distance < 0 && next_index < line_index &&
           next_index > line_index + distance + distance) ||
          (distance >= 0 && next_index > line_index &&
           next_index < line_index + distance + distance)) {
        if (testing_on) {
          tprintf(" nearer neighbour (%d) at %g\n", line_index + distance - next_index,
                  next_row->intercept());
        }
        return true; // other is nearer
      } else if (next_index == line_index || next_index == line_index + distance + distance) {
        if (row->believability() <= next_row->believability()) {
          if (testing_on) {
            tprintf(" equal but more believable at %g (%g/%g)\n", next_row->intercept(),
                    row->believability(), next_row->believability());
          }
          return true; // other is more believable
        }
      }
      row_offset += row_inc;
    } while ((next_index == line_index || next_index == line_index + distance + distance) &&
             row_offset < row_it->length());
    if (testing_on) {
      tprintf(" keeping\n");
    }
  }
  return false;
}

/**
 * @name deskew_block_coords
 *
 * Compute the bounding box of all the blobs in the block
 * if they were deskewed without actually doing it.
 */
TBOX deskew_block_coords( // block box
    TO_BLOCK *block,      // block to do
    float gradient        // global skew
) {
  TBOX result;     // block bounds
  TBOX blob_box;   // of block
  FCOORD rotation; // deskew vector
  float length;    // of gradient vector
  TO_ROW_IT row_it = block->get_rows();
  TO_ROW *row;         // current row
  BLOBNBOX *blob;      // current blob
  BLOBNBOX_IT blob_it; // iterator

  length = std::sqrt(gradient * gradient + 1);
  rotation = FCOORD(1 / length, -gradient / length);
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    blob_it.set_to_list(row->blob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      blob = blob_it.data();
      blob_box = blob->bounding_box();
      blob_box.rotate(rotation); // de-skew it
      result += blob_box;
    }
  }
  return result;
}

/**
 * @name compute_line_occupation
 *
 * Compute the pixel projection back on the y axis given the global
 * skew. 
Also compute the 1st derivative.\n */\nvoid compute_line_occupation( // project blobs\n    TO_BLOCK *block,          // block to do\n    float gradient,           // global skew\n    int32_t min_y,            // min coord in block\n    int32_t max_y,            // in block\n    int32_t *occupation,      // output projection\n    int32_t *deltas           // derivative\n) {\n  int32_t line_count; // maxy-miny+1\n  int32_t line_index; // of scan line\n  int index;          // array index for daft compilers\n  TO_ROW *row;        // current row\n  TO_ROW_IT row_it = block->get_rows();\n  BLOBNBOX *blob;      // current blob\n  BLOBNBOX_IT blob_it; // iterator\n  float length;        // of skew vector\n  TBOX blob_box;       // bounding box\n  FCOORD rotation;     // inverse of skew\n\n  line_count = max_y - min_y + 1;\n  length = std::sqrt(gradient * gradient + 1);\n  rotation = FCOORD(1 / length, -gradient / length);\n  for (line_index = 0; line_index < line_count; line_index++) {\n    deltas[line_index] = 0;\n  }\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    blob_it.set_to_list(row->blob_list());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      blob = blob_it.data();\n      blob_box = blob->bounding_box();\n      blob_box.rotate(rotation); // de-skew it\n      int32_t width = blob_box.right() - blob_box.left();\n      index = blob_box.bottom() - min_y;\n      ASSERT_HOST(index >= 0 && index < line_count);\n      // count transitions\n      deltas[index] += width;\n      index = blob_box.top() - min_y;\n      ASSERT_HOST(index >= 0 && index < line_count);\n      deltas[index] -= width;\n    }\n  }\n  occupation[0] = deltas[0];\n  for (line_index = 1; line_index < line_count; line_index++) {\n    occupation[line_index] = occupation[line_index - 1] + deltas[line_index];\n  }\n}\n\n/**\n * compute_occupation_threshold\n *\n * Compute thresholds for textline or not for the 
occupation array.
 */
void compute_occupation_threshold( // project blobs
    int32_t low_window,            // below result point
    int32_t high_window,           // above result point
    int32_t line_count,            // array sizes
    int32_t *occupation,           // input projection
    int32_t *thresholds            // output thresholds
) {
  int32_t line_index; // of thresholds line
  int32_t low_index;  // in occupation
  int32_t high_index; // in occupation
  int32_t sum;        // current average
  int32_t divisor;    // to get thresholds
  int32_t min_index;  // of min occ
  int32_t min_occ;    // min in locality
  int32_t test_index; // for finding min

  divisor = static_cast<int32_t>(ceil((low_window + high_window) / textord_occupancy_threshold));
  if (low_window + high_window < line_count) {
    // Prime the window sum over the first low_window + high_window entries.
    for (sum = 0, high_index = 0; high_index < low_window; high_index++) {
      sum += occupation[high_index];
    }
    for (low_index = 0; low_index < high_window; low_index++, high_index++) {
      sum += occupation[high_index];
    }
    min_occ = occupation[0];
    min_index = 0;
    for (test_index = 1; test_index < high_index; test_index++) {
      if (occupation[test_index] <= min_occ) {
        min_occ = occupation[test_index];
        min_index = test_index; // find min in region
      }
    }
    for (line_index = 0; line_index < low_window; line_index++) {
      thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
    }
    // same out to end
    // Slide the window up the array, updating the running sum and
    // tracking the window minimum.
    for (low_index = 0; high_index < line_count; low_index++, high_index++) {
      sum -= occupation[low_index];
      sum += occupation[high_index];
      if (occupation[high_index] <= min_occ) {
        // find min in region
        min_occ = occupation[high_index];
        min_index = high_index;
      }
      // lost min from region
      if (min_index <= low_index) {
        min_occ = occupation[low_index + 1];
        min_index = low_index + 1;
        for (test_index = low_index + 2; test_index <= high_index; test_index++) {
          if (occupation[test_index] <= min_occ) {
            min_occ = occupation[test_index];
            // find min in region
            min_index = test_index;
          }
        }
      }
      thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
    }
  } else {
    // Window covers the whole array: use the global sum and minimum.
    min_occ = occupation[0];
    min_index = 0;
    for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
      if (occupation[low_index] < min_occ) {
        min_occ = occupation[low_index];
        min_index = low_index;
      }
      sum += occupation[low_index];
    }
    line_index = 0;
  }
  for (; line_index < line_count; line_index++) {
    thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
  }
  // same out to end
}

/**
 * @name compute_dropout_distances
 *
 * Compute the distance from each coordinate to the nearest dropout.
 */
void compute_dropout_distances( // project blobs
    int32_t *occupation,        // input projection
    int32_t *thresholds,        // output thresholds
    int32_t line_count          // array sizes
) {
  int32_t line_index;     // of thresholds line
  int32_t distance;       // from prev dropout
  int32_t next_dist;      // to next dropout
  int32_t back_index;     // for back filling
  int32_t prev_threshold; // before overwrite

  distance = -line_count;
  line_index = 0;
  do {
    do {
      distance--;
      prev_threshold = thresholds[line_index];
      // distance from prev
      thresholds[line_index] = distance;
      line_index++;
    } while (line_index < line_count && (occupation[line_index] < thresholds[line_index] ||
                                         occupation[line_index - 1] >= prev_threshold));
    if (line_index < line_count) {
      // Back-fill entries closer to the upcoming dropout with positive
      // distances to it.
      back_index = line_index - 1;
      next_dist = 1;
      while (next_dist < -distance && back_index >= 0) {
        
thresholds[back_index] = next_dist;
        back_index--;
        next_dist++;
        distance++;
      }
      distance = 1;
    }
  } while (line_index < line_count);
}

/**
 * @name expand_rows
 *
 * Expand each row to the least of its allowed size and touching its
 * neighbours. If the expansion would entirely swallow a neighbouring row
 * then do so.
 */
void expand_rows(       // find lines
    ICOORD page_tr,     // top right
    TO_BLOCK *block,    // block to do
    float gradient,     // gradient to fit
    FCOORD rotation,    // for drawing
    int32_t block_edge, // edge of block
    bool testing_on     // correct orientation
) {
  bool swallowed_row;    // eaten a neighbour
  float y_max, y_min;    // new row limits
  float y_bottom, y_top; // allowed limits
  TO_ROW *test_row;      // next row
  TO_ROW *row;           // current row
                         // iterators
  BLOBNBOX_IT blob_it = &block->blobs;
  TO_ROW_IT row_it = block->get_rows();

#ifndef GRAPHICS_DISABLED
  if (textord_show_expanded_rows && testing_on) {
    if (to_win == nullptr) {
      create_to_win(page_tr);
    }
  }
#endif

  adjust_row_limits(block); // shift min,max.
  if (textord_new_initial_xheight) {
    if (block->get_rows()->empty()) {
      return;
    }
    compute_row_stats(block, textord_show_expanded_rows && testing_on);
  }
  assign_blobs_to_rows(block, &gradient, 4, true, false, false);
  // get real membership
  if (block->get_rows()->empty()) {
    return;
  }
  fit_parallel_rows(block, gradient, rotation, block_edge,
                    textord_show_expanded_rows && testing_on);
  if (!textord_new_initial_xheight) {
    compute_row_stats(block, textord_show_expanded_rows && testing_on);
  }
  row_it.move_to_last();
  do {
    row = row_it.data();
    y_max = row->max_y(); // get current limits
    y_min = row->min_y();
    y_bottom = row->intercept() - block->line_size * textord_expansion_factor *
                                      tesseract::CCStruct::kDescenderFraction;
    y_top = row->intercept() +
            block->line_size * textord_expansion_factor *
                (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction);
    // Try to expand the row downwards towards y_bottom: swallow any row
    // that lies wholly inside the expansion, or truncate at the first
    // touching row.
    if (y_min > y_bottom) { // expansion allowed
      if (textord_show_expanded_rows && testing_on) {
        tprintf("Expanding bottom of row at %f from %f to %f\n", row->intercept(), y_min, y_bottom);
      }
      // expandable
      swallowed_row = true;
      while (swallowed_row && !row_it.at_last()) {
        swallowed_row = false;
        // get next one
        test_row = row_it.data_relative(1);
        // overlaps space
        if (test_row->max_y() > y_bottom) {
          if (test_row->min_y() > y_bottom) {
            if (textord_show_expanded_rows && testing_on) {
              tprintf("Eating row below at %f\n", test_row->intercept());
            }
            row_it.forward();
#ifndef GRAPHICS_DISABLED
            if (textord_show_expanded_rows && testing_on) {
              plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation);
            }
#endif
            blob_it.set_to_list(row->blob_list());
            blob_it.add_list_after(test_row->blob_list());
            // swallow complete row
            delete row_it.extract();
            row_it.backward();
            swallowed_row = true;
          } else if (test_row->max_y() < y_min) {
            // shorter limit
            y_bottom = test_row->max_y();
            if (textord_show_expanded_rows && testing_on) {
              tprintf("Truncating limit to %f due to touching row at %f\n", y_bottom,
                      test_row->intercept());
            }
          } else {
            y_bottom = y_min; // can't expand it
            if (textord_show_expanded_rows && testing_on) {
              tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_bottom,
                      test_row->intercept());
            }
          }
        }
      }
      y_min = y_bottom; // expand it
    }
    // Then try to expand upwards towards y_top in the same way.
    if (y_max < y_top) { // expansion allowed
      if (textord_show_expanded_rows && testing_on) {
        tprintf("Expanding top of row at %f from %f to %f\n", row->intercept(), y_max, y_top);
      }
      swallowed_row = true;
      while (swallowed_row && !row_it.at_first()) {
        swallowed_row = false;
        // get one above
        test_row = row_it.data_relative(-1);
        if (test_row->min_y() < y_top) {
          if (test_row->max_y() < y_top) {
            if (textord_show_expanded_rows && testing_on) {
              tprintf("Eating row above at %f\n", test_row->intercept());
            }
            row_it.backward();
            blob_it.set_to_list(row->blob_list());
#ifndef GRAPHICS_DISABLED
            if (textord_show_expanded_rows && testing_on) {
              plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation);
            }
#endif
            blob_it.add_list_after(test_row->blob_list());
            // swallow complete row
            delete row_it.extract();
            row_it.forward();
            swallowed_row = true;
          } else if (test_row->min_y() < y_max) {
            // shorter limit
            y_top = test_row->min_y();
            if (textord_show_expanded_rows && testing_on) {
              tprintf("Truncating limit to %f due to touching row at %f\n", y_top,
                      test_row->intercept());
            }
          } else {
            y_top = y_max; // can't expand it
            if (textord_show_expanded_rows && testing_on) {
              tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_top,
                      test_row->intercept());
            }
          }
        }
      }
      y_max = y_top;
    }
    // new limits
    
row->set_limits(y_min, y_max);\n    row_it.backward();\n  } while (!row_it.at_last());\n}\n\n/**\n * adjust_row_limits\n *\n * Change the limits of rows to suit the default fractions.\n */\nvoid adjust_row_limits( // tidy limits\n    TO_BLOCK *block     // block to do\n) {\n  TO_ROW *row; // current row\n  float size;  // size of row\n  float ymax;  // top of row\n  float ymin;  // bottom of row\n  TO_ROW_IT row_it = block->get_rows();\n\n  if (textord_show_expanded_rows) {\n    tprintf(\"Adjusting row limits for block(%d,%d)\\n\", block->block->pdblk.bounding_box().left(),\n            block->block->pdblk.bounding_box().top());\n  }\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    size = row->max_y() - row->min_y();\n    if (textord_show_expanded_rows) {\n      tprintf(\"Row at %f has min %f, max %f, size %f\\n\", row->intercept(), row->min_y(),\n              row->max_y(), size);\n    }\n    size /= tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction +\n            tesseract::CCStruct::kDescenderFraction;\n    ymax = size * (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction);\n    ymin = -size * tesseract::CCStruct::kDescenderFraction;\n    row->set_limits(row->intercept() + ymin, row->intercept() + ymax);\n    row->merged = false;\n  }\n}\n\n/**\n * @name compute_row_stats\n *\n * Compute the linespacing and offset.\n */\nvoid compute_row_stats( // find lines\n    TO_BLOCK *block,    // block to do\n    bool testing_on     // correct orientation\n) {\n  int32_t row_index; // of median\n  TO_ROW *row;       // current row\n  TO_ROW *prev_row;  // previous row\n  float iqr;         // inter quartile range\n  TO_ROW_IT row_it = block->get_rows();\n  // number of rows\n  int16_t rowcount = row_it.length();\n  // for choose nth\n  std::vector<TO_ROW *> rows(rowcount);\n  rowcount = 0;\n  prev_row = nullptr;\n  row_it.move_to_last(); // start at bottom\n  
do {
    row = row_it.data();
    if (prev_row != nullptr) {
      rows[rowcount++] = prev_row;
      prev_row->spacing = row->intercept() - prev_row->intercept();
      if (prev_row->spacing < 0.1 && prev_row->spacing > -0.1) {
        // Avoid small spacing values which give a small disp_quant_factor_.
        // That can cause large memory allocations with out-of-memory.
        prev_row->spacing = 0;
      }
      if (testing_on) {
        tprintf("Row at %g yields spacing of %g\n", row->intercept(), prev_row->spacing);
      }
    }
    prev_row = row;
    row_it.backward();
  } while (!row_it.at_last());
  block->key_row = prev_row;
  block->baseline_offset = std::fmod(prev_row->parallel_c(), block->line_spacing);
  if (testing_on) {
    tprintf("Blob based spacing=(%g,%g), offset=%g", block->line_size, block->line_spacing,
            block->baseline_offset);
  }
  if (rowcount > 0) {
    rows.resize(rowcount);
    // iqr = 3rd quartile - 1st quartile of the row spacings, with the
    // quartiles and the median selected via nth_element.
    row_index = rowcount * 3 / 4;
    std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
    iqr = rows[row_index]->spacing;
    row_index = rowcount / 4;
    std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
    iqr -= rows[row_index]->spacing;
    row_index = rowcount / 2;
    std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
    block->key_row = rows[row_index];
    if (testing_on) {
      tprintf(" row based=%g(%g)", rows[row_index]->spacing, iqr);
    }
    if (rowcount > 2 && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
      if (!textord_new_initial_xheight) {
        if (rows[row_index]->spacing < block->line_spacing &&
            rows[row_index]->spacing > block->line_size) {
          // within range
          block->line_size = rows[row_index]->spacing;
          // spacing=size
        } else if (rows[row_index]->spacing > block->line_spacing) {
          block->line_size = block->line_spacing;
        }
        // too big so use max
      } else {
        if (rows[row_index]->spacing < block->line_spacing) {
          block->line_size = rows[row_index]->spacing;
        } else {
          block->line_size = block->line_spacing;
        }
        // too big so use max
      }
      if (block->line_size < textord_min_xheight) {
        block->line_size = (float)textord_min_xheight;
      }
      block->line_spacing = rows[row_index]->spacing;
      block->max_blob_size = block->line_spacing * textord_excess_blobsize;
    }
    block->baseline_offset = std::fmod(rows[row_index]->intercept(), block->line_spacing);
  }
  if (testing_on) {
    tprintf("\nEstimate line size=%g, spacing=%g, offset=%g\n", block->line_size,
            block->line_spacing, block->baseline_offset);
  }
}

/**
 * @name compute_block_xheight
 *
 * Compute the xheight of the individual rows, then correlate them
 * and interpret ascenderless lines, correcting xheights.
 *
 * First we compute our best guess of the x-height of each row independently
 * with compute_row_xheight(), which looks for a pair of commonly occurring
 * heights that could be x-height and ascender height. This function also
 * attempts to find descenders of lowercase letters (i.e. 
not the small
 * descenders that could appear in upper case letters as Q,J).
 *
 * After this computation each row falls into one of the following categories:
 * ROW_ASCENDERS_FOUND: we found xheight and ascender modes, so this must be
 *                      a regular row; we'll use its xheight to compute
 *                      xheight and ascrise estimates for the block
 * ROW_DESCENDERS_FOUND: no ascenders, so we do not have a high confidence in
 *                       the xheight of this row (don't use it for estimating
 *                       block xheight), but this row can't contain all caps
 * ROW_UNKNOWN: a row with no ascenders/descenders, could be all lowercase
 *              (or mostly lowercase for fonts with very few ascenders),
 *              all upper case or small caps
 * ROW_INVALID: no meaningful xheight could be found for this row
 *
 * We then run correct_row_xheight() and use the computed xheight and ascrise
 * averages to correct xheight values of the rows in ROW_DESCENDERS_FOUND,
 * ROW_UNKNOWN and ROW_INVALID categories.
 *
 */
void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
  TO_ROW *row; // current row
  float asc_frac_xheight = CCStruct::kAscenderFraction / CCStruct::kXHeightFraction;
  float desc_frac_xheight = CCStruct::kDescenderFraction / CCStruct::kXHeightFraction;
  int32_t min_height, max_height; // limits on xheight
  TO_ROW_IT row_it = block->get_rows();
  if (row_it.empty()) {
    return; // no rows
  }

  // Compute the best guess of xheight of each row individually.
  // Use xheight and ascrise values of the rows where ascenders were found.
  get_min_max_xheight(block->line_size, &min_height, &max_height);
  STATS row_asc_xheights(min_height, max_height);
  STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
                        static_cast<int>(max_height * asc_frac_xheight));
  int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
  int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
  STATS row_asc_descdrop(min_desc_height, max_desc_height);
  STATS row_desc_xheights(min_height, max_height);
  STATS row_desc_descdrop(min_desc_height, max_desc_height);
  STATS row_cap_xheights(min_height, max_height);
  STATS row_cap_floating_xheights(min_height, max_height);
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    // Compute the xheight of this row if it has not been computed before.
    if (row->xheight <= 0) {
      compute_row_xheight(row, block->block->classify_rotation(), gradient, block->line_size);
    }
    // Accumulate each row's measurements into the STATS matching its
    // category; samples are weighted by the row's xheight_evidence.
    ROW_CATEGORY row_category = get_row_category(row);
    if (row_category == ROW_ASCENDERS_FOUND) {
      row_asc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence);
      row_asc_ascrise.add(static_cast<int32_t>(row->ascrise), row->xheight_evidence);
      row_asc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence);
    } else if (row_category == ROW_DESCENDERS_FOUND) {
      row_desc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence);
      row_desc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence);
    } else if (row_category == ROW_UNKNOWN) {
      fill_heights(row, gradient, min_height, max_height, &row_cap_xheights,
                   &row_cap_floating_xheights);
    }
  }

  float xheight = 0.0;
  float ascrise = 0.0;
  float descdrop = 0.0;
  // Compute our best guess of xheight of this block.
  if (row_asc_xheights.get_total() > 0) {
    // Determine xheight from rows where ascenders were found.
    xheight = row_asc_xheights.median();
    ascrise = row_asc_ascrise.median();
    descdrop = -row_asc_descdrop.median();
  } else if (row_desc_xheights.get_total() > 0) {
    // Determine xheight from rows where descenders were found.
    xheight = row_desc_xheights.median();
    descdrop = -row_desc_descdrop.median();
  } else if (row_cap_xheights.get_total() > 0) {
    // All the rows in the block were (a/de)scenderless.
    // Try to search for two modes in row_cap_heights that could
    // be the xheight and the capheight (e.g. some of the rows
    // were lowercase, but did not have enough (a/de)scenders.
    // If such two modes cannot be found, this block is most
    // likely all caps (or all small caps, in which case the code
    // still works as intended).
    compute_xheight_from_modes(
        &row_cap_xheights, &row_cap_floating_xheights,
        textord_single_height_mode && block->block->classify_rotation().y() == 0.0, min_height,
        max_height, &(xheight), &(ascrise));
    if (ascrise == 0) { // assume only caps in the whole block
      xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio;
    }
  } else { // default block sizes
    xheight = block->line_size * CCStruct::kXHeightFraction;
  }
  // Correct xheight, ascrise and descdrop if necessary.
  // A clamped xheight invalidates the measured ascrise/descdrop, so they
  // are rederived from the standard fractions in that case.
  bool corrected_xheight = false;
  if (xheight < textord_min_xheight) {
    xheight = static_cast<float>(textord_min_xheight);
    corrected_xheight = true;
  }
  if (corrected_xheight || ascrise <= 0) {
    ascrise = xheight * asc_frac_xheight;
  }
  if (corrected_xheight || descdrop >= 0) {
    descdrop = -(xheight * desc_frac_xheight);
  }
  block->xheight = xheight;

  if (textord_debug_xheights) {
    tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n", xheight, ascrise,
            descdrop);
  }
  // Correct xheight, ascrise, descdrop of rows based on block averages.
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
  }
}

/**
 * @name compute_row_xheight
 *
 * Estimate the xheight of this row.
 * Compute the ascender rise and descender drop at the same time.
 * Set 
xheight_evidence to the number of blobs with the chosen xheight
 * that appear in this row.
 */
void Textord::compute_row_xheight(TO_ROW *row, // row to do
                                  const FCOORD &rotation,
                                  float gradient, // global skew
                                  int block_line_size) {
  // Find blobs representing repeated characters in rows and mark them.
  // This information is used for computing row xheight and at a later
  // stage when words are formed by make_words.
  if (!row->rep_chars_marked()) {
    mark_repeated_chars(row);
  }

  int min_height, max_height;
  get_min_max_xheight(block_line_size, &min_height, &max_height);
  STATS heights(min_height, max_height);
  STATS floating_heights(min_height, max_height);
  fill_heights(row, gradient, min_height, max_height, &heights, &floating_heights);
  row->ascrise = 0.0f;
  row->xheight = 0.0f;
  row->xheight_evidence = compute_xheight_from_modes(
      &heights, &floating_heights, textord_single_height_mode && rotation.y() == 0.0, min_height,
      max_height, &(row->xheight), &(row->ascrise));
  row->descdrop = 0.0f;
  if (row->xheight > 0) {
    row->descdrop =
        static_cast<float>(compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
  }
}

/**
 * @name fill_heights
 *
 * Fill the given heights with heights of the blobs that are legal
 * candidates for estimating xheight.
 */
void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights,
                  STATS *floating_heights) {
  float xcentre;  // centre of blob
  float top;      // top y coord of blob
  float height;   // height of blob
  BLOBNBOX *blob; // current blob
  int repeated_set;
  BLOBNBOX_IT blob_it = row->blob_list();
  if (blob_it.empty()) {
    return; // no blobs in this row
  }
  bool has_rep_chars = row->rep_chars_marked() && row->num_repeated_sets() > 0;
  do {
    blob = blob_it.data();
    if (!blob->joined_to_prev()) {
      xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f;
      top = blob->bounding_box().top();
      height = blob->bounding_box().height();
      // Make top relative to the baseline at the blob's centre.
      if (textord_fix_xheight_bug) {
        top -= row->baseline.y(xcentre);
      } else {
        top -= gradient * xcentre + row->parallel_c();
      }
      if (top >= min_height && top <= max_height) {
        heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
        // A short blob with a high top is "floating" well above the
        // baseline, and is tallied separately.
        if (height / top < textord_min_blob_height_fraction) {
          floating_heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
        }
      }
    }
    // Skip repeated chars, since they are likely to skew the height stats.
    if (has_rep_chars && blob->repeated_set() != 0) {
      repeated_set = blob->repeated_set();
      blob_it.forward();
      while (!blob_it.at_first() && blob_it.data()->repeated_set() == repeated_set) {
        blob_it.forward();
        if (textord_debug_xheights) {
          tprintf("Skipping repeated char when computing xheight\n");
        }
      }
    } else {
      blob_it.forward();
    }
  } while (!blob_it.at_first());
}

/**
 * @name compute_xheight_from_modes
 *
 * Given a STATS object heights, looks for two most frequently occurring
 * heights that look like xheight and xheight + ascrise. If found, sets
 * the values of *xheight and *ascrise accordingly, otherwise sets xheight
 * to any most frequently occurring height and sets *ascrise to 0.
 * Returns the number of times xheight occurred in heights.
 * For each mode that is considered for being an xheight the count of
 * floating blobs (stored in floating_heights) is subtracted from the
 * total count of the blobs of this height. This is done because blobs
 * that sit far above the baseline could represent valid ascenders, but
 * it is highly unlikely that such a character's height will be an xheight
 * (e.g.  
-, ', =, ^, `, ", ', etc)
 * If cap_only, then force finding of only the top mode.
 */
int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only,
                               int min_height, int max_height, float *xheight, float *ascrise) {
  int blob_index = heights->mode();                 // find mode
  int blob_count = heights->pile_count(blob_index); // get count of mode
  if (textord_debug_xheights) {
    tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n", min_height, max_height,
            blob_index, blob_count, heights->get_total());
    heights->print();
    floating_heights->print();
  }
  if (blob_count == 0) {
    return 0;
  }
  int modes[MAX_HEIGHT_MODES]; // biggest piles
  bool in_best_pile = false;
  int prev_size = -INT32_MAX;
  int best_count = 0;
  int mode_count = compute_height_modes(heights, min_height, max_height, modes, MAX_HEIGHT_MODES);
  if (cap_only && mode_count > 1) {
    mode_count = 1;
  }
  int x;
  if (textord_debug_xheights) {
    tprintf("found %d modes: ", mode_count);
    for (x = 0; x < mode_count; x++) {
      tprintf("%d ", modes[x]);
    }
    tprintf("\n");
  }

  // Try each mode as a candidate xheight and look for a higher mode whose
  // ratio to it lies in the legal ascender range.
  for (x = 0; x < mode_count - 1; x++) {
    if (modes[x] != prev_size + 1) {
      in_best_pile = false; // had empty height
    }
    int modes_x_count = heights->pile_count(modes[x]) - floating_heights->pile_count(modes[x]);
    if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
        (in_best_pile || modes_x_count > best_count)) {
      for (int asc = x + 1; asc < mode_count; asc++) {
        float ratio = static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
        if (textord_ascx_ratio_min < ratio && ratio < textord_ascx_ratio_max &&
            (heights->pile_count(modes[asc]) >= blob_count * textord_ascheight_mode_fraction)) {
          if (modes_x_count > best_count) {
            in_best_pile = true;
            best_count = modes_x_count;
          }
          if (textord_debug_xheights) {
            tprintf("X=%d, asc=%d, count=%d, ratio=%g\n", modes[x], modes[asc] - modes[x],
                    modes_x_count, ratio);
          }
          prev_size = modes[x];
          *xheight = static_cast<float>(modes[x]);
          *ascrise = static_cast<float>(modes[asc] - modes[x]);
        }
      }
    }
  }
  if (*xheight == 0) { // single mode
    // Remove counts of the "floating" blobs (the ones whose height is too
    // small in relation to its top end of the bounding box) from heights
    // before computing the single-mode xheight.
    // Restore the counts in heights after the mode is found, since
    // floating blobs might be useful for determining potential ascenders
    // in compute_row_descdrop().
    if (floating_heights->get_total() > 0) {
      for (x = min_height; x < max_height; ++x) {
        heights->add(x, -(floating_heights->pile_count(x)));
      }
      blob_index = heights->mode(); // find the modified mode
      for (x = min_height; x < max_height; ++x) {
        heights->add(x, floating_heights->pile_count(x));
      }
    }
    *xheight = static_cast<float>(blob_index);
    *ascrise = 0.0f;
    best_count = heights->pile_count(blob_index);
    if (textord_debug_xheights) {
      tprintf("Single mode xheight set to %g\n", *xheight);
    }
  } else if (textord_debug_xheights) {
    tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
  }
  return best_count;
}

/**
 * @name compute_row_descdrop
 *
 * Estimates the descdrop of this row. This function looks for
 * "significant" descenders of lowercase letters (those that could
 * not just be the small descenders of upper case letters like Q,J).
 * The function also takes into account how many potential ascenders
 * this row might contain. 
If the number of potential ascenders along
 * with descenders is close to the expected fraction of the total
 * number of blobs in the row, the function returns the descender
 * height, returns 0 otherwise.
 */
int32_t compute_row_descdrop(TO_ROW *row, float gradient, int xheight_blob_count,
                             STATS *asc_heights) {
  // Count how many potential ascenders are in this row.
  // Clamp the scanned bucket range of asc_heights to
  // [xheight * ascx_ratio_min, xheight * ascx_ratio_max].
  int i_min = asc_heights->min_bucket();
  if ((i_min / row->xheight) < textord_ascx_ratio_min) {
    i_min = static_cast<int>(floor(row->xheight * textord_ascx_ratio_min + 0.5));
  }
  int i_max = asc_heights->max_bucket();
  if ((i_max / row->xheight) > textord_ascx_ratio_max) {
    i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
  }
  int num_potential_asc = 0;
  for (int i = i_min; i <= i_max; ++i) {
    num_potential_asc += asc_heights->pile_count(i);
  }
  auto min_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_min + 0.5));
  auto max_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_max));
  float xcentre; // centre of blob
  float height;  // height of blob
  BLOBNBOX_IT blob_it = row->blob_list();
  BLOBNBOX *blob; // current blob
  STATS heights(min_height, max_height);
  // Histogram how far each leading blob's bottom falls below the fitted line.
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    blob = blob_it.data();
    if (!blob->joined_to_prev()) {
      xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f;
      height = (gradient * xcentre + row->parallel_c() - blob->bounding_box().bottom());
      if (height >= min_height && height <= max_height) {
        heights.add(static_cast<int>(floor(height + 0.5)), 1);
      }
    }
  }
  int blob_index = heights.mode();                 // find mode
  int blob_count = heights.pile_count(blob_index); // get count of mode
  float total_fraction = (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
  if (static_cast<float>(blob_count + num_potential_asc) < xheight_blob_count * total_fraction) {
    blob_count = 0; // not enough combined evidence of real descenders
  }
  int descdrop = blob_count > 0 ? -blob_index : 0;
  if (textord_debug_xheights) {
    tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n", descdrop, num_potential_asc,
            blob_count);
    heights.print();
  }
  return descdrop;
}

/**
 * @name compute_height_modes
 *
 * Find the top maxmodes values in the input array and put their
 * indices in the output in the order in which they occurred.
 */
int32_t compute_height_modes(STATS *heights,     // stats to search
                             int32_t min_height, // bottom of range
                             int32_t max_height, // top of range
                             int32_t *modes,     // output array
                             int32_t maxmodes) { // size of modes
  int32_t pile_count;                            // no in source pile
  int32_t src_count;                             // no of source entries
  int32_t src_index;                             // current entry
  int32_t least_count;                           // height of smalllest
  int32_t least_index;                           // index of least
  int32_t dest_count;                            // index in modes

  src_count = max_height + 1 - min_height;
  dest_count = 0;
  least_count = INT32_MAX;
  least_index = -1;
  for (src_index = 0; src_index < src_count; src_index++) {
    pile_count = heights->pile_count(min_height + src_index);
    if (pile_count > 0) {
      if (dest_count < maxmodes) {
        // Still room in the output: append, tracking the smallest pile so far.
        if (pile_count < least_count) {
          // find smallest in array
          least_count = pile_count;
          least_index = dest_count;
        }
        modes[dest_count++] = min_height + src_index;
      } else if (pile_count >= least_count) {
        // Output full: evict the current smallest entry, shifting the rest
        // down so the surviving modes keep their order of occurrence.
        while (least_index < maxmodes - 1) {
          modes[least_index] = modes[least_index + 1];
          // 
shuffle up
          least_index++;
        }
        // new one on end
        modes[maxmodes - 1] = min_height + src_index;
        if (pile_count == least_count) {
          // new smallest
          least_index = maxmodes - 1;
        } else {
          // Rescan the kept modes to locate the new smallest pile.
          least_count = heights->pile_count(modes[0]);
          least_index = 0;
          for (dest_count = 1; dest_count < maxmodes; dest_count++) {
            pile_count = heights->pile_count(modes[dest_count]);
            if (pile_count < least_count) {
              // find smallest
              least_count = pile_count;
              least_index = dest_count;
            }
          }
        }
      }
    }
  }
  return dest_count;
}

/**
 * @name correct_row_xheight
 *
 * Adjust the xheight etc of this row if not within reasonable limits
 * of the average for the block.
 */
void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdrop) {
  ROW_CATEGORY row_category = get_row_category(row);
  if (textord_debug_xheights) {
    tprintf(
        "correcting row xheight: row->xheight %.4f"
        ", row->acrise %.4f row->descdrop %.4f\n",
        row->xheight, row->ascrise, row->descdrop);
  }
  bool normal_xheight = within_error_margin(row->xheight, xheight, textord_xheight_error_margin);
  bool cap_xheight =
      within_error_margin(row->xheight, xheight + ascrise, textord_xheight_error_margin);
  // Use the average xheight/ascrise for the following cases:
  // -- the xheight of the row could not be determined at all
  // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
  //    and its xheight is close to either cap height or average xheight
  // -- the row does not have ascenders or descenders, but its xheight
  //    is close to the average block xheight (e.g. row with "www.mmm.com")
  if (row_category == ROW_ASCENDERS_FOUND) {
    if (row->descdrop >= 0) {
      // No descenders measured on this row: scale the block average
      // descdrop to this row's xheight.
      row->descdrop = row->xheight * (descdrop / xheight);
    }
  } else if (row_category == ROW_INVALID ||
             (row_category == ROW_DESCENDERS_FOUND && (normal_xheight || cap_xheight)) ||
             (row_category == ROW_UNKNOWN && normal_xheight)) {
    if (textord_debug_xheights) {
      tprintf("using average xheight\n");
    }
    row->xheight = xheight;
    row->ascrise = ascrise;
    row->descdrop = descdrop;
  } else if (row_category == ROW_DESCENDERS_FOUND) {
    // Assume this is a row with mostly lowercase letters and it's xheight
    // is computed correctly (unfortunately there is no way to distinguish
    // this from the case when descenders are found, but the most common
    // height is capheight).
    if (textord_debug_xheights) {
      tprintf("lowercase, corrected ascrise\n");
    }
    row->ascrise = row->xheight * (ascrise / xheight);
  } else if (row_category == ROW_UNKNOWN) {
    // Otherwise assume this row is an all-caps or small-caps row
    // and adjust xheight and ascrise of the row.

    row->all_caps = true;
    if (cap_xheight) { // regular all caps
      if (textord_debug_xheights) {
        tprintf("all caps\n");
      }
      row->xheight = xheight;
      row->ascrise = ascrise;
      row->descdrop = descdrop;
    } else { // small caps or caps with an odd xheight
      if (textord_debug_xheights) {
        if (row->xheight < xheight + ascrise && row->xheight > xheight) {
          tprintf("small caps\n");
        } else {
          tprintf("all caps with irregular xheight\n");
        }
      }
      // Split the measured row height into xheight + ascrise in the same
      // proportion as the block averages.
      row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
      row->xheight -= row->ascrise;
      row->descdrop = row->xheight * (descdrop / xheight);
    }
  }
  if (textord_debug_xheights) {
    tprintf(
        "corrected row->xheight = %.4f, 
row->acrise = %.4f, row->descdrop"
        " = %.4f\n",
        row->xheight, row->ascrise, row->descdrop);
  }
}

// Counts the blobs in blobs that are at least min_height tall and whose
// bounding box majorly overlaps box.
static int CountOverlaps(const TBOX &box, int min_height, BLOBNBOX_LIST *blobs) {
  int overlaps = 0;
  BLOBNBOX_IT blob_it(blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX *blob = blob_it.data();
    const TBOX &blob_box = blob->bounding_box();
    if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
      ++overlaps;
    }
  }
  return overlaps;
}

/**
 * @name separate_underlines
 *
 * Test wide objects for being potential underlines. If they are then
 * put them in a separate list in the block.
 */
void separate_underlines(TO_BLOCK *block,   // block to do
                         float gradient,    // skew angle
                         FCOORD rotation,   // inverse landscape
                         bool testing_on) { // correct orientation
  BLOBNBOX *blob;                           // current blob
  C_BLOB *rotated_blob;                     // rotated blob
  TO_ROW *row;                              // current row
  float length;                             // of g_vec
  TBOX blob_box;
  FCOORD blob_rotation; // inverse of rotation
  FCOORD g_vec;         // skew rotation
  BLOBNBOX_IT blob_it;  // iterator
                        // iterator
  BLOBNBOX_IT under_it = &block->underlines;
  BLOBNBOX_IT large_it = &block->large_blobs;
  TO_ROW_IT row_it = block->get_rows();
  int min_blob_height = static_cast<int>(textord_min_blob_height_fraction * block->line_size + 0.5);

  // length of vector
  length = std::sqrt(1 + gradient * gradient);
  g_vec = FCOORD(1 / length, -gradient / length);
  blob_rotation = FCOORD(rotation.x(), -rotation.y());
  blob_rotation.rotate(g_vec); // undoing everything
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    // get blobs
    blob_it.set_to_list(row->blob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      blob = blob_it.data();
      blob_box = blob->bounding_box();
      // Only blobs wider than a multiple of the line size are candidates.
      if (blob_box.width() > block->line_size * textord_underline_width) {
        ASSERT_HOST(blob->cblob() != nullptr);
        rotated_blob = crotate_cblob(blob->cblob(), blob_rotation);
        if (test_underline(testing_on && textord_show_final_rows, rotated_blob,
                           static_cast<int16_t>(row->intercept()),
                           static_cast<int16_t>(block->line_size *
                                                (tesseract::CCStruct::kXHeightFraction +
                                                 tesseract::CCStruct::kAscenderFraction / 2.0f)))) {
          under_it.add_after_then_move(blob_it.extract());
          if (testing_on && textord_show_final_rows) {
            tprintf("Underlined blob at:");
            rotated_blob->bounding_box().print();
            tprintf("Was:");
            blob_box.print();
          }
        } else if (CountOverlaps(blob->bounding_box(), min_blob_height, row->blob_list()) >
                   textord_max_blob_overlaps) {
          // Overlaps too many row blobs: move to the large-blob list instead.
          large_it.add_after_then_move(blob_it.extract());
          if (testing_on && textord_show_final_rows) {
            tprintf("Large blob overlaps %d blobs at:",
                    CountOverlaps(blob_box, min_blob_height, row->blob_list()));
            blob_box.print();
          }
        }
        delete rotated_blob;
      }
    }
  }
}

/**
 * @name pre_associate_blobs
 *
 * Associate overlapping blobs and fake chop wide blobs.
 */
void pre_associate_blobs( // make rough chars
    ICOORD page_tr,       // top right
    TO_BLOCK *block,      // block to do
    FCOORD rotation,      // inverse landscape
    bool testing_on       // correct orientation
) {
#ifndef GRAPHICS_DISABLED
  ScrollView::Color colour; // of 
boxes
#endif
  BLOBNBOX *blob;     // current blob
  BLOBNBOX *nextblob; // next in list
  TBOX blob_box;
  FCOORD blob_rotation; // inverse of rotation
  BLOBNBOX_IT blob_it;  // iterator
  BLOBNBOX_IT start_it; // iterator
  TO_ROW_IT row_it = block->get_rows();

#ifndef GRAPHICS_DISABLED
  colour = ScrollView::RED;
#endif

  blob_rotation = FCOORD(rotation.x(), -rotation.y());
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    // get blobs
    blob_it.set_to_list(row_it.data()->blob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      blob = blob_it.data();
      blob_box = blob->bounding_box();
      start_it = blob_it; // save start point
      //                      if (testing_on && textord_show_final_blobs)
      //                      {
      //                              tprintf("Blob at (%d,%d)->(%d,%d),
      //                              addr=%x, count=%d\n",
      //                                      blob_box.left(),blob_box.bottom(),
      //                                      blob_box.right(),blob_box.top(),
      //                                      (void*)blob,blob_it.length());
      //                      }
      bool overlap;
      // Greedily merge this blob with following blobs that overlap it in x,
      // growing blob_box as merges happen.
      do {
        overlap = false;
        if (!blob_it.at_last()) {
          nextblob = blob_it.data_relative(1);
          overlap = blob_box.major_x_overlap(nextblob->bounding_box());
          if (overlap) {
            blob->merge(nextblob);           // merge new blob
            blob_box = blob->bounding_box(); // get bigger box
            blob_it.forward();
          }
        }
      } while (overlap);
      blob->chop(&start_it, &blob_it, blob_rotation,
                 block->line_size * tesseract::CCStruct::kXHeightFraction * textord_chop_width);
      // attempt chop
    }
#ifndef GRAPHICS_DISABLED
    if (testing_on && textord_show_final_blobs) {
      if (to_win == nullptr) {
        create_to_win(page_tr);
      }
      to_win->Pen(colour);
      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
        blob = blob_it.data();
        blob_box = blob->bounding_box();
        blob_box.rotate(rotation);
        if (!blob->joined_to_prev()) {
          to_win->Rectangle(blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top());
        }
      }
      // Cycle through the palette, one colour per row.
      colour = static_cast<ScrollView::Color>(colour + 1);
      if (colour > ScrollView::MAGENTA) {
        colour = ScrollView::RED;
      }
    }
#endif
  }
}

/**
 * @name fit_parallel_rows
 *
 * Re-fit the rows in the block to the given gradient.
 */
void fit_parallel_rows( // find lines
    TO_BLOCK *block,    // block to do
    float gradient,     // gradient to fit
    FCOORD rotation,    // for drawing
    int32_t block_edge, // edge of block
    bool testing_on     // correct orientation
) {
#ifndef GRAPHICS_DISABLED
  ScrollView::Color colour; // of row
#endif
  TO_ROW_IT row_it = block->get_rows();

  row_it.move_to_first();
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    if (row_it.data()->blob_list()->empty()) {
      delete row_it.extract(); // nothing in it
    } else {
      fit_parallel_lms(gradient, row_it.data());
    }
  }
#ifndef GRAPHICS_DISABLED
  if (testing_on) {
    colour = ScrollView::RED;
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      plot_parallel_row(row_it.data(), gradient, block_edge, colour, rotation);
      colour = static_cast<ScrollView::Color>(colour + 1);
      if (colour > ScrollView::MAGENTA) {
        colour = ScrollView::RED;
      }
    }
  }
#endif
  row_it.sort(row_y_order); // may have gone out of order
}

/**
 * @name fit_parallel_lms
 *
 * Fit an LMS line to a row.
 * Make the fit parallel to the given gradient and set the
 * row accordingly.
 */
void 
fit_parallel_lms(float gradient, TO_ROW *row) {
  float c;       // fitted line
  int blobcount; // no of blobs
  tesseract::DetLineFit lms;
  BLOBNBOX_IT blob_it = row->blob_list();

  blobcount = 0;
  // Sample the bottom-centre of every leading (non-joined) blob.
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    if (!blob_it.data()->joined_to_prev()) {
      const TBOX &box = blob_it.data()->bounding_box();
      lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
      blobcount++;
    }
  }
  double error = lms.ConstrainedFit(gradient, &c);
  row->set_parallel_line(gradient, c, error);
  if (textord_straight_baselines && blobcount > textord_lms_line_trials) {
    // Enough samples: allow an unconstrained fit to adjust the gradient too.
    error = lms.Fit(&gradient, &c);
  }
  // set the other too
  row->set_line(gradient, c, error);
}

/**
 * @name make_spline_rows
 *
 * Re-fit the rows in the block to the given gradient.
 */
void Textord::make_spline_rows(TO_BLOCK *block, // block to do
                               float gradient,  // gradient to fit
                               bool testing_on) {
#ifndef GRAPHICS_DISABLED
  ScrollView::Color colour; // of row
  if (testing_on && to_win == nullptr) {
    create_to_win(page_tr_);
  }
#endif
  TO_ROW_IT row_it = block->get_rows();

  row_it.move_to_first();
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    if (row_it.data()->blob_list()->empty()) {
      delete row_it.extract(); // nothing in it
    } else {
      make_baseline_spline(row_it.data(), block);
    }
  }
  if (textord_old_baselines) {
#ifndef GRAPHICS_DISABLED
    if (testing_on) {
      colour = ScrollView::RED;
      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
        row_it.data()->baseline.plot(to_win, colour);
        colour = static_cast<ScrollView::Color>(colour + 1);
        if (colour > ScrollView::MAGENTA) {
          colour = ScrollView::RED;
        }
      }
    }
#endif
    make_old_baselines(block, testing_on, gradient);
  }
#ifndef GRAPHICS_DISABLED
  if (testing_on) {
    colour = ScrollView::RED;
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      row_it.data()->baseline.plot(to_win, colour);
      colour = static_cast<ScrollView::Color>(colour + 1);
      if (colour > ScrollView::MAGENTA) {
        colour = ScrollView::RED;
      }
    }
  }
#endif
}

/**
 * @name make_baseline_spline
 *
 * Build the baseline spline for a row: segment the baseline and, when the
 * row needs a curve (and the straight/parallel-baseline options are off),
 * fit a piecewise-linear spline; otherwise fall back to a single segment
 * using the row's fitted line.
 */
void make_baseline_spline(TO_ROW *row, // row to fit
                          TO_BLOCK *block) {
  double *coeffs;   // quadratic coeffs
  int32_t segments; // no of segments

  // spline boundaries
  auto *xstarts = new int32_t[row->blob_list()->length() + 1];
  if (segment_baseline(row, block, segments, xstarts) && !textord_straight_baselines &&
      !textord_parallel_baselines) {
    coeffs = linear_spline_baseline(row, block, segments, xstarts);
  } else {
    // Single straight segment from the already-fitted row line.
    xstarts[1] = xstarts[segments];
    segments = 1;
    coeffs = new double[3];
    coeffs[0] = 0;
    coeffs[1] = row->line_m();
    coeffs[2] = row->line_c();
  }
  row->baseline = QSPLINE(segments, xstarts, coeffs);
  delete[] coeffs;
  delete[] xstarts;
}

/**
 * @name segment_baseline
 *
 * Divide the baseline up into segments which require a different
 * quadratic fitted to them.
 * Return true if enough blobs were far enough away to need a quadratic.
 */
bool segment_baseline( // split baseline
    TO_ROW *row,       // row to fit
    TO_BLOCK *block,   // block it came from
    int32_t &segments, // no of segments
    int32_t *xstarts   // coords of segments
) {
  bool needs_curve; // needs curved line
  int blobcount;    // no of blobs
  int blobindex;    // current blob
  int last_state;   // above, on , below
  int state;        // of current blob
  float yshift;     // from baseline
  TBOX box;         // 
blob box
  TBOX new_box;     // new_it box
  float middle;     // xcentre of blob
                    // blobs
  BLOBNBOX_IT blob_it = row->blob_list();
  BLOBNBOX_IT new_it = blob_it; // front end
  SORTED_FLOATS yshifts;        // shifts from baseline

  needs_curve = false;
  box = box_next_pre_chopped(&blob_it);
  xstarts[0] = box.left();
  segments = 1;
  blobcount = row->blob_list()->length();
  if (textord_oldbl_debug) {
    tprintf("Segmenting baseline of %d blobs at (%d,%d)\n", blobcount, box.left(), box.bottom());
  }
  if (blobcount <= textord_spline_medianwin || blobcount < textord_spline_minblobs) {
    // Too few blobs for the median window: single straight segment.
    blob_it.move_to_last();
    box = blob_it.data()->bounding_box();
    xstarts[1] = box.right();
    return false;
  }
  last_state = 0;
  new_it.mark_cycle_pt();
  // Prime the sliding median window with the first medianwin shifts;
  // new_it runs ahead of blob_it by the window size.
  for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
    new_box = box_next_pre_chopped(&new_it);
    middle = (new_box.left() + new_box.right()) / 2.0;
    yshift = new_box.bottom() - row->line_m() * middle - row->line_c();
    // record shift
    yshifts.add(yshift, blobindex);
    if (new_it.cycled_list()) {
      xstarts[1] = new_box.right();
      return false;
    }
  }
  // NOTE(review): blobcount is reused from here on as a per-segment counter.
  for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++) {
    box = box_next_pre_chopped(&blob_it);
  }
  do {
    new_box = box_next_pre_chopped(&new_it);
    // get middle one
    yshift = yshifts[textord_spline_medianwin / 2];
    // Classify the median shift as above (+1), on (0), or below (-1) the line.
    if (yshift > textord_spline_shift_fraction * block->line_size) {
      state = 1;
    } else if (-yshift > textord_spline_shift_fraction * block->line_size) {
      state = -1;
    } else {
      state = 0;
    }
    if (state != 0) {
      needs_curve = true;
    }
    //              tprintf("State=%d, prev=%d, shift=%g\n",
    //                      state,last_state,yshift);
    if (state != last_state && blobcount > textord_spline_minblobs) {
      // State changed and the current segment is long enough: start a new one.
      xstarts[segments++] = box.left();
      blobcount = 0;
    }
    last_state = state;
    yshifts.remove(blobindex - textord_spline_medianwin);
    box = box_next_pre_chopped(&blob_it);
    middle = (new_box.left() + new_box.right()) / 2.0;
    yshift = new_box.bottom() - row->line_m() * middle - row->line_c();
    yshifts.add(yshift, blobindex);
    blobindex++;
    blobcount++;
  } while (!new_it.cycled_list());
  if (blobcount > textord_spline_minblobs || segments == 1) {
    xstarts[segments] = new_box.right();
  } else {
    // Final segment too short: merge it into the previous one.
    xstarts[--segments] = new_box.right();
  }
  if (textord_oldbl_debug) {
    tprintf("Made %d segments on row at (%d,%d)\n", segments, box.right(), box.bottom());
  }
  return needs_curve;
}

/**
 * @name linear_spline_baseline
 *
 * Fit a piecewise-linear baseline to the row, one straight piece per
 * segment of roughly textord_spline_medianwin blobs.
 * @return a new double[segments * 3] array of quadratic coefficients
 * (quadratic term always 0); ownership passes to the caller.
 */
double *linear_spline_baseline( // split baseline
    TO_ROW *row,                // row to fit
    TO_BLOCK *block,            // block it came from
    int32_t &segments,          // no of segments
    int32_t xstarts[]           // coords of segments
) {
  int blobcount;         // no of blobs
  int blobindex;         // current blob
  int index1, index2;    // blob numbers
  int blobs_per_segment; // blobs in each
  TBOX box;              // blob box
  TBOX new_box;          // new_it box
                         // blobs
  BLOBNBOX_IT blob_it = row->blob_list();
  BLOBNBOX_IT new_it = blob_it; // front end
  float b, c;                   // fitted curve
  tesseract::DetLineFit lms;
  int32_t segment; // current segment

  box = box_next_pre_chopped(&blob_it);
  xstarts[0] = box.left();
  blobcount = 1;
  while (!blob_it.at_first()) {
    blobcount++;
    box = box_next_pre_chopped(&blob_it);
  }
  segments = blobcount / textord_spline_medianwin;
  if (segments < 1) {
    segments = 1;
  }
  blobs_per_segment = 
blobcount / segments;
  // quadratic coeffs
  auto *coeffs = new double[segments * 3];
  if (textord_oldbl_debug) {
    tprintf(
        "Linear splining baseline of %d blobs at (%d,%d), into %d segments of "
        "%d blobs\n",
        blobcount, box.left(), box.bottom(), segments, blobs_per_segment);
  }
  segment = 1;
  // new_it leads blob_it by half a segment; the two iterators fit
  // alternating segments in the loop below.
  for (index2 = 0; index2 < blobs_per_segment / 2; index2++) {
    box_next_pre_chopped(&new_it);
  }
  index1 = 0;
  blobindex = index2;
  do {
    blobindex += blobs_per_segment;
    lms.Clear();
    while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
      box = box_next_pre_chopped(&blob_it);
      int middle = (box.left() + box.right()) / 2;
      lms.Add(ICOORD(middle, box.bottom()));
      index1++;
      if (index1 == blobindex - blobs_per_segment / 2 || index1 == blobcount - 1) {
        xstarts[segment] = box.left();
      }
    }
    lms.Fit(&b, &c);
    coeffs[segment * 3 - 3] = 0;
    coeffs[segment * 3 - 2] = b;
    coeffs[segment * 3 - 1] = c;
    segment++;
    if (segment > segments) {
      break;
    }

    blobindex += blobs_per_segment;
    lms.Clear();
    while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
      new_box = box_next_pre_chopped(&new_it);
      int middle = (new_box.left() + new_box.right()) / 2;
      lms.Add(ICOORD(middle, new_box.bottom()));
      index2++;
      if (index2 == blobindex - blobs_per_segment / 2 || index2 == blobcount - 1) {
        xstarts[segment] = new_box.left();
      }
    }
    lms.Fit(&b, &c);
    coeffs[segment * 3 - 3] = 0;
    coeffs[segment * 3 - 2] = b;
    coeffs[segment * 3 - 1] = c;
    segment++;
  } while (segment <= segments);
  return coeffs;
}

/**
 * @name assign_blobs_to_rows
 *
 * Make enough rows to allocate all the given blobs to one.
 * If a block skew is given, use that, else attempt to track it.
 */
void assign_blobs_to_rows( // find lines
    TO_BLOCK *block,       // block to do
    float *gradient,       // block skew
    int pass,              // identification
    bool reject_misses,    // chuck big ones out
    bool make_new_rows,    // add rows for unmatched
    bool drawing_skew      // draw smoothed skew
) {
  OVERLAP_STATE overlap_result; // what to do with it
  float ycoord;                 // current y
  float top, bottom;            // of blob
  float g_length = 1.0f;        // from gradient
  int16_t row_count;            // no of rows
  int16_t left_x;               // left edge
  int16_t last_x;               // previous edge
  float block_skew;             // y delta
  float smooth_factor;          // for new coords
  float near_dist;              // dist to nearest row
  ICOORD testpt;                // testing only
  BLOBNBOX *blob;               // current blob
  TO_ROW *row;                  // current row
  TO_ROW *dest_row = nullptr;   // row to put blob in
                                // iterators
  BLOBNBOX_IT blob_it = &block->blobs;
  TO_ROW_IT row_it = block->get_rows();

  ycoord =
      (block->block->pdblk.bounding_box().bottom() + block->block->pdblk.bounding_box().top()) /
      2.0f;
  if (gradient != nullptr) {
    g_length = std::sqrt(1 + *gradient * *gradient);
  }
#ifndef GRAPHICS_DISABLED
  if (drawing_skew) {
    to_win->SetCursor(block->block->pdblk.bounding_box().left(), ycoord);
  }
#endif
  testpt = ICOORD(textord_test_x, textord_test_y);
  blob_it.sort(blob_x_order);
  smooth_factor = 1.0;
  block_skew = 0.0f;
  row_count = row_it.length(); // might have rows
  if (!blob_it.empty()) {
    left_x = blob_it.data()->bounding_box().left();
  } else {
    left_x = block->block->pdblk.bounding_box().left();
  }
  last_x = left_x;
  // Walk the blobs left-to-right, deskewing each and assigning it to the
  // best-overlapping row (or making a new row when allowed).
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    blob = blob_it.data();
    if (gradient != nullptr) {
      block_skew = (1 - 1 / g_length) * blob->bounding_box().bottom() +
                   *gradient / g_length * blob->bounding_box().left();
    } else if (blob->bounding_box().left() - last_x > block->line_size / 2 &&
               last_x - left_x > block->line_size * 2 && textord_interpolating_skew) {
      //                      tprintf("Interpolating skew from %g",block_skew);
      block_skew *= static_cast<float>(blob->bounding_box().left() - left_x) / (last_x - left_x);
      //                      tprintf("to %g\n",block_skew);
    }
    last_x = blob->bounding_box().left();
    top = blob->bounding_box().top() - block_skew;
    bottom = blob->bounding_box().bottom() - block_skew;
#ifndef GRAPHICS_DISABLED
    if (drawing_skew) {
      to_win->DrawTo(blob->bounding_box().left(), ycoord + block_skew);
    }
#endif
    if (!row_it.empty()) {
      // Find the first row whose min_y is at or below the blob top.
      for (row_it.move_to_first(); !row_it.at_last() && row_it.data()->min_y() > top;
           row_it.forward()) {
      }
      row = row_it.data();
      if (row->min_y() <= top && row->max_y() >= bottom) {
        // any overlap
        dest_row = row;
        overlap_result = most_overlapping_row(&row_it, dest_row, top, bottom, block->line_size,
                                              blob->bounding_box().contains(testpt));
        if (overlap_result == NEW_ROW && !reject_misses) {
          overlap_result = ASSIGN;
        }
      } else {
        overlap_result = NEW_ROW;
        if (!make_new_rows) {
          // Can't create rows: try to stretch a neighbouring row instead.
          near_dist = row_it.data_relative(-1)->min_y() - top;
          // below bottom
          if (bottom < row->min_y()) {
            if (row->min_y() - bottom <= (block->line_spacing - block->line_size) *
                                             tesseract::CCStruct::kDescenderFraction) {
              // done it
              overlap_result = ASSIGN;
              dest_row = row;
            }
          } else if (near_dist > 0 && near_dist < bottom - row->max_y()) {
            row_it.backward();
            dest_row = row_it.data();
            if (dest_row->min_y() - bottom <= (block->line_spacing - block->line_size) *
                                                  tesseract::CCStruct::kDescenderFraction) {
              // done it
              overlap_result = ASSIGN;
            }
          } else {
            if (top - row->max_y() <=
                (block->line_spacing - block->line_size) *
                    (textord_overlap_x + tesseract::CCStruct::kAscenderFraction)) {
              // done it
              overlap_result = ASSIGN;
              dest_row = row;
            }
          }
        }
      }
      if (overlap_result == ASSIGN) {
        dest_row->add_blob(blob_it.extract(), top, bottom, block->line_size);
      }
      if (overlap_result == NEW_ROW) {
        if (make_new_rows && top - bottom < block->max_blob_size) {
          dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
          row_count++;
          if (bottom > row_it.data()->min_y()) {
            row_it.add_before_then_move(dest_row);
          // insert in right place
          } else {
            row_it.add_after_then_move(dest_row);
          }
          smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset);
        } else {
          overlap_result = REJECT;
        }
      }
    } else if (make_new_rows && top - bottom < block->max_blob_size) {
      overlap_result = NEW_ROW;
      dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
      row_count++;
      row_it.add_after_then_move(dest_row);
      smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset2);
    } else {
      overlap_result = REJECT;
    }
    if (blob->bounding_box().contains(testpt) && textord_debug_blob) {
      if (overlap_result != REJECT) {
        tprintf("Test blob assigned to row at (%g,%g) on pass %d\n", 
dest_row->min_y(),
                dest_row->max_y(), pass);
      } else {
        tprintf("Test blob assigned to no row on pass %d\n", pass);
      }
    }
    if (overlap_result != REJECT) {
      // Bubble the (possibly new) current row back into descending min_y
      // order relative to its neighbours.
      while (!row_it.at_first() && row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
        row = row_it.extract();
        row_it.backward();
        row_it.add_before_then_move(row);
      }
      while (!row_it.at_last() && row_it.data()->min_y() < row_it.data_relative(1)->min_y()) {
        row = row_it.extract();
        row_it.forward();
        // Keep rows in order.
        row_it.add_after_then_move(row);
      }
      BLOBNBOX_IT added_blob_it(dest_row->blob_list());
      added_blob_it.move_to_last();
      TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
      if (dest_row->blob_list()->singleton() || !prev_box.major_x_overlap(blob->bounding_box())) {
        // Smooth the running skew estimate with this blob's offset from the
        // row's initial bottom.
        block_skew = (1 - smooth_factor) * block_skew +
                     smooth_factor * (blob->bounding_box().bottom() - dest_row->initial_min_y());
      }
    }
  }
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    if (row_it.data()->blob_list()->empty()) {
      delete row_it.extract(); // Discard empty rows.
    }
  }
}

/**
 * @name most_overlapping_row
 *
 * Return the row which most overlaps the blob.
 */
OVERLAP_STATE most_overlapping_row( // find best row
    TO_ROW_IT *row_it,              // iterator
    TO_ROW *&best_row,              // output row
    float top,                      // top of blob
    float bottom,                   // bottom of blob
    float rowsize,                  // max row size
    bool testing_blob               // test stuff
) {
  OVERLAP_STATE result;          // result of tests
  float overlap;                 // of blob & row
  float bestover;                // nearest row
  float merge_top, merge_bottom; // size of merged row
  ICOORD testpt;                 // testing only
  TO_ROW *row;                   // current row
  TO_ROW *test_row;              // for multiple overlaps
  BLOBNBOX_IT blob_it;           // for merging rows

  result = ASSIGN;
  row = row_it->data();
  // bestover = vertical extent of the blob minus the parts sticking out of
  // the row, i.e. the blob/row y-overlap.
  bestover = top - bottom;
  if (top > row->max_y()) {
    bestover -= top - row->max_y();
  }
  if (bottom < row->min_y()) {
    // compute overlap
    bestover -= row->min_y() - bottom;
  }
  if (testing_blob && textord_debug_blob) {
    tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n", bottom, top, row->min_y(),
            row->max_y(), rowsize, bestover);
  }
  test_row = row;
  // Scan following rows that also contain the blob, merging pairs that
  // still fit within rowsize and tracking the biggest overlap.
  do {
    if (!row_it->at_last()) {
      row_it->forward();
      test_row = row_it->data();
      if (test_row->min_y() <= top && test_row->max_y() >= bottom) {
        merge_top = std::max(test_row->max_y(),row->max_y());
        merge_bottom = std::min(test_row->min_y(),row->min_y());
        if (merge_top - merge_bottom <= rowsize) {
          if (testing_blob && textord_debug_blob) {
            tprintf("Merging rows at (%g,%g), (%g,%g)\n", row->min_y(), row->max_y(),
                    test_row->min_y(), test_row->max_y());
          }
          test_row->set_limits(merge_bottom, merge_top);
          blob_it.set_to_list(test_row->blob_list());
          blob_it.add_list_after(row->blob_list());
          blob_it.sort(blob_x_order);
          row_it->backward();
          delete row_it->extract();
          row_it->forward();
          bestover = -1.0f; // force replacement
        }
        overlap = top - bottom;
        if (top > test_row->max_y()) {
          overlap -= top - test_row->max_y();
        }
        if (bottom < test_row->min_y()) {
          overlap -= test_row->min_y() - bottom;
        }
        if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
          // Blob fully overlaps two rows: ambiguous, reject it.
          result = REJECT;
        }
        if (overlap > bestover) {
          bestover = overlap; // find biggest overlap
          row = test_row;
        }
        if (testing_blob && textord_debug_blob) {
          tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n", bottom, top,
                  test_row->min_y(), test_row->max_y(), rowsize, overlap, bestover);
        }
      }
    }
  } while (!row_it->at_last() && test_row->min_y() <= top && test_row->max_y() >= bottom);
  while (row_it->data() != row) {
    row_it->backward(); // make it point to row
  }
                        // doesn't overlap much
  if (top - bottom - bestover > rowsize * textord_overlap_x &&
      (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x) && result == ASSIGN) {
    result = NEW_ROW; // doesn't overlap enough
  }
  best_row = row;
  return result;
}

/**
 * @name blob_x_order
 *
 * Sort function to sort blobs in x from page left.
 */
int blob_x_order(      // sort function
    const BLOBNBOX *blob1, // items to compare
    const BLOBNBOX *blob2) {
  if (blob1->bounding_box().left() < blob2->bounding_box().left()) {
    return -1;
  } else if (blob1->bounding_box().left() > blob2->bounding_box().left()) {
    return 1;
  } else {
    return 0;
  }
}

/**
 * @name mark_repeated_chars
 *
 * Mark blobs marked with BTFT_LEADER in repeated sets using the
 * repeated_set member of BLOBNBOX.
 */
void mark_repeated_chars(TO_ROW *row) {
  BLOBNBOX_IT box_it(row->blob_list()); // Iterator.
  int num_repeated_sets = 0;
  if (!box_it.empty()) {
    do {
      BLOBNBOX *bblob = box_it.data();
      int repeat_length = 1;
      if (bblob->flow() == BTFT_LEADER && !bblob->joined_to_prev() && bblob->cblob() != nullptr) {
        BLOBNBOX_IT test_it(box_it);
        for (test_it.forward(); !test_it.at_first();) {
          bblob = test_it.data();
          if (bblob->flow() != BTFT_LEADER) {
            break;
          }
          test_it.forward();
          bblob = test_it.data();
          if 
(bblob->joined_to_prev() || bblob->cblob() == nullptr) {\n            repeat_length = 0;\n            break;\n          }\n          ++repeat_length;\n        }\n      }\n      if (repeat_length >= kMinLeaderCount) {\n        num_repeated_sets++;\n        for (; repeat_length > 0; box_it.forward(), --repeat_length) {\n          bblob = box_it.data();\n          bblob->set_repeated_set(num_repeated_sets);\n        }\n      } else {\n        bblob->set_repeated_set(0);\n        box_it.forward();\n      }\n    } while (!box_it.at_first()); // until all done\n  }\n  row->set_num_repeated_sets(num_repeated_sets);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/makerow.h",
    "content": "/**********************************************************************\n * File:        makerow.h  (Formerly makerows.h)\n * Description: Code to arrange blobs into rows of text.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef MAKEROW_H\n#define MAKEROW_H\n\n#include \"blobbox.h\"\n#include \"blobs.h\"\n#include \"ocrblock.h\"\n#include \"params.h\"\n#include \"statistc.h\"\n\nnamespace tesseract {\n\nenum OVERLAP_STATE {\n  ASSIGN, // assign it to row\n  REJECT, // reject it - dual overlap\n  NEW_ROW\n};\n\nenum ROW_CATEGORY {\n  ROW_ASCENDERS_FOUND,\n  ROW_DESCENDERS_FOUND,\n  ROW_UNKNOWN,\n  ROW_INVALID,\n};\n\nextern BOOL_VAR_H(textord_heavy_nr);\nextern BOOL_VAR_H(textord_show_initial_rows);\nextern BOOL_VAR_H(textord_show_parallel_rows);\nextern BOOL_VAR_H(textord_show_expanded_rows);\nextern BOOL_VAR_H(textord_show_final_rows);\nextern BOOL_VAR_H(textord_show_final_blobs);\nextern BOOL_VAR_H(textord_test_landscape);\nextern BOOL_VAR_H(textord_parallel_baselines);\nextern BOOL_VAR_H(textord_straight_baselines);\nextern BOOL_VAR_H(textord_old_baselines);\nextern BOOL_VAR_H(textord_old_xheight);\nextern BOOL_VAR_H(textord_fix_xheight_bug);\nextern BOOL_VAR_H(textord_fix_makerow_bug);\nextern BOOL_VAR_H(textord_debug_xheights);\nextern INT_VAR_H(textord_test_x);\nextern 
INT_VAR_H(textord_test_y);\nextern INT_VAR_H(textord_min_blobs_in_row);\nextern INT_VAR_H(textord_spline_minblobs);\nextern INT_VAR_H(textord_spline_medianwin);\nextern INT_VAR_H(textord_min_xheight);\nextern double_VAR_H(textord_spline_shift_fraction);\nextern double_VAR_H(textord_skew_ile);\nextern double_VAR_H(textord_skew_lag);\nextern double_VAR_H(textord_linespace_iqrlimit);\nextern double_VAR_H(textord_width_limit);\nextern double_VAR_H(textord_chop_width);\nextern double_VAR_H(textord_minxh);\nextern double_VAR_H(textord_min_linesize);\nextern double_VAR_H(textord_excess_blobsize);\nextern double_VAR_H(textord_occupancy_threshold);\nextern double_VAR_H(textord_underline_width);\nextern double_VAR_H(textord_min_blob_height_fraction);\nextern double_VAR_H(textord_xheight_mode_fraction);\nextern double_VAR_H(textord_ascheight_mode_fraction);\nextern double_VAR_H(textord_ascx_ratio_min);\nextern double_VAR_H(textord_ascx_ratio_max);\nextern double_VAR_H(textord_descx_ratio_min);\nextern double_VAR_H(textord_descx_ratio_max);\nextern double_VAR_H(textord_xheight_error_margin);\nextern INT_VAR_H(textord_lms_line_trials);\nextern BOOL_VAR_H(textord_new_initial_xheight);\nextern BOOL_VAR_H(textord_debug_blob);\n\ninline void get_min_max_xheight(int block_linesize, int *min_height, int *max_height) {\n  *min_height = static_cast<int32_t>(floor(block_linesize * textord_minxh));\n  if (*min_height < textord_min_xheight) {\n    *min_height = textord_min_xheight;\n  }\n  *max_height = static_cast<int32_t>(ceil(block_linesize * 3.0));\n}\n\ninline ROW_CATEGORY get_row_category(const TO_ROW *row) {\n  if (row->xheight <= 0) {\n    return ROW_INVALID;\n  }\n  return (row->ascrise > 0) ? ROW_ASCENDERS_FOUND\n                            : (row->descdrop != 0) ? 
ROW_DESCENDERS_FOUND : ROW_UNKNOWN;\n}\n\ninline bool within_error_margin(float test, float num, float margin) {\n  return (test >= num * (1 - margin) && test <= num * (1 + margin));\n}\n\nvoid fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights,\n                  STATS *floating_heights);\n\nfloat make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, TO_BLOCK_LIST *blocks);\nfloat make_rows(ICOORD page_tr, // top right\n                TO_BLOCK_LIST *port_blocks);\nvoid make_initial_textrows(ICOORD page_tr,\n                           TO_BLOCK *block,  // block to do\n                           FCOORD rotation,  // for drawing\n                           bool testing_on); // correct orientation\nvoid fit_lms_line(TO_ROW *row);\nvoid compute_page_skew(TO_BLOCK_LIST *blocks, // list of blocks\n                       float &page_m,         // average gradient\n                       float &page_err);      // average error\nvoid vigorous_noise_removal(TO_BLOCK *block);\nvoid cleanup_rows_making(ICOORD page_tr,     // top right\n                         TO_BLOCK *block,    // block to do\n                         float gradient,     // gradient to fit\n                         FCOORD rotation,    // for drawing\n                         int32_t block_edge, // edge of block\n                         bool testing_on);   // correct orientation\nvoid delete_non_dropout_rows(                // find lines\n    TO_BLOCK *block,                         // block to do\n    float gradient,                          // global skew\n    FCOORD rotation,                         // deskew vector\n    int32_t block_edge,                      // left edge\n    bool testing_on                          // correct orientation\n);\nbool find_best_dropout_row( // find neighbours\n    TO_ROW *row,            // row to test\n    int32_t distance,       // dropout dist\n    float dist_limit,       // threshold distance\n    int32_t 
line_index,     // index of row\n    TO_ROW_IT *row_it,      // current position\n    bool testing_on         // correct orientation\n);\nTBOX deskew_block_coords( // block box\n    TO_BLOCK *block,      // block to do\n    float gradient        // global skew\n);\nvoid compute_line_occupation( // project blobs\n    TO_BLOCK *block,          // block to do\n    float gradient,           // global skew\n    int32_t min_y,            // min coord in block\n    int32_t max_y,            // in block\n    int32_t *occupation,      // output projection\n    int32_t *deltas           // derivative\n);\nvoid compute_occupation_threshold( // project blobs\n    int32_t low_window,            // below result point\n    int32_t high_window,           // above result point\n    int32_t line_count,            // array sizes\n    int32_t *occupation,           // input projection\n    int32_t *thresholds            // output thresholds\n);\nvoid compute_dropout_distances( // project blobs\n    int32_t *occupation,        // input projection\n    int32_t *thresholds,        // output thresholds\n    int32_t line_count          // array sizes\n);\nvoid expand_rows(       // find lines\n    ICOORD page_tr,     // top right\n    TO_BLOCK *block,    // block to do\n    float gradient,     // gradient to fit\n    FCOORD rotation,    // for drawing\n    int32_t block_edge, // edge of block\n    bool testing_on     // correct orientation\n);\nvoid adjust_row_limits( // tidy limits\n    TO_BLOCK *block     // block to do\n);\nvoid compute_row_stats( // find lines\n    TO_BLOCK *block,    // block to do\n    bool testing_on     // correct orientation\n);\nfloat median_block_xheight( // find lines\n    TO_BLOCK *block,        // block to do\n    float gradient          // global skew\n);\n\nint compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only,\n                               int min_height, int max_height, float *xheight, float *ascrise);\n\nint32_t 
compute_row_descdrop(TO_ROW *row,    // row to do\n                             float gradient, // global skew\n                             int xheight_blob_count, STATS *heights);\nint32_t compute_height_modes(STATS *heights,     // stats to search\n                             int32_t min_height, // bottom of range\n                             int32_t max_height, // top of range\n                             int32_t *modes,     // output array\n                             int32_t maxmodes);  // size of modes\nvoid correct_row_xheight(TO_ROW *row,            // row to fix\n                         float xheight,          // average values\n                         float ascrise, float descdrop);\nvoid separate_underlines(TO_BLOCK *block,   // block to do\n                         float gradient,    // skew angle\n                         FCOORD rotation,   // inverse landscape\n                         bool testing_on);  // correct orientation\nvoid pre_associate_blobs(ICOORD page_tr,    // top right\n                         TO_BLOCK *block,   // block to do\n                         FCOORD rotation,   // inverse landscape\n                         bool testing_on);  // correct orientation\nvoid fit_parallel_rows(TO_BLOCK *block,     // block to do\n                       float gradient,      // gradient to fit\n                       FCOORD rotation,     // for drawing\n                       int32_t block_edge,  // edge of block\n                       bool testing_on);    // correct orientation\nvoid fit_parallel_lms(float gradient,       // forced gradient\n                      TO_ROW *row);         // row to fit\nvoid make_baseline_spline(TO_ROW *row,      // row to fit\n                          TO_BLOCK *block); // block it came from\nbool segment_baseline(                      // split baseline\n    TO_ROW *row,                            // row to fit\n    TO_BLOCK *block,                        // block it came from\n    int32_t &segments,           
           // no of segments\n    int32_t *xstarts                        // coords of segments\n);\ndouble *linear_spline_baseline( // split baseline\n    TO_ROW *row,                // row to fit\n    TO_BLOCK *block,            // block it came from\n    int32_t &segments,          // no of segments\n    int32_t xstarts[]           // coords of segments\n);\nvoid assign_blobs_to_rows( // find lines\n    TO_BLOCK *block,       // block to do\n    float *gradient,       // block skew\n    int pass,              // identification\n    bool reject_misses,    // chuck big ones out\n    bool make_new_rows,    // add rows for unmatched\n    bool drawing_skew      // draw smoothed skew\n);\n// find best row\nOVERLAP_STATE most_overlapping_row(TO_ROW_IT *row_it, // iterator\n                                   TO_ROW *&best_row, // output row\n                                   float top,         // top of blob\n                                   float bottom,      // bottom of blob\n                                   float rowsize,     // max row size\n                                   bool testing_blob  // test stuff\n);\nint blob_x_order(      // sort function\n    const BLOBNBOX *item1, // items to compare\n    const BLOBNBOX *item2);\n\nvoid mark_repeated_chars(TO_ROW *row);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/oldbasel.cpp",
    "content": "/**********************************************************************\n * File:        oldbasel.cpp  (Formerly oldbl.c)\n * Description: A re-implementation of the old baseline algorithm.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"oldbasel.h\"\n\n#include \"ccstruct.h\"\n#include \"detlinefit.h\"\n#include \"drawtord.h\"\n#include \"makerow.h\"\n#include \"quadlsq.h\"\n#include \"statistc.h\"\n#include \"textord.h\"\n#include \"tprintf.h\"\n\n#include <cmath>\n#include <vector> // for std::vector\n\n#include <algorithm>\n\nnamespace tesseract {\n\nstatic BOOL_VAR(textord_really_old_xheight, false, \"Use original wiseowl xheight\");\nBOOL_VAR(textord_oldbl_debug, false, \"Debug old baseline generation\");\nstatic BOOL_VAR(textord_debug_baselines, false, \"Debug baseline generation\");\nstatic BOOL_VAR(textord_oldbl_paradef, true, \"Use para default mechanism\");\nstatic BOOL_VAR(textord_oldbl_split_splines, true, \"Split stepped splines\");\nstatic BOOL_VAR(textord_oldbl_merge_parts, true, \"Merge suspect partitions\");\nstatic BOOL_VAR(oldbl_corrfix, true, \"Improve correlation of heights\");\nstatic BOOL_VAR(oldbl_xhfix, false, 
\"Fix bug in modes threshold for xheights\");\nstatic BOOL_VAR(textord_ocropus_mode, false, \"Make baselines for ocropus\");\nstatic double_VAR(oldbl_xhfract, 0.4, \"Fraction of est allowed in calc\");\nstatic INT_VAR(oldbl_holed_losscount, 10, \"Max lost before fallback line used\");\nstatic double_VAR(oldbl_dot_error_size, 1.26, \"Max aspect ratio of a dot\");\nstatic double_VAR(textord_oldbl_jumplimit, 0.15, \"X fraction for new partition\");\n\n#define TURNLIMIT 1            /*min size for turning point */\n#define X_HEIGHT_FRACTION 0.7  /*x-height/caps height */\n#define DESCENDER_FRACTION 0.5 /*descender/x-height */\n#define MIN_ASC_FRACTION 0.20  /*min size of ascenders */\n#define MIN_DESC_FRACTION 0.25 /*min size of descenders */\n#define MINASCRISE 2.0         /*min ascender/desc step */\n#define MAXHEIGHTVARIANCE 0.15 /*accepted variation in x-height */\n#define MAXHEIGHT 300          /*max blob height */\n#define MAXOVERLAP 0.1         /*max 10% missed overlap */\n#define MAXBADRUN 2            /*max non best for failed */\n#define HEIGHTBUCKETS 200      /* Num of buckets */\n#define MODENUM 10\n#define MAXPARTS 6\n#define SPLINESIZE 23\n\n#define ABS(x) ((x) < 0 ? 
(-(x)) : (x))\n\n/**********************************************************************\n * make_old_baselines\n *\n * Top level function to make baselines the old way.\n **********************************************************************/\n\nvoid Textord::make_old_baselines(TO_BLOCK *block, // block to do\n                                 bool testing_on, // correct orientation\n                                 float gradient) {\n  QSPLINE *prev_baseline; // baseline of previous row\n  TO_ROW *row;            // current row\n  TO_ROW_IT row_it = block->get_rows();\n  BLOBNBOX_IT blob_it;\n\n  prev_baseline = nullptr; // nothing yet\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    find_textlines(block, row, 2, nullptr);\n    if (row->xheight <= 0 && prev_baseline != nullptr) {\n      find_textlines(block, row, 2, prev_baseline);\n    }\n    if (row->xheight > 0) { // was a good one\n      prev_baseline = &row->baseline;\n    } else {\n      prev_baseline = nullptr;\n      blob_it.set_to_list(row->blob_list());\n      if (textord_debug_baselines) {\n        tprintf(\"Row baseline generation failed on row at (%d,%d)\\n\",\n                blob_it.data()->bounding_box().left(), blob_it.data()->bounding_box().bottom());\n      }\n    }\n  }\n  correlate_lines(block, gradient);\n  block->block->set_xheight(block->xheight);\n}\n\n/**********************************************************************\n * correlate_lines\n *\n * Correlate the x-heights and ascender heights of a block to fill-in\n * the ascender height and descender height for rows without one.\n * Also fix baselines of rows without a decent fit.\n **********************************************************************/\n\nvoid Textord::correlate_lines(TO_BLOCK *block, float gradient) {\n  int rowcount; /*no of rows to do */\n  int rowindex; /*no of row */\n                // iterator\n  TO_ROW_IT row_it = block->get_rows();\n\n  rowcount = 
row_it.length();\n  if (rowcount == 0) {\n    // default value\n    block->xheight = block->line_size;\n    return; /*none to do */\n  }\n  // array of ptrs\n  std::vector<TO_ROW *> rows(rowcount);\n  rowindex = 0;\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    // make array\n    rows[rowindex++] = row_it.data();\n  }\n\n  /*try to fix bad lines */\n  correlate_neighbours(block, &rows[0], rowcount);\n\n  if (textord_really_old_xheight || textord_old_xheight) {\n    block->xheight = static_cast<float>(correlate_with_stats(&rows[0], rowcount, block));\n    if (block->xheight <= 0) {\n      block->xheight = block->line_size * tesseract::CCStruct::kXHeightFraction;\n    }\n    if (block->xheight < textord_min_xheight) {\n      block->xheight = (float)textord_min_xheight;\n    }\n  } else {\n    compute_block_xheight(block, gradient);\n  }\n}\n\n/**********************************************************************\n * correlate_neighbours\n *\n * Try to fix rows that had a bad spline fit by using neighbours.\n **********************************************************************/\n\nvoid Textord::correlate_neighbours(TO_BLOCK *block, // block rows are in.\n                                   TO_ROW **rows,   // rows of block.\n                                   int rowcount) {  // no of rows to do.\n  TO_ROW *row;                                      /*current row */\n  int rowindex;                                     /*no of row */\n  int otherrow;                                     /*second row */\n  int upperrow;                                     /*row above to use */\n  int lowerrow;                                     /*row below to use */\n  float biggest;\n\n  for (rowindex = 0; rowindex < rowcount; rowindex++) {\n    row = rows[rowindex]; /*current row */\n    if (row->xheight < 0) {\n      /*quadratic failed */\n      for (otherrow = rowindex - 2;\n           otherrow >= 0 && (rows[otherrow]->xheight < 0.0 ||\n             
                !row->baseline.overlap(&rows[otherrow]->baseline, MAXOVERLAP));\n           otherrow--) {\n      }\n      upperrow = otherrow; /*decent row above */\n      for (otherrow = rowindex + 1;\n           otherrow < rowcount && (rows[otherrow]->xheight < 0.0 ||\n                                   !row->baseline.overlap(&rows[otherrow]->baseline, MAXOVERLAP));\n           otherrow++) {\n      }\n      lowerrow = otherrow; /*decent row below */\n      if (upperrow >= 0) {\n        find_textlines(block, row, 2, &rows[upperrow]->baseline);\n      }\n      if (row->xheight < 0 && lowerrow < rowcount) {\n        find_textlines(block, row, 2, &rows[lowerrow]->baseline);\n      }\n      if (row->xheight < 0) {\n        if (upperrow >= 0) {\n          find_textlines(block, row, 1, &rows[upperrow]->baseline);\n        } else if (lowerrow < rowcount) {\n          find_textlines(block, row, 1, &rows[lowerrow]->baseline);\n        }\n      }\n    }\n  }\n\n  for (biggest = 0.0f, rowindex = 0; rowindex < rowcount; rowindex++) {\n    row = rows[rowindex]; /*current row */\n    if (row->xheight < 0) { /*linear failed */\n                            /*make do */\n      row->xheight = -row->xheight;\n    }\n    biggest = std::max(biggest, row->xheight);\n  }\n}\n\n/**********************************************************************\n * correlate_with_stats\n *\n * correlate the x-heights and ascender heights of a block to fill-in\n * the ascender height and descender height for rows without one.\n **********************************************************************/\n\nint Textord::correlate_with_stats(TO_ROW **rows, // rows of block.\n                                  int rowcount,  // no of rows to do.\n                                  TO_BLOCK *block) {\n  TO_ROW *row;         /*current row */\n  int rowindex;        /*no of row */\n  float lineheight;    /*mean x-height */\n  float ascheight;     /*average ascenders */\n  float minascheight;  /*min allowed 
ascheight */\n  int xcount;          /*no of samples for xheight */\n  float fullheight;    /*mean top height */\n  int fullcount;       /*no of samples */\n  float descheight;    /*mean descender drop */\n  float mindescheight; /*min allowed descheight */\n  int desccount;       /*no of samples */\n\n  /*no samples */\n  xcount = fullcount = desccount = 0;\n  lineheight = ascheight = fullheight = descheight = 0.0;\n  for (rowindex = 0; rowindex < rowcount; rowindex++) {\n    row = rows[rowindex];         /*current row */\n    if (row->ascrise > 0.0) {     /*got ascenders? */\n      lineheight += row->xheight; /*average x-heights */\n      ascheight += row->ascrise;  /*average ascenders */\n      xcount++;\n    } else {\n      fullheight += row->xheight; /*assume full height */\n      fullcount++;\n    }\n    if (row->descdrop < 0.0) { /*got descenders? */\n                               /*average descenders */\n      descheight += row->descdrop;\n      desccount++;\n    }\n  }\n\n  if (xcount > 0 && (!oldbl_corrfix || xcount >= fullcount)) {\n    lineheight /= xcount; /*average x-height */\n                          /*average caps height */\n    fullheight = lineheight + ascheight / xcount;\n    /*must be decent size */\n    if (fullheight < lineheight * (1 + MIN_ASC_FRACTION)) {\n      fullheight = lineheight * (1 + MIN_ASC_FRACTION);\n    }\n  } else {\n    fullheight /= fullcount; /*average max height */\n                             /*guess x-height */\n    lineheight = fullheight * X_HEIGHT_FRACTION;\n  }\n  if (desccount > 0 && (!oldbl_corrfix || desccount >= rowcount / 2)) {\n    descheight /= desccount; /*average descenders */\n  } else {\n    /*guess descenders */\n    descheight = -lineheight * DESCENDER_FRACTION;\n  }\n\n  if (lineheight > 0.0f) {\n    block->block->set_cell_over_xheight((fullheight - descheight) / lineheight);\n  }\n\n  minascheight = lineheight * MIN_ASC_FRACTION;\n  mindescheight = -lineheight * MIN_DESC_FRACTION;\n  for (rowindex = 
0; rowindex < rowcount; rowindex++) {\n    row = rows[rowindex]; /*do each row */\n    row->all_caps = false;\n    if (row->ascrise / row->xheight < MIN_ASC_FRACTION) {\n      /*no ascenders */\n      if (row->xheight >= lineheight * (1 - MAXHEIGHTVARIANCE) &&\n          row->xheight <= lineheight * (1 + MAXHEIGHTVARIANCE)) {\n        row->ascrise = fullheight - lineheight;\n        /*set to average */\n        row->xheight = lineheight;\n\n      } else if (row->xheight >= fullheight * (1 - MAXHEIGHTVARIANCE) &&\n                 row->xheight <= fullheight * (1 + MAXHEIGHTVARIANCE)) {\n        row->ascrise = row->xheight - lineheight;\n        /*set to average */\n        row->xheight = lineheight;\n        row->all_caps = true;\n      } else {\n        row->ascrise = (fullheight - lineheight) * row->xheight / fullheight;\n        /*scale it */\n        row->xheight -= row->ascrise;\n        row->all_caps = true;\n      }\n      if (row->ascrise < minascheight) {\n        row->ascrise = row->xheight * ((1.0 - X_HEIGHT_FRACTION) / X_HEIGHT_FRACTION);\n      }\n    }\n    if (row->descdrop > mindescheight) {\n      if (row->xheight >= lineheight * (1 - MAXHEIGHTVARIANCE) &&\n          row->xheight <= lineheight * (1 + MAXHEIGHTVARIANCE)) {\n        /*set to average */\n        row->descdrop = descheight;\n      } else {\n        row->descdrop = -row->xheight * DESCENDER_FRACTION;\n      }\n    }\n  }\n  return static_cast<int>(lineheight); // block xheight\n}\n\n/**********************************************************************\n * find_textlines\n *\n * Compute the baseline for the given row.\n **********************************************************************/\n\nvoid Textord::find_textlines(TO_BLOCK *block,   // block row is in\n                             TO_ROW *row,       // row to do\n                             int degree,        // required approximation\n                             QSPLINE *spline) { // starting spline\n  int partcount;          
                      /*no of partitions of */\n  bool holed_line = false;                      // lost too many blobs\n  int bestpart;                                 /*biggest partition */\n  int partsizes[MAXPARTS];                      /*no in each partition */\n  int lineheight;                               /*guessed x-height */\n  float jumplimit;                              /*allowed delta change */\n  int blobcount;                                /*no of blobs on line */\n  int pointcount;                               /*no of coords */\n  int xstarts[SPLINESIZE + 1];                  // segment boundaries\n  int segments;                                 // no of segments\n\n  // no of blobs in row\n  blobcount = row->blob_list()->length();\n  // partition no of each blob\n  std::vector<char> partids(blobcount);\n  // useful sample points\n  std::vector<int> xcoords(blobcount);\n  // useful sample points\n  std::vector<int> ycoords(blobcount);\n  // edges of blob rectangles\n  std::vector<TBOX> blobcoords(blobcount);\n  // diffs from 1st approx\n  std::vector<float> ydiffs(blobcount);\n\n  lineheight = get_blob_coords(row, static_cast<int>(block->line_size), &blobcoords[0], holed_line,\n                               blobcount);\n  /*limit for line change */\n  jumplimit = lineheight * textord_oldbl_jumplimit;\n  if (jumplimit < MINASCRISE) {\n    jumplimit = MINASCRISE;\n  }\n\n  if (textord_oldbl_debug) {\n    tprintf(\"\\nInput height=%g, Estimate x-height=%d pixels, jumplimit=%.2f\\n\", block->line_size,\n            lineheight, jumplimit);\n  }\n  if (holed_line) {\n    make_holed_baseline(&blobcoords[0], blobcount, spline, &row->baseline, row->line_m());\n  } else {\n    make_first_baseline(&blobcoords[0], blobcount, &xcoords[0], &ycoords[0], spline, &row->baseline,\n                        jumplimit);\n  }\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_final_rows) {\n    row->baseline.plot(to_win, ScrollView::GOLDENROD);\n  }\n#endif\n  if 
(blobcount > 1) {\n    bestpart = partition_line(&blobcoords[0], blobcount, &partcount, &partids[0], partsizes,\n                              &row->baseline, jumplimit, &ydiffs[0]);\n    pointcount = partition_coords(&blobcoords[0], blobcount, &partids[0], bestpart, &xcoords[0],\n                                  &ycoords[0]);\n    segments = segment_spline(&blobcoords[0], blobcount, &xcoords[0], &ycoords[0], degree,\n                              pointcount, xstarts);\n    if (!holed_line) {\n      do {\n        row->baseline = QSPLINE(xstarts, segments, &xcoords[0], &ycoords[0], pointcount, degree);\n      } while (textord_oldbl_split_splines &&\n               split_stepped_spline(&row->baseline, jumplimit / 2, &xcoords[0], xstarts, segments));\n    }\n    find_lesser_parts(row, &blobcoords[0], blobcount, &partids[0], partsizes, partcount, bestpart);\n\n  } else {\n    row->xheight = -1.0f; /*failed */\n    row->descdrop = 0.0f;\n    row->ascrise = 0.0f;\n  }\n  row->baseline.extrapolate(row->line_m(), block->block->pdblk.bounding_box().left(),\n                            block->block->pdblk.bounding_box().right());\n\n  if (textord_really_old_xheight) {\n    old_first_xheight(row, &blobcoords[0], lineheight, blobcount, &row->baseline, jumplimit);\n  } else if (textord_old_xheight) {\n    make_first_xheight(row, &blobcoords[0], lineheight, static_cast<int>(block->line_size),\n                       blobcount, &row->baseline, jumplimit);\n  } else {\n    compute_row_xheight(row, block->block->classify_rotation(), row->line_m(), block->line_size);\n  }\n}\n\n/**********************************************************************\n * get_blob_coords\n *\n * Fill the blobcoords array with the coordinates of the blobs\n * in the row. 
The return value is the first guess at the line height.\n **********************************************************************/\n\nint get_blob_coords(    // get boxes\n    TO_ROW *row,        // row to use\n    int32_t lineheight, // block level\n    TBOX *blobcoords,   // output boxes\n    bool &holed_line,   // lost a lot of blobs\n    int &outcount       // no of real blobs\n) {\n  // blobs\n  BLOBNBOX_IT blob_it = row->blob_list();\n  int blobindex;    /*no along text line */\n  int losscount;    // lost blobs\n  int maxlosscount; // greatest lost blobs\n  /*height stat collection */\n  STATS heightstat(0, MAXHEIGHT - 1);\n\n  if (blob_it.empty()) {\n    return 0; // none\n  }\n  maxlosscount = 0;\n  losscount = 0;\n  blob_it.mark_cycle_pt();\n  blobindex = 0;\n  do {\n    blobcoords[blobindex] = box_next_pre_chopped(&blob_it);\n    if (blobcoords[blobindex].height() > lineheight * 0.25) {\n      heightstat.add(blobcoords[blobindex].height(), 1);\n    }\n    if (blobindex == 0 || blobcoords[blobindex].height() > lineheight * 0.25 ||\n        blob_it.cycled_list()) {\n      blobindex++; /*no of merged blobs */\n      losscount = 0;\n    } else {\n      if (blobcoords[blobindex].height() < blobcoords[blobindex].width() * oldbl_dot_error_size &&\n          blobcoords[blobindex].width() < blobcoords[blobindex].height() * oldbl_dot_error_size) {\n        // counts as dot\n        blobindex++;\n        losscount = 0;\n      } else {\n        losscount++; // lost it\n        if (losscount > maxlosscount) {\n          // remember max\n          maxlosscount = losscount;\n        }\n      }\n    }\n  } while (!blob_it.cycled_list());\n\n  holed_line = maxlosscount > oldbl_holed_losscount;\n  outcount = blobindex; /*total blobs */\n\n  if (heightstat.get_total() > 1) {\n    /*guess x-height */\n    return static_cast<int>(heightstat.ile(0.25));\n  } else {\n    return blobcoords[0].height();\n  
}\n}\n\n/**********************************************************************\n * make_first_baseline\n *\n * Make the first estimate at a baseline, either by shifting\n * a supplied previous spline, or by doing a piecewise linear\n * approximation using all the blobs.\n **********************************************************************/\n\nvoid make_first_baseline( // initial approximation\n    TBOX blobcoords[],    /*blob bounding boxes */\n    int blobcount,        /*no of blobcoords */\n    int xcoords[],        /*coords for spline */\n    int ycoords[],        /*approximator */\n    QSPLINE *spline,      /*initial spline */\n    QSPLINE *baseline,    /*output spline */\n    float jumplimit       /*guess half descenders */\n) {\n  int leftedge;              /*left edge of line */\n  int rightedge;             /*right edge of line */\n  int blobindex;             /*current blob */\n  int segment;               /*current segment */\n  float prevy, thisy, nexty; /*3 y coords */\n  float y1, y2, y3;          /*3 smooth blobs */\n  float maxmax, minmin;      /*absolute limits */\n  int x2 = 0;                /*right edge of old y3 */\n  int ycount;                /*no of ycoords in use */\n  float yturns[SPLINESIZE];  /*y coords of turn pts */\n  int xturns[SPLINESIZE];    /*xcoords of turn pts */\n  int xstarts[SPLINESIZE + 1];\n  int segments; // no of segments\n  ICOORD shift; // shift of spline\n\n  prevy = 0;\n  /*left edge of row */\n  leftedge = blobcoords[0].left();\n  /*right edge of line */\n  rightedge = blobcoords[blobcount - 1].right();\n  if (spline == nullptr       /*no given spline */\n      || spline->segments < 3 /*or trivial */\n                              /*or too non-overlap */\n      || spline->xcoords[1] > leftedge + MAXOVERLAP * (rightedge - leftedge) ||\n      spline->xcoords[spline->segments - 1] < rightedge - MAXOVERLAP * (rightedge - leftedge)) {\n    if (textord_oldbl_paradef) {\n      return; // use default\n    }\n    
xstarts[0] = blobcoords[0].left() - 1;\n    for (blobindex = 0; blobindex < blobcount; blobindex++) {\n      xcoords[blobindex] = (blobcoords[blobindex].left() + blobcoords[blobindex].right()) / 2;\n      ycoords[blobindex] = blobcoords[blobindex].bottom();\n    }\n    xstarts[1] = blobcoords[blobcount - 1].right() + 1;\n    segments = 1; /*no of segments */\n\n    /*linear */\n    *baseline = QSPLINE(xstarts, segments, xcoords, ycoords, blobcount, 1);\n\n    if (blobcount >= 3) {\n      y1 = y2 = y3 = 0.0f;\n      ycount = 0;\n      segment = 0; /*no of segments */\n      maxmax = minmin = 0.0f;\n      thisy = ycoords[0] - baseline->y(xcoords[0]);\n      nexty = ycoords[1] - baseline->y(xcoords[1]);\n      for (blobindex = 2; blobindex < blobcount; blobindex++) {\n        prevy = thisy; /*shift ycoords */\n        thisy = nexty;\n        nexty = ycoords[blobindex] - baseline->y(xcoords[blobindex]);\n        /*middle of smooth y */\n        if (ABS(thisy - prevy) < jumplimit && ABS(thisy - nexty) < jumplimit) {\n          y1 = y2; /*shift window */\n          y2 = y3;\n          y3 = thisy; /*middle point */\n          ycount++;\n          /*local max */\n          if (ycount >= 3 && ((y1 < y2 && y2 >= y3)\n                              /*local min */\n                              || (y1 > y2 && y2 <= y3))) {\n            if (segment < SPLINESIZE - 2) {\n              /*turning pt */\n              xturns[segment] = x2;\n              yturns[segment] = y2;\n              segment++; /*no of spline segs */\n            }\n          }\n          if (ycount == 1) {\n            maxmax = minmin = y3; /*initialise limits */\n          } else {\n            if (y3 > maxmax) {\n              maxmax = y3; /*biggest max */\n            }\n            if (y3 < minmin) {\n              minmin = y3; /*smallest min */\n            }\n          }\n          /*possible turning pt */\n          x2 = blobcoords[blobindex - 1].right();\n        }\n      }\n\n      jumplimit *= 
1.2f;\n      /*must be wavy */\n      if (maxmax - minmin > jumplimit) {\n        ycount = segment; /*no of segments */\n        for (blobindex = 0, segment = 1; blobindex < ycount; blobindex++) {\n          if (yturns[blobindex] > minmin + jumplimit || yturns[blobindex] < maxmax - jumplimit) {\n            /*significant peak */\n            if (segment == 1 || yturns[blobindex] > prevy + jumplimit ||\n                yturns[blobindex] < prevy - jumplimit) {\n              /*different to previous */\n              xstarts[segment] = xturns[blobindex];\n              segment++;\n              prevy = yturns[blobindex];\n            }\n            /*bigger max */\n            else if ((prevy > minmin + jumplimit && yturns[blobindex] > prevy)\n                     /*smaller min */\n                     || (prevy < maxmax - jumplimit && yturns[blobindex] < prevy)) {\n              xstarts[segment - 1] = xturns[blobindex];\n              /*improved previous */\n              prevy = yturns[blobindex];\n            }\n          }\n        }\n        xstarts[segment] = blobcoords[blobcount - 1].right() + 1;\n        segments = segment; /*no of segments */\n                            /*linear */\n        *baseline = QSPLINE(xstarts, segments, xcoords, ycoords, blobcount, 1);\n      }\n    }\n  } else {\n    *baseline = *spline; /*copy it */\n    shift =\n        ICOORD(0, static_cast<int16_t>(blobcoords[0].bottom() - spline->y(blobcoords[0].right())));\n    baseline->move(shift);\n  }\n}\n\n/**********************************************************************\n * make_holed_baseline\n *\n * Make the first estimate at a baseline, either by shifting\n * a supplied previous spline, or by doing a piecewise linear\n * approximation using all the blobs.\n **********************************************************************/\n\nvoid make_holed_baseline( // initial approximation\n    TBOX blobcoords[],    /*blob bounding boxes */\n    int blobcount,        /*no of blobcoords 
*/\n    QSPLINE *spline,      /*initial spline */\n    QSPLINE *baseline,    /*output spline */\n    float gradient        // of line\n) {\n  int leftedge;  /*left edge of line */\n  int rightedge; /*right edge of line */\n  int blobindex; /*current blob */\n  float x;       // centre of row\n  ICOORD shift;  // shift of spline\n\n  tesseract::DetLineFit lms; // straight baseline\n  int32_t xstarts[2];        // straight line\n  double coeffs[3];\n  float c; // line parameter\n\n  /*left edge of row */\n  leftedge = blobcoords[0].left();\n  /*right edge of line */\n  rightedge = blobcoords[blobcount - 1].right();\n  for (blobindex = 0; blobindex < blobcount; blobindex++) {\n    lms.Add(ICOORD((blobcoords[blobindex].left() + blobcoords[blobindex].right()) / 2,\n                   blobcoords[blobindex].bottom()));\n  }\n  lms.ConstrainedFit(gradient, &c);\n  xstarts[0] = leftedge;\n  xstarts[1] = rightedge;\n  coeffs[0] = 0;\n  coeffs[1] = gradient;\n  coeffs[2] = c;\n  *baseline = QSPLINE(1, xstarts, coeffs);\n  if (spline != nullptr        /*no given spline */\n      && spline->segments >= 3 /*or trivial */\n                               /*or too non-overlap */\n      && spline->xcoords[1] <= leftedge + MAXOVERLAP * (rightedge - leftedge) &&\n      spline->xcoords[spline->segments - 1] >= rightedge - MAXOVERLAP * (rightedge - leftedge)) {\n    *baseline = *spline; /*copy it */\n    x = (leftedge + rightedge) / 2.0;\n    shift = ICOORD(0, static_cast<int16_t>(gradient * x + c - spline->y(x)));\n    baseline->move(shift);\n  }\n}\n\n/**********************************************************************\n * partition_line\n *\n * Partition a row of blobs into different groups of continuous\n * y position. 
jumplimit specifies the max allowable limit on a jump\n * before a new partition is started.\n * The return value is the biggest partition\n **********************************************************************/\n\nint partition_line(    // partition blobs\n    TBOX blobcoords[], // bounding boxes\n    int blobcount,     /*no of blobs on row */\n    int *numparts,     /*number of partitions */\n    char partids[],    /*partition no of each blob */\n    int partsizes[],   /*no in each partition */\n    QSPLINE *spline,   /*curve to fit to */\n    float jumplimit,   /*allowed delta change */\n    float ydiffs[]     /*diff from spline */\n) {\n  int blobindex;             /*no along text line */\n  int bestpart;              /*best new partition */\n  int biggestpart;           /*part with most members */\n  float diff;                /*difference from line */\n  int startx;                /*index of start blob */\n  float partdiffs[MAXPARTS]; /*step between parts */\n\n  for (bestpart = 0; bestpart < MAXPARTS; bestpart++) {\n    partsizes[bestpart] = 0; /*zero them all */\n  }\n\n  startx = get_ydiffs(blobcoords, blobcount, spline, ydiffs);\n  *numparts = 1; /*1 partition */\n  bestpart = -1; /*first point */\n  float drift = 0.0f;\n  float last_delta = 0.0f;\n  for (blobindex = startx; blobindex < blobcount; blobindex++) {\n    /*do each blob in row */\n    diff = ydiffs[blobindex]; /*diff from line */\n    if (textord_oldbl_debug) {\n      tprintf(\"%d(%d,%d), \", blobindex, blobcoords[blobindex].left(),\n              blobcoords[blobindex].bottom());\n    }\n    bestpart =\n        choose_partition(diff, partdiffs, bestpart, jumplimit, &drift, &last_delta, numparts);\n    /*record partition */\n    partids[blobindex] = bestpart;\n    partsizes[bestpart]++; /*another in it */\n  }\n\n  bestpart = -1; /*first point */\n  drift = 0.0f;\n  last_delta = 0.0f;\n  partsizes[0]--; /*doing 1st pt again */\n                  /*do each blob in row */\n  for (blobindex = 
startx; blobindex >= 0; blobindex--) {\n    diff = ydiffs[blobindex]; /*diff from line */\n    if (textord_oldbl_debug) {\n      tprintf(\"%d(%d,%d), \", blobindex, blobcoords[blobindex].left(),\n              blobcoords[blobindex].bottom());\n    }\n    bestpart =\n        choose_partition(diff, partdiffs, bestpart, jumplimit, &drift, &last_delta, numparts);\n    /*record partition */\n    partids[blobindex] = bestpart;\n    partsizes[bestpart]++; /*another in it */\n  }\n\n  for (biggestpart = 0, bestpart = 1; bestpart < *numparts; bestpart++) {\n    if (partsizes[bestpart] >= partsizes[biggestpart]) {\n      biggestpart = bestpart; /*new biggest */\n    }\n  }\n  if (textord_oldbl_merge_parts) {\n    merge_oldbl_parts(blobcoords, blobcount, partids, partsizes, biggestpart, jumplimit);\n  }\n  return biggestpart; /*biggest partition */\n}\n\n/**********************************************************************\n * merge_oldbl_parts\n *\n * For any adjacent group of blobs in a different part, put them in the\n * main part if they fit closely to neighbours in the main part.\n **********************************************************************/\n\nvoid merge_oldbl_parts( // partition blobs\n    TBOX blobcoords[],  // bounding boxes\n    int blobcount,      /*no of blobs on row */\n    char partids[],     /*partition no of each blob */\n    int partsizes[],    /*no in each partition */\n    int biggestpart,    // major partition\n    float jumplimit     /*allowed delta change */\n) {\n  bool found_one; // found a bestpart blob\n  bool close_one; // found was close enough\n  int blobindex;  /*no along text line */\n  int prevpart;   // previous iteration\n  int runlength;  // no in this part\n  float diff;     /*difference from line */\n  int startx;     /*index of start blob */\n  int test_blob;  // another index\n  FCOORD coord;   // blob coordinate\n  float m, c;     // fitted line\n  QLSQ stats;     // line stuff\n\n  prevpart = biggestpart;\n  runlength = 
0;\n  startx = 0;\n  for (blobindex = 0; blobindex < blobcount; blobindex++) {\n    if (partids[blobindex] != prevpart) {\n      //                      tprintf(\"Partition change at (%d,%d) from %d to %d\n      //                      after run of %d\\n\",\n      //                              blobcoords[blobindex].left(),blobcoords[blobindex].bottom(),\n      //                              prevpart,partids[blobindex],runlength);\n      if (prevpart != biggestpart && runlength > MAXBADRUN) {\n        stats.clear();\n        for (test_blob = startx; test_blob < blobindex; test_blob++) {\n          coord = FCOORD((blobcoords[test_blob].left() + blobcoords[test_blob].right()) / 2.0,\n                         blobcoords[test_blob].bottom());\n          stats.add(coord.x(), coord.y());\n        }\n        stats.fit(1);\n        m = stats.get_b();\n        c = stats.get_c();\n        if (textord_oldbl_debug) {\n          tprintf(\"Fitted line y=%g x + %g\\n\", m, c);\n        }\n        found_one = false;\n        close_one = false;\n        for (test_blob = 1;\n             !found_one && (startx - test_blob >= 0 || blobindex + test_blob <= blobcount);\n             test_blob++) {\n          if (startx - test_blob >= 0 && partids[startx - test_blob] == biggestpart) {\n            found_one = true;\n            coord = FCOORD(\n                (blobcoords[startx - test_blob].left() + blobcoords[startx - test_blob].right()) /\n                    2.0,\n                blobcoords[startx - test_blob].bottom());\n            diff = m * coord.x() + c - coord.y();\n            if (textord_oldbl_debug) {\n              tprintf(\"Diff of common blob to suspect part=%g at (%g,%g)\\n\", diff, coord.x(),\n                      coord.y());\n            }\n            if (diff < jumplimit && -diff < jumplimit) {\n              close_one = true;\n            }\n          }\n          if (blobindex + test_blob <= blobcount &&\n              partids[blobindex + test_blob - 1] == 
biggestpart) {\n            found_one = true;\n            coord = FCOORD((blobcoords[blobindex + test_blob - 1].left() +\n                            blobcoords[blobindex + test_blob - 1].right()) /\n                               2.0,\n                           blobcoords[blobindex + test_blob - 1].bottom());\n            diff = m * coord.x() + c - coord.y();\n            if (textord_oldbl_debug) {\n              tprintf(\"Diff of common blob to suspect part=%g at (%g,%g)\\n\", diff, coord.x(),\n                      coord.y());\n            }\n            if (diff < jumplimit && -diff < jumplimit) {\n              close_one = true;\n            }\n          }\n        }\n        if (close_one) {\n          if (textord_oldbl_debug) {\n            tprintf(\n                \"Merged %d blobs back into part %d from %d starting at \"\n                \"(%d,%d)\\n\",\n                runlength, biggestpart, prevpart, blobcoords[startx].left(),\n                blobcoords[startx].bottom());\n          }\n          // switch sides\n          partsizes[prevpart] -= runlength;\n          for (test_blob = startx; test_blob < blobindex; test_blob++) {\n            partids[test_blob] = biggestpart;\n          }\n        }\n      }\n      prevpart = partids[blobindex];\n      runlength = 1;\n      startx = blobindex;\n    } else {\n      runlength++;\n    }\n  }\n}\n\n/**********************************************************************\n * get_ydiffs\n *\n * Get the differences between the blobs and the spline,\n * putting them in ydiffs.  
The return value is the index\n * of the blob in the middle of the \"best behaved\" region\n **********************************************************************/\n\nint get_ydiffs(        // evaluate differences\n    TBOX blobcoords[], // bounding boxes\n    int blobcount,     /*no of blobs */\n    QSPLINE *spline,   /*approximating spline */\n    float ydiffs[]     /*output */\n) {\n  int blobindex; /*current blob */\n  int xcentre;   /*xcoord */\n  int lastx;     /*last xcentre */\n  float diffsum; /*sum of diffs */\n  float diff;    /*current difference */\n  float drift;   /*sum of spline steps */\n  float bestsum; /*smallest diffsum */\n  int bestindex; /*index of bestsum */\n\n  diffsum = 0.0f;\n  bestindex = 0;\n  bestsum = static_cast<float>(INT32_MAX);\n  drift = 0.0f;\n  lastx = blobcoords[0].left();\n  /*do each blob in row */\n  for (blobindex = 0; blobindex < blobcount; blobindex++) {\n    /*centre of blob */\n    xcentre = (blobcoords[blobindex].left() + blobcoords[blobindex].right()) >> 1;\n    // step functions in spline\n    drift += spline->step(lastx, xcentre);\n    lastx = xcentre;\n    diff = blobcoords[blobindex].bottom();\n    diff -= spline->y(xcentre);\n    diff += drift;\n    ydiffs[blobindex] = diff; /*store difference */\n    if (blobindex > 2) {\n      /*remove old one */\n      diffsum -= ABS(ydiffs[blobindex - 3]);\n    }\n    diffsum += ABS(diff); /*add new one */\n    if (blobindex >= 2 && diffsum < bestsum) {\n      bestsum = diffsum;         /*find min sum */\n      bestindex = blobindex - 1; /*middle of set */\n    }\n  }\n  return bestindex;\n}\n\n/**********************************************************************\n * choose_partition\n *\n * Choose a partition for the point and return the index.\n **********************************************************************/\n\nint choose_partition(                              // select partition\n    float diff,                                    /*diff from spline */\n    
float partdiffs[],                             /*diff on all parts */\n    int lastpart,                                  /*last assigned partition */\n    float jumplimit,                               /*new part threshold */\n    float *drift, float *lastdelta, int *partcount /*no of partitions */\n) {\n  int partition;   /*partition no */\n  int bestpart;    /*best new partition */\n  float bestdelta; /*best gap from a part */\n  float delta;     /*diff from part */\n\n  if (lastpart < 0) {\n    partdiffs[0] = diff;\n    lastpart = 0; /*first point */\n    *drift = 0.0f;\n    *lastdelta = 0.0f;\n  }\n  /*adjusted diff from part */\n  delta = diff - partdiffs[lastpart] - *drift;\n  if (textord_oldbl_debug) {\n    tprintf(\"Diff=%.2f, Delta=%.3f, Drift=%.3f, \", diff, delta, *drift);\n  }\n  if (ABS(delta) > jumplimit / 2) {\n    /*delta on part 0 */\n    bestdelta = diff - partdiffs[0] - *drift;\n    bestpart = 0; /*0 best so far */\n    for (partition = 1; partition < *partcount; partition++) {\n      delta = diff - partdiffs[partition] - *drift;\n      if (ABS(delta) < ABS(bestdelta)) {\n        bestdelta = delta;\n        bestpart = partition; /*part with nearest jump */\n      }\n    }\n    delta = bestdelta;\n    /*too far away */\n    if (ABS(bestdelta) > jumplimit && *partcount < MAXPARTS) { /*and spare part left */\n      bestpart = (*partcount)++;                               /*best was new one */\n                                                               /*start new one */\n      partdiffs[bestpart] = diff - *drift;\n      delta = 0.0f;\n    }\n  } else {\n    bestpart = lastpart; /*best was last one */\n  }\n\n  if (bestpart == lastpart &&\n      (ABS(delta - *lastdelta) < jumplimit / 2 || ABS(delta) < jumplimit / 2)) {\n    /*smooth the drift */\n    *drift = (3 * *drift + delta) / 3;\n  }\n  *lastdelta = delta;\n\n  if (textord_oldbl_debug) {\n    tprintf(\"P=%d\\n\", bestpart);\n  }\n\n  return 
bestpart;\n}\n\n/**********************************************************************\n * partition_coords\n *\n * Get the x,y coordinates of all points in the bestpart and put them\n * in xcoords,ycoords. Return the number of points found.\n **********************************************************************/\n\nint partition_coords(  // find relevant coords\n    TBOX blobcoords[], // bounding boxes\n    int blobcount,     /*no of blobs in row */\n    char partids[],    /*partition no of each blob */\n    int bestpart,      /*best new partition */\n    int xcoords[],     /*points to work on */\n    int ycoords[]      /*points to work on */\n) {\n  int blobindex;  /*no along text line */\n  int pointcount; /*no of points */\n\n  pointcount = 0;\n  for (blobindex = 0; blobindex < blobcount; blobindex++) {\n    if (partids[blobindex] == bestpart) {\n      /*centre of blob */\n      xcoords[pointcount] = (blobcoords[blobindex].left() + blobcoords[blobindex].right()) >> 1;\n      ycoords[pointcount++] = blobcoords[blobindex].bottom();\n    }\n  }\n  return pointcount; /*no of points found */\n}\n\n/**********************************************************************\n * segment_spline\n *\n * Segment the row at midpoints between maxima and minima of the x,y pairs.\n * The xstarts of the segments are returned and the number found.\n **********************************************************************/\n\nint segment_spline(             // make xstarts\n    TBOX blobcoords[],          // boundign boxes\n    int blobcount,              /*no of blobs in row */\n    int xcoords[],              /*points to work on */\n    int ycoords[],              /*points to work on */\n    int degree, int pointcount, /*no of points */\n    int xstarts[]               // result\n) {\n  int ptindex;                /*no along text line */\n  int segment;                /*partition no */\n  int lastmin, lastmax;       /*possible turn points */\n  int turnpoints[SPLINESIZE]; /*good 
turning points */\n  int turncount;              /*no of turning points */\n  int max_x;                  // max specified coord\n\n  xstarts[0] = xcoords[0] - 1; // leftmost defined pt\n  max_x = xcoords[pointcount - 1] + 1;\n  if (degree < 2) {\n    pointcount = 0;\n  }\n  turncount = 0; /*no turning points yet */\n  if (pointcount > 3) {\n    ptindex = 1;\n    lastmax = lastmin = 0; /*start with first one */\n    while (ptindex < pointcount - 1 && turncount < SPLINESIZE - 1) {\n      /*minimum */\n      if (ycoords[ptindex - 1] > ycoords[ptindex] && ycoords[ptindex] <= ycoords[ptindex + 1]) {\n        if (ycoords[ptindex] < ycoords[lastmax] - TURNLIMIT) {\n          if (turncount == 0 || turnpoints[turncount - 1] != lastmax) {\n            /*new max point */\n            turnpoints[turncount++] = lastmax;\n          }\n          lastmin = ptindex; /*latest minimum */\n        } else if (ycoords[ptindex] < ycoords[lastmin]) {\n          lastmin = ptindex; /*lower minimum */\n        }\n      }\n\n      /*maximum */\n      if (ycoords[ptindex - 1] < ycoords[ptindex] && ycoords[ptindex] >= ycoords[ptindex + 1]) {\n        if (ycoords[ptindex] > ycoords[lastmin] + TURNLIMIT) {\n          if (turncount == 0 || turnpoints[turncount - 1] != lastmin) {\n            /*new min point */\n            turnpoints[turncount++] = lastmin;\n          }\n          lastmax = ptindex; /*latest maximum */\n        } else if (ycoords[ptindex] > ycoords[lastmax]) {\n          lastmax = ptindex; /*higher maximum */\n        }\n      }\n      ptindex++;\n    }\n    /*possible global min */\n    if (ycoords[ptindex] < ycoords[lastmax] - TURNLIMIT &&\n        (turncount == 0 || turnpoints[turncount - 1] != lastmax)) {\n      if (turncount < SPLINESIZE - 1) {\n        /*2 more turns */\n        turnpoints[turncount++] = lastmax;\n      }\n      if (turncount < SPLINESIZE - 1) {\n        turnpoints[turncount++] = ptindex;\n      }\n    } else if (ycoords[ptindex] > ycoords[lastmin] + 
TURNLIMIT\n               /*possible global max */\n               && (turncount == 0 || turnpoints[turncount - 1] != lastmin)) {\n      if (turncount < SPLINESIZE - 1) {\n        /*2 more turns */\n        turnpoints[turncount++] = lastmin;\n      }\n      if (turncount < SPLINESIZE - 1) {\n        turnpoints[turncount++] = ptindex;\n      }\n    } else if (turncount > 0 && turnpoints[turncount - 1] == lastmin &&\n               turncount < SPLINESIZE - 1) {\n      if (ycoords[ptindex] > ycoords[lastmax]) {\n        turnpoints[turncount++] = ptindex;\n      } else {\n        turnpoints[turncount++] = lastmax;\n      }\n    } else if (turncount > 0 && turnpoints[turncount - 1] == lastmax &&\n               turncount < SPLINESIZE - 1) {\n      if (ycoords[ptindex] < ycoords[lastmin]) {\n        turnpoints[turncount++] = ptindex;\n      } else {\n        turnpoints[turncount++] = lastmin;\n      }\n    }\n  }\n\n  if (textord_oldbl_debug && turncount > 0) {\n    tprintf(\"First turn is %d at (%d,%d)\\n\", turnpoints[0], xcoords[turnpoints[0]],\n            ycoords[turnpoints[0]]);\n  }\n  for (segment = 1; segment < turncount; segment++) {\n    /*centre y coord */\n    lastmax = (ycoords[turnpoints[segment - 1]] + ycoords[turnpoints[segment]]) / 2;\n\n    /* fix alg so that it works with both rising and falling sections */\n    if (ycoords[turnpoints[segment - 1]] < ycoords[turnpoints[segment]]) {\n      /*find rising y centre */\n      for (ptindex = turnpoints[segment - 1] + 1;\n           ptindex < turnpoints[segment] && ycoords[ptindex + 1] <= lastmax; ptindex++) {\n      }\n    } else {\n      /*find falling y centre */\n      for (ptindex = turnpoints[segment - 1] + 1;\n           ptindex < turnpoints[segment] && ycoords[ptindex + 1] >= lastmax; ptindex++) {\n      }\n    }\n\n    /*centre x */\n    xstarts[segment] = (xcoords[ptindex - 1] + xcoords[ptindex] + xcoords[turnpoints[segment - 1]] +\n                        xcoords[turnpoints[segment]] + 2) /\n      
                 4;\n    /*halfway between turns */\n    if (textord_oldbl_debug) {\n      tprintf(\"Turn %d is %d at (%d,%d), mid pt is %d@%d, final @%d\\n\", segment,\n              turnpoints[segment], xcoords[turnpoints[segment]], ycoords[turnpoints[segment]],\n              ptindex - 1, xcoords[ptindex - 1], xstarts[segment]);\n    }\n  }\n\n  xstarts[segment] = max_x;\n  return segment; /*no of splines */\n}\n\n/**********************************************************************\n * split_stepped_spline\n *\n * Re-segment the spline in cases where there is a big step function.\n * Return true if any were done.\n **********************************************************************/\n\nbool split_stepped_spline( // make xstarts\n    QSPLINE *baseline,     // current shot\n    float jumplimit,       // max step function\n    int *xcoords,          /*points to work on */\n    int *xstarts,          // result\n    int &segments          // no of segments\n) {\n  bool doneany; // return value\n  int segment;  /*partition no */\n  int startindex, centreindex, endindex;\n  float leftcoord, rightcoord;\n  int leftindex, rightindex;\n  float step; // spline step\n\n  doneany = false;\n  startindex = 0;\n  for (segment = 1; segment < segments - 1; segment++) {\n    step = baseline->step((xstarts[segment - 1] + xstarts[segment]) / 2.0,\n                          (xstarts[segment] + xstarts[segment + 1]) / 2.0);\n    if (step < 0) {\n      step = -step;\n    }\n    if (step > jumplimit) {\n      while (xcoords[startindex] < xstarts[segment - 1]) {\n        startindex++;\n      }\n      centreindex = startindex;\n      while (xcoords[centreindex] < xstarts[segment]) {\n        centreindex++;\n      }\n      endindex = centreindex;\n      while (xcoords[endindex] < xstarts[segment + 1]) {\n        endindex++;\n      }\n      if (segments >= SPLINESIZE) {\n        if (textord_debug_baselines) {\n          tprintf(\"Too many segments to resegment spline!!\\n\");\n        
}\n      } else if (endindex - startindex >= textord_spline_medianwin * 3) {\n        while (centreindex - startindex < textord_spline_medianwin * 3 / 2) {\n          centreindex++;\n        }\n        while (endindex - centreindex < textord_spline_medianwin * 3 / 2) {\n          centreindex--;\n        }\n        leftindex = (startindex + startindex + centreindex) / 3;\n        rightindex = (centreindex + endindex + endindex) / 3;\n        leftcoord = (xcoords[startindex] * 2 + xcoords[centreindex]) / 3.0;\n        rightcoord = (xcoords[centreindex] + xcoords[endindex] * 2) / 3.0;\n        while (xcoords[leftindex] > leftcoord &&\n               leftindex - startindex > textord_spline_medianwin) {\n          leftindex--;\n        }\n        while (xcoords[leftindex] < leftcoord &&\n               centreindex - leftindex > textord_spline_medianwin / 2) {\n          leftindex++;\n        }\n        if (xcoords[leftindex] - leftcoord > leftcoord - xcoords[leftindex - 1]) {\n          leftindex--;\n        }\n        while (xcoords[rightindex] > rightcoord &&\n               rightindex - centreindex > textord_spline_medianwin / 2) {\n          rightindex--;\n        }\n        while (xcoords[rightindex] < rightcoord &&\n               endindex - rightindex > textord_spline_medianwin) {\n          rightindex++;\n        }\n        if (xcoords[rightindex] - rightcoord > rightcoord - xcoords[rightindex - 1]) {\n          rightindex--;\n        }\n        if (textord_debug_baselines) {\n          tprintf(\"Splitting spline at %d with step %g at (%d,%d)\\n\", xstarts[segment],\n                  baseline->step((xstarts[segment - 1] + xstarts[segment]) / 2.0,\n                                 (xstarts[segment] + xstarts[segment + 1]) / 2.0),\n                  (xcoords[leftindex - 1] + xcoords[leftindex]) / 2,\n                  (xcoords[rightindex - 1] + xcoords[rightindex]) / 2);\n        }\n        insert_spline_point(xstarts, segment, (xcoords[leftindex - 1] + 
xcoords[leftindex]) / 2,\n                            (xcoords[rightindex - 1] + xcoords[rightindex]) / 2, segments);\n        doneany = true;\n      } else if (textord_debug_baselines) {\n        tprintf(\"Resegmenting spline failed - insufficient pts (%d,%d,%d,%d)\\n\", startindex,\n                centreindex, endindex, (int32_t)textord_spline_medianwin);\n      }\n    }\n    //              else tprintf(\"Spline step at %d is %g\\n\",\n    //                      xstarts[segment],\n    //                      baseline->step((xstarts[segment-1]+xstarts[segment])/2.0,\n    //                      (xstarts[segment]+xstarts[segment+1])/2.0));\n  }\n  return doneany;\n}\n\n/**********************************************************************\n * insert_spline_point\n *\n * Insert a new spline point and shuffle up the others.\n **********************************************************************/\n\nvoid insert_spline_point(     // get descenders\n    int xstarts[],            // starts to shuffle\n    int segment,              // insertion pt\n    int coord1,               // coords to add\n    int coord2, int &segments // total segments\n) {\n  int index; // for shuffling\n\n  for (index = segments; index > segment; index--) {\n    xstarts[index + 1] = xstarts[index];\n  }\n  segments++;\n  xstarts[segment] = coord1;\n  xstarts[segment + 1] = coord2;\n}\n\n/**********************************************************************\n * find_lesser_parts\n *\n * Average the step from the spline for the other partitions\n * and find the commonest partition which has a descender.\n **********************************************************************/\n\nvoid find_lesser_parts( // get descenders\n    TO_ROW *row,        // row to process\n    TBOX blobcoords[],  // bounding boxes\n    int blobcount,      /*no of blobs */\n    char partids[],     /*partition of each blob */\n    int partsizes[],    /*size of each part */\n    int partcount,      /*no of partitions */\n 
   int bestpart        /*biggest partition */\n) {\n  int blobindex;             /*index of blob */\n  int partition;             /*current partition */\n  int xcentre;               /*centre of blob */\n  int poscount;              /*count of best up step */\n  int negcount;              /*count of best down step */\n  float partsteps[MAXPARTS]; /*average step to part */\n  float bestneg;             /*best down step */\n  int runlength;             /*length of bad run */\n  int biggestrun;            /*biggest bad run */\n\n  biggestrun = 0;\n  for (partition = 0; partition < partcount; partition++) {\n    partsteps[partition] = 0.0; /*zero accumulators */\n  }\n  for (runlength = 0, blobindex = 0; blobindex < blobcount; blobindex++) {\n    xcentre = (blobcoords[blobindex].left() + blobcoords[blobindex].right()) >> 1;\n    /*in other parts */\n    int part_id = static_cast<int>(static_cast<unsigned char>(partids[blobindex]));\n    if (part_id != bestpart) {\n      runlength++; /*run of non bests */\n      if (runlength > biggestrun) {\n        biggestrun = runlength;\n      }\n      partsteps[part_id] += blobcoords[blobindex].bottom() - row->baseline.y(xcentre);\n    } else {\n      runlength = 0;\n    }\n  }\n  if (biggestrun > MAXBADRUN) {\n    row->xheight = -1.0f; /*failed */\n  } else {\n    row->xheight = 1.0f; /*success */\n  }\n  poscount = negcount = 0;\n  bestneg = 0.0; /*no step yet */\n  for (partition = 0; partition < partcount; partition++) {\n    if (partition != bestpart) {\n      // by jetsoft divide by zero possible\n      if (partsizes[partition] == 0) {\n        partsteps[partition] = 0;\n      } else {\n        partsteps[partition] /= partsizes[partition];\n      }\n      //\n\n      if (partsteps[partition] >= MINASCRISE && partsizes[partition] > poscount) {\n        poscount = partsizes[partition];\n      }\n      if (partsteps[partition] <= -MINASCRISE && partsizes[partition] > negcount) {\n        /*ascender rise */\n        bestneg = 
partsteps[partition];\n        /*2nd most popular */\n        negcount = partsizes[partition];\n      }\n    }\n  }\n  /*average x-height */\n  partsteps[bestpart] /= blobcount;\n  row->descdrop = bestneg;\n}\n\n/**********************************************************************\n * old_first_xheight\n *\n * Makes an x-height spline by copying the baseline and shifting it.\n * It estimates the x-height across the line to use as the shift.\n * It also finds the ascender height if it can.\n **********************************************************************/\n\nvoid old_first_xheight( // the wiseowl way\n    TO_ROW *row,        /*current row */\n    TBOX blobcoords[],  /*blob bounding boxes */\n    int initialheight,  // initial guess\n    int blobcount,      /*blobs in blobcoords */\n    QSPLINE *baseline,  /*established */\n    float jumplimit     /*min ascender height */\n) {\n  int blobindex; /*current blob */\n                 /*height statistics */\n  STATS heightstat(0, MAXHEIGHT - 1);\n  int height;      /*height of blob */\n  int xcentre;     /*centre of blob */\n  int lineheight;  /*approx xheight */\n  float ascenders; /*ascender sum */\n  int asccount;    /*no of ascenders */\n  float xsum;      /*xheight sum */\n  int xcount;      /*xheight count */\n  float diff;      /*height difference */\n\n  if (blobcount > 1) {\n    for (blobindex = 0; blobindex < blobcount; blobindex++) {\n      xcentre = (blobcoords[blobindex].left() + blobcoords[blobindex].right()) / 2;\n      /*height of blob */\n      height = static_cast<int>(blobcoords[blobindex].top() - baseline->y(xcentre) + 0.5);\n      if (height > initialheight * oldbl_xhfract && height > textord_min_xheight) {\n        heightstat.add(height, 1);\n      }\n    }\n    if (heightstat.get_total() > 3) {\n      lineheight = static_cast<int>(heightstat.ile(0.25));\n      if (lineheight <= 0) {\n        lineheight = static_cast<int>(heightstat.ile(0.5));\n      }\n    } else {\n      lineheight = 
initialheight;\n    }\n  } else {\n    lineheight =\n        static_cast<int>(blobcoords[0].top() -\n                         baseline->y((blobcoords[0].left() + blobcoords[0].right()) / 2) + 0.5);\n  }\n\n  xsum = 0.0f;\n  xcount = 0;\n  for (ascenders = 0.0f, asccount = 0, blobindex = 0; blobindex < blobcount; blobindex++) {\n    xcentre = (blobcoords[blobindex].left() + blobcoords[blobindex].right()) / 2;\n    diff = blobcoords[blobindex].top() - baseline->y(xcentre);\n    /*is it ascender */\n    if (diff > lineheight + jumplimit) {\n      ascenders += diff;\n      asccount++; /*count ascenders */\n    } else if (diff > lineheight - jumplimit) {\n      xsum += diff; /*mean xheight */\n      xcount++;\n    }\n  }\n  if (xcount > 0) {\n    xsum /= xcount; /*average xheight */\n  } else {\n    xsum = static_cast<float>(lineheight); /*guess it */\n  }\n  row->xheight *= xsum;\n  if (asccount > 0) {\n    row->ascrise = ascenders / asccount - xsum;\n  } else {\n    row->ascrise = 0.0f; /*had none */\n  }\n  if (row->xheight == 0) {\n    row->xheight = -1.0f;\n  }\n}\n\n/**********************************************************************\n * make_first_xheight\n *\n * Makes an x-height spline by copying the baseline and shifting it.\n * It estimates the x-height across the line to use as the shift.\n * It also finds the ascender height if it can.\n **********************************************************************/\n\nvoid make_first_xheight( // find xheight\n    TO_ROW *row,         /*current row */\n    TBOX blobcoords[],   /*blob bounding boxes */\n    int lineheight,      // initial guess\n    int init_lineheight, // block level guess\n    int blobcount,       /*blobs in blobcoords */\n    QSPLINE *baseline,   /*established */\n    float jumplimit      /*min ascender height */\n) {\n  STATS heightstat(0, HEIGHTBUCKETS - 1);\n  int lefts[HEIGHTBUCKETS];\n  int rights[HEIGHTBUCKETS];\n  int modelist[MODENUM];\n  int blobindex;\n  int mode_count; // blobs to 
count in thr\n  int sign_bit;\n  int mode_threshold;\n  const int kBaselineTouch = 2;  // This really should change with resolution.\n  const int kGoodStrength = 8;   // Strength of baseline-touching heights.\n  const float kMinHeight = 0.25; // Min fraction of lineheight to use.\n\n  sign_bit = row->xheight > 0 ? 1 : -1;\n\n  memset(lefts, 0, HEIGHTBUCKETS * sizeof(lefts[0]));\n  memset(rights, 0, HEIGHTBUCKETS * sizeof(rights[0]));\n  mode_count = 0;\n  for (blobindex = 0; blobindex < blobcount; blobindex++) {\n    int xcenter = (blobcoords[blobindex].left() + blobcoords[blobindex].right()) / 2;\n    float base = baseline->y(xcenter);\n    float bottomdiff = std::fabs(base - blobcoords[blobindex].bottom());\n    int strength = textord_ocropus_mode && bottomdiff <= kBaselineTouch ? kGoodStrength : 1;\n    int height = static_cast<int>(blobcoords[blobindex].top() - base + 0.5);\n    if (blobcoords[blobindex].height() > init_lineheight * kMinHeight) {\n      if (height > lineheight * oldbl_xhfract && height > textord_min_xheight) {\n        heightstat.add(height, strength);\n        if (height < HEIGHTBUCKETS) {\n          if (xcenter > rights[height]) {\n            rights[height] = xcenter;\n          }\n          if (xcenter > 0 && (lefts[height] == 0 || xcenter < lefts[height])) {\n            lefts[height] = xcenter;\n          }\n        }\n      }\n      mode_count += strength;\n    }\n  }\n\n  mode_threshold = static_cast<int>(blobcount * 0.1);\n  if (oldbl_dot_error_size > 1 || oldbl_xhfix) {\n    mode_threshold = static_cast<int>(mode_count * 0.1);\n  }\n\n  if (textord_oldbl_debug) {\n    tprintf(\"blobcount=%d, mode_count=%d, mode_t=%d\\n\", blobcount, mode_count, mode_threshold);\n  }\n  find_top_modes(&heightstat, HEIGHTBUCKETS, modelist, MODENUM);\n  if (textord_oldbl_debug) {\n    for (blobindex = 0; blobindex < MODENUM; blobindex++) {\n      tprintf(\"mode[%d]=%d \", blobindex, modelist[blobindex]);\n    }\n    tprintf(\"\\n\");\n  }\n  
pick_x_height(row, modelist, lefts, rights, &heightstat, mode_threshold);\n\n  if (textord_oldbl_debug) {\n    tprintf(\"Output xheight=%g\\n\", row->xheight);\n  }\n  if (row->xheight < 0 && textord_oldbl_debug) {\n    tprintf(\"warning: Row Line height < 0; %4.2f\\n\", row->xheight);\n  }\n\n  if (sign_bit < 0) {\n    row->xheight = -row->xheight;\n  }\n}\n\n/**********************************************************************\n * find_top_modes\n *\n * Fill the input array with the indices of the top ten modes of the\n * input distribution.\n **********************************************************************/\n\nconst int kMinModeFactorOcropus = 32;\nconst int kMinModeFactor = 12;\n\nvoid find_top_modes(            // get modes\n    STATS *stats,               // stats to hack\n    int statnum,                // no of piles\n    int modelist[], int modenum // no of modes to get\n) {\n  int mode_count;\n  int last_i = 0;\n  int last_max = INT32_MAX;\n  int i;\n  int mode;\n  int total_max = 0;\n  int mode_factor = textord_ocropus_mode ? 
kMinModeFactorOcropus : kMinModeFactor;\n\n  for (mode_count = 0; mode_count < modenum; mode_count++) {\n    mode = 0;\n    for (i = 0; i < statnum; i++) {\n      if (stats->pile_count(i) > stats->pile_count(mode)) {\n        if ((stats->pile_count(i) < last_max) ||\n            ((stats->pile_count(i) == last_max) && (i > last_i))) {\n          mode = i;\n        }\n      }\n    }\n    last_i = mode;\n    last_max = stats->pile_count(last_i);\n    total_max += last_max;\n    if (last_max <= total_max / mode_factor) {\n      mode = 0;\n    }\n    modelist[mode_count] = mode;\n  }\n}\n\n/**********************************************************************\n * pick_x_height\n *\n * Choose based on the height modes the best x height value.\n **********************************************************************/\n\nvoid pick_x_height(TO_ROW *row, // row to do\n                   int modelist[], int lefts[], int rights[], STATS *heightstat,\n                   int mode_threshold) {\n  int x;\n  int y;\n  int z;\n  float ratio;\n  int found_one_bigger = false;\n  int best_x_height = 0;\n  int best_asc = 0;\n  int num_in_best;\n\n  for (x = 0; x < MODENUM; x++) {\n    for (y = 0; y < MODENUM; y++) {\n      /* Check for two modes */\n      if (modelist[x] && modelist[y] && heightstat->pile_count(modelist[x]) > mode_threshold &&\n          (!textord_ocropus_mode || std::min(rights[modelist[x]], rights[modelist[y]]) >\n                                        std::max(lefts[modelist[x]], lefts[modelist[y]]))) {\n        ratio = static_cast<float>(modelist[y]) / static_cast<float>(modelist[x]);\n        if (1.2 < ratio && ratio < 1.8) {\n          /* Two modes found */\n          best_x_height = modelist[x];\n          num_in_best = heightstat->pile_count(modelist[x]);\n\n          /* Try to get one higher */\n          do {\n            found_one_bigger = false;\n            for (z = 0; z < MODENUM; z++) {\n              if (modelist[z] == best_x_height + 1 &&\n             
     (!textord_ocropus_mode || std::min(rights[modelist[x]], rights[modelist[y]]) >\n                                                std::max(lefts[modelist[x]], lefts[modelist[y]]))) {\n                ratio = static_cast<float>(modelist[y]) / static_cast<float>(modelist[z]);\n                if ((1.2 < ratio && ratio < 1.8) &&\n                    /* Should be half of best */\n                    heightstat->pile_count(modelist[z]) > num_in_best * 0.5) {\n                  best_x_height++;\n                  found_one_bigger = true;\n                  break;\n                }\n              }\n            }\n          } while (found_one_bigger);\n\n          /* try to get a higher ascender */\n\n          best_asc = modelist[y];\n          num_in_best = heightstat->pile_count(modelist[y]);\n\n          /* Try to get one higher */\n          do {\n            found_one_bigger = false;\n            for (z = 0; z < MODENUM; z++) {\n              if (modelist[z] > best_asc &&\n                  (!textord_ocropus_mode || std::min(rights[modelist[x]], rights[modelist[y]]) >\n                                                std::max(lefts[modelist[x]], lefts[modelist[y]]))) {\n                ratio = static_cast<float>(modelist[z]) / static_cast<float>(best_x_height);\n                if ((1.2 < ratio && ratio < 1.8) &&\n                    /* Should be half of best */\n                    heightstat->pile_count(modelist[z]) > num_in_best * 0.5) {\n                  best_asc = modelist[z];\n                  found_one_bigger = true;\n                  break;\n                }\n              }\n            }\n          } while (found_one_bigger);\n\n          row->xheight = static_cast<float>(best_x_height);\n          row->ascrise = static_cast<float>(best_asc) - best_x_height;\n          return;\n        }\n      }\n    }\n  }\n\n  best_x_height = modelist[0]; /* Single Mode found */\n  num_in_best = heightstat->pile_count(best_x_height);\n  do {\n    /* Try to get 
one higher */\n    found_one_bigger = false;\n    for (z = 1; z < MODENUM; z++) {\n      /* Should be half of best */\n      if ((modelist[z] == best_x_height + 1) &&\n          (heightstat->pile_count(modelist[z]) > num_in_best * 0.5)) {\n        best_x_height++;\n        found_one_bigger = true;\n        break;\n      }\n    }\n  } while (found_one_bigger);\n\n  row->ascrise = 0.0f;\n  row->xheight = static_cast<float>(best_x_height);\n  if (row->xheight == 0) {\n    row->xheight = -1.0f;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/oldbasel.h",
    "content": "/**********************************************************************\n * File:        oldbasel.h  (Formerly oldbl.h)\n * Description: A re-implementation of the old baseline algorithm.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef OLDBASEL_H\n#define OLDBASEL_H\n\n#include \"blobbox.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\nextern BOOL_VAR_H(textord_oldbl_debug);\n\nint get_blob_coords(    // get boxes\n    TO_ROW *row,        // row to use\n    int32_t lineheight, // block level\n    TBOX *blobcoords,   // output boxes\n    bool &holed_line,   // lost a lot of blobs\n    int &outcount       // no of real blobs\n);\nvoid make_first_baseline( // initial approximation\n    TBOX blobcoords[],    /*blob bounding boxes */\n    int blobcount,        /*no of blobcoords */\n    int xcoords[],        /*coords for spline */\n    int ycoords[],        /*approximator */\n    QSPLINE *spline,      /*initial spline */\n    QSPLINE *baseline,    /*output spline */\n    float jumplimit       /*guess half descenders */\n);\nvoid make_holed_baseline( // initial approximation\n    TBOX blobcoords[],    /*blob bounding boxes */\n    int blobcount,        /*no of blobcoords */\n    QSPLINE *spline,      /*initial spline */\n    QSPLINE *baseline,    /*output spline */\n    float 
gradient        // of line\n);\nint partition_line(    // partition blobs\n    TBOX blobcoords[], // bounding boxes\n    int blobcount,     /*no of blobs on row */\n    int *numparts,     /*number of partitions */\n    char partids[],    /*partition no of each blob */\n    int partsizes[],   /*no in each partition */\n    QSPLINE *spline,   /*curve to fit to */\n    float jumplimit,   /*allowed delta change */\n    float ydiffs[]     /*diff from spline */\n);\nvoid merge_oldbl_parts( // partition blobs\n    TBOX blobcoords[],  // bounding boxes\n    int blobcount,      /*no of blobs on row */\n    char partids[],     /*partition no of each blob */\n    int partsizes[],    /*no in each partition */\n    int biggestpart,    // major partition\n    float jumplimit     /*allowed delta change */\n);\nint get_ydiffs(        // evaluate differences\n    TBOX blobcoords[], // bounding boxes\n    int blobcount,     /*no of blobs */\n    QSPLINE *spline,   /*approximating spline */\n    float ydiffs[]     /*output */\n);\nint choose_partition(                               // select partition\n    float diff,                                     /*diff from spline */\n    float partdiffs[],                              /*diff on all parts */\n    int lastpart,                                   /*last assigned partition */\n    float jumplimit,                                /*new part threshold */\n    float *drift, float *last_delta, int *partcount /*no of partitions */\n);\nint partition_coords(  // find relevant coords\n    TBOX blobcoords[], // bounding boxes\n    int blobcount,     /*no of blobs in row */\n    char partids[],    /*partition no of each blob */\n    int bestpart,      /*best new partition */\n    int xcoords[],     /*points to work on */\n    int ycoords[]      /*points to work on */\n);\nint segment_spline(             // make xstarts\n    TBOX blobcoords[],          // boundign boxes\n    int blobcount,              /*no of blobs in row */\n    int 
xcoords[],              /*points to work on */\n    int ycoords[],              /*points to work on */\n    int degree, int pointcount, /*no of points */\n    int xstarts[]               // result\n);\nbool split_stepped_spline( // make xstarts\n    QSPLINE *baseline,     // current shot\n    float jumplimit,       // max step function\n    int *xcoords,          /*points to work on */\n    int *xstarts,          // result\n    int &segments          // no of segments\n);\nvoid insert_spline_point(     // get descenders\n    int xstarts[],            // starts to shuffle\n    int segment,              // insertion pt\n    int coord1,               // coords to add\n    int coord2, int &segments // total segments\n);\nvoid find_lesser_parts( // get descenders\n    TO_ROW *row,        // row to process\n    TBOX blobcoords[],  // bounding boxes\n    int blobcount,      /*no of blobs */\n    char partids[],     /*partition of each blob */\n    int partsizes[],    /*size of each part */\n    int partcount,      /*no of partitions */\n    int bestpart        /*biggest partition */\n);\n\nvoid old_first_xheight( // the wiseowl way\n    TO_ROW *row,        /*current row */\n    TBOX blobcoords[],  /*blob bounding boxes */\n    int initialheight,  // initial guess\n    int blobcount,      /*blobs in blobcoords */\n    QSPLINE *baseline,  /*established */\n    float jumplimit     /*min ascender height */\n);\n\nvoid make_first_xheight( // find xheight\n    TO_ROW *row,         /*current row */\n    TBOX blobcoords[],   /*blob bounding boxes */\n    int lineheight,      // initial guess\n    int init_lineheight, // block level guess\n    int blobcount,       /*blobs in blobcoords */\n    QSPLINE *baseline,   /*established */\n    float jumplimit      /*min ascender height */\n);\n\nint *make_height_array( // get array of heights\n    TBOX blobcoords[],  /*blob bounding boxes */\n    int blobcount,      /*blobs in blobcoords */\n    QSPLINE *baseline   /*established 
*/\n);\n\nvoid find_top_modes(            // get modes\n    STATS *stats,               // stats to hack\n    int statnum,                // no of piles\n    int modelist[], int modenum // no of modes to get\n);\n\nvoid pick_x_height(TO_ROW *row, // row to do\n                   int modelist[], int lefts[], int rights[], STATS *heightstat,\n                   int mode_threshold);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/pithsync.cpp",
    "content": "/**********************************************************************\n * File:        pithsync.cpp  (Formerly pitsync2.c)\n * Description: Code to find the optimum fixed pitch segmentation of some blobs.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"pithsync.h\"\n\n#include \"makerow.h\"\n#include \"pitsync1.h\"\n#include \"topitch.h\"\n#include \"tprintf.h\"\n\n#include <cfloat> // for FLT_MAX\n#include <cmath>\n#include <vector> // for std::vector\n\nnamespace tesseract {\n\n/**********************************************************************\n * FPCUTPT::setup\n *\n * Constructor to make a new FPCUTPT.\n **********************************************************************/\n\nvoid FPCUTPT::setup(      // constructor\n    FPCUTPT *cutpts,      // predecessors\n    int16_t array_origin, // start coord\n    STATS *projection,    // vertical occupation\n    int16_t zero_count,   // official zero\n    int16_t pitch,        // proposed pitch\n    int16_t x,            // position\n    int16_t offset        // dist to gap\n) {\n  // half of pitch\n  int16_t half_pitch = pitch / 2 - 1;\n  uint32_t lead_flag; // new flag\n  int32_t ind;        // current position\n\n  if (half_pitch > 31) {\n    half_pitch = 31;\n  } else if (half_pitch < 0) {\n    half_pitch = 0;\n  }\n  
lead_flag = 1 << half_pitch;\n\n  pred = nullptr;\n  mean_sum = 0;\n  sq_sum = offset * offset;\n  cost = sq_sum;\n  faked = false;\n  terminal = false;\n  fake_count = 0;\n  xpos = x;\n  region_index = 0;\n  mid_cuts = 0;\n  if (x == array_origin) {\n    back_balance = 0;\n    fwd_balance = 0;\n    for (ind = 0; ind <= half_pitch; ind++) {\n      fwd_balance >>= 1;\n      if (projection->pile_count(ind) > zero_count) {\n        fwd_balance |= lead_flag;\n      }\n    }\n  } else {\n    back_balance = cutpts[x - 1 - array_origin].back_balance << 1;\n    back_balance &= lead_flag + (lead_flag - 1);\n    if (projection->pile_count(x) > zero_count) {\n      back_balance |= 1;\n    }\n    fwd_balance = cutpts[x - 1 - array_origin].fwd_balance >> 1;\n    if (projection->pile_count(x + half_pitch) > zero_count) {\n      fwd_balance |= lead_flag;\n    }\n  }\n}\n\n/**********************************************************************\n * FPCUTPT::assign\n *\n * Constructor to make a new FPCUTPT.\n **********************************************************************/\n\nvoid FPCUTPT::assign(       // constructor\n    FPCUTPT *cutpts,        // predecessors\n    int16_t array_origin,   // start coord\n    int16_t x,              // position\n    bool faking,            // faking this one\n    bool mid_cut,           // cheap cut.\n    int16_t offset,         // dist to gap\n    STATS *projection,      // vertical occupation\n    float projection_scale, // scaling\n    int16_t zero_count,     // official zero\n    int16_t pitch,          // proposed pitch\n    int16_t pitch_error     // allowed tolerance\n) {\n  int index;             // test index\n  int balance_index;     // for balance factor\n  int16_t balance_count; // ding factor\n  int16_t r_index;       // test cut number\n  FPCUTPT *segpt;        // segment point\n  int32_t dist;          // from prev segment\n  double sq_dist;        // squared distance\n  double mean;           // mean pitch\n  double total;    
      // total dists\n  double factor;         // cost function\n                         // half of pitch\n  int16_t half_pitch = pitch / 2 - 1;\n  uint32_t lead_flag; // new flag\n\n  if (half_pitch > 31) {\n    half_pitch = 31;\n  } else if (half_pitch < 0) {\n    half_pitch = 0;\n  }\n  lead_flag = 1 << half_pitch;\n\n  back_balance = cutpts[x - 1 - array_origin].back_balance << 1;\n  back_balance &= lead_flag + (lead_flag - 1);\n  if (projection->pile_count(x) > zero_count) {\n    back_balance |= 1;\n  }\n  fwd_balance = cutpts[x - 1 - array_origin].fwd_balance >> 1;\n  if (projection->pile_count(x + half_pitch) > zero_count) {\n    fwd_balance |= lead_flag;\n  }\n\n  xpos = x;\n  cost = FLT_MAX;\n  pred = nullptr;\n  faked = faking;\n  terminal = false;\n  region_index = 0;\n  fake_count = INT16_MAX;\n  for (index = x - pitch - pitch_error; index <= x - pitch + pitch_error; index++) {\n    if (index >= array_origin) {\n      segpt = &cutpts[index - array_origin];\n      dist = x - segpt->xpos;\n      if (!segpt->terminal && segpt->fake_count < INT16_MAX) {\n        balance_count = 0;\n        if (textord_balance_factor > 0) {\n          if (textord_fast_pitch_test) {\n            lead_flag = back_balance ^ segpt->fwd_balance;\n            balance_count = 0;\n            while (lead_flag != 0) {\n              balance_count++;\n              lead_flag &= lead_flag - 1;\n            }\n          } else {\n            for (balance_index = 0; index + balance_index < x - balance_index; balance_index++) {\n              balance_count += (projection->pile_count(index + balance_index) <= zero_count) ^\n                               (projection->pile_count(x - balance_index) <= zero_count);\n            }\n          }\n          balance_count =\n              static_cast<int16_t>(balance_count * textord_balance_factor / projection_scale);\n        }\n        r_index = segpt->region_index + 1;\n        total = segpt->mean_sum + dist;\n        balance_count += 
offset;\n        sq_dist = dist * dist + segpt->sq_sum + balance_count * balance_count;\n        mean = total / r_index;\n        factor = mean - pitch;\n        factor *= factor;\n        factor += sq_dist / (r_index)-mean * mean;\n        if (factor < cost && segpt->fake_count + faked <= fake_count) {\n          cost = factor; // find least cost\n          pred = segpt;  // save path\n          mean_sum = total;\n          sq_sum = sq_dist;\n          fake_count = segpt->fake_count + faked;\n          mid_cuts = segpt->mid_cuts + mid_cut;\n          region_index = r_index;\n        }\n      }\n    }\n  }\n}\n\n/**********************************************************************\n * FPCUTPT::assign_cheap\n *\n * Constructor to make a new FPCUTPT on the cheap.\n **********************************************************************/\n\nvoid FPCUTPT::assign_cheap( // constructor\n    FPCUTPT *cutpts,        // predecessors\n    int16_t array_origin,   // start coord\n    int16_t x,              // position\n    bool faking,            // faking this one\n    bool mid_cut,           // cheap cut.\n    int16_t offset,         // dist to gap\n    STATS *projection,      // vertical occupation\n    float projection_scale, // scaling\n    int16_t zero_count,     // official zero\n    int16_t pitch,          // proposed pitch\n    int16_t pitch_error     // allowed tolerance\n) {\n  int index;             // test index\n  int16_t balance_count; // ding factor\n  int16_t r_index;       // test cut number\n  FPCUTPT *segpt;        // segment point\n  int32_t dist;          // from prev segment\n  double sq_dist;        // squared distance\n  double mean;           // mean pitch\n  double total;          // total dists\n  double factor;         // cost function\n                         // half of pitch\n  int16_t half_pitch = pitch / 2 - 1;\n  uint32_t lead_flag; // new flag\n\n  if (half_pitch > 31) {\n    half_pitch = 31;\n  } else if (half_pitch < 0) {\n    half_pitch 
= 0;\n  }\n  lead_flag = 1 << half_pitch;\n\n  back_balance = cutpts[x - 1 - array_origin].back_balance << 1;\n  back_balance &= lead_flag + (lead_flag - 1);\n  if (projection->pile_count(x) > zero_count) {\n    back_balance |= 1;\n  }\n  fwd_balance = cutpts[x - 1 - array_origin].fwd_balance >> 1;\n  if (projection->pile_count(x + half_pitch) > zero_count) {\n    fwd_balance |= lead_flag;\n  }\n\n  xpos = x;\n  cost = FLT_MAX;\n  pred = nullptr;\n  faked = faking;\n  terminal = false;\n  region_index = 0;\n  fake_count = INT16_MAX;\n  index = x - pitch;\n  if (index >= array_origin) {\n    segpt = &cutpts[index - array_origin];\n    dist = x - segpt->xpos;\n    if (!segpt->terminal && segpt->fake_count < INT16_MAX) {\n      balance_count = 0;\n      if (textord_balance_factor > 0) {\n        lead_flag = back_balance ^ segpt->fwd_balance;\n        balance_count = 0;\n        while (lead_flag != 0) {\n          balance_count++;\n          lead_flag &= lead_flag - 1;\n        }\n        balance_count =\n            static_cast<int16_t>(balance_count * textord_balance_factor / projection_scale);\n      }\n      r_index = segpt->region_index + 1;\n      total = segpt->mean_sum + dist;\n      balance_count += offset;\n      sq_dist = dist * dist + segpt->sq_sum + balance_count * balance_count;\n      mean = total / r_index;\n      factor = mean - pitch;\n      factor *= factor;\n      factor += sq_dist / (r_index)-mean * mean;\n      cost = factor; // find least cost\n      pred = segpt;  // save path\n      mean_sum = total;\n      sq_sum = sq_dist;\n      fake_count = segpt->fake_count + faked;\n      mid_cuts = segpt->mid_cuts + mid_cut;\n      region_index = r_index;\n    }\n  }\n}\n\n/**********************************************************************\n * check_pitch_sync\n *\n * Construct the lattice of possible segmentation points and choose the\n * optimal path. 
Return the optimal path only.\n * The return value is a measure of goodness of the sync.\n **********************************************************************/\n\ndouble check_pitch_sync2(    // find segmentation\n    BLOBNBOX_IT *blob_it,    // blobs to do\n    int16_t blob_count,      // no of blobs\n    int16_t pitch,           // pitch estimate\n    int16_t pitch_error,     // tolerance\n    STATS *projection,       // vertical\n    int16_t projection_left, // edges //scale factor\n    int16_t projection_right, float projection_scale,\n    int16_t &occupation_count, // no of occupied cells\n    FPSEGPT_LIST *seg_list,    // output list\n    int16_t start,             // start of good range\n    int16_t end                // end of good range\n) {\n  bool faking;                  // illegal cut pt\n  bool mid_cut;                 // cheap cut pt.\n  int16_t x;                    // current coord\n  int16_t blob_index;           // blob number\n  int16_t left_edge;            // of word\n  int16_t right_edge;           // of word\n  int16_t array_origin;         // x coord of array\n  int16_t offset;               // dist to legal area\n  int16_t zero_count;           // projection zero\n  int16_t best_left_x = 0;      // for equals\n  int16_t best_right_x = 0;     // right edge\n  TBOX this_box;                // bounding box\n  TBOX next_box;                // box of next blob\n  FPSEGPT *segpt;               // segment point\n  double mean_sum;              // computes result\n  int16_t best_fake;            // best fake level\n  int16_t best_count;           // no of cuts\n  BLOBNBOX_IT this_it;          // copy iterator\n  FPSEGPT_IT seg_it = seg_list; // output iterator\n\n  //      tprintf(\"Computing sync on word of %d blobs with pitch %d\\n\",\n  //              blob_count, pitch);\n  //      if (blob_count==8 && pitch==27)\n  //              projection->print(stdout,true);\n  zero_count = 0;\n  if (pitch < 3) {\n    pitch = 3; // nothing ludicrous\n  
}\n  if ((pitch - 3) / 2 < pitch_error) {\n    pitch_error = (pitch - 3) / 2;\n  }\n  this_it = *blob_it;\n  this_box = box_next(&this_it); // get box\n  //      left_edge=this_box.left(); //left of word right_edge=this_box.right();\n  //      for (blob_index=1;blob_index<blob_count;blob_index++)\n  //      {\n  //              this_box=box_next(&this_it);\n  //              if (this_box.right()>right_edge)\n  //                      right_edge=this_box.right();\n  //      }\n  for (left_edge = projection_left;\n       projection->pile_count(left_edge) == 0 && left_edge < projection_right; left_edge++) {\n    ;\n  }\n  for (right_edge = projection_right;\n       projection->pile_count(right_edge) == 0 && right_edge > left_edge; right_edge--) {\n    ;\n  }\n  ASSERT_HOST(right_edge >= left_edge);\n  if (pitsync_linear_version >= 4) {\n    return check_pitch_sync3(projection_left, projection_right, zero_count, pitch, pitch_error,\n                             projection, projection_scale, occupation_count, seg_list, start, end);\n  }\n  array_origin = left_edge - pitch;\n  // array of points\n  std::vector<FPCUTPT> cutpts(right_edge - left_edge + pitch * 2 + 1);\n  for (x = array_origin; x < left_edge; x++) {\n    // free cuts\n    cutpts[x - array_origin].setup(&cutpts[0], array_origin, projection, zero_count, pitch, x, 0);\n  }\n  for (offset = 0; offset <= pitch_error; offset++, x++) {\n    // not quite free\n    cutpts[x - array_origin].setup(&cutpts[0], array_origin, projection, zero_count, pitch, x,\n                                   offset);\n  }\n\n  this_it = *blob_it;\n  this_box = box_next(&this_it); // first box\n  next_box = box_next(&this_it); // second box\n  blob_index = 1;\n  while (x < right_edge - pitch_error) {\n    if (x > this_box.right() + pitch_error && blob_index < blob_count) {\n      this_box = next_box;\n      next_box = box_next(&this_it);\n      blob_index++;\n    }\n    faking = false;\n    mid_cut = false;\n    if (x <= 
this_box.left()) {\n      offset = 0;\n    } else if (x <= this_box.left() + pitch_error) {\n      offset = x - this_box.left();\n    } else if (x >= this_box.right()) {\n      offset = 0;\n    } else if (x >= next_box.left() && blob_index < blob_count) {\n      offset = x - next_box.left();\n      if (this_box.right() - x < offset) {\n        offset = this_box.right() - x;\n      }\n    } else if (x >= this_box.right() - pitch_error) {\n      offset = this_box.right() - x;\n    } else if (x - this_box.left() > pitch * pitsync_joined_edge &&\n               this_box.right() - x > pitch * pitsync_joined_edge) {\n      mid_cut = true;\n      offset = 0;\n    } else {\n      faking = true;\n      offset = projection->pile_count(x);\n    }\n    cutpts[x - array_origin].assign(&cutpts[0], array_origin, x, faking, mid_cut, offset,\n                                    projection, projection_scale, zero_count, pitch, pitch_error);\n    x++;\n  }\n\n  best_fake = INT16_MAX;\n  // best path\n  double best_cost = INT32_MAX;\n  best_count = INT16_MAX;\n  while (x < right_edge + pitch) {\n    offset = x < right_edge ? 
right_edge - x : 0;\n    cutpts[x - array_origin].assign(&cutpts[0], array_origin, x, false, false, offset, projection,\n                                    projection_scale, zero_count, pitch, pitch_error);\n    cutpts[x - array_origin].terminal = true;\n    if (cutpts[x - array_origin].index() + cutpts[x - array_origin].fake_count <=\n        best_count + best_fake) {\n      if (cutpts[x - array_origin].fake_count < best_fake ||\n          (cutpts[x - array_origin].fake_count == best_fake &&\n           cutpts[x - array_origin].cost_function() < best_cost)) {\n        best_fake = cutpts[x - array_origin].fake_count;\n        best_cost = cutpts[x - array_origin].cost_function();\n        best_left_x = x;\n        best_right_x = x;\n        best_count = cutpts[x - array_origin].index();\n      } else if (cutpts[x - array_origin].fake_count == best_fake && x == best_right_x + 1 &&\n                 cutpts[x - array_origin].cost_function() == best_cost) {\n        // exactly equal\n        best_right_x = x;\n      }\n    }\n    x++;\n  }\n  ASSERT_HOST(best_fake < INT16_MAX);\n\n  // end of best path\n  FPCUTPT *best_end = &cutpts[(best_left_x + best_right_x) / 2 - array_origin];\n  if (this_box.right() == textord_test_x && this_box.top() == textord_test_y) {\n    for (x = left_edge - pitch; x < right_edge + pitch; x++) {\n      tprintf(\"x=%d, C=%g, s=%g, sq=%g, prev=%d\\n\", x, cutpts[x - array_origin].cost_function(),\n              cutpts[x - array_origin].sum(), cutpts[x - array_origin].squares(),\n              cutpts[x - array_origin].previous()->position());\n    }\n  }\n  occupation_count = -1;\n  do {\n    for (x = best_end->position() - pitch + pitch_error;\n         x < best_end->position() - pitch_error && projection->pile_count(x) == 0; x++) {\n      ;\n    }\n    if (x < best_end->position() - pitch_error) {\n      occupation_count++;\n    }\n    // copy it\n    segpt = new FPSEGPT(best_end);\n    seg_it.add_before_then_move(segpt);\n    best_end = 
best_end->previous();\n  } while (best_end != nullptr);\n  seg_it.move_to_last();\n  mean_sum = seg_it.data()->sum();\n  mean_sum = mean_sum * mean_sum / best_count;\n  if (seg_it.data()->squares() - mean_sum < 0) {\n    tprintf(\"Impossible sqsum=%g, mean=%g, total=%d\\n\", seg_it.data()->squares(),\n            seg_it.data()->sum(), best_count);\n  }\n  //      tprintf(\"blob_count=%d, pitch=%d, sync=%g, occ=%d\\n\",\n  //              blob_count,pitch,seg_it.data()->squares()-mean_sum,\n  //              occupation_count);\n  return seg_it.data()->squares() - mean_sum;\n}\n\n/**********************************************************************\n * check_pitch_sync\n *\n * Construct the lattice of possible segmentation points and choose the\n * optimal path. Return the optimal path only.\n * The return value is a measure of goodness of the sync.\n **********************************************************************/\n\ndouble check_pitch_sync3(    // find segmentation\n    int16_t projection_left, // edges //to be considered 0\n    int16_t projection_right, int16_t zero_count,\n    int16_t pitch,             // pitch estimate\n    int16_t pitch_error,       // tolerance\n    STATS *projection,         // vertical\n    float projection_scale,    // scale factor\n    int16_t &occupation_count, // no of occupied cells\n    FPSEGPT_LIST *seg_list,    // output list\n    int16_t start,             // start of good range\n    int16_t end                // end of good range\n) {\n  bool faking;                  // illegal cut pt\n  bool mid_cut;                 // cheap cut pt.\n  int16_t left_edge;            // of word\n  int16_t right_edge;           // of word\n  int16_t x;                    // current coord\n  int16_t array_origin;         // x coord of array\n  int16_t offset;               // dist to legal area\n  int16_t projection_offset;    // from scaled projection\n  int16_t prev_zero;            // previous zero dist\n  int16_t next_zero;            // 
next zero dist\n  int16_t zero_offset;          // scan window\n  int16_t best_left_x = 0;      // for equals\n  int16_t best_right_x = 0;     // right edge\n  FPSEGPT *segpt;               // segment point\n  int minindex;                 // next input position\n  int test_index;               // index to mins\n  double mean_sum;              // computes result\n  int16_t best_fake;            // best fake level\n  int16_t best_count;           // no of cuts\n  FPSEGPT_IT seg_it = seg_list; // output iterator\n\n  end = (end - start) % pitch;\n  if (pitch < 3) {\n    pitch = 3; // nothing ludicrous\n  }\n  if ((pitch - 3) / 2 < pitch_error) {\n    pitch_error = (pitch - 3) / 2;\n  }\n  // min dist of zero\n  zero_offset = static_cast<int16_t>(pitch * pitsync_joined_edge);\n  for (left_edge = projection_left;\n       projection->pile_count(left_edge) == 0 && left_edge < projection_right; left_edge++) {\n    ;\n  }\n  for (right_edge = projection_right;\n       projection->pile_count(right_edge) == 0 && right_edge > left_edge; right_edge--) {\n    ;\n  }\n  array_origin = left_edge - pitch;\n  // array of points\n  std::vector<FPCUTPT> cutpts(right_edge - left_edge + pitch * 2 + 1);\n  // local min results\n  std::vector<bool> mins(pitch_error * 2 + 1);\n  for (x = array_origin; x < left_edge; x++) {\n    // free cuts\n    cutpts[x - array_origin].setup(&cutpts[0], array_origin, projection, zero_count, pitch, x, 0);\n  }\n  prev_zero = left_edge - 1;\n  for (offset = 0; offset <= pitch_error; offset++, x++) {\n    // not quite free\n    cutpts[x - array_origin].setup(&cutpts[0], array_origin, projection, zero_count, pitch, x,\n                                   offset);\n  }\n\n  for (offset = -pitch_error, minindex = 0; offset < pitch_error; offset++, minindex++) {\n    mins[minindex] = projection->local_min(x + offset);\n  }\n  next_zero = x + zero_offset + 1;\n  for (offset = next_zero - 1; offset >= x; offset--) {\n    if (projection->pile_count(offset) <= 
zero_count) {\n      next_zero = offset;\n      break;\n    }\n  }\n  while (x < right_edge - pitch_error) {\n    mins[minindex] = projection->local_min(x + pitch_error);\n    minindex++;\n    if (minindex > pitch_error * 2) {\n      minindex = 0;\n    }\n    faking = false;\n    mid_cut = false;\n    offset = 0;\n    if (projection->pile_count(x) <= zero_count) {\n      prev_zero = x;\n    } else {\n      for (offset = 1; offset <= pitch_error; offset++) {\n        if (projection->pile_count(x + offset) <= zero_count ||\n            projection->pile_count(x - offset) <= zero_count) {\n          break;\n        }\n      }\n    }\n    if (offset > pitch_error) {\n      if (x - prev_zero > zero_offset && next_zero - x > zero_offset) {\n        for (offset = 0; offset <= pitch_error; offset++) {\n          test_index = minindex + pitch_error + offset;\n          if (test_index > pitch_error * 2) {\n            test_index -= pitch_error * 2 + 1;\n          }\n          if (mins[test_index]) {\n            break;\n          }\n          test_index = minindex + pitch_error - offset;\n          if (test_index > pitch_error * 2) {\n            test_index -= pitch_error * 2 + 1;\n          }\n          if (mins[test_index]) {\n            break;\n          }\n        }\n      }\n      if (offset > pitch_error) {\n        offset = projection->pile_count(x);\n        faking = true;\n      } else {\n        projection_offset = static_cast<int16_t>(projection->pile_count(x) / projection_scale);\n        if (projection_offset > offset) {\n          offset = projection_offset;\n        }\n        mid_cut = true;\n      }\n    }\n    if ((start == 0 && end == 0) || !textord_fast_pitch_test ||\n        (x - projection_left - start) % pitch <= end) {\n      cutpts[x - array_origin].assign(&cutpts[0], array_origin, x, faking, mid_cut, offset,\n                                      projection, projection_scale, zero_count, pitch, pitch_error);\n    } else {\n      cutpts[x - 
array_origin].assign_cheap(&cutpts[0], array_origin, x, faking, mid_cut, offset,\n                                            projection, projection_scale, zero_count, pitch,\n                                            pitch_error);\n    }\n    x++;\n    if (next_zero < x || next_zero == x + zero_offset) {\n      next_zero = x + zero_offset + 1;\n    }\n    if (projection->pile_count(x + zero_offset) <= zero_count) {\n      next_zero = x + zero_offset;\n    }\n  }\n\n  best_fake = INT16_MAX;\n  // best path\n  double best_cost = INT32_MAX;\n  best_count = INT16_MAX;\n  while (x < right_edge + pitch) {\n    offset = x < right_edge ? right_edge - x : 0;\n    cutpts[x - array_origin].assign(&cutpts[0], array_origin, x, false, false, offset, projection,\n                                    projection_scale, zero_count, pitch, pitch_error);\n    cutpts[x - array_origin].terminal = true;\n    if (cutpts[x - array_origin].index() + cutpts[x - array_origin].fake_count <=\n        best_count + best_fake) {\n      if (cutpts[x - array_origin].fake_count < best_fake ||\n          (cutpts[x - array_origin].fake_count == best_fake &&\n           cutpts[x - array_origin].cost_function() < best_cost)) {\n        best_fake = cutpts[x - array_origin].fake_count;\n        best_cost = cutpts[x - array_origin].cost_function();\n        best_left_x = x;\n        best_right_x = x;\n        best_count = cutpts[x - array_origin].index();\n      } else if (cutpts[x - array_origin].fake_count == best_fake && x == best_right_x + 1 &&\n                 cutpts[x - array_origin].cost_function() == best_cost) {\n        // exactly equal\n        best_right_x = x;\n      }\n    }\n    x++;\n  }\n  ASSERT_HOST(best_fake < INT16_MAX);\n\n  // end of best path\n  FPCUTPT *best_end = &cutpts[(best_left_x + best_right_x) / 2 - array_origin];\n  //      for (x=left_edge-pitch;x<right_edge+pitch;x++)\n  //      {\n  //              tprintf(\"x=%d, C=%g, s=%g, sq=%g, prev=%d\\n\",\n  //                  
    x,cutpts[x-array_origin].cost_function(),\n  //                      cutpts[x-array_origin].sum(),\n  //                      cutpts[x-array_origin].squares(),\n  //                      cutpts[x-array_origin].previous()->position());\n  //      }\n  occupation_count = -1;\n  do {\n    for (x = best_end->position() - pitch + pitch_error;\n         x < best_end->position() - pitch_error && projection->pile_count(x) == 0; x++) {\n    }\n    if (x < best_end->position() - pitch_error) {\n      occupation_count++;\n    }\n    // copy it\n    segpt = new FPSEGPT(best_end);\n    seg_it.add_before_then_move(segpt);\n    best_end = best_end->previous();\n  } while (best_end != nullptr);\n  seg_it.move_to_last();\n  mean_sum = seg_it.data()->sum();\n  mean_sum = mean_sum * mean_sum / best_count;\n  if (seg_it.data()->squares() - mean_sum < 0) {\n    tprintf(\"Impossible sqsum=%g, mean=%g, total=%d\\n\", seg_it.data()->squares(),\n            seg_it.data()->sum(), best_count);\n  }\n  return seg_it.data()->squares() - mean_sum;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/pithsync.h",
    "content": "/**********************************************************************\n * File:        pithsync.h  (Formerly pitsync2.h)\n * Description: Code to find the optimum fixed pitch segmentation of some blobs.\n * Author:    Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef PITHSYNC_H\n#define PITHSYNC_H\n\n#include \"blobbox.h\"\n#include \"params.h\"\n#include \"statistc.h\"\n\nnamespace tesseract {\n\nclass FPSEGPT_LIST;\n\nclass FPCUTPT {\npublic:\n  FPCUTPT() = default;\n  void setup(               // start of cut\n      FPCUTPT cutpts[],     // predecessors\n      int16_t array_origin, // start coord\n      STATS *projection,    // occupation\n      int16_t zero_count,   // official zero\n      int16_t pitch,        // proposed pitch\n      int16_t x,            // position\n      int16_t offset);      // dist to gap\n\n  void assign(                // evaluate cut\n      FPCUTPT cutpts[],       // predecessors\n      int16_t array_origin,   // start coord\n      int16_t x,              // position\n      bool faking,            // faking this one\n      bool mid_cut,           // doing free cut\n      int16_t offset,         // extra cost dist\n      STATS *projection,      // occupation\n      float projection_scale, // scaling\n      int16_t zero_count,     // official zero\n      int16_t 
pitch,          // proposed pitch\n      int16_t pitch_error);   // allowed tolerance\n\n  void assign_cheap(          // evaluate cut\n      FPCUTPT cutpts[],       // predecessors\n      int16_t array_origin,   // start coord\n      int16_t x,              // position\n      bool faking,            // faking this one\n      bool mid_cut,           // doing free cut\n      int16_t offset,         // extra cost dist\n      STATS *projection,      // occupation\n      float projection_scale, // scaling\n      int16_t zero_count,     // official zero\n      int16_t pitch,          // proposed pitch\n      int16_t pitch_error);   // allowed tolerance\n\n  int32_t position() { // access func\n    return xpos;\n  }\n  double cost_function() {\n    return cost;\n  }\n  double squares() {\n    return sq_sum;\n  }\n  double sum() {\n    return mean_sum;\n  }\n  FPCUTPT *previous() {\n    return pred;\n  }\n  int16_t cheap_cuts() const { // no of mi cuts\n    return mid_cuts;\n  }\n  int16_t index() const {\n    return region_index;\n  }\n\n  bool faked;         // faked split point\n  bool terminal;      // successful end\n  int16_t fake_count; // total fakes to here\n\nprivate:\n  int16_t region_index;  // cut serial number\n  int16_t mid_cuts;      // no of cheap cuts\n  int32_t xpos;          // location\n  uint32_t back_balance; // proj backwards\n  uint32_t fwd_balance;  // proj forwards\n  FPCUTPT *pred;         // optimal previous\n  double mean_sum;       // mean so far\n  double sq_sum;         // summed distsances\n  double cost;           // cost function\n};\ndouble check_pitch_sync2(    // find segmentation\n    BLOBNBOX_IT *blob_it,    // blobs to do\n    int16_t blob_count,      // no of blobs\n    int16_t pitch,           // pitch estimate\n    int16_t pitch_error,     // tolerance\n    STATS *projection,       // vertical\n    int16_t projection_left, // edges //scale factor\n    int16_t projection_right, float projection_scale,\n    int16_t 
&occupation_count, // no of occupied cells\n    FPSEGPT_LIST *seg_list,    // output list\n    int16_t start,             // start of good range\n    int16_t end                // end of good range\n);\ndouble check_pitch_sync3(    // find segmentation\n    int16_t projection_left, // edges //to be considered 0\n    int16_t projection_right, int16_t zero_count,\n    int16_t pitch,             // pitch estimate\n    int16_t pitch_error,       // tolerance\n    STATS *projection,         // vertical\n    float projection_scale,    // scale factor\n    int16_t &occupation_count, // no of occupied cells\n    FPSEGPT_LIST *seg_list,    // output list\n    int16_t start,             // start of good range\n    int16_t end                // end of good range\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/pitsync1.cpp",
    "content": "/**********************************************************************\n * File:        pitsync1.cpp  (Formerly pitsync.c)\n * Description: Code to find the optimum fixed pitch segmentation of some blobs.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"pitsync1.h\"\n\n#include <cfloat> // for FLT_MAX\n#include <cmath>\n\nnamespace tesseract {\n\nINT_VAR(pitsync_linear_version, 6, \"Use new fast algorithm\");\ndouble_VAR(pitsync_joined_edge, 0.75, \"Dist inside big blob for chopping\");\ndouble_VAR(pitsync_offset_freecut_fraction, 0.25, \"Fraction of cut for free cuts\");\n\n/**********************************************************************\n * FPSEGPT::FPSEGPT\n *\n * Constructor to make a new FPSEGPT.\n * The existing FPCUTPT is duplicated.\n **********************************************************************/\n\nFPSEGPT::FPSEGPT(  // constructor\n    FPCUTPT *cutpt // create from new form\n) {\n  pred = nullptr;\n  mean_sum = cutpt->sum();\n  sq_sum = cutpt->squares();\n  cost = cutpt->cost_function();\n  faked = cutpt->faked;\n  terminal = cutpt->terminal;\n  fake_count = cutpt->fake_count;\n  xpos = cutpt->position();\n  mid_cuts = cutpt->cheap_cuts();\n}\n\n/**********************************************************************\n * FPSEGPT::FPSEGPT\n *\n * 
Constructor to make a new FPSEGPT.\n **********************************************************************/\n\nFPSEGPT::FPSEGPT( // constructor\n    int16_t x     // position\n    )\n    : xpos(x) {\n  pred = nullptr;\n  mean_sum = 0;\n  sq_sum = 0;\n  cost = 0;\n  faked = false;\n  terminal = false;\n  fake_count = 0;\n  mid_cuts = 0;\n}\n\n/**********************************************************************\n * FPSEGPT::FPSEGPT\n *\n * Constructor to make a new FPSEGPT.\n **********************************************************************/\n\nFPSEGPT::FPSEGPT(           // constructor\n    int16_t x,              // position\n    bool faking,            // faking this one\n    int16_t offset,         // dist to gap\n    int16_t region_index,   // segment number\n    int16_t pitch,          // proposed pitch\n    int16_t pitch_error,    // allowed tolerance\n    FPSEGPT_LIST *prev_list // previous segment\n    )\n    : fake_count(0), xpos(x), mean_sum(0.0), sq_sum(0.0) {\n  int16_t best_fake;              // on previous\n  FPSEGPT *segpt;                 // segment point\n  int32_t dist;                   // from prev segment\n  double sq_dist;                 // squared distance\n  double mean;                    // mean pitch\n  double total;                   // total dists\n  double factor;                  // cost function\n  FPSEGPT_IT pred_it = prev_list; // for previous segment\n\n  cost = FLT_MAX;\n  pred = nullptr;\n  faked = faking;\n  terminal = false;\n  best_fake = INT16_MAX;\n  mid_cuts = 0;\n  for (pred_it.mark_cycle_pt(); !pred_it.cycled_list(); pred_it.forward()) {\n    segpt = pred_it.data();\n    if (segpt->fake_count < best_fake) {\n      best_fake = segpt->fake_count;\n    }\n    dist = x - segpt->xpos;\n    if (dist >= pitch - pitch_error && dist <= pitch + pitch_error && !segpt->terminal) {\n      total = segpt->mean_sum + dist;\n      sq_dist = dist * dist + segpt->sq_sum + offset * offset;\n      // sum of squarees\n      mean = 
total / region_index;\n      factor = mean - pitch;\n      factor *= factor;\n      factor += sq_dist / (region_index)-mean * mean;\n      if (factor < cost) {\n        cost = factor; // find least cost\n        pred = segpt;  // save path\n        mean_sum = total;\n        sq_sum = sq_dist;\n        fake_count = segpt->fake_count + faked;\n      }\n    }\n  }\n  if (fake_count > best_fake + 1) {\n    pred = nullptr; // fail it\n  }\n}\n\n/**********************************************************************\n * check_pitch_sync\n *\n * Construct the lattice of possible segmentation points and choose the\n * optimal path. Return the optimal path only.\n * The return value is a measure of goodness of the sync.\n **********************************************************************/\n\ndouble check_pitch_sync(   // find segmentation\n    BLOBNBOX_IT *blob_it,  // blobs to do\n    int16_t blob_count,    // no of blobs\n    int16_t pitch,         // pitch estimate\n    int16_t pitch_error,   // tolerance\n    STATS *projection,     // vertical\n    FPSEGPT_LIST *seg_list // output list\n) {\n  int16_t x;          // current coord\n  int16_t min_index;  // blob number\n  int16_t max_index;  // blob number\n  int16_t left_edge;  // of word\n  int16_t right_edge; // of word\n  int16_t right_max;  // max allowed x\n  int16_t min_x;      // in this region\n  int16_t max_x;\n  int16_t region_index;\n  int16_t best_region_index = 0; // for best result\n  int16_t offset;                // dist to legal area\n  int16_t left_best_x;           // edge of good region\n  int16_t right_best_x;          // right edge\n  TBOX min_box;                  // bounding box\n  TBOX max_box;                  // bounding box\n  TBOX next_box;                 // box of next blob\n  FPSEGPT *segpt;                // segment point\n  FPSEGPT_LIST *segpts;          // points in a segment\n  double best_cost;              // best path\n  double mean_sum;               // computes result\n  
FPSEGPT *best_end;             // end of best path\n  BLOBNBOX_IT min_it;            // copy iterator\n  BLOBNBOX_IT max_it;            // copy iterator\n  FPSEGPT_IT segpt_it;           // iterator\n                                 // output segments\n  FPSEGPT_IT outseg_it = seg_list;\n  FPSEGPT_LIST_CLIST lattice; // list of lists\n                              // region iterator\n  FPSEGPT_LIST_C_IT lattice_it = &lattice;\n\n  //      tprintf(\"Computing sync on word of %d blobs with pitch %d\\n\",\n  //              blob_count, pitch);\n  //      if (blob_count==8 && pitch==27)\n  //              projection->print(stdout,true);\n  if (pitch < 3) {\n    pitch = 3; // nothing ludicrous\n  }\n  if ((pitch - 3) / 2 < pitch_error) {\n    pitch_error = (pitch - 3) / 2;\n  }\n  min_it = *blob_it;\n  min_box = box_next(&min_it); // get box\n  //      if (blob_count==8 && pitch==27)\n  //              tprintf(\"1st box at (%d,%d)->(%d,%d)\\n\",\n  //                      min_box.left(),min_box.bottom(),\n  //                      min_box.right(),min_box.top());\n  // left of word\n  left_edge = min_box.left() + pitch_error;\n  for (min_index = 1; min_index < blob_count; min_index++) {\n    min_box = box_next(&min_it);\n    //              if (blob_count==8 && pitch==27)\n    //                      tprintf(\"Box at (%d,%d)->(%d,%d)\\n\",\n    //                              min_box.left(),min_box.bottom(),\n    //                              min_box.right(),min_box.top());\n  }\n  right_edge = min_box.right(); // end of word\n  max_x = left_edge;\n  // min permissible\n  min_x = max_x - pitch + pitch_error * 2 + 1;\n  right_max = right_edge + pitch - pitch_error - 1;\n  segpts = new FPSEGPT_LIST; // list of points\n  segpt_it.set_to_list(segpts);\n  for (x = min_x; x <= max_x; x++) {\n    segpt = new FPSEGPT(x); // make a new one\n                            // put in list\n    segpt_it.add_after_then_move(segpt);\n  }\n  // first segment\n  
lattice_it.add_before_then_move(segpts);\n  min_index = 0;\n  region_index = 1;\n  best_cost = FLT_MAX;\n  best_end = nullptr;\n  min_it = *blob_it;\n  min_box = box_next(&min_it); // first box\n  do {\n    left_best_x = -1;\n    right_best_x = -1;\n    segpts = new FPSEGPT_LIST; // list of points\n    segpt_it.set_to_list(segpts);\n    min_x += pitch - pitch_error; // next limits\n    max_x += pitch + pitch_error;\n    while (min_box.right() < min_x && min_index < blob_count) {\n      min_index++;\n      min_box = box_next(&min_it);\n    }\n    max_it = min_it;\n    max_index = min_index;\n    max_box = min_box;\n    next_box = box_next(&max_it);\n    for (x = min_x; x <= max_x && x <= right_max; x++) {\n      while (x < right_edge && max_index < blob_count && x > max_box.right()) {\n        max_index++;\n        max_box = next_box;\n        next_box = box_next(&max_it);\n      }\n      if (x <= max_box.left() + pitch_error || x >= max_box.right() - pitch_error ||\n          x >= right_edge || (max_index < blob_count - 1 && x >= next_box.left()) ||\n          (x - max_box.left() > pitch * pitsync_joined_edge &&\n           max_box.right() - x > pitch * pitsync_joined_edge)) {\n        //                      || projection->local_min(x))\n        if (x - max_box.left() > 0 && x - max_box.left() <= pitch_error) {\n          // dist to real break\n          offset = x - max_box.left();\n        } else if (max_box.right() - x > 0 && max_box.right() - x <= pitch_error &&\n                   (max_index >= blob_count - 1 || x < next_box.left())) {\n          offset = max_box.right() - x;\n        } else {\n          offset = 0;\n        }\n        //                              offset=pitsync_offset_freecut_fraction*projection->pile_count(x);\n        segpt = new FPSEGPT(x, false, offset, region_index, pitch, pitch_error, lattice_it.data());\n      } else {\n        offset = projection->pile_count(x);\n        segpt = new FPSEGPT(x, true, offset, region_index, pitch, 
pitch_error, lattice_it.data());\n      }\n      if (segpt->previous() != nullptr) {\n        segpt_it.add_after_then_move(segpt);\n        if (x >= right_edge - pitch_error) {\n          segpt->terminal = true; // no more wanted\n          if (segpt->cost_function() < best_cost) {\n            best_cost = segpt->cost_function();\n            // find least\n            best_end = segpt;\n            best_region_index = region_index;\n            left_best_x = x;\n            right_best_x = x;\n          } else if (segpt->cost_function() == best_cost && right_best_x == x - 1) {\n            right_best_x = x;\n          }\n        }\n      } else {\n        delete segpt; // no good\n      }\n    }\n    if (segpts->empty()) {\n      if (best_end != nullptr) {\n        break; // already found one\n      }\n      make_illegal_segment(lattice_it.data(), min_box, min_it, region_index, pitch, pitch_error,\n                           segpts);\n    } else {\n      if (right_best_x > left_best_x + 1) {\n        left_best_x = (left_best_x + right_best_x + 1) / 2;\n        for (segpt_it.mark_cycle_pt();\n             !segpt_it.cycled_list() && segpt_it.data()->position() != left_best_x;\n             segpt_it.forward()) {\n          ;\n        }\n        if (segpt_it.data()->position() == left_best_x) {\n          // middle of region\n          best_end = segpt_it.data();\n        }\n      }\n    }\n    // new segment\n    lattice_it.add_before_then_move(segpts);\n    region_index++;\n  } while (min_x < right_edge);\n  ASSERT_HOST(best_end != nullptr); // must always find some\n\n  for (lattice_it.mark_cycle_pt(); !lattice_it.cycled_list(); lattice_it.forward()) {\n    segpts = lattice_it.data();\n    segpt_it.set_to_list(segpts);\n    //              if (blob_count==8 && pitch==27)\n    //              {\n    //                      for\n    //                      (segpt_it.mark_cycle_pt();!segpt_it.cycled_list();segpt_it.forward())\n    //                      {\n    //      
                        segpt=segpt_it.data();\n    //                              tprintf(\"At %d, (%x) cost=%g, m=%g, sq=%g,\n    //                              pred=%x\\n\",\n    //                                      segpt->position(),segpt,segpt->cost_function(),\n    //                                      segpt->sum(),segpt->squares(),segpt->previous());\n    //                      }\n    //                      tprintf(\"\\n\");\n    //              }\n    for (segpt_it.mark_cycle_pt(); !segpt_it.cycled_list() && segpt_it.data() != best_end;\n         segpt_it.forward()) {\n      ;\n    }\n    if (segpt_it.data() == best_end) {\n      // save good one\n      segpt = segpt_it.extract();\n      outseg_it.add_before_then_move(segpt);\n      best_end = segpt->previous();\n    }\n  }\n  ASSERT_HOST(best_end == nullptr);\n  ASSERT_HOST(!outseg_it.empty());\n  outseg_it.move_to_last();\n  mean_sum = outseg_it.data()->sum();\n  mean_sum = mean_sum * mean_sum / best_region_index;\n  if (outseg_it.data()->squares() - mean_sum < 0) {\n    tprintf(\"Impossible sqsum=%g, mean=%g, total=%d\\n\", outseg_it.data()->squares(),\n            outseg_it.data()->sum(), best_region_index);\n  }\n  lattice.deep_clear(); // shift the lot\n  return outseg_it.data()->squares() - mean_sum;\n}\n\n/**********************************************************************\n * make_illegal_segment\n *\n * Make a fake set of chop points due to having no legal places.\n **********************************************************************/\n\nvoid make_illegal_segment(   // find segmentation\n    FPSEGPT_LIST *prev_list, // previous segments\n    TBOX blob_box,           // bounding box\n    BLOBNBOX_IT blob_it,     // iterator\n    int16_t region_index,    // number of segment\n    int16_t pitch,           // pitch estimate\n    int16_t pitch_error,     // tolerance\n    FPSEGPT_LIST *seg_list   // output list\n) {\n  int16_t x;         // current coord\n  int16_t min_x = 0; // in this 
region\n  int16_t max_x = 0;\n  int16_t offset;                 // dist to edge\n  FPSEGPT *segpt;                 // segment point\n  FPSEGPT *prevpt;                // previous point\n  float best_cost;                // best path\n  FPSEGPT_IT segpt_it = seg_list; // iterator\n                                  // previous points\n  FPSEGPT_IT prevpt_it = prev_list;\n\n  best_cost = FLT_MAX;\n  for (prevpt_it.mark_cycle_pt(); !prevpt_it.cycled_list(); prevpt_it.forward()) {\n    prevpt = prevpt_it.data();\n    if (prevpt->cost_function() < best_cost) {\n      // find least\n      best_cost = prevpt->cost_function();\n      min_x = prevpt->position();\n      max_x = min_x; // limits on coords\n    } else if (prevpt->cost_function() == best_cost) {\n      max_x = prevpt->position();\n    }\n  }\n  min_x += pitch - pitch_error;\n  max_x += pitch + pitch_error;\n  for (x = min_x; x <= max_x; x++) {\n    while (x > blob_box.right()) {\n      blob_box = box_next(&blob_it);\n    }\n    offset = x - blob_box.left();\n    if (blob_box.right() - x < offset) {\n      offset = blob_box.right() - x;\n    }\n    segpt = new FPSEGPT(x, false, offset, region_index, pitch, pitch_error, prev_list);\n    if (segpt->previous() != nullptr) {\n      ASSERT_HOST(offset >= 0);\n      fprintf(stderr, \"made fake at %d\\n\", x);\n      // make one up\n      segpt_it.add_after_then_move(segpt);\n      segpt->faked = true;\n      segpt->fake_count++;\n    } else {\n      delete segpt;\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/pitsync1.h",
    "content": "/**********************************************************************\n * File:        pitsync1.h  (Formerly pitsync.h)\n * Description: Code to find the optimum fixed pitch segmentation of some blobs.\n * Author:    Ray Smith\n * Created:   Thu Nov 19 11:48:05 GMT 1992\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef PITSYNC1_H\n#define PITSYNC1_H\n\n#include \"blobbox.h\"\n#include \"clst.h\"\n#include \"elst.h\"\n#include \"params.h\"\n#include \"pithsync.h\"\n#include \"statistc.h\"\n\nnamespace tesseract {\n\nclass FPSEGPT_LIST;\n\nclass FPSEGPT : public ELIST<FPSEGPT>::LINK {\npublic:\n  FPSEGPT() = default;\n  FPSEGPT(                      // constructor\n      int16_t x);               // position\n  FPSEGPT(                      // constructor\n      int16_t x,                // position\n      bool faking,              // faking this one\n      int16_t offset,           // extra cost dist\n      int16_t region_index,     // segment number\n      int16_t pitch,            // proposed pitch\n      int16_t pitch_error,      // allowed tolerance\n      FPSEGPT_LIST *prev_list); // previous segment\n  FPSEGPT(FPCUTPT *cutpt);      // build from new type\n\n  int32_t position() { // access func\n    return xpos;\n  }\n  double cost_function() {\n    return cost;\n  }\n  double squares() {\n    return 
sq_sum;\n  }\n  double sum() {\n    return mean_sum;\n  }\n  FPSEGPT *previous() {\n    return pred;\n  }\n  int16_t cheap_cuts() const { // no of cheap cuts\n    return mid_cuts;\n  }\n\n  bool faked;         // faked split point\n  bool terminal;      // successful end\n  int16_t fake_count; // total fakes to here\n\nprivate:\n  int16_t mid_cuts; // no of cheap cuts\n  int32_t xpos;     // location\n  FPSEGPT *pred;    // optimal previous\n  double mean_sum;  // mean so far\n  double sq_sum;    // summed distsances\n  double cost;      // cost function\n};\n\nELISTIZEH(FPSEGPT)\nCLISTIZEH(FPSEGPT_LIST)\nextern INT_VAR_H(pitsync_linear_version);\nextern double_VAR_H(pitsync_joined_edge);\nextern double_VAR_H(pitsync_offset_freecut_fraction);\ndouble check_pitch_sync(   // find segmentation\n    BLOBNBOX_IT *blob_it,  // blobs to do\n    int16_t blob_count,    // no of blobs\n    int16_t pitch,         // pitch estimate\n    int16_t pitch_error,   // tolerance\n    STATS *projection,     // vertical\n    FPSEGPT_LIST *seg_list // output list\n);\nvoid make_illegal_segment(   // find segmentation\n    FPSEGPT_LIST *prev_list, // previous segments\n    TBOX blob_box,           // bounding box\n    BLOBNBOX_IT blob_it,     // iterator\n    int16_t region_index,    // number of segment\n    int16_t pitch,           // pitch estimate\n    int16_t pitch_error,     // tolerance\n    FPSEGPT_LIST *seg_list   // output list\n);\nint16_t vertical_torow_projection( // project whole row\n    TO_ROW *row,                   // row to do\n    STATS *projection              // output\n);\nvoid vertical_cblob_projection( // project outlines\n    C_BLOB *blob,               // blob to project\n    STATS *stats                // output\n);\nvoid vertical_coutline_projection( // project outlines\n    C_OUTLINE *outline,            // outline to project\n    STATS *stats                   // output\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/scanedg.cpp",
    "content": "/**********************************************************************\n * File:        scanedg.cpp  (Formerly scanedge.c)\n * Description: Raster scanning crack based edge extractor.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"scanedg.h\"\n\n#include \"crakedge.h\"\n#include \"edgloop.h\"\n#include \"pdblock.h\"\n\n#include <allheaders.h>\n\n#include <memory> // std::unique_ptr\n\nnamespace tesseract {\n\n#define WHITE_PIX 1 /*thresholded colours */\n#define BLACK_PIX 0\n// Flips between WHITE_PIX and BLACK_PIX.\n#define FLIP_COLOUR(pix) (1 - (pix))\n\nstruct CrackPos {\n  CRACKEDGE **free_cracks; // Freelist for fast allocation.\n  int x;                   // Position of new edge.\n  int y;\n};\n\nstatic void free_crackedges(CRACKEDGE *start);\n\nstatic void join_edges(CRACKEDGE *edge1, CRACKEDGE *edge2, CRACKEDGE **free_cracks,\n                       C_OUTLINE_IT *outline_it);\n\nstatic void line_edges(TDimension x, TDimension y, TDimension xext, uint8_t uppercolour, uint8_t *bwpos,\n                       CRACKEDGE **prevline, CRACKEDGE **free_cracks, C_OUTLINE_IT *outline_it);\n\nstatic void make_margins(PDBLK *block, BLOCK_LINE_IT *line_it, uint8_t *pixels, uint8_t margin,\n                         TDimension left, TDimension right, TDimension y);\n\nstatic 
CRACKEDGE *h_edge(int sign, CRACKEDGE *join, CrackPos *pos);\nstatic CRACKEDGE *v_edge(int sign, CRACKEDGE *join, CrackPos *pos);\n\n/**********************************************************************\n * block_edges\n *\n * Extract edges from a PDBLK.\n **********************************************************************/\n\nvoid block_edges(Image t_pix,   // thresholded image\n                 PDBLK *block, // block in image\n                 C_OUTLINE_IT *outline_it) {\n  ICOORD bleft; // bounding box\n  ICOORD tright;\n  BLOCK_LINE_IT line_it = block; // line iterator\n\n  int width = pixGetWidth(t_pix);\n  int height = pixGetHeight(t_pix);\n  int wpl = pixGetWpl(t_pix);\n  // lines in progress\n  std::unique_ptr<CRACKEDGE *[]> ptrline(new CRACKEDGE *[width + 1]);\n  CRACKEDGE *free_cracks = nullptr;\n\n  block->bounding_box(bleft, tright); // block box\n  ASSERT_HOST(tright.x() <= width);\n  ASSERT_HOST(tright.y() <= height);\n  int block_width = tright.x() - bleft.x();\n  for (int x = block_width; x >= 0; x--) {\n    ptrline[x] = nullptr; //  no lines in progress\n  }\n\n  std::unique_ptr<uint8_t[]> bwline(new uint8_t[width]);\n\n  const uint8_t margin = WHITE_PIX;\n\n  for (int y = tright.y() - 1; y >= bleft.y() - 1; y--) {\n    if (y >= bleft.y() && y < tright.y()) {\n      // Get the binary pixels from the image.\n      l_uint32 *line = pixGetData(t_pix) + wpl * (height - 1 - y);\n      for (int x = 0; x < block_width; ++x) {\n        bwline[x] = GET_DATA_BIT(line, x + bleft.x()) ^ 1;\n      }\n      make_margins(block, &line_it, bwline.get(), margin, bleft.x(), tright.x(), y);\n    } else {\n      memset(bwline.get(), margin, block_width * sizeof(bwline[0]));\n    }\n    line_edges(bleft.x(), y, block_width, margin, bwline.get(), ptrline.get(), &free_cracks,\n               outline_it);\n  }\n\n  free_crackedges(free_cracks); // really free them\n}\n\n/**********************************************************************\n * make_margins\n *\n * 
Get an image line and set to margin non-text pixels.\n **********************************************************************/\n\nstatic void make_margins(   // get a line\n    PDBLK *block,           // block in image\n    BLOCK_LINE_IT *line_it, // for old style\n    uint8_t *pixels,        // pixels to strip\n    uint8_t margin,         // white-out pixel\n    TDimension left,        // block edges\n    TDimension right,\n    TDimension y            // line coord\n) {\n  ICOORDELT_IT seg_it;\n\n  if (block->poly_block() != nullptr) {\n    std::unique_ptr<PB_LINE_IT> lines(new PB_LINE_IT(block->poly_block()));\n    const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments(lines->get_line(y));\n    if (!segments->empty()) {\n      seg_it.set_to_list(segments.get());\n      seg_it.mark_cycle_pt();\n      auto start = seg_it.data()->x();\n      auto xext = seg_it.data()->y();\n      for (auto xindex = left; xindex < right; xindex++) {\n        if (xindex >= start && !seg_it.cycled_list()) {\n          xindex = start + xext - 1;\n          seg_it.forward();\n          start = seg_it.data()->x();\n          xext = seg_it.data()->y();\n        } else {\n          pixels[xindex - left] = margin;\n        }\n      }\n    } else {\n      for (auto xindex = left; xindex < right; xindex++) {\n        pixels[xindex - left] = margin;\n      }\n    }\n  } else {\n    TDimension xext;  // of segment\n    auto start = line_it->get_line(y, xext);\n    for (auto xindex = left; xindex < start; xindex++) {\n      pixels[xindex - left] = margin;\n    }\n    for (auto xindex = start + xext; xindex < right; xindex++) {\n      pixels[xindex - left] = margin;\n    }\n  }\n}\n\n/**********************************************************************\n * line_edges\n *\n * Scan a line for edges and update the edges in progress.\n * When edges close into loops, send them for approximation.\n **********************************************************************/\n\nstatic void 
line_edges(TDimension x,         // coord of line start\n                       TDimension y,         // coord of line\n                       TDimension xext,      // width of line\n                       uint8_t uppercolour,  // start of prev line\n                       uint8_t *bwpos,       // thresholded line\n                       CRACKEDGE **prevline, // edges in progress\n                       CRACKEDGE **free_cracks, C_OUTLINE_IT *outline_it) {\n  CrackPos pos = {free_cracks, x, y};\n  int xmax;              // max x coord\n  int prevcolour;        // of previous pixel\n  CRACKEDGE *current;    // current h edge\n  CRACKEDGE *newcurrent; // new h edge\n\n  xmax = x + xext;          // max allowable coord\n  prevcolour = uppercolour; // forced plain margin\n  current = nullptr;        // nothing yet\n\n  // do each pixel\n  for (; pos.x < xmax; pos.x++, prevline++) {\n    const int colour = *bwpos++; // current pixel\n    if (*prevline != nullptr) {\n      // changed above\n      // change colour\n      uppercolour = FLIP_COLOUR(uppercolour);\n      if (colour == prevcolour) {\n        if (colour == uppercolour) {\n          // finish a line\n          join_edges(current, *prevline, free_cracks, outline_it);\n          current = nullptr; // no edge now\n        } else {\n          // new horiz edge\n          current = h_edge(uppercolour - colour, *prevline, &pos);\n        }\n        *prevline = nullptr; // no change this time\n      } else {\n        if (colour == uppercolour) {\n          *prevline = v_edge(colour - prevcolour, *prevline, &pos);\n        // 8 vs 4 connection\n        } else if (colour == WHITE_PIX) {\n          join_edges(current, *prevline, free_cracks, outline_it);\n          current = h_edge(uppercolour - colour, nullptr, &pos);\n          *prevline = v_edge(colour - prevcolour, current, &pos);\n        } else {\n          newcurrent = h_edge(uppercolour - colour, *prevline, &pos);\n          *prevline = v_edge(colour - prevcolour, 
current, &pos);\n          current = newcurrent; // right going h edge\n        }\n        prevcolour = colour; // remember new colour\n      }\n    } else {\n      if (colour != prevcolour) {\n        *prevline = current = v_edge(colour - prevcolour, current, &pos);\n        prevcolour = colour;\n      }\n      if (colour != uppercolour) {\n        current = h_edge(uppercolour - colour, current, &pos);\n      } else {\n        current = nullptr; // no edge now\n      }\n    }\n  }\n  if (current != nullptr) {\n    // out of block\n    if (*prevline != nullptr) { // got one to join to?\n      join_edges(current, *prevline, free_cracks, outline_it);\n      *prevline = nullptr; // tidy now\n    } else {\n      // fake vertical\n      *prevline = v_edge(FLIP_COLOUR(prevcolour) - prevcolour, current, &pos);\n    }\n  } else if (*prevline != nullptr) {\n    // continue fake\n    *prevline = v_edge(FLIP_COLOUR(prevcolour) - prevcolour, *prevline, &pos);\n  }\n}\n\n/**********************************************************************\n * h_edge\n *\n * Create a new horizontal CRACKEDGE and join it to the given edge.\n **********************************************************************/\n\nstatic CRACKEDGE *h_edge(int sign,        // sign of edge\n                         CRACKEDGE *join, // edge to join to\n                         CrackPos *pos) {\n  CRACKEDGE *newpt; // return value\n\n  if (*pos->free_cracks != nullptr) {\n    newpt = *pos->free_cracks;\n    *pos->free_cracks = newpt->next; // get one fast\n  } else {\n    newpt = new CRACKEDGE;\n  }\n  newpt->pos.set_y(pos->y + 1); // coords of pt\n  newpt->stepy = 0;             // edge is horizontal\n\n  if (sign > 0) {\n    newpt->pos.set_x(pos->x + 1); // start location\n    newpt->stepx = -1;\n    newpt->stepdir = 0;\n  } else {\n    newpt->pos.set_x(pos->x); // start location\n    newpt->stepx = 1;\n    newpt->stepdir = 2;\n  }\n\n  if (join == nullptr) {\n    newpt->next = newpt; // ptrs to other ends\n    
newpt->prev = newpt;\n  } else {\n    if (newpt->pos.x() + newpt->stepx == join->pos.x() && newpt->pos.y() == join->pos.y()) {\n      newpt->prev = join->prev; // update other ends\n      newpt->prev->next = newpt;\n      newpt->next = join; // join up\n      join->prev = newpt;\n    } else {\n      newpt->next = join->next; // update other ends\n      newpt->next->prev = newpt;\n      newpt->prev = join; // join up\n      join->next = newpt;\n    }\n  }\n  return newpt;\n}\n\n/**********************************************************************\n * v_edge\n *\n * Create a new vertical CRACKEDGE and join it to the given edge.\n **********************************************************************/\n\nstatic CRACKEDGE *v_edge(int sign, // sign of edge\n                         CRACKEDGE *join, CrackPos *pos) {\n  CRACKEDGE *newpt; // return value\n\n  if (*pos->free_cracks != nullptr) {\n    newpt = *pos->free_cracks;\n    *pos->free_cracks = newpt->next; // get one fast\n  } else {\n    newpt = new CRACKEDGE;\n  }\n  newpt->pos.set_x(pos->x); // coords of pt\n  newpt->stepx = 0;         // edge is vertical\n\n  if (sign > 0) {\n    newpt->pos.set_y(pos->y); // start location\n    newpt->stepy = 1;\n    newpt->stepdir = 3;\n  } else {\n    newpt->pos.set_y(pos->y + 1); // start location\n    newpt->stepy = -1;\n    newpt->stepdir = 1;\n  }\n\n  if (join == nullptr) {\n    newpt->next = newpt; // ptrs to other ends\n    newpt->prev = newpt;\n  } else {\n    if (newpt->pos.x() == join->pos.x() && newpt->pos.y() + newpt->stepy == join->pos.y()) {\n      newpt->prev = join->prev; // update other ends\n      newpt->prev->next = newpt;\n      newpt->next = join; // join up\n      join->prev = newpt;\n    } else {\n      newpt->next = join->next; // update other ends\n      newpt->next->prev = newpt;\n      newpt->prev = join; // join up\n      join->next = newpt;\n    }\n  }\n  return 
newpt;\n}\n\n/**********************************************************************\n * join_edges\n *\n * Join 2 edges together. Send the outline for approximation when a\n * closed loop is formed.\n **********************************************************************/\n\nstatic void join_edges(CRACKEDGE *edge1, // edges to join\n                       CRACKEDGE *edge2, // no specific order\n                       CRACKEDGE **free_cracks, C_OUTLINE_IT *outline_it) {\n  if (edge1->pos.x() + edge1->stepx != edge2->pos.x() ||\n      edge1->pos.y() + edge1->stepy != edge2->pos.y()) {\n    CRACKEDGE *tempedge = edge1;\n    edge1 = edge2; // swap around\n    edge2 = tempedge;\n  }\n\n  if (edge1->next == edge2) {\n    // already closed\n    complete_edge(edge1, outline_it);\n    // attach freelist to end\n    edge1->prev->next = *free_cracks;\n    *free_cracks = edge1; // and free list\n  } else {\n    // update opposite ends\n    edge2->prev->next = edge1->next;\n    edge1->next->prev = edge2->prev;\n    edge1->next = edge2; // make joins\n    edge2->prev = edge1;\n  }\n}\n\n/**********************************************************************\n * free_crackedges\n *\n * Really free the CRACKEDGEs by giving them back to delete.\n **********************************************************************/\n\nstatic void free_crackedges(CRACKEDGE *start) {\n  CRACKEDGE *current; // current edge to free\n  CRACKEDGE *next;    // next one to free\n\n  for (current = start; current != nullptr; current = next) {\n    next = current->next;\n    delete current; // delete them all\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/scanedg.h",
    "content": "/**********************************************************************\n * File:        scanedg.h  (Formerly scanedge.h)\n * Description: Raster scanning crack based edge extractor.\n * Author:      Ray Smith\n *\n * (C) Copyright 1991, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef SCANEDG_H\n#define SCANEDG_H\n\n#include \"image.h\"\n#include \"params.h\"\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass C_OUTLINE_IT;\nclass PDBLK;\n\nvoid block_edges(Image t_image, // thresholded image\n                 PDBLK *block, // block in image\n                 C_OUTLINE_IT *outline_it);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/sortflts.cpp",
    "content": "/**********************************************************************\n * File:        sortflts.cpp  (Formerly sfloats.c)\n * Description: Code to maintain a sorted list of floats.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"sortflts.h\"\n\nnamespace tesseract {\n\n/**\n * @name SORTED_FLOATS::add\n *\n * Add a new entry to the sorted list of floats.\n */\nvoid SORTED_FLOATS::add( // add new entry\n    float value, int32_t key) {\n  auto *new_float = new SORTED_FLOAT(value, key);\n\n  if (list.empty()) {\n    it.add_after_stay_put(new_float);\n  } else {\n    it.move_to_first();\n    while (!it.at_last() && it.data()->entry < value) {\n      it.forward();\n    }\n    if (it.data()->entry < value) {\n      it.add_after_stay_put(new_float);\n    } else {\n      it.add_before_stay_put(new_float);\n    }\n  }\n}\n\n/**\n * @name SORTED_FLOATS::remove\n *\n * Remove an entry from the sorted list of floats.\n */\n\nvoid SORTED_FLOATS::remove( // remove the entry\n    int32_t key) {\n  if (!list.empty()) {\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      if (it.data()->address == key) {\n        delete it.extract();\n        return;\n      }\n    }\n  }\n}\n\n/**\n * @name SORTED_FLOATS::operator[]\n *\n * Return the floating point value of the given 
index into the list.\n */\n\nfloat SORTED_FLOATS::operator[]( // get an entry\n    int32_t index                // to list\n) {\n  it.move_to_first();\n  return it.data_relative(index)->entry;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/sortflts.h",
    "content": "/**********************************************************************\n * File:        sortflts.h  (Formerly sfloats.h)\n * Description: Code to maintain a sorted list of floats.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef SORTFLTS_H\n#define SORTFLTS_H\n\n#include \"elst.h\"\n\nnamespace tesseract {\n\nclass SORTED_FLOAT : public ELIST<SORTED_FLOAT>::LINK {\n  friend class SORTED_FLOATS;\n\npublic:\n  SORTED_FLOAT() = default;\n  SORTED_FLOAT(      // create one\n      float value,   // value of entry\n      int32_t key) { // reference\n    entry = value;\n    address = key;\n  }\n\nprivate:\n  float entry;     // value of float\n  int32_t address; // key\n};\n\nELISTIZEH(SORTED_FLOAT)\nclass SORTED_FLOATS {\npublic:\n  /** empty constructor */\n  SORTED_FLOATS() {\n    it.set_to_list(&list);\n  }\n  /**\n   * add sample\n   * @param value sample float\n   * @param key retrieval key\n   */\n  void add(float value, int32_t key);\n  /**\n   * delete sample\n   * @param key key to delete\n   */\n  void remove(int32_t key);\n  /**\n   * index to list\n   * @param index item to get\n   */\n  float operator[](int32_t index);\n\nprivate:\n  SORTED_FLOAT_LIST list; // list of floats\n  SORTED_FLOAT_IT it;     // iterator built-in\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/strokewidth.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        strokewidth.cpp\n// Description: Subclass of BBGrid to find uniformity of strokewidth.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"strokewidth.h\"\n\n#include <algorithm>\n#include <cmath>\n\n#include \"blobbox.h\"\n#include \"colpartition.h\"\n#include \"colpartitiongrid.h\"\n#include \"helpers.h\" // for IntCastRounded\n#include \"imagefind.h\"\n#include \"linlsq.h\"\n#include \"statistc.h\"\n#include \"tabfind.h\"\n#include \"textlineprojection.h\"\n#include \"tordmain.h\" // For SetBlobStrokeWidth.\n\nnamespace tesseract {\n\n#ifndef GRAPHICS_DISABLED\nstatic INT_VAR(textord_tabfind_show_strokewidths, 0, \"Show stroke widths (ScrollView)\");\n#else\nstatic INT_VAR(textord_tabfind_show_strokewidths, 0, \"Show stroke widths\");\n#endif\nstatic BOOL_VAR(textord_tabfind_only_strokewidths, false, \"Only run stroke widths\");\n\n/** Allowed proportional change in stroke width to be the same font. 
*/\nconst double kStrokeWidthFractionTolerance = 0.125;\n/**\n * Allowed constant change in stroke width to be the same font.\n * Really 1.5 pixels.\n */\nconst double kStrokeWidthTolerance = 1.5;\n// Same but for CJK we are a bit more generous.\nconst double kStrokeWidthFractionCJK = 0.25;\nconst double kStrokeWidthCJK = 2.0;\n// Radius in grid cells of search for broken CJK. Doesn't need to be very\n// large as the grid size should be about the size of a character anyway.\nconst int kCJKRadius = 2;\n// Max distance fraction of size to join close but broken CJK characters.\nconst double kCJKBrokenDistanceFraction = 0.25;\n// Max number of components in a broken CJK character.\nconst int kCJKMaxComponents = 8;\n// Max aspect ratio of CJK broken characters when put back together.\nconst double kCJKAspectRatio = 1.25;\n// Max increase in aspect ratio of CJK broken characters when merged.\nconst double kCJKAspectRatioIncrease = 1.0625;\n// Max multiple of the grid size that will be used in computing median CJKsize.\nconst int kMaxCJKSizeRatio = 5;\n// Min fraction of blobs broken CJK to iterate and run it again.\nconst double kBrokenCJKIterationFraction = 0.125;\n// Multiple of gridsize as x-padding for a search box for diacritic base\n// characters.\nconst double kDiacriticXPadRatio = 7.0;\n// Multiple of gridsize as y-padding for a search box for diacritic base\n// characters.\nconst double kDiacriticYPadRatio = 1.75;\n// Min multiple of diacritic height that a neighbour must be to be a\n// convincing base character.\nconst double kMinDiacriticSizeRatio = 1.0625;\n// Max multiple of a textline's median height as a threshold for the sum of\n// a diacritic's farthest x and y distances (gap + size).\nconst double kMaxDiacriticDistanceRatio = 1.25;\n// Max x-gap between a diacritic and its base char as a fraction of the height\n// of the base char (allowing other blobs to fill the gap.)\nconst double kMaxDiacriticGapToBaseCharHeight = 1.0;\n// Ratio between longest side 
of a line and longest side of a character.\n// (neighbor_min > blob_min * kLineTrapShortest &&\n//  neighbor_max < blob_max / kLineTrapLongest)\n// => neighbor is a grapheme and blob is a line.\nconst int kLineTrapLongest = 4;\n// Ratio between shortest side of a line and shortest side of a character.\nconst int kLineTrapShortest = 2;\n// Max aspect ratio of the total box before CountNeighbourGaps\n// decides immediately based on the aspect ratio.\nconst int kMostlyOneDirRatio = 3;\n// Aspect ratio for a blob to be considered as line residue.\nconst double kLineResidueAspectRatio = 8.0;\n// Padding ratio for line residue search box.\nconst int kLineResiduePadRatio = 3;\n// Min multiple of neighbour size for a line residue to be genuine.\nconst double kLineResidueSizeRatio = 1.75;\n// Aspect ratio filter for OSD.\nconst float kSizeRatioToReject = 2.0;\n// Expansion factor for search box for good neighbours.\nconst double kNeighbourSearchFactor = 2.5;\n// Factor of increase of overlap when adding diacritics to make an image noisy.\nconst double kNoiseOverlapGrowthFactor = 4.0;\n// Fraction of the image size to add overlap when adding diacritics for an\n// image to qualify as noisy.\nconst double kNoiseOverlapAreaFactor = 1.0 / 512;\n\nStrokeWidth::StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)\n    : BlobGrid(gridsize, bleft, tright)\n    , nontext_map_(nullptr)\n    , projection_(nullptr)\n    , denorm_(nullptr)\n    , grid_box_(bleft, tright)\n    , rerotation_(1.0f, 0.0f) {\n}\n\nStrokeWidth::~StrokeWidth() {\n#ifndef GRAPHICS_DISABLED\n  if (widths_win_ != nullptr) {\n    widths_win_->AwaitEvent(SVET_DESTROY);\n    if (textord_tabfind_only_strokewidths) {\n      exit(0);\n    }\n    delete widths_win_;\n  }\n  delete leaders_win_;\n  delete initial_widths_win_;\n  delete chains_win_;\n  delete textlines_win_;\n  delete smoothed_win_;\n  delete diacritics_win_;\n#endif\n}\n\n// Sets the neighbours member of the medium-sized blobs in the 
block.
// Searches on 4 sides of each blob for similar-sized, similar-strokewidth
// blobs and sets pointers to the good neighbours.
void StrokeWidth::SetNeighboursOnMediumBlobs(TO_BLOCK *block) {
  // Run a preliminary strokewidth neighbour detection on the medium blobs.
  InsertBlobList(&block->blobs);
  BLOBNBOX_IT blob_it(&block->blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    SetNeighbours(false, false, blob_it.data());
  }
  // Empty the grid again; the neighbour links written by SetNeighbours()
  // appear to live in the blobs themselves, so they survive the Clear().
  Clear();
}

// Sets the neighbour/textline writing direction members of the medium
// and large blobs with optional repair of broken CJK characters first.
// Repair of broken CJK is needed here because broken CJK characters
// can fool the textline direction detection algorithm.
void StrokeWidth::FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge,
                                                       TO_BLOCK *input_block) {
  // Setup the grid with the remaining (non-noise) blobs.
  InsertBlobs(input_block);
  // Repair broken CJK characters if needed. FixBrokenCJK() returns true when
  // enough merges happened that another pass may improve the size estimate,
  // so iterate until it reports no further progress.
  while (cjk_merge && FixBrokenCJK(input_block)) {
  }
  // Grade blobs by inspection of neighbours.
  FindTextlineFlowDirection(pageseg_mode, false);
  // Clear the grid ready for rotation or leader finding.
  Clear();
}

// Helper to collect and count horizontal and vertical blobs from a list.
// Increments *num_vertical_blobs / *num_horizontal_blobs for every uniquely
// vertical / horizontal blob, and additionally saves blobs with acceptable
// aspect ratio into the matching output CLIST (nondescript blobs go to
// nondescript_blobs when acceptable, but are never counted).
static void CollectHorizVertBlobs(BLOBNBOX_LIST *input_blobs, int *num_vertical_blobs,
                                  int *num_horizontal_blobs, BLOBNBOX_CLIST *vertical_blobs,
                                  BLOBNBOX_CLIST *horizontal_blobs,
                                  BLOBNBOX_CLIST *nondescript_blobs) {
  BLOBNBOX_C_IT v_it(vertical_blobs);
  BLOBNBOX_C_IT h_it(horizontal_blobs);
  BLOBNBOX_C_IT n_it(nondescript_blobs);
  BLOBNBOX_IT blob_it(input_blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX *blob = blob_it.data();
    const TBOX &box = blob->bounding_box();
    // NOTE(review): assumes box.width() > 0; a zero-width box would divide
    // by zero here — confirm upstream guarantees non-empty boxes.
    float y_x = static_cast<float>(box.height()) / box.width();
    float x_y = 1.0f / y_x;
    // Select a >= 1.0 ratio
    float ratio = x_y > y_x ? x_y : y_x;
    // If the aspect ratio is small and we want them for osd, save the blob.
    bool ok_blob = ratio <= kSizeRatioToReject;
    if (blob->UniquelyVertical()) {
      ++*num_vertical_blobs;
      if (ok_blob) {
        v_it.add_after_then_move(blob);
      }
    } else if (blob->UniquelyHorizontal()) {
      ++*num_horizontal_blobs;
      if (ok_blob) {
        h_it.add_after_then_move(blob);
      }
    } else if (ok_blob) {
      n_it.add_after_then_move(blob);
    }
  }
}

// Types all the blobs as vertical or horizontal text or unknown and
// returns true if the majority are vertical.
// If the blobs are rotated, it is necessary to call CorrectForRotation
// after rotating everything, otherwise the work done here will be enough.
// If osd_blobs is not null, a list of blobs from the dominant textline
// direction are returned for use in orientation and script detection.
bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block,
                                            BLOBNBOX_CLIST *osd_blobs) {
  int vertical_boxes = 0;
  int horizontal_boxes = 0;
  // Count vertical normal and large blobs. Both calls accumulate into the
  // same counters and lists, so the totals cover blobs + large_blobs.
  BLOBNBOX_CLIST vertical_blobs;
  BLOBNBOX_CLIST horizontal_blobs;
  BLOBNBOX_CLIST nondescript_blobs;
  CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes, &vertical_blobs,
                        &horizontal_blobs, &nondescript_blobs);
  CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes, &vertical_blobs,
                        &horizontal_blobs, &nondescript_blobs);
  if (textord_debug_tabfind) {
    tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n", horizontal_boxes,
            vertical_boxes, horizontal_blobs.length(), vertical_blobs.length(),
            nondescript_blobs.length());
  }
  if (osd_blobs != nullptr && vertical_boxes == 0 && horizontal_boxes == 0) {
    // Only nondescript blobs available, so return those.
    BLOBNBOX_C_IT osd_it(osd_blobs);
    osd_it.add_list_after(&nondescript_blobs);
    return false;
  }
  // Vertical wins when it reaches the requested fraction of all decided boxes.
  int min_vert_boxes =
      static_cast<int>((vertical_boxes + horizontal_boxes) * find_vertical_text_ratio);
  if (vertical_boxes >= min_vert_boxes) {
    if (osd_blobs != nullptr) {
      BLOBNBOX_C_IT osd_it(osd_blobs);
      osd_it.add_list_after(&vertical_blobs);
    }
    return true;
  } else {
    if (osd_blobs != nullptr) {
      BLOBNBOX_C_IT osd_it(osd_blobs);
      osd_it.add_list_after(&horizontal_blobs);
    }
    return false;
  }
}

// Corrects the data structures for the given rotation.
// Re-initializes this grid to match part_grid's geometry and records the
// inverse rotation (conjugate of the given rotation) in rerotation_.
void StrokeWidth::CorrectForRotation(const FCOORD &rotation, ColPartitionGrid *part_grid) {
  Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
  grid_box_ = TBOX(bleft(), tright());
  rerotation_.set_x(rotation.x());
  rerotation_.set_y(-rotation.y());
}

// Finds leader partitions and inserts them into the given part_grid.
void StrokeWidth::FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid) {
  Clear();
  // Find and isolate leaders in the noise list.
  ColPartition_LIST leader_parts;
  FindLeadersAndMarkNoise(block, &leader_parts);
  // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
  InsertBlobList(&block->blobs);
  // Mark blobs that have leader neighbours.
  // NOTE(review): extract() presumably removes the current element, so the
  // !it.empty() loop drains leader_parts completely — confirm iterator
  // semantics against the list implementation.
  for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
    ColPartition *part = it.extract();
    part->ClaimBoxes();
    MarkLeaderNeighbours(part, LR_LEFT);
    MarkLeaderNeighbours(part, LR_RIGHT);
    part_grid->InsertBBox(true, true, part);
  }
}

// Finds and marks noise those blobs that look like bits of vertical lines
// that would otherwise screw up layout 
analysis.\nvoid StrokeWidth::RemoveLineResidue(ColPartition_LIST *big_part_list) {\n  BlobGridSearch gsearch(this);\n  BLOBNBOX *bbox;\n  // For every vertical line-like bbox in the grid, search its neighbours\n  // to find the tallest, and if the original box is taller by sufficient\n  // margin, then call it line residue and delete it.\n  gsearch.StartFullSearch();\n  while ((bbox = gsearch.NextFullSearch()) != nullptr) {\n    TBOX box = bbox->bounding_box();\n    if (box.height() < box.width() * kLineResidueAspectRatio) {\n      continue;\n    }\n    // Set up a rectangle search around the blob to find the size of its\n    // neighbours.\n    int padding = box.height() * kLineResiduePadRatio;\n    TBOX search_box = box;\n    search_box.pad(padding, padding);\n    bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());\n    // Find the largest object in the search box not equal to bbox.\n    BlobGridSearch rsearch(this);\n    int max_height = 0;\n    BLOBNBOX *n;\n    rsearch.StartRectSearch(search_box);\n    while ((n = rsearch.NextRectSearch()) != nullptr) {\n      if (n == bbox) {\n        continue;\n      }\n      TBOX nbox = n->bounding_box();\n      if (nbox.height() > max_height) {\n        max_height = nbox.height();\n      }\n    }\n    if (debug) {\n      tprintf(\"Max neighbour size=%d for candidate line box at:\", max_height);\n      box.print();\n    }\n    if (max_height * kLineResidueSizeRatio < box.height()) {\n#ifndef GRAPHICS_DISABLED\n      if (leaders_win_ != nullptr) {\n        // We are debugging, so display deleted in pink blobs in the same\n        // window that we use to display leader detection.\n        leaders_win_->Pen(ScrollView::PINK);\n        leaders_win_->Rectangle(box.left(), box.bottom(), box.right(), box.top());\n      }\n#endif // !GRAPHICS_DISABLED\n      ColPartition::MakeBigPartition(bbox, big_part_list);\n    }\n  }\n}\n\n// Types all the blobs as vertical text or horizontal text or unknown and\n// puts 
them into initial ColPartitions in the supplied part_grid.
// rerotation determines how to get back to the image coordinates from the
// blob coordinates (since they may have been rotated for vertical text).
// block is the single block for the whole page or rectangle to be OCRed.
// nontext_pix (full-size), is a binary mask used to prevent merges across
// photo/text boundaries. It is not kept beyond this function.
// denorm provides a mapping back to the image from the current blob
// coordinate space.
// projection provides a measure of textline density over the image and
// provides functions to assist with diacritic detection. It should be a
// pointer to a new TextlineProjection, and will be setup here.
// part_grid is the output grid of textline partitions.
// Large blobs that cause overlap are put in separate partitions and added
// to the big_parts list.
void StrokeWidth::GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation,
                                           TO_BLOCK *block, Image nontext_pix, const DENORM *denorm,
                                           bool cjk_script, TextlineProjection *projection,
                                           BLOBNBOX_LIST *diacritic_blobs,
                                           ColPartitionGrid *part_grid,
                                           ColPartition_LIST *big_parts) {
  // Stash working state in members for the duration of this call; all three
  // are reset to nullptr before returning.
  nontext_map_ = nontext_pix;
  projection_ = projection;
  denorm_ = denorm;
  // Clear and re Insert to take advantage of the tab stops in the blobs.
  Clear();
  // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
  InsertBlobs(block);

  // Run FixBrokenCJK() again if the page is CJK.
  if (cjk_script) {
    FixBrokenCJK(block);
  }
  FindTextlineFlowDirection(pageseg_mode, false);
  projection_->ConstructProjection(block, rerotation, nontext_map_);
#ifndef GRAPHICS_DISABLED
  if (textord_tabfind_show_strokewidths) {
    ScrollView *line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
    projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
    projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
  }
#endif
  projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
  projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
  // Clear and re Insert to take advantage of the removed diacritics.
  Clear();
  InsertBlobs(block);
  FCOORD skew;
  FindTextlineFlowDirection(pageseg_mode, true);
  PartitionFindResult r = FindInitialPartitions(pageseg_mode, rerotation, true, block,
                                                diacritic_blobs, part_grid, big_parts, &skew);
  if (r == PFR_NOISE) {
    tprintf("Detected %d diacritics\n", diacritic_blobs->length());
    // Noise was found, and removed. Rebuild the grid and redo the whole
    // partitioning once more with noise detection disabled (3rd arg false).
    Clear();
    InsertBlobs(block);
    FindTextlineFlowDirection(pageseg_mode, true);
    r = FindInitialPartitions(pageseg_mode, rerotation, false, block, diacritic_blobs, part_grid,
                              big_parts, &skew);
  }
  nontext_map_ = nullptr;
  projection_ = nullptr;
  denorm_ = nullptr;
}

// Debug helper: prints the bounding box of the given blob together with its
// horizontal, vertical and perimeter-based stroke widths.
// NOTE(review): the "%1.f" specifier prints p-width with zero decimals
// (field width 1); "%.1f" was probably intended, to match the other two.
static void PrintBoxWidths(BLOBNBOX *neighbour) {
  const TBOX &nbox = neighbour->bounding_box();
  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n", nbox.left(),
          nbox.bottom(), nbox.right(), nbox.top(), neighbour->horz_stroke_width(),
          neighbour->vert_stroke_width(),
          2.0 * neighbour->cblob()->area() / neighbour->cblob()->perimeter());
}

/** Handles a click event in a display window.
 * Prints stroke-width details for the blob under the click, for its 4
 * neighbours, and its neighbour gaps/goodness flags, then stops at the
 * first blob whose box contains the click point. */
void StrokeWidth::HandleClick(int x, int y) {
  BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>::HandleClick(x, y);
  // Run a radial search for blobs that overlap.
  BlobGridSearch radsearch(this);
  radsearch.StartRadSearch(x, y, 1);
  BLOBNBOX *neighbour;
  FCOORD click(static_cast<float>(x), static_cast<float>(y));
  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
    TBOX nbox = neighbour->bounding_box();
    if (nbox.contains(click) && neighbour->cblob() != nullptr) {
      PrintBoxWidths(neighbour);
      if (neighbour->neighbour(BND_LEFT) != nullptr) {
        PrintBoxWidths(neighbour->neighbour(BND_LEFT));
      }
      if (neighbour->neighbour(BND_RIGHT) != nullptr) {
        PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
      }
      if (neighbour->neighbour(BND_ABOVE) != nullptr) {
        PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
      }
      if (neighbour->neighbour(BND_BELOW) != nullptr) {
        PrintBoxWidths(neighbour->neighbour(BND_BELOW));
      }
      int gaps[BND_COUNT];
      neighbour->NeighbourGaps(gaps);
      tprintf(
          "Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
          "Good=    %d        %d        %d        %d\n",
          gaps[BND_LEFT], gaps[BND_RIGHT], gaps[BND_ABOVE], gaps[BND_BELOW],
          neighbour->horz_possible(), neighbour->vert_possible(),
          neighbour->good_stroke_neighbour(BND_LEFT), neighbour->good_stroke_neighbour(BND_RIGHT),
          neighbour->good_stroke_neighbour(BND_ABOVE), neighbour->good_stroke_neighbour(BND_BELOW));
      break;
    }
  }
}

// Detects and marks leader dots/dashes.
//    Leaders are horizontal chains of small or noise blobs that look
//    monospace according to ColPartition::MarkAsLeaderIfMonospaced().
// Detected leaders become the only occupants of the block->small_blobs list.
// Non-leader small blobs get moved to the blobs list.
// Non-leader noise blobs remain singletons 
in the noise list.
// All small and noise blobs in high density regions are marked BTFT_NONTEXT.
// block is the single block for the whole page or rectangle to be OCRed.
// leader_parts is the output.
void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK *block, ColPartition_LIST *leader_parts) {
  InsertBlobList(&block->small_blobs);
  InsertBlobList(&block->noise_blobs);
  BlobGridSearch gsearch(this);
  BLOBNBOX *bbox;
  // For every bbox in the grid, set its neighbours.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    SetNeighbours(true, false, bbox);
  }
  ColPartition_IT part_it(leader_parts);
  // Second pass: walk the left/right neighbour links of each still-undecided
  // (BTFT_NONE) blob to build a horizontal chain, and keep the chain as a
  // leader partition if it looks monospaced.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    if (bbox->flow() == BTFT_NONE) {
      if (bbox->neighbour(BND_RIGHT) == nullptr && bbox->neighbour(BND_LEFT) == nullptr) {
        continue;
      }
      // Put all the linked blobs into a ColPartition.
      auto *part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
      BLOBNBOX *blob;
      // Chain rightwards from bbox (inclusive), then leftwards (exclusive).
      for (blob = bbox; blob != nullptr && blob->flow() == BTFT_NONE;
           blob = blob->neighbour(BND_RIGHT)) {
        part->AddBox(blob);
      }
      for (blob = bbox->neighbour(BND_LEFT); blob != nullptr && blob->flow() == BTFT_NONE;
           blob = blob->neighbour(BND_LEFT)) {
        part->AddBox(blob);
      }
      if (part->MarkAsLeaderIfMonospaced()) {
        part_it.add_after_then_move(part);
      } else {
        delete part;
      }
    }
  }
#ifndef GRAPHICS_DISABLED
  if (textord_tabfind_show_strokewidths) {
    leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
  }
#endif
  // Move any non-leaders from the small to the blobs list, as they are
  // most likely dashes or broken characters.
  BLOBNBOX_IT blob_it(&block->blobs);
  BLOBNBOX_IT small_it(&block->small_blobs);
  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
    BLOBNBOX *blob = small_it.data();
    if (blob->flow() != BTFT_LEADER) {
      if (blob->flow() == BTFT_NEIGHBOURS) {
        blob->set_flow(BTFT_NONE);
      }
      blob->ClearNeighbours();
      blob_it.add_to_end(small_it.extract());
    }
  }
  // Move leaders from the noise list to the small list, leaving the small
  // list exclusively leaders, so they don't get processed further,
  // and the remaining small blobs all in the noise list.
  BLOBNBOX_IT noise_it(&block->noise_blobs);
  for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
    BLOBNBOX *blob = noise_it.data();
    if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
      small_it.add_to_end(noise_it.extract());
    } else if (blob->flow() == BTFT_NEIGHBOURS) {
      blob->set_flow(BTFT_NONE);
      blob->ClearNeighbours();
    }
  }
  // Clear the grid as we don't want the small stuff hanging around in it.
  Clear();
}

/** Inserts the block blobs (normal and large) into this grid.
 * Blobs remain owned by the block. */
void StrokeWidth::InsertBlobs(TO_BLOCK *block) {
  InsertBlobList(&block->blobs);
  InsertBlobList(&block->large_blobs);
}

// Checks the left or right side of the given leader partition and sets the
// (opposite) leader_on_right or leader_on_left flags for blobs
// that are next to the given side of the given leader partition.
void StrokeWidth::MarkLeaderNeighbours(const ColPartition *part, LeftOrRight side) {
  const TBOX &part_box = part->bounding_box();
  BlobGridSearch blobsearch(this);
  // Search to the side of the leader for the nearest neighbour.
  BLOBNBOX *best_blob = nullptr;
  int best_gap = 0;
  blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left() : part_box.right(),
                             part_box.bottom(), part_box.top());
  BLOBNBOX *blob;
  while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != nullptr) {
    const TBOX &blob_box = blob->bounding_box();
    // Only blobs that overlap the leader vertically qualify.
    if (!blob_box.y_overlap(part_box)) {
      continue;
    }
    int x_gap = blob_box.x_gap(part_box);
    // Stop the side search once the gap exceeds 2 grid cells.
    if (x_gap > 2 * gridsize()) {
      break;
    } else if (best_blob == nullptr || x_gap < best_gap) {
      best_blob = blob;
      best_gap = x_gap;
    }
  }
  if (best_blob != nullptr) {
    // The flag set is on the side of the blob facing the leader, hence the
    // apparent inversion of side here.
    if (side == LR_LEFT) {
      best_blob->set_leader_on_right(true);
    } else {
      best_blob->set_leader_on_left(true);
    }
#ifndef GRAPHICS_DISABLED
    if (leaders_win_ != nullptr) {
      leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
      const TBOX &blob_box = best_blob->bounding_box();
      leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top());
    }
#endif // !GRAPHICS_DISABLED
  }
}

// Helper to compute the UQ of the square-ish CJK characters.
// Returns the rounded 75th-percentile height of blobs whose aspect ratio is
// within kCJKAspectRatio of square.
static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST *blobs) {
  STATS sizes(0, gridsize * kMaxCJKSizeRatio - 1);
  BLOBNBOX_IT it(blobs);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOBNBOX *blob = it.data();
    int width = blob->bounding_box().width();
    int height = blob->bounding_box().height();
    if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio) {
      sizes.add(height, 1);
    }
  }
  return static_cast<int>(sizes.ile(0.75f) + 0.5);
}

// Fix broken CJK characters, using the fake joined blobs mechanism.
// Blobs are really merged, ie the master takes all the outlines and the
// others are deleted.
// Returns true if sufficient blobs are merged that it may be worth running
// again, due to a better estimate of character size.
bool StrokeWidth::FixBrokenCJK(TO_BLOCK *block) {
  BLOBNBOX_LIST *blobs = &block->blobs;
  int median_height = UpperQuartileCJKSize(gridsize(), blobs);
  int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
  int max_height = static_cast<int>(median_height * kCJKAspectRatio);
  int num_fixed = 0;
  BLOBNBOX_IT blob_it(blobs);

  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX *blob = blob_it.data();
    // Skip blobs already emptied by an earlier merge in this pass.
    if (blob->cblob() == nullptr || blob->cblob()->out_list()->empty()) {
      continue;
    }
    TBOX bbox = blob->bounding_box();
    bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(), bbox.bottom());
    if (debug) {
      tprintf("Checking for Broken CJK (max size=%d):", max_height);
      bbox.print();
    }
    // Generate a list of blobs that overlap or are near enough to merge.
    BLOBNBOX_CLIST overlapped_blobs;
    AccumulateOverlaps(blob, debug, max_height, max_dist, &bbox, &overlapped_blobs);
    if (!overlapped_blobs.empty()) {
      // There are overlapping blobs, so qualify them as being satisfactory
      // before removing them from the grid and replacing them with the union.
      // The final box must be roughly square.
      if (bbox.width() > bbox.height() * kCJKAspectRatio ||
          bbox.height() > bbox.width() * kCJKAspectRatio) {
        if (debug) {
          tprintf("Bad final aspectratio:");
          bbox.print();
        }
        continue;
      }
      // There can't be too many blobs to merge.
      if (overlapped_blobs.length() >= kCJKMaxComponents) {
        if (debug) {
          tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
        }
        continue;
      }
      // The strokewidths must match amongst the join candidates.
      // The loop breaks early on the first mismatch, so !cycled_list() below
      // detects that a mismatch occurred.
      BLOBNBOX_C_IT n_it(&overlapped_blobs);
      for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
        BLOBNBOX *neighbour = nullptr;
        neighbour = n_it.data();
        if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK, kStrokeWidthCJK)) {
          break;
        }
      }
      if (!n_it.cycled_list()) {
        if (debug) {
          tprintf("Bad stroke widths:");
          PrintBoxWidths(blob);
        }
        continue; // Not good enough.
      }

      // Merge all the candidates into blob.
      // We must remove blob from the grid and reinsert it after merging
      // to maintain the integrity of the grid.
      RemoveBBox(blob);
      // Everything else will be calculated later.
      for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
        BLOBNBOX *neighbour = n_it.data();
        RemoveBBox(neighbour);
        // Mark empty blob for deletion.
        neighbour->set_region_type(BRT_NOISE);
        blob->really_merge(neighbour);
        // NOTE(review): rotate_box runs once per merged neighbour;
        // presumably really_merge() recomputes the box in pre-rotation
        // coordinates each time — confirm against really_merge().
        if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
          blob->rotate_box(rerotation_);
        }
      }
      InsertBBox(true, true, blob);
      ++num_fixed;
      if (debug) {
        tprintf("Done! Final box:");
        bbox.print();
      }
    }
  }
  // Count remaining blobs.
  int num_remaining = 0;
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX *blob = blob_it.data();
    if (blob->cblob() != nullptr && !blob->cblob()->out_list()->empty()) {
      ++num_remaining;
    }
  }
  // Permanently delete all the marked blobs after first removing all
  // references in the neighbour members.
  block->DeleteUnownedNoise();
  return num_fixed > num_remaining * kBrokenCJKIterationFraction;
}

// Helper function to determine whether it is reasonable to merge the
// bbox and the nbox for repairing broken CJK.
// The distance apart must not exceed max_dist, the combined size must
// not exceed max_size, and the aspect ratio must either improve or at
// least not get worse by much.
// Outputs the x and y gaps between the boxes through x_gap/y_gap in all
// cases, including rejection.
static bool AcceptableCJKMerge(const TBOX &bbox, const TBOX &nbox, bool debug, int max_size,
                               int max_dist, int *x_gap, int *y_gap) {
  *x_gap = bbox.x_gap(nbox);
  *y_gap = bbox.y_gap(nbox);
  TBOX merged(nbox);
  merged += bbox;
  if (debug) {
    tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
    merged.print();
  }
  if (*x_gap <= max_dist && *y_gap <= max_dist && merged.width() <= max_size &&
      merged.height() <= max_size) {
    // Close enough to call overlapping. Check aspect ratios.
    // Both ratios are normalized to >= 1.0 before comparing.
    double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
    if (old_ratio < 1.0) {
      old_ratio = 1.0 / old_ratio;
    }
    double new_ratio = static_cast<double>(merged.width()) / merged.height();
    if (new_ratio < 1.0) {
      new_ratio = 1.0 / new_ratio;
    }
    if (new_ratio <= old_ratio * kCJKAspectRatioIncrease) {
      return true;
    }
  }
  return false;
}

// Collect blobs that overlap or are within max_dist of the input bbox.
// Return them in the list of blobs and expand the bbox to be the union
// of all the boxes. 
not_this is excluded from the search, as are blobs
// that cause the merged box to exceed max_size in either dimension.
void StrokeWidth::AccumulateOverlaps(const BLOBNBOX *not_this, bool debug, int max_size,
                                     int max_dist, TBOX *bbox, BLOBNBOX_CLIST *blobs) {
  // While searching, nearests holds the nearest failed blob in each
  // direction. When we have a nearest in each of the 4 directions, then
  // the search is over, and at this point the final bbox must not overlap
  // any of the nearests.
  BLOBNBOX *nearests[BND_COUNT];
  for (auto &nearest : nearests) {
    nearest = nullptr;
  }
  // Centre of the input box: the origin of the radial search.
  int x = (bbox->left() + bbox->right()) / 2;
  int y = (bbox->bottom() + bbox->top()) / 2;
  // Run a radial search for blobs that overlap or are sufficiently close.
  BlobGridSearch radsearch(this);
  radsearch.StartRadSearch(x, y, kCJKRadius);
  BLOBNBOX *neighbour;
  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
    if (neighbour == not_this) {
      continue;
    }
    TBOX nbox = neighbour->bounding_box();
    int x_gap, y_gap;
    if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist, &x_gap, &y_gap)) {
      // Close enough to call overlapping. Merge boxes.
      *bbox += nbox;
      blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
      if (debug) {
        tprintf("Added:");
        nbox.print();
      }
      // Since we merged, search the nearests, as some might now be mergeable.
      // Each successful merge restarts the scan (dir = -1 before ++dir), so
      // this continues until no nearest can be merged.
      for (int dir = 0; dir < BND_COUNT; ++dir) {
        if (nearests[dir] == nullptr) {
          continue;
        }
        nbox = nearests[dir]->bounding_box();
        if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist, &x_gap, &y_gap)) {
          // Close enough to call overlapping. Merge boxes.
          *bbox += nbox;
          blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
          if (debug) {
            tprintf("Added:");
            nbox.print();
          }
          nearests[dir] = nullptr;
          dir = -1; // Restart the search.
        }
      }
    } else if (x_gap < 0 && x_gap <= y_gap) {
      // A vertical neighbour. Record the nearest.
      BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
      if (nearests[dir] == nullptr || y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
        nearests[dir] = neighbour;
      }
    } else if (y_gap < 0 && y_gap <= x_gap) {
      // A horizontal neighbour. Record the nearest.
      BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
      if (nearests[dir] == nullptr || x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
        nearests[dir] = neighbour;
      }
    }
    // If all nearests are non-null, then we have finished.
    if (nearests[BND_LEFT] && nearests[BND_RIGHT] && nearests[BND_ABOVE] && nearests[BND_BELOW]) {
      break;
    }
  }
  // Final overlap with a nearest is not allowed.
  // On failure the output list is emptied (shallow: blobs stay owned
  // elsewhere) and bbox retains whatever growth already happened.
  for (auto &nearest : nearests) {
    if (nearest == nullptr) {
      continue;
    }
    const TBOX &nbox = nearest->bounding_box();
    if (debug) {
      tprintf("Testing for overlap with:");
      nbox.print();
    }
    if (bbox->overlap(nbox)) {
      blobs->shallow_clear();
      if (debug) {
        tprintf("Final box overlaps nearest\n");
      }
      return;
    }
  }
}

// For each blob in this grid, Finds the textline direction to be horizontal
// or vertical according to distance to neighbours and 1st and 2nd order
// neighbours. 
Non-text tends to end up without a definite direction.
// Result is setting of the neighbours and vert_possible/horz_possible
// flags in the BLOBNBOXes currently in this grid.
// This function is called more than once if page orientation is uncertain,
// so display_if_debugging is true on the final call to display the results.
void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode, bool display_if_debugging) {
  BlobGridSearch gsearch(this);
  BLOBNBOX *bbox;
  // For every bbox in the grid, set its neighbours.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    SetNeighbours(false, display_if_debugging, bbox);
  }
  // Where vertical or horizontal wins by a big margin, clarify it.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    SimplifyObviousNeighbours(bbox);
  }
  // Now try to make the blobs only vertical or horizontal using neighbours.
  // In single-direction pageseg modes the flags are forced instead of
  // being derived from the neighbours.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    if (FindingVerticalOnly(pageseg_mode)) {
      bbox->set_vert_possible(true);
      bbox->set_horz_possible(false);
    } else if (FindingHorizontalOnly(pageseg_mode)) {
      bbox->set_vert_possible(false);
      bbox->set_horz_possible(true);
    } else {
      SetNeighbourFlows(bbox);
    }
  }
#ifndef GRAPHICS_DISABLED
  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
      textord_tabfind_show_strokewidths > 1) {
    initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
  }
#endif
  // Improve flow direction with neighbours. Three smoothing passes follow:
  // one without resetting firm values, then two with reset allowed.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    SmoothNeighbourTypes(pageseg_mode, false, bbox);
  }
  // Now allow reset of firm values to fix renegades.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    SmoothNeighbourTypes(pageseg_mode, true, bbox);
  }
  // Repeat.
  gsearch.StartFullSearch();
  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
    SmoothNeighbourTypes(pageseg_mode, true, bbox);
  }
#ifndef GRAPHICS_DISABLED
  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
      textord_tabfind_show_strokewidths > 1) {
    widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
  }
#endif
}

// Sets the neighbours and good_stroke_neighbours members of the blob by
// searching close on all 4 sides.
// When finding leader dots/dashes, there is a slightly different rule for
// what makes a good neighbour.
void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap, BLOBNBOX *blob) {
  int line_trap_count = 0;
  for (int dir = 0; dir < BND_COUNT; ++dir) {
    auto bnd = static_cast<BlobNeighbourDir>(dir);
    line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
  }
  if (line_trap_count > 0 && activate_line_trap) {
    // It looks like a line so isolate it by clearing its neighbours.
    blob->ClearNeighbours();
    const TBOX &box = blob->bounding_box();
    blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
  }
}

// Sets the good_stroke_neighbours member of the blob if it has a
// GoodNeighbour on the given side.
// Also sets the neighbour in the blob, whether or not a good one is found.
// Returns the number of blobs in the nearby search area that would lead us to
// believe that this blob is a line separator.
// Leaders get extra special lenient treatment.
int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders, BLOBNBOX *blob) {
  // Search for neighbours that overlap vertically.
  TBOX blob_box = blob->bounding_box();
  bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(), blob_box.bottom());
  if (debug) {
    tprintf("FGN in dir %d for blob:", dir);
    blob_box.print();
  }
  int top = blob_box.top();
  int bottom = blob_box.bottom();
  int left = blob_box.left();
  int right = blob_box.right();
  int width = right - left;
  int height = top - bottom;

  // A trap to detect lines tests for the min dimension of neighbours
  // being larger than a multiple of the min dimension of the line
  // and the larger dimension being smaller than a fraction of the max
  // dimension of the line.
  int line_trap_max = std::max(width, height) / kLineTrapLongest;
  int line_trap_min = std::min(width, height) * kLineTrapShortest;
  int line_trap_count = 0;

  // Overlap thresholds are relative to the dimension perpendicular to the
  // search direction.
  int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT) ? height / 2 : width / 2;
  int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT) ? 
height / 3 : width / 3;\n  if (leaders) {\n    min_good_overlap = min_decent_overlap = 1;\n  }\n\n  int search_pad =\n      static_cast<int>(sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);\n  if (gridsize() > search_pad) {\n    search_pad = gridsize();\n  }\n  TBOX search_box = blob_box;\n  // Pad the search in the appropriate direction.\n  switch (dir) {\n    case BND_LEFT:\n      search_box.set_left(search_box.left() - search_pad);\n      break;\n    case BND_RIGHT:\n      search_box.set_right(search_box.right() + search_pad);\n      break;\n    case BND_BELOW:\n      search_box.set_bottom(search_box.bottom() - search_pad);\n      break;\n    case BND_ABOVE:\n      search_box.set_top(search_box.top() + search_pad);\n      break;\n    case BND_COUNT:\n      return 0;\n  }\n\n  BlobGridSearch rectsearch(this);\n  rectsearch.StartRectSearch(search_box);\n  BLOBNBOX *best_neighbour = nullptr;\n  double best_goodness = 0.0;\n  bool best_is_good = false;\n  BLOBNBOX *neighbour;\n  while ((neighbour = rectsearch.NextRectSearch()) != nullptr) {\n    TBOX nbox = neighbour->bounding_box();\n    if (neighbour == blob) {\n      continue;\n    }\n    int mid_x = (nbox.left() + nbox.right()) / 2;\n    if (mid_x < blob->left_rule() || mid_x > blob->right_rule()) {\n      continue; // In a different column.\n    }\n    if (debug) {\n      tprintf(\"Neighbour at:\");\n      nbox.print();\n    }\n\n    // Last-minute line detector. 
There is a small upper limit to the line\n    // width accepted by the morphological line detector.\n    int n_width = nbox.width();\n    int n_height = nbox.height();\n    if (std::min(n_width, n_height) > line_trap_min &&\n        std::max(n_width, n_height) < line_trap_max) {\n      ++line_trap_count;\n    }\n    // Heavily joined text, such as Arabic may have very different sizes when\n    // looking at the maxes, but the heights may be almost identical, so check\n    // for a difference in height if looking sideways or width vertically.\n    if (TabFind::VeryDifferentSizes(std::max(n_width, n_height), std::max(width, height)) &&\n        (((dir == BND_LEFT || dir == BND_RIGHT) && TabFind::DifferentSizes(n_height, height)) ||\n         ((dir == BND_BELOW || dir == BND_ABOVE) && TabFind::DifferentSizes(n_width, width)))) {\n      if (debug) {\n        tprintf(\"Bad size\\n\");\n      }\n      continue; // Could be a different font size or non-text.\n    }\n    // Amount of vertical overlap between the blobs.\n    int overlap;\n    // If the overlap is along the short side of the neighbour, and it\n    // is fully overlapped, then perp_overlap holds the length of the long\n    // side of the neighbour. A measure to include hyphens and dashes as\n    // legitimate neighbours.\n    int perp_overlap;\n    int gap;\n    if (dir == BND_LEFT || dir == BND_RIGHT) {\n      overlap = std::min(static_cast<int>(nbox.top()), top) -\n                std::max(static_cast<int>(nbox.bottom()), bottom);\n      if (overlap == nbox.height() && nbox.width() > nbox.height()) {\n        perp_overlap = nbox.width();\n      } else {\n        perp_overlap = overlap;\n      }\n      gap = dir == BND_LEFT ? 
left - nbox.left() : nbox.right() - right;\n      if (gap <= 0) {\n        if (debug) {\n          tprintf(\"On wrong side\\n\");\n        }\n        continue; // On the wrong side.\n      }\n      gap -= n_width;\n    } else {\n      overlap = std::min(static_cast<int>(nbox.right()), right) -\n                std::max(static_cast<int>(nbox.left()), left);\n      if (overlap == nbox.width() && nbox.height() > nbox.width()) {\n        perp_overlap = nbox.height();\n      } else {\n        perp_overlap = overlap;\n      }\n      gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;\n      if (gap <= 0) {\n        if (debug) {\n          tprintf(\"On wrong side\\n\");\n        }\n        continue; // On the wrong side.\n      }\n      gap -= n_height;\n    }\n    if (-gap > overlap) {\n      if (debug) {\n        tprintf(\"Overlaps wrong way\\n\");\n      }\n      continue; // Overlaps the wrong way.\n    }\n    if (perp_overlap < min_decent_overlap) {\n      if (debug) {\n        tprintf(\"Doesn't overlap enough\\n\");\n      }\n      continue; // Doesn't overlap enough.\n    }\n    bool bad_sizes =\n        TabFind::DifferentSizes(height, n_height) && TabFind::DifferentSizes(width, n_width);\n    bool is_good =\n        overlap >= min_good_overlap && !bad_sizes &&\n        blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionTolerance, kStrokeWidthTolerance);\n    // Best is a fuzzy combination of gap, overlap and is good.\n    // Basically if you make one thing twice as good without making\n    // anything else twice as bad, then it is better.\n    if (gap < 1) {\n      gap = 1;\n    }\n    double goodness = (1.0 + is_good) * overlap / gap;\n    if (debug) {\n      tprintf(\"goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\\n\", goodness, best_goodness,\n              is_good, overlap, gap);\n    }\n    if (goodness > best_goodness) {\n      best_neighbour = neighbour;\n      best_goodness = goodness;\n      best_is_good = is_good;\n    
}\n  }\n  blob->set_neighbour(dir, best_neighbour, best_is_good);\n  return line_trap_count;\n}\n\n// Helper to get a list of 1st-order neighbours.\nstatic void ListNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {\n  for (int dir = 0; dir < BND_COUNT; ++dir) {\n    auto bnd = static_cast<BlobNeighbourDir>(dir);\n    BLOBNBOX *neighbour = blob->neighbour(bnd);\n    if (neighbour != nullptr) {\n      neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);\n    }\n  }\n}\n\n// Helper to get a list of 1st and 2nd order neighbours.\nstatic void List2ndNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {\n  ListNeighbours(blob, neighbours);\n  for (int dir = 0; dir < BND_COUNT; ++dir) {\n    auto bnd = static_cast<BlobNeighbourDir>(dir);\n    BLOBNBOX *neighbour = blob->neighbour(bnd);\n    if (neighbour != nullptr) {\n      ListNeighbours(neighbour, neighbours);\n    }\n  }\n}\n\n// Helper to get a list of 1st, 2nd and 3rd order neighbours.\nstatic void List3rdNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {\n  List2ndNeighbours(blob, neighbours);\n  for (int dir = 0; dir < BND_COUNT; ++dir) {\n    auto bnd = static_cast<BlobNeighbourDir>(dir);\n    BLOBNBOX *neighbour = blob->neighbour(bnd);\n    if (neighbour != nullptr) {\n      List2ndNeighbours(neighbour, neighbours);\n    }\n  }\n}\n\n// Helper to count the evidence for verticalness or horizontalness\n// in a list of neighbours.\nstatic void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST *neighbours, int *pure_h_count,\n                               int *pure_v_count) {\n  if (neighbours->length() <= kMostlyOneDirRatio) {\n    return;\n  }\n  BLOBNBOX_C_IT it(neighbours);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    int h_min, h_max, v_min, v_max;\n    blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);\n    if (debug) {\n      tprintf(\"Hgaps [%d,%d], vgaps [%d,%d]:\", h_min, h_max, v_min, v_max);\n    }\n 
   if (h_max < v_min || blob->leader_on_left() || blob->leader_on_right()) {\n      // Horizontal gaps are clear winners. Count a pure horizontal.\n      ++*pure_h_count;\n      if (debug) {\n        tprintf(\"Horz at:\");\n      }\n    } else if (v_max < h_min) {\n      // Vertical gaps are clear winners. Count a pure vertical.\n      ++*pure_v_count;\n      if (debug) {\n        tprintf(\"Vert at:\");\n      }\n    } else {\n      if (debug) {\n        tprintf(\"Neither at:\");\n      }\n    }\n    if (debug) {\n      blob->bounding_box().print();\n    }\n  }\n}\n\n// Makes the blob to be only horizontal or vertical where evidence\n// is clear based on gaps of 2nd order neighbours, or definite individual\n// blobs.\nvoid StrokeWidth::SetNeighbourFlows(BLOBNBOX *blob) {\n  if (blob->DefiniteIndividualFlow()) {\n    return;\n  }\n  bool debug =\n      AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(), blob->bounding_box().bottom());\n  if (debug) {\n    tprintf(\"SetNeighbourFlows (current flow=%d, type=%d) on:\", blob->flow(), blob->region_type());\n    blob->bounding_box().print();\n  }\n  BLOBNBOX_CLIST neighbours;\n  List3rdNeighbours(blob, &neighbours);\n  // The number of pure horizontal and vertical neighbours.\n  int pure_h_count = 0;\n  int pure_v_count = 0;\n  CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);\n  if (debug) {\n    HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);\n    tprintf(\"SetFlows: h_count=%d, v_count=%d\\n\", pure_h_count, pure_v_count);\n  }\n  if (!neighbours.empty()) {\n    blob->set_vert_possible(true);\n    blob->set_horz_possible(true);\n    if (pure_h_count > 2 * pure_v_count) {\n      // Horizontal gaps are clear winners. Clear vertical neighbours.\n      blob->set_vert_possible(false);\n    } else if (pure_v_count > 2 * pure_h_count) {\n      // Vertical gaps are clear winners. 
Clear horizontal neighbours.\n      blob->set_horz_possible(false);\n    }\n  } else {\n    // Lonely blob. Can't tell its flow direction.\n    blob->set_vert_possible(false);\n    blob->set_horz_possible(false);\n  }\n}\n\n// Helper to count the number of horizontal and vertical blobs in a list.\nstatic void CountNeighbourTypes(BLOBNBOX_CLIST *neighbours, int *pure_h_count, int *pure_v_count) {\n  BLOBNBOX_C_IT it(neighbours);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    if (blob->UniquelyHorizontal()) {\n      ++*pure_h_count;\n    }\n    if (blob->UniquelyVertical()) {\n      ++*pure_v_count;\n    }\n  }\n}\n\n// Nullify the neighbours in the wrong directions where the direction\n// is clear-cut based on a distance margin. Good for isolating vertical\n// text from neighbouring horizontal text.\nvoid StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX *blob) {\n  // Case 1: We have text that is likely several characters, blurry and joined\n  //         together.\n  if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&\n       blob->bounding_box().height() > 3 * blob->area_stroke_width())) {\n    // The blob is complex (not stick-like).\n    if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {\n      // Horizontal conjoined text.\n      blob->set_neighbour(BND_ABOVE, nullptr, false);\n      blob->set_neighbour(BND_BELOW, nullptr, false);\n      return;\n    }\n    if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {\n      // Vertical conjoined text.\n      blob->set_neighbour(BND_LEFT, nullptr, false);\n      blob->set_neighbour(BND_RIGHT, nullptr, false);\n      return;\n    }\n  }\n\n  // Case 2: This blob is likely a single character.\n  int margin = gridsize() / 2;\n  int h_min, h_max, v_min, v_max;\n  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);\n  if ((h_max + margin < v_min && h_max < margin / 2) || blob->leader_on_left() ||\n      
blob->leader_on_right()) {\n    // Horizontal gaps are clear winners. Clear vertical neighbours.\n    blob->set_neighbour(BND_ABOVE, nullptr, false);\n    blob->set_neighbour(BND_BELOW, nullptr, false);\n  } else if (v_max + margin < h_min && v_max < margin / 2) {\n    // Vertical gaps are clear winners. Clear horizontal neighbours.\n    blob->set_neighbour(BND_LEFT, nullptr, false);\n    blob->set_neighbour(BND_RIGHT, nullptr, false);\n  }\n}\n\n// Smoothes the vertical/horizontal type of the blob based on the\n// 2nd-order neighbours. If reset_all is true, then all blobs are\n// changed. Otherwise, only ambiguous blobs are processed.\nvoid StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all, BLOBNBOX *blob) {\n  if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {\n    // There are both horizontal and vertical so try to fix it.\n    BLOBNBOX_CLIST neighbours;\n    List2ndNeighbours(blob, &neighbours);\n    // The number of pure horizontal and vertical neighbours.\n    int pure_h_count = 0;\n    int pure_v_count = 0;\n    CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);\n    if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),\n                                      blob->bounding_box().bottom())) {\n      HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);\n      tprintf(\"pure_h=%d, pure_v=%d\\n\", pure_h_count, pure_v_count);\n    }\n    if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {\n      // Horizontal gaps are clear winners. Clear vertical neighbours.\n      blob->set_vert_possible(false);\n      blob->set_horz_possible(true);\n    } else if (pure_v_count > pure_h_count && !FindingHorizontalOnly(pageseg_mode)) {\n      // Vertical gaps are clear winners. 
Clear horizontal neighbours.\n      blob->set_horz_possible(false);\n      blob->set_vert_possible(true);\n    }\n  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),\n                                           blob->bounding_box().bottom())) {\n    HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);\n    tprintf(\"Clean on pass 3!\\n\");\n  }\n}\n\n// Partition creation. Accumulates vertical and horizontal text chains,\n// puts the remaining blobs in as unknowns, and then merges/splits to\n// minimize overlap and smoothes the types with neighbours and the color\n// image if provided. rerotation is used to rotate the coordinate space\n// back to the nontext_map_ image.\n// If find_problems is true, detects possible noise pollution by the amount\n// of partition overlap that is created by the diacritics. If excessive, the\n// noise is separated out into diacritic blobs, and PFR_NOISE is returned.\n// [TODO(rays): if the partition overlap is caused by heavy skew, deskews\n// the components, saves the skew_angle and returns PFR_SKEW.] 
If the return\n// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be\n// called again after cleaning up the partly done work.\nPartitionFindResult StrokeWidth::FindInitialPartitions(\n    PageSegMode pageseg_mode, const FCOORD &rerotation, bool find_problems, TO_BLOCK *block,\n    BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts,\n    FCOORD *skew_angle) {\n  if (!FindingHorizontalOnly(pageseg_mode)) {\n    FindVerticalTextChains(part_grid);\n  }\n  if (!FindingVerticalOnly(pageseg_mode)) {\n    FindHorizontalTextChains(part_grid);\n  }\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_strokewidths) {\n    chains_win_ = MakeWindow(0, 400, \"Initial text chains\");\n    part_grid->DisplayBoxes(chains_win_);\n    projection_->DisplayProjection();\n  }\n#endif\n  if (find_problems) {\n    // TODO(rays) Do something to find skew, set skew_angle and return if there\n    // is some.\n  }\n  part_grid->SplitOverlappingPartitions(big_parts);\n  EasyMerges(part_grid);\n  RemoveLargeUnusedBlobs(block, part_grid, big_parts);\n  TBOX grid_box(bleft(), tright());\n  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, rerotation)) {\n    ;\n  }\n  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)) {\n    ;\n  }\n  int pre_overlap = part_grid->ComputeTotalOverlap(nullptr);\n  TestDiacritics(part_grid, block);\n  MergeDiacritics(block, part_grid);\n  if (find_problems && diacritic_blobs != nullptr &&\n      DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid, diacritic_blobs)) {\n    return PFR_NOISE;\n  }\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_strokewidths) {\n    textlines_win_ = MakeWindow(400, 400, \"GoodTextline blobs\");\n    part_grid->DisplayBoxes(textlines_win_);\n    diacritics_win_ = DisplayDiacritics(\"Diacritics\", 0, 0, block);\n  }\n#endif\n  PartitionRemainingBlobs(pageseg_mode, part_grid);\n  
part_grid->SplitOverlappingPartitions(big_parts);\n  EasyMerges(part_grid);\n  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, rerotation)) {\n    ;\n  }\n  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)) {\n    ;\n  }\n  // Now eliminate strong stuff in a sea of the opposite.\n  while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_, grid_box, rerotation)) {\n    ;\n  }\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_strokewidths) {\n    smoothed_win_ = MakeWindow(800, 400, \"Smoothed blobs\");\n    part_grid->DisplayBoxes(smoothed_win_);\n  }\n#endif\n  return PFR_OK;\n}\n\n// Detects noise by a significant increase in partition overlap from\n// pre_overlap to now, and removes noise from the union of all the overlapping\n// partitions, placing the blobs in diacritic_blobs. Returns true if any noise\n// was found and removed.\nbool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX &grid_box, TO_BLOCK *block,\n                                       ColPartitionGrid *part_grid,\n                                       BLOBNBOX_LIST *diacritic_blobs) {\n  ColPartitionGrid *noise_grid = nullptr;\n  int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);\n  if (pre_overlap == 0) {\n    pre_overlap = 1;\n  }\n  BLOBNBOX_IT diacritic_it(diacritic_blobs);\n  if (noise_grid != nullptr) {\n    if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&\n        post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {\n      // This is noisy enough to fix.\n#ifndef GRAPHICS_DISABLED\n      if (textord_tabfind_show_strokewidths) {\n        ScrollView *noise_win = MakeWindow(1000, 500, \"Noise Areas\");\n        noise_grid->DisplayBoxes(noise_win);\n      }\n#endif\n      part_grid->DeleteNonLeaderParts();\n      BLOBNBOX_IT blob_it(&block->noise_blobs);\n      ColPartitionGridSearch rsearch(noise_grid);\n      for (blob_it.mark_cycle_pt(); 
!blob_it.cycled_list(); blob_it.forward()) {\n        BLOBNBOX *blob = blob_it.data();\n        blob->ClearNeighbours();\n        if (!blob->IsDiacritic() || blob->owner() != nullptr) {\n          continue; // Not a noise candidate.\n        }\n        TBOX search_box(blob->bounding_box());\n        search_box.pad(gridsize(), gridsize());\n        rsearch.StartRectSearch(search_box);\n        ColPartition *part = rsearch.NextRectSearch();\n        if (part != nullptr) {\n          // Consider blob as possible noise.\n          blob->set_owns_cblob(true);\n          blob->compute_bounding_box();\n          diacritic_it.add_after_then_move(blob_it.extract());\n        }\n      }\n      noise_grid->DeleteParts();\n      delete noise_grid;\n      return true;\n    }\n    noise_grid->DeleteParts();\n    delete noise_grid;\n  }\n  return false;\n}\n\n// Helper verifies that blob's neighbour in direction dir is good to add to a\n// vertical text chain by returning the neighbour if it is not null, not owned,\n// and not uniquely horizontal, as well as its neighbour in the opposite\n// direction is blob.\nstatic BLOBNBOX *MutualUnusedVNeighbour(const BLOBNBOX *blob, BlobNeighbourDir dir) {\n  BLOBNBOX *next_blob = blob->neighbour(dir);\n  if (next_blob == nullptr || next_blob->owner() != nullptr || next_blob->UniquelyHorizontal()) {\n    return nullptr;\n  }\n  if (next_blob->neighbour(DirOtherWay(dir)) == blob) {\n    return next_blob;\n  }\n  return nullptr;\n}\n\n// Finds vertical chains of text-like blobs and puts them in ColPartitions.\nvoid StrokeWidth::FindVerticalTextChains(ColPartitionGrid *part_grid) {\n  // A PageSegMode that forces vertical textlines with the current rotation.\n  PageSegMode pageseg_mode =\n      rerotation_.y() == 0.0f ? 
PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;\n  BlobGridSearch gsearch(this);\n  BLOBNBOX *bbox;\n  gsearch.StartFullSearch();\n  while ((bbox = gsearch.NextFullSearch()) != nullptr) {\n    // Only process boxes that have no horizontal hope and have not yet\n    // been included in a chain.\n    BLOBNBOX *blob;\n    if (bbox->owner() == nullptr && bbox->UniquelyVertical() &&\n        (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != nullptr) {\n      // Put all the linked blobs into a ColPartition.\n      auto *part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));\n      part->AddBox(bbox);\n      while (blob != nullptr) {\n        part->AddBox(blob);\n        blob = MutualUnusedVNeighbour(blob, BND_ABOVE);\n      }\n      blob = MutualUnusedVNeighbour(bbox, BND_BELOW);\n      while (blob != nullptr) {\n        part->AddBox(blob);\n        blob = MutualUnusedVNeighbour(blob, BND_BELOW);\n      }\n      CompletePartition(pageseg_mode, part, part_grid);\n    }\n  }\n}\n\n// Helper verifies that blob's neighbour in direction dir is good to add to a\n// horizontal text chain by returning the neighbour if it is not null, not\n// owned, and not uniquely vertical, as well as its neighbour in the opposite\n// direction is blob.\nstatic BLOBNBOX *MutualUnusedHNeighbour(const BLOBNBOX *blob, BlobNeighbourDir dir) {\n  BLOBNBOX *next_blob = blob->neighbour(dir);\n  if (next_blob == nullptr || next_blob->owner() != nullptr || next_blob->UniquelyVertical()) {\n    return nullptr;\n  }\n  if (next_blob->neighbour(DirOtherWay(dir)) == blob) {\n    return next_blob;\n  }\n  return nullptr;\n}\n\n// Finds horizontal chains of text-like blobs and puts them in ColPartitions.\nvoid StrokeWidth::FindHorizontalTextChains(ColPartitionGrid *part_grid) {\n  // A PageSegMode that forces horizontal textlines with the current rotation.\n  PageSegMode pageseg_mode =\n      rerotation_.y() == 0.0f ? 
PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;\n  BlobGridSearch gsearch(this);\n  BLOBNBOX *bbox;\n  gsearch.StartFullSearch();\n  while ((bbox = gsearch.NextFullSearch()) != nullptr) {\n    // Only process boxes that have no vertical hope and have not yet\n    // been included in a chain.\n    BLOBNBOX *blob;\n    if (bbox->owner() == nullptr && bbox->UniquelyHorizontal() &&\n        (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != nullptr) {\n      // Put all the linked blobs into a ColPartition.\n      auto *part = new ColPartition(BRT_TEXT, ICOORD(0, 1));\n      part->AddBox(bbox);\n      while (blob != nullptr) {\n        part->AddBox(blob);\n        blob = MutualUnusedHNeighbour(blob, BND_RIGHT);\n      }\n      // Extend the chain leftwards too. The stepping call must be the\n      // horizontal variant: the vertical variant (previously used here by\n      // copy/paste mistake) rejects uniquely-horizontal blobs and so cut\n      // the leftward walk short after a single step.\n      blob = MutualUnusedHNeighbour(bbox, BND_LEFT);\n      while (blob != nullptr) {\n        part->AddBox(blob);\n        blob = MutualUnusedHNeighbour(blob, BND_LEFT);\n      }\n      CompletePartition(pageseg_mode, part, part_grid);\n    }\n  }\n}\n\n// Finds diacritics and saves their base character in the blob.\n// The objective is to move all diacritics to the noise_blobs list, so\n// they don't mess up early textline finding/merging, or force splits\n// on textlines that overlap a bit. 
Blobs that become diacritics must be\n// either part of no ColPartition (nullptr owner) or in a small partition in\n// which ALL the blobs are diacritics, in which case the partition is\n// exploded (deleted) back to its blobs.\nvoid StrokeWidth::TestDiacritics(ColPartitionGrid *part_grid, TO_BLOCK *block) {\n  BlobGrid small_grid(gridsize(), bleft(), tright());\n  small_grid.InsertBlobList(&block->noise_blobs);\n  small_grid.InsertBlobList(&block->blobs);\n  int medium_diacritics = 0;\n  int small_diacritics = 0;\n  BLOBNBOX_IT small_it(&block->noise_blobs);\n  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {\n    BLOBNBOX *blob = small_it.data();\n    if (blob->owner() == nullptr && !blob->IsDiacritic() && DiacriticBlob(&small_grid, blob)) {\n      ++small_diacritics;\n    }\n  }\n  BLOBNBOX_IT blob_it(&block->blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    if (blob->IsDiacritic()) {\n      small_it.add_to_end(blob_it.extract());\n      continue; // Already a diacritic.\n    }\n    ColPartition *part = blob->owner();\n    if (part == nullptr && DiacriticBlob(&small_grid, blob)) {\n      ++medium_diacritics;\n      RemoveBBox(blob);\n      small_it.add_to_end(blob_it.extract());\n    } else if (part != nullptr && !part->block_owned() && part->boxes_count() < 3) {\n      // We allow blobs in small partitions to become diacritics if ALL the\n      // blobs in the partition qualify as we can then cleanly delete the\n      // partition, turn all the blobs in it to diacritics and they can be\n      // merged into the base character partition more easily than merging\n      // the partitions.\n      BLOBNBOX_C_IT box_it(part->boxes());\n      for (box_it.mark_cycle_pt();\n           !box_it.cycled_list() && DiacriticBlob(&small_grid, box_it.data()); box_it.forward()) {\n        ;\n      }\n      if (box_it.cycled_list()) {\n        // They are all good.\n        
while (!box_it.empty()) {\n          // Liberate the blob from its partition so it can be treated\n          // as a diacritic and merged explicitly with the base part.\n          // The blob is really owned by the block. The partition \"owner\"\n          // is nulled to allow the blob to get merged with its base character\n          // partition.\n          BLOBNBOX *box = box_it.extract();\n          box->set_owner(nullptr);\n          box_it.forward();\n          ++medium_diacritics;\n          // We remove the blob from the grid so it isn't found by subsequent\n          // searches where we might not want to include diacritics.\n          RemoveBBox(box);\n        }\n        // We only move the one blob to the small list here, but the others\n        // all get moved by the test at the top of the loop.\n        small_it.add_to_end(blob_it.extract());\n        part_grid->RemoveBBox(part);\n        delete part;\n      }\n    } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),\n                                             blob->bounding_box().bottom())) {\n      tprintf(\"Blob not available to be a diacritic at:\");\n      blob->bounding_box().print();\n    }\n  }\n  if (textord_tabfind_show_strokewidths) {\n    tprintf(\"Found %d small diacritics, %d medium\\n\", small_diacritics, medium_diacritics);\n  }\n}\n\n// Searches this grid for an appropriately close and sized neighbour of the\n// given [small] blob. If such a blob is found, the diacritic base is saved\n// in the blob and true is returned.\n// The small_grid is a secondary grid that contains the small/noise objects\n// that are not in this grid, but may be useful for determining a connection\n// between blob and its potential base character. 
(See DiacriticXGapFilled.)\nbool StrokeWidth::DiacriticBlob(BlobGrid *small_grid, BLOBNBOX *blob) {\n  if (BLOBNBOX::UnMergeableType(blob->region_type()) || blob->region_type() == BRT_VERT_TEXT) {\n    return false;\n  }\n  TBOX small_box(blob->bounding_box());\n  bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(), small_box.bottom());\n  if (debug) {\n    tprintf(\"Testing blob for diacriticness at:\");\n    small_box.print();\n  }\n  int x = (small_box.left() + small_box.right()) / 2;\n  int y = (small_box.bottom() + small_box.top()) / 2;\n  int grid_x, grid_y;\n  GridCoords(x, y, &grid_x, &grid_y);\n  int height = small_box.height();\n  // Set up a rectangle search to find its nearest base-character neighbour.\n  // We keep 2 different best candidates:\n  // best_x_overlap is a category of base characters that have an overlap in x\n  // (like an acute) in which we look for the least y-gap, computed using the\n  // projection to favor base characters in the same textline.\n  // best_y_overlap is a category of base characters that have no x overlap,\n  // (nominally a y-overlap is preferred but not essential) in which we\n  // look for the least weighted sum of x-gap and y-gap, with x-gap getting\n  // a lower weight to catch quotes at the end of a textline.\n  // NOTE that x-gap and y-gap are measured from the nearest side of the base\n  // character to the FARTHEST side of the diacritic to allow small diacritics\n  // to be a reasonable distance away, but not big diacritics.\n  BLOBNBOX *best_x_overlap = nullptr;\n  BLOBNBOX *best_y_overlap = nullptr;\n  int best_total_dist = 0;\n  int best_y_gap = 0;\n  TBOX best_xbox;\n  // TODO(rays) the search box could be set up using the projection as a guide.\n  TBOX search_box(small_box);\n  int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);\n  int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);\n  search_box.pad(x_pad, y_pad);\n  BlobGridSearch rsearch(this);\n  
rsearch.SetUniqueMode(true);\n  int min_height = height * kMinDiacriticSizeRatio;\n  rsearch.StartRectSearch(search_box);\n  BLOBNBOX *neighbour;\n  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {\n    if (BLOBNBOX::UnMergeableType(neighbour->region_type()) || neighbour == blob ||\n        neighbour->owner() == blob->owner()) {\n      continue;\n    }\n    TBOX nbox = neighbour->bounding_box();\n    if (neighbour->owner() == nullptr || neighbour->owner()->IsVerticalType() ||\n        (neighbour->flow() != BTFT_CHAIN && neighbour->flow() != BTFT_STRONG_CHAIN)) {\n      if (debug) {\n        tprintf(\"Neighbour not strong enough:\");\n        nbox.print();\n      }\n      continue; // Diacritics must be attached to strong text.\n    }\n    if (nbox.height() < min_height) {\n      if (debug) {\n        tprintf(\"Neighbour not big enough:\");\n        nbox.print();\n      }\n      continue; // Too small to be the base character.\n    }\n    int x_gap = small_box.x_gap(nbox);\n    int y_gap = small_box.y_gap(nbox);\n    int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox, true, denorm_, debug);\n    if (debug) {\n      tprintf(\"xgap=%d, y=%d, total dist=%d\\n\", x_gap, y_gap, total_distance);\n    }\n    if (total_distance > neighbour->owner()->median_height() * kMaxDiacriticDistanceRatio) {\n      if (debug) {\n        tprintf(\"Neighbour with median size %d too far away:\", neighbour->owner()->median_height());\n        neighbour->bounding_box().print();\n      }\n      continue; // Diacritics must not be too distant.\n    }\n    if (x_gap <= 0) {\n      if (debug) {\n        tprintf(\"Computing reduced box for :\");\n        nbox.print();\n      }\n      int left = small_box.left() - small_box.width();\n      int right = small_box.right() + small_box.width();\n      nbox = neighbour->BoundsWithinLimits(left, right);\n      y_gap = small_box.y_gap(nbox);\n      if (best_x_overlap == nullptr || y_gap < best_y_gap) {\n        
best_x_overlap = neighbour;\n        best_xbox = nbox;\n        best_y_gap = y_gap;\n        if (debug) {\n          tprintf(\"New best:\");\n          nbox.print();\n        }\n      } else if (debug) {\n        tprintf(\"Shrunken box doesn't win:\");\n        nbox.print();\n      }\n    } else if (blob->ConfirmNoTabViolation(*neighbour)) {\n      if (best_y_overlap == nullptr || total_distance < best_total_dist) {\n        if (debug) {\n          tprintf(\"New best y overlap:\");\n          nbox.print();\n        }\n        best_y_overlap = neighbour;\n        best_total_dist = total_distance;\n      } else if (debug) {\n        tprintf(\"New y overlap box doesn't win:\");\n        nbox.print();\n      }\n    } else if (debug) {\n      tprintf(\"Neighbour wrong side of a tab:\");\n      nbox.print();\n    }\n  }\n  if (best_x_overlap != nullptr &&\n      (best_y_overlap == nullptr || best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {\n    blob->set_diacritic_box(best_xbox);\n    blob->set_base_char_blob(best_x_overlap);\n    if (debug) {\n      tprintf(\"DiacriticBlob OK! (x-overlap:\");\n      small_box.print();\n      best_xbox.print();\n    }\n    return true;\n  }\n  if (best_y_overlap != nullptr &&\n      DiacriticXGapFilled(small_grid, small_box, best_y_overlap->bounding_box()) &&\n      NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {\n    blob->set_diacritic_box(best_y_overlap->bounding_box());\n    blob->set_base_char_blob(best_y_overlap);\n    if (debug) {\n      tprintf(\"DiacriticBlob OK! 
(y-overlap:\");\n      small_box.print();\n      best_y_overlap->bounding_box().print();\n    }\n    return true;\n  }\n  if (debug) {\n    tprintf(\"DiacriticBlob fails:\");\n    small_box.print();\n    tprintf(\"Best x+y gap = %d, y = %d\\n\", best_total_dist, best_y_gap);\n    if (best_y_overlap != nullptr) {\n      tprintf(\"XGapFilled=%d, NoiseBetween=%d\\n\",\n              DiacriticXGapFilled(small_grid, small_box, best_y_overlap->bounding_box()),\n              NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));\n    }\n  }\n  return false;\n}\n\n// Returns true if there is no gap between the base char and the diacritic\n// bigger than a fraction of the height of the base char:\n// Eg: line end.....'\n// The quote is a long way from the end of the line, yet it needs to be a\n// diacritic. To determine that the quote is not part of an image, or\n// a different text block, we check for other marks in the gap between\n// the base char and the diacritic.\n//                          '<--Diacritic\n// |---------|\n// |         |<-toobig-gap->\n// | Base    |<ok gap>\n// |---------|        x<-----Dot occupying gap\n// The grid is const really.\nbool StrokeWidth::DiacriticXGapFilled(BlobGrid *grid, const TBOX &diacritic_box,\n                                      const TBOX &base_box) {\n  // Since most gaps are small, use an iterative algorithm to search the gap.\n  int max_gap = IntCastRounded(base_box.height() * kMaxDiacriticGapToBaseCharHeight);\n  TBOX occupied_box(base_box);\n  int diacritic_gap;\n  while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {\n    TBOX search_box(occupied_box);\n    if (diacritic_box.left() > search_box.right()) {\n      // We are looking right.\n      search_box.set_left(search_box.right());\n      search_box.set_right(search_box.left() + max_gap);\n    } else {\n      // We are looking left.\n      search_box.set_right(search_box.left());\n      search_box.set_left(search_box.left() - max_gap);\n    
}\n    BlobGridSearch rsearch(grid);\n    rsearch.StartRectSearch(search_box);\n    BLOBNBOX *neighbour;\n    while ((neighbour = rsearch.NextRectSearch()) != nullptr) {\n      const TBOX &nbox = neighbour->bounding_box();\n      if (nbox.x_gap(diacritic_box) < diacritic_gap) {\n        if (nbox.left() < occupied_box.left()) {\n          occupied_box.set_left(nbox.left());\n        }\n        if (nbox.right() > occupied_box.right()) {\n          occupied_box.set_right(nbox.right());\n        }\n        break;\n      }\n    }\n    if (neighbour == nullptr) {\n      return false; // Found a big gap.\n    }\n  }\n  return true; // The gap was filled.\n}\n\n// Merges diacritics with the ColPartition of the base character blob.\nvoid StrokeWidth::MergeDiacritics(TO_BLOCK *block, ColPartitionGrid *part_grid) {\n  BLOBNBOX_IT small_it(&block->noise_blobs);\n  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {\n    BLOBNBOX *blob = small_it.data();\n    if (blob->base_char_blob() != nullptr) {\n      ColPartition *part = blob->base_char_blob()->owner();\n      // The base character must be owned by a partition and that partition\n      // must not be on the big_parts list (not block owned).\n      if (part != nullptr && !part->block_owned() && blob->owner() == nullptr &&\n          blob->IsDiacritic()) {\n        // The partition has to be removed from the grid and reinserted\n        // because its bounding box may change.\n        part_grid->RemoveBBox(part);\n        part->AddBox(blob);\n        blob->set_region_type(part->blob_type());\n        blob->set_flow(part->flow());\n        blob->set_owner(part);\n        part_grid->InsertBBox(true, true, part);\n      }\n      // Set all base chars to nullptr before any blobs get deleted.\n      blob->set_base_char_blob(nullptr);\n    }\n  }\n}\n\n// Any blobs on the large_blobs list of block that are still unowned by a\n// ColPartition, are probably drop-cap or vertically touching so the blobs\n// 
are removed to the big_parts list and treated separately.\nvoid StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK *block, ColPartitionGrid *part_grid,\n                                         ColPartition_LIST *big_parts) {\n  BLOBNBOX_IT large_it(&block->large_blobs);\n  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {\n    BLOBNBOX *blob = large_it.data();\n    ColPartition *big_part = blob->owner();\n    if (big_part == nullptr) {\n      // Large blobs should have gone into partitions by now if they are\n      // genuine characters, so move any unowned ones out to the big parts\n      // list. This will include drop caps and vertically touching characters.\n      ColPartition::MakeBigPartition(blob, big_parts);\n    }\n  }\n}\n\n// All remaining unused blobs are put in individual ColPartitions.\nvoid StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode, ColPartitionGrid *part_grid) {\n  BlobGridSearch gsearch(this);\n  BLOBNBOX *bbox;\n  int prev_grid_x = -1;\n  int prev_grid_y = -1;\n  BLOBNBOX_CLIST cell_list;\n  BLOBNBOX_C_IT cell_it(&cell_list);\n  bool cell_all_noise = true;\n  gsearch.StartFullSearch();\n  while ((bbox = gsearch.NextFullSearch()) != nullptr) {\n    int grid_x = gsearch.GridX();\n    int grid_y = gsearch.GridY();\n    if (grid_x != prev_grid_x || grid_y != prev_grid_y) {\n      // New cell. 
Process old cell.\n      MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid, &cell_list);\n      cell_it.set_to_list(&cell_list);\n      prev_grid_x = grid_x;\n      prev_grid_y = grid_y;\n      cell_all_noise = true;\n    }\n    if (bbox->owner() == nullptr) {\n      cell_it.add_to_end(bbox);\n      if (bbox->flow() != BTFT_NONTEXT) {\n        cell_all_noise = false;\n      }\n    } else {\n      cell_all_noise = false;\n    }\n  }\n  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid, &cell_list);\n}\n\n// If combine, put all blobs in the cell_list into a single partition, otherwise\n// put each one into its own partition.\nvoid StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode, bool combine,\n                                             ColPartitionGrid *part_grid,\n                                             BLOBNBOX_CLIST *cell_list) {\n  if (cell_list->empty()) {\n    return;\n  }\n  BLOBNBOX_C_IT cell_it(cell_list);\n  if (combine) {\n    BLOBNBOX *bbox = cell_it.extract();\n    auto *part = new ColPartition(bbox->region_type(), ICOORD(0, 1));\n    part->AddBox(bbox);\n    part->set_flow(bbox->flow());\n    for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {\n      part->AddBox(cell_it.extract());\n    }\n    CompletePartition(pageseg_mode, part, part_grid);\n  } else {\n    for (; !cell_it.empty(); cell_it.forward()) {\n      BLOBNBOX *bbox = cell_it.extract();\n      auto *part = new ColPartition(bbox->region_type(), ICOORD(0, 1));\n      part->set_flow(bbox->flow());\n      part->AddBox(bbox);\n      CompletePartition(pageseg_mode, part, part_grid);\n    }\n  }\n}\n\n// Helper function to finish setting up a ColPartition and insert into\n// part_grid.\nvoid StrokeWidth::CompletePartition(PageSegMode pageseg_mode, ColPartition *part,\n                                    ColPartitionGrid *part_grid) {\n  part->ComputeLimits();\n  TBOX box = part->bounding_box();\n  bool debug = 
AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());\n  int value = projection_->EvaluateColPartition(*part, denorm_, debug);\n  // Override value if pageseg_mode disagrees.\n  if (value > 0 && FindingVerticalOnly(pageseg_mode)) {\n    value = part->boxes_count() == 1 ? 0 : -2;\n  } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {\n    value = part->boxes_count() == 1 ? 0 : 2;\n  }\n  part->SetRegionAndFlowTypesFromProjectionValue(value);\n  part->ClaimBoxes();\n  part_grid->InsertBBox(true, true, part);\n}\n\n// Merge partitions where the merge appears harmless.\n// As this\nvoid StrokeWidth::EasyMerges(ColPartitionGrid *part_grid) {\n  using namespace std::placeholders; // for _1, _2\n  part_grid->Merges(std::bind(&StrokeWidth::OrientationSearchBox, this, _1, _2),\n                    std::bind(&StrokeWidth::ConfirmEasyMerge, this, _1, _2));\n}\n\n// Compute a search box based on the orientation of the partition.\n// Returns true if a suitable box can be calculated.\n// Callback for EasyMerges.\nbool StrokeWidth::OrientationSearchBox(ColPartition *part, TBOX *box) {\n  if (part->IsVerticalType()) {\n    box->set_top(box->top() + box->width());\n    box->set_bottom(box->bottom() - box->width());\n  } else {\n    box->set_left(box->left() - box->height());\n    box->set_right(box->right() + box->height());\n  }\n  return true;\n}\n\n// Merge confirmation callback for EasyMerges.\nbool StrokeWidth::ConfirmEasyMerge(const ColPartition *p1, const ColPartition *p2) {\n  ASSERT_HOST(p1 != nullptr && p2 != nullptr);\n  ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());\n  if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||\n      (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT)) {\n    return false; // Don't merge confirmed image with text.\n  }\n  if ((p1->IsVerticalType() || p2->IsVerticalType()) && p1->HCoreOverlap(*p2) <= 0 &&\n      ((!p1->IsSingleton() && !p2->IsSingleton()) ||\n       
!p1->bounding_box().major_overlap(p2->bounding_box()))) {\n    return false; // Overlap must be in the text line.\n  }\n  if ((p1->IsHorizontalType() || p2->IsHorizontalType()) && p1->VCoreOverlap(*p2) <= 0 &&\n      ((!p1->IsSingleton() && !p2->IsSingleton()) ||\n       (!p1->bounding_box().major_overlap(p2->bounding_box()) &&\n        !p1->OKDiacriticMerge(*p2, false) && !p2->OKDiacriticMerge(*p1, false)))) {\n    return false; // Overlap must be in the text line.\n  }\n  if (!p1->ConfirmNoTabViolation(*p2)) {\n    return false;\n  }\n  if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT) {\n    return true;\n  }\n  return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());\n}\n\n// Returns true if there is no significant noise in between the boxes.\nbool StrokeWidth::NoNoiseInBetween(const TBOX &box1, const TBOX &box2) const {\n  return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_, nontext_map_);\n}\n\n#ifndef GRAPHICS_DISABLED\n\n/** Displays the blobs colored according to the number of good neighbours\n * and the vertical/horizontal flow.\n */\nScrollView *StrokeWidth::DisplayGoodBlobs(const char *window_name, int x, int y) {\n  auto window = MakeWindow(x, y, window_name);\n  // For every blob in the grid, display it.\n  window->Brush(ScrollView::NONE);\n\n  // For every bbox in the grid, display it.\n  BlobGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  BLOBNBOX *bbox;\n  while ((bbox = gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &box = bbox->bounding_box();\n    int left_x = box.left();\n    int right_x = box.right();\n    int top_y = box.top();\n    int bottom_y = box.bottom();\n    int goodness = bbox->GoodTextBlob();\n    BlobRegionType blob_type = bbox->region_type();\n    if (bbox->UniquelyVertical()) {\n      blob_type = BRT_VERT_TEXT;\n    }\n    if (bbox->UniquelyHorizontal()) {\n      blob_type = BRT_TEXT;\n    }\n    BlobTextFlowType flow = bbox->flow();\n    if (flow == BTFT_NONE) {\n      
if (goodness == 0) {\n        flow = BTFT_NEIGHBOURS;\n      } else if (goodness == 1) {\n        flow = BTFT_CHAIN;\n      } else {\n        flow = BTFT_STRONG_CHAIN;\n      }\n    }\n    window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));\n    window->Rectangle(left_x, bottom_y, right_x, top_y);\n  }\n  window->Update();\n  return window;\n}\n\nstatic void DrawDiacriticJoiner(const BLOBNBOX *blob, ScrollView *window) {\n  const TBOX &blob_box(blob->bounding_box());\n  int top = std::max(static_cast<int>(blob_box.top()), blob->base_char_top());\n  int bottom = std::min(static_cast<int>(blob_box.bottom()), blob->base_char_bottom());\n  int x = (blob_box.left() + blob_box.right()) / 2;\n  window->Line(x, top, x, bottom);\n}\n\n// Displays blobs colored according to whether or not they are diacritics.\nScrollView *StrokeWidth::DisplayDiacritics(const char *window_name, int x, int y, TO_BLOCK *block) {\n  auto window = MakeWindow(x, y, window_name);\n  // For every blob in the grid, display it.\n  window->Brush(ScrollView::NONE);\n\n  BLOBNBOX_IT it(&block->blobs);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    if (blob->IsDiacritic()) {\n      window->Pen(ScrollView::GREEN);\n      DrawDiacriticJoiner(blob, window);\n    } else {\n      window->Pen(blob->BoxColor());\n    }\n    const TBOX &box = blob->bounding_box();\n    window->Rectangle(box.left(), box.bottom(), box.right(), box.top());\n  }\n  it.set_to_list(&block->noise_blobs);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    if (blob->IsDiacritic()) {\n      window->Pen(ScrollView::GREEN);\n      DrawDiacriticJoiner(blob, window);\n    } else {\n      window->Pen(ScrollView::WHITE);\n    }\n    const TBOX &box = blob->bounding_box();\n    window->Rectangle(box.left(), box.bottom(), box.right(), box.top());\n  }\n  window->Update();\n  return window;\n}\n\n#endif // !GRAPHICS_DISABLED\n\n} // namespace 
tesseract.\n"
  },
  {
    "path": "src/textord/strokewidth.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        strokewidth.h\n// Description: Subclass of BBGrid to find uniformity of strokewidth.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_STROKEWIDTH_H_\n#define TESSERACT_TEXTORD_STROKEWIDTH_H_\n\n#include \"blobbox.h\"  // BlobNeighbourDir.\n#include \"blobgrid.h\" // Base class.\n#include \"colpartitiongrid.h\"\n#include \"textlineprojection.h\"\n\nclass DENORM;\nclass ScrollView;\nclass TO_BLOCK;\n\nnamespace tesseract {\n\nclass ColPartition_LIST;\nclass TabFind;\nclass TextlineProjection;\n\n// Misc enums to clarify bool arguments for direction-controlling args.\nenum LeftOrRight { LR_LEFT, LR_RIGHT };\n\n// Return value from FindInitialPartitions indicates detection of severe\n// skew or noise.\nenum PartitionFindResult {\n  PFR_OK,   // Everything is OK.\n  PFR_SKEW, // Skew was detected and rotated.\n  PFR_NOISE // Noise was detected and removed.\n};\n\n/**\n * The StrokeWidth class holds all the normal and large blobs.\n * It is used to find good large blobs and move them to the normal blobs\n * by virtue of having a reasonable strokewidth compatible neighbour.\n */\nclass StrokeWidth : public BlobGrid {\npublic:\n  StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright);\n  
~StrokeWidth() override;\n\n  // Sets the neighbours member of the medium-sized blobs in the block.\n  // Searches on 4 sides of each blob for similar-sized, similar-strokewidth\n  // blobs and sets pointers to the good neighbours.\n  void SetNeighboursOnMediumBlobs(TO_BLOCK *block);\n\n  // Sets the neighbour/textline writing direction members of the medium\n  // and large blobs with optional repair of broken CJK characters first.\n  // Repair of broken CJK is needed here because broken CJK characters\n  // can fool the textline direction detection algorithm.\n  void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge,\n                                            TO_BLOCK *input_block);\n\n  // To save computation, the process of generating partitions is broken\n  // into the following 4 steps:\n  // TestVerticalTextDirection\n  // CorrectForRotation (used only if a rotation is to be applied)\n  // FindLeaderPartitions\n  // GradeBlobsIntoPartitions.\n  // These functions are all required, in sequence, except for\n  // CorrectForRotation, which is not needed if no rotation is applied.\n\n  // Types all the blobs as vertical or horizontal text or unknown and\n  // returns true if the majority are vertical.\n  // If the blobs are rotated, it is necessary to call CorrectForRotation\n  // after rotating everything, otherwise the work done here will be enough.\n  // If osd_blobs is not null, a list of blobs from the dominant textline\n  // direction are returned for use in orientation and script detection.\n  // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.\n  bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block,\n                                 BLOBNBOX_CLIST *osd_blobs);\n\n  // Corrects the data structures for the given rotation.\n  void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid);\n\n  // Finds leader partitions and inserts them into the given grid.\n  void 
FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid);\n\n  // Finds and marks noise those blobs that look like bits of vertical lines\n  // that would otherwise screw up layout analysis.\n  void RemoveLineResidue(ColPartition_LIST *big_part_list);\n\n  // Types all the blobs as vertical text or horizontal text or unknown and\n  // puts them into initial ColPartitions in the supplied part_grid.\n  // rerotation determines how to get back to the image coordinates from the\n  // blob coordinates (since they may have been rotated for vertical text).\n  // block is the single block for the whole page or rectangle to be OCRed.\n  // nontext_pix (full-size), is a binary mask used to prevent merges across\n  // photo/text boundaries. It is not kept beyond this function.\n  // denorm provides a mapping back to the image from the current blob\n  // coordinate space.\n  // projection provides a measure of textline density over the image and\n  // provides functions to assist with diacritic detection. 
It should be a\n  // pointer to a new TextlineProjection, and will be setup here.\n  // part_grid is the output grid of textline partitions.\n  // Large blobs that cause overlap are put in separate partitions and added\n  // to the big_parts list.\n  void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block,\n                                Image nontext_pix, const DENORM *denorm, bool cjk_script,\n                                TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs,\n                                ColPartitionGrid *part_grid, ColPartition_LIST *big_parts);\n\n  // Handles a click event in a display window.\n  void HandleClick(int x, int y) override;\n\nprivate:\n  // Computes the noise_density_ by summing the number of elements in a\n  // neighbourhood of each grid cell.\n  void ComputeNoiseDensity(TO_BLOCK *block, TabFind *line_grid);\n\n  // Detects and marks leader dots/dashes.\n  //    Leaders are horizontal chains of small or noise blobs that look\n  //    monospace according to ColPartition::MarkAsLeaderIfMonospaced().\n  // Detected leaders become the only occupants of the block->small_blobs list.\n  // Non-leader small blobs get moved to the blobs list.\n  // Non-leader noise blobs remain singletons in the noise list.\n  // All small and noise blobs in high density regions are marked BTFT_NONTEXT.\n  // block is the single block for the whole page or rectangle to be OCRed.\n  // leader_parts is the output.\n  void FindLeadersAndMarkNoise(TO_BLOCK *block, ColPartition_LIST *leader_parts);\n\n  /** Inserts the block blobs (normal and large) into this grid.\n   * Blobs remain owned by the block. 
*/\n  void InsertBlobs(TO_BLOCK *block);\n\n  // Fix broken CJK characters, using the fake joined blobs mechanism.\n  // Blobs are really merged, ie the master takes all the outlines and the\n  // others are deleted.\n  // Returns true if sufficient blobs are merged that it may be worth running\n  // again, due to a better estimate of character size.\n  bool FixBrokenCJK(TO_BLOCK *block);\n\n  // Collect blobs that overlap or are within max_dist of the input bbox.\n  // Return them in the list of blobs and expand the bbox to be the union\n  // of all the boxes. not_this is excluded from the search, as are blobs\n  // that cause the merged box to exceed max_size in either dimension.\n  void AccumulateOverlaps(const BLOBNBOX *not_this, bool debug, int max_size, int max_dist,\n                          TBOX *bbox, BLOBNBOX_CLIST *blobs);\n\n  // For each blob in this grid, Finds the textline direction to be horizontal\n  // or vertical according to distance to neighbours and 1st and 2nd order\n  // neighbours. 
Non-text tends to end up without a definite direction.\n  // Result is setting of the neighbours and vert_possible/horz_possible\n  // flags in the BLOBNBOXes currently in this grid.\n  // This function is called more than once if page orientation is uncertain,\n  // so display_if_debugging is true on the final call to display the results.\n  void FindTextlineFlowDirection(PageSegMode pageseg_mode, bool display_if_debugging);\n\n  // Sets the neighbours and good_stroke_neighbours members of the blob by\n  // searching close on all 4 sides.\n  // When finding leader dots/dashes, there is a slightly different rule for\n  // what makes a good neighbour.\n  // If activate_line_trap, then line-like objects are found and isolated.\n  void SetNeighbours(bool leaders, bool activate_line_trap, BLOBNBOX *blob);\n\n  // Sets the good_stroke_neighbours member of the blob if it has a\n  // GoodNeighbour on the given side.\n  // Also sets the neighbour in the blob, whether or not a good one is found.\n  // Return value is the number of neighbours in the line trap size range.\n  // Leaders get extra special lenient treatment.\n  int FindGoodNeighbour(BlobNeighbourDir dir, bool leaders, BLOBNBOX *blob);\n\n  // Makes the blob to be only horizontal or vertical where evidence\n  // is clear based on gaps of 2nd order neighbours.\n  void SetNeighbourFlows(BLOBNBOX *blob);\n\n  // Nullify the neighbours in the wrong directions where the direction\n  // is clear-cut based on a distance margin. Good for isolating vertical\n  // text from neighbouring horizontal text.\n  void SimplifyObviousNeighbours(BLOBNBOX *blob);\n\n  // Smoothes the vertical/horizontal type of the blob based on the\n  // 2nd-order neighbours. If reset_all is true, then all blobs are\n  // changed. 
Otherwise, only ambiguous blobs are processed.\n  void SmoothNeighbourTypes(PageSegMode pageseg_mode, bool desperate, BLOBNBOX *blob);\n\n  // Checks the left or right side of the given leader partition and sets the\n  // (opposite) leader_on_right or leader_on_left flags for blobs\n  // that are next to the given side of the given leader partition.\n  void MarkLeaderNeighbours(const ColPartition *part, LeftOrRight side);\n\n  // Partition creation. Accumulates vertical and horizontal text chains,\n  // puts the remaining blobs in as unknowns, and then merges/splits to\n  // minimize overlap and smoothes the types with neighbours and the color\n  // image if provided. rerotation is used to rotate the coordinate space\n  // back to the nontext_map_ image.\n  // If find_problems is true, detects possible noise pollution by the amount\n  // of partition overlap that is created by the diacritics. If excessive, the\n  // noise is separated out into diacritic blobs, and PFR_NOISE is returned.\n  // [TODO(rays): if the partition overlap is caused by heavy skew, deskews\n  // the components, saves the skew_angle and returns PFR_SKEW.] If the return\n  // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be\n  // called again after cleaning up the partly done work.\n  PartitionFindResult FindInitialPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation,\n                                            bool find_problems, TO_BLOCK *block,\n                                            BLOBNBOX_LIST *diacritic_blobs,\n                                            ColPartitionGrid *part_grid,\n                                            ColPartition_LIST *big_parts, FCOORD *skew_angle);\n  // Detects noise by a significant increase in partition overlap from\n  // pre_overlap to now, and removes noise from the union of all the overlapping\n  // partitions, placing the blobs in diacritic_blobs. 
Returns true if any noise\n  // was found and removed.\n  bool DetectAndRemoveNoise(int pre_overlap, const TBOX &grid_box, TO_BLOCK *block,\n                            ColPartitionGrid *part_grid, BLOBNBOX_LIST *diacritic_blobs);\n  // Finds vertical chains of text-like blobs and puts them in ColPartitions.\n  void FindVerticalTextChains(ColPartitionGrid *part_grid);\n  // Finds horizontal chains of text-like blobs and puts them in ColPartitions.\n  void FindHorizontalTextChains(ColPartitionGrid *part_grid);\n  // Finds diacritics and saves their base character in the blob.\n  void TestDiacritics(ColPartitionGrid *part_grid, TO_BLOCK *block);\n  // Searches this grid for an appropriately close and sized neighbour of the\n  // given [small] blob. If such a blob is found, the diacritic base is saved\n  // in the blob and true is returned.\n  // The small_grid is a secondary grid that contains the small/noise objects\n  // that are not in this grid, but may be useful for determining a connection\n  // between blob and its potential base character. (See DiacriticXGapFilled.)\n  bool DiacriticBlob(BlobGrid *small_grid, BLOBNBOX *blob);\n  // Returns true if there is no gap between the base char and the diacritic\n  // bigger than a fraction of the height of the base char:\n  // Eg: line end.....'\n  // The quote is a long way from the end of the line, yet it needs to be a\n  // diacritic. 
To determine that the quote is not part of an image, or\n  // a different text block, we check for other marks in the gap between\n  // the base char and the diacritic.\n  //                          '<--Diacritic\n  // |---------|\n  // |         |<-toobig-gap->\n  // | Base    |<ok gap>\n  // |---------|        x<-----Dot occupying gap\n  // The grid is const really.\n  bool DiacriticXGapFilled(BlobGrid *grid, const TBOX &diacritic_box, const TBOX &base_box);\n  // Merges diacritics with the ColPartition of the base character blob.\n  void MergeDiacritics(TO_BLOCK *block, ColPartitionGrid *part_grid);\n  // Any blobs on the large_blobs list of block that are still unowned by a\n  // ColPartition, are probably drop-cap or vertically touching so the blobs\n  // are removed to the big_parts list and treated separately.\n  void RemoveLargeUnusedBlobs(TO_BLOCK *block, ColPartitionGrid *part_grid,\n                              ColPartition_LIST *big_parts);\n\n  // All remaining unused blobs are put in individual ColPartitions.\n  void PartitionRemainingBlobs(PageSegMode pageseg_mode, ColPartitionGrid *part_grid);\n\n  // If combine, put all blobs in the cell_list into a single partition,\n  // otherwise put each one into its own partition.\n  void MakePartitionsFromCellList(PageSegMode pageseg_mode, bool combine,\n                                  ColPartitionGrid *part_grid, BLOBNBOX_CLIST *cell_list);\n\n  // Helper function to finish setting up a ColPartition and insert into\n  // part_grid.\n  void CompletePartition(PageSegMode pageseg_mode, ColPartition *part, ColPartitionGrid *part_grid);\n\n  // Helper returns true if we are looking only for vertical textlines,\n  // taking into account any rotation that has been done.\n  bool FindingVerticalOnly(PageSegMode pageseg_mode) const {\n    if (rerotation_.y() == 0.0f) {\n      return pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;\n    }\n    return !PSM_ORIENTATION_ENABLED(pageseg_mode) && pageseg_mode != 
PSM_SINGLE_BLOCK_VERT_TEXT;\n  }\n  // Helper returns true if we are looking only for horizontal textlines,\n  // taking into account any rotation that has been done.\n  bool FindingHorizontalOnly(PageSegMode pageseg_mode) const {\n    if (rerotation_.y() == 0.0f) {\n      return !PSM_ORIENTATION_ENABLED(pageseg_mode) && pageseg_mode != PSM_SINGLE_BLOCK_VERT_TEXT;\n    }\n    return pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;\n  }\n\n  // Merge partitions where the merge appears harmless.\n  void EasyMerges(ColPartitionGrid *part_grid);\n\n  // Compute a search box based on the orientation of the partition.\n  // Returns true if a suitable box can be calculated.\n  // Callback for EasyMerges.\n  bool OrientationSearchBox(ColPartition *part, TBOX *box);\n\n  // Merge confirmation callback for EasyMerges.\n  bool ConfirmEasyMerge(const ColPartition *p1, const ColPartition *p2);\n\n  // Returns true if there is no significant noise in between the boxes.\n  bool NoNoiseInBetween(const TBOX &box1, const TBOX &box2) const;\n\n#ifndef GRAPHICS_DISABLED\n  // Displays the blobs colored according to the number of good neighbours\n  // and the vertical/horizontal flow.\n  ScrollView *DisplayGoodBlobs(const char *window_name, int x, int y);\n\n  // Displays blobs colored according to whether or not they are diacritics.\n  ScrollView *DisplayDiacritics(const char *window_name, int x, int y, TO_BLOCK *block);\n#endif\n\nprivate:\n  // Image map of photo/noise areas on the page. Borrowed pointer (not owned.)\n  Image nontext_map_;\n  // Textline projection map. Borrowed pointer.\n  TextlineProjection *projection_;\n  // DENORM used by projection_ to get back to image coords. 
Borrowed pointer.\n  const DENORM *denorm_;\n  // Bounding box of the grid.\n  TBOX grid_box_;\n  // Rerotation to get back to the original image.\n  FCOORD rerotation_;\n#ifndef GRAPHICS_DISABLED\n  // Windows for debug display.\n  ScrollView *leaders_win_ = nullptr;\n  ScrollView *initial_widths_win_ = nullptr;\n  ScrollView *widths_win_ = nullptr;\n  ScrollView *chains_win_ = nullptr;\n  ScrollView *diacritics_win_ = nullptr;\n  ScrollView *textlines_win_ = nullptr;\n  ScrollView *smoothed_win_ = nullptr;\n#endif\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_STROKEWIDTH_H_\n"
  },
  {
    "path": "src/textord/tabfind.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tabfind.cpp\n// Description: Subclass of BBGrid to find vertically aligned blobs.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"alignedblob.h\"\n#include \"colpartitiongrid.h\"\n#include \"detlinefit.h\"\n#include \"host.h\" // for NearlyEqual\n#include \"linefind.h\"\n#include \"tabfind.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// Multiple of box size to search for initial gaps.\nconst int kTabRadiusFactor = 5;\n// Min and Max multiple of height to search vertically when extrapolating.\nconst int kMinVerticalSearch = 3;\nconst int kMaxVerticalSearch = 12;\nconst int kMaxRaggedSearch = 25;\n// Minimum number of lines in a column width to make it interesting.\nconst int kMinLinesInColumn = 10;\n// Minimum width of a column to be interesting.\nconst int kMinColumnWidth = 200;\n// Minimum fraction of total column lines for a column to be interesting.\nconst double kMinFractionalLinesInColumn = 0.125;\n// Fraction of height used as alignment tolerance for aligned tabs.\nconst double kAlignedFraction = 0.03125;\n// Maximum gutter width (in absolute inch) that we care about\nconst double kMaxGutterWidthAbsolute = 2.00;\n// Multiplier of 
gridsize for min gutter width of TT_MAYBE_RAGGED blobs.\nconst int kRaggedGutterMultiple = 5;\n// Min aspect ratio of tall objects to be considered a separator line.\n// (These will be ignored in searching the gutter for obstructions.)\nconst double kLineFragmentAspectRatio = 10.0;\n// Min number of points to accept after evaluation.\nconst int kMinEvaluatedTabs = 3;\n// Up to 30 degrees is allowed for rotations of diacritic blobs.\n// Keep this value slightly larger than kCosSmallAngle in blobbox.cpp\n// so that the assert there never fails.\nconst double kCosMaxSkewAngle = 0.866025;\n\nstatic BOOL_VAR(textord_tabfind_show_initialtabs, false, \"Show tab candidates\");\nstatic BOOL_VAR(textord_tabfind_show_finaltabs, false, \"Show tab vectors\");\n\nTabFind::TabFind(int gridsize, const ICOORD &bleft, const ICOORD &tright, TabVector_LIST *vlines,\n                 int vertical_x, int vertical_y, int resolution)\n    : AlignedBlob(gridsize, bleft, tright)\n    , resolution_(resolution)\n    , image_origin_(0, tright.y() - 1)\n    , v_it_(&vectors_)\n    , width_cb_(nullptr) {\n  v_it_.add_list_after(vlines);\n  SetVerticalSkewAndParallelize(vertical_x, vertical_y);\n  using namespace std::placeholders; // for _1\n  width_cb_ = std::bind(&TabFind::CommonWidth, this, _1);\n}\n\nTabFind::~TabFind() = default;\n\n///////////////// PUBLIC functions (mostly used by TabVector). 
//////////////\n\n// Insert a list of blobs into the given grid (not necessarily this).\n// If take_ownership is true, then the blobs are removed from the source list.\n// See InsertBlob for the other arguments.\n// It would seem to make more sense to swap this and grid, but this way\n// around allows grid to not be derived from TabFind, eg a ColPartitionGrid,\n// while the grid that provides the tab stops(this) has to be derived from\n// TabFind.\nvoid TabFind::InsertBlobsToGrid(bool h_spread, bool v_spread, BLOBNBOX_LIST *blobs,\n                                BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid) {\n  BLOBNBOX_IT blob_it(blobs);\n  int b_count = 0;\n  int reject_count = 0;\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    //    if (InsertBlob(true, true, blob, grid)) {\n    if (InsertBlob(h_spread, v_spread, blob, grid)) {\n      ++b_count;\n    } else {\n      ++reject_count;\n    }\n  }\n  if (textord_debug_tabfind) {\n    tprintf(\"Inserted %d blobs into grid, %d rejected.\\n\", b_count, reject_count);\n  }\n}\n\n// Insert a single blob into the given grid (not necessarily this).\n// If h_spread, then all cells covered horizontally by the box are\n// used, otherwise, just the bottom-left. 
Similarly for v_spread.\n// A side effect is that the left and right rule edges of the blob are\n// set according to the tab vectors in this (not grid).\nbool TabFind::InsertBlob(bool h_spread, bool v_spread, BLOBNBOX *blob,\n                         BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid) {\n  TBOX box = blob->bounding_box();\n  blob->set_left_rule(LeftEdgeForBox(box, false, false));\n  blob->set_right_rule(RightEdgeForBox(box, false, false));\n  blob->set_left_crossing_rule(LeftEdgeForBox(box, true, false));\n  blob->set_right_crossing_rule(RightEdgeForBox(box, true, false));\n  if (blob->joined_to_prev()) {\n    return false;\n  }\n  grid->InsertBBox(h_spread, v_spread, blob);\n  return true;\n}\n\n// Calls SetBlobRuleEdges for all the blobs in the given block.\nvoid TabFind::SetBlockRuleEdges(TO_BLOCK *block) {\n  SetBlobRuleEdges(&block->blobs);\n  SetBlobRuleEdges(&block->small_blobs);\n  SetBlobRuleEdges(&block->noise_blobs);\n  SetBlobRuleEdges(&block->large_blobs);\n}\n\n// Sets the left and right rule and crossing_rules for the blobs in the given\n// list by finding the next outermost tabvectors for each blob.\nvoid TabFind::SetBlobRuleEdges(BLOBNBOX_LIST *blobs) {\n  BLOBNBOX_IT blob_it(blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    TBOX box = blob->bounding_box();\n    blob->set_left_rule(LeftEdgeForBox(box, false, false));\n    blob->set_right_rule(RightEdgeForBox(box, false, false));\n    blob->set_left_crossing_rule(LeftEdgeForBox(box, true, false));\n    blob->set_right_crossing_rule(RightEdgeForBox(box, true, false));\n  }\n}\n\n// Returns the gutter width of the given TabVector between the given y limits.\n// Also returns x-shift to be added to the vector to clear any intersecting\n// blobs. The shift is deducted from the returned gutter.\n// If ignore_unmergeables is true, then blobs of UnMergeableType are\n// ignored as if they don't exist. 
(Used for text on image.)\n// max_gutter_width is used as the maximum width worth searching for in case\n// there is nothing near the TabVector.\nint TabFind::GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables,\n                         int max_gutter_width, int *required_shift) {\n  bool right_to_left = v.IsLeftTab();\n  int bottom_x = v.XAtY(bottom_y);\n  int top_x = v.XAtY(top_y);\n  int start_x = right_to_left ? std::max(top_x, bottom_x) : std::min(top_x, bottom_x);\n  BlobGridSearch sidesearch(this);\n  sidesearch.StartSideSearch(start_x, bottom_y, top_y);\n  int min_gap = max_gutter_width;\n  *required_shift = 0;\n  BLOBNBOX *blob = nullptr;\n  while ((blob = sidesearch.NextSideSearch(right_to_left)) != nullptr) {\n    const TBOX &box = blob->bounding_box();\n    if (box.bottom() >= top_y || box.top() <= bottom_y) {\n      continue; // Doesn't overlap enough.\n    }\n    if (box.height() >= gridsize() * 2 && box.height() > box.width() * kLineFragmentAspectRatio) {\n      // Skip likely separator line residue.\n      continue;\n    }\n    if (ignore_unmergeables && BLOBNBOX::UnMergeableType(blob->region_type())) {\n      continue; // Skip non-text if required.\n    }\n    int mid_y = (box.bottom() + box.top()) / 2;\n    // We use the x at the mid-y so that the required_shift guarantees\n    // to clear all the blobs on the tab-stop. 
If we use the min/max\n    // of x at top/bottom of the blob, then exactness would be required,\n    // which is not a good thing.\n    int tab_x = v.XAtY(mid_y);\n    int gap;\n    if (right_to_left) {\n      gap = tab_x - box.right();\n      if (gap < 0 && box.left() - tab_x < *required_shift) {\n        *required_shift = box.left() - tab_x;\n      }\n    } else {\n      gap = box.left() - tab_x;\n      if (gap < 0 && box.right() - tab_x > *required_shift) {\n        *required_shift = box.right() - tab_x;\n      }\n    }\n    if (gap > 0 && gap < min_gap) {\n      min_gap = gap;\n    }\n  }\n  // Result may be negative, in which case,  this is a really bad tabstop.\n  return min_gap - abs(*required_shift);\n}\n\n// Find the gutter width and distance to inner neighbour for the given blob.\nvoid TabFind::GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left,\n                                         BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap) {\n  const TBOX &box = bbox->bounding_box();\n  // The gutter and internal sides of the box.\n  int gutter_x = left ? box.left() : box.right();\n  int internal_x = left ? box.right() : box.left();\n  // On ragged edges, the gutter side of the box is away from the tabstop.\n  int tab_gap = left ? gutter_x - tab_x : tab_x - gutter_x;\n  *gutter_width = max_gutter;\n  // If the box is away from the tabstop, we need to increase\n  // the allowed gutter width.\n  if (tab_gap > 0) {\n    *gutter_width += tab_gap;\n  }\n  bool debug = WithinTestRegion(2, box.left(), box.bottom());\n  if (debug) {\n    tprintf(\"Looking in gutter\\n\");\n  }\n  // Find the nearest blob on the outside of the column.\n  BLOBNBOX *gutter_bbox = AdjacentBlob(bbox, left, bbox->flow() == BTFT_TEXT_ON_IMAGE, 0.0,\n                                       *gutter_width, box.top(), box.bottom());\n  if (gutter_bbox != nullptr) {\n    const TBOX &gutter_box = gutter_bbox->bounding_box();\n    *gutter_width = left ? 
tab_x - gutter_box.right() : gutter_box.left() - tab_x;\n  }\n  if (*gutter_width >= max_gutter) {\n    // If there is no box because a tab was in the way, get the tab coord.\n    TBOX gutter_box(box);\n    if (left) {\n      gutter_box.set_left(tab_x - max_gutter - 1);\n      gutter_box.set_right(tab_x - max_gutter);\n      int tab_gutter = RightEdgeForBox(gutter_box, true, false);\n      if (tab_gutter < tab_x - 1) {\n        *gutter_width = tab_x - tab_gutter;\n      }\n    } else {\n      gutter_box.set_left(tab_x + max_gutter);\n      gutter_box.set_right(tab_x + max_gutter + 1);\n      int tab_gutter = LeftEdgeForBox(gutter_box, true, false);\n      if (tab_gutter > tab_x + 1) {\n        *gutter_width = tab_gutter - tab_x;\n      }\n    }\n  }\n  if (*gutter_width > max_gutter) {\n    *gutter_width = max_gutter;\n  }\n  // Now look for a neighbour on the inside.\n  if (debug) {\n    tprintf(\"Looking for neighbour\\n\");\n  }\n  BLOBNBOX *neighbour = AdjacentBlob(bbox, !left, bbox->flow() == BTFT_TEXT_ON_IMAGE, 0.0,\n                                     *gutter_width, box.top(), box.bottom());\n  int neighbour_edge = left ? RightEdgeForBox(box, true, false) : LeftEdgeForBox(box, true, false);\n  if (neighbour != nullptr) {\n    const TBOX &n_box = neighbour->bounding_box();\n    if (debug) {\n      tprintf(\"Found neighbour:\");\n      n_box.print();\n    }\n    if (left && n_box.left() < neighbour_edge) {\n      neighbour_edge = n_box.left();\n    } else if (!left && n_box.right() > neighbour_edge) {\n      neighbour_edge = n_box.right();\n    }\n  }\n  *neighbour_gap = left ? neighbour_edge - internal_x : internal_x - neighbour_edge;\n}\n\n// Return the x-coord that corresponds to the right edge for the given\n// box. If there is a rule line to the right that vertically overlaps it,\n// then return the x-coord of the rule line, otherwise return the right\n// edge of the page. 
For details see RightTabForBox below.\nint TabFind::RightEdgeForBox(const TBOX &box, bool crossing, bool extended) {\n  TabVector *v = RightTabForBox(box, crossing, extended);\n  return v == nullptr ? tright_.x() : v->XAtY((box.top() + box.bottom()) / 2);\n}\n// As RightEdgeForBox, but finds the left Edge instead.\nint TabFind::LeftEdgeForBox(const TBOX &box, bool crossing, bool extended) {\n  TabVector *v = LeftTabForBox(box, crossing, extended);\n  return v == nullptr ? bleft_.x() : v->XAtY((box.top() + box.bottom()) / 2);\n}\n\n// This comment documents how this function works.\n// For its purpose and arguments, see the comment in tabfind.h.\n// TabVectors are stored sorted by perpendicular distance of middle from\n// the global mean vertical vector. Since the individual vectors can have\n// differing directions, their XAtY for a given y is not necessarily in the\n// right order. Therefore the search has to be run with a margin.\n// The middle of a vector that passes through (x,y) cannot be higher than\n// halfway from y to the top, or lower than halfway from y to the bottom\n// of the coordinate range; therefore, the search margin is the range of\n// sort keys between these halfway points. Any vector with a sort key greater\n// than the upper margin must be to the right of x at y, and likewise any\n// vector with a sort key less than the lower margin must pass to the left\n// of x at y.\nTabVector *TabFind::RightTabForBox(const TBOX &box, bool crossing, bool extended) {\n  if (v_it_.empty()) {\n    return nullptr;\n  }\n  int top_y = box.top();\n  int bottom_y = box.bottom();\n  int mid_y = (top_y + bottom_y) / 2;\n  int right = crossing ? 
(box.left() + box.right()) / 2 : box.right();\n  int min_key, max_key;\n  SetupTabSearch(right, mid_y, &min_key, &max_key);\n  // Position the iterator at the first TabVector with sort_key >= min_key.\n  while (!v_it_.at_first() && v_it_.data()->sort_key() >= min_key) {\n    v_it_.backward();\n  }\n  while (!v_it_.at_last() && v_it_.data()->sort_key() < min_key) {\n    v_it_.forward();\n  }\n  // Find the leftmost tab vector that overlaps and has XAtY(mid_y) >= right.\n  TabVector *best_v = nullptr;\n  int best_x = -1;\n  int key_limit = -1;\n  do {\n    TabVector *v = v_it_.data();\n    int x = v->XAtY(mid_y);\n    if (x >= right && (v->VOverlap(top_y, bottom_y) > 0 ||\n                       (extended && v->ExtendedOverlap(top_y, bottom_y) > 0))) {\n      if (best_v == nullptr || x < best_x) {\n        best_v = v;\n        best_x = x;\n        // We can guarantee that no better vector can be found if the\n        // sort key exceeds that of the best by max_key - min_key.\n        key_limit = v->sort_key() + max_key - min_key;\n      }\n    }\n    // Break when the search is done to avoid wrapping the iterator and\n    // thereby potentially slowing the next search.\n    if (v_it_.at_last() || (best_v != nullptr && v->sort_key() > key_limit)) {\n      break; // Prevent restarting list for next call.\n    }\n    v_it_.forward();\n  } while (!v_it_.at_first());\n  return best_v;\n}\n\n// As RightTabForBox, but finds the left TabVector instead.\nTabVector *TabFind::LeftTabForBox(const TBOX &box, bool crossing, bool extended) {\n  if (v_it_.empty()) {\n    return nullptr;\n  }\n  int top_y = box.top();\n  int bottom_y = box.bottom();\n  int mid_y = (top_y + bottom_y) / 2;\n  int left = crossing ? 
(box.left() + box.right()) / 2 : box.left();\n  int min_key, max_key;\n  SetupTabSearch(left, mid_y, &min_key, &max_key);\n  // Position the iterator at the last TabVector with sort_key <= max_key.\n  while (!v_it_.at_last() && v_it_.data()->sort_key() <= max_key) {\n    v_it_.forward();\n  }\n  while (!v_it_.at_first() && v_it_.data()->sort_key() > max_key) {\n    v_it_.backward();\n  }\n  // Find the rightmost tab vector that overlaps and has XAtY(mid_y) <= left.\n  TabVector *best_v = nullptr;\n  int best_x = -1;\n  int key_limit = -1;\n  do {\n    TabVector *v = v_it_.data();\n    int x = v->XAtY(mid_y);\n    if (x <= left && (v->VOverlap(top_y, bottom_y) > 0 ||\n                      (extended && v->ExtendedOverlap(top_y, bottom_y) > 0))) {\n      if (best_v == nullptr || x > best_x) {\n        best_v = v;\n        best_x = x;\n        // We can guarantee that no better vector can be found if the\n        // sort key is less than that of the best by max_key - min_key.\n        key_limit = v->sort_key() - (max_key - min_key);\n      }\n    }\n    // Break when the search is done to avoid wrapping the iterator and\n    // thereby potentially slowing the next search.\n    if (v_it_.at_first() || (best_v != nullptr && v->sort_key() < key_limit)) {\n      break; // Prevent restarting list for next call.\n    }\n    v_it_.backward();\n  } while (!v_it_.at_last());\n  return best_v;\n}\n\n// Return true if the given width is close to one of the common\n// widths in column_widths_.\nbool TabFind::CommonWidth(int width) {\n  width /= kColumnWidthFactor;\n  ICOORDELT_IT it(&column_widths_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ICOORDELT *w = it.data();\n    if (w->x() - 1 <= width && width <= w->y() + 1) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Return true if the sizes are more than a\n// factor of 2 different.\nbool TabFind::DifferentSizes(int size1, int size2) {\n  return size1 > size2 * 2 || size2 > size1 * 
2;\n}\n\n// Return true if the sizes are more than a\n// factor of 5 different.\nbool TabFind::VeryDifferentSizes(int size1, int size2) {\n  return size1 > size2 * 5 || size2 > size1 * 5;\n}\n\n///////////////// PROTECTED functions (used by ColumnFinder). //////////////\n\n// Top-level function to find TabVectors in an input page block.\n// Returns false if the detected skew angle is impossible.\n// Applies the detected skew angle to deskew the tabs, blobs and part_grid.\nbool TabFind::FindTabVectors(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block,\n                             int min_gutter_width, double tabfind_aligned_gap_fraction,\n                             ColPartitionGrid *part_grid, FCOORD *deskew, FCOORD *reskew) {\n  ScrollView *tab_win =\n      FindInitialTabVectors(image_blobs, min_gutter_width, tabfind_aligned_gap_fraction, block);\n  ComputeColumnWidths(tab_win, part_grid);\n  TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);\n  SortVectors();\n  CleanupTabs();\n  if (!Deskew(hlines, image_blobs, block, deskew, reskew)) {\n    return false; // Skew angle is too large.\n  }\n  part_grid->Deskew(*deskew);\n  ApplyTabConstraints();\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_finaltabs) {\n    tab_win = MakeWindow(640, 50, \"FinalTabs\");\n    DisplayBoxes(tab_win);\n    DisplayTabs(\"FinalTabs\", tab_win);\n    tab_win = DisplayTabVectors(tab_win);\n  }\n#endif // !GRAPHICS_DISABLED\n  return true;\n}\n\n// Top-level function to not find TabVectors in an input page block,\n// but setup for single column mode.\nvoid TabFind::DontFindTabVectors(BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, FCOORD *deskew,\n                                 FCOORD *reskew) {\n  InsertBlobsToGrid(false, false, image_blobs, this);\n  InsertBlobsToGrid(true, false, &block->blobs, this);\n  deskew->set_x(1.0f);\n  deskew->set_y(0.0f);\n  reskew->set_x(1.0f);\n  reskew->set_y(0.0f);\n}\n\n// Cleans up the lists of blobs in the 
block ready for use by TabFind.\n// Large blobs that look like text are moved to the main blobs list.\n// Main blobs that are superseded by the image blobs are deleted.\nvoid TabFind::TidyBlobs(TO_BLOCK *block) {\n  BLOBNBOX_IT large_it = &block->large_blobs;\n  BLOBNBOX_IT blob_it = &block->blobs;\n  int b_count = 0;\n  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {\n    BLOBNBOX *large_blob = large_it.data();\n    if (large_blob->owner() != nullptr) {\n      blob_it.add_to_end(large_it.extract());\n      ++b_count;\n    }\n  }\n  if (textord_debug_tabfind) {\n    tprintf(\"Moved %d large blobs to normal list\\n\", b_count);\n#ifndef GRAPHICS_DISABLED\n    ScrollView *rej_win = MakeWindow(500, 300, \"Image blobs\");\n    block->plot_graded_blobs(rej_win);\n    block->plot_noise_blobs(rej_win);\n    rej_win->Update();\n#endif // !GRAPHICS_DISABLED\n  }\n  block->DeleteUnownedNoise();\n}\n\n// Helper function to setup search limits for *TabForBox.\nvoid TabFind::SetupTabSearch(int x, int y, int *min_key, int *max_key) {\n  int key1 = TabVector::SortKey(vertical_skew_, x, (y + tright_.y()) / 2);\n  int key2 = TabVector::SortKey(vertical_skew_, x, (y + bleft_.y()) / 2);\n  *min_key = std::min(key1, key2);\n  *max_key = std::max(key1, key2);\n}\n\n#ifndef GRAPHICS_DISABLED\n\nScrollView *TabFind::DisplayTabVectors(ScrollView *tab_win) {\n  // For every vector, display it.\n  TabVector_IT it(&vectors_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *vector = it.data();\n    vector->Display(tab_win);\n  }\n  tab_win->Update();\n  return tab_win;\n}\n\n#endif\n\n// PRIVATE CODE.\n//\n// First part of FindTabVectors, which may be used twice if the text\n// is mostly of vertical alignment.\nScrollView *TabFind::FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_gutter_width,\n                                           double tabfind_aligned_gap_fraction, TO_BLOCK *block) {\n#ifndef GRAPHICS_DISABLED\n  if 
(textord_tabfind_show_initialtabs) {\n    ScrollView *line_win = MakeWindow(0, 0, \"VerticalLines\");\n    line_win = DisplayTabVectors(line_win);\n  }\n#endif\n  // Prepare the grid.\n  if (image_blobs != nullptr) {\n    InsertBlobsToGrid(true, false, image_blobs, this);\n  }\n  InsertBlobsToGrid(true, false, &block->blobs, this);\n  ScrollView *initial_win = FindTabBoxes(min_gutter_width, tabfind_aligned_gap_fraction);\n  FindAllTabVectors(min_gutter_width);\n\n  TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);\n  SortVectors();\n  EvaluateTabs();\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_initialtabs && initial_win != nullptr) {\n    initial_win = DisplayTabVectors(initial_win);\n  }\n#endif\n  MarkVerticalText();\n  return initial_win;\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Helper displays all the boxes in the given vector on the given window.\nstatic void DisplayBoxVector(const std::vector<BLOBNBOX *> &boxes, ScrollView *win) {\n  for (auto boxe : boxes) {\n    TBOX box = boxe->bounding_box();\n    int left_x = box.left();\n    int right_x = box.right();\n    int top_y = box.top();\n    int bottom_y = box.bottom();\n    ScrollView::Color box_color = boxe->BoxColor();\n    win->Pen(box_color);\n    win->Rectangle(left_x, bottom_y, right_x, top_y);\n  }\n  win->Update();\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// For each box in the grid, decide whether it is a candidate tab-stop,\n// and if so add it to the left/right tab boxes.\nScrollView *TabFind::FindTabBoxes(int min_gutter_width, double tabfind_aligned_gap_fraction) {\n  left_tab_boxes_.clear();\n  right_tab_boxes_.clear();\n  // For every bbox in the grid, determine whether it uses a tab on an edge.\n  BlobGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  BLOBNBOX *bbox;\n  while ((bbox = gsearch.NextFullSearch()) != nullptr) {\n    if (TestBoxForTabs(bbox, min_gutter_width, tabfind_aligned_gap_fraction)) {\n      // If it is any kind of tab, insert it into the 
vectors.\n      if (bbox->left_tab_type() != TT_NONE) {\n        left_tab_boxes_.push_back(bbox);\n      }\n      if (bbox->right_tab_type() != TT_NONE) {\n        right_tab_boxes_.push_back(bbox);\n      }\n    }\n  }\n  // Sort left tabs by left and right by right to see the outermost one first\n  // on a ragged tab.\n  std::sort(left_tab_boxes_.begin(), left_tab_boxes_.end(), StdSortByBoxLeft<BLOBNBOX>);\n  std::sort(right_tab_boxes_.begin(), right_tab_boxes_.end(), StdSortRightToLeft<BLOBNBOX>);\n  ScrollView *tab_win = nullptr;\n#ifndef GRAPHICS_DISABLED\n  if (textord_tabfind_show_initialtabs) {\n    tab_win = MakeWindow(0, 100, \"InitialTabs\");\n    tab_win->Pen(ScrollView::BLUE);\n    tab_win->Brush(ScrollView::NONE);\n    // Display the left and right tab boxes.\n    DisplayBoxVector(left_tab_boxes_, tab_win);\n    DisplayBoxVector(right_tab_boxes_, tab_win);\n    tab_win = DisplayTabs(\"Tabs\", tab_win);\n  }\n#endif // !GRAPHICS_DISABLED\n  return tab_win;\n}\n\nbool TabFind::TestBoxForTabs(BLOBNBOX *bbox, int min_gutter_width,\n                             double tabfind_aligned_gap_fraction) {\n  GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> radsearch(this);\n  TBOX box = bbox->bounding_box();\n  // If there are separator lines, get the column edges.\n  int left_column_edge = bbox->left_rule();\n  int right_column_edge = bbox->right_rule();\n  // The edges of the bounding box of the blob being processed.\n  int left_x = box.left();\n  int right_x = box.right();\n  int top_y = box.top();\n  int bottom_y = box.bottom();\n  int height = box.height();\n  bool debug = WithinTestRegion(3, left_x, top_y);\n  if (debug) {\n    tprintf(\"Column edges for blob at (%d,%d)->(%d,%d) are [%d, %d]\\n\", left_x, top_y, right_x,\n            bottom_y, left_column_edge, right_column_edge);\n  }\n  // Compute a search radius based on a multiple of the height.\n  int radius = (height * kTabRadiusFactor + gridsize_ - 1) / gridsize_;\n  
radsearch.StartRadSearch((left_x + right_x) / 2, (top_y + bottom_y) / 2, radius);\n  // In Vertical Page mode, once we have an estimate of the vertical line\n  // spacing, the minimum amount of gutter space before a possible tab is\n  // increased under the assumption that column partition is always larger\n  // than line spacing.\n  int min_spacing = static_cast<int>(height * tabfind_aligned_gap_fraction);\n  if (min_gutter_width > min_spacing) {\n    min_spacing = min_gutter_width;\n  }\n  int min_ragged_gutter = kRaggedGutterMultiple * gridsize();\n  if (min_gutter_width > min_ragged_gutter) {\n    min_ragged_gutter = min_gutter_width;\n  }\n  int target_right = left_x - min_spacing;\n  int target_left = right_x + min_spacing;\n  // We will be evaluating whether the left edge could be a left tab, and\n  // whether the right edge could be a right tab.\n  // A box can be a tab if its bool is_(left/right)_tab remains true, meaning\n  // that no blobs have been found in the gutter during the radial search.\n  // A box can also be a tab if there are objects in the gutter only above\n  // or only below, and there are aligned objects on the opposite side, but\n  // not too many unaligned objects. 
The maybe_(left/right)_tab_up counts\n  // aligned objects above and negatively counts unaligned objects above,\n  // and is set to -INT32_MAX if a gutter object is found above.\n  // The other 3 maybe ints work similarly for the other sides.\n  // These conditions are very strict, to minimize false positives, and really\n  // only aligned tabs and outermost ragged tab blobs will qualify, so we\n  // also have maybe_ragged_left/right with less stringent rules.\n  // A blob that is maybe_ragged_left/right will be further qualified later,\n  // using the min_ragged_gutter.\n  bool is_left_tab = true;\n  bool is_right_tab = true;\n  bool maybe_ragged_left = true;\n  bool maybe_ragged_right = true;\n  int maybe_left_tab_up = 0;\n  int maybe_right_tab_up = 0;\n  int maybe_left_tab_down = 0;\n  int maybe_right_tab_down = 0;\n  if (bbox->leader_on_left()) {\n    is_left_tab = false;\n    maybe_ragged_left = false;\n    maybe_left_tab_up = -INT32_MAX;\n    maybe_left_tab_down = -INT32_MAX;\n  }\n  if (bbox->leader_on_right()) {\n    is_right_tab = false;\n    maybe_ragged_right = false;\n    maybe_right_tab_up = -INT32_MAX;\n    maybe_right_tab_down = -INT32_MAX;\n  }\n  int alignment_tolerance = static_cast<int>(resolution_ * kAlignedFraction);\n  BLOBNBOX *neighbour = nullptr;\n  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {\n    if (neighbour == bbox) {\n      continue;\n    }\n    TBOX nbox = neighbour->bounding_box();\n    int n_left = nbox.left();\n    int n_right = nbox.right();\n    if (debug) {\n      tprintf(\"Neighbour at (%d,%d)->(%d,%d)\\n\", n_left, nbox.bottom(), n_right, nbox.top());\n    }\n    // If the neighbouring blob is the wrong side of a separator line, then it\n    // \"doesn't exist\" as far as we are concerned.\n    if (n_right > right_column_edge || n_left < left_column_edge ||\n        left_x < neighbour->left_rule() || right_x > neighbour->right_rule()) {\n      continue; // Separator line in the way.\n    }\n    int n_mid_x = 
(n_left + n_right) / 2;\n    int n_mid_y = (nbox.top() + nbox.bottom()) / 2;\n    if (n_mid_x <= left_x && n_right >= target_right) {\n      if (debug) {\n        tprintf(\"Not a left tab\\n\");\n      }\n      is_left_tab = false;\n      if (n_mid_y < top_y) {\n        maybe_left_tab_down = -INT32_MAX;\n      }\n      if (n_mid_y > bottom_y) {\n        maybe_left_tab_up = -INT32_MAX;\n      }\n    } else if (NearlyEqual(left_x, n_left, alignment_tolerance)) {\n      if (debug) {\n        tprintf(\"Maybe a left tab\\n\");\n      }\n      if (n_mid_y > top_y && maybe_left_tab_up > -INT32_MAX) {\n        ++maybe_left_tab_up;\n      }\n      if (n_mid_y < bottom_y && maybe_left_tab_down > -INT32_MAX) {\n        ++maybe_left_tab_down;\n      }\n    } else if (n_left < left_x && n_right >= left_x) {\n      // Overlaps but not aligned so negative points on a maybe.\n      if (debug) {\n        tprintf(\"Maybe Not a left tab\\n\");\n      }\n      if (n_mid_y > top_y && maybe_left_tab_up > -INT32_MAX) {\n        --maybe_left_tab_up;\n      }\n      if (n_mid_y < bottom_y && maybe_left_tab_down > -INT32_MAX) {\n        --maybe_left_tab_down;\n      }\n    }\n    if (n_left < left_x && nbox.y_overlap(box) && n_right >= target_right) {\n      maybe_ragged_left = false;\n      if (debug) {\n        tprintf(\"Not a ragged left\\n\");\n      }\n    }\n    if (n_mid_x >= right_x && n_left <= target_left) {\n      if (debug) {\n        tprintf(\"Not a right tab\\n\");\n      }\n      is_right_tab = false;\n      if (n_mid_y < top_y) {\n        maybe_right_tab_down = -INT32_MAX;\n      }\n      if (n_mid_y > bottom_y) {\n        maybe_right_tab_up = -INT32_MAX;\n      }\n    } else if (NearlyEqual(right_x, n_right, alignment_tolerance)) {\n      if (debug) {\n        tprintf(\"Maybe a right tab\\n\");\n      }\n      if (n_mid_y > top_y && maybe_right_tab_up > -INT32_MAX) {\n        ++maybe_right_tab_up;\n      }\n      if (n_mid_y < bottom_y && maybe_right_tab_down > -INT32_MAX) 
{\n        ++maybe_right_tab_down;\n      }\n    } else if (n_right > right_x && n_left <= right_x) {\n      // Overlaps but not aligned so negative points on a maybe.\n      if (debug) {\n        tprintf(\"Maybe Not a right tab\\n\");\n      }\n      if (n_mid_y > top_y && maybe_right_tab_up > -INT32_MAX) {\n        --maybe_right_tab_up;\n      }\n      if (n_mid_y < bottom_y && maybe_right_tab_down > -INT32_MAX) {\n        --maybe_right_tab_down;\n      }\n    }\n    if (n_right > right_x && nbox.y_overlap(box) && n_left <= target_left) {\n      maybe_ragged_right = false;\n      if (debug) {\n        tprintf(\"Not a ragged right\\n\");\n      }\n    }\n    if (maybe_left_tab_down == -INT32_MAX && maybe_left_tab_up == -INT32_MAX &&\n        maybe_right_tab_down == -INT32_MAX && maybe_right_tab_up == -INT32_MAX) {\n      break;\n    }\n  }\n  if (is_left_tab || maybe_left_tab_up > 1 || maybe_left_tab_down > 1) {\n    bbox->set_left_tab_type(TT_MAYBE_ALIGNED);\n  } else if (maybe_ragged_left && ConfirmRaggedLeft(bbox, min_ragged_gutter)) {\n    bbox->set_left_tab_type(TT_MAYBE_RAGGED);\n  } else {\n    bbox->set_left_tab_type(TT_NONE);\n  }\n  if (is_right_tab || maybe_right_tab_up > 1 || maybe_right_tab_down > 1) {\n    bbox->set_right_tab_type(TT_MAYBE_ALIGNED);\n  } else if (maybe_ragged_right && ConfirmRaggedRight(bbox, min_ragged_gutter)) {\n    bbox->set_right_tab_type(TT_MAYBE_RAGGED);\n  } else {\n    bbox->set_right_tab_type(TT_NONE);\n  }\n  if (debug) {\n    tprintf(\"Left result = %s, Right result=%s\\n\",\n            bbox->left_tab_type() == TT_MAYBE_ALIGNED\n                ? \"Aligned\"\n                : (bbox->left_tab_type() == TT_MAYBE_RAGGED ? \"Ragged\" : \"None\"),\n            bbox->right_tab_type() == TT_MAYBE_ALIGNED\n                ? \"Aligned\"\n                : (bbox->right_tab_type() == TT_MAYBE_RAGGED ? 
\"Ragged\" : \"None\"));\n  }\n  return bbox->left_tab_type() != TT_NONE || bbox->right_tab_type() != TT_NONE;\n}\n\n// Returns true if there is nothing in the rectangle of width min_gutter to\n// the left of bbox.\nbool TabFind::ConfirmRaggedLeft(BLOBNBOX *bbox, int min_gutter) {\n  TBOX search_box(bbox->bounding_box());\n  search_box.set_right(search_box.left());\n  search_box.set_left(search_box.left() - min_gutter);\n  return NothingYOverlapsInBox(search_box, bbox->bounding_box());\n}\n\n// Returns true if there is nothing in the rectangle of width min_gutter to\n// the right of bbox.\nbool TabFind::ConfirmRaggedRight(BLOBNBOX *bbox, int min_gutter) {\n  TBOX search_box(bbox->bounding_box());\n  search_box.set_left(search_box.right());\n  search_box.set_right(search_box.right() + min_gutter);\n  return NothingYOverlapsInBox(search_box, bbox->bounding_box());\n}\n\n// Returns true if there is nothing in the given search_box that vertically\n// overlaps target_box other than target_box itself.\nbool TabFind::NothingYOverlapsInBox(const TBOX &search_box, const TBOX &target_box) {\n  BlobGridSearch rsearch(this);\n  rsearch.StartRectSearch(search_box);\n  BLOBNBOX *blob;\n  while ((blob = rsearch.NextRectSearch()) != nullptr) {\n    const TBOX &box = blob->bounding_box();\n    if (box.y_overlap(target_box) && !(box == target_box)) {\n      return false;\n    }\n  }\n  return true;\n}\n\nvoid TabFind::FindAllTabVectors(int min_gutter_width) {\n  // A list of vectors that will be created in estimating the skew.\n  TabVector_LIST dummy_vectors;\n  // An estimate of the vertical direction, revised as more lines are added.\n  int vertical_x = 0;\n  int vertical_y = 1;\n  // Find an estimate of the vertical direction by finding some tab vectors.\n  // Slowly up the search size until we get some vectors.\n  for (int search_size = kMinVerticalSearch; search_size < kMaxVerticalSearch;\n       search_size += kMinVerticalSearch) {\n    int vector_count = 
FindTabVectors(search_size, TA_LEFT_ALIGNED, min_gutter_width,\n                                      &dummy_vectors, &vertical_x, &vertical_y);\n    vector_count += FindTabVectors(search_size, TA_RIGHT_ALIGNED, min_gutter_width, &dummy_vectors,\n                                   &vertical_x, &vertical_y);\n    if (vector_count > 0) {\n      break;\n    }\n  }\n  // Get rid of the test vectors and reset the types of the tabs.\n  dummy_vectors.clear();\n  for (auto bbox : left_tab_boxes_) {\n    if (bbox->left_tab_type() == TT_CONFIRMED) {\n      bbox->set_left_tab_type(TT_MAYBE_ALIGNED);\n    }\n  }\n  for (auto bbox : right_tab_boxes_) {\n    if (bbox->right_tab_type() == TT_CONFIRMED) {\n      bbox->set_right_tab_type(TT_MAYBE_ALIGNED);\n    }\n  }\n  if (textord_debug_tabfind) {\n    tprintf(\"Beginning real tab search with vertical = %d,%d...\\n\", vertical_x, vertical_y);\n  }\n  // Now do the real thing ,but keep the vectors in the dummy_vectors list\n  // until they are all done, so we don't get the tab vectors confused with\n  // the rule line vectors.\n  FindTabVectors(kMaxVerticalSearch, TA_LEFT_ALIGNED, min_gutter_width, &dummy_vectors, &vertical_x,\n                 &vertical_y);\n  FindTabVectors(kMaxVerticalSearch, TA_RIGHT_ALIGNED, min_gutter_width, &dummy_vectors,\n                 &vertical_x, &vertical_y);\n  FindTabVectors(kMaxRaggedSearch, TA_LEFT_RAGGED, min_gutter_width, &dummy_vectors, &vertical_x,\n                 &vertical_y);\n  FindTabVectors(kMaxRaggedSearch, TA_RIGHT_RAGGED, min_gutter_width, &dummy_vectors, &vertical_x,\n                 &vertical_y);\n  // Now add the vectors to the vectors_ list.\n  TabVector_IT v_it(&vectors_);\n  v_it.add_list_after(&dummy_vectors);\n  // Now use the summed (mean) vertical vector as the direction for everything.\n  SetVerticalSkewAndParallelize(vertical_x, vertical_y);\n}\n\n// Helper for FindAllTabVectors finds the vectors of a particular type.\nint TabFind::FindTabVectors(int 
search_size_multiple, TabAlignment alignment, int min_gutter_width,\n                            TabVector_LIST *vectors, int *vertical_x, int *vertical_y) {\n  TabVector_IT vector_it(vectors);\n  int vector_count = 0;\n  // Search the right or left tab boxes, looking for tab vectors.\n  bool right = alignment == TA_RIGHT_ALIGNED || alignment == TA_RIGHT_RAGGED;\n  const std::vector<BLOBNBOX *> &boxes = right ? right_tab_boxes_ : left_tab_boxes_;\n  for (auto bbox : boxes) {\n    if ((!right && bbox->left_tab_type() == TT_MAYBE_ALIGNED) ||\n        (right && bbox->right_tab_type() == TT_MAYBE_ALIGNED)) {\n      TabVector *vector = FindTabVector(search_size_multiple, min_gutter_width, alignment, bbox,\n                                        vertical_x, vertical_y);\n      if (vector != nullptr) {\n        ++vector_count;\n        vector_it.add_to_end(vector);\n      }\n    }\n  }\n  return vector_count;\n}\n\n// Finds a vector corresponding to a tabstop running through the\n// given box of the given alignment type.\n// search_size_multiple is a multiple of height used to control\n// the size of the search.\n// vertical_x and y are updated with an estimate of the real\n// vertical direction. 
(skew finding.)\n// Returns nullptr if no decent tabstop can be found.\nTabVector *TabFind::FindTabVector(int search_size_multiple, int min_gutter_width,\n                                  TabAlignment alignment, BLOBNBOX *bbox, int *vertical_x,\n                                  int *vertical_y) {\n  int height = std::max(static_cast<int>(bbox->bounding_box().height()), gridsize());\n  AlignedBlobParams align_params(*vertical_x, *vertical_y, height, search_size_multiple,\n                                 min_gutter_width, resolution_, alignment);\n  // FindVerticalAlignment is in the parent (AlignedBlob) class.\n  return FindVerticalAlignment(align_params, bbox, vertical_x, vertical_y);\n}\n\n// Set the vertical_skew_ member from the given vector and refit\n// all vectors parallel to the skew vector.\nvoid TabFind::SetVerticalSkewAndParallelize(int vertical_x, int vertical_y) {\n  // Fit the vertical vector into an ICOORD, which is 16 bit.\n  vertical_skew_.set_with_shrink(vertical_x, vertical_y);\n  if (textord_debug_tabfind) {\n    tprintf(\"Vertical skew vector=(%d,%d)\\n\", vertical_skew_.x(), vertical_skew_.y());\n  }\n  v_it_.set_to_list(&vectors_);\n  for (v_it_.mark_cycle_pt(); !v_it_.cycled_list(); v_it_.forward()) {\n    TabVector *v = v_it_.data();\n    v->Fit(vertical_skew_, true);\n  }\n  // Now sort the vectors as their direction has potentially changed.\n  SortVectors();\n}\n\n// Sort all the current vectors using the given vertical direction vector.\nvoid TabFind::SortVectors() {\n  vectors_.sort(TabVector::SortVectorsByKey);\n  v_it_.set_to_list(&vectors_);\n}\n\n// Evaluate all the current tab vectors.\nvoid TabFind::EvaluateTabs() {\n  TabVector_IT rule_it(&vectors_);\n  for (rule_it.mark_cycle_pt(); !rule_it.cycled_list(); rule_it.forward()) {\n    TabVector *tab = rule_it.data();\n    if (!tab->IsSeparator()) {\n      tab->Evaluate(vertical_skew_, this);\n      if (tab->BoxCount() < kMinEvaluatedTabs) {\n        if (textord_debug_tabfind > 2) 
{\n          tab->Print(\"Too few boxes\");\n        }\n        delete rule_it.extract();\n        v_it_.set_to_list(&vectors_);\n      } else if (WithinTestRegion(3, tab->startpt().x(), tab->startpt().y())) {\n        tab->Print(\"Evaluated tab\");\n      }\n    }\n  }\n}\n\n// Trace textlines from one side to the other of each tab vector, saving\n// the most frequent column widths found in a list so that a given width\n// can be tested for being a common width with a simple callback function.\nvoid TabFind::ComputeColumnWidths(ScrollView *tab_win, ColPartitionGrid *part_grid) {\n#ifndef GRAPHICS_DISABLED\n  if (tab_win != nullptr) {\n    tab_win->Pen(ScrollView::WHITE);\n  }\n#endif // !GRAPHICS_DISABLED\n  // Accumulate column sections into a STATS\n  int col_widths_size = (tright_.x() - bleft_.x()) / kColumnWidthFactor;\n  STATS col_widths(0, col_widths_size);\n  ApplyPartitionsToColumnWidths(part_grid, &col_widths);\n#ifndef GRAPHICS_DISABLED\n  if (tab_win != nullptr) {\n    tab_win->Update();\n  }\n#endif // !GRAPHICS_DISABLED\n  if (textord_debug_tabfind > 1) {\n    col_widths.print();\n  }\n  // Now make a list of column widths.\n  MakeColumnWidths(col_widths_size, &col_widths);\n  // Turn the column width into a range.\n  ApplyPartitionsToColumnWidths(part_grid, nullptr);\n}\n\n// Finds column width and:\n//   if col_widths is not null (pass1):\n//     pair-up tab vectors with existing ColPartitions and accumulate widths.\n//   else (pass2):\n//     find the largest real partition width for each recorded column width,\n//     to be used as the minimum acceptable width.\nvoid TabFind::ApplyPartitionsToColumnWidths(ColPartitionGrid *part_grid, STATS *col_widths) {\n  // For every ColPartition in the part_grid, add partners to the tabvectors\n  // and accumulate the column widths.\n  ColPartitionGridSearch gsearch(part_grid);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    BLOBNBOX_C_IT 
blob_it(part->boxes());\n    if (blob_it.empty()) {\n      continue;\n    }\n    BLOBNBOX *left_blob = blob_it.data();\n    blob_it.move_to_last();\n    BLOBNBOX *right_blob = blob_it.data();\n    TabVector *left_vector = LeftTabForBox(left_blob->bounding_box(), true, false);\n    if (left_vector == nullptr || left_vector->IsRightTab()) {\n      continue;\n    }\n    TabVector *right_vector = RightTabForBox(right_blob->bounding_box(), true, false);\n    if (right_vector == nullptr || right_vector->IsLeftTab()) {\n      continue;\n    }\n\n    int line_left = left_vector->XAtY(left_blob->bounding_box().bottom());\n    int line_right = right_vector->XAtY(right_blob->bounding_box().bottom());\n    // Add to STATS of measurements if the width is significant.\n    int width = line_right - line_left;\n    if (col_widths != nullptr) {\n      AddPartnerVector(left_blob, right_blob, left_vector, right_vector);\n      if (width >= kMinColumnWidth) {\n        col_widths->add(width / kColumnWidthFactor, 1);\n      }\n    } else {\n      width /= kColumnWidthFactor;\n      ICOORDELT_IT it(&column_widths_);\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        ICOORDELT *w = it.data();\n        if (NearlyEqual<int>(width, w->y(), 1)) {\n          int true_width = part->bounding_box().width() / kColumnWidthFactor;\n          if (true_width <= w->y() && true_width > w->x()) {\n            w->set_x(true_width);\n          }\n          break;\n        }\n      }\n    }\n  }\n}\n\n// Helper makes the list of common column widths in column_widths_ from the\n// input col_widths. 
Destroys the content of col_widths by repeatedly\n// finding the mode and erasing the peak.\nvoid TabFind::MakeColumnWidths(int col_widths_size, STATS *col_widths) {\n  ICOORDELT_IT w_it(&column_widths_);\n  int total_col_count = col_widths->get_total();\n  while (col_widths->get_total() > 0) {\n    int width = col_widths->mode();\n    int col_count = col_widths->pile_count(width);\n    col_widths->add(width, -col_count);\n    // Get the entire peak.\n    for (int left = width - 1; left > 0 && col_widths->pile_count(left) > 0; --left) {\n      int new_count = col_widths->pile_count(left);\n      col_count += new_count;\n      col_widths->add(left, -new_count);\n    }\n    for (int right = width + 1; right < col_widths_size && col_widths->pile_count(right) > 0;\n         ++right) {\n      int new_count = col_widths->pile_count(right);\n      col_count += new_count;\n      col_widths->add(right, -new_count);\n    }\n    if (col_count > kMinLinesInColumn &&\n        col_count > kMinFractionalLinesInColumn * total_col_count) {\n      auto *w = new ICOORDELT(0, width);\n      w_it.add_after_then_move(w);\n      if (textord_debug_tabfind) {\n        tprintf(\"Column of width %d has %d = %.2f%% lines\\n\", width * kColumnWidthFactor, col_count,\n                100.0 * col_count / total_col_count);\n      }\n    }\n  }\n}\n\n// Mark blobs as being in a vertical text line where that is the case.\n// Returns true if the majority of the image is vertical text lines.\nvoid TabFind::MarkVerticalText() {\n  if (textord_debug_tabfind) {\n    tprintf(\"Checking for vertical lines\\n\");\n  }\n  BlobGridSearch gsearch(this);\n  gsearch.StartFullSearch();\n  BLOBNBOX *blob = nullptr;\n  while ((blob = gsearch.NextFullSearch()) != nullptr) {\n    if (blob->region_type() < BRT_UNKNOWN) {\n      continue;\n    }\n    if (blob->UniquelyVertical()) {\n      blob->set_region_type(BRT_VERT_TEXT);\n    }\n  }\n}\n\nint TabFind::FindMedianGutterWidth(TabVector_LIST *lines) {\n  TabVector_IT 
it(lines);\n  int prev_right = -1;\n  int max_gap = static_cast<int>(kMaxGutterWidthAbsolute * resolution_);\n  STATS gaps(0, max_gap - 1);\n  STATS heights(0, max_gap - 1);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *v = it.data();\n    TabVector *partner = v->GetSinglePartner();\n    if (!v->IsLeftTab() || v->IsSeparator() || !partner) {\n      continue;\n    }\n    heights.add(partner->startpt().x() - v->startpt().x(), 1);\n    if (prev_right > 0 && v->startpt().x() > prev_right) {\n      gaps.add(v->startpt().x() - prev_right, 1);\n    }\n    prev_right = partner->startpt().x();\n  }\n  if (textord_debug_tabfind) {\n    tprintf(\"TabGutter total %d  median_gap %.2f  median_hgt %.2f\\n\", gaps.get_total(),\n            gaps.median(), heights.median());\n  }\n  if (gaps.get_total() < kMinLinesInColumn) {\n    return 0;\n  }\n  return static_cast<int>(gaps.median());\n}\n\n// Find the next adjacent (looking to the left or right) blob on this text\n// line, with the constraint that it must vertically significantly overlap\n// the [top_y, bottom_y] range.\n// If ignore_images is true, then blobs with aligned_text() < 0 are treated\n// as if they do not exist.\nBLOBNBOX *TabFind::AdjacentBlob(const BLOBNBOX *bbox, bool look_left, bool ignore_images,\n                                double min_overlap_fraction, int gap_limit, int top_y,\n                                int bottom_y) {\n  GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> sidesearch(this);\n  const TBOX &box = bbox->bounding_box();\n  int left = box.left();\n  int right = box.right();\n  int mid_x = (left + right) / 2;\n  sidesearch.StartSideSearch(mid_x, bottom_y, top_y);\n  int best_gap = 0;\n  bool debug = WithinTestRegion(3, left, bottom_y);\n  BLOBNBOX *result = nullptr;\n  BLOBNBOX *neighbour = nullptr;\n  while ((neighbour = sidesearch.NextSideSearch(look_left)) != nullptr) {\n    if (debug) {\n      tprintf(\"Adjacent blob: considering box:\");\n      
neighbour->bounding_box().print();\n    }\n    if (neighbour == bbox || (ignore_images && neighbour->region_type() < BRT_UNKNOWN)) {\n      continue;\n    }\n    const TBOX &nbox = neighbour->bounding_box();\n    int n_top_y = nbox.top();\n    int n_bottom_y = nbox.bottom();\n    int v_overlap = std::min(n_top_y, top_y) - std::max(n_bottom_y, bottom_y);\n    int height = top_y - bottom_y;\n    int n_height = n_top_y - n_bottom_y;\n    if (v_overlap > min_overlap_fraction * std::min(height, n_height) &&\n        (min_overlap_fraction == 0.0 || !DifferentSizes(height, n_height))) {\n      int n_left = nbox.left();\n      int n_right = nbox.right();\n      int h_gap = std::max(n_left, left) - std::min(n_right, right);\n      int n_mid_x = (n_left + n_right) / 2;\n      if (look_left == (n_mid_x < mid_x) && n_mid_x != mid_x) {\n        if (h_gap > gap_limit) {\n          // Hit a big gap before next tab so don't return anything.\n          if (debug) {\n            tprintf(\"Giving up due to big gap = %d vs %d\\n\", h_gap, gap_limit);\n          }\n          return result;\n        }\n        if (h_gap > 0 && (look_left ? neighbour->right_tab_type() : neighbour->left_tab_type()) >=\n                             TT_CONFIRMED) {\n          // Hit a tab facing the wrong way. Stop in case we are crossing\n          // the column boundary.\n          if (debug) {\n            tprintf(\"Collision with like tab of type %d at %d,%d\\n\",\n                    look_left ? neighbour->right_tab_type() : neighbour->left_tab_type(), n_left,\n                    nbox.bottom());\n          }\n          return result;\n        }\n        // This is a good fit to the line. 
Continue with this\n        // neighbour as the bbox if the best gap.\n        if (result == nullptr || h_gap < best_gap) {\n          if (debug) {\n            tprintf(\"Good result\\n\");\n          }\n          result = neighbour;\n          best_gap = h_gap;\n        } else {\n          // The new one is worse, so we probably already have the best result.\n          return result;\n        }\n      } else if (debug) {\n        tprintf(\"Wrong way\\n\");\n      }\n    } else if (debug) {\n      tprintf(\"Insufficient overlap\\n\");\n    }\n  }\n  if (WithinTestRegion(3, left, box.top())) {\n    tprintf(\"Giving up due to end of search\\n\");\n  }\n  return result; // Hit the edge and found nothing.\n}\n\n// Add a bi-directional partner relationship between the left\n// and the right. If one (or both) of the vectors is a separator,\n// extend a nearby extendable vector or create a new one of the\n// correct type, using the given left or right blob as a guide.\nvoid TabFind::AddPartnerVector(BLOBNBOX *left_blob, BLOBNBOX *right_blob, TabVector *left,\n                               TabVector *right) {\n  const TBOX &left_box = left_blob->bounding_box();\n  const TBOX &right_box = right_blob->bounding_box();\n  if (left->IsSeparator()) {\n    // Try to find a nearby left edge to extend.\n    TabVector *v = LeftTabForBox(left_box, true, true);\n    if (v != nullptr && v != left && v->IsLeftTab() &&\n        v->XAtY(left_box.top()) > left->XAtY(left_box.top())) {\n      left = v; // Found a good replacement.\n      left->ExtendToBox(left_blob);\n    } else {\n      // Fake a vector.\n      left = new TabVector(*left, TA_LEFT_RAGGED, vertical_skew_, left_blob);\n      vectors_.add_sorted(TabVector::SortVectorsByKey, left);\n      v_it_.move_to_first();\n    }\n  }\n  if (right->IsSeparator()) {\n    // Try to find a nearby left edge to extend.\n    if (WithinTestRegion(3, right_box.right(), right_box.bottom())) {\n      tprintf(\"Box edge (%d,%d-%d)\", 
right_box.right(), right_box.bottom(), right_box.top());\n      right->Print(\" looking for improvement for\");\n    }\n    TabVector *v = RightTabForBox(right_box, true, true);\n    if (v != nullptr && v != right && v->IsRightTab() &&\n        v->XAtY(right_box.top()) < right->XAtY(right_box.top())) {\n      right = v; // Found a good replacement.\n      right->ExtendToBox(right_blob);\n      if (WithinTestRegion(3, right_box.right(), right_box.bottom())) {\n        right->Print(\"Extended vector\");\n      }\n    } else {\n      // Fake a vector.\n      right = new TabVector(*right, TA_RIGHT_RAGGED, vertical_skew_, right_blob);\n      vectors_.add_sorted(TabVector::SortVectorsByKey, right);\n      v_it_.move_to_first();\n      if (WithinTestRegion(3, right_box.right(), right_box.bottom())) {\n        right->Print(\"Created new vector\");\n      }\n    }\n  }\n  left->AddPartner(right);\n  right->AddPartner(left);\n}\n\n// Remove separators and unused tabs from the main vectors_ list\n// to the dead_vectors_ list.\nvoid TabFind::CleanupTabs() {\n  // TODO(rays) Before getting rid of separators and unused vectors, it\n  // would be useful to try moving ragged vectors outwards to see if this\n  // allows useful extension. 
Could be combined with checking ends of partners.\n  TabVector_IT it(&vectors_);\n  TabVector_IT dead_it(&dead_vectors_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *v = it.data();\n    if (v->IsSeparator() || v->Partnerless()) {\n      dead_it.add_after_then_move(it.extract());\n      v_it_.set_to_list(&vectors_);\n    } else {\n      v->FitAndEvaluateIfNeeded(vertical_skew_, this);\n    }\n  }\n}\n\n// Apply the given rotation to the given list of blobs.\nvoid TabFind::RotateBlobList(const FCOORD &rotation, BLOBNBOX_LIST *blobs) {\n  BLOBNBOX_IT it(blobs);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    it.data()->rotate_box(rotation);\n  }\n}\n\n// Recreate the grid with deskewed BLOBNBOXes.\n// Returns false if the detected skew angle is impossible.\nbool TabFind::Deskew(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block,\n                     FCOORD *deskew, FCOORD *reskew) {\n  ComputeDeskewVectors(deskew, reskew);\n  if (deskew->x() < kCosMaxSkewAngle) {\n    return false;\n  }\n  RotateBlobList(*deskew, image_blobs);\n  RotateBlobList(*deskew, &block->blobs);\n  RotateBlobList(*deskew, &block->small_blobs);\n  RotateBlobList(*deskew, &block->noise_blobs);\n\n  // Rotate the horizontal vectors. 
The vertical vectors don't need\n  // rotating as they can just be refitted.\n  TabVector_IT h_it(hlines);\n  for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) {\n    TabVector *h = h_it.data();\n    h->Rotate(*deskew);\n  }\n  TabVector_IT d_it(&dead_vectors_);\n  for (d_it.mark_cycle_pt(); !d_it.cycled_list(); d_it.forward()) {\n    TabVector *d = d_it.data();\n    d->Rotate(*deskew);\n  }\n  SetVerticalSkewAndParallelize(0, 1);\n  // Rebuild the grid to the new size.\n  TBOX grid_box(bleft_, tright_);\n  grid_box.rotate_large(*deskew);\n  Init(gridsize(), grid_box.botleft(), grid_box.topright());\n  InsertBlobsToGrid(false, false, image_blobs, this);\n  InsertBlobsToGrid(true, false, &block->blobs, this);\n  return true;\n}\n\n// Flip the vertical and horizontal lines and rotate the grid ready\n// for working on the rotated image.\n// This also makes parameter adjustments for FindInitialTabVectors().\nvoid TabFind::ResetForVerticalText(const FCOORD &rotate, const FCOORD &rerotate,\n                                   TabVector_LIST *horizontal_lines, int *min_gutter_width) {\n  // Rotate the horizontal and vertical vectors and swap them over.\n  // Only the separators are kept and rotated; other tabs are used\n  // to estimate the gutter width then thrown away.\n  TabVector_LIST ex_verticals;\n  TabVector_IT ex_v_it(&ex_verticals);\n  TabVector_LIST vlines;\n  TabVector_IT v_it(&vlines);\n  while (!v_it_.empty()) {\n    TabVector *v = v_it_.extract();\n    if (v->IsSeparator()) {\n      v->Rotate(rotate);\n      ex_v_it.add_after_then_move(v);\n    } else {\n      v_it.add_after_then_move(v);\n    }\n    v_it_.forward();\n  }\n\n  // Adjust the min gutter width for better tabbox selection\n  // in 2nd call to FindInitialTabVectors().\n  int median_gutter = FindMedianGutterWidth(&vlines);\n  if (median_gutter > *min_gutter_width) {\n    *min_gutter_width = median_gutter;\n  }\n\n  TabVector_IT h_it(horizontal_lines);\n  for (h_it.mark_cycle_pt(); 
!h_it.cycled_list(); h_it.forward()) {\n    TabVector *h = h_it.data();\n    h->Rotate(rotate);\n  }\n  v_it_.add_list_after(horizontal_lines);\n  v_it_.move_to_first();\n  h_it.set_to_list(horizontal_lines);\n  h_it.add_list_after(&ex_verticals);\n\n  // Rebuild the grid to the new size.\n  TBOX grid_box(bleft(), tright());\n  grid_box.rotate_large(rotate);\n  Init(gridsize(), grid_box.botleft(), grid_box.topright());\n}\n\n// Clear the grid and get rid of the tab vectors, but not separators,\n// ready to start again.\nvoid TabFind::Reset() {\n  v_it_.move_to_first();\n  for (v_it_.mark_cycle_pt(); !v_it_.cycled_list(); v_it_.forward()) {\n    if (!v_it_.data()->IsSeparator()) {\n      delete v_it_.extract();\n    }\n  }\n  Clear();\n}\n\n// Reflect the separator tab vectors and the grids in the y-axis.\n// Can only be called after Reset!\nvoid TabFind::ReflectInYAxis() {\n  TabVector_LIST temp_list;\n  TabVector_IT temp_it(&temp_list);\n  v_it_.move_to_first();\n  // The TabVector list only contains vertical lines, but they need to be\n  // reflected and the list needs to be reversed, so they are still in\n  // sort_key order.\n  while (!v_it_.empty()) {\n    TabVector *v = v_it_.extract();\n    v_it_.forward();\n    v->ReflectInYAxis();\n    temp_it.add_before_then_move(v);\n  }\n  v_it_.add_list_after(&temp_list);\n  v_it_.move_to_first();\n  // Reset this grid with reflected bounding boxes.\n  TBOX grid_box(bleft(), tright());\n  int tmp = grid_box.left();\n  grid_box.set_left(-grid_box.right());\n  grid_box.set_right(-tmp);\n  Init(gridsize(), grid_box.botleft(), grid_box.topright());\n}\n\n// Compute the rotation required to deskew, and its inverse rotation.\nvoid TabFind::ComputeDeskewVectors(FCOORD *deskew, FCOORD *reskew) {\n  double length = vertical_skew_ % vertical_skew_;\n  length = sqrt(length);\n  deskew->set_x(static_cast<float>(vertical_skew_.y() / length));\n  deskew->set_y(static_cast<float>(vertical_skew_.x() / length));\n  
reskew->set_x(deskew->x());\n  reskew->set_y(-deskew->y());\n}\n\n// Compute and apply constraints to the end positions of TabVectors so\n// that where possible partners end at the same y coordinate.\nvoid TabFind::ApplyTabConstraints() {\n  TabVector_IT it(&vectors_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *v = it.data();\n    v->SetupConstraints();\n  }\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *v = it.data();\n    // With the first and last partner, we want a common bottom and top,\n    // respectively, and for each change of partner, we want a common\n    // top of first with bottom of next.\n    v->SetupPartnerConstraints();\n  }\n  // TODO(rays) The back-to-back pairs should really be done like the\n  // front-to-front pairs, but there is no convenient way of producing the\n  // list of partners like there is with the front-to-front.\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *v = it.data();\n    if (!v->IsRightTab()) {\n      continue;\n    }\n    // For each back-to-back pair of vectors, try for common top and bottom.\n    TabVector_IT partner_it(it);\n    for (partner_it.forward(); !partner_it.at_first(); partner_it.forward()) {\n      TabVector *partner = partner_it.data();\n      if (!partner->IsLeftTab() || !v->VOverlap(*partner)) {\n        continue;\n      }\n      v->SetupPartnerConstraints(partner);\n    }\n  }\n  // Now actually apply the constraints to get common start/end points.\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *v = it.data();\n    if (!v->IsSeparator()) {\n      v->ApplyConstraints();\n    }\n  }\n  // TODO(rays) Where constraint application fails, it would be good to try\n  // checking the ends to see if they really should be moved.\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/tabfind.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tabfind.h\n// Description: Subclass of BBGrid to find tabstops.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_TABFIND_H_\n#define TESSERACT_TEXTORD_TABFIND_H_\n\n#include <functional> // for std::function\n#include \"alignedblob.h\"\n#include \"linefind.h\"\n#include \"tabvector.h\"\n\nclass BLOBNBOX;\nclass BLOBNBOX_LIST;\nclass TO_BLOCK;\nclass ScrollView;\nstruct Pix;\n\nnamespace tesseract {\n\nusing WidthCallback = std::function<bool(int)>;\n\nstruct AlignedBlobParams;\nclass ColPartitionGrid;\n\n/** Pixel resolution of column width estimates. 
*/\nconst int kColumnWidthFactor = 20;\n\n/**\n * The TabFind class contains code to find tab-stops and maintain the\n * vectors_ list of tab vectors.\n * Also provides an interface to find neighbouring blobs\n * in the grid of BLOBNBOXes that is used by multiple subclasses.\n * Searching is a complex operation because of the need to enforce\n * rule/separator lines, and tabstop boundaries, (when available), so\n * as the holder of the list of TabVectors this class provides the functions.\n */\nclass TESS_API TabFind : public AlignedBlob {\npublic:\n  TabFind(int gridsize, const ICOORD &bleft, const ICOORD &tright, TabVector_LIST *vlines,\n          int vertical_x, int vertical_y, int resolution);\n  ~TabFind() override;\n\n  /**\n   * Insert a list of blobs into the given grid (not necessarily this).\n   * See InsertBlob for the other arguments.\n   * It would seem to make more sense to swap this and grid, but this way\n   * around allows grid to not be derived from TabFind, eg a ColPartitionGrid,\n   * while the grid that provides the tab stops(this) has to be derived from\n   * TabFind.\n   */\n  void InsertBlobsToGrid(bool h_spread, bool v_spread, BLOBNBOX_LIST *blobs,\n                         BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid);\n\n  /**\n   * Insert a single blob into the given grid (not necessarily this).\n   * If h_spread, then all cells covered horizontally by the box are\n   * used, otherwise, just the bottom-left. 
Similarly for v_spread.\n   * A side effect is that the left and right rule edges of the blob are\n   * set according to the tab vectors in this (not grid).\n   */\n  bool InsertBlob(bool h_spread, bool v_spread, BLOBNBOX *blob,\n                  BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid);\n  // Calls SetBlobRuleEdges for all the blobs in the given block.\n  void SetBlockRuleEdges(TO_BLOCK *block);\n  // Sets the left and right rule and crossing_rules for the blobs in the given\n  // list by finding the next outermost tabvectors for each blob.\n  void SetBlobRuleEdges(BLOBNBOX_LIST *blobs);\n\n  // Returns the gutter width of the given TabVector between the given y limits.\n  // Also returns x-shift to be added to the vector to clear any intersecting\n  // blobs. The shift is deducted from the returned gutter.\n  // If ignore_unmergeables is true, then blobs of UnMergeableType are\n  // ignored as if they don't exist. (Used for text on image.)\n  // max_gutter_width is used as the maximum width worth searching for in case\n  // there is nothing near the TabVector.\n  int GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables,\n                  int max_gutter_width, int *required_shift);\n  /**\n   * Find the gutter width and distance to inner neighbour for the given blob.\n   */\n  void GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left,\n                                  BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap);\n\n  /**\n   * Return the x-coord that corresponds to the right edge for the given\n   * box. If there is a rule line to the right that vertically overlaps it,\n   * then return the x-coord of the rule line, otherwise return the right\n   * edge of the page. 
For details see RightTabForBox below.\n   */\n  int RightEdgeForBox(const TBOX &box, bool crossing, bool extended);\n  /**\n   * As RightEdgeForBox, but finds the left Edge instead.\n   */\n  int LeftEdgeForBox(const TBOX &box, bool crossing, bool extended);\n\n  /**\n   * Return the TabVector that corresponds to the right edge for the given\n   * box. If there is a TabVector to the right that vertically overlaps it,\n   * then return it, otherwise return nullptr. Note that Right and Left refer\n   * to the position of the TabVector, not its type, ie RightTabForBox\n   * returns the nearest TabVector to the right of the box, regardless of\n   * its type.\n   * If a TabVector crosses right through the box (as opposed to grazing one\n   * edge or missing entirely), then crossing false will ignore such a line.\n   * Crossing true will return the line for BOTH left and right edges.\n   * If extended is true, then TabVectors are considered to extend to their\n   * extended_start/end_y, otherwise, just the startpt_ and endpt_.\n   * These functions make use of an internal iterator to the vectors_ list\n   * for speed when used repeatedly on neighbouring boxes. 
The caveat is\n   * that the iterator must be updated whenever the list is modified.\n   */\n  TabVector *RightTabForBox(const TBOX &box, bool crossing, bool extended);\n  /**\n   * As RightTabForBox, but finds the left TabVector instead.\n   */\n  TabVector *LeftTabForBox(const TBOX &box, bool crossing, bool extended);\n\n  /**\n   * Return true if the given width is close to one of the common\n   * widths in column_widths_.\n   */\n  bool CommonWidth(int width);\n  /**\n   * Return true if the sizes are more than a\n   * factor of 2 different.\n   */\n  static bool DifferentSizes(int size1, int size2);\n  /**\n   * Return true if the sizes are more than a\n   * factor of 5 different.\n   */\n  static bool VeryDifferentSizes(int size1, int size2);\n\n  /**\n   * Return a callback for testing CommonWidth.\n   */\n  WidthCallback WidthCB() {\n    return width_cb_;\n  }\n\n  /**\n   * Return the coords at which to draw the image backdrop.\n   */\n  const ICOORD &image_origin() const {\n    return image_origin_;\n  }\n\nprotected:\n  /**\n// Accessors\n */\n  TabVector_LIST *vectors() {\n    return &vectors_;\n  }\n  TabVector_LIST *dead_vectors() {\n    return &dead_vectors_;\n  }\n\n  /**\n   * Top-level function to find TabVectors in an input page block.\n   * Returns false if the detected skew angle is impossible.\n   * Applies the detected skew angle to deskew the tabs, blobs and part_grid.\n   * tabfind_aligned_gap_fraction should be the value of parameter\n   * textord_tabfind_aligned_gap_fraction\n   */\n  bool FindTabVectors(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block,\n                      int min_gutter_width, double tabfind_aligned_gap_fraction,\n                      ColPartitionGrid *part_grid, FCOORD *deskew, FCOORD *reskew);\n\n  // Top-level function to not find TabVectors in an input page block,\n  // but setup for single column mode.\n  void DontFindTabVectors(BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, FCOORD *deskew,\n    
                      FCOORD *reskew);\n\n  // Cleans up the lists of blobs in the block ready for use by TabFind.\n  // Large blobs that look like text are moved to the main blobs list.\n  // Main blobs that are superseded by the image blobs are deleted.\n  void TidyBlobs(TO_BLOCK *block);\n\n  // Helper function to setup search limits for *TabForBox.\n  void SetupTabSearch(int x, int y, int *min_key, int *max_key);\n\n  /**\n   * Display the tab vectors found in this grid.\n   */\n  ScrollView *DisplayTabVectors(ScrollView *tab_win);\n\n  // First part of FindTabVectors, which may be used twice if the text\n  // is mostly of vertical alignment.  If find_vertical_text flag is\n  // true, this finds vertical textlines in possibly rotated blob space.\n  // In other words, when the page has mostly vertical lines and is rotated,\n  // setting this to true will find horizontal lines on the page.\n  // tabfind_aligned_gap_fraction should be the value of parameter\n  // textord_tabfind_aligned_gap_fraction\n  ScrollView *FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_gutter_width,\n                                    double tabfind_aligned_gap_fraction, TO_BLOCK *block);\n\n  // Apply the given rotation to the given list of blobs.\n  static void RotateBlobList(const FCOORD &rotation, BLOBNBOX_LIST *blobs);\n\n  // Flip the vertical and horizontal lines and rotate the grid ready\n  // for working on the rotated image.\n  // The min_gutter_width will be adjusted to the median gutter width between\n  // vertical tabs to set a better threshold for tabboxes in the 2nd pass.\n  void ResetForVerticalText(const FCOORD &rotate, const FCOORD &rerotate,\n                            TabVector_LIST *horizontal_lines, int *min_gutter_width);\n\n  // Clear the grid and get rid of the tab vectors, but not separators,\n  // ready to start again.\n  void Reset();\n\n  // Reflect the separator tab vectors and the grids in the y-axis.\n  // Can only be called after Reset!\n  void 
ReflectInYAxis();\n\nprivate:\n  // For each box in the grid, decide whether it is a candidate tab-stop,\n  // and if so add it to the left and right tab boxes.\n  // tabfind_aligned_gap_fraction should be the value of parameter\n  // textord_tabfind_aligned_gap_fraction\n  ScrollView *FindTabBoxes(int min_gutter_width, double tabfind_aligned_gap_fraction);\n\n  // Return true if this box looks like a candidate tab stop, and set\n  // the appropriate tab type(s) to TT_UNCONFIRMED.\n  // tabfind_aligned_gap_fraction should be the value of parameter\n  // textord_tabfind_aligned_gap_fraction\n  bool TestBoxForTabs(BLOBNBOX *bbox, int min_gutter_width, double tabfind_aligned_gap_fraction);\n\n  // Returns true if there is nothing in the rectangle of width min_gutter to\n  // the left of bbox.\n  bool ConfirmRaggedLeft(BLOBNBOX *bbox, int min_gutter);\n  // Returns true if there is nothing in the rectangle of width min_gutter to\n  // the right of bbox.\n  bool ConfirmRaggedRight(BLOBNBOX *bbox, int min_gutter);\n  // Returns true if there is nothing in the given search_box that vertically\n  // overlaps target_box other than target_box itself.\n  bool NothingYOverlapsInBox(const TBOX &search_box, const TBOX &target_box);\n\n  // Fills the list of TabVector with the tabstops found in the grid,\n  // and estimates the logical vertical direction.\n  void FindAllTabVectors(int min_gutter_width);\n  // Helper for FindAllTabVectors finds the vectors of a particular type.\n  int FindTabVectors(int search_size_multiple, TabAlignment alignment, int min_gutter_width,\n                     TabVector_LIST *vectors, int *vertical_x, int *vertical_y);\n  // Finds a vector corresponding to a tabstop running through the\n  // given box of the given alignment type.\n  // search_size_multiple is a multiple of height used to control\n  // the size of the search.\n  // vertical_x and y are updated with an estimate of the real\n  // vertical direction. 
(skew finding.)\n  // Returns nullptr if no decent tabstop can be found.\n  TabVector *FindTabVector(int search_size_multiple, int min_gutter_width, TabAlignment alignment,\n                           BLOBNBOX *bbox, int *vertical_x, int *vertical_y);\n\n  // Set the vertical_skew_ member from the given vector and refit\n  // all vectors parallel to the skew vector.\n  void SetVerticalSkewAndParallelize(int vertical_x, int vertical_y);\n\n  // Sort all the current vectors using the vertical_skew_ vector.\n  void SortVectors();\n\n  // Evaluate all the current tab vectors.\n  void EvaluateTabs();\n\n  // Trace textlines from one side to the other of each tab vector, saving\n  // the most frequent column widths found in a list so that a given width\n  // can be tested for being a common width with a simple callback function.\n  void ComputeColumnWidths(ScrollView *tab_win, ColPartitionGrid *part_grid);\n\n  // Finds column width and:\n  //   if col_widths is not null (pass1):\n  //     pair-up tab vectors with existing ColPartitions and accumulate widths.\n  //   else (pass2):\n  //     find the largest real partition width for each recorded column width,\n  //     to be used as the minimum acceptable width.\n  void ApplyPartitionsToColumnWidths(ColPartitionGrid *part_grid, STATS *col_widths);\n\n  // Helper makes the list of common column widths in column_widths_ from the\n  // input col_widths. Destroys the content of col_widths by repeatedly\n  // finding the mode and erasing the peak.\n  void MakeColumnWidths(int col_widths_size, STATS *col_widths);\n\n  // Mark blobs as being in a vertical text line where that is the case.\n  void MarkVerticalText();\n\n  // Returns the median gutter width between pairs of matching tab vectors\n  // assuming they are sorted left-to-right.  
If there are too few data\n  // points (< kMinLinesInColumn), then 0 is returned.\n  int FindMedianGutterWidth(TabVector_LIST *tab_vectors);\n\n  // Find the next adjacent (to left or right) blob on this text line,\n  // with the constraint that it must vertically significantly overlap\n  // the [top_y, bottom_y] range.\n  // If ignore_images is true, then blobs with aligned_text() < 0 are treated\n  // as if they do not exist.\n  BLOBNBOX *AdjacentBlob(const BLOBNBOX *bbox, bool look_left, bool ignore_images,\n                         double min_overlap_fraction, int gap_limit, int top_y, int bottom_y);\n\n  // Add a bi-directional partner relationship between the left\n  // and the right. If one (or both) of the vectors is a separator,\n  // extend a nearby extendable vector or create a new one of the\n  // correct type, using the given left or right blob as a guide.\n  void AddPartnerVector(BLOBNBOX *left_blob, BLOBNBOX *right_blob, TabVector *left,\n                        TabVector *right);\n\n  /**\n   * Remove separators and unused tabs from the main vectors_ list\n   * to the dead_vectors_ list.\n   */\n  void CleanupTabs();\n\n  /**\n   * Deskew the tab vectors and blobs, computing the rotation and resetting\n   * the stored vertical_skew_. 
The deskew inverse is returned in reskew.\n   * Returns false if the detected skew angle is impossible.\n   */\n  bool Deskew(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, FCOORD *deskew,\n              FCOORD *reskew);\n\n  // Compute the rotation required to deskew, and its inverse rotation.\n  void ComputeDeskewVectors(FCOORD *deskew, FCOORD *reskew);\n\n  /**\n   * Compute and apply constraints to the end positions of TabVectors so\n   * that where possible partners end at the same y coordinate.\n   */\n  void ApplyTabConstraints();\n\nprotected:\n  ICOORD vertical_skew_; ///< Estimate of true vertical in this image.\n  int resolution_;       ///< Of source image in pixels per inch.\nprivate:\n  ICOORD image_origin_;         ///< Top-left of image in deskewed coords\n  TabVector_LIST vectors_;      ///< List of rule line and tabstops.\n  TabVector_IT v_it_;           ///< Iterator for searching vectors_.\n  TabVector_LIST dead_vectors_; ///< Separators and unpartnered tab vectors.\n  // List of commonly occurring width ranges with x=min and y=max.\n  ICOORDELT_LIST column_widths_; ///< List of commonly occurring width ranges.\n  /** Callback to test an int for being a common width. */\n  WidthCallback width_cb_;\n  // Sets of bounding boxes that are candidate tab stops.\n  std::vector<BLOBNBOX *> left_tab_boxes_;\n  std::vector<BLOBNBOX *> right_tab_boxes_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_TABFIND_H_\n"
  },
  {
    "path": "src/textord/tablefind.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tablefind.cpp\n// Description: Helper classes to find tables from ColPartitions.\n// Author:      Faisal Shafait (faisal.shafait@dfki.de)\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <algorithm>\n#include <cmath>\n#include <utility>\n#include \"tablefind.h\"\n\n#include <allheaders.h>\n\n#include \"colpartitionset.h\"\n#include \"tablerecog.h\"\n\nnamespace tesseract {\n\n// These numbers are used to calculate the global median stats.\n// They just set an upper bound on the stats objects.\n// Maximum vertical spacing between neighbor partitions.\nconst int kMaxVerticalSpacing = 500;\n// Maximum width of a blob in a partition.\nconst int kMaxBlobWidth = 500;\n\n// Minimum whitespace size to split a partition (measured as a multiple\n// of a partition's median width).\nconst double kSplitPartitionSize = 2.0;\n// To insert text, the partition must satisfy these size constraints\n// in AllowTextPartition(). 
The idea is to filter noise partitions\n// determined by the size compared to the global medians.\n// TODO(nbeato): Need to find good numbers again.\nconst double kAllowTextHeight = 0.5;\nconst double kAllowTextWidth = 0.6;\nconst double kAllowTextArea = 0.8;\n// The same thing applies to blobs (to filter noise).\n// TODO(nbeato): These numbers are a shot in the dark...\n// height and width are 0.5 * gridsize() in colfind.cpp\n// area is a rough guess for the size of a period.\nconst double kAllowBlobHeight = 0.3;\nconst double kAllowBlobWidth = 0.4;\nconst double kAllowBlobArea = 0.05;\n\n// Minimum number of components in a text partition. A partition having fewer\n// components than that is more likely a data partition and is a candidate\n// table cell.\nconst int kMinBoxesInTextPartition = 10;\n\n// Maximum number of components that a data partition can have\nconst int kMaxBoxesInDataPartition = 20;\n\n// Maximum allowed gap in a text partitions as a multiple of its median size.\nconst double kMaxGapInTextPartition = 4.0;\n\n// Minimum value that the maximum gap in a text partition should have as a\n// factor of its median size.\nconst double kMinMaxGapInTextPartition = 0.5;\n\n// The amount of overlap that is \"normal\" for adjacent blobs in a text\n// partition. 
This is used to calculate gap between overlapping blobs.\nconst double kMaxBlobOverlapFactor = 4.0;\n\n// Maximum x-height a table partition can have as a multiple of global\n// median x-height\nconst double kMaxTableCellXheight = 2.0;\n\n// Maximum line spacing between a table column header and column contents\n// for merging the two (as a multiple of the partition's median_height).\nconst int kMaxColumnHeaderDistance = 4;\n\n// Minimum ratio of num_table_partitions to num_text_partitions in a column\n// block to call it a table column\nconst double kTableColumnThreshold = 3.0;\n\n// Search for horizontal ruling lines within the vertical margin as a\n// multiple of grid size\n// const int kRulingVerticalMargin = 3;\n\n// Minimum overlap that a colpartition must have with a table region\n// to become part of that table\nconst double kMinOverlapWithTable = 0.6;\n\n// Maximum side space (distance from column boundary) that a typical\n// text-line in flowing text should have as a multiple of its x-height\n// (Median size).\nconst int kSideSpaceMargin = 10;\n\n// Fraction of the peak of x-projection of a table region to set the\n// threshold for the x-projection histogram\nconst double kSmallTableProjectionThreshold = 0.35;\nconst double kLargeTableProjectionThreshold = 0.45;\n// Minimum number of rows required to look for more rows in the projection.\nconst int kLargeTableRowCount = 6;\n\n// Minimum number of rows in a table\nconst int kMinRowsInTable = 3;\n\n// The amount of padding (multiplied by global_median_xheight_ during use)\n// that is vertically added to the adjacent leader search during\n// ColPartition marking.\nconst int kAdjacentLeaderSearchPadding = 2;\n\n// Used when filtering false positives. 
When finding the last line\n// of a paragraph (typically left-aligned), the previous line should have\n// its center to the right of the last line by this scaled amount.\nconst double kParagraphEndingPreviousLineRatio = 1.3;\n\n// The maximum amount of whitespace allowed left of a paragraph ending.\n// Do not filter a ColPartition with more than this space left of it.\nconst double kMaxParagraphEndingLeftSpaceMultiple = 3.0;\n\n// Used when filtering false positives. The last line of a paragraph\n// should be preceded by a line that is predominantly text. This is the\n// ratio of text to whitespace (to the right of the text) that is required\n// for the previous line to be a text.\nconst double kMinParagraphEndingTextToWhitespaceRatio = 3.0;\n\n// When counting table columns, this is the required gap between two columns\n// (it is multiplied by global_median_xheight_).\nconst double kMaxXProjectionGapFactor = 2.0;\n\n// Used for similarity in partitions using stroke width. Values copied\n// from ColFind.cpp in Ray's CL.\nconst double kStrokeWidthFractionalTolerance = 0.25;\nconst double kStrokeWidthConstantTolerance = 2.0;\n\n#ifndef GRAPHICS_DISABLED\nstatic BOOL_VAR(textord_show_tables, false, \"Show table regions (ScrollView)\");\nstatic BOOL_VAR(textord_tablefind_show_mark, false,\n                \"Debug table marking steps in detail (ScrollView)\");\nstatic BOOL_VAR(textord_tablefind_show_stats, false,\n                \"Show page stats used in table finding (ScrollView)\");\n#endif\nstatic BOOL_VAR(textord_tablefind_recognize_tables, false,\n                \"Enables the table recognizer for table layout and filtering.\");\n\n// Templated helper function used to create destructor callbacks for the\n// BBGrid::ClearGridData() method.\ntemplate <typename T>\nvoid DeleteObject(T *object) {\n  delete object;\n}\n\nTableFinder::TableFinder()\n    : resolution_(0),\n      global_median_xheight_(0),\n      global_median_blob_width_(0),\n      
global_median_ledding_(0),\n      left_to_right_language_(true) {}\n\nTableFinder::~TableFinder() {\n  // ColPartitions and ColSegments created by this class for storage in grids\n  // need to be deleted explicitly.\n  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);\n  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);\n  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);\n  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);\n  table_grid_.ClearGridData(&DeleteObject<ColSegment>);\n}\n\nvoid TableFinder::set_left_to_right_language(bool order) {\n  left_to_right_language_ = order;\n}\n\nvoid TableFinder::Init(int grid_size, const ICOORD &bottom_left,\n                       const ICOORD &top_right) {\n  // Initialize clean partitions list and grid\n  clean_part_grid_.Init(grid_size, bottom_left, top_right);\n  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);\n  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);\n  col_seg_grid_.Init(grid_size, bottom_left, top_right);\n  table_grid_.Init(grid_size, bottom_left, top_right);\n}\n\n// Copy cleaned partitions from part_grid_ to clean_part_grid_ and\n// insert leaders and rulers into the leader_and_ruling_grid_\nvoid TableFinder::InsertCleanPartitions(ColPartitionGrid *grid,\n                                        TO_BLOCK *block) {\n  // Calculate stats. 
This lets us filter partitions in AllowTextPartition()\n  // and filter blobs in AllowBlob().\n  SetGlobalSpacings(grid);\n\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(grid);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    // Reject partitions with nothing useful inside of them.\n    if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {\n      continue;\n    }\n    ColPartition *clean_part = part->ShallowCopy();\n    ColPartition *leader_part = nullptr;\n    if (part->IsLineType()) {\n      InsertRulingPartition(clean_part);\n      continue;\n    }\n    // Insert all non-text partitions to clean_parts\n    if (!part->IsTextType()) {\n      InsertImagePartition(clean_part);\n      continue;\n    }\n    // Insert text colpartitions after removing noisy components from them\n    // The leaders are split into a separate grid.\n    BLOBNBOX_CLIST *part_boxes = part->boxes();\n    BLOBNBOX_C_IT pit(part_boxes);\n    for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {\n      BLOBNBOX *pblob = pit.data();\n      // Bad blobs... happens in UNLV set.\n      // news.3G1, page 17 (around x=6)\n      if (!AllowBlob(*pblob)) {\n        continue;\n      }\n      if (pblob->flow() == BTFT_LEADER) {\n        if (leader_part == nullptr) {\n          leader_part = part->ShallowCopy();\n          leader_part->set_flow(BTFT_LEADER);\n        }\n        leader_part->AddBox(pblob);\n      } else if (pblob->region_type() != BRT_NOISE) {\n        clean_part->AddBox(pblob);\n      }\n    }\n    clean_part->ComputeLimits();\n    ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();\n    InsertTextPartition(clean_part);\n    SplitAndInsertFragmentedTextPartition(fragmented);\n    if (leader_part != nullptr) {\n      // TODO(nbeato): Note that ComputeLimits does not update the column\n      // information. 
So the leader may appear to span more columns than it\n      // really does later on when IsInSameColumnAs gets called to test\n      // for adjacent leaders.\n      leader_part->ComputeLimits();\n      InsertLeaderPartition(leader_part);\n    }\n  }\n\n  // Make the partition partners better for upper and lower neighbors.\n  clean_part_grid_.FindPartitionPartners();\n  clean_part_grid_.RefinePartitionPartners(false);\n}\n\n// High level function to perform table detection\nvoid TableFinder::LocateTables(ColPartitionGrid *grid,\n                               ColPartitionSet **all_columns,\n                               WidthCallback width_cb, const FCOORD &reskew) {\n  // initialize spacing, neighbors, and columns\n  InitializePartitions(all_columns);\n\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_tables) {\n    ScrollView *table_win = MakeWindow(0, 300, \"Column Partitions & Neighbors\");\n    DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);\n    DisplayColPartitions(table_win, &leader_and_ruling_grid_,\n                         ScrollView::AQUAMARINE);\n    DisplayColPartitionConnections(table_win, &clean_part_grid_,\n                                   ScrollView::ORANGE);\n\n    table_win = MakeWindow(100, 300, \"Fragmented Text\");\n    DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE);\n  }\n#endif // !GRAPHICS_DISABLED\n\n  // mark, filter, and smooth candidate table partitions\n  MarkTablePartitions();\n\n  // Make single-column blocks from good_columns_ partitions. 
col_segments are\n  // moved to a grid later which takes the ownership\n  ColSegment_LIST column_blocks;\n  GetColumnBlocks(all_columns, &column_blocks);\n  // Set the ratio of candidate table partitions in each column\n  SetColumnsType(&column_blocks);\n\n  // Move column segments to col_seg_grid_\n  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);\n\n  // Detect split in column layout that might have occurred due to the\n  // presence of a table. In such a case, merge the corresponding columns.\n  GridMergeColumnBlocks();\n\n  // Group horizontally overlapping table partitions into table columns.\n  // table_columns created here get deleted at the end of this method.\n  ColSegment_LIST table_columns;\n  GetTableColumns(&table_columns);\n\n  // Within each column, mark the range table regions occupy based on the\n  // table columns detected. table_regions are moved to a grid later which\n  // takes the ownership\n  ColSegment_LIST table_regions;\n  GetTableRegions(&table_columns, &table_regions);\n\n#ifndef GRAPHICS_DISABLED\n  if (textord_tablefind_show_mark) {\n    ScrollView *table_win = MakeWindow(1200, 300, \"Table Columns and Regions\");\n    DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);\n    DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);\n  }\n#endif // !GRAPHICS_DISABLED\n\n  // Merge table regions across columns for tables spanning multiple\n  // columns\n  MoveColSegmentsToGrid(&table_regions, &table_grid_);\n  GridMergeTableRegions();\n\n  // Adjust table boundaries by including nearby horizontal lines and left\n  // out column headers\n  AdjustTableBoundaries();\n  GridMergeTableRegions();\n\n  if (textord_tablefind_recognize_tables) {\n    // Remove false alarms consisting of a single column\n    DeleteSingleColumnTables();\n\n#ifndef GRAPHICS_DISABLED\n    if (textord_show_tables) {\n      ScrollView *table_win = MakeWindow(1200, 300, \"Detected Table Locations\");\n      
DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);\n      DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);\n      table_grid_.DisplayBoxes(table_win);\n    }\n#endif // !GRAPHICS_DISABLED\n\n    // Find table grid structure and reject tables that are malformed.\n    RecognizeTables();\n    GridMergeTableRegions();\n    RecognizeTables();\n\n#ifndef GRAPHICS_DISABLED\n    if (textord_show_tables) {\n      ScrollView *table_win = MakeWindow(1400, 600, \"Recognized Tables\");\n      DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,\n                           ScrollView::BLUE);\n      table_grid_.DisplayBoxes(table_win);\n    }\n#endif // !GRAPHICS_DISABLED\n  } else {\n    // Remove false alarms consisting of a single column\n    // TODO(nbeato): verify this is a NOP after structured table rejection.\n    // Right now it isn't. If the recognize function is doing what it is\n    // supposed to do, this function is obsolete.\n    DeleteSingleColumnTables();\n\n#ifndef GRAPHICS_DISABLED\n    if (textord_show_tables) {\n      ScrollView *table_win = MakeWindow(1500, 300, \"Detected Tables\");\n      DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,\n                           ScrollView::BLUE);\n      table_grid_.DisplayBoxes(table_win);\n    }\n#endif // !GRAPHICS_DISABLED\n  }\n\n  // Merge all colpartitions in table regions to make them a single\n  // colpartition and revert types of isolated table cells not\n  // assigned to any table to their original types.\n  MakeTableBlocks(grid, all_columns, width_cb);\n}\n// All grids have the same dimensions. The clean_part_grid_ sizes are set from\n// the part_grid_ that is passed to InsertCleanPartitions, which was the same as\n// the grid that is the base of ColumnFinder. 
Just return the clean_part_grid_\n// dimensions instead of duplicated memory.\nint TableFinder::gridsize() const {\n  return clean_part_grid_.gridsize();\n}\nint TableFinder::gridwidth() const {\n  return clean_part_grid_.gridwidth();\n}\nint TableFinder::gridheight() const {\n  return clean_part_grid_.gridheight();\n}\nconst ICOORD &TableFinder::bleft() const {\n  return clean_part_grid_.bleft();\n}\nconst ICOORD &TableFinder::tright() const {\n  return clean_part_grid_.tright();\n}\n\nvoid TableFinder::InsertTextPartition(ColPartition *part) {\n  ASSERT_HOST(part != nullptr);\n  if (AllowTextPartition(*part)) {\n    clean_part_grid_.InsertBBox(true, true, part);\n  } else {\n    delete part;\n  }\n}\nvoid TableFinder::InsertFragmentedTextPartition(ColPartition *part) {\n  ASSERT_HOST(part != nullptr);\n  if (AllowTextPartition(*part)) {\n    fragmented_text_grid_.InsertBBox(true, true, part);\n  } else {\n    delete part;\n  }\n}\nvoid TableFinder::InsertLeaderPartition(ColPartition *part) {\n  ASSERT_HOST(part != nullptr);\n  if (!part->IsEmpty() && part->bounding_box().area() > 0) {\n    leader_and_ruling_grid_.InsertBBox(true, true, part);\n  } else {\n    delete part;\n  }\n}\nvoid TableFinder::InsertRulingPartition(ColPartition *part) {\n  leader_and_ruling_grid_.InsertBBox(true, true, part);\n}\nvoid TableFinder::InsertImagePartition(ColPartition *part) {\n  // NOTE: If images are placed into a different grid in the future,\n  // the function SetPartitionSpacings needs to be updated. It should\n  // be the only thing that cares about image partitions.\n  clean_part_grid_.InsertBBox(true, true, part);\n}\n\n// Splits a partition into its \"words\". The splits happen\n// at locations with wide inter-blob spacing. This is useful\n// because it allows the table recognizer to \"cut through\" the\n// text lines on the page. 
The assumption is that a table\n// will have several lines with similar overlapping whitespace\n// whereas text will not have this type of property.\n// Note: The code assumes that blobs are sorted by the left side x!\n// This will not work (as well) if the blobs are sorted by center/right.\nvoid TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition *part) {\n  ASSERT_HOST(part != nullptr);\n  // Bye bye empty partitions!\n  if (part->boxes()->empty()) {\n    delete part;\n    return;\n  }\n\n  // The AllowBlob function prevents this.\n  ASSERT_HOST(part->median_width() > 0);\n  const double kThreshold = part->median_width() * kSplitPartitionSize;\n\n  ColPartition *right_part = part;\n  bool found_split = true;\n  while (found_split) {\n    found_split = false;\n    BLOBNBOX_C_IT box_it(right_part->boxes());\n    // Blobs are sorted left side first. If blobs overlap,\n    // the previous blob may have a \"more right\" right side.\n    // Account for this by always keeping the largest \"right\"\n    // so far.\n    int previous_right = INT32_MIN;\n\n    // Look for the next split in the partition.\n    for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {\n      const TBOX &box = box_it.data()->bounding_box();\n      if (previous_right != INT32_MIN &&\n          box.left() - previous_right > kThreshold) {\n        // We have a split position. 
Split the partition in two pieces.\n        // Insert the left piece in the grid and keep processing the right.\n        int mid_x = (box.left() + previous_right) / 2;\n        ColPartition *left_part = right_part;\n        right_part = left_part->SplitAt(mid_x);\n\n        InsertFragmentedTextPartition(left_part);\n        found_split = true;\n        break;\n      }\n\n      // The right side of the previous blobs.\n      previous_right = std::max(previous_right, static_cast<int>(box.right()));\n    }\n  }\n  // When a split is not found, the right part is minimized\n  // as much as possible, so process it.\n  InsertFragmentedTextPartition(right_part);\n}\n\n// Some simple criteria to filter out now. We want to make sure the\n// average blob size in the partition is consistent with the\n// global page stats.\n// The area metric will almost always pass for multi-blob partitions.\n// It is useful when filtering out noise caused by an isolated blob.\nbool TableFinder::AllowTextPartition(const ColPartition &part) const {\n  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;\n  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;\n  const int median_area = global_median_xheight_ * global_median_blob_width_;\n  const double kAreaPerBlobRequired = median_area * kAllowTextArea;\n  // Keep comparisons strictly greater to disallow 0!\n  return part.median_height() > kHeightRequired &&\n         part.median_width() > kWidthRequired &&\n         part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();\n}\n\n// Same as above, applied to blobs. 
Keep in mind that\n// leaders, commas, and periods are important in tables.\nbool TableFinder::AllowBlob(const BLOBNBOX &blob) const {\n  const TBOX &box = blob.bounding_box();\n  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;\n  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;\n  const int median_area = global_median_xheight_ * global_median_blob_width_;\n  const double kAreaRequired = median_area * kAllowBlobArea;\n  // Keep comparisons strictly greater to disallow 0!\n  return box.height() > kHeightRequired && box.width() > kWidthRequired &&\n         box.area() > kAreaRequired;\n}\n\n// TODO(nbeato): The grid that makes the window doesn't seem to matter.\n// The only downside is that window messages will be caught by\n// clean_part_grid_ instead of a useful object. This is a temporary solution\n// for the debug windows created by the TableFinder.\n#ifndef GRAPHICS_DISABLED\nScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) {\n  return clean_part_grid_.MakeWindow(x, y, window_name);\n}\n#endif\n\n// Make single-column blocks from good_columns_ partitions.\nvoid TableFinder::GetColumnBlocks(ColPartitionSet **all_columns,\n                                  ColSegment_LIST *column_blocks) {\n  for (int i = 0; i < gridheight(); ++i) {\n    ColPartitionSet *columns = all_columns[i];\n    if (columns != nullptr) {\n      ColSegment_LIST new_blocks;\n      // Get boxes from the current vertical position on the grid\n      columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),\n                              &new_blocks);\n      // Merge the new_blocks boxes into column_blocks if they are well-aligned\n      GroupColumnBlocks(&new_blocks, column_blocks);\n    }\n  }\n}\n\n// Merge column segments into the current list if they are well aligned.\nvoid TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks,\n                                    ColSegment_LIST *column_blocks) {\n  
ColSegment_IT src_it(new_blocks);\n  ColSegment_IT dest_it(column_blocks);\n  // iterate through the source list\n  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {\n    ColSegment *src_seg = src_it.data();\n    const TBOX &src_box = src_seg->bounding_box();\n    bool match_found = false;\n    // iterate through the destination list to find a matching column block\n    for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {\n      ColSegment *dest_seg = dest_it.data();\n      TBOX dest_box = dest_seg->bounding_box();\n      if (ConsecutiveBoxes(src_box, dest_box)) {\n        // If matching block is found, insert the current block into it\n        // and delete the source block.\n        dest_seg->InsertBox(src_box);\n        match_found = true;\n        delete src_it.extract();\n        break;\n      }\n    }\n    // If no match is found, just append the source block to column_blocks\n    if (!match_found) {\n      dest_it.add_after_then_move(src_it.extract());\n    }\n  }\n}\n\n// are the two boxes immediate neighbors along the vertical direction\nbool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {\n  int x_margin = 20;\n  int y_margin = 5;\n  return (abs(b1.left() - b2.left()) < x_margin) &&\n         (abs(b1.right() - b2.right()) < x_margin) &&\n         (abs(b1.top() - b2.bottom()) < y_margin ||\n          abs(b2.top() - b1.bottom()) < y_margin);\n}\n\n// Set up info for clean_part_grid_ partitions to be valid during detection\n// code.\nvoid TableFinder::InitializePartitions(ColPartitionSet **all_columns) {\n  FindNeighbors();\n  SetPartitionSpacings(&clean_part_grid_, all_columns);\n  SetGlobalSpacings(&clean_part_grid_);\n}\n\n// Set left, right and top, bottom spacings of each colpartition.\nvoid TableFinder::SetPartitionSpacings(ColPartitionGrid *grid,\n                                       ColPartitionSet **all_columns) {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch 
gsearch(grid);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    ColPartitionSet *columns = all_columns[gsearch.GridY()];\n    TBOX box = part->bounding_box();\n    int y = part->MidY();\n    ColPartition *left_column = columns->ColumnContaining(box.left(), y);\n    ColPartition *right_column = columns->ColumnContaining(box.right(), y);\n    // set distance from left column as space to the left\n    if (left_column) {\n      int left_space = std::max(0, box.left() - left_column->LeftAtY(y));\n      part->set_space_to_left(left_space);\n    }\n    // set distance from right column as space to the right\n    if (right_column) {\n      int right_space = std::max(0, right_column->RightAtY(y) - box.right());\n      part->set_space_to_right(right_space);\n    }\n\n    // Look for images that may be closer.\n    // NOTE: used to be part_grid_, might cause issues now\n    ColPartitionGridSearch hsearch(grid);\n    hsearch.StartSideSearch(box.left(), box.bottom(), box.top());\n    ColPartition *neighbor = nullptr;\n    while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {\n      if (neighbor->type() == PT_PULLOUT_IMAGE ||\n          neighbor->type() == PT_FLOWING_IMAGE ||\n          neighbor->type() == PT_HEADING_IMAGE) {\n        int right = neighbor->bounding_box().right();\n        if (right < box.left()) {\n          int space = std::min(box.left() - right, part->space_to_left());\n          part->set_space_to_left(space);\n        }\n      }\n    }\n    hsearch.StartSideSearch(box.left(), box.bottom(), box.top());\n    neighbor = nullptr;\n    while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {\n      if (neighbor->type() == PT_PULLOUT_IMAGE ||\n          neighbor->type() == PT_FLOWING_IMAGE ||\n          neighbor->type() == PT_HEADING_IMAGE) {\n        int left = neighbor->bounding_box().left();\n        if (left > box.right()) {\n          int space = std::min(left - 
box.right(), part->space_to_right());\n          part->set_space_to_right(space);\n        }\n      }\n    }\n\n    ColPartition *upper_part = part->SingletonPartner(true);\n    if (upper_part) {\n      int space =\n          std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -\n                                       part->bounding_box().bottom()));\n      part->set_space_above(space);\n    } else {\n      // TODO(nbeato): What constitutes a good value?\n      // 0 is the default value when not set, explicitly noting it needs to\n      // be something else.\n      part->set_space_above(INT32_MAX);\n    }\n\n    ColPartition *lower_part = part->SingletonPartner(false);\n    if (lower_part) {\n      int space =\n          std::max(0, static_cast<int>(part->bounding_box().bottom() -\n                                       lower_part->bounding_box().bottom()));\n      part->set_space_below(space);\n    } else {\n      // TODO(nbeato): What constitutes a good value?\n      // 0 is the default value when not set, explicitly noting it needs to\n      // be something else.\n      part->set_space_below(INT32_MAX);\n    }\n  }\n}\n\n// Set spacing and closest neighbors above and below a given colpartition.\nvoid TableFinder::SetVerticalSpacing(ColPartition *part) {\n  TBOX box = part->bounding_box();\n  int top_range =\n      std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));\n  int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,\n                              static_cast<int>(bleft().y()));\n  box.set_top(top_range);\n  box.set_bottom(bottom_range);\n\n  TBOX part_box = part->bounding_box();\n  // Start a rect search\n  ColPartitionGridSearch rectsearch(&clean_part_grid_);\n  rectsearch.StartRectSearch(box);\n  ColPartition *neighbor;\n  int min_space_above = kMaxVerticalSpacing;\n  int min_space_below = kMaxVerticalSpacing;\n  ColPartition *above_neighbor = nullptr;\n  ColPartition *below_neighbor = nullptr;\n  while 
((neighbor = rectsearch.NextRectSearch()) != nullptr) {\n    if (neighbor == part) {\n      continue;\n    }\n    TBOX neighbor_box = neighbor->bounding_box();\n    if (neighbor_box.major_x_overlap(part_box)) {\n      int gap = abs(part->median_bottom() - neighbor->median_bottom());\n      // If neighbor is below current partition\n      if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {\n        min_space_below = gap;\n        below_neighbor = neighbor;\n      } // If neighbor is above current partition\n      else if (part_box.top() < neighbor_box.bottom() &&\n               gap < min_space_above) {\n        min_space_above = gap;\n        above_neighbor = neighbor;\n      }\n    }\n  }\n  part->set_space_above(min_space_above);\n  part->set_space_below(min_space_below);\n  part->set_nearest_neighbor_above(above_neighbor);\n  part->set_nearest_neighbor_below(below_neighbor);\n}\n\n// Set global spacing and x-height estimates\nvoid TableFinder::SetGlobalSpacings(ColPartitionGrid *grid) {\n  STATS xheight_stats(0, kMaxVerticalSpacing);\n  STATS width_stats(0, kMaxBlobWidth);\n  STATS ledding_stats(0, kMaxVerticalSpacing);\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(grid);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.\n    // ComputeLimits needs to get called somewhere outside of TableFinder\n    // to make sure the partitions are properly initialized.\n    // When this is called, SmoothPartitionPartners dies in an assert after\n    // table find runs. 
Alternative solution.\n    // part->ComputeLimits();\n    if (part->IsTextType()) {\n      // xheight_stats.add(part->median_height(), part->boxes_count());\n      // width_stats.add(part->median_width(), part->boxes_count());\n\n      // This loop can be removed when above issues are fixed.\n      // Replace it with the 2 lines commented out above.\n      BLOBNBOX_C_IT it(part->boxes());\n      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n        xheight_stats.add(it.data()->bounding_box().height(), 1);\n        width_stats.add(it.data()->bounding_box().width(), 1);\n      }\n\n      ledding_stats.add(part->space_above(), 1);\n      ledding_stats.add(part->space_below(), 1);\n    }\n  }\n  // Set estimates based on median of statistics obtained\n  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));\n  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));\n  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));\n#ifndef GRAPHICS_DISABLED\n  if (textord_tablefind_show_stats) {\n    const char *kWindowName = \"X-height (R), X-width (G), and ledding (B)\";\n    ScrollView *stats_win = MakeWindow(500, 10, kWindowName);\n    xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);\n    width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);\n    ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);\n  }\n#endif // !GRAPHICS_DISABLED\n}\n\nvoid TableFinder::set_global_median_xheight(int xheight) {\n  global_median_xheight_ = xheight;\n}\nvoid TableFinder::set_global_median_blob_width(int width) {\n  global_median_blob_width_ = width;\n}\nvoid TableFinder::set_global_median_ledding(int ledding) {\n  global_median_ledding_ = ledding;\n}\n\nvoid TableFinder::FindNeighbors() {\n  ColPartitionGridSearch gsearch(&clean_part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    // 
TODO(nbeato): Rename this function, meaning is different now.\n    // IT is finding nearest neighbors its own way\n    // SetVerticalSpacing(part);\n\n    ColPartition *upper = part->SingletonPartner(true);\n    if (upper) {\n      part->set_nearest_neighbor_above(upper);\n    }\n\n    ColPartition *lower = part->SingletonPartner(false);\n    if (lower) {\n      part->set_nearest_neighbor_below(lower);\n    }\n  }\n}\n\n// High level interface. Input is an unmarked ColPartitionGrid\n// (namely, clean_part_grid_). Partitions are identified using local\n// information and filter/smoothed. The function exit should contain\n// a good sampling of the table partitions.\nvoid TableFinder::MarkTablePartitions() {\n  MarkPartitionsUsingLocalInformation();\n#ifndef GRAPHICS_DISABLED\n  if (textord_tablefind_show_mark) {\n    ScrollView *table_win = MakeWindow(300, 300, \"Initial Table Partitions\");\n    DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);\n    DisplayColPartitions(table_win, &leader_and_ruling_grid_,\n                         ScrollView::AQUAMARINE);\n  }\n#endif\n  FilterFalseAlarms();\n#ifndef GRAPHICS_DISABLED\n  if (textord_tablefind_show_mark) {\n    ScrollView *table_win = MakeWindow(600, 300, \"Filtered Table Partitions\");\n    DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);\n    DisplayColPartitions(table_win, &leader_and_ruling_grid_,\n                         ScrollView::AQUAMARINE);\n  }\n#endif\n  SmoothTablePartitionRuns();\n#ifndef GRAPHICS_DISABLED\n  if (textord_tablefind_show_mark) {\n    ScrollView *table_win = MakeWindow(900, 300, \"Smoothed Table Partitions\");\n    DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);\n    DisplayColPartitions(table_win, &leader_and_ruling_grid_,\n                         ScrollView::AQUAMARINE);\n  }\n#endif\n  FilterFalseAlarms();\n#ifndef GRAPHICS_DISABLED\n  if (textord_tablefind_show_mark || textord_show_tables) {\n    ScrollView *table_win = 
MakeWindow(900, 300, \"Final Table Partitions\");\n    DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);\n    DisplayColPartitions(table_win, &leader_and_ruling_grid_,\n                         ScrollView::AQUAMARINE);\n  }\n#endif\n}\n\n// These types of partitions are marked as table partitions:\n//  1- Partitions that have at lease one large gap between words\n//  2- Partitions that consist of only one word (no significant gap\n//     between components)\n//  3- Partitions that vertically overlap with other partitions within the\n//     same column.\n//  4- Partitions with leaders before/after them.\nvoid TableFinder::MarkPartitionsUsingLocalInformation() {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(&clean_part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (!part->IsTextType()) { // Only consider text partitions\n      continue;\n    }\n    // Only consider partitions in dominant font size or smaller\n    if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) {\n      continue;\n    }\n    // Mark partitions with a large gap, or no significant gap as\n    // table partitions.\n    // Comments: It produces several false alarms at:\n    //  - last line of a paragraph (fixed)\n    //  - single word section headings\n    //  - page headers and footers\n    //  - numbered equations\n    //  - line drawing regions\n    // TODO(faisal): detect and fix above-mentioned cases\n    if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {\n      part->set_table_type();\n    }\n  }\n}\n\n// Check if the partition has at least one large gap between words or no\n// significant gap at all\nbool TableFinder::HasWideOrNoInterWordGap(ColPartition *part) const {\n  // Should only get text partitions.\n  ASSERT_HOST(part->IsTextType());\n  // Blob access\n  BLOBNBOX_CLIST *part_boxes = part->boxes();\n  
BLOBNBOX_C_IT it(part_boxes);\n  // Check if this is a relatively small partition (such as a single word)\n  if (part->bounding_box().width() <\n          kMinBoxesInTextPartition * part->median_height() &&\n      part_boxes->length() < kMinBoxesInTextPartition) {\n    return true;\n  }\n\n  // Variables used to compute inter-blob spacing.\n  int previous_x1 = -1;\n  // Stores the maximum gap detected.\n  int largest_partition_gap_found = -1;\n  // Text partition gap limits. If this is text (and not a table),\n  // there should be at least one gap larger than min_gap and no gap\n  // larger than max_gap.\n  const double max_gap = kMaxGapInTextPartition * part->median_height();\n  const double min_gap = kMinMaxGapInTextPartition * part->median_height();\n\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    int current_x0 = blob->bounding_box().left();\n    int current_x1 = blob->bounding_box().right();\n    if (previous_x1 != -1) {\n      int gap = current_x0 - previous_x1;\n\n      // TODO(nbeato): Boxes may overlap? Huh?\n      // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors\n      // on the top right of the page are filtered out with this line.\n      // Note 2: Iterating over blobs in a partition, so we are looking for\n      // spacing between the words.\n      if (gap < 0) {\n        // More likely case, the blobs slightly overlap. 
This can happen\n        // with diacritics (accents) or broken alphabet symbols (characters).\n        // Merge boxes together by taking max of right sides.\n        if (-gap < part->median_height() * kMaxBlobOverlapFactor) {\n          previous_x1 = std::max(previous_x1, current_x1);\n          continue;\n        }\n        // Extreme case, blobs overlap significantly in the same partition...\n        // This should not happen often (if at all), but it does.\n        // TODO(nbeato): investigate cases when this happens.\n        else {\n          // The behavior before was to completely ignore this case.\n        }\n      }\n\n      // If a large enough gap is found, mark it as a table cell (return true)\n      if (gap > max_gap) {\n        return true;\n      }\n      if (gap > largest_partition_gap_found) {\n        largest_partition_gap_found = gap;\n      }\n    }\n    previous_x1 = current_x1;\n  }\n  // Since no large gap was found, return false if the partition is too\n  // long to be a data cell\n  if (part->bounding_box().width() >\n          kMaxBoxesInDataPartition * part->median_height() ||\n      part_boxes->length() > kMaxBoxesInDataPartition) {\n    return false;\n  }\n\n  // A partition may be a single blob. In this case, it's an isolated symbol\n  // or non-text (such as a ruling or image).\n  // Detect these as table partitions? Shouldn't this be case by case?\n  // The behavior before was to ignore this, making max_partition_gap < 0\n  // and implicitly return true. Just making it explicit.\n  if (largest_partition_gap_found == -1) {\n    return true;\n  }\n\n  // return true if the maximum gap found is smaller than the minimum allowed\n  // max_gap in a text partition. This indicates that there is no significant\n  // space in the partition, hence it is likely a single word.\n  return largest_partition_gap_found < min_gap;\n}\n\n// A criteria for possible tables is that a table may have leaders\n// between data cells. 
An aggressive solution to find such tables is to\n// explicitly mark partitions that have adjacent leaders.\n// Note that this includes overlapping leaders. However, it does not\n// include leaders in different columns on the page.\n// Possible false-positive will include lists, such as a table of contents.\n// As these arise, the aggressive nature of this search may need to be\n// trimmed down.\nbool TableFinder::HasLeaderAdjacent(const ColPartition &part) {\n  if (part.flow() == BTFT_LEADER) {\n    return true;\n  }\n  // Search range is left and right bounded by an offset of the\n  // median xheight. This offset is to allow some tolerance to the\n  // the leaders on the page in the event that the alignment is still\n  // a bit off.\n  const TBOX &box = part.bounding_box();\n  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;\n  const int top = box.top() + search_size;\n  const int bottom = box.bottom() - search_size;\n  ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);\n  for (int direction = 0; direction < 2; ++direction) {\n    bool right_to_left = (direction == 0);\n    int x = right_to_left ? 
box.right() : box.left();\n    hsearch.StartSideSearch(x, bottom, top);\n    ColPartition *leader = nullptr;\n    while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {\n      // The leader could be a horizontal ruling in the grid.\n      // Make sure it is actually a leader.\n      if (leader->flow() != BTFT_LEADER) {\n        continue;\n      }\n      // This should not happen, they are in different grids.\n      ASSERT_HOST(&part != leader);\n      // Make sure the leader shares a page column with the partition,\n      // otherwise we are spreading across columns.\n      if (!part.IsInSameColumnAs(*leader)) {\n        break;\n      }\n      // There should be a significant vertical overlap\n      if (!leader->VSignificantCoreOverlap(part)) {\n        continue;\n      }\n      // Leader passed all tests, so it is adjacent.\n      return true;\n    }\n  }\n  // No leaders are adjacent to the given partition.\n  return false;\n}\n\n// Filter individual text partitions marked as table partitions\n// consisting of paragraph endings, small section headings, and\n// headers and footers.\nvoid TableFinder::FilterFalseAlarms() {\n  FilterParagraphEndings();\n  FilterHeaderAndFooter();\n  // TODO(nbeato): Fully justified text as non-table?\n}\n\nvoid TableFinder::FilterParagraphEndings() {\n  // Detect last line of paragraph\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(&clean_part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->type() != PT_TABLE) {\n      continue; // Consider only table partitions\n    }\n\n    // Paragraph ending should have flowing text above it.\n    ColPartition *upper_part = part->nearest_neighbor_above();\n    if (!upper_part) {\n      continue;\n    }\n    if (upper_part->type() != PT_FLOWING_TEXT) {\n      continue;\n    }\n    if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) 
{\n      continue;\n    }\n    // Check if its the last line of a paragraph.\n    // In most cases, a paragraph ending should be left-aligned to text line\n    // above it. Sometimes, it could be a 2 line paragraph, in which case\n    // the line above it is indented.\n    // To account for that, check if the partition center is to\n    // the left of the one above it.\n    int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;\n    int upper_mid = (upper_part->bounding_box().left() +\n                     upper_part->bounding_box().right()) /\n                    2;\n    int current_spacing = 0; // spacing of the current line to margin\n    int upper_spacing = 0;   // spacing of the previous line to the margin\n    if (left_to_right_language_) {\n      // Left to right languages, use mid - left to figure out the distance\n      // the middle is from the left margin.\n      int left = std::min(part->bounding_box().left(),\n                          upper_part->bounding_box().left());\n      current_spacing = mid - left;\n      upper_spacing = upper_mid - left;\n    } else {\n      // Right to left languages, use right - mid to figure out the distance\n      // the middle is from the right margin.\n      int right = std::max(part->bounding_box().right(),\n                           upper_part->bounding_box().right());\n      current_spacing = right - mid;\n      upper_spacing = right - upper_mid;\n    }\n    if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {\n      continue;\n    }\n\n    // Paragraphs should have similar fonts.\n    if (!part->MatchingSizes(*upper_part) ||\n        !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,\n                                   kStrokeWidthConstantTolerance)) {\n      continue;\n    }\n\n    // The last line of a paragraph should be left aligned.\n    // TODO(nbeato): This would be untrue if the text was right aligned.\n    // How often is that?\n    if 
(part->space_to_left() >\n        kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) {\n      continue;\n    }\n    // The line above it should be right aligned (assuming justified format).\n    // Since we can't assume justified text, we compare whitespace to text.\n    // The above line should have majority spanning text (or the current\n    // line could have fit on the previous line). So compare\n    // whitespace to text.\n    if (upper_part->bounding_box().width() <\n        kMinParagraphEndingTextToWhitespaceRatio *\n            upper_part->space_to_right()) {\n      continue;\n    }\n\n    // Ledding above the line should be less than ledding below\n    if (part->space_above() >= part->space_below() ||\n        part->space_above() > 2 * global_median_ledding_) {\n      continue;\n    }\n\n    // If all checks failed, it is probably text.\n    part->clear_table_type();\n  }\n}\n\nvoid TableFinder::FilterHeaderAndFooter() {\n  // Consider top-most text colpartition as header and bottom most as footer\n  ColPartition *header = nullptr;\n  ColPartition *footer = nullptr;\n  int max_top = INT32_MIN;\n  int min_bottom = INT32_MAX;\n  ColPartitionGridSearch gsearch(&clean_part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (!part->IsTextType()) {\n      continue; // Consider only text partitions\n    }\n    int top = part->bounding_box().top();\n    int bottom = part->bounding_box().bottom();\n    if (top > max_top) {\n      max_top = top;\n      header = part;\n    }\n    if (bottom < min_bottom) {\n      min_bottom = bottom;\n      footer = part;\n    }\n  }\n  if (header) {\n    header->clear_table_type();\n  }\n  if (footer) {\n    footer->clear_table_type();\n  }\n}\n\n// Mark all ColPartitions as table cells that have a table cell above\n// and below them\n// TODO(faisal): This is too aggressive at the moment. 
The method needs to\n// consider spacing and alignment as well. Detection of false alarm table cells\n// should also be done as part of it.\nvoid TableFinder::SmoothTablePartitionRuns() {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(&clean_part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {\n      continue; // Consider only text partitions\n    }\n    ColPartition *upper_part = part->nearest_neighbor_above();\n    ColPartition *lower_part = part->nearest_neighbor_below();\n    if (!upper_part || !lower_part) {\n      continue;\n    }\n    if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {\n      part->set_table_type();\n    }\n  }\n\n  // Pass 2, do the opposite. If both the upper and lower neighbors\n  // exist and are not tables, this probably shouldn't be a table.\n  gsearch.StartFullSearch();\n  part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->type() != PT_TABLE) {\n      continue; // Consider only text partitions\n    }\n    ColPartition *upper_part = part->nearest_neighbor_above();\n    ColPartition *lower_part = part->nearest_neighbor_below();\n\n    // table can't be by itself\n    if ((upper_part && upper_part->type() != PT_TABLE) &&\n        (lower_part && lower_part->type() != PT_TABLE)) {\n      part->clear_table_type();\n    }\n  }\n}\n\n// Set the type of a column segment based on the ratio of table to text cells\nvoid TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) {\n  ColSegment_IT it(column_blocks);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColSegment *seg = it.data();\n    TBOX box = seg->bounding_box();\n    int num_table_cells = 0;\n    int num_text_cells = 0;\n    ColPartitionGridSearch rsearch(&clean_part_grid_);\n    rsearch.SetUniqueMode(true);\n    
rsearch.StartRectSearch(box);\n    ColPartition *part = nullptr;\n    while ((part = rsearch.NextRectSearch()) != nullptr) {\n      if (part->type() == PT_TABLE) {\n        num_table_cells++;\n      } else if (part->type() == PT_FLOWING_TEXT) {\n        num_text_cells++;\n      }\n    }\n    // If a column block has no text or table partition in it, it is not needed\n    // for table detection.\n    if (!num_table_cells && !num_text_cells) {\n      delete it.extract();\n    } else {\n      seg->set_num_table_cells(num_table_cells);\n      seg->set_num_text_cells(num_text_cells);\n      // set column type based on the ratio of table to text cells\n      seg->set_type();\n    }\n  }\n}\n\n// Move column blocks to grid\nvoid TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,\n                                        ColSegmentGrid *col_seg_grid) {\n  ColSegment_IT it(segments);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColSegment *seg = it.extract();\n    col_seg_grid->InsertBBox(true, true, seg);\n  }\n}\n\n// Merge column blocks if a split is detected due to the presence of a\n// table. A text block is considered split if it has multiple\n// neighboring blocks above/below it, and at least one of the\n// neighboring blocks is of table type (has a high density of table\n// partitions). 
In this case neighboring blocks in the direction\n// (above/below) of the table block are merged with the text block.\n\n// Comment: This method does not handle split due to a full page table\n// since table columns in this case do not have a text column on which\n// split decision can be based.\nvoid TableFinder::GridMergeColumnBlocks() {\n  int margin = gridsize();\n\n  // Iterate the Column Blocks in the grid.\n  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(\n      &col_seg_grid_);\n  gsearch.StartFullSearch();\n  ColSegment *seg;\n  while ((seg = gsearch.NextFullSearch()) != nullptr) {\n    if (seg->type() != COL_TEXT) {\n      continue; // only consider text blocks for split detection\n    }\n    bool neighbor_found = false;\n    bool modified = false; // Modified at least once\n    // keep expanding current box as long as neighboring table columns\n    // are found above or below it.\n    do {\n      TBOX box = seg->bounding_box();\n      // slightly expand the search region vertically\n      int top_range =\n          std::min(box.top() + margin, static_cast<int>(tright().y()));\n      int bottom_range =\n          std::max(box.bottom() - margin, static_cast<int>(bleft().y()));\n      box.set_top(top_range);\n      box.set_bottom(bottom_range);\n      neighbor_found = false;\n      GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(\n          &col_seg_grid_);\n      rectsearch.StartRectSearch(box);\n      ColSegment *neighbor = nullptr;\n      while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {\n        if (neighbor == seg) {\n          continue;\n        }\n        const TBOX &neighbor_box = neighbor->bounding_box();\n        // If the neighbor box significantly overlaps with the current\n        // box (due to the expansion of the current box in the\n        // previous iteration of this loop), remove the neighbor box\n        // and expand the current box to include it.\n        if 
(neighbor_box.overlap_fraction(box) >= 0.9) {\n          seg->InsertBox(neighbor_box);\n          modified = true;\n          rectsearch.RemoveBBox();\n          gsearch.RepositionIterator();\n          delete neighbor;\n          continue;\n        }\n        // Only expand if the neighbor box is of table type\n        if (neighbor->type() != COL_TABLE) {\n          continue;\n        }\n        // Insert the neighbor box into the current column block\n        if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {\n          seg->InsertBox(neighbor_box);\n          neighbor_found = true;\n          modified = true;\n          rectsearch.RemoveBBox();\n          gsearch.RepositionIterator();\n          delete neighbor;\n        }\n      }\n    } while (neighbor_found);\n    if (modified) {\n      // Because the box has changed, it has to be removed first.\n      gsearch.RemoveBBox();\n      col_seg_grid_.InsertBBox(true, true, seg);\n      gsearch.RepositionIterator();\n    }\n  }\n}\n\n// Group horizontally overlapping table partitions into table columns.\n// TODO(faisal): This is too aggressive at the moment. The method should\n// consider more attributes to group table partitions together. 
Some common\n// errors are:\n//  1- page number is merged with a table column above it even\n//      if there is a large vertical gap between them.\n//  2- column headers go on to catch one of the columns arbitrarily\n//  3- an isolated noise blob near page top or bottom merges with the table\n//     column below/above it\n//  4- cells from two vertically adjacent tables merge together to make a\n//     single column resulting in merging of the two tables\nvoid TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {\n  ColSegment_IT it(table_columns);\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(&clean_part_grid_);\n  gsearch.StartFullSearch();\n  ColPartition *part;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->inside_table_column() || part->type() != PT_TABLE) {\n      continue; // prevent a partition to be assigned to multiple columns\n    }\n    const TBOX &box = part->bounding_box();\n    auto *col = new ColSegment();\n    col->InsertBox(box);\n    part->set_inside_table_column(true);\n    // Start a search below the current cell to find bottom neighbours\n    // Note: a full search will always process things above it first, so\n    // this should be starting at the highest cell and working its way down.\n    ColPartitionGridSearch vsearch(&clean_part_grid_);\n    vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());\n    ColPartition *neighbor = nullptr;\n    bool found_neighbours = false;\n    while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {\n      // only consider neighbors not assigned to any column yet\n      if (neighbor->inside_table_column()) {\n        continue;\n      }\n      // Horizontal lines should not break the flow\n      if (neighbor->IsHorizontalLine()) {\n        continue;\n      }\n      // presence of a non-table neighbor marks the end of current\n      // table column\n      if (neighbor->type() != PT_TABLE) {\n        break;\n      }\n    
  // add the neighbor partition to the table column\n      const TBOX &neighbor_box = neighbor->bounding_box();\n      col->InsertBox(neighbor_box);\n      neighbor->set_inside_table_column(true);\n      found_neighbours = true;\n    }\n    if (found_neighbours) {\n      it.add_after_then_move(col);\n    } else {\n      part->set_inside_table_column(false);\n      delete col;\n    }\n  }\n}\n\n// Mark regions in a column that are x-bounded by the column boundaries and\n// y-bounded by the table columns' projection on the y-axis as table regions\nvoid TableFinder::GetTableRegions(ColSegment_LIST *table_columns,\n                                  ColSegment_LIST *table_regions) {\n  ColSegment_IT cit(table_columns);\n  ColSegment_IT rit(table_regions);\n  // Iterate through column blocks\n  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(\n      &col_seg_grid_);\n  gsearch.StartFullSearch();\n  ColSegment *part;\n  int page_height = tright().y() - bleft().y();\n  ASSERT_HOST(page_height > 0);\n  // create a bool array to hold projection on y-axis\n  bool *table_region = new bool[page_height];\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &part_box = part->bounding_box();\n    // reset the projection array\n    for (int i = 0; i < page_height; i++) {\n      table_region[i] = false;\n    }\n    // iterate through all table columns to find regions in the current\n    // page column block\n    cit.move_to_first();\n    for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {\n      TBOX col_box = cit.data()->bounding_box();\n      // find intersection region of table column and page column\n      TBOX intersection_box = col_box.intersection(part_box);\n      // project table column on the y-axis\n      for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {\n        table_region[i - bleft().y()] = true;\n      }\n    }\n    // set x-limits of table regions to page column width\n    TBOX 
current_table_box;\n    current_table_box.set_left(part_box.left());\n    current_table_box.set_right(part_box.right());\n    // go through the y-axis projection to find runs of table\n    // regions. Each run makes one table region.\n    for (int i = 1; i < page_height; i++) {\n      // detect start of a table region\n      if (!table_region[i - 1] && table_region[i]) {\n        current_table_box.set_bottom(i + bleft().y());\n      }\n      // TODO(nbeato): Is it guaranteed that the last row is not a table region?\n      // detect end of a table region\n      if (table_region[i - 1] && !table_region[i]) {\n        current_table_box.set_top(i + bleft().y());\n        if (!current_table_box.null_box()) {\n          auto *seg = new ColSegment();\n          seg->InsertBox(current_table_box);\n          rit.add_after_then_move(seg);\n        }\n      }\n    }\n  }\n  delete[] table_region;\n}\n\n// Merge table regions corresponding to tables spanning multiple columns if\n// there is a colpartition (horizontal ruling line or normal text) that\n// touches both regions.\n// TODO(faisal): A rare error occurs if there are two horizontally adjacent\n// tables with aligned ruling lines. 
In this case, line finder returns a\n// single line and hence the tables get merged together\nvoid TableFinder::GridMergeTableRegions() {\n  // Iterate the table regions in the grid.\n  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(\n      &table_grid_);\n  gsearch.StartFullSearch();\n  ColSegment *seg = nullptr;\n  while ((seg = gsearch.NextFullSearch()) != nullptr) {\n    bool neighbor_found = false;\n    bool modified = false; // Modified at least once\n    do {\n      // Start a rectangle search x-bounded by the image and y by the table\n      const TBOX &box = seg->bounding_box();\n      TBOX search_region(box);\n      search_region.set_left(bleft().x());\n      search_region.set_right(tright().x());\n      neighbor_found = false;\n      GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(\n          &table_grid_);\n      rectsearch.StartRectSearch(search_region);\n      ColSegment *neighbor = nullptr;\n      while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {\n        if (neighbor == seg) {\n          continue;\n        }\n        const TBOX &neighbor_box = neighbor->bounding_box();\n        // Check if a neighbor box has a large overlap with the table\n        // region.  
This may happen as a result of merging two table\n        // regions in the previous iteration.\n        if (neighbor_box.overlap_fraction(box) >= 0.9) {\n          seg->InsertBox(neighbor_box);\n          rectsearch.RemoveBBox();\n          gsearch.RepositionIterator();\n          delete neighbor;\n          modified = true;\n          continue;\n        }\n        // Check if two table regions belong together based on a common\n        // horizontal ruling line\n        if (BelongToOneTable(box, neighbor_box)) {\n          seg->InsertBox(neighbor_box);\n          neighbor_found = true;\n          modified = true;\n          rectsearch.RemoveBBox();\n          gsearch.RepositionIterator();\n          delete neighbor;\n        }\n      }\n    } while (neighbor_found);\n    if (modified) {\n      // Because the box has changed, it has to be removed first.\n      gsearch.RemoveBBox();\n      table_grid_.InsertBBox(true, true, seg);\n      gsearch.RepositionIterator();\n    }\n  }\n}\n\n// Decide if two table regions belong to one table based on a common\n// horizontal ruling line or another colpartition\nbool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {\n  // Check the obvious case. 
Most likely not true because overlapping boxes\n  // should already be merged, but seems like a good thing to do in case things\n  // change.\n  if (box1.overlap(box2)) {\n    return true;\n  }\n  // Check for ColPartitions spanning both table regions\n  TBOX bbox = box1.bounding_union(box2);\n  // Start a rect search on bbox\n  ColPartitionGridSearch rectsearch(&clean_part_grid_);\n  rectsearch.StartRectSearch(bbox);\n  ColPartition *part = nullptr;\n  while ((part = rectsearch.NextRectSearch()) != nullptr) {\n    const TBOX &part_box = part->bounding_box();\n    // return true if a colpartition spanning both table regions is found\n    if (part_box.overlap(box1) && part_box.overlap(box2) &&\n        !part->IsImageType()) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Adjust table boundaries by:\n//  - building a tight bounding box around all ColPartitions contained in it.\n//  - expanding table boundaries to include all colpartitions that overlap the\n//    table by more than half of their area\n//  - expanding table boundaries to include nearby horizontal rule lines\n//  - expanding table vertically to include left out column headers\n// TODO(faisal): Expansion of table boundaries is quite aggressive. It usually\n//               makes following errors:\n//  1- horizontal lines consisting of underlines are included in the table if\n//     they are close enough\n//  2- horizontal lines originating from noise tend to get merged with a table\n//     near the top of the page\n//  3- the criteria for including horizontal lines is very generous. 
Many times\n//     horizontal lines separating headers and footers get merged with a\n//     single-column table in a multi-column page thereby including text\n//     from the neighboring column inside the table\n//  4- the criteria for including left out column headers also tends to\n//     occasionally include text-lines above the tables, typically from\n//     table caption\nvoid TableFinder::AdjustTableBoundaries() {\n  // Iterate the table regions in the grid\n  ColSegment_CLIST adjusted_tables;\n  ColSegment_C_IT it(&adjusted_tables);\n  ColSegmentGridSearch gsearch(&table_grid_);\n  gsearch.StartFullSearch();\n  ColSegment *table = nullptr;\n  while ((table = gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &table_box = table->bounding_box();\n    TBOX grown_box = table_box;\n    GrowTableBox(table_box, &grown_box);\n    // To prevent a table from expanding again, do not insert the\n    // modified box back to the grid. Instead move it to a list and\n    // and remove it from the grid. The list is moved later back to the grid.\n    if (!grown_box.null_box()) {\n      auto *col = new ColSegment();\n      col->InsertBox(grown_box);\n      it.add_after_then_move(col);\n    }\n    gsearch.RemoveBBox();\n    delete table;\n  }\n  // clear table grid to move final tables in it\n  // TODO(nbeato): table_grid_ should already be empty. The above loop\n  // removed everything. Maybe just assert it is empty?\n  table_grid_.Clear();\n  it.move_to_first();\n  // move back final tables to table_grid_\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColSegment *seg = it.extract();\n    table_grid_.InsertBBox(true, true, seg);\n  }\n}\n\nvoid TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) {\n  // TODO(nbeato): The growing code is a bit excessive right now.\n  // By removing these lines, the partitions considered need\n  // to have some overlap or be special cases. 
These lines could\n  // be added again once a check is put in place to make sure that\n  // growing tables don't stomp on a lot of non-table partitions.\n\n  // search for horizontal ruling lines within the vertical margin\n  // int vertical_margin = kRulingVerticalMargin * gridsize();\n  TBOX search_box = table_box;\n  // int top = MIN(search_box.top() + vertical_margin, tright().y());\n  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());\n  // search_box.set_top(top);\n  // search_box.set_bottom(bottom);\n\n  GrowTableToIncludePartials(table_box, search_box, result_box);\n  GrowTableToIncludeLines(table_box, search_box, result_box);\n  IncludeLeftOutColumnHeaders(result_box);\n}\n\n// Grow a table by increasing the size of the box to include\n// partitions with significant overlap with the table.\nvoid TableFinder::GrowTableToIncludePartials(const TBOX &table_box,\n                                             const TBOX &search_range,\n                                             TBOX *result_box) {\n  // Rulings are in a different grid, so search 2 grids for rulings, text,\n  // and table partitions that are not entirely within the new box.\n  for (int i = 0; i < 2; ++i) {\n    ColPartitionGrid *grid =\n        (i == 0) ? 
&fragmented_text_grid_ : &leader_and_ruling_grid_;\n    ColPartitionGridSearch rectsearch(grid);\n    rectsearch.StartRectSearch(search_range);\n    ColPartition *part = nullptr;\n    while ((part = rectsearch.NextRectSearch()) != nullptr) {\n      // Only include text and table types.\n      if (part->IsImageType()) {\n        continue;\n      }\n      const TBOX &part_box = part->bounding_box();\n      // Include partition in the table if more than half of it\n      // is covered by the table\n      if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {\n        *result_box = result_box->bounding_union(part_box);\n        continue;\n      }\n    }\n  }\n}\n\n// Grow a table by expanding to the extents of significantly\n// overlapping lines.\nvoid TableFinder::GrowTableToIncludeLines(const TBOX &table_box,\n                                          const TBOX &search_range,\n                                          TBOX *result_box) {\n  ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);\n  rsearch.SetUniqueMode(true);\n  rsearch.StartRectSearch(search_range);\n  ColPartition *part = nullptr;\n  while ((part = rsearch.NextRectSearch()) != nullptr) {\n    // TODO(nbeato) This should also do vertical, but column\n    // boundaries are breaking things. This function needs to be\n    // updated to allow vertical lines as well.\n    if (!part->IsLineType()) {\n      continue;\n    }\n    // Avoid the following function call if the result of the\n    // function is irrelevant.\n    const TBOX &part_box = part->bounding_box();\n    if (result_box->contains(part_box)) {\n      continue;\n    }\n    // Include a partially overlapping horizontal line only if the\n    // extra ColPartitions that will be included due to expansion\n    // have large side spacing w.r.t. 
columns containing them.\n    if (HLineBelongsToTable(*part, table_box)) {\n      *result_box = result_box->bounding_union(part_box);\n    }\n    // TODO(nbeato): Vertical\n  }\n}\n\n// Checks whether the horizontal line belong to the table by looking at the\n// side spacing of extra ColPartitions that will be included in the table\n// due to expansion\nbool TableFinder::HLineBelongsToTable(const ColPartition &part,\n                                      const TBOX &table_box) {\n  if (!part.IsHorizontalLine()) {\n    return false;\n  }\n  const TBOX &part_box = part.bounding_box();\n  if (!part_box.major_x_overlap(table_box)) {\n    return false;\n  }\n  // Do not consider top-most horizontal line since it usually\n  // originates from noise.\n  // TODO(nbeato): I had to comment this out because the ruling grid doesn't\n  // have neighbors solved.\n  // if (!part.nearest_neighbor_above())\n  //   return false;\n  const TBOX bbox = part_box.bounding_union(table_box);\n  // In the \"unioned table\" box (the table extents expanded by the line),\n  // keep track of how many partitions have significant padding to the left\n  // and right. If more than half of the partitions covered by the new table\n  // have significant spacing, the line belongs to the table and the table\n  // grows to include all of the partitions.\n  int num_extra_partitions = 0;\n  int extra_space_to_right = 0;\n  int extra_space_to_left = 0;\n  // Rulings are in a different grid, so search 2 grids for rulings, text,\n  // and table partitions that are introduced by the new box.\n  for (int i = 0; i < 2; ++i) {\n    ColPartitionGrid *grid =\n        (i == 0) ? 
&clean_part_grid_ : &leader_and_ruling_grid_;\n    // Start a rect search on bbox\n    ColPartitionGridSearch rectsearch(grid);\n    rectsearch.SetUniqueMode(true);\n    rectsearch.StartRectSearch(bbox);\n    ColPartition *extra_part = nullptr;\n    while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {\n      // ColPartition already in table\n      const TBOX &extra_part_box = extra_part->bounding_box();\n      if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {\n        continue;\n      }\n      // Non-text ColPartitions do not contribute\n      if (extra_part->IsImageType()) {\n        continue;\n      }\n      // Consider this partition.\n      num_extra_partitions++;\n      // presence of a table cell is a strong hint, so just increment the scores\n      // without looking at the spacing.\n      if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {\n        extra_space_to_right++;\n        extra_space_to_left++;\n        continue;\n      }\n      int space_threshold = kSideSpaceMargin * part.median_height();\n      if (extra_part->space_to_right() > space_threshold) {\n        extra_space_to_right++;\n      }\n      if (extra_part->space_to_left() > space_threshold) {\n        extra_space_to_left++;\n      }\n    }\n  }\n  // tprintf(\"%d %d %d\\n\",\n  // num_extra_partitions,extra_space_to_right,extra_space_to_left);\n  return (extra_space_to_right > num_extra_partitions / 2) ||\n         (extra_space_to_left > num_extra_partitions / 2);\n}\n\n// Look for isolated column headers above the given table box and\n// include them in the table\nvoid TableFinder::IncludeLeftOutColumnHeaders(TBOX *table_box) {\n  // Start a search above the current table to look for column headers\n  ColPartitionGridSearch vsearch(&clean_part_grid_);\n  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),\n                              table_box->top());\n  ColPartition *neighbor = nullptr;\n  ColPartition *previous_neighbor = 
nullptr;\n  while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {\n    // Max distance to find a table heading.\n    const int max_distance =\n        kMaxColumnHeaderDistance * neighbor->median_height();\n    int table_top = table_box->top();\n    const TBOX &box = neighbor->bounding_box();\n    // Do not continue if the next box is way above\n    if (box.bottom() - table_top > max_distance) {\n      break;\n    }\n    // Unconditionally include partitions of type TABLE or LINE\n    // TODO(faisal): add some reasonable conditions here\n    if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {\n      table_box->set_top(box.top());\n      previous_neighbor = nullptr;\n      continue;\n    }\n    // If there are two text partitions, one above the other, without a table\n    // cell on their left or right side, consider them a barrier and quit\n    if (previous_neighbor == nullptr) {\n      previous_neighbor = neighbor;\n    } else {\n      const TBOX &previous_box = previous_neighbor->bounding_box();\n      if (!box.major_y_overlap(previous_box)) {\n        break;\n      }\n    }\n  }\n}\n\n// Remove false alarms consisting of a single column based on their\n// projection on the x-axis. 
Projection of a real table on the x-axis\n// should have at least one zero-valley larger than the global median\n// x-height of the page.\nvoid TableFinder::DeleteSingleColumnTables() {\n  int page_width = tright().x() - bleft().x();\n  ASSERT_HOST(page_width > 0);\n  // create an integer array to hold projection on x-axis\n  int *table_xprojection = new int[page_width];\n  // Iterate through all tables in the table grid\n  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(\n      &table_grid_);\n  table_search.StartFullSearch();\n  ColSegment *table;\n  while ((table = table_search.NextFullSearch()) != nullptr) {\n    TBOX table_box = table->bounding_box();\n    // reset the projection array\n    for (int i = 0; i < page_width; i++) {\n      table_xprojection[i] = 0;\n    }\n    // Start a rect search on table_box\n    ColPartitionGridSearch rectsearch(&clean_part_grid_);\n    rectsearch.SetUniqueMode(true);\n    rectsearch.StartRectSearch(table_box);\n    ColPartition *part;\n    while ((part = rectsearch.NextRectSearch()) != nullptr) {\n      if (!part->IsTextType()) {\n        continue; // Do not consider non-text partitions\n      }\n      if (part->flow() == BTFT_LEADER) {\n        continue; // Assume leaders are in tables\n      }\n      TBOX part_box = part->bounding_box();\n      // Do not consider partitions partially covered by the table\n      if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {\n        continue;\n      }\n      BLOBNBOX_CLIST *part_boxes = part->boxes();\n      BLOBNBOX_C_IT pit(part_boxes);\n\n      // Make sure overlapping blobs don't artificially inflate the number\n      // of rows in the table. This happens frequently with things such as\n      // decimals and split characters. 
Do this by assuming the column\n      // partition is sorted mostly left to right and just clip\n      // bounding boxes by the previous box's extent.\n      int next_position_to_write = 0;\n\n      for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {\n        BLOBNBOX *pblob = pit.data();\n        // ignore blob height for the purpose of projection since we\n        // are only interested in finding valleys\n        int xstart = pblob->bounding_box().left();\n        int xend = pblob->bounding_box().right();\n\n        xstart = std::max(xstart, next_position_to_write);\n        for (int i = xstart; i < xend; i++) {\n          table_xprojection[i - bleft().x()]++;\n        }\n        next_position_to_write = xend;\n      }\n    }\n    // Find largest valley between two reasonable peaks in the table\n    if (!GapInXProjection(table_xprojection, page_width)) {\n      table_search.RemoveBBox();\n      delete table;\n    }\n  }\n  delete[] table_xprojection;\n}\n\n// Return true if at least one gap larger than the global x-height\n// exists in the horizontal projection\nbool TableFinder::GapInXProjection(int *xprojection, int length) {\n  // Find peak value of the histogram\n  int peak_value = 0;\n  for (int i = 0; i < length; i++) {\n    if (xprojection[i] > peak_value) {\n      peak_value = xprojection[i];\n    }\n  }\n  // Peak value represents the maximum number of horizontally\n  // overlapping colpartitions, so this can be considered as the\n  // number of rows in the table\n  if (peak_value < kMinRowsInTable) {\n    return false;\n  }\n  double projection_threshold = kSmallTableProjectionThreshold * peak_value;\n  if (peak_value >= kLargeTableRowCount) {\n    projection_threshold = kLargeTableProjectionThreshold * peak_value;\n  }\n  // Threshold the histogram\n  for (int i = 0; i < length; i++) {\n    xprojection[i] = (xprojection[i] >= projection_threshold) ? 
1 : 0;\n  }\n  // Find the largest run of zeros between two ones\n  int largest_gap = 0;\n  int run_start = -1;\n  for (int i = 1; i < length; i++) {\n    // detect start of a run of zeros\n    if (xprojection[i - 1] && !xprojection[i]) {\n      run_start = i;\n    }\n    // detect end of a run of zeros and update the value of largest gap\n    if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {\n      int gap = i - run_start;\n      if (gap > largest_gap) {\n        largest_gap = gap;\n      }\n      run_start = -1;\n    }\n  }\n  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;\n}\n\n// Given the location of a table \"guess\", try to overlay a cellular\n// grid in the location, adjusting the boundaries.\n// TODO(nbeato): Falsely introduces:\n//   -headers/footers (not any worse, too much overlap destroys cells)\n//   -page numbers (not worse, included because maximize margins)\n//   -equations (nicely fit into a celluar grid, but more sparsely)\n//   -figures (random text box, also sparse)\n//   -small left-aligned text areas with overlapping positioned whitespace\n//       (rejected before)\n// Overall, this just needs some more work.\nvoid TableFinder::RecognizeTables() {\n#ifndef GRAPHICS_DISABLED\n  ScrollView *table_win = nullptr;\n  if (textord_show_tables) {\n    table_win = MakeWindow(0, 0, \"Table Structure\");\n    DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE,\n                         ScrollView::LIGHT_BLUE);\n    // table_grid_.DisplayBoxes(table_win);\n  }\n#endif\n\n  TableRecognizer recognizer;\n  recognizer.Init();\n  recognizer.set_line_grid(&leader_and_ruling_grid_);\n  recognizer.set_text_grid(&fragmented_text_grid_);\n  recognizer.set_max_text_height(global_median_xheight_ * 2.0);\n  recognizer.set_min_height(1.5 * gridheight());\n  // Loop over all of the tables and try to fit them.\n  // Store the good tables here.\n  ColSegment_CLIST good_tables;\n  ColSegment_C_IT 
good_it(&good_tables);\n\n  ColSegmentGridSearch gsearch(&table_grid_);\n  gsearch.StartFullSearch();\n  ColSegment *found_table = nullptr;\n  while ((found_table = gsearch.NextFullSearch()) != nullptr) {\n    gsearch.RemoveBBox();\n\n    // The goal is to make the tables persistent in a list.\n    // When that happens, this will move into the search loop.\n    const TBOX &found_box = found_table->bounding_box();\n    StructuredTable *table_structure = recognizer.RecognizeTable(found_box);\n\n    // Process a table. Good tables are inserted into the grid again later on\n    // We can't change boxes in the grid while it is running a search.\n    if (table_structure != nullptr) {\n#ifndef GRAPHICS_DISABLED\n      if (textord_show_tables) {\n        table_structure->Display(table_win, ScrollView::LIME_GREEN);\n      }\n#endif\n      found_table->set_bounding_box(table_structure->bounding_box());\n      delete table_structure;\n      good_it.add_after_then_move(found_table);\n    } else {\n      delete found_table;\n    }\n  }\n  // TODO(nbeato): MERGE!! There is awesome info now available for merging.\n\n  // At this point, the grid is empty. 
We can safely insert the good tables\n  // back into grid.\n  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {\n    table_grid_.InsertBBox(true, true, good_it.extract());\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Displays the column segments in some window.\nvoid TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments,\n                                     ScrollView::Color color) {\n  win->Pen(color);\n  win->Brush(ScrollView::NONE);\n  ColSegment_IT it(segments);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    ColSegment *col = it.data();\n    const TBOX &box = col->bounding_box();\n    int left_x = box.left();\n    int right_x = box.right();\n    int top_y = box.top();\n    int bottom_y = box.bottom();\n    win->Rectangle(left_x, bottom_y, right_x, top_y);\n  }\n  win->UpdateWindow();\n}\n\n// Displays the colpartitions using a new coloring on an existing window.\n// Note: This method is only for debug purpose during development and\n// would not be part of checked in code\nvoid TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,\n                                       ScrollView::Color default_color,\n                                       ScrollView::Color table_color) {\n  ScrollView::Color color = default_color;\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(grid);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    color = default_color;\n    if (part->type() == PT_TABLE) {\n      color = table_color;\n    }\n\n    const TBOX &box = part->bounding_box();\n    int left_x = box.left();\n    int right_x = box.right();\n    int top_y = box.top();\n    int bottom_y = box.bottom();\n    win->Brush(ScrollView::NONE);\n    win->Pen(color);\n    win->Rectangle(left_x, bottom_y, right_x, top_y);\n  }\n  win->UpdateWindow();\n}\n\nvoid 
TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,\n                                       ScrollView::Color default_color) {\n  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);\n}\n\nvoid TableFinder::DisplayColPartitionConnections(ScrollView *win,\n                                                 ColPartitionGrid *grid,\n                                                 ScrollView::Color color) {\n  // Iterate the ColPartitions in the grid.\n  ColPartitionGridSearch gsearch(grid);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    const TBOX &box = part->bounding_box();\n    int left_x = box.left();\n    int right_x = box.right();\n    int top_y = box.top();\n    int bottom_y = box.bottom();\n\n    ColPartition *upper_part = part->nearest_neighbor_above();\n    if (upper_part) {\n      const TBOX &upper_box = upper_part->bounding_box();\n      int mid_x = (left_x + right_x) / 2;\n      int mid_y = (top_y + bottom_y) / 2;\n      int other_x = (upper_box.left() + upper_box.right()) / 2;\n      int other_y = (upper_box.top() + upper_box.bottom()) / 2;\n      win->Brush(ScrollView::NONE);\n      win->Pen(color);\n      win->Line(mid_x, mid_y, other_x, other_y);\n    }\n    ColPartition *lower_part = part->nearest_neighbor_below();\n    if (lower_part) {\n      const TBOX &lower_box = lower_part->bounding_box();\n      int mid_x = (left_x + right_x) / 2;\n      int mid_y = (top_y + bottom_y) / 2;\n      int other_x = (lower_box.left() + lower_box.right()) / 2;\n      int other_y = (lower_box.top() + lower_box.bottom()) / 2;\n      win->Brush(ScrollView::NONE);\n      win->Pen(color);\n      win->Line(mid_x, mid_y, other_x, other_y);\n    }\n  }\n  win->UpdateWindow();\n}\n\n#endif\n\n// Merge all colpartitions in table regions to make them a single\n// colpartition and revert types of isolated table cells not\n// assigned to any table to their original 
types.\nvoid TableFinder::MakeTableBlocks(ColPartitionGrid *grid,\n                                  ColPartitionSet **all_columns,\n                                  const WidthCallback &width_cb) {\n  // Since we have table blocks already, remove table tags from all\n  // colpartitions\n  ColPartitionGridSearch gsearch(grid);\n  gsearch.StartFullSearch();\n  ColPartition *part = nullptr;\n\n  while ((part = gsearch.NextFullSearch()) != nullptr) {\n    if (part->type() == PT_TABLE) {\n      part->clear_table_type();\n    }\n  }\n  // Now make a single colpartition out of each table block and remove\n  // all colpartitions contained within a table\n  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(\n      &table_grid_);\n  table_search.StartFullSearch();\n  ColSegment *table;\n  while ((table = table_search.NextFullSearch()) != nullptr) {\n    const TBOX &table_box = table->bounding_box();\n    // Start a rect search on table_box\n    ColPartitionGridSearch rectsearch(grid);\n    rectsearch.StartRectSearch(table_box);\n    ColPartition *part;\n    ColPartition *table_partition = nullptr;\n    while ((part = rectsearch.NextRectSearch()) != nullptr) {\n      // Do not consider image partitions\n      if (!part->IsTextType()) {\n        continue;\n      }\n      TBOX part_box = part->bounding_box();\n      // Include partition in the table if more than half of it\n      // is covered by the table\n      if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {\n        rectsearch.RemoveBBox();\n        if (table_partition) {\n          table_partition->Absorb(part, width_cb);\n        } else {\n          table_partition = part;\n        }\n      }\n    }\n    // Insert table colpartition back to part_grid_\n    if (table_partition) {\n      // To match the columns used when transforming to blocks, the new table\n      // partition must have its first and last column set at the grid y that\n      // corresponds to its bottom.\n      
const TBOX &table_box = table_partition->bounding_box();\n      int grid_x, grid_y;\n      grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);\n      table_partition->SetPartitionType(resolution_, all_columns[grid_y]);\n      table_partition->set_table_type();\n      table_partition->set_blob_type(BRT_TEXT);\n      table_partition->set_flow(BTFT_CHAIN);\n      table_partition->SetBlobTypes();\n      grid->InsertBBox(true, true, table_partition);\n    }\n  }\n}\n\n//////// ColSegment code\n////////\nColSegment::ColSegment()\n    : ELIST<ColSegment>::LINK(),\n      num_table_cells_(0),\n      num_text_cells_(0),\n      type_(COL_UNKNOWN) {}\n\n// Provides a color for BBGrid to draw the rectangle.\nScrollView::Color ColSegment::BoxColor() const {\n  const ScrollView::Color kBoxColors[PT_COUNT] = {\n      ScrollView::YELLOW,\n      ScrollView::BLUE,\n      ScrollView::YELLOW,\n      ScrollView::MAGENTA,\n  };\n  return kBoxColors[type_];\n}\n\n// Insert a box into this column segment\nvoid ColSegment::InsertBox(const TBOX &other) {\n  bounding_box_ = bounding_box_.bounding_union(other);\n}\n\n// Set column segment type based on the ratio of text and table partitions\n// in it.\nvoid ColSegment::set_type() {\n  if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) {\n    type_ = COL_TABLE;\n  } else if (num_text_cells_ > num_table_cells_) {\n    type_ = COL_TEXT;\n  } else {\n    type_ = COL_MIXED;\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/tablefind.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tablefind.h\n// Description: Helper classes to find tables from ColPartitions.\n// Author:      Faisal Shafait (faisal.shafait@dfki.de)\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_TABLEFIND_H_\n#define TESSERACT_TEXTORD_TABLEFIND_H_\n\n#include \"colpartitiongrid.h\"\n#include \"elst.h\"\n#include \"rect.h\"\n\nnamespace tesseract {\n\n// Possible types for a column segment.\nenum ColSegType { COL_UNKNOWN, COL_TEXT, COL_TABLE, COL_MIXED, COL_COUNT };\n\nclass ColPartitionSet;\n\n// ColSegment holds rectangular blocks that represent segmentation of a page\n// into regions containing single column text/table.\nclass ColSegment;\nELISTIZEH(ColSegment)\nCLISTIZEH(ColSegment)\n\nclass ColSegment : public ELIST<ColSegment>::LINK {\npublic:\n  ColSegment();\n  ~ColSegment() = default;\n\n  // Simple accessors and mutators\n  const TBOX &bounding_box() const {\n    return bounding_box_;\n  }\n\n  void set_top(int y) {\n    bounding_box_.set_top(y);\n  }\n\n  void set_bottom(int y) {\n    bounding_box_.set_bottom(y);\n  }\n\n  void set_left(int x) {\n    bounding_box_.set_left(x);\n  }\n\n  void set_right(int x) {\n    bounding_box_.set_right(x);\n  }\n\n  void set_bounding_box(const TBOX &other) {\n    bounding_box_ = 
other;\n  }\n\n  int get_num_table_cells() const {\n    return num_table_cells_;\n  }\n\n  // set the number of table colpartitions covered by the bounding_box_\n  void set_num_table_cells(int n) {\n    num_table_cells_ = n;\n  }\n\n  int get_num_text_cells() const {\n    return num_text_cells_;\n  }\n\n  // set the number of text colpartitions covered by the bounding_box_\n  void set_num_text_cells(int n) {\n    num_text_cells_ = n;\n  }\n\n  ColSegType type() const {\n    return type_;\n  }\n\n  // set the type of the block based on the ratio of table to text\n  // colpartitions covered by it.\n  void set_type();\n\n  // Provides a color for BBGrid to draw the rectangle.\n  ScrollView::Color BoxColor() const;\n\n  // Insert a rectangle into bounding_box_\n  void InsertBox(const TBOX &other);\n\nprivate:\n  TBOX bounding_box_; // bounding box\n  int num_table_cells_;\n  int num_text_cells_;\n  ColSegType type_;\n};\n\n// Typedef BBGrid of ColSegments\nusing ColSegmentGrid = BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT>;\nusing ColSegmentGridSearch =\n    GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>;\n\n// TableFinder is a utility class to find a set of tables given a set of\n// ColPartitions and Columns. The TableFinder will mark candidate ColPartitions\n// based on research in \"Table Detection in Heterogeneous Documents\".\n// Usage flow is as follows:\n//   TableFinder finder;\n//   finder.InsertCleanPartitions(/* grid info */)\n//   finder.LocateTables(/* ColPartitions and Columns */);\n//   finder.Update TODO(nbeato)\nclass TESS_API TableFinder {\npublic:\n  // Constructor is simple initializations\n  TableFinder();\n  ~TableFinder();\n\n  // Set the resolution of the connected components in ppi.\n  void set_resolution(int resolution) {\n    resolution_ = resolution;\n  }\n  // Change the reading order. 
Initially it is left to right.\n  void set_left_to_right_language(bool order);\n\n  // Initialize\n  void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right);\n\n  // Copy cleaned partitions from ColumnFinder's part_grid_ to this\n  // clean_part_grid_ and insert dot-like noise into period_grid_.\n  // It resizes the grids in this object to the dimensions of grid.\n  void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block);\n\n  // High level function to perform table detection\n  // Finds tables and updates the grid object with new partitions for the\n  // tables. The columns and width callbacks are used to merge tables.\n  // The reskew argument is only used to write the tables to the out.png\n  // if that feature is enabled.\n  void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns,\n                    WidthCallback width_cb, const FCOORD &reskew);\n\nprotected:\n  // Access for the grid dimensions.\n  // The results will not be correct until InsertCleanPartitions\n  // has been called. 
The values are taken from the grid passed as an argument\n  // to that function.\n  int gridsize() const;\n  int gridwidth() const;\n  int gridheight() const;\n  const ICOORD &bleft() const;\n  const ICOORD &tright() const;\n\n  // Makes a window for debugging, see BBGrid\n  ScrollView *MakeWindow(int x, int y, const char *window_name);\n\n  //////// Functions to insert objects from the grid into the table finder.\n  //////// In all cases, ownership is transferred to the table finder.\n  // Inserts text into the table finder.\n  void InsertTextPartition(ColPartition *part);\n  void InsertFragmentedTextPartition(ColPartition *part);\n  void InsertLeaderPartition(ColPartition *part);\n  void InsertRulingPartition(ColPartition *part);\n  void InsertImagePartition(ColPartition *part);\n  void SplitAndInsertFragmentedTextPartition(ColPartition *part);\n  bool AllowTextPartition(const ColPartition &part) const;\n  bool AllowBlob(const BLOBNBOX &blob) const;\n\n  //////// Functions that manipulate ColPartitions in the part_grid_ /////\n  //////// to find tables.\n  ////////\n\n  // Utility function to move segments to col_seg_grid\n  // Note: Move includes ownership,\n  // so segments will be be owned by col_seg_grid\n  void MoveColSegmentsToGrid(ColSegment_LIST *segments,\n                             ColSegmentGrid *col_seg_grid);\n\n  //////// Set up code to run during table detection to correctly\n  //////// initialize variables on column partitions that are used later.\n  ////////\n\n  // Initialize the grid and partitions\n  void InitializePartitions(ColPartitionSet **all_columns);\n\n  // Set left, right and top, bottom spacings of each colpartition.\n  // Left/right spacings are w.r.t the column boundaries\n  // Top/bottom spacings are w.r.t. 
previous and next colpartitions\n  static void SetPartitionSpacings(ColPartitionGrid *grid,\n                                   ColPartitionSet **all_columns);\n\n  // Set spacing and closest neighbors above and below a given colpartition.\n  void SetVerticalSpacing(ColPartition *part);\n\n  // Set global spacing estimates. This function is dependent on the\n  // partition spacings. So make sure SetPartitionSpacings is called\n  // on the same grid before this.\n  void SetGlobalSpacings(ColPartitionGrid *grid);\n  // Access to the global median xheight. The xheight is the height\n  // of a lowercase 'x' character on the page. This can be viewed as the\n  // average height of a lowercase letter in a textline. As a result\n  // it is used to make assumptions about spacing between words and\n  // table cells.\n  void set_global_median_xheight(int xheight);\n  // Access to the global median blob width. The width is useful\n  // when deciding if a partition is noise.\n  void set_global_median_blob_width(int width);\n  // Access to the global median ledding. The ledding is the distance between\n  // two adjacent text lines. This value can be used to get a rough estimate\n  // for the amount of space between two lines of text. As a result, it\n  // is used to calculate appropriate spacing between adjacent rows of text.\n  void set_global_median_ledding(int ledding);\n\n  // Updates the nearest neighbors for each ColPartition in clean_part_grid_.\n  // The neighbors are most likely SingletonPartner calls after the neighbors\n  // are assigned. 
This is hear until it is decided to remove the\n  // nearest_neighbor code in ColPartition\n  void FindNeighbors();\n\n  //////// Functions to mark candidate column partitions as tables.\n  //////// Tables are marked as described in\n  ////////   Table Detection in Heterogeneous Documents (2010, Shafait & Smith)\n  ////////\n\n  // High level function to mark partitions as table rows/cells.\n  // When this function is done, the column partitions in clean_part_grid_\n  // should mostly be marked as tables.\n  void MarkTablePartitions();\n  // Marks partitions given a local view of a single partition\n  void MarkPartitionsUsingLocalInformation();\n  /////// Heuristics for local marking\n  // Check if the partition has at least one large gap between words or no\n  // significant gap at all\n  // TODO(nbeato): Make const, prevented because blobnbox array access\n  bool HasWideOrNoInterWordGap(ColPartition *part) const;\n  // Checks if a partition is adjacent to leaders on the page\n  bool HasLeaderAdjacent(const ColPartition &part);\n  // Filter individual text partitions marked as table partitions\n  // consisting of paragraph endings, small section headings, and\n  // headers and footers.\n  void FilterFalseAlarms();\n  void FilterParagraphEndings();\n  void FilterHeaderAndFooter();\n  // Mark all ColPartitions as table cells that have a table cell above\n  // and below them\n  void SmoothTablePartitionRuns();\n\n  //////// Functions to create bounding boxes (ColSegment) objects for\n  //////// the columns on the page. The columns are not necessarily\n  //////// vertical lines, meaning if tab stops strongly suggests that\n  //////// a column changes horizontal position, as in the case below,\n  //////// The ColSegment objects will respect that after processing.\n  ////////\n  ////////     _____________\n  //////// Ex. 
|     |      |\n  ////////     |_____|______|  5 boxes: 2 on this line\n  ////////     |   |    |   |           3 on this line\n  ////////     |___|____|___|\n  ////////\n\n  // Get Column segments from best_columns_\n  void GetColumnBlocks(ColPartitionSet **columns,\n                       ColSegment_LIST *col_segments);\n\n  // Group Column segments into consecutive single column regions.\n  void GroupColumnBlocks(ColSegment_LIST *current_segments,\n                         ColSegment_LIST *col_segments);\n\n  // Check if two boxes are consecutive within the same column\n  bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2);\n\n  // Set the ratio of candidate table partitions in each column\n  void SetColumnsType(ColSegment_LIST *col_segments);\n\n  // Merge Column Blocks that were split due to the presence of a table\n  void GridMergeColumnBlocks();\n\n  //////// Functions to turn marked ColPartitions into candidate tables\n  //////// using a modified T-Recs++ algorithm described in\n  ////////   Applying The T-Recs Table Recognition System\n  ////////   To The Business Letter Domain (2001, Kieninger & Dengel)\n  ////////\n\n  // Merge partititons cells into table columns\n  // Differs from paper by just looking at marked table partitions\n  // instead of similarity metric.\n  // Modified section 4.1 of paper.\n  void GetTableColumns(ColSegment_LIST *table_columns);\n\n  // Finds regions within a column that potentially contain a table.\n  // Ie, the table columns from GetTableColumns are turned into boxes\n  // that span the entire page column (using ColumnBlocks found in\n  // earlier functions) in the x direction and the min/max extent of\n  // overlapping table columns in the y direction.\n  // Section 4.2 of paper.\n  void GetTableRegions(ColSegment_LIST *table_columns,\n                       ColSegment_LIST *table_regions);\n\n  //////// Functions to \"patch up\" found tables\n  ////////\n\n  // Merge table regions corresponding to tables spanning 
multiple columns\n  void GridMergeTableRegions();\n  bool BelongToOneTable(const TBOX &box1, const TBOX &box2);\n\n  // Adjust table boundaries by building a tight bounding box around all\n  // ColPartitions contained in it.\n  void AdjustTableBoundaries();\n\n  // Grows a table to include partitions that are partially covered\n  // by the table. This includes lines and text. It does not include\n  // noise or images.\n  // On entry, result_box is the minimum size of the result. The results of the\n  // function will union the actual result with result_box.\n  void GrowTableBox(const TBOX &table_box, TBOX *result_box);\n  // Grow a table by increasing the size of the box to include\n  // partitions with significant overlap with the table.\n  void GrowTableToIncludePartials(const TBOX &table_box,\n                                  const TBOX &search_range, TBOX *result_box);\n  // Grow a table by expanding to the extents of significantly\n  // overlapping lines.\n  void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range,\n                               TBOX *result_box);\n  // Checks whether the horizontal line belong to the table by looking at the\n  // side spacing of extra ColPartitions that will be included in the table\n  // due to expansion\n  bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box);\n\n  // Look for isolated column headers above the given table box and\n  // include them in the table\n  void IncludeLeftOutColumnHeaders(TBOX *table_box);\n\n  // Remove false alarms consisting of a single column\n  void DeleteSingleColumnTables();\n\n  // Return true if at least one gap larger than the global x-height\n  // exists in the horizontal projection\n  bool GapInXProjection(int *xprojection, int length);\n\n  //////// Recognize the tables.\n  ////////\n  // This function will run the table recognizer and try to find better\n  // bounding boxes. 
The structures of the tables never leave this function\n  // right now. It just tries to prune and merge tables based on info it\n  // has available.\n  void RecognizeTables();\n\n  //////// Debugging functions. Render different structures to GUI\n  //////// for visual debugging / intuition.\n  ////////\n\n  // Displays Colpartitions marked as table row. Overlays them on top of\n  // part_grid_.\n  void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols,\n                          ScrollView::Color color);\n\n  // Displays the colpartitions using a new coloring on an existing window.\n  // Note: This method is only for debug purpose during development and\n  // would not be part of checked in code\n  void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,\n                            ScrollView::Color text_color,\n                            ScrollView::Color table_color);\n  void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,\n                            ScrollView::Color default_color);\n  void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid,\n                                      ScrollView::Color default_color);\n\n  // Merge all colpartitions in table regions to make them a single\n  // colpartition and revert types of isolated table cells not\n  // assigned to any table to their original types.\n  void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns,\n                       const WidthCallback &width_cb);\n\n  /////////////////////////////////////////////////\n  // Useful objects used during table find process.\n  /////////////////////////////////////////////////\n  // Resolution of the connected components in ppi.\n  int resolution_;\n  // Estimate of median x-height over the page\n  int global_median_xheight_;\n  // Estimate of the median blob width on the page\n  int global_median_blob_width_;\n  // Estimate of median leading on the page\n  int global_median_ledding_;\n  // Grid to 
hold cleaned colpartitions after removing all\n  // colpartitions that consist of only noise blobs, and removing\n  // noise blobs from remaining colpartitions.\n  ColPartitionGrid clean_part_grid_;\n  // Grid contains the leaders and ruling lines.\n  ColPartitionGrid leader_and_ruling_grid_;\n  // Grid contains the broken down column partitions. It can be thought\n  // of as a \"word\" grid. However, it usually doesn't break apart text lines.\n  // It does break apart table data (most of the time).\n  ColPartitionGrid fragmented_text_grid_;\n  // Grid of page column blocks\n  ColSegmentGrid col_seg_grid_;\n  // Grid of detected tables\n  ColSegmentGrid table_grid_;\n  // The reading order of text. Defaults to true, for languages such as English.\n  bool left_to_right_language_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_TABLEFIND_H_\n"
  },
  {
    "path": "src/textord/tablerecog.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tablerecog.cpp\n// Description: Helper class to help structure table areas. Given an bounding\n//              box from TableFinder, the TableRecognizer should give a\n//              StructuredTable (maybe a list in the future) of \"good\" tables\n//              in that area.\n// Author:      Nicholas Beato\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"tablerecog.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// The amount of space required between the ColPartitions in 2 columns\n// of a non-lined table as a multiple of the median width.\nconst double kHorizontalSpacing = 0.30;\n// The amount of space required between the ColPartitions in 2 rows\n// of a non-lined table as multiples of the median height.\nconst double kVerticalSpacing = -0.2;\n// The number of cells that the grid lines may intersect.\n// See FindCellSplitLocations for explanation.\nconst int kCellSplitRowThreshold = 0;\nconst int kCellSplitColumnThreshold = 0;\n// For \"lined tables\", the number of required lines. 
Currently a guess.\nconst int kLinedTableMinVerticalLines = 3;\nconst int kLinedTableMinHorizontalLines = 3;\n// Number of columns required, as a fraction of the most columns found.\n// None of these are tweaked at all.\nconst double kRequiredColumns = 0.7;\n// The tolerance for comparing margins of potential tables.\nconst double kMarginFactor = 1.1;\n// The first and last row should be consistent cell height.\n// This factor is the first and last row cell height max.\nconst double kMaxRowSize = 2.5;\n// Number of filled columns required to form a strong table row.\n// For small tables, this is an absolute number.\nconst double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3};\n// For large tables, it is a relative number\nconst double kGoodRowNumberOfColumnsLarge = 0.7;\n// The amount of area that must be covered in a cell by ColPartitions to\n// be considered \"filled\"\nconst double kMinFilledArea = 0.35;\n\n// Indicates that a table row is weak. This means that it has\n// many missing data cells or very large cell heights compared.\n// to the rest of the table.\n// Code is buggy right now. It is disabled in the calling function.\n// It seems like sometimes the row that is passed in is not correct\n// sometimes (like a phantom row is introduced). There's something going\n// on in the cell_y_ data member before this is called... 
not certain.\nstatic bool IsWeakTableRow(StructuredTable *table, int row) {\n  if (!table->VerifyRowFilled(row)) {\n    return false;\n  }\n\n  double threshold;\n  if (table->column_count() < countof(kGoodRowNumberOfColumnsSmall)) {\n    threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];\n  } else {\n    threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;\n  }\n\n  return table->CountFilledCellsInRow(row) < threshold;\n}\n\n////////\n//////// StructuredTable Class\n////////\n\nStructuredTable::StructuredTable()\n    : text_grid_(nullptr)\n    , line_grid_(nullptr)\n    , is_lined_(false)\n    , space_above_(0)\n    , space_below_(0)\n    , space_left_(0)\n    , space_right_(0)\n    , median_cell_height_(0)\n    , median_cell_width_(0)\n    , max_text_height_(INT32_MAX) {}\n\nvoid StructuredTable::Init() {}\n\nvoid StructuredTable::set_text_grid(ColPartitionGrid *text_grid) {\n  text_grid_ = text_grid;\n}\nvoid StructuredTable::set_line_grid(ColPartitionGrid *line_grid) {\n  line_grid_ = line_grid;\n}\nvoid StructuredTable::set_max_text_height(int height) {\n  max_text_height_ = height;\n}\nbool StructuredTable::is_lined() const {\n  return is_lined_;\n}\nunsigned StructuredTable::row_count() const {\n  return cell_y_.empty() ? 0 : cell_y_.size() - 1;\n}\nunsigned StructuredTable::column_count() const {\n  return cell_x_.empty() ? 
0 : cell_x_.size() - 1;\n}\nunsigned StructuredTable::cell_count() const {\n  return row_count() * column_count();\n}\nvoid StructuredTable::set_bounding_box(const TBOX &box) {\n  bounding_box_ = box;\n}\nconst TBOX &StructuredTable::bounding_box() const {\n  return bounding_box_;\n}\nint StructuredTable::median_cell_height() {\n  return median_cell_height_;\n}\nint StructuredTable::median_cell_width() {\n  return median_cell_width_;\n}\nint StructuredTable::row_height(unsigned row) const {\n  ASSERT_HOST(row < row_count());\n  return cell_y_[row + 1] - cell_y_[row];\n}\nint StructuredTable::column_width(unsigned column) const {\n  ASSERT_HOST(column < column_count());\n  return cell_x_[column + 1] - cell_x_[column];\n}\nint StructuredTable::space_above() const {\n  return space_above_;\n}\nint StructuredTable::space_below() const {\n  return space_below_;\n}\n\n// At this point, we know that the lines are contained\n// by the box (by FindLinesBoundingBox).\n// So try to find the cell structure and make sure it works out.\n// The assumption is that all lines span the table. If this\n// assumption fails, the VerifyLinedTable method will\n// abort the lined table. The TableRecognizer will fall\n// back on FindWhitespacedStructure.\nbool StructuredTable::FindLinedStructure() {\n  ClearStructure();\n\n  // Search for all of the lines in the current box.\n  // Update the cellular structure with the exact lines.\n  ColPartitionGridSearch box_search(line_grid_);\n  box_search.SetUniqueMode(true);\n  box_search.StartRectSearch(bounding_box_);\n  ColPartition *line = nullptr;\n\n  while ((line = box_search.NextRectSearch()) != nullptr) {\n    if (line->IsHorizontalLine()) {\n      cell_y_.push_back(line->MidY());\n    }\n    if (line->IsVerticalLine()) {\n      cell_x_.push_back(line->MidX());\n    }\n  }\n\n  // HasSignificantLines should guarantee cells.\n  // Because that code is a different class, just gracefully\n  // return false. 
This could be an assert.\n  if (cell_x_.size() < 3 || cell_y_.size() < 3) {\n    return false;\n  }\n\n  // Sort and remove duplicates that may have occurred due to split lines.\n  std::sort(cell_x_.begin(), cell_x_.end());\n  auto last_x = std::unique(cell_x_.begin(), cell_x_.end());\n  cell_x_.erase(last_x, cell_x_.end());\n  std::sort(cell_y_.begin(), cell_y_.end());\n  auto last_y = std::unique(cell_y_.begin(), cell_y_.end());\n  cell_y_.erase(last_y, cell_y_.end());\n\n  // The border should be the extents of line boxes, not middle.\n  cell_x_[0] = bounding_box_.left();\n  cell_x_[cell_x_.size() - 1] = bounding_box_.right();\n  cell_y_[0] = bounding_box_.bottom();\n  cell_y_[cell_y_.size() - 1] = bounding_box_.top();\n\n  // Remove duplicates that may have occurred due to moving the borders.\n  last_x = std::unique(cell_x_.begin(), cell_x_.end());\n  cell_x_.erase(last_x, cell_x_.end());\n  last_y = std::unique(cell_y_.begin(), cell_y_.end());\n  cell_y_.erase(last_y, cell_y_.end());\n\n  CalculateMargins();\n  CalculateStats();\n  is_lined_ = VerifyLinedTableCells();\n  return is_lined_;\n}\n\n// Finds the cellular structure given a particular box.\nbool StructuredTable::FindWhitespacedStructure() {\n  ClearStructure();\n  FindWhitespacedColumns();\n  FindWhitespacedRows();\n\n  if (!VerifyWhitespacedTable()) {\n    return false;\n  } else {\n    bounding_box_.set_left(cell_x_[0]);\n    bounding_box_.set_right(cell_x_[cell_x_.size() - 1]);\n    bounding_box_.set_bottom(cell_y_[0]);\n    bounding_box_.set_top(cell_y_[cell_y_.size() - 1]);\n    AbsorbNearbyLines();\n    CalculateMargins();\n    CalculateStats();\n    return true;\n  }\n}\n\n// Tests if a partition fits inside the table structure.\n// Partitions must fully span a grid line in order to intersect it.\n// This means that a partition does not intersect a line\n// that it \"just\" touches. 
This is mainly because the assumption\n// throughout the code is that \"0\" distance is a very very small space.\nbool StructuredTable::DoesPartitionFit(const ColPartition &part) const {\n  const TBOX &box = part.bounding_box();\n  for (int i : cell_x_) {\n    if (box.left() < i && i < box.right()) {\n      return false;\n    }\n  }\n  for (int i : cell_y_) {\n    if (box.bottom() < i && i < box.top()) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Checks if a sub-table has multiple data cells filled.\nint StructuredTable::CountFilledCells() {\n  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);\n}\nint StructuredTable::CountFilledCellsInRow(int row) {\n  return CountFilledCells(row, row, 0, column_count() - 1);\n}\nint StructuredTable::CountFilledCellsInColumn(int column) {\n  return CountFilledCells(0, row_count() - 1, column, column);\n}\nint StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start,\n                                      unsigned column_end) {\n  ASSERT_HOST(row_start <= row_end && row_end < row_count());\n  ASSERT_HOST(column_start <= column_end && column_end < column_count());\n  int cell_count = 0;\n  TBOX cell_box;\n  for (unsigned row = row_start; row <= row_end; ++row) {\n    cell_box.set_bottom(cell_y_[row]);\n    cell_box.set_top(cell_y_[row + 1]);\n    for (unsigned col = column_start; col <= column_end; ++col) {\n      cell_box.set_left(cell_x_[col]);\n      cell_box.set_right(cell_x_[col + 1]);\n      if (CountPartitions(cell_box) > 0) {\n        ++cell_count;\n      }\n    }\n  }\n  return cell_count;\n}\n\n// Makes sure that at least one cell in a row has substantial area filled.\n// This can filter out large whitespace caused by growing tables too far\n// and page numbers.\nbool StructuredTable::VerifyRowFilled(int row) {\n  for (unsigned i = 0; i < column_count(); ++i) {\n    auto area_filled = CalculateCellFilledPercentage(row, i);\n    if (area_filled >= 
kMinFilledArea) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Finds the filled area in a cell.\n// Assume ColPartitions do not overlap for simplicity (even though they do).\ndouble StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) {\n  ASSERT_HOST(row <= row_count());\n  ASSERT_HOST(column <= column_count());\n  const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);\n  ASSERT_HOST(!kCellBox.null_box());\n\n  ColPartitionGridSearch gsearch(text_grid_);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartRectSearch(kCellBox);\n  double area_covered = 0;\n  ColPartition *text = nullptr;\n  while ((text = gsearch.NextRectSearch()) != nullptr) {\n    if (text->IsTextType()) {\n      area_covered += text->bounding_box().intersection(kCellBox).area();\n    }\n  }\n  const int32_t current_area = kCellBox.area();\n  if (current_area == 0) {\n    return 1.0;\n  }\n  return std::min(1.0, area_covered / current_area);\n}\n\n#ifndef GRAPHICS_DISABLED\n\nvoid StructuredTable::Display(ScrollView *window, ScrollView::Color color) {\n  window->Brush(ScrollView::NONE);\n  window->Pen(color);\n  window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(),\n                    bounding_box_.top());\n  for (int i : cell_x_) {\n    window->Line(i, bounding_box_.bottom(), i, bounding_box_.top());\n  }\n  for (int i : cell_y_) {\n    window->Line(bounding_box_.left(), i, bounding_box_.right(), i);\n  }\n  window->UpdateWindow();\n}\n\n#endif\n\n// Clear structure information.\nvoid StructuredTable::ClearStructure() {\n  cell_x_.clear();\n  cell_y_.clear();\n  is_lined_ = false;\n  space_above_ = 0;\n  space_below_ = 0;\n  space_left_ = 0;\n  space_right_ = 0;\n  median_cell_height_ = 0;\n  median_cell_width_ = 0;\n}\n\n// When a table has lines, the lines should not intersect any partitions.\n// The following function makes sure the previous assumption is met.\nbool 
StructuredTable::VerifyLinedTableCells() {\n  // Function only called when lines exist.\n  ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2);\n  for (int i : cell_y_) {\n    if (CountHorizontalIntersections(i) > 0) {\n      return false;\n    }\n  }\n  for (int i : cell_x_) {\n    if (CountVerticalIntersections(i) > 0) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// TODO(nbeato): Could be much better than this.\n// Examples:\n//   - Calculate the percentage of filled cells.\n//   - Calculate the average number of ColPartitions per cell.\n//   - Calculate the number of cells per row with partitions.\n//   - Check if ColPartitions in adjacent cells are similar.\n//   - Check that all columns are at least a certain width.\n//   - etc.\nbool StructuredTable::VerifyWhitespacedTable() {\n  // criteria for a table, must be at least 2x3 or 3x2\n  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;\n}\n\n// Finds vertical splits in the ColPartitions of text_grid_ by considering\n// all possible \"good\" guesses. A good guess is just the left/right sides of\n// the partitions, since these locations will uniquely define where the\n// extremal values where the splits can occur. The split happens\n// in the middle of the two nearest partitions.\nvoid StructuredTable::FindWhitespacedColumns() {\n  // Set of the extents of all partitions on the page.\n  std::vector<int> left_sides;\n  std::vector<int> right_sides;\n\n  // Look at each text partition. We want to find the partitions\n  // that have extremal left/right sides. 
These will give us a basis\n  // for the table columns.\n  ColPartitionGridSearch gsearch(text_grid_);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartRectSearch(bounding_box_);\n  ColPartition *text = nullptr;\n  while ((text = gsearch.NextRectSearch()) != nullptr) {\n    if (!text->IsTextType()) {\n      continue;\n    }\n\n    ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());\n    int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5);\n    left_sides.push_back(text->bounding_box().left() - spacing);\n    right_sides.push_back(text->bounding_box().right() + spacing);\n  }\n  // It causes disaster below, so avoid it!\n  if (left_sides.empty() || right_sides.empty()) {\n    return;\n  }\n\n  // Since data may be inserted in grid order, we sort the left/right sides.\n  std::sort(left_sides.begin(), left_sides.end());\n  std::sort(right_sides.begin(), right_sides.end());\n\n  // At this point, in the \"merged list\", we expect to have a left side,\n  // followed by either more left sides or a right side. The last number\n  // should be a right side. We find places where the splits occur by looking\n  // for \"valleys\". If we want to force gap sizes or allow overlap, change\n  // the spacing above. If you want to let lines \"slice\" partitions as long\n  // as it is infrequent, change the following function.\n  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_);\n}\n\n// Finds horizontal splits in the ColPartitions of text_grid_ by considering\n// all possible \"good\" guesses. A good guess is just the bottom/top sides of\n// the partitions, since these locations will uniquely define where the\n// extremal values where the splits can occur. 
The split happens\n// in the middle of the two nearest partitions.\nvoid StructuredTable::FindWhitespacedRows() {\n  // Set of the extents of all partitions on the page.\n  std::vector<int> bottom_sides;\n  std::vector<int> top_sides;\n  // We will be \"shrinking\" partitions, so keep the min/max around to\n  // make sure the bottom/top lines do not intersect text.\n  int min_bottom = INT32_MAX;\n  int max_top = INT32_MIN;\n\n  // Look at each text partition. We want to find the partitions\n  // that have extremal bottom/top sides. These will give us a basis\n  // for the table rows. Because the textlines can be skewed and close due\n  // to warping, the height of the partitions is toned down a little bit.\n  ColPartitionGridSearch gsearch(text_grid_);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartRectSearch(bounding_box_);\n  ColPartition *text = nullptr;\n  while ((text = gsearch.NextRectSearch()) != nullptr) {\n    if (!text->IsTextType()) {\n      continue;\n    }\n\n    ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());\n    min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));\n    max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));\n\n    // Ignore \"tall\" text partitions, as these are usually false positive\n    // vertical text or multiple lines pulled together.\n    if (text->bounding_box().height() > max_text_height_) {\n      continue;\n    }\n\n    int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5);\n    int bottom = text->bounding_box().bottom() - spacing;\n    int top = text->bounding_box().top() + spacing;\n    // For horizontal text, the factor can be negative. This should\n    // probably cause a warning or failure. 
I haven't actually checked if\n    // it happens.\n    if (bottom >= top) {\n      continue;\n    }\n\n    bottom_sides.push_back(bottom);\n    top_sides.push_back(top);\n  }\n  // It causes disaster below, so avoid it!\n  if (bottom_sides.empty() || top_sides.empty()) {\n    return;\n  }\n\n  // Since data may be inserted in grid order, we sort the bottom/top sides.\n  std::sort(bottom_sides.begin(), bottom_sides.end());\n  std::sort(top_sides.begin(), top_sides.end());\n\n  // At this point, in the \"merged list\", we expect to have a bottom side,\n  // followed by either more bottom sides or a top side. The last number\n  // should be a top side. We find places where the splits occur by looking\n  // for \"valleys\". If we want to force gap sizes or allow overlap, change\n  // the spacing above. If you want to let lines \"slice\" partitions as long\n  // as it is infrequent, change the following function.\n  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_);\n\n  // Recover the min/max correctly since it was shifted.\n  cell_y_[0] = min_bottom;\n  cell_y_[cell_y_.size() - 1] = max_top;\n}\n\nvoid StructuredTable::CalculateMargins() {\n  space_above_ = INT32_MAX;\n  space_below_ = INT32_MAX;\n  space_right_ = INT32_MAX;\n  space_left_ = INT32_MAX;\n  UpdateMargins(text_grid_);\n  UpdateMargins(line_grid_);\n}\n// Finds the nearest partition in grid to the table\n// boundaries and updates the margin.\nvoid StructuredTable::UpdateMargins(ColPartitionGrid *grid) {\n  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);\n  space_below_ = std::min(space_below_, below);\n  int above = FindVerticalMargin(grid, bounding_box_.top(), false);\n  space_above_ = std::min(space_above_, above);\n  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);\n  space_left_ = std::min(space_left_, left);\n  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);\n  space_right_ = std::min(space_right_, 
right);\n}\nint StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const {\n  ColPartitionGridSearch gsearch(grid);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), border);\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {\n    if (!part->IsTextType() && !part->IsHorizontalLine()) {\n      continue;\n    }\n    int distance =\n        decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border;\n    if (distance >= 0) {\n      return distance;\n    }\n  }\n  return INT32_MAX;\n}\nint StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const {\n  ColPartitionGridSearch gsearch(grid);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());\n  ColPartition *part = nullptr;\n  while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {\n    if (!part->IsTextType() && !part->IsVerticalLine()) {\n      continue;\n    }\n    int distance =\n        decrease ? 
border - part->bounding_box().right() : part->bounding_box().left() - border;\n    if (distance >= 0) {\n      return distance;\n    }\n  }\n  return INT32_MAX;\n}\n\nvoid StructuredTable::CalculateStats() {\n  const int kMaxCellHeight = 1000;\n  const int kMaxCellWidth = 1000;\n  STATS height_stats(0, kMaxCellHeight);\n  STATS width_stats(0, kMaxCellWidth);\n\n  for (unsigned i = 0; i < row_count(); ++i) {\n    height_stats.add(row_height(i), column_count());\n  }\n  for (unsigned i = 0; i < column_count(); ++i) {\n    width_stats.add(column_width(i), row_count());\n  }\n\n  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);\n  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);\n}\n\n// Looks for grid lines near the current bounding box and\n// grows the bounding box to include them if no intersections\n// will occur as a result. This is necessary because the margins\n// are calculated relative to the closest line/text. If the\n// line isn't absorbed, the margin will be the distance to the line.\nvoid StructuredTable::AbsorbNearbyLines() {\n  ColPartitionGridSearch gsearch(line_grid_);\n  gsearch.SetUniqueMode(true);\n\n  // Is the closest line above good? Loop multiple times for tables with\n  // multi-line (sometimes 2) borders. 
Limit the number of lines by\n  // making sure they stay within a table cell or so.\n  ColPartition *line = nullptr;\n  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.top());\n  while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {\n    if (!line->IsHorizontalLine()) {\n      break;\n    }\n    TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, bounding_box_.right(),\n                     line->MidY());\n    if (text_search.height() > median_cell_height_ * 2) {\n      break;\n    }\n    if (CountPartitions(text_search) > 0) {\n      break;\n    }\n    bounding_box_.set_top(line->MidY());\n  }\n  // As above, is the closest line below good?\n  line = nullptr;\n  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.bottom());\n  while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {\n    if (!line->IsHorizontalLine()) {\n      break;\n    }\n    TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(),\n                     bounding_box_.bottom() - 1);\n    if (text_search.height() > median_cell_height_ * 2) {\n      break;\n    }\n    if (CountPartitions(text_search) > 0) {\n      break;\n    }\n    bounding_box_.set_bottom(line->MidY());\n  }\n  // TODO(nbeato): vertical lines\n}\n\n// This function will find all \"0 valleys\" (of any length) given two\n// arrays. The arrays are the mins and maxes of partitions (either\n// left and right or bottom and top). 
Since the min/max lists are generated\n// with pairs of increasing integers, we can make some assumptions in\n// the function about ordering of the overall list, which are shown in the\n// asserts.\n// The algorithm works as follows:\n//   While there are numbers to process, take the smallest number.\n//     If it is from the min_list, increment the \"hill\" counter.\n//     Otherwise, decrement the \"hill\" counter.\n//     In the process of doing this, keep track of \"crossing\" the\n//     desired height.\n// The first/last items are extremal values of the list and known.\n// NOTE: This function assumes the lists are sorted!\nvoid StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,\n                                             const std::vector<int> &max_list, int max_merged,\n                                             std::vector<int> *locations) {\n  locations->clear();\n  ASSERT_HOST(min_list.size() == max_list.size());\n  if (min_list.empty()) {\n    return;\n  }\n  ASSERT_HOST(min_list.at(0) < max_list.at(0));\n  ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));\n\n  locations->push_back(min_list.at(0));\n  unsigned min_index = 0;\n  unsigned max_index = 0;\n  int stacked_partitions = 0;\n  int last_cross_position = INT32_MAX;\n  // max_index will expire after min_index.\n  // However, we can't \"increase\" the hill size if min_index expired.\n  // So finish processing when min_index expires.\n  while (min_index < min_list.size()) {\n    // Increase the hill count.\n    if (min_list[min_index] < max_list[max_index]) {\n      ++stacked_partitions;\n      if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) {\n        int mid = (last_cross_position + min_list[min_index]) / 2;\n        locations->push_back(mid);\n        last_cross_position = INT32_MAX;\n      }\n      ++min_index;\n    } else {\n      // Decrease the hill count.\n      --stacked_partitions;\n      if 
(last_cross_position == INT32_MAX && stacked_partitions <= max_merged) {\n        last_cross_position = max_list[max_index];\n      }\n      ++max_index;\n    }\n  }\n  locations->push_back(max_list.at(max_list.size() - 1));\n}\n\n// Counts the number of partitions in the table\n// box that intersection the given x value.\nint StructuredTable::CountVerticalIntersections(int x) {\n  int count = 0;\n  // Make a small box to keep the search time down.\n  const int kGridSize = text_grid_->gridsize();\n  TBOX vertical_box = bounding_box_;\n  vertical_box.set_left(x - kGridSize);\n  vertical_box.set_right(x + kGridSize);\n\n  ColPartitionGridSearch gsearch(text_grid_);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartRectSearch(vertical_box);\n  ColPartition *text = nullptr;\n  while ((text = gsearch.NextRectSearch()) != nullptr) {\n    if (!text->IsTextType()) {\n      continue;\n    }\n    const TBOX &box = text->bounding_box();\n    if (box.left() < x && x < box.right()) {\n      ++count;\n    }\n  }\n  return count;\n}\n\n// Counts the number of partitions in the table\n// box that intersection the given y value.\nint StructuredTable::CountHorizontalIntersections(int y) {\n  int count = 0;\n  // Make a small box to keep the search time down.\n  const int kGridSize = text_grid_->gridsize();\n  TBOX horizontal_box = bounding_box_;\n  horizontal_box.set_bottom(y - kGridSize);\n  horizontal_box.set_top(y + kGridSize);\n\n  ColPartitionGridSearch gsearch(text_grid_);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartRectSearch(horizontal_box);\n  ColPartition *text = nullptr;\n  while ((text = gsearch.NextRectSearch()) != nullptr) {\n    if (!text->IsTextType()) {\n      continue;\n    }\n\n    const TBOX &box = text->bounding_box();\n    if (box.bottom() < y && y < box.top()) {\n      ++count;\n    }\n  }\n  return count;\n}\n\n// Counts how many text partitions are in this box.\n// This is used to count partitions in cells, as that can indicate\n// how \"strong\" a 
potential table row/column (or even full table) actually is.\nint StructuredTable::CountPartitions(const TBOX &box) {\n  ColPartitionGridSearch gsearch(text_grid_);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartRectSearch(box);\n  int count = 0;\n  ColPartition *text = nullptr;\n  while ((text = gsearch.NextRectSearch()) != nullptr) {\n    if (text->IsTextType()) {\n      ++count;\n    }\n  }\n  return count;\n}\n\n////////\n//////// TableRecognizer Class\n////////\n\nvoid TableRecognizer::Init() {}\n\nvoid TableRecognizer::set_text_grid(ColPartitionGrid *text_grid) {\n  text_grid_ = text_grid;\n}\nvoid TableRecognizer::set_line_grid(ColPartitionGrid *line_grid) {\n  line_grid_ = line_grid;\n}\nvoid TableRecognizer::set_min_height(int height) {\n  min_height_ = height;\n}\nvoid TableRecognizer::set_min_width(int width) {\n  min_width_ = width;\n}\nvoid TableRecognizer::set_max_text_height(int height) {\n  max_text_height_ = height;\n}\n\nStructuredTable *TableRecognizer::RecognizeTable(const TBOX &guess) {\n  auto *table = new StructuredTable();\n  table->Init();\n  table->set_text_grid(text_grid_);\n  table->set_line_grid(line_grid_);\n  table->set_max_text_height(max_text_height_);\n\n  // Try to solve this simple case, a table with *both*\n  // vertical and horizontal lines.\n  if (RecognizeLinedTable(guess, table)) {\n    return table;\n  }\n\n  // Fallback to whitespace if that failed.\n  // TODO(nbeato): Break this apart to take advantage of horizontal\n  // lines or vertical lines when present.\n  if (RecognizeWhitespacedTable(guess, table)) {\n    return table;\n  }\n\n  // No table found...\n  delete table;\n  return nullptr;\n}\n\nbool TableRecognizer::RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table) {\n  if (!HasSignificantLines(guess_box)) {\n    return false;\n  }\n  TBOX line_bound = guess_box;\n  if (!FindLinesBoundingBox(&line_bound)) {\n    return false;\n  }\n  table->set_bounding_box(line_bound);\n  return 
table->FindLinedStructure();\n}\n\n// Quick implementation. Just count the number of lines in the box.\n// A better implementation would counter intersections and look for connected\n// components. It could even go as far as finding similar length lines.\n// To account for these possible issues, the VerifyLinedTableCells function\n// will reject lined tables that cause intersections with text on the page.\n// TODO(nbeato): look for \"better\" lines\nbool TableRecognizer::HasSignificantLines(const TBOX &guess) {\n  ColPartitionGridSearch box_search(line_grid_);\n  box_search.SetUniqueMode(true);\n  box_search.StartRectSearch(guess);\n  ColPartition *line = nullptr;\n  int vertical_count = 0;\n  int horizontal_count = 0;\n\n  while ((line = box_search.NextRectSearch()) != nullptr) {\n    if (line->IsHorizontalLine()) {\n      ++horizontal_count;\n    }\n    if (line->IsVerticalLine()) {\n      ++vertical_count;\n    }\n  }\n\n  return vertical_count >= kLinedTableMinVerticalLines &&\n         horizontal_count >= kLinedTableMinHorizontalLines;\n}\n\n// Given a bounding box with a bunch of horizontal / vertical lines,\n// we just find the extents of all of these lines iteratively.\n// The box will be at least as large as guess. This\n// could possibly be a bad assumption.\n// It is guaranteed to halt in at least O(n * gridarea) where n\n// is the number of lines.\n// The assumption is that growing the box iteratively will add lines\n// several times, but eventually we'll find the extents.\n//\n// For tables, the approach is a bit aggressive, a single line (which could be\n// noise or a column ruling) can destroy the table inside.\n//\n// TODO(nbeato): This is a quick first implementation.\n// A better implementation would actually look for consistency\n// in extents of the lines and find the extents using lines\n// that clearly describe the table. This would allow the\n// lines to \"vote\" for height/width. 
An approach like\n// this would solve issues with page layout rulings.\n// I haven't looked for these issues yet, so I can't even\n// say they happen confidently.\nbool TableRecognizer::FindLinesBoundingBox(TBOX *bounding_box) {\n  // The first iteration will tell us if there are lines\n  // present and shrink the box to a minimal iterative size.\n  if (!FindLinesBoundingBoxIteration(bounding_box)) {\n    return false;\n  }\n\n  // Keep growing until the area of the table stabilizes.\n  // The box can only get bigger, increasing area.\n  bool changed = true;\n  while (changed) {\n    changed = false;\n    int old_area = bounding_box->area();\n    bool check = FindLinesBoundingBoxIteration(bounding_box);\n    // At this point, the function will return true.\n    ASSERT_HOST(check);\n    ASSERT_HOST(bounding_box->area() >= old_area);\n    changed = (bounding_box->area() > old_area);\n  }\n\n  return true;\n}\n\nbool TableRecognizer::FindLinesBoundingBoxIteration(TBOX *bounding_box) {\n  // Search for all of the lines in the current box, keeping track of extents.\n  ColPartitionGridSearch box_search(line_grid_);\n  box_search.SetUniqueMode(true);\n  box_search.StartRectSearch(*bounding_box);\n  ColPartition *line = nullptr;\n  bool first_line = true;\n\n  while ((line = box_search.NextRectSearch()) != nullptr) {\n    if (line->IsLineType()) {\n      if (first_line) {\n        // The first iteration can shrink the box.\n        *bounding_box = line->bounding_box();\n        first_line = false;\n      } else {\n        *bounding_box += line->bounding_box();\n      }\n    }\n  }\n  return !first_line;\n}\n\n// The goal of this function is to move the table boundaries around and find\n// a table that maximizes the whitespace around the table while maximizing\n// the cellular structure. As a result, it gets confused by headers, footers,\n// and merged columns (text that crosses columns). 
There is a tolerance\n// that allows a few partitions to count towards potential cell merges.\n// It's the max_merged parameter to FindPartitionLocations.\n// It can work, but it needs some false positive remove on boundaries.\n// For now, the grid structure must not intersect any partitions.\n// Also, small tolerance is added to the horizontal lines for tightly packed\n// tables. The tolerance is added by adjusting the bounding boxes of the\n// partitions (in FindHorizontalPartitions). The current implementation\n// only adjusts the vertical extents of the table.\n//\n// Also note. This was hacked at a lot. It could probably use some\n// more hacking at to find a good set of border conditions and then a\n// nice clean up.\nbool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table) {\n  TBOX best_box = guess_box; // Best borders known.\n  int best_below = 0;        // Margin size above best table.\n  int best_above = 0;        // Margin size below best table.\n  TBOX adjusted = guess_box; // The search box.\n\n  // We assume that the guess box is somewhat accurate, so we don't allow\n  // the adjusted border to pass half of the guessed area. This prevents\n  // \"negative\" tables from forming.\n  const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;\n  // Keeps track of the most columns in an accepted table. The resulting table\n  // may be less than the max, but we don't want to stray too far.\n  unsigned best_cols = 0;\n  // Make sure we find a good border.\n  bool found_good_border = false;\n\n  // Find the bottom of the table by trying a few different locations. For\n  // each location, the top, left, and right are fixed. 
We start the search\n  // in a smaller table to favor best_cols getting a good estimate sooner.\n  int last_bottom = INT32_MAX;\n  int bottom =\n      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);\n  int top =\n      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);\n  adjusted.set_top(top);\n\n  // Headers/footers can be spaced far from everything.\n  // Make sure that the space below is greater than the space above\n  // the lowest row.\n  int previous_below = 0;\n  const int kMaxChances = 10;\n  int chances = kMaxChances;\n  while (bottom != last_bottom) {\n    adjusted.set_bottom(bottom);\n\n    if (adjusted.height() >= min_height_) {\n      // Try to fit the grid on the current box. We give it a chance\n      // if the number of columns didn't significantly drop.\n      table->set_bounding_box(adjusted);\n      if (table->FindWhitespacedStructure() &&\n          table->column_count() >= best_cols * kRequiredColumns) {\n        if (false && IsWeakTableRow(table, 0)) {\n          // Currently buggy, but was looking promising so disabled.\n          --chances;\n        } else {\n          // We favor 2 things,\n          //   1- Adding rows that have partitioned data.\n          //   2- Better margins (to find header/footer).\n          // For better tables, we just look for multiple cells in the\n          // bottom row with data in them.\n          // For margins, the space below the last row should\n          // be better than a table with the last row removed.\n          chances = kMaxChances;\n          double max_row_height = kMaxRowSize * table->median_cell_height();\n          if ((table->space_below() * kMarginFactor >= best_below &&\n               table->space_below() >= previous_below) ||\n              (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {\n            best_box.set_bottom(bottom);\n            best_below = 
table->space_below();\n            best_cols = std::max(table->column_count(), best_cols);\n            found_good_border = true;\n          }\n        }\n        previous_below = table->space_below();\n      } else {\n        --chances;\n      }\n    }\n    if (chances <= 0) {\n      break;\n    }\n\n    last_bottom = bottom;\n    bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);\n  }\n  if (!found_good_border) {\n    return false;\n  }\n\n  // TODO(nbeato) comments: follow modified code above... put it in a function!\n  found_good_border = false;\n  int last_top = INT32_MIN;\n  top =\n      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);\n  int previous_above = 0;\n  chances = kMaxChances;\n\n  adjusted.set_bottom(best_box.bottom());\n  while (last_top != top) {\n    adjusted.set_top(top);\n    if (adjusted.height() >= min_height_) {\n      table->set_bounding_box(adjusted);\n      if (table->FindWhitespacedStructure() &&\n          table->column_count() >= best_cols * kRequiredColumns) {\n        int last_row = table->row_count() - 1;\n        if (false && IsWeakTableRow(table, last_row)) {\n          // Currently buggy, but was looking promising so disabled.\n          --chances;\n        } else {\n          chances = kMaxChances;\n          double max_row_height = kMaxRowSize * table->median_cell_height();\n          if ((table->space_above() * kMarginFactor >= best_above &&\n               table->space_above() >= previous_above) ||\n              (table->CountFilledCellsInRow(last_row) > 1 &&\n               table->row_height(last_row) < max_row_height)) {\n            best_box.set_top(top);\n            best_above = table->space_above();\n            best_cols = std::max(table->column_count(), best_cols);\n            found_good_border = true;\n          }\n        }\n        previous_above = table->space_above();\n      } else {\n        --chances;\n      }\n    }\n    if 
(chances <= 0) {\n      break;\n    }\n\n    last_top = top;\n    top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);\n  }\n\n  if (!found_good_border) {\n    return false;\n  }\n\n  // If we get here, this shouldn't happen. It can be an assert, but\n  // I haven't tested it enough to make it crash things.\n  if (best_box.null_box()) {\n    return false;\n  }\n\n  // Given the best locations, fit the box to those locations.\n  table->set_bounding_box(best_box);\n  return table->FindWhitespacedStructure();\n}\n\n// Finds the closest value to y that can safely cause a horizontal\n// split in the partitions.\n// This function has been buggy and not as reliable as I would've\n// liked. I suggest finding all of the splits using the\n// FindPartitionLocations once and then just keeping the results\n// of that function cached somewhere.\nint TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) {\n  ColPartitionGridSearch gsearch(text_grid_);\n  gsearch.SetUniqueMode(true);\n  gsearch.StartVerticalSearch(left, right, y);\n  ColPartition *text = nullptr;\n  int last_y = y;\n  while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {\n    if (!text->IsTextType() || !text->IsHorizontalType()) {\n      continue;\n    }\n    if (text->bounding_box().height() > max_text_height_) {\n      continue;\n    }\n\n    const TBOX &text_box = text->bounding_box();\n    if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {\n      last_y = std::min(last_y, static_cast<int>(text_box.bottom()));\n      continue;\n    }\n    if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {\n      last_y = std::max(last_y, static_cast<int>(text_box.top()));\n      continue;\n    }\n\n    return last_y;\n  }\n  // If none is found, we at least want to preserve the min/max,\n  // which defines the overlap of y with the last partition in the grid.\n  return last_y;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/tablerecog.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tablerecog.h\n// Description: Functions to detect structure of tables.\n// Author:      Nicholas Beato\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TABLERECOG_H_\n#define TABLERECOG_H_\n\n#include \"colpartitiongrid.h\"\n\nnamespace tesseract {\n\n// There are 2 classes in this file. They have 2 different purposes.\n//  - StructuredTable contains the methods to find the structure given\n//    a specific bounding box and grow that structure.\n//  - TableRecognizer contains the methods to adjust the possible positions\n//    of a table without worrying about structure.\n//\n// To use these classes, the assumption is that the TableFinder will\n// have a guess of the location of a table (or possibly over/undersegmented\n// tables). The TableRecognizer is responsible for finding the table boundaries\n// at a high level. 
The StructuredTable class is responsible for determining\n// the structure of the table and trying to maximize its bounds while retaining\n// the structure.\n// (The latter part is not implemented yet, but that was the goal).\n//\n// While on the boundary discussion, keep in mind that this is a first pass.\n// There should eventually be some things like internal structure checks,\n// and, more importantly, surrounding text flow checks.\n//\n\n// Usage:\n// The StructuredTable class contains methods to query a potential table.\n// It has functions to find structure, count rows, find ColPartitions that\n// intersect gridlines, etc. It is not meant to blindly find a table. It\n// is meant to start with a known table location and enhance it.\n// Usage:\n//    ColPartitionGrid text_grid, line_grid;  // init\n//    TBOX table_box;  // known location of table location\n//\n//    StructuredTable table;\n//    table.Init();  // construction code\n//    table.set_text_grid(/* text */);  // These 2 grids can be the same!\n//    table.set_line_grid(/* lines */);\n//    table.set_min_text_height(10);    // Filter vertical and tall text.\n//    // IMPORTANT! The table needs to be told where it is!\n//    table.set_bounding_box(table_box);  // Set initial table location.\n//    if (table.FindWhitespacedStructure()) {\n//      // process table\n//      table.column_count();  // number of columns\n//      table.row_count();     // number of rows\n//      table.cells_count();   // number of cells\n//      table.bounding_box();  // updated bounding box\n//      // etc.\n//    }\n//\nclass TESS_API StructuredTable {\npublic:\n  StructuredTable();\n  ~StructuredTable() = default;\n\n  // Initialization code. Must be called after the constructor.\n  void Init();\n\n  // Sets the grids used by the table. These can be changed between\n  // calls to Recognize. 
They are treated as read-only data.\n  void set_text_grid(ColPartitionGrid *text);\n  void set_line_grid(ColPartitionGrid *lines);\n  // Filters text partitions that are ridiculously tall to prevent\n  // merging rows.\n  void set_max_text_height(int height);\n\n  // Basic accessors. Some are treated as attributes despite having indirect\n  // representation.\n  bool is_lined() const;\n  unsigned row_count() const;\n  unsigned column_count() const;\n  unsigned cell_count() const;\n  void set_bounding_box(const TBOX &box);\n  const TBOX &bounding_box() const;\n  int median_cell_height();\n  int median_cell_width();\n  int row_height(unsigned row) const;\n  int column_width(unsigned column) const;\n  int space_above() const;\n  int space_below() const;\n\n  // Given enough horizontal and vertical lines in a region, create this table\n  // based on the structure given by the lines. Return true if it worked out.\n  // Code assumes the lines exist. It is the caller's responsibility to check\n  // for lines and find an appropriate bounding box.\n  bool FindLinedStructure();\n\n  // The main subroutine for finding generic table structure. The function\n  // finds the grid structure in the given box. 
Returns true if a good grid\n  // exists, implying that \"this\" table is valid.\n  bool FindWhitespacedStructure();\n\n  ////////\n  //////// Functions to query table info.\n  ////////\n\n  // Returns true if inserting part into the table does not cause any\n  // cell merges.\n  bool DoesPartitionFit(const ColPartition &part) const;\n  // Checks if a sub-table has multiple data cells filled.\n  int CountFilledCells();\n  int CountFilledCellsInRow(int row);\n  int CountFilledCellsInColumn(int column);\n  int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end);\n\n  // Makes sure that at least one cell in a row has substantial area filled.\n  // This can filter out large whitespace caused by growing tables too far\n  // and page numbers.\n  // (currently bugged for some reason).\n  bool VerifyRowFilled(int row);\n  // Finds the filled area in a cell.\n  double CalculateCellFilledPercentage(unsigned row, unsigned column);\n\n  // Debug display, draws the table in the given color. If the table is not\n  // valid, the table and \"best\" grid lines are still drawn in the given color.\n  void Display(ScrollView *window, ScrollView::Color color);\n\nprotected:\n  // Clear the structure information.\n  void ClearStructure();\n\n  ////////\n  //////// Lined tables\n  ////////\n\n  // Verifies the lines do not intersect partitions. This happens when\n  // the lines are in column boundaries and extend the full page. As a result,\n  // the grid lines go through column text. The condition is detectable.\n  bool VerifyLinedTableCells();\n\n  ////////\n  //////// Tables with whitespace\n  ////////\n\n  // This is the function to change if you want to filter resulting tables\n  // better. 
Right now it just checks for a minimum cell count and such.\n  // You could add things like maximum number of ColPartitions per cell or\n  // similar.\n  bool VerifyWhitespacedTable();\n  // Find the columns of a table using whitespace.\n  void FindWhitespacedColumns();\n  // Find the rows of a table using whitespace.\n  void FindWhitespacedRows();\n\n  ////////\n  //////// Functions to provide information about the table.\n  ////////\n\n  // Calculates the whitespace around the table using the table boundary and\n  // the supplied grids (set_text_grid and set_line_grid).\n  void CalculateMargins();\n  // Update the table margins with the supplied grid. This is\n  // only called by calculate margins to use multiple grid sources.\n  void UpdateMargins(ColPartitionGrid *grid);\n  int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const;\n  int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const;\n  // Calculates stats on the table, namely the median cell height and width.\n  void CalculateStats();\n\n  ////////\n  //////// Functions to try to \"fix\" some table errors.\n  ////////\n\n  // Given a whitespaced table, this looks for bordering lines that might\n  // be page layout boxes around the table. It is necessary to get the margins\n  // correct on the table. If the lines are not joined, the margins will be\n  // the distance to the line, which is not right.\n  void AbsorbNearbyLines();\n\n  // Nice utility function for finding partition gaps. You feed it a sorted\n  // list of all of the mins/maxes of the partitions in the table, and it gives\n  // you the gaps (middle). 
This works for both vertical and horizontal\n  // gaps.\n  //\n  // If you want to allow slight overlap in the division and the partitions,\n  // just scale down the partitions before inserting them in the list.\n  // Likewise, you can force at least some space between partitions.\n  // This trick is how the horizontal partitions are done (since the page\n  // skew could make it hard to find splits in the text).\n  //\n  // As a result, \"0 distance\" between closest partitions causes a gap.\n  // This is not a programmatic assumption. It is intentional and simplifies\n  // things.\n  //\n  // \"max_merged\" indicates both the minimum number of stacked partitions\n  // to cause a cell (add 1 to it), and the maximum number of partitions that\n  // a grid line can intersect. For example, if max_merged is 0, then lines\n  // are inserted wherever space exists between partitions. If it is 2,\n  // lines may intersect 2 partitions at most, but you also need at least\n  // 2 partitions to generate a line.\n  static void FindCellSplitLocations(const std::vector<int> &min_list,\n                                     const std::vector<int> &max_list, int max_merged,\n                                     std::vector<int> *locations);\n\n  ////////\n  //////// Utility function for table queries\n  ////////\n\n  // Counts the number of ColPartitions that intersect vertical cell\n  // division at this x value. 
Used by VerifyLinedTable.\n  int CountVerticalIntersections(int x);\n  int CountHorizontalIntersections(int y);\n\n  // Counts how many text partitions are in this box.\n  int CountPartitions(const TBOX &box);\n\n  ////////\n  //////// Data members.\n  ////////\n\n  // Input data, used as read only data to make decisions.\n  ColPartitionGrid *text_grid_; // Text ColPartitions\n  ColPartitionGrid *line_grid_; // Line ColPartitions\n  // Table structure.\n  // bounding box is a convenient external representation.\n  // cell_x_ and cell_y_ indicate the grid lines.\n  TBOX bounding_box_;         // Bounding box\n  std::vector<int> cell_x_; // Locations of vertical divisions (sorted)\n  std::vector<int> cell_y_; // Locations of horizontal divisions (sorted)\n  bool is_lined_;             // Is the table backed up by a line structure\n  // Table margins, set via CalculateMargins\n  int space_above_;\n  int space_below_;\n  int space_left_;\n  int space_right_;\n  int median_cell_height_;\n  int median_cell_width_;\n  // Filters, used to prevent awkward partitions from destroying structure.\n  int max_text_height_;\n};\n\nclass TESS_API TableRecognizer {\npublic:\n  TableRecognizer() = default;\n  ~TableRecognizer() = default;\n\n  // Initialization code. Must be called after the constructor.\n  void Init();\n\n  ////////\n  //////// Pre-recognize methods to initial table constraints.\n  ////////\n\n  // Sets the grids used by the table. These can be changed between\n  // calls to Recognize. They are treated as read-only data.\n  void set_text_grid(ColPartitionGrid *text);\n  void set_line_grid(ColPartitionGrid *lines);\n  // Sets some additional constraints on the table.\n  void set_min_height(int height);\n  void set_min_width(int width);\n  // Filters text partitions that are ridiculously tall to prevent\n  // merging rows. 
Note that \"filters\" refers to allowing horizontal\n  // cells to slice through them on the premise that they were\n  // merged text rows during previous layout.\n  void set_max_text_height(int height);\n\n  // Given a guess location, the RecognizeTable function will try to find a\n  // structured grid in the area. On success, it will return a new\n  // StructuredTable (and assumes you will delete it). Otherwise,\n  // nullptr is returned.\n  //\n  // Keep in mind, this may \"overgrow\" or \"undergrow\" the size of guess.\n  // Ideally, there is either a one-to-one correspondence between\n  // the guess and table or no table at all. This is not the best of\n  // assumptions right now, but was made to try to keep things simple in\n  // the first pass.\n  //\n  // If a line structure is available on the page in the given region,\n  // the table will use the linear structure as it is.\n  // Otherwise, it will try to maximize the whitespace around it while keeping\n  // a grid structure. This is somewhat working.\n  //\n  // Since the combination of adjustments can get high, effort was\n  // originally made to keep the number of adjustments linear in the number\n  // of partitions. The underlying structure finding code used to be\n  // much more complex. I don't know how necessary this constraint is anymore.\n  // The evaluation of a possible table is kept within O(nlogn) in the size of\n  // the table (where size is the number of partitions in the table).\n  // As a result, the algorithm is capable of O(n^2 log n). Depending\n  // on the grid search size, it may be higher.\n  //\n  // Last note: it is possible to just try all partition boundaries at a high\n  // level O(n^4) and do a verification scheme (at least O(nlogn)). If there\n  // area 200 partitions on a page, this could be too costly. Effort could go\n  // into pruning the search, but I opted for something quicker. 
I'm confident\n  // that the independent adjustments can get similar results and keep the\n  // complextiy down. However, the other approach could work without using\n  // TableFinder at all if it is fast enough.  It comes down to properly\n  // deciding what is a table. The code currently relies on TableFinder's\n  // guess to the location of a table for that.\n  StructuredTable *RecognizeTable(const TBOX &guess_box);\n\nprotected:\n  ////////\n  //////// Lined tables\n  ////////\n\n  // Returns true if the given box has a lined table within it. The\n  // table argument will be updated with the table if the table exists.\n  bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table);\n  // Returns true if the given box has a large number of horizontal and\n  // vertical lines present. If so, we assume the extent of these lines\n  // uniquely defines a table and find that table via SolveLinedTable.\n  bool HasSignificantLines(const TBOX &guess);\n\n  // Given enough horizontal and vertical lines in a region, find a bounding\n  // box that encloses all of them (as well as newly introduced lines).\n  // The bounding box is the smallest box that encloses the lines in guess\n  // without having any lines sticking out of it.\n  // bounding_box is an in/out parameter.\n  // On input, it in the extents of the box to search.\n  // On output, it is the resulting bounding box.\n  bool FindLinesBoundingBox(TBOX *bounding_box);\n  // Iteration in above search.\n  // bounding_box is an in/out parameter.\n  // On input, it in the extents of the box to search.\n  // On output, it is the resulting bounding box.\n  bool FindLinesBoundingBoxIteration(TBOX *bounding_box);\n\n  ////////\n  //////// Generic \"whitespaced\" tables\n  ////////\n\n  // Returns true if the given box has a whitespaced table within it. The\n  // table argument will be updated if the table exists. 
Also note\n  // that this method will fail if the guess_box center is not\n  // mostly within the table.\n  bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table);\n\n  // Finds the location of a horizontal split relative to y.\n  // This function is mostly unused now. If the SolveWhitespacedTable\n  // changes much, it can be removed. Note, it isn't really as reliable\n  // as I thought. I went with alternatives for most of the other uses.\n  int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);\n\n  // Input data, used as read only data to make decisions.\n  ColPartitionGrid *text_grid_ = nullptr; // Text ColPartitions\n  ColPartitionGrid *line_grid_ = nullptr; // Line ColPartitions\n  // Table constraints, a \"good\" table must satisfy these.\n  int min_height_ = 0;\n  int min_width_ = 0;\n  // Filters, used to prevent awkward partitions from destroying structure.\n  int max_text_height_ = INT32_MAX; // Horizontal lines may intersect taller text.\n};\n\n} // namespace tesseract\n\n#endif /* TABLERECOG_H_ */\n"
  },
  {
    "path": "src/textord/tabvector.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tabvector.cpp\n// Description: Class to hold a near-vertical vector representing a tab-stop.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"blobbox.h\"\n#include \"colfind.h\"\n#include \"colpartitionset.h\"\n#include \"detlinefit.h\"\n#include \"helpers.h\" // for IntCastRounded\n#include \"statistc.h\"\n#include \"tabvector.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\n// Multiple of height used as a gutter for evaluation search.\nconst int kGutterMultiple = 4;\n// Multiple of neighbour gap that we expect the gutter gap to be at minimum.\nconst int kGutterToNeighbourRatio = 3;\n// Pixel distance for tab vectors to be considered the same.\nconst int kSimilarVectorDist = 10;\n// Pixel distance for ragged tab vectors to be considered the same if there\n// is nothing in the overlap box\nconst int kSimilarRaggedDist = 50;\n// Max multiple of height to allow filling in between blobs when evaluating.\nconst int kMaxFillinMultiple = 11;\n// Min fraction of mean gutter size to allow a gutter on a good tab blob.\nconst double kMinGutterFraction = 0.5;\n// Multiple of 1/n lines as a minimum gutter in evaluation.\nconst double 
kLineCountReciprocal = 4.0;\n// Constant add-on for minimum gutter for aligned tabs.\nconst double kMinAlignedGutter = 0.25;\n// Constant add-on for minimum gutter for ragged tabs.\nconst double kMinRaggedGutter = 1.5;\n\ndouble_VAR(textord_tabvector_vertical_gap_fraction, 0.5,\n           \"max fraction of mean blob width allowed for vertical gaps in \"\n           \"vertical text\");\n\ndouble_VAR(textord_tabvector_vertical_box_ratio, 0.5,\n           \"Fraction of box matches required to declare a line vertical\");\n\n// Create a constraint for the top or bottom of this TabVector.\nvoid TabConstraint::CreateConstraint(TabVector *vector, bool is_top) {\n  auto *constraint = new TabConstraint(vector, is_top);\n  auto *constraints = new TabConstraint_LIST;\n  TabConstraint_IT it(constraints);\n  it.add_to_end(constraint);\n  if (is_top) {\n    vector->set_top_constraints(constraints);\n  } else {\n    vector->set_bottom_constraints(constraints);\n  }\n}\n\n// Test to see if the constraints are compatible enough to merge.\nbool TabConstraint::CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2) {\n  if (list1 == list2) {\n    return false;\n  }\n  int y_min = -INT32_MAX;\n  int y_max = INT32_MAX;\n  if (textord_debug_tabfind > 3) {\n    tprintf(\"Testing constraint compatibility\\n\");\n  }\n  GetConstraints(list1, &y_min, &y_max);\n  GetConstraints(list2, &y_min, &y_max);\n  if (textord_debug_tabfind > 3) {\n    tprintf(\"Resulting range = [%d,%d]\\n\", y_min, y_max);\n  }\n  return y_max >= y_min;\n}\n\n// Merge the lists of constraints and update the TabVector pointers.\n// The second list is deleted.\nvoid TabConstraint::MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2) {\n  if (list1 == list2) {\n    return;\n  }\n  TabConstraint_IT it(list2);\n  if (textord_debug_tabfind > 3) {\n    tprintf(\"Merging constraints\\n\");\n  }\n  // The vectors of all constraints on list2 are now going to be on list1.\n  for 
(it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabConstraint *constraint = it.data();\n    if (textord_debug_tabfind > 3) {\n      constraint->vector_->Print(\"Merge\");\n    }\n    if (constraint->is_top_) {\n      constraint->vector_->set_top_constraints(list1);\n    } else {\n      constraint->vector_->set_bottom_constraints(list1);\n    }\n  }\n  it = list1;\n  it.add_list_before(list2);\n  delete list2;\n}\n\n// Set all the tops and bottoms as appropriate to a mean of the\n// constrained range. Delete all the constraints and list.\nvoid TabConstraint::ApplyConstraints(TabConstraint_LIST *constraints) {\n  int y_min = -INT32_MAX;\n  int y_max = INT32_MAX;\n  GetConstraints(constraints, &y_min, &y_max);\n  int y = (y_min + y_max) / 2;\n  TabConstraint_IT it(constraints);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabConstraint *constraint = it.data();\n    TabVector *v = constraint->vector_;\n    if (constraint->is_top_) {\n      v->SetYEnd(y);\n      v->set_top_constraints(nullptr);\n    } else {\n      v->SetYStart(y);\n      v->set_bottom_constraints(nullptr);\n    }\n  }\n  delete constraints;\n}\n\nTabConstraint::TabConstraint(TabVector *vector, bool is_top) : vector_(vector), is_top_(is_top) {\n  if (is_top) {\n    y_min_ = vector->endpt().y();\n    y_max_ = vector->extended_ymax();\n  } else {\n    y_max_ = vector->startpt().y();\n    y_min_ = vector->extended_ymin();\n  }\n}\n\n// Get the max of the mins and the min of the maxes.\nvoid TabConstraint::GetConstraints(TabConstraint_LIST *constraints, int *y_min, int *y_max) {\n  TabConstraint_IT it(constraints);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabConstraint *constraint = it.data();\n    if (textord_debug_tabfind > 3) {\n      tprintf(\"Constraint is [%d,%d]\", constraint->y_min_, constraint->y_max_);\n      constraint->vector_->Print(\" for\");\n    }\n    *y_min = std::max(*y_min, constraint->y_min_);\n    *y_max = std::min(*y_max, 
constraint->y_max_);\n  }\n}\n\n// The constructor is private. See the bottom of the file...\n\n// Public factory to build a TabVector from a list of boxes.\n// The TabVector will be of the given alignment type.\n// The input vertical vector is used in fitting, and the output\n// vertical_x, vertical_y have the resulting line vector added to them\n// if the alignment is not ragged.\n// The extended_start_y and extended_end_y are the maximum possible\n// extension to the line segment that can be used to align with others.\n// The input CLIST of BLOBNBOX good_points is consumed and taken over.\nTabVector *TabVector::FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y,\n                                int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x,\n                                int *vertical_y) {\n  auto *vector = new TabVector(extended_start_y, extended_end_y, alignment, good_points);\n  if (!vector->Fit(vertical, false)) {\n    delete vector;\n    return nullptr;\n  }\n  if (!vector->IsRagged()) {\n    vertical = vector->endpt_ - vector->startpt_;\n    int weight = vector->BoxCount();\n    *vertical_x += vertical.x() * weight;\n    *vertical_y += vertical.y() * weight;\n  }\n  return vector;\n}\n\n// Build a ragged TabVector by copying another's direction, shifting it\n// to match the given blob, and making its initial extent the height\n// of the blob, but its extended bounds from the bounds of the original.\nTabVector::TabVector(const TabVector &src, TabAlignment alignment, const ICOORD &vertical_skew,\n                     BLOBNBOX *blob)\n    : extended_ymin_(src.extended_ymin_)\n    , extended_ymax_(src.extended_ymax_)\n    , needs_refit_(true)\n    , needs_evaluation_(true)\n    , alignment_(alignment) {\n  BLOBNBOX_C_IT it(&boxes_);\n  it.add_to_end(blob);\n  TBOX box = blob->bounding_box();\n  if (IsLeftTab()) {\n    startpt_ = box.botleft();\n    endpt_ = box.topleft();\n  } else {\n    startpt_ = box.botright();\n   
 endpt_ = box.topright();\n  }\n  sort_key_ =\n      SortKey(vertical_skew, (startpt_.x() + endpt_.x()) / 2, (startpt_.y() + endpt_.y()) / 2);\n  if (textord_debug_tabfind > 3) {\n    Print(\"Constructed a new tab vector:\");\n  }\n}\n\n// Copies basic attributes of a tab vector for simple operations.\n// Copies things such startpt, endpt, range.\n// Does not copy things such as partners, boxes, or constraints.\n// This is useful if you only need vector information for processing, such\n// as in the table detection code.\nTabVector *TabVector::ShallowCopy() const {\n  auto *copy = new TabVector();\n  copy->startpt_ = startpt_;\n  copy->endpt_ = endpt_;\n  copy->alignment_ = alignment_;\n  copy->extended_ymax_ = extended_ymax_;\n  copy->extended_ymin_ = extended_ymin_;\n  copy->intersects_other_lines_ = intersects_other_lines_;\n  return copy;\n}\n\n// Extend this vector to include the supplied blob if it doesn't\n// already have it.\nvoid TabVector::ExtendToBox(BLOBNBOX *new_blob) {\n  TBOX new_box = new_blob->bounding_box();\n  BLOBNBOX_C_IT it(&boxes_);\n  if (!it.empty()) {\n    BLOBNBOX *blob = it.data();\n    TBOX box = blob->bounding_box();\n    while (!it.at_last() && box.top() <= new_box.top()) {\n      if (blob == new_blob) {\n        return; // We have it already.\n      }\n      it.forward();\n      blob = it.data();\n      box = blob->bounding_box();\n    }\n    if (box.top() >= new_box.top()) {\n      it.add_before_stay_put(new_blob);\n      needs_refit_ = true;\n      return;\n    }\n  }\n  needs_refit_ = true;\n  it.add_after_stay_put(new_blob);\n}\n\n// Set the ycoord of the start and move the xcoord to match.\nvoid TabVector::SetYStart(int start_y) {\n  startpt_.set_x(XAtY(start_y));\n  startpt_.set_y(start_y);\n}\n// Set the ycoord of the end and move the xcoord to match.\nvoid TabVector::SetYEnd(int end_y) {\n  endpt_.set_x(XAtY(end_y));\n  endpt_.set_y(end_y);\n}\n\n// Rotate the ends by the given vector. 
Auto flip start and end if needed.\nvoid TabVector::Rotate(const FCOORD &rotation) {\n  startpt_.rotate(rotation);\n  endpt_.rotate(rotation);\n  int dx = endpt_.x() - startpt_.x();\n  int dy = endpt_.y() - startpt_.y();\n  if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {\n    // Need to flip start/end.\n    ICOORD tmp = startpt_;\n    startpt_ = endpt_;\n    endpt_ = tmp;\n  }\n}\n\n// Setup the initial constraints, being the limits of\n// the vector and the extended ends.\nvoid TabVector::SetupConstraints() {\n  TabConstraint::CreateConstraint(this, false);\n  TabConstraint::CreateConstraint(this, true);\n}\n\n// Setup the constraints between the partners of this TabVector.\nvoid TabVector::SetupPartnerConstraints() {\n  // With the first and last partner, we want a common bottom and top,\n  // respectively, and for each change of partner, we want a common\n  // top of first with bottom of next.\n  TabVector_C_IT it(&partners_);\n  TabVector *prev_partner = nullptr;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *partner = it.data();\n    if (partner->top_constraints_ == nullptr || partner->bottom_constraints_ == nullptr) {\n      partner->Print(\"Impossible: has no constraints\");\n      Print(\"This vector has it as a partner\");\n      continue;\n    }\n    if (prev_partner == nullptr) {\n      // This is the first partner, so common bottom.\n      if (TabConstraint::CompatibleConstraints(bottom_constraints_, partner->bottom_constraints_)) {\n        TabConstraint::MergeConstraints(bottom_constraints_, partner->bottom_constraints_);\n      }\n    } else {\n      // We need prev top to be common with partner bottom.\n      if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,\n                                               partner->bottom_constraints_)) {\n        TabConstraint::MergeConstraints(prev_partner->top_constraints_,\n                                        
partner->bottom_constraints_);\n      }\n    }\n    prev_partner = partner;\n    if (it.at_last()) {\n      // This is the last partner, so common top.\n      if (TabConstraint::CompatibleConstraints(top_constraints_, partner->top_constraints_)) {\n        TabConstraint::MergeConstraints(top_constraints_, partner->top_constraints_);\n      }\n    }\n  }\n}\n\n// Setup the constraints between this and its partner.\nvoid TabVector::SetupPartnerConstraints(TabVector *partner) {\n  if (TabConstraint::CompatibleConstraints(bottom_constraints_, partner->bottom_constraints_)) {\n    TabConstraint::MergeConstraints(bottom_constraints_, partner->bottom_constraints_);\n  }\n  if (TabConstraint::CompatibleConstraints(top_constraints_, partner->top_constraints_)) {\n    TabConstraint::MergeConstraints(top_constraints_, partner->top_constraints_);\n  }\n}\n\n// Use the constraints to modify the top and bottom.\nvoid TabVector::ApplyConstraints() {\n  if (top_constraints_ != nullptr) {\n    TabConstraint::ApplyConstraints(top_constraints_);\n  }\n  if (bottom_constraints_ != nullptr) {\n    TabConstraint::ApplyConstraints(bottom_constraints_);\n  }\n}\n\n// Merge close tab vectors of the same side that overlap.\nvoid TabVector::MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors,\n                                       BlobGrid *grid) {\n  TabVector_IT it1(vectors);\n  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {\n    TabVector *v1 = it1.data();\n    TabVector_IT it2(it1);\n    for (it2.forward(); !it2.at_first(); it2.forward()) {\n      TabVector *v2 = it2.data();\n      if (v2->SimilarTo(vertical, *v1, grid)) {\n        // Merge into the forward one, in case the combined vector now\n        // overlaps one in between.\n        if (textord_debug_tabfind) {\n          v2->Print(\"Merging\");\n          v1->Print(\"by deleting\");\n        }\n        v2->MergeWith(vertical, it1.extract());\n        if (textord_debug_tabfind) {\n          
v2->Print(\"Producing\");\n        }\n        ICOORD merged_vector = v2->endpt();\n        merged_vector -= v2->startpt();\n        if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {\n          v2->Print(\"Garbage result of merge?\");\n        }\n        break;\n      }\n    }\n  }\n}\n\n// Return true if this vector is the same side, overlaps, and close\n// enough to the other to be merged.\nbool TabVector::SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const {\n  if ((IsRightTab() && other.IsRightTab()) || (IsLeftTab() && other.IsLeftTab())) {\n    // If they don't overlap, at least in extensions, then there is no chance.\n    if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0) {\n      return false;\n    }\n    // A fast approximation to the scale factor of the sort_key_.\n    int v_scale = abs(vertical.y());\n    if (v_scale == 0) {\n      v_scale = 1;\n    }\n    // If they are close enough, then OK.\n    if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&\n        sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_) {\n      return true;\n    }\n    // Ragged tabs get a bigger threshold.\n    if (!IsRagged() || !other.IsRagged() ||\n        sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||\n        sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_) {\n      return false;\n    }\n    if (grid == nullptr) {\n      // There is nothing else to test!\n      return true;\n    }\n    // If there is nothing in the rectangle between the vector that is going to\n    // move, and the place it is moving to, then they can be merged.\n    // Setup a vertical search for any blob.\n    const TabVector *mover = (IsRightTab() && sort_key_ < other.sort_key_) ? 
this : &other;\n    int top_y = mover->endpt_.y();\n    int bottom_y = mover->startpt_.y();\n    int left = std::min(mover->XAtY(top_y), mover->XAtY(bottom_y));\n    int right = std::max(mover->XAtY(top_y), mover->XAtY(bottom_y));\n    int shift = abs(sort_key_ - other.sort_key_) / v_scale;\n    if (IsRightTab()) {\n      right += shift;\n    } else {\n      left -= shift;\n    }\n\n    GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> vsearch(grid);\n    vsearch.StartVerticalSearch(left, right, top_y);\n    BLOBNBOX *blob;\n    while ((blob = vsearch.NextVerticalSearch(true)) != nullptr) {\n      const TBOX &box = blob->bounding_box();\n      if (box.top() > bottom_y) {\n        return true; // Nothing found.\n      }\n      if (box.bottom() < top_y) {\n        continue; // Doesn't overlap.\n      }\n      int left_at_box = XAtY(box.bottom());\n      int right_at_box = left_at_box;\n      if (IsRightTab()) {\n        right_at_box += shift;\n      } else {\n        left_at_box -= shift;\n      }\n      if (std::min(right_at_box, static_cast<int>(box.right())) >\n          std::max(left_at_box, static_cast<int>(box.left()))) {\n        return false;\n      }\n    }\n    return true; // Nothing found.\n  }\n  return false;\n}\n\n// Eat the other TabVector into this and delete it.\nvoid TabVector::MergeWith(const ICOORD &vertical, TabVector *other) {\n  extended_ymin_ = std::min(extended_ymin_, other->extended_ymin_);\n  extended_ymax_ = std::max(extended_ymax_, other->extended_ymax_);\n  if (other->IsRagged()) {\n    alignment_ = other->alignment_;\n  }\n  // Merge sort the two lists of boxes.\n  BLOBNBOX_C_IT it1(&boxes_);\n  BLOBNBOX_C_IT it2(&other->boxes_);\n  while (!it2.empty()) {\n    BLOBNBOX *bbox2 = it2.extract();\n    it2.forward();\n    TBOX box2 = bbox2->bounding_box();\n    BLOBNBOX *bbox1 = it1.data();\n    TBOX box1 = bbox1->bounding_box();\n    while (box1.bottom() < box2.bottom() && !it1.at_last()) {\n      it1.forward();\n      bbox1 = 
it1.data();\n      box1 = bbox1->bounding_box();\n    }\n    if (box1.bottom() < box2.bottom()) {\n      it1.add_to_end(bbox2);\n    } else if (bbox1 != bbox2) {\n      it1.add_before_stay_put(bbox2);\n    }\n  }\n  Fit(vertical, true);\n  other->Delete(this);\n}\n\n// Add a new element to the list of partner TabVectors.\n// Partners must be added in order of increasing y coordinate of the text line\n// that makes them partners.\n// Groups of identical partners are merged into one.\nvoid TabVector::AddPartner(TabVector *partner) {\n  if (IsSeparator() || partner->IsSeparator()) {\n    return;\n  }\n  TabVector_C_IT it(&partners_);\n  if (!it.empty()) {\n    it.move_to_last();\n    if (it.data() == partner) {\n      return;\n    }\n  }\n  it.add_after_then_move(partner);\n}\n\n// Return true if other is a partner of this.\nbool TabVector::IsAPartner(const TabVector *other) {\n  TabVector_C_IT it(&partners_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    if (it.data() == other) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// These names must be synced with the TabAlignment enum in tabvector.h.\nstatic const char *const kAlignmentNames[] = {\"Left Aligned\",  \"Left Ragged\",  \"Center\",\n                                              \"Right Aligned\", \"Right Ragged\", \"Separator\"};\n\n// Print basic information about this tab vector.\nvoid TabVector::Print(const char *prefix) {\n  tprintf(\n      \"%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,\"\n      \" partners=%d\\n\",\n      prefix, kAlignmentNames[alignment_], startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(),\n      mean_width_, percent_score_, sort_key_, boxes_.length(), partners_.length());\n}\n\n// Print basic information about this tab vector and every box in it.\nvoid TabVector::Debug(const char *prefix) {\n  Print(prefix);\n  BLOBNBOX_C_IT it(&boxes_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *bbox = it.data();\n    
const TBOX &box = bbox->bounding_box();\n    tprintf(\"Box at (%d,%d)->(%d,%d)\\n\", box.left(), box.bottom(), box.right(), box.top());\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Draw this tabvector in place in the given window.\nvoid TabVector::Display(ScrollView *tab_win) {\n  if (textord_debug_printable) {\n    tab_win->Pen(ScrollView::BLUE);\n  } else if (alignment_ == TA_LEFT_ALIGNED) {\n    tab_win->Pen(ScrollView::LIME_GREEN);\n  } else if (alignment_ == TA_LEFT_RAGGED) {\n    tab_win->Pen(ScrollView::DARK_GREEN);\n  } else if (alignment_ == TA_RIGHT_ALIGNED) {\n    tab_win->Pen(ScrollView::PINK);\n  } else if (alignment_ == TA_RIGHT_RAGGED) {\n    tab_win->Pen(ScrollView::CORAL);\n  } else {\n    tab_win->Pen(ScrollView::WHITE);\n  }\n  tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());\n  tab_win->Pen(ScrollView::GREY);\n  tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);\n  tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());\n  auto score_string = std::to_string(percent_score_);\n  tab_win->TextAttributes(\"Times\", 50, false, false, false);\n  tab_win->Text(startpt_.x(), startpt_.y(), score_string.c_str());\n}\n\n#endif\n\n// Refit the line and/or re-evaluate the vector if the dirty flags are set.\nvoid TabVector::FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder) {\n  if (needs_refit_) {\n    Fit(vertical, true);\n  }\n  if (needs_evaluation_) {\n    Evaluate(vertical, finder);\n  }\n}\n\n// Evaluate the vector in terms of coverage of its length by good-looking\n// box edges. A good looking box is one where its nearest neighbour on the\n// inside is nearer than half the distance its nearest neighbour on the\n// outside of the putative column. 
Bad boxes are removed from the line.\n// A second pass then further filters boxes by requiring that the gutter\n// width be a minimum fraction of the mean gutter along the line.\nvoid TabVector::Evaluate(const ICOORD &vertical, TabFind *finder) {\n  bool debug = false;\n  needs_evaluation_ = false;\n  int length = endpt_.y() - startpt_.y();\n  if (length == 0 || boxes_.empty()) {\n    percent_score_ = 0;\n    Print(\"Zero length in evaluate\");\n    return;\n  }\n  // Compute the mean box height.\n  BLOBNBOX_C_IT it(&boxes_);\n  int mean_height = 0;\n  int height_count = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *bbox = it.data();\n    const TBOX &box = bbox->bounding_box();\n    int height = box.height();\n    mean_height += height;\n    ++height_count;\n  }\n  if (height_count > 0) {\n    mean_height /= height_count;\n  }\n  int max_gutter = kGutterMultiple * mean_height;\n  if (IsRagged()) {\n    // Ragged edges face a tougher test in that the gap must always be within\n    // the height of the blob.\n    max_gutter = kGutterToNeighbourRatio * mean_height;\n  }\n\n  STATS gutters(0, max_gutter);\n  // Evaluate the boxes for their goodness, calculating the coverage as we go.\n  // Remove boxes that are not good and shorten the list to the first and\n  // last good boxes.\n  int num_deleted_boxes = 0;\n  bool text_on_image = false;\n  int good_length = 0;\n  const TBOX *prev_good_box = nullptr;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *bbox = it.data();\n    const TBOX &box = bbox->bounding_box();\n    int mid_y = (box.top() + box.bottom()) / 2;\n    if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {\n      if (!debug) {\n        tprintf(\"After already deleting %d boxes, \", num_deleted_boxes);\n        Print(\"Starting evaluation\");\n      }\n      debug = true;\n    }\n    // A good box is one where the nearest neighbour on the inside is closer\n    // than half the 
distance to the nearest neighbour on the outside\n    // (of the putative column).\n    bool left = IsLeftTab();\n    int tab_x = XAtY(mid_y);\n    int gutter_width;\n    int neighbour_gap;\n    finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width,\n                                       &neighbour_gap);\n    if (debug) {\n      tprintf(\"Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\\n\", box.left(), box.bottom(),\n              box.right(), box.top(), gutter_width, neighbour_gap);\n    }\n    // Now we can make the test.\n    if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {\n      // A good box contributes its height to the good_length.\n      good_length += box.top() - box.bottom();\n      gutters.add(gutter_width, 1);\n      // Two good boxes together contribute the gap between them\n      // to the good_length as well, as long as the gap is not\n      // too big.\n      if (prev_good_box != nullptr) {\n        int vertical_gap = box.bottom() - prev_good_box->top();\n        double size1 = sqrt(static_cast<double>(prev_good_box->area()));\n        double size2 = sqrt(static_cast<double>(box.area()));\n        if (vertical_gap < kMaxFillinMultiple * std::min(size1, size2)) {\n          good_length += vertical_gap;\n        }\n        if (debug) {\n          tprintf(\"Box and prev good, gap=%d, target %g, goodlength=%d\\n\", vertical_gap,\n                  kMaxFillinMultiple * std::min(size1, size2), good_length);\n        }\n      } else {\n        // Adjust the start to the first good box.\n        SetYStart(box.bottom());\n      }\n      prev_good_box = &box;\n      if (bbox->flow() == BTFT_TEXT_ON_IMAGE) {\n        text_on_image = true;\n      }\n    } else {\n      // Get rid of boxes that are not good.\n      if (debug) {\n        tprintf(\"Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\\n\", box.left(), box.bottom(),\n                box.right(), box.top(), gutter_width, neighbour_gap);\n      }\n    
  it.extract();\n      ++num_deleted_boxes;\n    }\n  }\n  if (debug) {\n    Print(\"Evaluating:\");\n  }\n  // If there are any good boxes, do it again, except this time get rid of\n  // boxes that have a gutter that is a small fraction of the mean gutter.\n  // This filters out ends that run into a coincidental gap in the text.\n  int search_top = endpt_.y();\n  int search_bottom = startpt_.y();\n  int median_gutter = IntCastRounded(gutters.median());\n  if (gutters.get_total() > 0) {\n    prev_good_box = nullptr;\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      BLOBNBOX *bbox = it.data();\n      const TBOX &box = bbox->bounding_box();\n      int mid_y = (box.top() + box.bottom()) / 2;\n      // A good box is one where the gutter width is at least some constant\n      // fraction of the mean gutter width.\n      bool left = IsLeftTab();\n      int tab_x = XAtY(mid_y);\n      int max_gutter = kGutterMultiple * mean_height;\n      if (IsRagged()) {\n        // Ragged edges face a tougher test in that the gap must always be\n        // within the height of the blob.\n        max_gutter = kGutterToNeighbourRatio * mean_height;\n      }\n      int gutter_width;\n      int neighbour_gap;\n      finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width,\n                                         &neighbour_gap);\n      // Now we can make the test.\n      if (gutter_width >= median_gutter * kMinGutterFraction) {\n        if (prev_good_box == nullptr) {\n          // Adjust the start to the first good box.\n          SetYStart(box.bottom());\n          search_bottom = box.top();\n        }\n        prev_good_box = &box;\n        search_top = box.bottom();\n      } else {\n        // Get rid of boxes that are not good.\n        if (debug) {\n          tprintf(\"Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\\n\", box.left(),\n                  box.bottom(), box.right(), box.top(), gutter_width, 
median_gutter);\n        }\n        it.extract();\n        ++num_deleted_boxes;\n      }\n    }\n  }\n  // If there has been a good box, adjust the end.\n  if (prev_good_box != nullptr) {\n    SetYEnd(prev_good_box->top());\n    // Compute the percentage of the vector that is occupied by good boxes.\n    int length = endpt_.y() - startpt_.y();\n    percent_score_ = 100 * good_length / length;\n    if (num_deleted_boxes > 0) {\n      needs_refit_ = true;\n      FitAndEvaluateIfNeeded(vertical, finder);\n      if (boxes_.empty()) {\n        return;\n      }\n    }\n    // Test the gutter over the whole vector, instead of just at the boxes.\n    int required_shift;\n    if (search_bottom > search_top) {\n      search_bottom = startpt_.y();\n      search_top = endpt_.y();\n    }\n    double min_gutter_width = kLineCountReciprocal / boxes_.length();\n    min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;\n    min_gutter_width *= mean_height;\n    int max_gutter_width = IntCastRounded(min_gutter_width) + 1;\n    if (median_gutter > max_gutter_width) {\n      max_gutter_width = median_gutter;\n    }\n    int gutter_width = finder->GutterWidth(search_bottom, search_top, *this, text_on_image,\n                                           max_gutter_width, &required_shift);\n    if (gutter_width < min_gutter_width) {\n      if (debug) {\n        tprintf(\"Rejecting bad tab Vector with %d gutter vs %g min\\n\", gutter_width,\n                min_gutter_width);\n      }\n      boxes_.shallow_clear();\n      percent_score_ = 0;\n    } else if (debug) {\n      tprintf(\"Final gutter %d, vs limit of %g, required shift = %d\\n\", gutter_width,\n              min_gutter_width, required_shift);\n    }\n  } else {\n    // There are no good boxes left, so score is 0.\n    percent_score_ = 0;\n  }\n\n  if (debug) {\n    Print(\"Evaluation complete:\");\n  }\n}\n\n// (Re)Fit a line to the stored points. Returns false if the line\n// is degenerate. 
Although the TabVector code mostly doesn't care about the\n// direction of lines, XAtY would give silly results for a horizontal line.\n// The class is mostly aimed at use for vertical lines representing\n// horizontal tab stops.\nbool TabVector::Fit(ICOORD vertical, bool force_parallel) {\n  needs_refit_ = false;\n  if (boxes_.empty()) {\n    // Don't refit something with no boxes, as that only happens\n    // in Evaluate, and we don't want to end up with a zero vector.\n    if (!force_parallel) {\n      return false;\n    }\n    // If we are forcing parallel, then we just need to set the sort_key_.\n    ICOORD midpt = startpt_;\n    midpt += endpt_;\n    midpt /= 2;\n    sort_key_ = SortKey(vertical, midpt.x(), midpt.y());\n    return startpt_.y() != endpt_.y();\n  }\n  if (!force_parallel && !IsRagged()) {\n    // Use a fitted line as the vertical.\n    DetLineFit linepoints;\n    BLOBNBOX_C_IT it(&boxes_);\n    // Fit a line to all the boxes in the list.\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      BLOBNBOX *bbox = it.data();\n      const TBOX &box = bbox->bounding_box();\n      int x1 = IsRightTab() ? box.right() : box.left();\n      ICOORD boxpt(x1, box.bottom());\n      linepoints.Add(boxpt);\n      if (it.at_last()) {\n        ICOORD top_pt(x1, box.top());\n        linepoints.Add(top_pt);\n      }\n    }\n    linepoints.Fit(&startpt_, &endpt_);\n    if (startpt_.y() != endpt_.y()) {\n      vertical = endpt_;\n      vertical -= startpt_;\n    }\n  }\n  int start_y = startpt_.y();\n  int end_y = endpt_.y();\n  sort_key_ = IsLeftTab() ? 
INT32_MAX : -INT32_MAX;\n  BLOBNBOX_C_IT it(&boxes_);\n  // Choose a line parallel to the vertical such that all boxes are on the\n  // correct side of it.\n  mean_width_ = 0;\n  int width_count = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *bbox = it.data();\n    const TBOX &box = bbox->bounding_box();\n    mean_width_ += box.width();\n    ++width_count;\n    int x1 = IsRightTab() ? box.right() : box.left();\n    // Test both the bottom and the top, as one will be more extreme, depending\n    // on the direction of skew.\n    int bottom_y = box.bottom();\n    int top_y = box.top();\n    int key = SortKey(vertical, x1, bottom_y);\n    if (IsLeftTab() == (key < sort_key_)) {\n      sort_key_ = key;\n      startpt_ = ICOORD(x1, bottom_y);\n    }\n    key = SortKey(vertical, x1, top_y);\n    if (IsLeftTab() == (key < sort_key_)) {\n      sort_key_ = key;\n      startpt_ = ICOORD(x1, top_y);\n    }\n    if (it.at_first()) {\n      start_y = bottom_y;\n    }\n    if (it.at_last()) {\n      end_y = top_y;\n    }\n  }\n  if (width_count > 0) {\n    mean_width_ = (mean_width_ + width_count - 1) / width_count;\n  }\n  endpt_ = startpt_ + vertical;\n  needs_evaluation_ = true;\n  if (start_y != end_y) {\n    // Set the ends of the vector to fully include the first and last blobs.\n    startpt_.set_x(XAtY(vertical, sort_key_, start_y));\n    startpt_.set_y(start_y);\n    endpt_.set_x(XAtY(vertical, sort_key_, end_y));\n    endpt_.set_y(end_y);\n    return true;\n  }\n  return false;\n}\n\n// Returns the singleton partner if there is one, or nullptr otherwise.\nTabVector *TabVector::GetSinglePartner() {\n  if (!partners_.singleton()) {\n    return nullptr;\n  }\n  TabVector_C_IT partner_it(&partners_);\n  TabVector *partner = partner_it.data();\n  return partner;\n}\n\n// Return the partner of this TabVector if the vector qualifies as\n// being a vertical text line, otherwise nullptr.\nTabVector *TabVector::VerticalTextlinePartner() {\n  if 
(!partners_.singleton()) {\n    return nullptr;\n  }\n  TabVector_C_IT partner_it(&partners_);\n  TabVector *partner = partner_it.data();\n  BLOBNBOX_C_IT box_it1(&boxes_);\n  BLOBNBOX_C_IT box_it2(&partner->boxes_);\n  // Count how many boxes are also in the other list.\n  // At the same time, gather the mean width and median vertical gap.\n  if (textord_debug_tabfind > 1) {\n    Print(\"Testing for vertical text\");\n    partner->Print(\"           partner\");\n  }\n  int num_matched = 0;\n  int num_unmatched = 0;\n  int total_widths = 0;\n  int width = startpt().x() - partner->startpt().x();\n  if (width < 0) {\n    width = -width;\n  }\n  STATS gaps(0, width * 2 - 1);\n  BLOBNBOX *prev_bbox = nullptr;\n  box_it2.mark_cycle_pt();\n  for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {\n    BLOBNBOX *bbox = box_it1.data();\n    TBOX box = bbox->bounding_box();\n    if (prev_bbox != nullptr) {\n      gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);\n    }\n    while (!box_it2.cycled_list() && box_it2.data() != bbox &&\n           box_it2.data()->bounding_box().bottom() < box.bottom()) {\n      box_it2.forward();\n    }\n    if (!box_it2.cycled_list() && box_it2.data() == bbox && bbox->region_type() >= BRT_UNKNOWN &&\n        (prev_bbox == nullptr || prev_bbox->region_type() >= BRT_UNKNOWN)) {\n      ++num_matched;\n    } else {\n      ++num_unmatched;\n    }\n    total_widths += box.width();\n    prev_bbox = bbox;\n  }\n  if (num_unmatched + num_matched == 0) {\n    return nullptr;\n  }\n  double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);\n  double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;\n  int min_box_match =\n      static_cast<int>((num_matched + num_unmatched) * textord_tabvector_vertical_box_ratio);\n  bool is_vertical =\n      (gaps.get_total() > 0 && num_matched >= min_box_match && gaps.median() <= max_gap);\n  if (textord_debug_tabfind > 1) {\n    tprintf(\n        \"gaps=%d, 
matched=%d, unmatched=%d, min_match=%d \"\n        \"median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\\n\",\n        gaps.get_total(), num_matched, num_unmatched, min_box_match, gaps.median(), avg_width,\n        max_gap, is_vertical ? \"Yes\" : \"No\");\n  }\n  return (is_vertical) ? partner : nullptr;\n}\n\n// The constructor is private.\nTabVector::TabVector(int extended_ymin, int extended_ymax, TabAlignment alignment,\n                     BLOBNBOX_CLIST *boxes)\n    : extended_ymin_(extended_ymin)\n    , extended_ymax_(extended_ymax)\n    , sort_key_(0)\n    , percent_score_(0)\n    , mean_width_(0)\n    , needs_refit_(true)\n    , needs_evaluation_(true)\n    , alignment_(alignment)\n    , top_constraints_(nullptr)\n    , bottom_constraints_(nullptr) {\n  BLOBNBOX_C_IT it(&boxes_);\n  it.add_list_after(boxes);\n}\n\n// Delete this, but first, repoint all the partners to point to\n// replacement. If replacement is nullptr, then partner relationships\n// are removed.\nvoid TabVector::Delete(TabVector *replacement) {\n  TabVector_C_IT it(&partners_);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    TabVector *partner = it.data();\n    TabVector_C_IT p_it(&partner->partners_);\n    // If partner already has replacement in its list, then make\n    // replacement null, and just remove this TabVector when we find it.\n    TabVector *partner_replacement = replacement;\n    for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {\n      TabVector *p_partner = p_it.data();\n      if (p_partner == partner_replacement) {\n        partner_replacement = nullptr;\n        break;\n      }\n    }\n    // Remove all references to this, and replace with replacement if not\n    // nullptr.\n    for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {\n      TabVector *p_partner = p_it.data();\n      if (p_partner == this) {\n        p_it.extract();\n        if (partner_replacement != nullptr) {\n          
p_it.add_before_stay_put(partner_replacement);\n        }\n      }\n    }\n    if (partner_replacement != nullptr) {\n      partner_replacement->AddPartner(partner);\n    }\n  }\n  delete this;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/tabvector.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        tabvector.h\n// Description: Class to hold a near-vertical vector representing a tab-stop.\n// Author:      Ray Smith\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_TABVECTOR_H_\n#define TESSERACT_TEXTORD_TABVECTOR_H_\n\n#include \"bbgrid.h\"\n#include \"blobgrid.h\"\n#include \"clst.h\"\n#include \"elst.h\"\n#include \"elst2.h\"\n#include \"rect.h\"\n\n#include <algorithm>\n\nclass BLOBNBOX;\nclass ScrollView;\n\nnamespace tesseract {\n\nextern double_VAR_H(textord_tabvector_vertical_gap_fraction);\nextern double_VAR_H(textord_tabvector_vertical_box_ratio);\n\n// The alignment type that a tab vector represents.\n// Keep this enum synced with kAlignmentNames in tabvector.cpp.\nenum TabAlignment {\n  TA_LEFT_ALIGNED,\n  TA_LEFT_RAGGED,\n  TA_CENTER_JUSTIFIED,\n  TA_RIGHT_ALIGNED,\n  TA_RIGHT_RAGGED,\n  TA_SEPARATOR,\n  TA_COUNT\n};\n\n// Forward declarations. 
The classes use their own list types, so we\n// need to make the list types first.\nclass TabFind;\nclass TabVector;\nclass TabConstraint;\n\nELIST2IZEH(TabVector)\nCLISTIZEH(TabVector)\nELISTIZEH(TabConstraint)\n\n// TabConstraint is a totally self-contained class to maintain\n// a list of [min,max] constraints, each referring to a TabVector.\n// The constraints are manipulated through static methods that act\n// on a list of constraints. The list itself is cooperatively owned\n// by the TabVectors of the constraints on the list and managed\n// by implicit reference counting via the elements of the list.\nclass TabConstraint : public ELIST<TabConstraint>::LINK {\npublic:\n  // This empty constructor is here only so that the class can be ELISTIZED.\n  // TODO(rays) change deep_copy in elst.h line 955 to take a callback copier\n  // and eliminate CLASSNAME##_copier.\n  TabConstraint() = default;\n\n  // Create a constraint for the top or bottom of this TabVector.\n  static void CreateConstraint(TabVector *vector, bool is_top);\n\n  // Test to see if the constraints are compatible enough to merge.\n  static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2);\n\n  // Merge the lists of constraints and update the TabVector pointers.\n  // The second list is deleted.\n  static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2);\n\n  // Set all the tops and bottoms as appropriate to a mean of the\n  // constrained range. 
Delete all the constraints and list.\n  static void ApplyConstraints(TabConstraint_LIST *constraints);\n\nprivate:\n  TabConstraint(TabVector *vector, bool is_top);\n\n  // Get the max of the mins and the min of the maxes.\n  static void GetConstraints(TabConstraint_LIST *constraints, int *y_min, int *y_max);\n\n  // The TabVector this constraint applies to.\n  TabVector *vector_;\n  // If true then we refer to the top of the vector_.\n  bool is_top_;\n  // The allowed range of this vector_.\n  int y_min_;\n  int y_max_;\n};\n\n// Class to hold information about a single vector\n// that represents a tab stop or a rule line.\nclass TabVector : public ELIST2<TabVector>::LINK {\npublic:\n  // TODO(rays) fix this in elst.h line 1076, where it should use the\n  // copy constructor instead of operator=.\n  TabVector() = default;\n  ~TabVector() = default;\n\n  // Public factory to build a TabVector from a list of boxes.\n  // The TabVector will be of the given alignment type.\n  // The input vertical vector is used in fitting, and the output\n  // vertical_x, vertical_y have the resulting line vector added to them\n  // if the alignment is not ragged.\n  // The extended_start_y and extended_end_y are the maximum possible\n  // extension to the line segment that can be used to align with others.\n  // The input CLIST of BLOBNBOX good_points is consumed and taken over.\n  static TabVector *FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y,\n                              int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x,\n                              int *vertical_y);\n\n  // Build a ragged TabVector by copying another's direction, shifting it\n  // to match the given blob, and making its initial extent the height\n  // of the blob, but its extended bounds from the bounds of the original.\n  TabVector(const TabVector &src, TabAlignment alignment, const ICOORD &vertical_skew,\n            BLOBNBOX *blob);\n\n  // Copies basic attributes 
of a tab vector for simple operations.\n  // Copies things such startpt, endpt, range, width.\n  // Does not copy things such as partners, boxes, or constraints.\n  // This is useful if you only need vector information for processing, such\n  // as in the table detection code.\n  TabVector *ShallowCopy() const;\n\n  // Simple accessors.\n  const ICOORD &startpt() const {\n    return startpt_;\n  }\n  const ICOORD &endpt() const {\n    return endpt_;\n  }\n  int extended_ymax() const {\n    return extended_ymax_;\n  }\n  int extended_ymin() const {\n    return extended_ymin_;\n  }\n  int sort_key() const {\n    return sort_key_;\n  }\n  int mean_width() const {\n    return mean_width_;\n  }\n  void set_top_constraints(TabConstraint_LIST *constraints) {\n    top_constraints_ = constraints;\n  }\n  void set_bottom_constraints(TabConstraint_LIST *constraints) {\n    bottom_constraints_ = constraints;\n  }\n  TabVector_CLIST *partners() {\n    return &partners_;\n  }\n  void set_startpt(const ICOORD &start) {\n    startpt_ = start;\n  }\n  void set_endpt(const ICOORD &end) {\n    endpt_ = end;\n  }\n  bool intersects_other_lines() const {\n    return intersects_other_lines_;\n  }\n  void set_intersects_other_lines(bool value) {\n    intersects_other_lines_ = value;\n  }\n\n  // Inline quasi-accessors that require some computation.\n\n  // Compute the x coordinate at the given y coordinate.\n  int XAtY(int y) const {\n    int height = endpt_.y() - startpt_.y();\n    if (height != 0) {\n      return (y - startpt_.y()) * (endpt_.x() - startpt_.x()) / height + startpt_.x();\n    } else {\n      return startpt_.x();\n    }\n  }\n\n  // Compute the vertical overlap with the other TabVector.\n  int VOverlap(const TabVector &other) const {\n    return std::min(other.endpt_.y(), endpt_.y()) - std::max(other.startpt_.y(), startpt_.y());\n  }\n  // Compute the vertical overlap with the given y bounds.\n  int VOverlap(int top_y, int bottom_y) const {\n    return std::min(top_y, 
static_cast<int>(endpt_.y())) -\n           std::max(bottom_y, static_cast<int>(startpt_.y()));\n  }\n  // Compute the extended vertical overlap with the given y bounds.\n  int ExtendedOverlap(int top_y, int bottom_y) const {\n    return std::min(top_y, extended_ymax_) - std::max(bottom_y, extended_ymin_);\n  }\n\n  // Return true if this is a left tab stop, either aligned, or ragged.\n  bool IsLeftTab() const {\n    return alignment_ == TA_LEFT_ALIGNED || alignment_ == TA_LEFT_RAGGED;\n  }\n  // Return true if this is a right tab stop, either aligned, or ragged.\n  bool IsRightTab() const {\n    return alignment_ == TA_RIGHT_ALIGNED || alignment_ == TA_RIGHT_RAGGED;\n  }\n  // Return true if this is a separator.\n  bool IsSeparator() const {\n    return alignment_ == TA_SEPARATOR;\n  }\n  // Return true if this is a center aligned tab stop.\n  bool IsCenterTab() const {\n    return alignment_ == TA_CENTER_JUSTIFIED;\n  }\n  // Return true if this is a ragged tab top, either left or right.\n  bool IsRagged() const {\n    return alignment_ == TA_LEFT_RAGGED || alignment_ == TA_RIGHT_RAGGED;\n  }\n\n  // Return true if this vector is to the left of the other in terms\n  // of sort_key_.\n  bool IsLeftOf(const TabVector &other) const {\n    return sort_key_ < other.sort_key_;\n  }\n\n  // Return true if the vector has no partners.\n  bool Partnerless() {\n    return partners_.empty();\n  }\n\n  // Return the number of tab boxes in this vector.\n  int BoxCount() {\n    return boxes_.length();\n  }\n\n  // Lock the vector from refits by clearing the boxes_ list.\n  void Freeze() {\n    boxes_.shallow_clear();\n  }\n\n  // Flip x and y on the ends so a vector can be created from flipped input.\n  void XYFlip() {\n    int x = startpt_.y();\n    startpt_.set_y(startpt_.x());\n    startpt_.set_x(x);\n    x = endpt_.y();\n    endpt_.set_y(endpt_.x());\n    endpt_.set_x(x);\n  }\n\n  // Reflect the tab vector in the y-axis.\n  void ReflectInYAxis() {\n    
startpt_.set_x(-startpt_.x());\n    endpt_.set_x(-endpt_.x());\n    sort_key_ = -sort_key_;\n    if (alignment_ == TA_LEFT_ALIGNED) {\n      alignment_ = TA_RIGHT_ALIGNED;\n    } else if (alignment_ == TA_RIGHT_ALIGNED) {\n      alignment_ = TA_LEFT_ALIGNED;\n    }\n    if (alignment_ == TA_LEFT_RAGGED) {\n      alignment_ = TA_RIGHT_RAGGED;\n    } else if (alignment_ == TA_RIGHT_RAGGED) {\n      alignment_ = TA_LEFT_RAGGED;\n    }\n  }\n\n  // Separate function to compute the sort key for a given coordinate pair.\n  static int SortKey(const ICOORD &vertical, int x, int y) {\n    ICOORD pt(x, y);\n    return pt * vertical;\n  }\n\n  // Return the x at the given y for the given sort key.\n  static int XAtY(const ICOORD &vertical, int sort_key, int y) {\n    if (vertical.y() != 0) {\n      return (vertical.x() * y + sort_key) / vertical.y();\n    } else {\n      return sort_key;\n    }\n  }\n\n  // Sort function for E2LIST::sort to sort by sort_key_.\n  static int SortVectorsByKey(const TabVector *tv1, const TabVector *tv2) {\n    return tv1->sort_key_ - tv2->sort_key_;\n  }\n\n  // More complex members.\n\n  // Extend this vector to include the supplied blob if it doesn't\n  // already have it.\n  void ExtendToBox(BLOBNBOX *blob);\n\n  // Set the ycoord of the start and move the xcoord to match.\n  void SetYStart(int start_y);\n  // Set the ycoord of the end and move the xcoord to match.\n  void SetYEnd(int end_y);\n\n  // Rotate the ends by the given vector.\n  void Rotate(const FCOORD &rotation);\n\n  // Setup the initial constraints, being the limits of\n  // the vector and the extended ends.\n  void SetupConstraints();\n\n  // Setup the constraints between the partners of this TabVector.\n  void SetupPartnerConstraints();\n\n  // Setup the constraints between this and its partner.\n  void SetupPartnerConstraints(TabVector *partner);\n\n  // Use the constraints to modify the top and bottom.\n  void ApplyConstraints();\n\n  // Merge close tab vectors of the same 
side that overlap.\n  static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors,\n                                     BlobGrid *grid);\n\n  // Return true if this vector is the same side, overlaps, and close\n  // enough to the other to be merged.\n  bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const;\n\n  // Eat the other TabVector into this and delete it.\n  void MergeWith(const ICOORD &vertical, TabVector *other);\n\n  // Add a new element to the list of partner TabVectors.\n  // Partners must be added in order of increasing y coordinate of the text line\n  // that makes them partners.\n  // Groups of identical partners are merged into one.\n  void AddPartner(TabVector *partner);\n\n  // Return true if other is a partner of this.\n  bool IsAPartner(const TabVector *other);\n\n  // Print basic information about this tab vector.\n  void Print(const char *prefix);\n\n  // Print basic information about this tab vector and every box in it.\n  void Debug(const char *prefix);\n\n  // Draw this tabvector in place in the given window.\n  void Display(ScrollView *tab_win);\n\n  // Refit the line and/or re-evaluate the vector if the dirty flags are set.\n  void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder);\n\n  // Evaluate the vector in terms of coverage of its length by good-looking\n  // box edges. A good looking box is one where its nearest neighbour on the\n  // inside is nearer than half the distance its nearest neighbour on the\n  // outside of the putative column. Bad boxes are removed from the line.\n  // A second pass then further filters boxes by requiring that the gutter\n  // width be a minimum fraction of the mean gutter along the line.\n  void Evaluate(const ICOORD &vertical, TabFind *finder);\n\n  // (Re)Fit a line to the stored points. Returns false if the line\n  // is degenerate. 
Although the TabVector code mostly doesn't care about the\n  // direction of lines, XAtY would give silly results for a horizontal line.\n  // The class is mostly aimed at use for vertical lines representing\n  // horizontal tab stops.\n  bool Fit(ICOORD vertical, bool force_parallel);\n\n  // Return the partner of this TabVector if the vector qualifies as\n  // being a vertical text line, otherwise nullptr.\n  TabVector *VerticalTextlinePartner();\n\n  // Return the matching tabvector if there is exactly one partner, or\n  // nullptr otherwise.  This can be used after matching is done, eg. by\n  // VerticalTextlinePartner(), without checking if the line is vertical.\n  TabVector *GetSinglePartner();\n\nprivate:\n  // Constructor is private as the static factory is the external way\n  // to build a TabVector.\n  TabVector(int extended_ymin, int extended_ymax, TabAlignment alignment, BLOBNBOX_CLIST *boxes);\n\n  // Delete this, but first, repoint all the partners to point to\n  // replacement. If replacement is nullptr, then partner relationships\n  // are removed.\n  void Delete(TabVector *replacement);\n\nprivate:\n  // The bottom of the tab line.\n  ICOORD startpt_;\n  // The top of the tab line.\n  ICOORD endpt_;\n  // The lowest y that the vector might extend to.\n  int extended_ymin_ = 0;\n  // The highest y that the vector might extend to.\n  int extended_ymax_ = 0;\n  // Perpendicular distance of vector from a given vertical for sorting.\n  int sort_key_ = 0;\n  // Result of Evaluate 0-100. Coverage of line with good boxes.\n  int percent_score_ = 0;\n  // The mean width of the blobs. 
Meaningful only for separator lines.\n  int mean_width_ = 0;\n  // True if the boxes_ list has been modified, so a refit is needed.\n  bool needs_refit_ = false;\n  // True if a fit has been done, so re-evaluation is needed.\n  bool needs_evaluation_ = false;\n  // True if a separator line intersects at least 2 other lines.\n  bool intersects_other_lines_ = false;\n  // The type of this TabVector.\n  TabAlignment alignment_ = TA_LEFT_ALIGNED;\n  // The list of boxes whose edges are aligned at this TabVector.\n  BLOBNBOX_CLIST boxes_;\n  // List of TabVectors that have a connection with this via a text line.\n  TabVector_CLIST partners_;\n  // Constraints used to resolve the exact location of the top and bottom\n  // of the tab line.\n  TabConstraint_LIST *top_constraints_ = nullptr;\n  TabConstraint_LIST *bottom_constraints_ = nullptr;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_TABVECTOR_H_\n"
  },
  {
    "path": "src/textord/textlineprojection.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <allheaders.h>\n#include \"bbgrid.h\"  // Base class.\n#include \"blobbox.h\" // BlobNeighbourDir.\n#include \"blobs.h\"\n#include \"colpartition.h\"\n#include \"helpers.h\" // for IntCastRounded\n#include \"normalis.h\"\n#include \"textlineprojection.h\"\n\n#include <algorithm>\n\n// Padding factor to use on definitely oriented blobs\nconst int kOrientedPadFactor = 8;\n// Padding factor to use on not definitely oriented blobs.\nconst int kDefaultPadFactor = 2;\n// Penalty factor for going away from the line center.\nconst int kWrongWayPenalty = 4;\n// Ratio between parallel gap and perpendicular gap used to measure total\n// distance of a box from a target box in curved textline space.\n// parallel-gap is treated more favorably by this factor to allow catching\n// quotes and ellipsis at the end of textlines.\nconst int kParaPerpDistRatio = 4;\n// Multiple of scale_factor_ that the inter-line gap must be before we start\n// padding the increment box perpendicular to the text line.\nconst int kMinLineSpacingFactor = 4;\n// Maximum tab-stop overrun for horizontal padding, in projection pixels.\nconst int kMaxTabStopOverrun = 6;\n\nnamespace tesseract {\n\nTextlineProjection::TextlineProjection(int resolution) : x_origin_(0), y_origin_(0), 
pix_(nullptr) {\n  // The projection map should be about 100 ppi, whatever the input.\n  scale_factor_ = IntCastRounded(resolution / 100.0);\n  if (scale_factor_ < 1) {\n    scale_factor_ = 1;\n  }\n}\nTextlineProjection::~TextlineProjection() {\n  pix_.destroy();\n}\n\n// Build the projection profile given the input_block containing lists of\n// blobs, a rotation to convert to image coords,\n// and a full-resolution nontext_map, marking out areas to avoid.\n// During construction, we have the following assumptions:\n// The rotation is a multiple of 90 degrees, ie no deskew yet.\n// The blobs have had their left and right rules set to also limit\n// the range of projection.\nvoid TextlineProjection::ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation,\n                                             Image nontext_map) {\n  pix_.destroy();\n  TBOX image_box(0, 0, pixGetWidth(nontext_map), pixGetHeight(nontext_map));\n  x_origin_ = 0;\n  y_origin_ = image_box.height();\n  int width = (image_box.width() + scale_factor_ - 1) / scale_factor_;\n  int height = (image_box.height() + scale_factor_ - 1) / scale_factor_;\n\n  pix_ = pixCreate(width, height, 8);\n  ProjectBlobs(&input_block->blobs, rotation, image_box, nontext_map);\n  ProjectBlobs(&input_block->large_blobs, rotation, image_box, nontext_map);\n  Image final_pix = pixBlockconv(pix_, 1, 1);\n  //  Pix* final_pix = pixBlockconv(pix_, 2, 2);\n  pix_.destroy();\n  pix_ = final_pix;\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Display the blobs in the window colored according to textline quality.\nvoid TextlineProjection::PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win) {\n  BLOBNBOX_IT it(blobs);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    const TBOX &box = blob->bounding_box();\n    bool bad_box = BoxOutOfHTextline(box, nullptr, false);\n    if (blob->UniquelyVertical()) {\n      win->Pen(ScrollView::YELLOW);\n    } else {\n      win->Pen(bad_box ? 
ScrollView::RED : ScrollView::BLUE);\n    }\n    win->Rectangle(box.left(), box.bottom(), box.right(), box.top());\n  }\n  win->Update();\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Moves blobs that look like they don't sit well on a textline from the\n// input blobs list to the output small_blobs list.\n// This gets them away from initial textline finding to stop diacritics\n// from forming incorrect textlines. (Introduced mainly to fix Thai.)\nvoid TextlineProjection::MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs,\n                                              BLOBNBOX_LIST *small_blobs) const {\n  BLOBNBOX_IT it(blobs);\n  BLOBNBOX_IT small_it(small_blobs);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    BLOBNBOX *blob = it.data();\n    const TBOX &box = blob->bounding_box();\n    bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());\n    if (BoxOutOfHTextline(box, nullptr, debug) && !blob->UniquelyVertical()) {\n      blob->ClearNeighbours();\n      small_it.add_to_end(it.extract());\n    }\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Create a window and display the projection in it.\nvoid TextlineProjection::DisplayProjection() const {\n  int width = pixGetWidth(pix_);\n  int height = pixGetHeight(pix_);\n  Image pixc = pixCreate(width, height, 32);\n  int src_wpl = pixGetWpl(pix_);\n  int col_wpl = pixGetWpl(pixc);\n  uint32_t *src_data = pixGetData(pix_);\n  uint32_t *col_data = pixGetData(pixc);\n  for (int y = 0; y < height; ++y, src_data += src_wpl, col_data += col_wpl) {\n    for (int x = 0; x < width; ++x) {\n      int pixel = GET_DATA_BYTE(src_data, x);\n      l_uint32 result;\n      if (pixel <= 17) {\n        composeRGBPixel(0, 0, pixel * 15, &result);\n      } else if (pixel <= 145) {\n        composeRGBPixel(0, (pixel - 17) * 2, 255, &result);\n      } else {\n        composeRGBPixel((pixel - 145) * 2, 255, 255, &result);\n      }\n      col_data[x] = result;\n    }\n  }\n  auto *win = new ScrollView(\"Projection\", 0, 
0, width, height, width, height);\n  win->Draw(pixc, 0, 0);\n  win->Update();\n  pixc.destroy();\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Compute the distance of the box from the partition using curved projection\n// space. As DistanceOfBoxFromBox, except that the direction is taken from\n// the ColPartition and the median bounds of the ColPartition are used as\n// the to_box.\nint TextlineProjection::DistanceOfBoxFromPartition(const TBOX &box, const ColPartition &part,\n                                                   const DENORM *denorm, bool debug) const {\n  // Compute a partition box that uses the median top/bottom of the blobs\n  // within and median left/right for vertical.\n  TBOX part_box = part.bounding_box();\n  if (part.IsHorizontalType()) {\n    part_box.set_top(part.median_top());\n    part_box.set_bottom(part.median_bottom());\n  } else {\n    part_box.set_left(part.median_left());\n    part_box.set_right(part.median_right());\n  }\n  // Now use DistanceOfBoxFromBox to make the actual calculation.\n  return DistanceOfBoxFromBox(box, part_box, part.IsHorizontalType(), denorm, debug);\n}\n\n// Compute the distance from the from_box to the to_box using curved\n// projection space. Separation that involves a decrease in projection\n// density (moving from the from_box to the to_box) is weighted more heavily\n// than constant density, and an increase is weighted less.\n// If horizontal_textline is true, then curved space is used vertically,\n// as for a diacritic on the edge of a textline.\n// The projection uses original image coords, so denorm is used to get\n// back to the image coords from box/part space.\n// How the calculation works: Think of a diacritic near a textline.\n// Distance is measured from the far side of the from_box to the near side of\n// the to_box. 
Shown is the horizontal textline case.\n//          |------^-----|\n//          | from | box |\n//          |------|-----|\n//   perpendicular |\n//          <------v-------->|--------------------|\n//                  parallel |     to box         |\n//                           |--------------------|\n// Perpendicular distance uses \"curved space\" See VerticalDistance below.\n// Parallel distance is linear.\n// Result is perpendicular_gap + parallel_gap / kParaPerpDistRatio.\nint TextlineProjection::DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box,\n                                             bool horizontal_textline, const DENORM *denorm,\n                                             bool debug) const {\n  // The parallel_gap is the horizontal gap between a horizontal textline and\n  // the box. Analogous for vertical.\n  int parallel_gap = 0;\n  // start_pt is the box end of the line to be modified for curved space.\n  TPOINT start_pt;\n  // end_pt is the partition end of the line to be modified for curved space.\n  TPOINT end_pt;\n  if (horizontal_textline) {\n    parallel_gap = from_box.x_gap(to_box) + from_box.width();\n    start_pt.x = (from_box.left() + from_box.right()) / 2;\n    end_pt.x = start_pt.x;\n    if (from_box.top() - to_box.top() >= to_box.bottom() - from_box.bottom()) {\n      start_pt.y = from_box.top();\n      end_pt.y = std::min(to_box.top(), start_pt.y);\n    } else {\n      start_pt.y = from_box.bottom();\n      end_pt.y = std::max(to_box.bottom(), start_pt.y);\n    }\n  } else {\n    parallel_gap = from_box.y_gap(to_box) + from_box.height();\n    if (from_box.right() - to_box.right() >= to_box.left() - from_box.left()) {\n      start_pt.x = from_box.right();\n      end_pt.x = std::min(to_box.right(), start_pt.x);\n    } else {\n      start_pt.x = from_box.left();\n      end_pt.x = std::max(to_box.left(), start_pt.x);\n    }\n    start_pt.y = (from_box.bottom() + from_box.top()) / 2;\n    end_pt.y = start_pt.y;\n  }\n  // 
The perpendicular gap is the max vertical distance gap out of:\n  // top of from_box to to_box top and bottom of from_box to to_box bottom.\n  // This value is then modified for curved projection space.\n  // Analogous for vertical.\n  int perpendicular_gap = 0;\n  // If start_pt == end_pt, then the from_box lies entirely within the to_box\n  // (in the perpendicular direction), so we don't need to calculate the\n  // perpendicular_gap.\n  if (start_pt.x != end_pt.x || start_pt.y != end_pt.y) {\n    if (denorm != nullptr) {\n      // Denormalize the start and end.\n      denorm->DenormTransform(nullptr, start_pt, &start_pt);\n      denorm->DenormTransform(nullptr, end_pt, &end_pt);\n    }\n    if (abs(start_pt.y - end_pt.y) >= abs(start_pt.x - end_pt.x)) {\n      perpendicular_gap = VerticalDistance(debug, start_pt.x, start_pt.y, end_pt.y);\n    } else {\n      perpendicular_gap = HorizontalDistance(debug, start_pt.x, end_pt.x, start_pt.y);\n    }\n  }\n  // The parallel_gap weighs less than the perpendicular_gap.\n  return perpendicular_gap + parallel_gap / kParaPerpDistRatio;\n}\n\n// Compute the distance between (x, y1) and (x, y2) using the rule that\n// a decrease in textline density is weighted more heavily than an increase.\n// The coordinates are in source image space, ie processed by any denorm\n// already, but not yet scaled by scale_factor_.\n// Going from the outside of a textline to the inside should measure much\n// less distance than going from the inside of a textline to the outside.\n// How it works:\n// An increase is cheap (getting closer to a textline).\n// Constant costs unity.\n// A decrease is expensive (getting further from a textline).\n// Pixels in projection map Counted distance\n//              2\n//              3              1/x\n//              3               1\n//              2               x\n//              5              1/x\n//              7              1/x\n// Total: 1 + x + 3/x where x = kWrongWayPenalty.\nint 
TextlineProjection::VerticalDistance(bool debug, int x, int y1, int y2) const {\n  x = ImageXToProjectionX(x);\n  y1 = ImageYToProjectionY(y1);\n  y2 = ImageYToProjectionY(y2);\n  if (y1 == y2) {\n    return 0;\n  }\n  int wpl = pixGetWpl(pix_);\n  int step = y1 < y2 ? 1 : -1;\n  uint32_t *data = pixGetData(pix_) + y1 * wpl;\n  wpl *= step;\n  int prev_pixel = GET_DATA_BYTE(data, x);\n  int distance = 0;\n  int right_way_steps = 0;\n  for (int y = y1; y != y2; y += step) {\n    data += wpl;\n    int pixel = GET_DATA_BYTE(data, x);\n    if (debug) {\n      tprintf(\"At (%d,%d), pix = %d, prev=%d\\n\", x, y + step, pixel, prev_pixel);\n    }\n    if (pixel < prev_pixel) {\n      distance += kWrongWayPenalty;\n    } else if (pixel > prev_pixel) {\n      ++right_way_steps;\n    } else {\n      ++distance;\n    }\n    prev_pixel = pixel;\n  }\n  return distance * scale_factor_ + right_way_steps * scale_factor_ / kWrongWayPenalty;\n}\n\n// Compute the distance between (x1, y) and (x2, y) using the rule that\n// a decrease in textline density is weighted more heavily than an increase.\nint TextlineProjection::HorizontalDistance(bool debug, int x1, int x2, int y) const {\n  x1 = ImageXToProjectionX(x1);\n  x2 = ImageXToProjectionX(x2);\n  y = ImageYToProjectionY(y);\n  if (x1 == x2) {\n    return 0;\n  }\n  int wpl = pixGetWpl(pix_);\n  int step = x1 < x2 ? 
1 : -1;\n  uint32_t *data = pixGetData(pix_) + y * wpl;\n  int prev_pixel = GET_DATA_BYTE(data, x1);\n  int distance = 0;\n  int right_way_steps = 0;\n  for (int x = x1; x != x2; x += step) {\n    int pixel = GET_DATA_BYTE(data, x + step);\n    if (debug) {\n      tprintf(\"At (%d,%d), pix = %d, prev=%d\\n\", x + step, y, pixel, prev_pixel);\n    }\n    if (pixel < prev_pixel) {\n      distance += kWrongWayPenalty;\n    } else if (pixel > prev_pixel) {\n      ++right_way_steps;\n    } else {\n      ++distance;\n    }\n    prev_pixel = pixel;\n  }\n  return distance * scale_factor_ + right_way_steps * scale_factor_ / kWrongWayPenalty;\n}\n\n// Returns true if the blob appears to be outside of a textline.\n// Such blobs are potentially diacritics (even if large in Thai) and should\n// be kept away from initial textline finding.\nbool TextlineProjection::BoxOutOfHTextline(const TBOX &box, const DENORM *denorm,\n                                           bool debug) const {\n  int grad1 = 0;\n  int grad2 = 0;\n  EvaluateBoxInternal(box, denorm, debug, &grad1, &grad2, nullptr, nullptr);\n  int worst_result = std::min(grad1, grad2);\n  int total_result = grad1 + grad2;\n  if (total_result >= 6) {\n    return false; // Strongly in textline.\n  }\n  // Medium strength: if either gradient is negative, it is likely outside\n  // the body of the textline.\n  if (worst_result < 0) {\n    return true;\n  }\n  return false;\n}\n\n// Evaluates the textlineiness of a ColPartition. 
Uses EvaluateBox below,\n// but uses the median top/bottom for horizontal and median left/right for\n// vertical instead of the bounding box edges.\n// Evaluates for both horizontal and vertical and returns the best result,\n// with a positive value for horizontal and a negative value for vertical.\nint TextlineProjection::EvaluateColPartition(const ColPartition &part, const DENORM *denorm,\n                                             bool debug) const {\n  if (part.IsSingleton()) {\n    return EvaluateBox(part.bounding_box(), denorm, debug);\n  }\n  // Test vertical orientation.\n  TBOX box = part.bounding_box();\n  // Use the partition median for left/right.\n  box.set_left(part.median_left());\n  box.set_right(part.median_right());\n  int vresult = EvaluateBox(box, denorm, debug);\n\n  // Test horizontal orientation.\n  box = part.bounding_box();\n  // Use the partition median for top/bottom.\n  box.set_top(part.median_top());\n  box.set_bottom(part.median_bottom());\n  int hresult = EvaluateBox(box, denorm, debug);\n  if (debug) {\n    tprintf(\"Partition hresult=%d, vresult=%d from:\", hresult, vresult);\n    part.bounding_box().print();\n    part.Print();\n  }\n  return hresult >= -vresult ? hresult : vresult;\n}\n\n// Computes the mean projection gradients over the horizontal and vertical\n// edges of the box:\n//   -h-h-h-h-h-h\n//  |------------| mean=htop   -v|+v--------+v|-v\n//  |+h+h+h+h+h+h|             -v|+v        +v|-v\n//  |            |             -v|+v        +v|-v\n//  |    box     |             -v|+v  box   +v|-v\n//  |            |             -v|+v        +v|-v\n//  |+h+h+h+h+h+h|             -v|+v        +v|-v\n//  |------------| mean=hbot   -v|+v--------+v|-v\n//   -h-h-h-h-h-h\n//                           mean=vleft  mean=vright\n//\n// Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number\n// for a horizontal textline, a negative number for a vertical textline,\n// and near zero for undecided. 
Undecided is most likely non-text.\n// All the gradients are truncated to remain non-negative, since negative\n// horizontal gradients don't give any indication of being vertical and\n// vice versa.\n// Additional complexity: The coordinates have to be transformed to original\n// image coordinates with denorm (if not null), scaled to match the projection\n// pix, and THEN step out 2 pixels each way from the edge to compute the\n// gradient, and tries 3 positions, each measuring the gradient over a\n// 4-pixel spread: (+3/-1), (+2/-2), (+1/-3).  This complexity is handled by\n// several layers of helpers below.\nint TextlineProjection::EvaluateBox(const TBOX &box, const DENORM *denorm, bool debug) const {\n  return EvaluateBoxInternal(box, denorm, debug, nullptr, nullptr, nullptr, nullptr);\n}\n\n// Internal version of EvaluateBox returns the unclipped gradients as well\n// as the result of EvaluateBox.\n// hgrad1 and hgrad2 are the gradients for the horizontal textline.\nint TextlineProjection::EvaluateBoxInternal(const TBOX &box, const DENORM *denorm, bool debug,\n                                            int *hgrad1, int *hgrad2, int *vgrad1,\n                                            int *vgrad2) const {\n  int top_gradient = BestMeanGradientInRow(denorm, box.left(), box.right(), box.top(), true);\n  int bottom_gradient =\n      -BestMeanGradientInRow(denorm, box.left(), box.right(), box.bottom(), false);\n  int left_gradient = BestMeanGradientInColumn(denorm, box.left(), box.bottom(), box.top(), true);\n  int right_gradient =\n      -BestMeanGradientInColumn(denorm, box.right(), box.bottom(), box.top(), false);\n  int top_clipped = std::max(top_gradient, 0);\n  int bottom_clipped = std::max(bottom_gradient, 0);\n  int left_clipped = std::max(left_gradient, 0);\n  int right_clipped = std::max(right_gradient, 0);\n  if (debug) {\n    tprintf(\"Gradients: top = %d, bottom = %d, left= %d, right= %d for box:\", top_gradient,\n            bottom_gradient, 
left_gradient, right_gradient);\n    box.print();\n  }\n  int result = std::max(top_clipped, bottom_clipped) - std::max(left_clipped, right_clipped);\n  if (hgrad1 != nullptr && hgrad2 != nullptr) {\n    *hgrad1 = top_gradient;\n    *hgrad2 = bottom_gradient;\n  }\n  if (vgrad1 != nullptr && vgrad2 != nullptr) {\n    *vgrad1 = left_gradient;\n    *vgrad2 = right_gradient;\n  }\n  return result;\n}\n\n// Helper returns the mean gradient value for the horizontal row at the given\n// y, (in the external coordinates) by subtracting the mean of the transformed\n// row 2 pixels above from the mean of the transformed row 2 pixels below.\n// This gives a positive value for a good top edge and negative for bottom.\n// Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.\nint TextlineProjection::BestMeanGradientInRow(const DENORM *denorm, int16_t min_x, int16_t max_x,\n                                              int16_t y, bool best_is_max) const {\n  TPOINT start_pt(min_x, y);\n  TPOINT end_pt(max_x, y);\n  int upper = MeanPixelsInLineSegment(denorm, -2, start_pt, end_pt);\n  int lower = MeanPixelsInLineSegment(denorm, 2, start_pt, end_pt);\n  int best_gradient = lower - upper;\n  upper = MeanPixelsInLineSegment(denorm, -1, start_pt, end_pt);\n  lower = MeanPixelsInLineSegment(denorm, 3, start_pt, end_pt);\n  int gradient = lower - upper;\n  if ((gradient > best_gradient) == best_is_max) {\n    best_gradient = gradient;\n  }\n  upper = MeanPixelsInLineSegment(denorm, -3, start_pt, end_pt);\n  lower = MeanPixelsInLineSegment(denorm, 1, start_pt, end_pt);\n  gradient = lower - upper;\n  if ((gradient > best_gradient) == best_is_max) {\n    best_gradient = gradient;\n  }\n  return best_gradient;\n}\n\n// Helper returns the mean gradient value for the vertical column at the\n// given x, (in the external coordinates) by subtracting the mean of the\n// transformed column 2 pixels left from the mean of the transformed column\n// 2 pixels to the right.\n// 
This gives a positive value for a good left edge and negative for right.\n// Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.\nint TextlineProjection::BestMeanGradientInColumn(const DENORM *denorm, int16_t x, int16_t min_y,\n                                                 int16_t max_y, bool best_is_max) const {\n  TPOINT start_pt(x, min_y);\n  TPOINT end_pt(x, max_y);\n  int left = MeanPixelsInLineSegment(denorm, -2, start_pt, end_pt);\n  int right = MeanPixelsInLineSegment(denorm, 2, start_pt, end_pt);\n  int best_gradient = right - left;\n  left = MeanPixelsInLineSegment(denorm, -1, start_pt, end_pt);\n  right = MeanPixelsInLineSegment(denorm, 3, start_pt, end_pt);\n  int gradient = right - left;\n  if ((gradient > best_gradient) == best_is_max) {\n    best_gradient = gradient;\n  }\n  left = MeanPixelsInLineSegment(denorm, -3, start_pt, end_pt);\n  right = MeanPixelsInLineSegment(denorm, 1, start_pt, end_pt);\n  gradient = right - left;\n  if ((gradient > best_gradient) == best_is_max) {\n    best_gradient = gradient;\n  }\n  return best_gradient;\n}\n\n// Helper returns the mean pixel value over the line between the start_pt and\n// end_pt (inclusive), but shifted perpendicular to the line in the projection\n// image by offset pixels. For simplicity, it is assumed that the vector is\n// either nearly horizontal or nearly vertical. It works on skewed textlines!\n// The end points are in external coordinates, and will be denormalized with\n// the denorm if not nullptr before further conversion to pix coordinates.\n// After all the conversions, the offset is added to the direction\n// perpendicular to the line direction. 
The offset is thus in projection image\n// coordinates, which allows the caller to get a guaranteed displacement\n// between pixels used to calculate gradients.\nint TextlineProjection::MeanPixelsInLineSegment(const DENORM *denorm, int offset, TPOINT start_pt,\n                                                TPOINT end_pt) const {\n  TransformToPixCoords(denorm, &start_pt);\n  TransformToPixCoords(denorm, &end_pt);\n  TruncateToImageBounds(&start_pt);\n  TruncateToImageBounds(&end_pt);\n  int wpl = pixGetWpl(pix_);\n  uint32_t *data = pixGetData(pix_);\n  int total = 0;\n  int count = 0;\n  int x_delta = end_pt.x - start_pt.x;\n  int y_delta = end_pt.y - start_pt.y;\n  if (abs(x_delta) >= abs(y_delta)) {\n    if (x_delta == 0) {\n      return 0;\n    }\n    // Horizontal line. Add the offset vertically.\n    int x_step = x_delta > 0 ? 1 : -1;\n    // Correct offset for rotation, keeping it anti-clockwise of the delta.\n    offset *= x_step;\n    start_pt.y += offset;\n    end_pt.y += offset;\n    TruncateToImageBounds(&start_pt);\n    TruncateToImageBounds(&end_pt);\n    x_delta = end_pt.x - start_pt.x;\n    y_delta = end_pt.y - start_pt.y;\n    count = x_delta * x_step + 1;\n    for (int x = start_pt.x; x != end_pt.x; x += x_step) {\n      int y = start_pt.y + DivRounded(y_delta * (x - start_pt.x), x_delta);\n      total += GET_DATA_BYTE(data + wpl * y, x);\n    }\n  } else {\n    // Vertical line. Add the offset horizontally.\n    int y_step = y_delta > 0 ? 
1 : -1;\n    // Correct offset for rotation, keeping it anti-clockwise of the delta.\n    // Pix holds the image with y=0 at the top, so the offset is negated.\n    offset *= -y_step;\n    start_pt.x += offset;\n    end_pt.x += offset;\n    TruncateToImageBounds(&start_pt);\n    TruncateToImageBounds(&end_pt);\n    x_delta = end_pt.x - start_pt.x;\n    y_delta = end_pt.y - start_pt.y;\n    count = y_delta * y_step + 1;\n    for (int y = start_pt.y; y != end_pt.y; y += y_step) {\n      int x = start_pt.x + DivRounded(x_delta * (y - start_pt.y), y_delta);\n      total += GET_DATA_BYTE(data + wpl * y, x);\n    }\n  }\n  return DivRounded(total, count);\n}\n\n// Given an input pix, and a box, the sides of the box are shrunk inwards until\n// they bound any black pixels found within the original box.\n// The function converts between tesseract coords and the pix coords assuming\n// that this pix is full resolution equal in size to the original image.\n// Returns an empty box if there are no black pixels in the source box.\nstatic TBOX BoundsWithinBox(Image pix, const TBOX &box) {\n  int im_height = pixGetHeight(pix);\n  Box *input_box = boxCreate(box.left(), im_height - box.top(), box.width(), box.height());\n  Box *output_box = nullptr;\n  pixClipBoxToForeground(pix, input_box, nullptr, &output_box);\n  TBOX result_box;\n  if (output_box != nullptr) {\n    l_int32 x, y, width, height;\n    boxGetGeometry(output_box, &x, &y, &width, &height);\n    result_box.set_left(x);\n    result_box.set_right(x + width);\n    result_box.set_top(im_height - y);\n    result_box.set_bottom(result_box.top() - height);\n    boxDestroy(&output_box);\n  }\n  boxDestroy(&input_box);\n  return result_box;\n}\n\n// Splits the given box in half at x_middle or y_middle according to split_on_x\n// and checks for nontext_map pixels in each half. Reduces the bbox so that it\n// still includes the middle point, but does not touch any fg pixels in\n// nontext_map. 
An empty box may be returned if there is no such box.\nstatic void TruncateBoxToMissNonText(int x_middle, int y_middle, bool split_on_x, Image nontext_map,\n                                     TBOX *bbox) {\n  TBOX box1(*bbox);\n  TBOX box2(*bbox);\n  TBOX im_box;\n  if (split_on_x) {\n    box1.set_right(x_middle);\n    im_box = BoundsWithinBox(nontext_map, box1);\n    if (!im_box.null_box()) {\n      box1.set_left(im_box.right());\n    }\n    box2.set_left(x_middle);\n    im_box = BoundsWithinBox(nontext_map, box2);\n    if (!im_box.null_box()) {\n      box2.set_right(im_box.left());\n    }\n  } else {\n    box1.set_bottom(y_middle);\n    im_box = BoundsWithinBox(nontext_map, box1);\n    if (!im_box.null_box()) {\n      box1.set_top(im_box.bottom());\n    }\n    box2.set_top(y_middle);\n    im_box = BoundsWithinBox(nontext_map, box2);\n    if (!im_box.null_box()) {\n      box2.set_bottom(im_box.top());\n    }\n  }\n  box1 += box2;\n  *bbox = box1;\n}\n\n// Helper function to add 1 to a rectangle in source image coords to the\n// internal projection pix_.\nvoid TextlineProjection::IncrementRectangle8Bit(const TBOX &box) {\n  int scaled_left = ImageXToProjectionX(box.left());\n  int scaled_top = ImageYToProjectionY(box.top());\n  int scaled_right = ImageXToProjectionX(box.right());\n  int scaled_bottom = ImageYToProjectionY(box.bottom());\n  int wpl = pixGetWpl(pix_);\n  uint32_t *data = pixGetData(pix_) + scaled_top * wpl;\n  for (int y = scaled_top; y <= scaled_bottom; ++y) {\n    for (int x = scaled_left; x <= scaled_right; ++x) {\n      int pixel = GET_DATA_BYTE(data, x);\n      if (pixel < 255) {\n        SET_DATA_BYTE(data, x, pixel + 1);\n      }\n    }\n    data += wpl;\n  }\n}\n\n// Inserts a list of blobs into the projection.\n// Rotation is a multiple of 90 degrees to get from blob coords to\n// nontext_map coords, nontext_map_box is the bounds of the nontext_map.\n// Blobs are spread horizontally or vertically according to their internal\n// flags, but 
the spreading is truncated by set pixels in the nontext_map\n// and also by the horizontal rule line limits on the blobs.\nvoid TextlineProjection::ProjectBlobs(BLOBNBOX_LIST *blobs, const FCOORD &rotation,\n                                      const TBOX &nontext_map_box, Image nontext_map) {\n  BLOBNBOX_IT blob_it(blobs);\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    BLOBNBOX *blob = blob_it.data();\n    TBOX bbox = blob->bounding_box();\n    ICOORD middle((bbox.left() + bbox.right()) / 2, (bbox.bottom() + bbox.top()) / 2);\n    bool spreading_horizontally = PadBlobBox(blob, &bbox);\n    // Rotate to match the nontext_map.\n    bbox.rotate(rotation);\n    middle.rotate(rotation);\n    if (rotation.x() == 0.0f) {\n      spreading_horizontally = !spreading_horizontally;\n    }\n    // Clip to the image before applying the increments.\n    bbox &= nontext_map_box; // This is in-place box intersection.\n    // Check for image pixels before spreading.\n    TruncateBoxToMissNonText(middle.x(), middle.y(), spreading_horizontally, nontext_map, &bbox);\n    if (bbox.area() > 0) {\n      IncrementRectangle8Bit(bbox);\n    }\n  }\n}\n\n// Pads the bounding box of the given blob according to whether it is on\n// a horizontal or vertical text line, taking into account tab-stops near\n// the blob. 
Returns true if padding was in the horizontal direction.\nbool TextlineProjection::PadBlobBox(BLOBNBOX *blob, TBOX *bbox) {\n  // Determine which direction to spread.\n  // If text is well spaced out, it can be useful to pad perpendicular to\n  // the textline direction, so as to ensure diacritics get absorbed\n  // correctly, but if the text is tightly spaced, this will destroy the\n  // blank space between textlines in the projection map, and that would\n  // be very bad.\n  int pad_limit = scale_factor_ * kMinLineSpacingFactor;\n  int xpad = 0;\n  int ypad = 0;\n  bool padding_horizontally = false;\n  if (blob->UniquelyHorizontal()) {\n    xpad = bbox->height() * kOrientedPadFactor;\n    padding_horizontally = true;\n    // If the text appears to be very well spaced, pad the other direction by a\n    // single pixel in the projection profile space to help join diacritics to\n    // the textline.\n    if ((blob->neighbour(BND_ABOVE) == nullptr ||\n         bbox->y_gap(blob->neighbour(BND_ABOVE)->bounding_box()) > pad_limit) &&\n        (blob->neighbour(BND_BELOW) == nullptr ||\n         bbox->y_gap(blob->neighbour(BND_BELOW)->bounding_box()) > pad_limit)) {\n      ypad = scale_factor_;\n    }\n  } else if (blob->UniquelyVertical()) {\n    ypad = bbox->width() * kOrientedPadFactor;\n    if ((blob->neighbour(BND_LEFT) == nullptr ||\n         bbox->x_gap(blob->neighbour(BND_LEFT)->bounding_box()) > pad_limit) &&\n        (blob->neighbour(BND_RIGHT) == nullptr ||\n         bbox->x_gap(blob->neighbour(BND_RIGHT)->bounding_box()) > pad_limit)) {\n      xpad = scale_factor_;\n    }\n  } else {\n    if ((blob->neighbour(BND_ABOVE) != nullptr &&\n         blob->neighbour(BND_ABOVE)->neighbour(BND_BELOW) == blob) ||\n        (blob->neighbour(BND_BELOW) != nullptr &&\n         blob->neighbour(BND_BELOW)->neighbour(BND_ABOVE) == blob)) {\n      ypad = bbox->width() * kDefaultPadFactor;\n    }\n    if ((blob->neighbour(BND_RIGHT) != nullptr &&\n         
blob->neighbour(BND_RIGHT)->neighbour(BND_LEFT) == blob) ||\n        (blob->neighbour(BND_LEFT) != nullptr &&\n         blob->neighbour(BND_LEFT)->neighbour(BND_RIGHT) == blob)) {\n      xpad = bbox->height() * kDefaultPadFactor;\n      padding_horizontally = true;\n    }\n  }\n  bbox->pad(xpad, ypad);\n  pad_limit = scale_factor_ * kMaxTabStopOverrun;\n  // Now shrink horizontally to avoid stepping more than pad_limit over a\n  // tab-stop.\n  if (bbox->left() < blob->left_rule() - pad_limit) {\n    bbox->set_left(blob->left_rule() - pad_limit);\n  }\n  if (bbox->right() > blob->right_rule() + pad_limit) {\n    bbox->set_right(blob->right_rule() + pad_limit);\n  }\n  return padding_horizontally;\n}\n\n// Helper denormalizes the TPOINT with the denorm if not nullptr, then\n// converts to pix_ coordinates.\nvoid TextlineProjection::TransformToPixCoords(const DENORM *denorm, TPOINT *pt) const {\n  if (denorm != nullptr) {\n    // Denormalize the point.\n    denorm->DenormTransform(nullptr, *pt, pt);\n  }\n  pt->x = ImageXToProjectionX(pt->x);\n  pt->y = ImageYToProjectionY(pt->y);\n}\n\n#if defined(_MSC_VER) && !defined(__clang__)\n#  pragma optimize(\"g\", off)\n#endif // _MSC_VER\n// Helper truncates the TPOINT to be within the pix_.\nvoid TextlineProjection::TruncateToImageBounds(TPOINT *pt) const {\n  pt->x = ClipToRange<int>(pt->x, 0, pixGetWidth(pix_) - 1);\n  pt->y = ClipToRange<int>(pt->y, 0, pixGetHeight(pix_) - 1);\n}\n#if defined(_MSC_VER) && !defined(__clang__)\n#  pragma optimize(\"\", on)\n#endif // _MSC_VER\n\n// Transform tesseract image coordinates to coordinates used in the projection.\nint TextlineProjection::ImageXToProjectionX(int x) const {\n  x = ClipToRange((x - x_origin_) / scale_factor_, 0, pixGetWidth(pix_) - 1);\n  return x;\n}\nint TextlineProjection::ImageYToProjectionY(int y) const {\n  y = ClipToRange((y_origin_ - y) / scale_factor_, 0, pixGetHeight(pix_) - 1);\n  return y;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/textlineprojection.h",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_\n#define TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_\n\n#include \"blobgrid.h\" // For BlobGrid\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass DENORM;\nstruct TPOINT;\nclass ColPartition;\n\n// Simple class to encapsulate the computation of an image representing\n// local textline density, and function(s) to make use of it.\n// The underlying principle is that if you smear connected components\n// horizontally (vertically for components on a vertically written textline)\n// and count the number of smeared components in an image, then the resulting\n// image shows the density of the textlines at each image position.\nclass TESS_API TextlineProjection {\npublic:\n  // The down-scaling factor is computed to obtain a projection resolution\n  // of about 100 dpi, whatever the input.\n  explicit TextlineProjection(int resolution);\n  ~TextlineProjection();\n\n  // Build the projection profile given the input_block containing lists of\n  // blobs, a rotation to convert to image coords,\n  // and a full-resolution nontext_map, marking out areas to avoid.\n  // During construction, we have the following assumptions:\n  // The rotation is a multiple of 90 degrees, ie no deskew yet.\n  // The blobs have had their left and right rules set to also limit\n  // the range of 
projection.\n  void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Image nontext_map);\n\n  // Display the blobs in the window colored according to textline quality.\n  void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win);\n\n  // Moves blobs that look like they don't sit well on a textline from the\n  // input blobs list to the output small_blobs list.\n  // This gets them away from initial textline finding to stop diacritics\n  // from forming incorrect textlines. (Introduced mainly to fix Thai.)\n  void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const;\n\n  // Create a window and display the projection in it.\n  void DisplayProjection() const;\n\n  // Compute the distance of the box from the partition using curved projection\n  // space. As DistanceOfBoxFromBox, except that the direction is taken from\n  // the ColPartition and the median bounds of the ColPartition are used as\n  // the to_box.\n  int DistanceOfBoxFromPartition(const TBOX &box, const ColPartition &part, const DENORM *denorm,\n                                 bool debug) const;\n\n  // Compute the distance from the from_box to the to_box using curved\n  // projection space. 
Separation that involves a decrease in projection\n  // density (moving from the from_box to the to_box) is weighted more heavily\n  // than constant density, and an increase is weighted less.\n  // If horizontal_textline is true, then curved space is used vertically,\n  // as for a diacritic on the edge of a textline.\n  // The projection uses original image coords, so denorm is used to get\n  // back to the image coords from box/part space.\n  int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline,\n                           const DENORM *denorm, bool debug) const;\n\n  // Compute the distance between (x, y1) and (x, y2) using the rule that\n  // a decrease in textline density is weighted more heavily than an increase.\n  // The coordinates are in source image space, ie processed by any denorm\n  // already, but not yet scaled by scale_factor_.\n  // Going from the outside of a textline to the inside should measure much\n  // less distance than going from the inside of a textline to the outside.\n  int VerticalDistance(bool debug, int x, int y1, int y2) const;\n\n  // Compute the distance between (x1, y) and (x2, y) using the rule that\n  // a decrease in textline density is weighted more heavily than an increase.\n  int HorizontalDistance(bool debug, int x1, int x2, int y) const;\n\n  // Returns true if the blob appears to be outside of a horizontal textline.\n  // Such blobs are potentially diacritics (even if large in Thai) and should\n  // be kept away from initial textline finding.\n  bool BoxOutOfHTextline(const TBOX &box, const DENORM *denorm, bool debug) const;\n\n  // Evaluates the textlineiness of a ColPartition. 
Uses EvaluateBox below,\n  // but uses the median top/bottom for horizontal and median left/right for\n  // vertical instead of the bounding box edges.\n  // Evaluates for both horizontal and vertical and returns the best result,\n  // with a positive value for horizontal and a negative value for vertical.\n  int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const;\n\n  // Computes the mean projection gradients over the horizontal and vertical\n  // edges of the box:\n  //   -h-h-h-h-h-h\n  //  |------------| mean=htop   -v|+v--------+v|-v\n  //  |+h+h+h+h+h+h|             -v|+v        +v|-v\n  //  |            |             -v|+v        +v|-v\n  //  |    box     |             -v|+v  box   +v|-v\n  //  |            |             -v|+v        +v|-v\n  //  |+h+h+h+h+h+h|             -v|+v        +v|-v\n  //  |------------| mean=hbot   -v|+v--------+v|-v\n  //   -h-h-h-h-h-h\n  //                           mean=vleft  mean=vright\n  //\n  // Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number\n  // for a horizontal textline, a negative number for a vertical textline,\n  // and near zero for undecided. 
Undecided is most likely non-text.\n  int EvaluateBox(const TBOX &box, const DENORM *denorm, bool debug) const;\n\nprivate:\n  // Internal version of EvaluateBox returns the unclipped gradients as well\n  // as the result of EvaluateBox.\n  // hgrad1 and hgrad2 are the gradients for the horizontal textline.\n  int EvaluateBoxInternal(const TBOX &box, const DENORM *denorm, bool debug, int *hgrad1,\n                          int *hgrad2, int *vgrad1, int *vgrad2) const;\n\n  // Helper returns the mean gradient value for the horizontal row at the given\n  // y, (in the external coordinates) by subtracting the mean of the transformed\n  // row 2 pixels above from the mean of the transformed row 2 pixels below.\n  // This gives a positive value for a good top edge and negative for bottom.\n  // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.\n  int BestMeanGradientInRow(const DENORM *denorm, int16_t min_x, int16_t max_x, int16_t y,\n                            bool best_is_max) const;\n\n  // Helper returns the mean gradient value for the vertical column at the\n  // given x, (in the external coordinates) by subtracting the mean of the\n  // transformed column 2 pixels left from the mean of the transformed column\n  // 2 pixels to the right.\n  // This gives a positive value for a good left edge and negative for right.\n  // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.\n  int BestMeanGradientInColumn(const DENORM *denorm, int16_t x, int16_t min_y, int16_t max_y,\n                               bool best_is_max) const;\n\n  // Helper returns the mean pixel value over the line between the start_pt and\n  // end_pt (inclusive), but shifted perpendicular to the line in the projection\n  // image by offset pixels. For simplicity, it is assumed that the vector is\n  // either nearly horizontal or nearly vertical. 
It works on skewed textlines!\n  // The end points are in external coordinates, and will be denormalized with\n  // the denorm if not nullptr before further conversion to pix coordinates.\n  // After all the conversions, the offset is added to the direction\n  // perpendicular to the line direction. The offset is thus in projection image\n  // coordinates, which allows the caller to get a guaranteed displacement\n  // between pixels used to calculate gradients.\n  int MeanPixelsInLineSegment(const DENORM *denorm, int offset, TPOINT start_pt,\n                              TPOINT end_pt) const;\n\n  // Helper function to add 1 to a rectangle in source image coords to the\n  // internal projection pix_.\n  void IncrementRectangle8Bit(const TBOX &box);\n  // Inserts a list of blobs into the projection.\n  // Rotation is a multiple of 90 degrees to get from blob coords to\n  // nontext_map coords, image_box is the bounds of the nontext_map.\n  // Blobs are spread horizontally or vertically according to their internal\n  // flags, but the spreading is truncated by set pixels in the nontext_map\n  // and also by the horizontal rule line limits on the blobs.\n  void ProjectBlobs(BLOBNBOX_LIST *blobs, const FCOORD &rotation, const TBOX &image_box,\n                    Image nontext_map);\n  // Pads the bounding box of the given blob according to whether it is on\n  // a horizontal or vertical text line, taking into account tab-stops near\n  // the blob. 
Returns true if padding was in the horizontal direction.\n  bool PadBlobBox(BLOBNBOX *blob, TBOX *bbox);\n\n  // Helper denormalizes the TPOINT with the denorm if not nullptr, then\n  // converts to pix_ coordinates.\n  void TransformToPixCoords(const DENORM *denorm, TPOINT *pt) const;\n\n  // Helper truncates the TPOINT to be within the pix_.\n  void TruncateToImageBounds(TPOINT *pt) const;\n\n  // Transform tesseract coordinates to coordinates used in the pix.\n  int ImageXToProjectionX(int x) const;\n  int ImageYToProjectionY(int y) const;\n\n  // The down-sampling scale factor used in building the image.\n  int scale_factor_;\n  // The blob coordinates of the top-left (origin of the pix_) in tesseract\n  // coordinates. Used to transform the bottom-up tesseract coordinates to\n  // the top-down coordinates of the pix.\n  int x_origin_;\n  int y_origin_;\n  // The image of horizontally smeared blob boxes summed to provide a\n  // textline density map. As with a horizontal projection, the map has\n  // dips in the gaps between textlines.\n  Image pix_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_\n"
  },
  {
    "path": "src/textord/textord.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        textord.cpp\n// Description: The top-level text line and word finding functionality.\n// Author:      Ray Smith\n// Created:     Fri Mar 13 14:43:01 PDT 2009\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"baselinedetect.h\"\n#include \"drawtord.h\"\n#include \"makerow.h\"\n#include \"pageres.h\"\n#include \"textord.h\"\n#include \"tordmain.h\"\n#include \"wordseg.h\"\n\nnamespace tesseract {\n\nTextord::Textord(CCStruct *ccstruct)\n    : ccstruct_(ccstruct)\n    , use_cjk_fp_model_(false)\n    ,\n    // makerow.cpp ///////////////////////////////////////////\n    BOOL_MEMBER(textord_single_height_mode, false, \"Script has no xheight, so use a single mode\",\n                ccstruct_->params())\n    ,\n    // tospace.cpp ///////////////////////////////////////////\n    BOOL_MEMBER(tosp_old_to_method, false, \"Space stats use prechopping?\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_old_to_constrain_sp_kn, false,\n                  \"Constrain relative values of inter and intra-word gaps for \"\n                  \"old_to_method.\",\n                  ccstruct_->params())\n    , 
BOOL_MEMBER(tosp_only_use_prop_rows, true, \"Block stats to use fixed pitch rows?\",\n                  ccstruct_->params())\n    , BOOL_MEMBER(tosp_force_wordbreak_on_punct, false,\n                  \"Force word breaks on punct to break long lines in non-space \"\n                  \"delimited langs\",\n                  ccstruct_->params())\n    , BOOL_MEMBER(tosp_use_pre_chopping, false, \"Space stats use prechopping?\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_old_to_bug_fix, false, \"Fix suspected bug in old code\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_block_use_cert_spaces, true, \"Only stat OBVIOUS spaces\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_row_use_cert_spaces, true, \"Only stat OBVIOUS spaces\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_narrow_blobs_not_cert, true, \"Only stat OBVIOUS spaces\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_row_use_cert_spaces1, true, \"Only stat OBVIOUS spaces\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_recovery_isolated_row_stats, true,\n                  \"Use row alone when inadequate cert spaces\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_only_small_gaps_for_kern, false, \"Better guess\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_all_flips_fuzzy, false, \"Pass ANY flip to context?\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_fuzzy_limit_all, true, \"Don't restrict kn->sp fuzzy limit to tables\",\n                  ccstruct_->params())\n    , BOOL_MEMBER(tosp_stats_use_xht_gaps, true, \"Use within xht gap for wd breaks\",\n                  ccstruct_->params())\n    , BOOL_MEMBER(tosp_use_xht_gaps, true, \"Use within xht gap for wd breaks\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_only_use_xht_gaps, false, \"Only use within xht gap for wd breaks\",\n                  ccstruct_->params())\n    , BOOL_MEMBER(tosp_rule_9_test_punct, false, \"Don't chng kn to space next to punct\",\n                  ccstruct_->params())\n    , BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, \"Default 
flip\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, \"Default flip\", ccstruct_->params())\n    , BOOL_MEMBER(tosp_improve_thresh, false, \"Enable improvement heuristic\", ccstruct_->params())\n    , INT_MEMBER(tosp_debug_level, 0, \"Debug data\", ccstruct_->params())\n    , INT_MEMBER(tosp_enough_space_samples_for_median, 3, \"or should we use mean\",\n                 ccstruct_->params())\n    , INT_MEMBER(tosp_redo_kern_limit, 10, \"No.samples reqd to reestimate for row\",\n                 ccstruct_->params())\n    , INT_MEMBER(tosp_few_samples, 40, \"No.gaps reqd with 1 large gap to treat as a table\",\n                 ccstruct_->params())\n    , INT_MEMBER(tosp_short_row, 20, \"No.gaps reqd with few cert spaces to use certs\",\n                 ccstruct_->params())\n    , INT_MEMBER(tosp_sanity_method, 1, \"How to avoid being silly\", ccstruct_->params())\n    , double_MEMBER(tosp_old_sp_kn_th_factor, 2.0,\n                    \"Factor for defining space threshold in terms of space and \"\n                    \"kern sizes\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_threshold_bias1, 0, \"how far between kern and space?\", ccstruct_->params())\n    , double_MEMBER(tosp_threshold_bias2, 0, \"how far between kern and space?\", ccstruct_->params())\n    , double_MEMBER(tosp_narrow_fraction, 0.3, \"Fract of xheight for narrow\", ccstruct_->params())\n    , double_MEMBER(tosp_narrow_aspect_ratio, 0.48, \"narrow if w/h less than this\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_wide_fraction, 0.52, \"Fract of xheight for wide\", ccstruct_->params())\n    , double_MEMBER(tosp_wide_aspect_ratio, 0.0, \"wide if w/h less than this\", ccstruct_->params())\n    , double_MEMBER(tosp_fuzzy_space_factor, 0.6, \"Fract of xheight for fuzz sp\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_fuzzy_space_factor1, 0.5, \"Fract of xheight for fuzz sp\",\n                    
ccstruct_->params())\n    , double_MEMBER(tosp_fuzzy_space_factor2, 0.72, \"Fract of xheight for fuzz sp\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_gap_factor, 0.83, \"gap ratio to flip sp->kern\", ccstruct_->params())\n    , double_MEMBER(tosp_kern_gap_factor1, 2.0, \"gap ratio to flip kern->sp\", ccstruct_->params())\n    , double_MEMBER(tosp_kern_gap_factor2, 1.3, \"gap ratio to flip kern->sp\", ccstruct_->params())\n    , double_MEMBER(tosp_kern_gap_factor3, 2.5, \"gap ratio to flip kern->sp\", ccstruct_->params())\n    , double_MEMBER(tosp_ignore_big_gaps, -1, \"xht multiplier\", ccstruct_->params())\n    , double_MEMBER(tosp_ignore_very_big_gaps, 3.5, \"xht multiplier\", ccstruct_->params())\n    , double_MEMBER(tosp_rep_space, 1.6, \"rep gap multiplier for space\", ccstruct_->params())\n    , double_MEMBER(tosp_enough_small_gaps, 0.65, \"Fract of kerns reqd for isolated row stats\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_table_kn_sp_ratio, 2.25, \"Min difference of kn & sp in table\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_table_xht_sp_ratio, 0.33, \"Expect spaces bigger than this\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0, \"Fuzzy if less than this\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, \"New fuzzy kn alg\", ccstruct_->params())\n    , double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, \"New fuzzy sp alg\", ccstruct_->params())\n    , double_MEMBER(tosp_min_sane_kn_sp, 1.5, \"Don't trust spaces less than this time kn\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_init_guess_kn_mult, 2.2, \"Thresh guess - mult kn by this\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_init_guess_xht_mult, 0.28, \"Thresh guess - mult xht by this\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_max_sane_kn_thresh, 5.0, 
\"Multiplier on kn to limit thresh\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_flip_caution, 0.0, \"Don't autoflip kn to sp when large separation\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_large_kerning, 0.19, \"Limit use of xht gap with large kns\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_dont_fool_with_small_kerns, -1, \"Limit use of xht gap with odd small kns\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_near_lh_edge, 0, \"Don't reduce box if the top left is non blank\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_silly_kn_sp_gap, 0.2, \"Don't let sp minus kn get too small\",\n                    ccstruct_->params())\n    , double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75, \"How wide fuzzies need context\",\n                    ccstruct_->params())\n    ,\n    // tordmain.cpp ///////////////////////////////////////////\n    BOOL_MEMBER(textord_no_rejects, false, \"Don't remove noise blobs\", ccstruct_->params())\n    , BOOL_MEMBER(textord_show_blobs, false, \"Display unsorted blobs\", ccstruct_->params())\n    , BOOL_MEMBER(textord_show_boxes, false, \"Display unsorted blobs\", ccstruct_->params())\n    , INT_MEMBER(textord_max_noise_size, 7, \"Pixel size of noise\", ccstruct_->params())\n    , INT_MEMBER(textord_baseline_debug, 0, \"Baseline debug level\", ccstruct_->params())\n    , double_MEMBER(textord_noise_area_ratio, 0.7, \"Fraction of bounding box for noise\",\n                    ccstruct_->params())\n    , double_MEMBER(textord_initialx_ile, 0.75, \"Ile of sizes for xheight guess\",\n                    ccstruct_->params())\n    , double_MEMBER(textord_initialasc_ile, 0.90, \"Ile of sizes for xheight guess\",\n                    ccstruct_->params())\n    , INT_MEMBER(textord_noise_sizefraction, 10, \"Fraction of size for maxima\", ccstruct_->params())\n    , double_MEMBER(textord_noise_sizelimit, 0.5, \"Fraction of x 
for big t count\",\n                    ccstruct_->params())\n    , INT_MEMBER(textord_noise_translimit, 16, \"Transitions for normal blob\", ccstruct_->params())\n    , double_MEMBER(textord_noise_normratio, 2.0, \"Dot to norm ratio for deletion\",\n                    ccstruct_->params())\n    , BOOL_MEMBER(textord_noise_rejwords, true, \"Reject noise-like words\", ccstruct_->params())\n    , BOOL_MEMBER(textord_noise_rejrows, true, \"Reject noise-like rows\", ccstruct_->params())\n    , double_MEMBER(textord_noise_syfract, 0.2, \"xh fract height error for norm blobs\",\n                    ccstruct_->params())\n    , double_MEMBER(textord_noise_sxfract, 0.4, \"xh fract width error for norm blobs\",\n                    ccstruct_->params())\n    , double_MEMBER(textord_noise_hfract, 1.0 / 64,\n                    \"Height fraction to discard outlines as speckle noise\", ccstruct_->params())\n    , INT_MEMBER(textord_noise_sncount, 1, \"super norm blobs to save row\", ccstruct_->params())\n    , double_MEMBER(textord_noise_rowratio, 6.0, \"Dot to norm ratio for deletion\",\n                    ccstruct_->params())\n    , BOOL_MEMBER(textord_noise_debug, false, \"Debug row garbage detector\", ccstruct_->params())\n    , double_MEMBER(textord_blshift_maxshift, 0.00, \"Max baseline shift\", ccstruct_->params())\n    , double_MEMBER(textord_blshift_xfraction, 9.99, \"Min size of baseline shift\",\n                    ccstruct_->params()) {}\n\n// Make the textlines and words inside each block.\nvoid Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height,\n                          Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms,\n                          BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,\n                          TO_BLOCK_LIST *to_blocks, float *gradient) {\n  page_tr_.set_x(width);\n  page_tr_.set_y(height);\n  if (to_blocks->empty()) {\n    // AutoPageSeg was not used, so we need to 
find_components first.\n    find_components(binary_pix, blocks, to_blocks);\n    TO_BLOCK_IT it(to_blocks);\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      TO_BLOCK *to_block = it.data();\n      // Compute the edge offsets whether or not there is a grey_pix.\n      // We have by-passed auto page seg, so we have to run it here.\n      // By page segmentation mode there is no non-text to avoid running on.\n      to_block->ComputeEdgeOffsets(thresholds_pix, grey_pix);\n    }\n  } else if (!PSM_SPARSE(pageseg_mode)) {\n    // AutoPageSeg does not need to find_components as it did that already.\n    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.\n    filter_blobs(page_tr_, to_blocks, true);\n  }\n\n  ASSERT_HOST(!to_blocks->empty());\n  if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {\n    const FCOORD anticlockwise90(0.0f, 1.0f);\n    const FCOORD clockwise90(0.0f, -1.0f);\n    TO_BLOCK_IT it(to_blocks);\n    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n      TO_BLOCK *to_block = it.data();\n      BLOCK *block = to_block->block;\n      // Create a fake poly_block in block from its bounding box.\n      block->pdblk.set_poly_block(new POLY_BLOCK(block->pdblk.bounding_box(), PT_VERTICAL_TEXT));\n      // Rotate the to_block along with its contained block and blobnbox lists.\n      to_block->rotate(anticlockwise90);\n      // Set the block's rotation values to obey the convention followed in\n      // layout analysis for vertical text.\n      block->set_re_rotation(clockwise90);\n      block->set_classify_rotation(clockwise90);\n    }\n  }\n\n  TO_BLOCK_IT to_block_it(to_blocks);\n  TO_BLOCK *to_block = to_block_it.data();\n  // Make the rows in the block.\n  // Do it the old fashioned way.\n  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {\n    *gradient = make_rows(page_tr_, to_blocks);\n  } else if (!PSM_SPARSE(pageseg_mode)) {\n    // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.\n 
   *gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE, to_block, to_blocks);\n  } else {\n    *gradient = 0.0f;\n  }\n  BaselineDetect baseline_detector(textord_baseline_debug, reskew, to_blocks);\n  baseline_detector.ComputeStraightBaselines(use_box_bottoms);\n  baseline_detector.ComputeBaselineSplinesAndXheights(\n      page_tr_, pageseg_mode != PSM_RAW_LINE, textord_heavy_nr, textord_show_final_rows, this);\n  // Now make the words in the lines.\n  if (PSM_WORD_FIND_ENABLED(pageseg_mode)) {\n    // SINGLE_LINE uses the old word maker on the single line.\n    make_words(this, page_tr_, *gradient, blocks, to_blocks);\n  } else {\n    // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a\n    // single word, and in SINGLE_CHAR mode, all the outlines\n    // go in a single blob.\n    TO_BLOCK *to_block = to_block_it.data();\n    make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(),\n                     to_block->block->row_list());\n  }\n  // Remove empties.\n  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);\n  TransferDiacriticsToBlockGroups(diacritic_blobs, blocks);\n  // Compute the margins for each row in the block, to be used later for\n  // paragraph detection.\n  BLOCK_IT b_it(blocks);\n  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n    b_it.data()->compute_row_margins();\n  }\n#ifndef GRAPHICS_DISABLED\n  close_to_win();\n#endif\n}\n\n// If we were supposed to return only a single textline, and there is more\n// than one, clean up and leave only the best.\nvoid Textord::CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res) {\n  if (PSM_LINE_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) {\n    return; // No cleanup required.\n  }\n  PAGE_RES_IT it(page_res);\n  // Find the best row, being the greatest mean word conf.\n  float row_total_conf = 0.0f;\n  int row_word_count = 0;\n  ROW_RES *best_row = nullptr;\n  float best_conf = 0.0f;\n  for (it.restart_page(); 
it.word() != nullptr; it.forward()) {\n    WERD_RES *word = it.word();\n    row_total_conf += word->best_choice->certainty();\n    ++row_word_count;\n    if (it.next_row() != it.row()) {\n      row_total_conf /= row_word_count;\n      if (best_row == nullptr || best_conf < row_total_conf) {\n        best_row = it.row();\n        best_conf = row_total_conf;\n      }\n      row_total_conf = 0.0f;\n      row_word_count = 0;\n    }\n  }\n  // Now eliminate any word not in the best row.\n  for (it.restart_page(); it.word() != nullptr; it.forward()) {\n    if (it.row() != best_row) {\n      it.DeleteCurrentWord();\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/textord.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        textord.h\n// Description: The Textord class definition gathers text line and word\n//              finding functionality.\n// Author:      Ray Smith\n// Created:     Fri Mar 13 14:29:01 PDT 2009\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_TEXTORD_H_\n#define TESSERACT_TEXTORD_TEXTORD_H_\n\n#include \"bbgrid.h\"\n#include \"blobbox.h\"\n#include \"ccstruct.h\"\n#include \"gap_map.h\"\n\n#include <tesseract/publictypes.h> // For PageSegMode.\n\nnamespace tesseract {\n\nclass FCOORD;\nclass BLOCK_LIST;\nclass PAGE_RES;\nclass TO_BLOCK;\nclass TO_BLOCK_LIST;\nclass ScrollView;\n\n// A simple class that can be used by BBGrid to hold a word and an expanded\n// bounding box that makes it easy to find words to put diacritics.\nclass WordWithBox {\npublic:\n  WordWithBox() : word_(nullptr) {}\n  explicit WordWithBox(WERD *word) : word_(word), bounding_box_(word->bounding_box()) {\n    int height = bounding_box_.height();\n    bounding_box_.pad(height, height);\n  }\n\n  const TBOX &bounding_box() const {\n    return bounding_box_;\n  }\n  // Returns the bounding box of only the good blobs.\n  TBOX true_bounding_box() const {\n    return word_->true_bounding_box();\n  }\n  C_BLOB_LIST *RejBlobs() const {\n    return 
word_->rej_cblob_list();\n  }\n  const WERD *word() const {\n    return word_;\n  }\n\nprivate:\n  // Borrowed pointer to a real word somewhere that must outlive this class.\n  WERD *word_;\n  // Cached expanded bounding box of the word, padded all round by its height.\n  TBOX bounding_box_;\n};\n\n// Make it usable by BBGrid.\nCLISTIZEH(WordWithBox)\nusing WordGrid = BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT>;\nusing WordSearch = GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT>;\n\nclass Textord {\npublic:\n  explicit Textord(CCStruct *ccstruct);\n  ~Textord() = default;\n\n  // Make the textlines and words inside each block.\n  // binary_pix is mandatory and is the binarized input after line removal.\n  // grey_pix is optional, but if present must match the binary_pix in size,\n  // and must be a *real* grey image instead of binary_pix * 255.\n  // thresholds_pix is expected to be present iff grey_pix is present and\n  // can be an integer factor reduction of the grey_pix. 
It represents the\n  // thresholds that were used to create the binary_pix from the grey_pix.\n  // diacritic_blobs contain small confusing components that should be added\n  // to the appropriate word(s) in case they are really diacritics.\n  void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height,\n                   Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms,\n                   BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,\n                   float *gradient);\n\n  // If we were supposed to return only a single textline, and there is more\n  // than one, clean up and leave only the best.\n  void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res);\n\n  bool use_cjk_fp_model() const {\n    return use_cjk_fp_model_;\n  }\n  void set_use_cjk_fp_model(bool flag) {\n    use_cjk_fp_model_ = flag;\n  }\n\n  // tospace.cpp ///////////////////////////////////////////\n  void to_spacing(ICOORD page_tr,       // topright of page\n                  TO_BLOCK_LIST *blocks // blocks on page\n  );\n  ROW *make_prop_words(TO_ROW *row,    // row to make\n                       FCOORD rotation // for drawing\n  );\n  ROW *make_blob_words(TO_ROW *row,    // row to make\n                       FCOORD rotation // for drawing\n  );\n  // tordmain.cpp ///////////////////////////////////////////\n  void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);\n  void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on);\n\nprivate:\n  // For underlying memory management and other utilities.\n  CCStruct *ccstruct_;\n\n  // The size of the input image.\n  ICOORD page_tr_;\n\n  bool use_cjk_fp_model_;\n\n  // makerow.cpp ///////////////////////////////////////////\n  // Make the textlines inside each block.\n  void MakeRows(PageSegMode pageseg_mode, const FCOORD &skew, int width, int height,\n                TO_BLOCK_LIST *to_blocks);\n  // 
Make the textlines inside a single block.\n  void MakeBlockRows(int min_spacing, int max_spacing, const FCOORD &skew, TO_BLOCK *block,\n                     ScrollView *win);\n\npublic:\n  void compute_block_xheight(TO_BLOCK *block, float gradient);\n  void compute_row_xheight(TO_ROW *row, // row to do\n                           const FCOORD &rotation,\n                           float gradient, // global skew\n                           int block_line_size);\n  void make_spline_rows(TO_BLOCK *block, // block to do\n                        float gradient,  // gradient to fit\n                        bool testing_on);\n\nprivate:\n  //// oldbasel.cpp ////////////////////////////////////////\n  void make_old_baselines(TO_BLOCK *block, // block to do\n                          bool testing_on, // correct orientation\n                          float gradient);\n  void correlate_lines(TO_BLOCK *block, float gradient);\n  void correlate_neighbours(TO_BLOCK *block, // block rows are in.\n                            TO_ROW **rows,   // rows of block.\n                            int rowcount);   // no of rows to do.\n  int correlate_with_stats(TO_ROW **rows,    // rows of block.\n                           int rowcount,     // no of rows to do.\n                           TO_BLOCK *block);\n  void find_textlines(TO_BLOCK *block,  // block row is in\n                      TO_ROW *row,      // row to do\n                      int degree,       // required approximation\n                      QSPLINE *spline); // starting spline\n  // tospace.cpp ///////////////////////////////////////////\n  // DEBUG USE ONLY\n  void block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,\n                           // resulting estimate\n                           int16_t &block_space_gap_width,\n                           // resulting estimate\n                           int16_t &block_non_space_gap_width);\n  void row_spacing_stats(TO_ROW *row, GAPMAP 
*gapmap, int16_t block_idx, int16_t row_idx,\n                         // estimate for block\n                         int16_t block_space_gap_width,\n                         // estimate for block\n                         int16_t block_non_space_gap_width);\n  void old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,\n                     STATS *small_gap_stats, int16_t block_space_gap_width,\n                     // estimate for block\n                     int16_t block_non_space_gap_width);\n  bool isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats, bool suspected_table,\n                          int16_t block_idx, int16_t row_idx);\n  int16_t stats_count_under(STATS *stats, int16_t threshold);\n  void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats);\n  bool make_a_word_break(TO_ROW *row,   // row being made\n                         TBOX blob_box, // for next_blob // how many blanks?\n                         int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,\n                         int16_t within_xht_current_gap, TBOX next_blob_box, int16_t next_gap,\n                         uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,\n                         bool &prev_gap_was_a_space, bool &break_at_next_gap);\n  bool narrow_blob(TO_ROW *row, TBOX blob_box);\n  bool wide_blob(TO_ROW *row, TBOX blob_box);\n  bool suspected_punct_blob(TO_ROW *row, TBOX box);\n  void peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box, int16_t &next_gap,\n                        int16_t &next_within_xht_gap);\n  void mark_gap(TBOX blob,    // blob following gap\n                int16_t rule, // heuristic id\n                int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,\n                int16_t next_blob_width, int16_t next_gap);\n  float find_mean_blob_spacing(WERD *word);\n  bool ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left, int16_t right);\n  // get 
bounding box\n  TBOX reduced_box_next(TO_ROW *row,    // current row\n                        BLOBNBOX_IT *it // iterator to blobds\n  );\n  TBOX reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht);\n  // tordmain.cpp ///////////////////////////////////////////\n  float filter_noise_blobs(BLOBNBOX_LIST *src_list, BLOBNBOX_LIST *noise_list,\n                           BLOBNBOX_LIST *small_list, BLOBNBOX_LIST *large_list);\n  // Fixes the block so it obeys all the rules:\n  // Must have at least one ROW.\n  // Must have at least one WERD.\n  // WERDs contain a fake blob.\n  void cleanup_nontext_block(BLOCK *block);\n  void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);\n  bool clean_noise_from_row(ROW *row);\n  void clean_noise_from_words(ROW *row);\n  // Remove outlines that are a tiny fraction in either width or height\n  // of the word height.\n  void clean_small_noise_from_words(ROW *row);\n  // Groups blocks by rotation, then, for each group, makes a WordGrid and calls\n  // TransferDiacriticsToWords to copy the diacritic blobs to the most\n  // appropriate words in the group of blocks. Source blobs are not touched.\n  void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks);\n  // Places a copy of blobs that are near a word (after applying rotation to the\n  // blob) in the most appropriate word, unless there is doubt, in which case a\n  // blob can end up in two words. 
Source blobs are not touched.\n  void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation,\n                                 WordGrid *word_grid);\n\npublic:\n  // makerow.cpp ///////////////////////////////////////////\n  BOOL_VAR_H(textord_single_height_mode);\n  // tospace.cpp ///////////////////////////////////////////\n  BOOL_VAR_H(tosp_old_to_method);\n  BOOL_VAR_H(tosp_old_to_constrain_sp_kn);\n  BOOL_VAR_H(tosp_only_use_prop_rows);\n  BOOL_VAR_H(tosp_force_wordbreak_on_punct);\n  BOOL_VAR_H(tosp_use_pre_chopping);\n  BOOL_VAR_H(tosp_old_to_bug_fix);\n  BOOL_VAR_H(tosp_block_use_cert_spaces);\n  BOOL_VAR_H(tosp_row_use_cert_spaces);\n  BOOL_VAR_H(tosp_narrow_blobs_not_cert);\n  BOOL_VAR_H(tosp_row_use_cert_spaces1);\n  BOOL_VAR_H(tosp_recovery_isolated_row_stats);\n  BOOL_VAR_H(tosp_only_small_gaps_for_kern);\n  BOOL_VAR_H(tosp_all_flips_fuzzy);\n  BOOL_VAR_H(tosp_fuzzy_limit_all);\n  BOOL_VAR_H(tosp_stats_use_xht_gaps);\n  BOOL_VAR_H(tosp_use_xht_gaps);\n  BOOL_VAR_H(tosp_only_use_xht_gaps);\n  BOOL_VAR_H(tosp_rule_9_test_punct);\n  BOOL_VAR_H(tosp_flip_fuzz_kn_to_sp);\n  BOOL_VAR_H(tosp_flip_fuzz_sp_to_kn);\n  BOOL_VAR_H(tosp_improve_thresh);\n  INT_VAR_H(tosp_debug_level);\n  INT_VAR_H(tosp_enough_space_samples_for_median);\n  INT_VAR_H(tosp_redo_kern_limit);\n  INT_VAR_H(tosp_few_samples);\n  INT_VAR_H(tosp_short_row);\n  INT_VAR_H(tosp_sanity_method);\n  double_VAR_H(tosp_old_sp_kn_th_factor);\n  double_VAR_H(tosp_threshold_bias1);\n  double_VAR_H(tosp_threshold_bias2);\n  double_VAR_H(tosp_narrow_fraction);\n  double_VAR_H(tosp_narrow_aspect_ratio);\n  double_VAR_H(tosp_wide_fraction);\n  double_VAR_H(tosp_wide_aspect_ratio);\n  double_VAR_H(tosp_fuzzy_space_factor);\n  double_VAR_H(tosp_fuzzy_space_factor1);\n  double_VAR_H(tosp_fuzzy_space_factor2);\n  double_VAR_H(tosp_gap_factor);\n  double_VAR_H(tosp_kern_gap_factor1);\n  double_VAR_H(tosp_kern_gap_factor2);\n  double_VAR_H(tosp_kern_gap_factor3);\n  
double_VAR_H(tosp_ignore_big_gaps);\n  double_VAR_H(tosp_ignore_very_big_gaps);\n  double_VAR_H(tosp_rep_space);\n  double_VAR_H(tosp_enough_small_gaps);\n  double_VAR_H(tosp_table_kn_sp_ratio);\n  double_VAR_H(tosp_table_xht_sp_ratio);\n  double_VAR_H(tosp_table_fuzzy_kn_sp_ratio);\n  double_VAR_H(tosp_fuzzy_kn_fraction);\n  double_VAR_H(tosp_fuzzy_sp_fraction);\n  double_VAR_H(tosp_min_sane_kn_sp);\n  double_VAR_H(tosp_init_guess_kn_mult);\n  double_VAR_H(tosp_init_guess_xht_mult);\n  double_VAR_H(tosp_max_sane_kn_thresh);\n  double_VAR_H(tosp_flip_caution);\n  double_VAR_H(tosp_large_kerning);\n  double_VAR_H(tosp_dont_fool_with_small_kerns);\n  double_VAR_H(tosp_near_lh_edge);\n  double_VAR_H(tosp_silly_kn_sp_gap);\n  double_VAR_H(tosp_pass_wide_fuzz_sp_to_context);\n  // tordmain.cpp ///////////////////////////////////////////\n  BOOL_VAR_H(textord_no_rejects);\n  BOOL_VAR_H(textord_show_blobs);\n  BOOL_VAR_H(textord_show_boxes);\n  INT_VAR_H(textord_max_noise_size);\n  INT_VAR_H(textord_baseline_debug);\n  double_VAR_H(textord_noise_area_ratio);\n  double_VAR_H(textord_initialx_ile);\n  double_VAR_H(textord_initialasc_ile);\n  INT_VAR_H(textord_noise_sizefraction);\n  double_VAR_H(textord_noise_sizelimit);\n  INT_VAR_H(textord_noise_translimit);\n  double_VAR_H(textord_noise_normratio);\n  BOOL_VAR_H(textord_noise_rejwords);\n  BOOL_VAR_H(textord_noise_rejrows);\n  double_VAR_H(textord_noise_syfract);\n  double_VAR_H(textord_noise_sxfract);\n  double_VAR_H(textord_noise_hfract);\n  INT_VAR_H(textord_noise_sncount);\n  double_VAR_H(textord_noise_rowratio);\n  BOOL_VAR_H(textord_noise_debug);\n  double_VAR_H(textord_blshift_maxshift);\n  double_VAR_H(textord_blshift_xfraction);\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TEXTORD_TEXTORD_H_\n"
  },
  {
    "path": "src/textord/topitch.cpp",
    "content": "/**********************************************************************\n * File:        topitch.cpp  (Formerly to_pitch.c)\n * Description: Code to determine fixed pitchness and the pitch if fixed.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"topitch.h\"\n\n#include \"blobbox.h\"\n#include \"drawtord.h\"\n#include \"makerow.h\"\n#include \"pithsync.h\"\n#include \"pitsync1.h\"\n#include \"statistc.h\"\n#include \"tovars.h\"\n#include \"wordseg.h\"\n\n#include \"helpers.h\"\n\n#include <memory>\n\nnamespace tesseract {\n\nstatic BOOL_VAR(textord_all_prop, false, \"All doc is proportial text\");\nBOOL_VAR(textord_debug_pitch_test, false, \"Debug on fixed pitch test\");\nstatic BOOL_VAR(textord_disable_pitch_test, false, \"Turn off dp fixed pitch algorithm\");\nBOOL_VAR(textord_fast_pitch_test, false, \"Do even faster pitch algorithm\");\nBOOL_VAR(textord_debug_pitch_metric, false, \"Write full metric stuff\");\nBOOL_VAR(textord_show_row_cuts, false, \"Draw row-level cuts\");\nBOOL_VAR(textord_show_page_cuts, false, \"Draw page-level cuts\");\nBOOL_VAR(textord_blockndoc_fixed, false, \"Attempt whole doc/block fixed 
pitch\");\ndouble_VAR(textord_projection_scale, 0.200, \"Ding rate for mid-cuts\");\ndouble_VAR(textord_balance_factor, 1.0, \"Ding rate for unbalanced char cells\");\n\n#define BLOCK_STATS_CLUSTERS 10\n#define MAX_ALLOWED_PITCH 100 // max pixel pitch.\n\n// qsort function to sort 2 floats.\nstatic int sort_floats(const void *arg1, const void *arg2) {\n  float diff = *reinterpret_cast<const float *>(arg1) - *reinterpret_cast<const float *>(arg2);\n  if (diff > 0) {\n    return 1;\n  } else if (diff < 0) {\n    return -1;\n  } else {\n    return 0;\n  }\n}\n\n/**********************************************************************\n * compute_fixed_pitch\n *\n * Decide whether each row is fixed pitch individually.\n * Correlate definite and uncertain results to obtain an individual\n * result for each row in the TO_ROW class.\n **********************************************************************/\n\nvoid compute_fixed_pitch(ICOORD page_tr,             // top right\n                         TO_BLOCK_LIST *port_blocks, // input list\n                         float gradient,             // page skew\n                         FCOORD rotation,            // for drawing\n                         bool testing_on) {          // correct orientation\n  TO_BLOCK_IT block_it;                              // iterator\n  TO_BLOCK *block;                                   // current block;\n  TO_ROW *row;                                       // current row\n  int block_index;                                   // block number\n  int row_index;                                     // row number\n\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_initial_words && testing_on) {\n    if (to_win == nullptr) {\n      create_to_win(page_tr);\n    }\n  }\n#endif\n\n  block_it.set_to_list(port_blocks);\n  block_index = 1;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block = block_it.data();\n    compute_block_pitch(block, rotation, block_index, 
testing_on);\n    block_index++;\n  }\n\n  if (!try_doc_fixed(page_tr, port_blocks, gradient)) {\n    block_index = 1;\n    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n      block = block_it.data();\n      if (!try_block_fixed(block, block_index)) {\n        try_rows_fixed(block, block_index, testing_on);\n      }\n      block_index++;\n    }\n  }\n\n  block_index = 1;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block = block_it.data();\n    POLY_BLOCK *pb = block->block->pdblk.poly_block();\n    if (pb != nullptr && !pb->IsText()) {\n      continue; // Non-text doesn't exist!\n    }\n    // row iterator\n    TO_ROW_IT row_it(block->get_rows());\n    row_index = 1;\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      row = row_it.data();\n      fix_row_pitch(row, block, port_blocks, row_index, block_index);\n      row_index++;\n    }\n    block_index++;\n  }\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_initial_words && testing_on) {\n    ScrollView::Update();\n  }\n#endif\n}\n\n/**********************************************************************\n * fix_row_pitch\n *\n * Get a pitch_decision for this row by voting among similar rows in the\n * block, then similar rows over all the page, or any other rows at all.\n **********************************************************************/\n\nvoid fix_row_pitch(TO_ROW *bad_row,        // row to fix\n                   TO_BLOCK *bad_block,    // block of bad_row\n                   TO_BLOCK_LIST *blocks,  // blocks to scan\n                   int32_t row_target,     // number of row\n                   int32_t block_target) { // number of block\n  int16_t mid_cuts;\n  int block_votes;               // votes in block\n  int like_votes;                // votes over page\n  int other_votes;               // votes of unlike blocks\n  int block_index;               // number of block\n  int maxwidth;                
  // max pitch\n  TO_BLOCK_IT block_it = blocks; // block iterator\n  TO_BLOCK *block;               // current block\n  TO_ROW *row;                   // current row\n  float sp_sd;                   // space deviation\n  STATS block_stats;             // pitches in block\n  STATS like_stats;              // pitches in page\n\n  block_votes = like_votes = other_votes = 0;\n  maxwidth = static_cast<int32_t>(ceil(bad_row->xheight * textord_words_maxspace));\n  if (bad_row->pitch_decision != PITCH_DEF_FIXED && bad_row->pitch_decision != PITCH_DEF_PROP) {\n    block_stats.set_range(0, maxwidth - 1);\n    like_stats.set_range(0, maxwidth - 1);\n    block_index = 1;\n    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n      block = block_it.data();\n      POLY_BLOCK *pb = block->block->pdblk.poly_block();\n      if (pb != nullptr && !pb->IsText()) {\n        continue; // Non text doesn't exist!\n      }\n      TO_ROW_IT row_it(block->get_rows());\n      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n        row = row_it.data();\n        if ((bad_row->all_caps &&\n             row->xheight + row->ascrise <\n                 (bad_row->xheight + bad_row->ascrise) * (1 + textord_pitch_rowsimilarity) &&\n             row->xheight + row->ascrise >\n                 (bad_row->xheight + bad_row->ascrise) * (1 - textord_pitch_rowsimilarity)) ||\n            (!bad_row->all_caps &&\n             row->xheight < bad_row->xheight * (1 + textord_pitch_rowsimilarity) &&\n             row->xheight > bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {\n          if (block_index == block_target) {\n            if (row->pitch_decision == PITCH_DEF_FIXED) {\n              block_votes += textord_words_veto_power;\n              block_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);\n            } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||\n                       row->pitch_decision == 
PITCH_CORR_FIXED) {\n              block_votes++;\n              block_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);\n            } else if (row->pitch_decision == PITCH_DEF_PROP) {\n              block_votes -= textord_words_veto_power;\n            } else if (row->pitch_decision == PITCH_MAYBE_PROP ||\n                       row->pitch_decision == PITCH_CORR_PROP) {\n              block_votes--;\n            }\n          } else {\n            if (row->pitch_decision == PITCH_DEF_FIXED) {\n              like_votes += textord_words_veto_power;\n              like_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);\n            } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||\n                       row->pitch_decision == PITCH_CORR_FIXED) {\n              like_votes++;\n              like_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);\n            } else if (row->pitch_decision == PITCH_DEF_PROP) {\n              like_votes -= textord_words_veto_power;\n            } else if (row->pitch_decision == PITCH_MAYBE_PROP ||\n                       row->pitch_decision == PITCH_CORR_PROP) {\n              like_votes--;\n            }\n          }\n        } else {\n          if (row->pitch_decision == PITCH_DEF_FIXED) {\n            other_votes += textord_words_veto_power;\n          } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||\n                     row->pitch_decision == PITCH_CORR_FIXED) {\n            other_votes++;\n          } else if (row->pitch_decision == PITCH_DEF_PROP) {\n            other_votes -= textord_words_veto_power;\n          } else if (row->pitch_decision == PITCH_MAYBE_PROP ||\n                     row->pitch_decision == PITCH_CORR_PROP) {\n            other_votes--;\n          }\n        }\n      }\n      block_index++;\n    }\n    if (block_votes > textord_words_veto_power) {\n      bad_row->fixed_pitch = block_stats.ile(0.5);\n      bad_row->pitch_decision = PITCH_CORR_FIXED;\n    } else if 
(block_votes <= textord_words_veto_power && like_votes > 0) {\n      bad_row->fixed_pitch = like_stats.ile(0.5);\n      bad_row->pitch_decision = PITCH_CORR_FIXED;\n    } else {\n      bad_row->pitch_decision = PITCH_CORR_PROP;\n      if (block_votes == 0 && like_votes == 0 && other_votes > 0 &&\n          (textord_debug_pitch_test || textord_debug_pitch_metric)) {\n        tprintf(\n            \"Warning:row %d of block %d set prop with no like rows against \"\n            \"trend\\n\",\n            row_target, block_target);\n      }\n    }\n  }\n  if (textord_debug_pitch_metric) {\n    tprintf(\":b_votes=%d:l_votes=%d:o_votes=%d\", block_votes, like_votes, other_votes);\n    tprintf(\"x=%g:asc=%g\\n\", bad_row->xheight, bad_row->ascrise);\n  }\n  if (bad_row->pitch_decision == PITCH_CORR_FIXED) {\n    if (bad_row->fixed_pitch < textord_min_xheight) {\n      if (block_votes > 0) {\n        bad_row->fixed_pitch = block_stats.ile(0.5);\n      } else if (block_votes == 0 && like_votes > 0) {\n        bad_row->fixed_pitch = like_stats.ile(0.5);\n      } else {\n        tprintf(\"Warning:guessing pitch as xheight on row %d, block %d\\n\", row_target,\n                block_target);\n        bad_row->fixed_pitch = bad_row->xheight;\n      }\n    }\n    if (bad_row->fixed_pitch < textord_min_xheight) {\n      bad_row->fixed_pitch = (float)textord_min_xheight;\n    }\n    bad_row->kern_size = bad_row->fixed_pitch / 4;\n    bad_row->min_space = static_cast<int32_t>(bad_row->fixed_pitch * 0.6);\n    bad_row->max_nonspace = static_cast<int32_t>(bad_row->fixed_pitch * 0.4);\n    bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2;\n    bad_row->space_size = bad_row->fixed_pitch;\n    if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {\n      tune_row_pitch(bad_row, &bad_row->projection, bad_row->projection_left,\n                     bad_row->projection_right,\n                     (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 
4, bad_row->fixed_pitch,\n                     sp_sd, mid_cuts, &bad_row->char_cells, false);\n    }\n  } else if (bad_row->pitch_decision == PITCH_CORR_PROP ||\n             bad_row->pitch_decision == PITCH_DEF_PROP) {\n    bad_row->fixed_pitch = 0.0f;\n    bad_row->char_cells.clear();\n  }\n}\n\n/**********************************************************************\n * compute_block_pitch\n *\n * Decide whether each block is fixed pitch individually.\n **********************************************************************/\n\nvoid compute_block_pitch(TO_BLOCK *block,     // input list\n                         FCOORD rotation,     // for drawing\n                         int32_t block_index, // block number\n                         bool testing_on) {   // correct orientation\n  TBOX block_box;                             // bounding box\n\n  block_box = block->block->pdblk.bounding_box();\n  if (testing_on && textord_debug_pitch_test) {\n    tprintf(\"Block %d at (%d,%d)->(%d,%d)\\n\", block_index, block_box.left(), block_box.bottom(),\n            block_box.right(), block_box.top());\n  }\n  block->min_space = static_cast<int32_t>(floor(block->xheight * textord_words_default_minspace));\n  block->max_nonspace = static_cast<int32_t>(ceil(block->xheight * textord_words_default_nonspace));\n  block->fixed_pitch = 0.0f;\n  block->space_size = static_cast<float>(block->min_space);\n  block->kern_size = static_cast<float>(block->max_nonspace);\n  block->pr_nonsp = block->xheight * words_default_prop_nonspace;\n  block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;\n  if (!block->get_rows()->empty()) {\n    ASSERT_HOST(block->xheight > 0);\n    find_repeated_chars(block, textord_show_initial_words && testing_on);\n#ifndef GRAPHICS_DISABLED\n    if (textord_show_initial_words && testing_on) {\n      // overlap_picture_ops(true);\n      ScrollView::Update();\n    }\n#endif\n    compute_rows_pitch(block, block_index, textord_debug_pitch_test && 
testing_on);\n  }\n}\n\n/**********************************************************************\n * compute_rows_pitch\n *\n * Decide whether each row is fixed pitch individually.\n **********************************************************************/\n\nbool compute_rows_pitch( // find line stats\n    TO_BLOCK *block,     // block to do\n    int32_t block_index, // block number\n    bool testing_on      // correct orientation\n) {\n  int32_t maxwidth;   // of spaces\n  TO_ROW *row;        // current row\n  int32_t row_index;  // row number.\n  float lower, upper; // cluster thresholds\n  TO_ROW_IT row_it = block->get_rows();\n\n  row_index = 1;\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    ASSERT_HOST(row->xheight > 0);\n    row->compute_vertical_projection();\n    maxwidth = static_cast<int32_t>(ceil(row->xheight * textord_words_maxspace));\n    if (row_pitch_stats(row, maxwidth, testing_on) &&\n        find_row_pitch(row, maxwidth, textord_dotmatrix_gap + 1, block, block_index, row_index,\n                       testing_on)) {\n      if (row->fixed_pitch == 0) {\n        lower = row->pr_nonsp;\n        upper = row->pr_space;\n        row->space_size = upper;\n        row->kern_size = lower;\n      }\n    } else {\n      row->fixed_pitch = 0.0f; // insufficient data\n      row->pitch_decision = PITCH_DUNNO;\n    }\n    row_index++;\n  }\n  return false;\n}\n\n/**********************************************************************\n * try_doc_fixed\n *\n * Attempt to call the entire document fixed pitch.\n **********************************************************************/\n\nbool try_doc_fixed(             // determine pitch\n    ICOORD page_tr,             // top right\n    TO_BLOCK_LIST *port_blocks, // input list\n    float gradient              // page skew\n) {\n  int16_t master_x; // uniform shifts\n  int16_t pitch;    // median pitch.\n  int x;            // profile coord\n  int prop_blocks; 
 // correct counts\n  int fixed_blocks;\n  int total_row_count; // total in page\n                       // iterator\n  TO_BLOCK_IT block_it = port_blocks;\n  TO_BLOCK *block;         // current block;\n  TO_ROW *row;             // current row\n  int16_t projection_left; // edges\n  int16_t projection_right;\n  int16_t row_left; // edges of row\n  int16_t row_right;\n  float master_y;     // uniform shifts\n  float shift_factor; // page skew correction\n  float final_pitch;  // output pitch\n  float row_y;        // baseline\n  STATS projection;   // entire page\n  STATS pitches(0, MAX_ALLOWED_PITCH - 1);\n  // for median\n  float sp_sd;      // space sd\n  int16_t mid_cuts; // no of cheap cuts\n  float pitch_sd;   // sync rating\n\n  if (!textord_blockndoc_fixed ||\n      block_it.empty() || block_it.data()->get_rows()->empty()) {\n    return false;\n  }\n  shift_factor = gradient / (gradient * gradient + 1);\n  // row iterator\n  TO_ROW_IT row_it(block_it.data()->get_rows());\n  master_x = row_it.data()->projection_left;\n  master_y = row_it.data()->baseline.y(master_x);\n  projection_left = INT16_MAX;\n  projection_right = -INT16_MAX;\n  prop_blocks = 0;\n  fixed_blocks = 0;\n  total_row_count = 0;\n\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block = block_it.data();\n    row_it.set_to_list(block->get_rows());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      row = row_it.data();\n      total_row_count++;\n      if (row->fixed_pitch > 0) {\n        pitches.add(static_cast<int32_t>(row->fixed_pitch), 1);\n      }\n      // find median\n      row_y = row->baseline.y(master_x);\n      row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));\n      row_right = static_cast<int16_t>(row->projection_right - shift_factor * (master_y - row_y));\n      if (row_left < projection_left) {\n        projection_left = row_left;\n      }\n      if (row_right > 
projection_right) {\n        projection_right = row_right;\n      }\n    }\n  }\n  if (pitches.get_total() == 0) {\n    return false;\n  }\n  projection.set_range(projection_left, projection_right - 1);\n\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block = block_it.data();\n    row_it.set_to_list(block->get_rows());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      row = row_it.data();\n      row_y = row->baseline.y(master_x);\n      row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));\n      for (x = row->projection_left; x < row->projection_right; x++, row_left++) {\n        projection.add(row_left, row->projection.pile_count(x));\n      }\n    }\n  }\n\n  row_it.set_to_list(block_it.data()->get_rows());\n  row = row_it.data();\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_page_cuts && to_win != nullptr) {\n    projection.plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);\n  }\n#endif\n  final_pitch = pitches.ile(0.5);\n  pitch = static_cast<int16_t>(final_pitch);\n  pitch_sd = tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75,\n                            final_pitch, sp_sd, mid_cuts, &row->char_cells, false);\n\n  if (textord_debug_pitch_metric) {\n    tprintf(\n        \"try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%\"\n        \"g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\\n\",\n        prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count,\n        pitch_sd / pitch, pitch_sd / total_row_count / pitch);\n  }\n\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_page_cuts && to_win != nullptr) {\n    float row_shift;              // shift for row\n    ICOORDELT_LIST *master_cells; // cells for page\n    master_cells = &row->char_cells;\n    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n      block = 
block_it.data();\n      row_it.set_to_list(block->get_rows());\n      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n        row = row_it.data();\n        row_y = row->baseline.y(master_x);\n        row_shift = shift_factor * (master_y - row_y);\n        plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);\n      }\n    }\n  }\n#endif\n  row->char_cells.clear();\n  return false;\n}\n\n/**********************************************************************\n * try_block_fixed\n *\n * Try to call the entire block fixed.\n **********************************************************************/\n\nbool try_block_fixed(   // find line stats\n    TO_BLOCK *block,    // block to do\n    int32_t block_index // block number\n) {\n  return false;\n}\n\n/**********************************************************************\n * try_rows_fixed\n *\n * Decide whether each row is fixed pitch individually.\n **********************************************************************/\n\nbool try_rows_fixed(     // find line stats\n    TO_BLOCK *block,     // block to do\n    int32_t block_index, // block number\n    bool testing_on      // correct orientation\n) {\n  TO_ROW *row;           // current row\n  int32_t def_fixed = 0; // counters\n  int32_t def_prop = 0;\n  int32_t maybe_fixed = 0;\n  int32_t maybe_prop = 0;\n  int32_t dunno = 0;\n  int32_t corr_fixed = 0;\n  int32_t corr_prop = 0;\n  float lower, upper; // cluster thresholds\n  TO_ROW_IT row_it = block->get_rows();\n\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    ASSERT_HOST(row->xheight > 0);\n    if (row->fixed_pitch > 0 && fixed_pitch_row(row, block->block, block_index)) {\n      if (row->fixed_pitch == 0) {\n        lower = row->pr_nonsp;\n        upper = row->pr_space;\n        row->space_size = upper;\n        row->kern_size = lower;\n      }\n    }\n  }\n  count_block_votes(block, def_fixed, def_prop, 
maybe_fixed, maybe_prop, corr_fixed, corr_prop,\n                    dunno);\n  if (testing_on &&\n      (textord_debug_pitch_test || textord_blocksall_prop || textord_blocksall_fixed)) {\n    tprintf(\"Initially:\");\n    print_block_counts(block, block_index);\n  }\n  if (def_fixed > def_prop * textord_words_veto_power) {\n    block->pitch_decision = PITCH_DEF_FIXED;\n  } else if (def_prop > def_fixed * textord_words_veto_power) {\n    block->pitch_decision = PITCH_DEF_PROP;\n  } else if (def_fixed > 0 || def_prop > 0) {\n    block->pitch_decision = PITCH_DUNNO;\n  } else if (maybe_fixed > maybe_prop * textord_words_veto_power) {\n    block->pitch_decision = PITCH_MAYBE_FIXED;\n  } else if (maybe_prop > maybe_fixed * textord_words_veto_power) {\n    block->pitch_decision = PITCH_MAYBE_PROP;\n  } else {\n    block->pitch_decision = PITCH_DUNNO;\n  }\n  return false;\n}\n\n/**********************************************************************\n * print_block_counts\n *\n * Count up how many rows have what decision and print the results.\n **********************************************************************/\n\nvoid print_block_counts( // find line stats\n    TO_BLOCK *block,     // block to do\n    int32_t block_index  // block number\n) {\n  int32_t def_fixed = 0; // counters\n  int32_t def_prop = 0;\n  int32_t maybe_fixed = 0;\n  int32_t maybe_prop = 0;\n  int32_t dunno = 0;\n  int32_t corr_fixed = 0;\n  int32_t corr_prop = 0;\n\n  count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,\n                    dunno);\n  tprintf(\"Block %d has (%d,%d,%d)\", block_index, def_fixed, maybe_fixed, corr_fixed);\n  if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) {\n    tprintf(\" (Wrongly)\");\n  }\n  tprintf(\" fixed, (%d,%d,%d)\", def_prop, maybe_prop, corr_prop);\n  if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) {\n    tprintf(\" (Wrongly)\");\n  }\n  tprintf(\" prop, %d dunno\\n\", 
dunno);\n}\n\n/**********************************************************************\n * count_block_votes\n *\n * Count the number of rows in the block with each kind of pitch_decision.\n **********************************************************************/\n\nvoid count_block_votes( // find line stats\n    TO_BLOCK *block,    // block to do\n    int32_t &def_fixed, // add to counts\n    int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed,\n    int32_t &corr_prop, int32_t &dunno) {\n  TO_ROW *row; // current row\n  TO_ROW_IT row_it = block->get_rows();\n\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    switch (row->pitch_decision) {\n      case PITCH_DUNNO:\n        dunno++;\n        break;\n      case PITCH_DEF_PROP:\n        def_prop++;\n        break;\n      case PITCH_MAYBE_PROP:\n        maybe_prop++;\n        break;\n      case PITCH_DEF_FIXED:\n        def_fixed++;\n        break;\n      case PITCH_MAYBE_FIXED:\n        maybe_fixed++;\n        break;\n      case PITCH_CORR_PROP:\n        corr_prop++;\n        break;\n      case PITCH_CORR_FIXED:\n        corr_fixed++;\n        break;\n    }\n  }\n}\n\n/**********************************************************************\n * row_pitch_stats\n *\n * Decide whether each row is fixed pitch individually.\n **********************************************************************/\n\nbool row_pitch_stats( // find line stats\n    TO_ROW *row,      // current row\n    int32_t maxwidth, // of spaces\n    bool testing_on   // correct orientation\n) {\n  BLOBNBOX *blob;        // current blob\n  int gap_index;         // current gap\n  int32_t prev_x;        // end of prev blob\n  int32_t cluster_count; // no of clusters\n  int32_t prev_count;    // of clusters\n  int32_t smooth_factor; // for smoothing stats\n  TBOX blob_box;         // bounding box\n  float lower, upper;    // cluster thresholds\n                         // gap 
sizes\n  float gaps[BLOCK_STATS_CLUSTERS];\n  // blobs\n  BLOBNBOX_IT blob_it = row->blob_list();\n  STATS gap_stats(0, maxwidth - 1);\n  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];\n  // clusters\n\n  smooth_factor = static_cast<int32_t>(row->xheight * textord_wordstats_smooth_factor + 1.5);\n  if (!blob_it.empty()) {\n    prev_x = blob_it.data()->bounding_box().right();\n    blob_it.forward();\n    while (!blob_it.at_first()) {\n      blob = blob_it.data();\n      if (!blob->joined_to_prev()) {\n        blob_box = blob->bounding_box();\n        if (blob_box.left() - prev_x < maxwidth) {\n          gap_stats.add(blob_box.left() - prev_x, 1);\n        }\n        prev_x = blob_box.right();\n      }\n      blob_it.forward();\n    }\n  }\n  if (gap_stats.get_total() == 0) {\n    return false;\n  }\n  cluster_count = 0;\n  lower = row->xheight * words_initial_lower;\n  upper = row->xheight * words_initial_upper;\n  gap_stats.smooth(smooth_factor);\n  do {\n    prev_count = cluster_count;\n    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,\n                                      BLOCK_STATS_CLUSTERS, cluster_stats);\n  } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);\n  if (cluster_count < 1) {\n    return false;\n  }\n  for (gap_index = 0; gap_index < cluster_count; gap_index++) {\n    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);\n  }\n  // get medians\n  if (testing_on) {\n    tprintf(\"cluster_count=%d:\", cluster_count);\n    for (gap_index = 0; gap_index < cluster_count; gap_index++) {\n      tprintf(\" %g(%d)\", gaps[gap_index], cluster_stats[gap_index + 1].get_total());\n    }\n    tprintf(\"\\n\");\n  }\n  qsort(gaps, cluster_count, sizeof(float), sort_floats);\n\n  // Try to find proportional non-space and space for row.\n  lower = row->xheight * words_default_prop_nonspace;\n  upper = row->xheight * textord_words_min_minspace;\n  for (gap_index = 0; gap_index < cluster_count && 
gaps[gap_index] < lower; gap_index++) {\n    ;\n  }\n  if (gap_index == 0) {\n    if (testing_on) {\n      tprintf(\"No clusters below nonspace threshold!!\\n\");\n    }\n    if (cluster_count > 1) {\n      row->pr_nonsp = gaps[0];\n      row->pr_space = gaps[1];\n    } else {\n      row->pr_nonsp = lower;\n      row->pr_space = gaps[0];\n    }\n  } else {\n    row->pr_nonsp = gaps[gap_index - 1];\n    while (gap_index < cluster_count && gaps[gap_index] < upper) {\n      gap_index++;\n    }\n    if (gap_index == cluster_count) {\n      if (testing_on) {\n        tprintf(\"No clusters above nonspace threshold!!\\n\");\n      }\n      row->pr_space = lower * textord_spacesize_ratioprop;\n    } else {\n      row->pr_space = gaps[gap_index];\n    }\n  }\n\n  // Now try to find the fixed pitch space and non-space.\n  upper = row->xheight * words_default_fixed_space;\n  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) {\n    ;\n  }\n  if (gap_index == 0) {\n    if (testing_on) {\n      tprintf(\"No clusters below space threshold!!\\n\");\n    }\n    row->fp_nonsp = upper;\n    row->fp_space = gaps[0];\n  } else {\n    row->fp_nonsp = gaps[gap_index - 1];\n    if (gap_index == cluster_count) {\n      if (testing_on) {\n        tprintf(\"No clusters above space threshold!!\\n\");\n      }\n      row->fp_space = row->xheight;\n    } else {\n      row->fp_space = gaps[gap_index];\n    }\n  }\n  if (testing_on) {\n    tprintf(\n        \"Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, \"\n        \"fp_space=%g\\n\",\n        row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);\n  }\n  return true; // computed some stats\n}\n\n/**********************************************************************\n * find_row_pitch\n *\n * Check to see if this row could be fixed pitch using the given spacings.\n * Blobs with gaps smaller than the lower threshold are assumed to be one.\n * The larger threshold is the word gap threshold.\n 
**********************************************************************/\n\nbool find_row_pitch(     // find lines\n    TO_ROW *row,         // row to do\n    int32_t maxwidth,    // max permitted space\n    int32_t dm_gap,      // ignorable gaps\n    TO_BLOCK *block,     // block of row\n    int32_t block_index, // block_number\n    int32_t row_index,   // number of row\n    bool testing_on      // correct orientation\n) {\n  bool used_dm_model; // looks like dot matrix\n  float min_space;    // estimate threshold\n  float non_space;    // gap size\n  float gap_iqr;      // interquartile range\n  float pitch_iqr;\n  float dm_gap_iqr; // interquartile range\n  float dm_pitch_iqr;\n  float dm_pitch;      // pitch with dm on\n  float pitch;         // revised estimate\n  float initial_pitch; // guess at pitch\n  STATS gap_stats(0, maxwidth - 1);\n  // centre-centre\n  STATS pitch_stats(0, maxwidth - 1);\n\n  row->fixed_pitch = 0.0f;\n  initial_pitch = row->fp_space;\n  if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) {\n    initial_pitch = row->xheight; // keep pitch decent\n  }\n  non_space = row->fp_nonsp;\n  if (non_space > initial_pitch) {\n    non_space = initial_pitch;\n  }\n  min_space = (initial_pitch + non_space) / 2;\n\n  if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false,\n                         dm_gap)) {\n    dm_gap_iqr = 0.0001f;\n    dm_pitch_iqr = maxwidth * 2.0f;\n    dm_pitch = initial_pitch;\n  } else {\n    dm_gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);\n    dm_pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);\n    dm_pitch = pitch_stats.ile(0.5);\n  }\n  gap_stats.clear();\n  pitch_stats.clear();\n  if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, 0)) {\n    gap_iqr = 0.0001f;\n    pitch_iqr = maxwidth * 3.0f;\n  } else {\n    gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);\n    pitch_iqr = pitch_stats.ile(0.75) - 
pitch_stats.ile(0.25);\n    if (testing_on) {\n      tprintf(\n          \"First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, \"\n          \"pitch=%g\\n\",\n          initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));\n    }\n    initial_pitch = pitch_stats.ile(0.5);\n    if (min_space > initial_pitch && count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch,\n                                                       initial_pitch, true, false, 0)) {\n      min_space = initial_pitch;\n      gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);\n      pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);\n      if (testing_on) {\n        tprintf(\n            \"Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, \"\n            \"pitch=%g\\n\",\n            initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));\n      }\n      initial_pitch = pitch_stats.ile(0.5);\n    }\n  }\n  if (textord_debug_pitch_metric) {\n    tprintf(\"Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:\", block_index,\n            row_index, 'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,\n            pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth\n                ? 'D'\n                : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 
'S' : 'M'));\n  }\n  if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {\n    row->pitch_decision = PITCH_DUNNO;\n    if (textord_debug_pitch_metric) {\n      tprintf(\"\\n\");\n    }\n    return false; // insufficient data\n  }\n  if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {\n    if (testing_on) {\n      tprintf(\n          \"Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, \"\n          \"dm_gap_iqr=%g\\n\",\n          pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);\n    }\n    gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);\n    pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);\n    pitch = pitch_stats.ile(0.5);\n    used_dm_model = false;\n  } else {\n    if (testing_on) {\n      tprintf(\n          \"Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, \"\n          \"dm_gap_iqr=%g\\n\",\n          pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);\n    }\n    gap_iqr = dm_gap_iqr;\n    pitch_iqr = dm_pitch_iqr;\n    pitch = dm_pitch;\n    used_dm_model = true;\n  }\n  if (textord_debug_pitch_metric) {\n    tprintf(\"rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:\", pitch_iqr, gap_iqr, pitch);\n    tprintf(\"p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:\", pitch_iqr / gap_iqr, pitch_iqr / block->xheight,\n            pitch_iqr < gap_iqr * textord_fpiqr_ratio &&\n                    pitch_iqr < block->xheight * textord_max_pitch_iqr &&\n                    pitch < block->xheight * textord_words_default_maxspace\n                ? 
'F'\n                : 'P');\n  }\n  if (pitch_iqr < gap_iqr * textord_fpiqr_ratio &&\n      pitch_iqr < block->xheight * textord_max_pitch_iqr &&\n      pitch < block->xheight * textord_words_default_maxspace) {\n    row->pitch_decision = PITCH_MAYBE_FIXED;\n  } else {\n    row->pitch_decision = PITCH_MAYBE_PROP;\n  }\n  row->fixed_pitch = pitch;\n  row->kern_size = gap_stats.ile(0.5);\n  row->min_space = static_cast<int32_t>(row->fixed_pitch + non_space) / 2;\n  if (row->min_space > row->fixed_pitch) {\n    row->min_space = static_cast<int32_t>(row->fixed_pitch);\n  }\n  row->max_nonspace = row->min_space;\n  row->space_size = row->fixed_pitch;\n  row->space_threshold = (row->max_nonspace + row->min_space) / 2;\n  row->used_dm_model = used_dm_model;\n  return true;\n}\n\n/**********************************************************************\n * fixed_pitch_row\n *\n * Check to see if this row could be fixed pitch using the given spacings.\n * Blobs with gaps smaller than the lower threshold are assumed to be one.\n * The larger threshold is the word gap threshold.\n **********************************************************************/\n\nbool fixed_pitch_row(TO_ROW *row, // row to do\n                     BLOCK *block,\n                     int32_t block_index // block_number\n) {\n  const char *res_string; // pitch result\n  int16_t mid_cuts;       // no of cheap cuts\n  float non_space;        // gap size\n  float pitch_sd;         // error on pitch\n  float sp_sd = 0.0f;     // space sd\n\n  non_space = row->fp_nonsp;\n  if (non_space > row->fixed_pitch) {\n    non_space = row->fixed_pitch;\n  }\n  POLY_BLOCK *pb = block != nullptr ? 
block->pdblk.poly_block() : nullptr;\n  if (textord_all_prop || (pb != nullptr && !pb->IsText())) {\n    // Set the decision to definitely proportional.\n    pitch_sd = textord_words_def_prop * row->fixed_pitch;\n    row->pitch_decision = PITCH_DEF_PROP;\n  } else {\n    pitch_sd = tune_row_pitch(row, &row->projection, row->projection_left, row->projection_right,\n                              (row->fixed_pitch + non_space * 3) / 4, row->fixed_pitch, sp_sd,\n                              mid_cuts, &row->char_cells, block_index == textord_debug_block);\n    if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch &&\n        ((pitsync_linear_version & 3) < 3 ||\n         ((pitsync_linear_version & 3) >= 3 &&\n          (row->used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) {\n      if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->all_caps &&\n          ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) {\n        row->pitch_decision = PITCH_DEF_FIXED;\n      } else {\n        row->pitch_decision = PITCH_MAYBE_FIXED;\n      }\n    } else if ((pitsync_linear_version & 3) < 3 || sp_sd > 20 || mid_cuts > 0 ||\n               pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {\n      if (pitch_sd < textord_words_def_prop * row->fixed_pitch) {\n        row->pitch_decision = PITCH_MAYBE_PROP;\n      } else {\n        row->pitch_decision = PITCH_DEF_PROP;\n      }\n    } else {\n      row->pitch_decision = PITCH_DUNNO;\n    }\n  }\n\n  if (textord_debug_pitch_metric) {\n    res_string = \"??\";\n    switch (row->pitch_decision) {\n      case PITCH_DEF_PROP:\n        res_string = \"DP\";\n        break;\n      case PITCH_MAYBE_PROP:\n        res_string = \"MP\";\n        break;\n      case PITCH_DEF_FIXED:\n        res_string = \"DF\";\n        break;\n      case PITCH_MAYBE_FIXED:\n        res_string = \"MF\";\n        break;\n      default:\n        res_string = \"??\";\n    }\n    
tprintf(\":sd/p=%g:occ=%g:init_res=%s\\n\", pitch_sd / row->fixed_pitch, sp_sd, res_string);\n  }\n  return true;\n}\n\n/**********************************************************************\n * count_pitch_stats\n *\n * Count up the gap and pitch stats on the block to see if it is fixed pitch.\n * Blobs with gaps smaller than the lower threshold are assumed to be one.\n * The larger threshold is the word gap threshold.\n * The return value indicates whether there were any decent values to use.\n **********************************************************************/\n\nbool count_pitch_stats(  // find lines\n    TO_ROW *row,         // row to do\n    STATS *gap_stats,    // blob gaps\n    STATS *pitch_stats,  // centre-centre stats\n    float initial_pitch, // guess at pitch\n    float min_space,     // estimate space size\n    bool ignore_outsize, // discard big objects\n    bool split_outsize,  // split big objects\n    int32_t dm_gap       // ignorable gaps\n) {\n  bool prev_valid; // not word broken\n  BLOBNBOX *blob;  // current blob\n                   // blobs\n  BLOBNBOX_IT blob_it = row->blob_list();\n  int32_t prev_right;  // end of prev blob\n  int32_t prev_centre; // centre of previous blob\n  int32_t x_centre;    // centre of this blob\n  int32_t blob_width;  // width of blob\n  int32_t width_units; // no of widths in blob\n  float width;         // blob width\n  TBOX blob_box;       // bounding box\n  TBOX joined_box;     // of super blob\n\n  gap_stats->clear();\n  pitch_stats->clear();\n  if (blob_it.empty()) {\n    return false;\n  }\n  prev_valid = false;\n  prev_centre = 0;\n  prev_right = 0; // stop compiler warning\n  joined_box = blob_it.data()->bounding_box();\n  do {\n    blob_it.forward();\n    blob = blob_it.data();\n    if (!blob->joined_to_prev()) {\n      blob_box = blob->bounding_box();\n      if ((blob_box.left() - joined_box.right() < dm_gap && !blob_it.at_first()) ||\n          blob->cblob() == nullptr) {\n        joined_box += 
blob_box; // merge blobs\n      } else {\n        blob_width = joined_box.width();\n        if (split_outsize) {\n          width_units =\n              static_cast<int32_t>(floor(static_cast<float>(blob_width) / initial_pitch + 0.5));\n          if (width_units < 1) {\n            width_units = 1;\n          }\n          width_units--;\n        } else if (ignore_outsize) {\n          width = static_cast<float>(blob_width) / initial_pitch;\n          width_units =\n              width < 1 + words_default_fixed_limit && width > 1 - words_default_fixed_limit ? 0\n                                                                                             : -1;\n        } else {\n          width_units = 0; // everything in\n        }\n        x_centre = static_cast<int32_t>(joined_box.left() +\n                                        (blob_width - width_units * initial_pitch) / 2);\n        if (prev_valid && width_units >= 0) {\n          //                                              if (width_units>0)\n          //                                              {\n          //                                                      tprintf(\"wu=%d,\n          //                                                      width=%d,\n          //                                                      xc=%d, adding\n          //                                                      %d\\n\",\n          //                                                              width_units,blob_width,x_centre,x_centre-prev_centre);\n          //                                              }\n          gap_stats->add(joined_box.left() - prev_right, 1);\n          pitch_stats->add(x_centre - prev_centre, 1);\n        }\n        prev_centre = static_cast<int32_t>(x_centre + width_units * initial_pitch);\n        prev_right = joined_box.right();\n        prev_valid = blob_box.left() - joined_box.right() < min_space;\n        prev_valid = prev_valid && width_units >= 0;\n        joined_box = 
blob_box;\n      }\n    }\n  } while (!blob_it.at_first());\n  return gap_stats->get_total() >= 3;\n}\n\n/**********************************************************************\n * tune_row_pitch\n *\n * Use a dp algorithm to fit the character cells and return the sd of\n * the cell size over the row.\n **********************************************************************/\n\nfloat tune_row_pitch(           // find fp cells\n    TO_ROW *row,                // row to do\n    STATS *projection,          // vertical projection\n    int16_t projection_left,    // edge of projection\n    int16_t projection_right,   // edge of projection\n    float space_size,           // size of blank\n    float &initial_pitch,       // guess at pitch\n    float &best_sp_sd,          // space sd\n    int16_t &best_mid_cuts,     // no of cheap cuts\n    ICOORDELT_LIST *best_cells, // row cells\n    bool testing_on             // individual words\n) {\n  int pitch_delta;           // offset pitch\n  int16_t mid_cuts;          // cheap cuts\n  float pitch_sd;            // current sd\n  float best_sd;             // best result\n  float best_pitch;          // pitch for best result\n  float initial_sd;          // starting error\n  float sp_sd;               // space sd\n  ICOORDELT_LIST test_cells; // row cells\n  ICOORDELT_IT best_it;      // start of best list\n\n  if (textord_fast_pitch_test) {\n    return tune_row_pitch2(row, projection, projection_left, projection_right, space_size,\n                           initial_pitch, best_sp_sd,\n                           // space sd\n                           best_mid_cuts, best_cells, testing_on);\n  }\n  if (textord_disable_pitch_test) {\n    best_sp_sd = initial_pitch;\n    return initial_pitch;\n  }\n  initial_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,\n                                initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on);\n  best_sd = initial_sd;\n  best_pitch = 
initial_pitch;\n  if (testing_on) {\n    tprintf(\"tune_row_pitch:start pitch=%g, sd=%g\\n\", best_pitch, best_sd);\n  }\n  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {\n    pitch_sd =\n        compute_pitch_sd(row, projection, projection_left, projection_right, space_size,\n                         initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);\n    if (testing_on) {\n      tprintf(\"testing pitch at %g, sd=%g\\n\", initial_pitch + pitch_delta, pitch_sd);\n    }\n    if (pitch_sd < best_sd) {\n      best_sd = pitch_sd;\n      best_mid_cuts = mid_cuts;\n      best_sp_sd = sp_sd;\n      best_pitch = initial_pitch + pitch_delta;\n      best_cells->clear();\n      best_it.set_to_list(best_cells);\n      best_it.add_list_after(&test_cells);\n    } else {\n      test_cells.clear();\n    }\n    if (pitch_sd > initial_sd) {\n      break; // getting worse\n    }\n  }\n  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {\n    pitch_sd =\n        compute_pitch_sd(row, projection, projection_left, projection_right, space_size,\n                         initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);\n    if (testing_on) {\n      tprintf(\"testing pitch at %g, sd=%g\\n\", initial_pitch - pitch_delta, pitch_sd);\n    }\n    if (pitch_sd < best_sd) {\n      best_sd = pitch_sd;\n      best_mid_cuts = mid_cuts;\n      best_sp_sd = sp_sd;\n      best_pitch = initial_pitch - pitch_delta;\n      best_cells->clear();\n      best_it.set_to_list(best_cells);\n      best_it.add_list_after(&test_cells);\n    } else {\n      test_cells.clear();\n    }\n    if (pitch_sd > initial_sd) {\n      break;\n    }\n  }\n  initial_pitch = best_pitch;\n\n  if (textord_debug_pitch_metric) {\n    print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch);\n  }\n\n  return best_sd;\n}\n\n/**********************************************************************\n * 
tune_row_pitch\n *\n * Use a dp algorithm to fit the character cells and return the sd of\n * the cell size over the row.\n **********************************************************************/\n\nfloat tune_row_pitch2(          // find fp cells\n    TO_ROW *row,                // row to do\n    STATS *projection,          // vertical projection\n    int16_t projection_left,    // edge of projection\n    int16_t projection_right,   // edge of projection\n    float space_size,           // size of blank\n    float &initial_pitch,       // guess at pitch\n    float &best_sp_sd,          // space sd\n    int16_t &best_mid_cuts,     // no of cheap cuts\n    ICOORDELT_LIST *best_cells, // row cells\n    bool testing_on             // individual words\n) {\n  int pitch_delta;    // offset pitch\n  int16_t pixel;      // pixel coord\n  int16_t best_pixel; // pixel coord\n  int16_t best_delta; // best pitch\n  int16_t best_pitch; // best pitch\n  int16_t start;      // of good range\n  int16_t end;        // of good range\n  int32_t best_count; // lowest sum\n  float best_sd;      // best result\n\n  best_sp_sd = initial_pitch;\n\n  best_pitch = static_cast<int>(initial_pitch);\n  if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {\n    return initial_pitch;\n  }\n  std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); // summed projection\n\n  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {\n    sum_proj[textord_pitch_range + pitch_delta].set_range(0, best_pitch + pitch_delta);\n  }\n  for (pixel = projection_left; pixel <= projection_right; pixel++) {\n    for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {\n      sum_proj[textord_pitch_range + pitch_delta].add(\n          (pixel - projection_left) % (best_pitch + pitch_delta), projection->pile_count(pixel));\n    }\n  }\n  best_count = sum_proj[textord_pitch_range].pile_count(0);\n  
best_delta = 0;\n  best_pixel = 0;\n  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {\n    for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {\n      if (sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel) < best_count) {\n        best_count = sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel);\n        best_delta = pitch_delta;\n        best_pixel = pixel;\n      }\n    }\n  }\n  if (testing_on) {\n    tprintf(\"tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\\n\", initial_pitch, best_delta,\n            best_count);\n  }\n  best_pitch += best_delta;\n  initial_pitch = best_pitch;\n  best_count++;\n  best_count += best_count;\n  for (start = best_pixel - 2;\n       start > best_pixel - best_pitch &&\n       sum_proj[textord_pitch_range + best_delta].pile_count(start % best_pitch) <= best_count;\n       start--) {\n    ;\n  }\n  for (end = best_pixel + 2;\n       end < best_pixel + best_pitch &&\n       sum_proj[textord_pitch_range + best_delta].pile_count(end % best_pitch) <= best_count;\n       end++) {\n    ;\n  }\n\n  best_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,\n                             initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on,\n                             start, end);\n  if (testing_on) {\n    tprintf(\"tune_row_pitch:output pitch=%g, sd=%g\\n\", initial_pitch, best_sd);\n  }\n\n  if (textord_debug_pitch_metric) {\n    print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch);\n  }\n\n  return best_sd;\n}\n\n/**********************************************************************\n * compute_pitch_sd\n *\n * Use a dp algorithm to fit the character cells and return the sd of\n * the cell size over the row.\n **********************************************************************/\n\nfloat compute_pitch_sd(        // find fp cells\n    TO_ROW *row,               // row to 
do\n    STATS *projection,         // vertical projection\n    int16_t projection_left,   // edge\n    int16_t projection_right,  // edge\n    float space_size,          // size of blank\n    float initial_pitch,       // guess at pitch\n    float &sp_sd,              // space sd\n    int16_t &mid_cuts,         // no of free cuts\n    ICOORDELT_LIST *row_cells, // list of chop pts\n    bool testing_on,           // individual words\n    int16_t start,             // start of good range\n    int16_t end                // end of good range\n) {\n  int16_t occupation; // no of cells in word.\n                      // blobs\n  BLOBNBOX_IT blob_it = row->blob_list();\n  BLOBNBOX_IT start_it;  // start of word\n  BLOBNBOX_IT plot_it;   // for plotting\n  int16_t blob_count;    // no of blobs\n  TBOX blob_box;         // bounding box\n  TBOX prev_box;         // of super blob\n  int32_t prev_right;    // of word sync\n  int scale_factor;      // on scores for big words\n  int32_t sp_count;      // spaces\n  FPSEGPT_LIST seg_list; // char cells\n  FPSEGPT_IT seg_it;     // iterator\n  int16_t segpos;        // position of segment\n  int16_t cellpos;       // previous cell boundary\n                         // iterator\n  ICOORDELT_IT cell_it = row_cells;\n  ICOORDELT *cell;     // new cell\n  double sqsum;        // sum of squares\n  double spsum;        // of spaces\n  double sp_var;       // space error\n  double word_sync;    // result for word\n  int32_t total_count; // total blobs\n\n  if ((pitsync_linear_version & 3) > 1) {\n    word_sync = compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch,\n                                  occupation, mid_cuts, row_cells, testing_on, start, end);\n    sp_sd = occupation;\n    return word_sync;\n  }\n  mid_cuts = 0;\n  cellpos = 0;\n  total_count = 0;\n  sqsum = 0;\n  sp_count = 0;\n  spsum = 0;\n  prev_right = -1;\n  if (blob_it.empty()) {\n    return space_size * 10;\n  }\n#ifndef 
GRAPHICS_DISABLED\n  if (testing_on && to_win != nullptr) {\n    blob_box = blob_it.data()->bounding_box();\n    projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);\n  }\n#endif\n  start_it = blob_it;\n  blob_count = 0;\n  blob_box = box_next(&blob_it); // first blob\n  blob_it.mark_cycle_pt();\n  do {\n    for (; blob_count > 0; blob_count--) {\n      box_next(&start_it);\n    }\n    do {\n      prev_box = blob_box;\n      blob_count++;\n      blob_box = box_next(&blob_it);\n    } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);\n    plot_it = start_it;\n    if (pitsync_linear_version & 3) {\n      word_sync = check_pitch_sync2(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,\n                                    projection, projection_left, projection_right,\n                                    row->xheight * textord_projection_scale, occupation, &seg_list,\n                                    start, end);\n    } else {\n      word_sync = check_pitch_sync(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,\n                                   projection, &seg_list);\n    }\n    if (testing_on) {\n      tprintf(\"Word ending at (%d,%d), len=%d, sync rating=%g, \", prev_box.right(), prev_box.top(),\n              seg_list.length() - 1, word_sync);\n      seg_it.set_to_list(&seg_list);\n      for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n        if (seg_it.data()->faked) {\n          tprintf(\"(F)\");\n        }\n        tprintf(\"%d, \", seg_it.data()->position());\n        //                              tprintf(\"C=%g, s=%g, sq=%g\\n\",\n        //                                      seg_it.data()->cost_function(),\n        //                                      seg_it.data()->sum(),\n        //                                      seg_it.data()->squares());\n      }\n      tprintf(\"\\n\");\n    }\n#ifndef GRAPHICS_DISABLED\n    if 
(textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {\n      plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);\n    }\n#endif\n    seg_it.set_to_list(&seg_list);\n    if (prev_right >= 0) {\n      sp_var = seg_it.data()->position() - prev_right;\n      sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;\n      sp_var *= sp_var;\n      spsum += sp_var;\n      sp_count++;\n    }\n    for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n      segpos = seg_it.data()->position();\n      if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) {\n        // big gap\n        while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) {\n          cell = new ICOORDELT(cellpos + static_cast<int16_t>(initial_pitch), 0);\n          cell_it.add_after_then_move(cell);\n          cellpos += static_cast<int16_t>(initial_pitch);\n        }\n        // make new one\n        cell = new ICOORDELT(segpos, 0);\n        cell_it.add_after_then_move(cell);\n        cellpos = segpos;\n      } else if (segpos > cellpos - initial_pitch / 2) {\n        cell = cell_it.data();\n        // average positions\n        cell->set_x((cellpos + segpos) / 2);\n        cellpos = cell->x();\n      }\n    }\n    seg_it.move_to_last();\n    prev_right = seg_it.data()->position();\n    if (textord_pitch_scalebigwords) {\n      scale_factor = (seg_list.length() - 2) / 2;\n      if (scale_factor < 1) {\n        scale_factor = 1;\n      }\n    } else {\n      scale_factor = 1;\n    }\n    sqsum += word_sync * scale_factor;\n    total_count += (seg_list.length() - 1) * scale_factor;\n    seg_list.clear();\n  } while (!blob_it.cycled_list());\n  sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;\n  return total_count > 0 ? 
sqrt(sqsum / total_count) : space_size * 10;\n}\n\n/**********************************************************************\n * compute_pitch_sd2\n *\n * Use a dp algorithm to fit the character cells and return the sd of\n * the cell size over the row.\n **********************************************************************/\n\nfloat compute_pitch_sd2(       // find fp cells\n    TO_ROW *row,               // row to do\n    STATS *projection,         // vertical projection\n    int16_t projection_left,   // edge\n    int16_t projection_right,  // edge\n    float initial_pitch,       // guess at pitch\n    int16_t &occupation,       // no of occupied cells\n    int16_t &mid_cuts,         // no of free cuts\n    ICOORDELT_LIST *row_cells, // list of chop pts\n    bool testing_on,           // individual words\n    int16_t start,             // start of good range\n    int16_t end                // end of good range\n) {\n  // blobs\n  BLOBNBOX_IT blob_it = row->blob_list();\n  BLOBNBOX_IT plot_it;\n  int16_t blob_count;    // no of blobs\n  TBOX blob_box;         // bounding box\n  FPSEGPT_LIST seg_list; // char cells\n  FPSEGPT_IT seg_it;     // iterator\n  int16_t segpos;        // position of segment\n                         // iterator\n  ICOORDELT_IT cell_it = row_cells;\n  ICOORDELT *cell;  // new cell\n  double word_sync; // result for word\n\n  mid_cuts = 0;\n  if (blob_it.empty()) {\n    occupation = 0;\n    return initial_pitch * 10;\n  }\n#ifndef GRAPHICS_DISABLED\n  if (testing_on && to_win != nullptr) {\n    projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);\n  }\n#endif\n  blob_count = 0;\n  blob_it.mark_cycle_pt();\n  do {\n    // first blob\n    blob_box = box_next(&blob_it);\n    blob_count++;\n  } while (!blob_it.cycled_list());\n  plot_it = blob_it;\n  word_sync = check_pitch_sync2(\n      &blob_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,\n      projection_right, 
row->xheight * textord_projection_scale, occupation, &seg_list, start, end);\n  if (testing_on) {\n    tprintf(\"Row ending at (%d,%d), len=%d, sync rating=%g, \", blob_box.right(), blob_box.top(),\n            seg_list.length() - 1, word_sync);\n    seg_it.set_to_list(&seg_list);\n    for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n      if (seg_it.data()->faked) {\n        tprintf(\"(F)\");\n      }\n      tprintf(\"%d, \", seg_it.data()->position());\n      //                              tprintf(\"C=%g, s=%g, sq=%g\\n\",\n      //                                      seg_it.data()->cost_function(),\n      //                                      seg_it.data()->sum(),\n      //                                      seg_it.data()->squares());\n    }\n    tprintf(\"\\n\");\n  }\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {\n    plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);\n  }\n#endif\n  seg_it.set_to_list(&seg_list);\n  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {\n    segpos = seg_it.data()->position();\n    // make new one\n    cell = new ICOORDELT(segpos, 0);\n    cell_it.add_after_then_move(cell);\n    if (seg_it.at_last()) {\n      mid_cuts = seg_it.data()->cheap_cuts();\n    }\n  }\n  seg_list.clear();\n  return occupation > 0 ? 
sqrt(word_sync / occupation) : initial_pitch * 10;\n}\n\n/**********************************************************************\n * print_pitch_sd\n *\n * Use a dp algorithm to fit the character cells and return the sd of\n * the cell size over the row.\n **********************************************************************/\n\nvoid print_pitch_sd(         // find fp cells\n    TO_ROW *row,             // row to do\n    STATS *projection,       // vertical projection\n    int16_t projection_left, // edges //size of blank\n    int16_t projection_right, float space_size,\n    float initial_pitch // guess at pitch\n) {\n  const char *res2;   // pitch result\n  int16_t occupation; // used cells\n  float sp_sd;        // space sd\n                      // blobs\n  BLOBNBOX_IT blob_it = row->blob_list();\n  BLOBNBOX_IT start_it;     // start of word\n  BLOBNBOX_IT row_start;    // start of row\n  int16_t blob_count;       // no of blobs\n  int16_t total_blob_count; // total blobs in line\n  TBOX blob_box;            // bounding box\n  TBOX prev_box;            // of super blob\n  int32_t prev_right;       // of word sync\n  int scale_factor;         // on scores for big words\n  int32_t sp_count;         // spaces\n  FPSEGPT_LIST seg_list;    // char cells\n  FPSEGPT_IT seg_it;        // iterator\n  double sqsum;             // sum of squares\n  double spsum;             // of spaces\n  double sp_var;            // space error\n  double word_sync;         // result for word\n  double total_count;       // total cuts\n\n  if (blob_it.empty()) {\n    return;\n  }\n  row_start = blob_it;\n  total_blob_count = 0;\n\n  total_count = 0;\n  sqsum = 0;\n  sp_count = 0;\n  spsum = 0;\n  prev_right = -1;\n  blob_it = row_start;\n  start_it = blob_it;\n  blob_count = 0;\n  blob_box = box_next(&blob_it); // first blob\n  blob_it.mark_cycle_pt();\n  do {\n    for (; blob_count > 0; blob_count--) {\n      box_next(&start_it);\n    }\n    do {\n      prev_box = blob_box;\n      
blob_count++;\n      blob_box = box_next(&blob_it);\n    } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);\n    word_sync = check_pitch_sync2(\n        &start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,\n        projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);\n    total_blob_count += blob_count;\n    seg_it.set_to_list(&seg_list);\n    if (prev_right >= 0) {\n      sp_var = seg_it.data()->position() - prev_right;\n      sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;\n      sp_var *= sp_var;\n      spsum += sp_var;\n      sp_count++;\n    }\n    seg_it.move_to_last();\n    prev_right = seg_it.data()->position();\n    if (textord_pitch_scalebigwords) {\n      scale_factor = (seg_list.length() - 2) / 2;\n      if (scale_factor < 1) {\n        scale_factor = 1;\n      }\n    } else {\n      scale_factor = 1;\n    }\n    sqsum += word_sync * scale_factor;\n    total_count += (seg_list.length() - 1) * scale_factor;\n    seg_list.clear();\n  } while (!blob_it.cycled_list());\n  sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;\n  word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;\n  tprintf(\"new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:\", word_sync, word_sync / initial_pitch, sp_sd,\n          word_sync < textord_words_pitchsd_threshold * initial_pitch ? 
'F' : 'P');\n\n  start_it = row_start;\n  blob_it = row_start;\n  word_sync =\n      check_pitch_sync2(&blob_it, total_blob_count, static_cast<int16_t>(initial_pitch), 2,\n                        projection, projection_left, projection_right,\n                        row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);\n  if (occupation > 1) {\n    word_sync /= occupation;\n  }\n  word_sync = sqrt(word_sync);\n\n#ifndef GRAPHICS_DISABLED\n  if (textord_show_row_cuts && to_win != nullptr) {\n    plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);\n  }\n#endif\n  seg_list.clear();\n  if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {\n    if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) {\n      res2 = \"DF\";\n    } else {\n      res2 = \"MF\";\n    }\n  } else {\n    res2 = word_sync < textord_words_def_prop * initial_pitch ? \"MP\" : \"DP\";\n  }\n  tprintf(\n      \"row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, \"\n      \"all_caps=%d\\n\",\n      word_sync, word_sync / initial_pitch,\n      word_sync < textord_words_pitchsd_threshold * initial_pitch ? 
'F' : 'P', occupation, res2,\n      initial_pitch, row->fixed_pitch, row->all_caps);\n}\n\n/**********************************************************************\n * find_repeated_chars\n *\n * Extract marked leader blobs and put them\n * into words in advance of fixed pitch checking and word generation.\n **********************************************************************/\nvoid find_repeated_chars(TO_BLOCK *block,   // Block to search.\n                         bool testing_on) { // Debug mode.\n  POLY_BLOCK *pb = block->block->pdblk.poly_block();\n  if (pb != nullptr && !pb->IsText()) {\n    return; // Don't find repeated chars in non-text blocks.\n  }\n\n  TO_ROW *row;\n  BLOBNBOX_IT box_it;\n  BLOBNBOX_IT search_it; // forward search\n  WERD *word;            // new word\n  TBOX word_box;         // for plotting\n  int blobcount, repeated_set;\n\n  TO_ROW_IT row_it = block->get_rows();\n  if (row_it.empty()) {\n    return; // empty block\n  }\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    box_it.set_to_list(row->blob_list());\n    if (box_it.empty()) {\n      continue; // no blobs in this row\n    }\n    if (!row->rep_chars_marked()) {\n      mark_repeated_chars(row);\n    }\n    if (row->num_repeated_sets() == 0) {\n      continue; // nothing to do for this row\n    }\n    // new words\n    WERD_IT word_it(&row->rep_words);\n    do {\n      if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) {\n        blobcount = 1;\n        repeated_set = box_it.data()->repeated_set();\n        search_it = box_it;\n        search_it.forward();\n        while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) {\n          blobcount++;\n          search_it.forward();\n        }\n        // After the call to make_real_word() all the blobs from this\n        // repeated set will be removed from the blob list. 
box_it will be\n        // set to point to the blob after the end of the extracted sequence.\n        word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);\n        if (!box_it.empty() && box_it.data()->joined_to_prev()) {\n          tprintf(\"Bad box joined to prev at\");\n          box_it.data()->bounding_box().print();\n          tprintf(\"After repeated word:\");\n          word->bounding_box().print();\n        }\n        ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());\n        word->set_flag(W_REP_CHAR, true);\n        word->set_flag(W_DONT_CHOP, true);\n        word_it.add_after_then_move(word);\n      } else {\n        box_it.forward();\n      }\n    } while (!box_it.at_first());\n  }\n}\n\n/**********************************************************************\n * plot_fp_word\n *\n * Plot a block of words as if fixed pitch.\n **********************************************************************/\n\n#ifndef GRAPHICS_DISABLED\nvoid plot_fp_word(   // draw block of words\n    TO_BLOCK *block, // block to draw\n    float pitch,     // pitch to draw with\n    float nonspace   // for space threshold\n) {\n  TO_ROW *row; // current row\n  TO_ROW_IT row_it = block->get_rows();\n\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    row->min_space = static_cast<int32_t>((pitch + nonspace) / 2);\n    row->max_nonspace = row->min_space;\n    row->space_threshold = row->min_space;\n    plot_word_decisions(to_win, static_cast<int16_t>(pitch), row);\n  }\n}\n#endif\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/topitch.h",
    "content": "/**********************************************************************\n * File:        topitch.h  (Formerly to_pitch.h)\n * Description: Code to determine fixed pitchness and the pitch if fixed.\n * Author:      Ray Smith\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TOPITCH_H\n#define TOPITCH_H\n\n#include \"blobbox.h\"\n\nnamespace tesseract {\n\nclass Tesseract;\n\nextern BOOL_VAR_H(textord_debug_pitch_test);\nextern BOOL_VAR_H(textord_debug_pitch_metric);\nextern BOOL_VAR_H(textord_show_row_cuts);\nextern BOOL_VAR_H(textord_show_page_cuts);\nextern BOOL_VAR_H(textord_blockndoc_fixed);\nextern BOOL_VAR_H(textord_fast_pitch_test);\nextern double_VAR_H(textord_projection_scale);\nextern double_VAR_H(textord_balance_factor);\n\nvoid compute_fixed_pitch(ICOORD page_tr,             // top right\n                         TO_BLOCK_LIST *port_blocks, // input list\n                         float gradient,             // page skew\n                         FCOORD rotation,            // for drawing\n                         bool testing_on);           // correct orientation\nvoid fix_row_pitch(                                  // get some value\n    TO_ROW *bad_row,                                 // row to fix\n    TO_BLOCK *bad_block,                             // block of bad_row\n    TO_BLOCK_LIST 
*blocks,                           // blocks to scan\n    int32_t row_target,                              // number of row\n    int32_t block_target                             // number of block\n);\nvoid compute_block_pitch(TO_BLOCK *block,     // input list\n                         FCOORD rotation,     // for drawing\n                         int32_t block_index, // block number\n                         bool testing_on);    // correct orientation\nbool compute_rows_pitch(                      // find line stats\n    TO_BLOCK *block,                          // block to do\n    int32_t block_index,                      // block number\n    bool testing_on                           // correct orientation\n);\nbool try_doc_fixed(             // determine pitch\n    ICOORD page_tr,             // top right\n    TO_BLOCK_LIST *port_blocks, // input list\n    float gradient              // page skew\n);\nbool try_block_fixed(   // find line stats\n    TO_BLOCK *block,    // block to do\n    int32_t block_index // block number\n);\nbool try_rows_fixed(     // find line stats\n    TO_BLOCK *block,     // block to do\n    int32_t block_index, // block number\n    bool testing_on      // correct orientation\n);\nvoid print_block_counts( // find line stats\n    TO_BLOCK *block,     // block to do\n    int32_t block_index  // block number\n);\nvoid count_block_votes( // find line stats\n    TO_BLOCK *block,    // block to do\n    int32_t &def_fixed, // add to counts\n    int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed,\n    int32_t &corr_prop, int32_t &dunno);\nbool row_pitch_stats( // find line stats\n    TO_ROW *row,      // current row\n    int32_t maxwidth, // of spaces\n    bool testing_on   // correct orientation\n);\nbool find_row_pitch(     // find lines\n    TO_ROW *row,         // row to do\n    int32_t maxwidth,    // max permitted space\n    int32_t dm_gap,      // ignorable gaps\n    TO_BLOCK *block,     // block of row\n    
int32_t block_index, // block_number\n    int32_t row_index,   // number of row\n    bool testing_on      // correct orientation\n);\nbool fixed_pitch_row( // find lines\n    TO_ROW *row,      // row to do\n    BLOCK *block,\n    int32_t block_index // block_number\n);\nbool count_pitch_stats(  // find lines\n    TO_ROW *row,         // row to do\n    STATS *gap_stats,    // blob gaps\n    STATS *pitch_stats,  // centre-centre stats\n    float initial_pitch, // guess at pitch\n    float min_space,     // estimate space size\n    bool ignore_outsize, // discard big objects\n    bool split_outsize,  // split big objects\n    int32_t dm_gap       // ignorable gaps\n);\nfloat tune_row_pitch(           // find fp cells\n    TO_ROW *row,                // row to do\n    STATS *projection,          // vertical projection\n    int16_t projection_left,    // edge of projection\n    int16_t projection_right,   // edge of projection\n    float space_size,           // size of blank\n    float &initial_pitch,       // guess at pitch\n    float &best_sp_sd,          // space sd\n    int16_t &best_mid_cuts,     // no of cheap cuts\n    ICOORDELT_LIST *best_cells, // row cells\n    bool testing_on             // individual words\n);\nfloat tune_row_pitch2(          // find fp cells\n    TO_ROW *row,                // row to do\n    STATS *projection,          // vertical projection\n    int16_t projection_left,    // edge of projection\n    int16_t projection_right,   // edge of projection\n    float space_size,           // size of blank\n    float &initial_pitch,       // guess at pitch\n    float &best_sp_sd,          // space sd\n    int16_t &best_mid_cuts,     // no of cheap cuts\n    ICOORDELT_LIST *best_cells, // row cells\n    bool testing_on             // individual words\n);\nfloat compute_pitch_sd(        // find fp cells\n    TO_ROW *row,               // row to do\n    STATS *projection,         // vertical projection\n    int16_t projection_left,   // edge\n    
int16_t projection_right,  // edge\n    float space_size,          // size of blank\n    float initial_pitch,       // guess at pitch\n    float &sp_sd,              // space sd\n    int16_t &mid_cuts,         // no of free cuts\n    ICOORDELT_LIST *row_cells, // list of chop pts\n    bool testing_on,           // individual words\n    int16_t start = 0,         // start of good range\n    int16_t end = 0            // end of good range\n);\nfloat compute_pitch_sd2(       // find fp cells\n    TO_ROW *row,               // row to do\n    STATS *projection,         // vertical projection\n    int16_t projection_left,   // edge\n    int16_t projection_right,  // edge\n    float initial_pitch,       // guess at pitch\n    int16_t &occupation,       // no of occupied cells\n    int16_t &mid_cuts,         // no of free cuts\n    ICOORDELT_LIST *row_cells, // list of chop pts\n    bool testing_on,           // individual words\n    int16_t start = 0,         // start of good range\n    int16_t end = 0            // end of good range\n);\nvoid print_pitch_sd(         // find fp cells\n    TO_ROW *row,             // row to do\n    STATS *projection,       // vertical projection\n    int16_t projection_left, // edges //size of blank\n    int16_t projection_right, float space_size,\n    float initial_pitch // guess at pitch\n);\nvoid find_repeated_chars(TO_BLOCK *block,  // Block to search.\n                         bool testing_on); // Debug mode.\nvoid plot_fp_word(                         // draw block of words\n    TO_BLOCK *block,                       // block to draw\n    float pitch,                           // pitch to draw with\n    float nonspace                         // for space threshold\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/tordmain.cpp",
    "content": "/**********************************************************************\n * File:        tordmain.cpp  (Formerly textordp.c)\n * Description: C++ top level textord code.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#define _USE_MATH_DEFINES // for M_PI\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"tordmain.h\"\n\n#include \"arrayaccess.h\" // for GET_DATA_BYTE\n#include \"blobbox.h\"     // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B...\n#include \"ccstruct.h\"    // for CCStruct, CCStruct::kXHeightFraction\n#include \"clst.h\"        // for CLISTIZE\n#include \"coutln.h\"      // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE\n#include \"drawtord.h\"    // for plot_box_list, to_win, create_to_win\n#include \"edgblob.h\"     // for extract_edges\n#include \"errcode.h\"     // for ASSERT_HOST, ...\n#include \"makerow.h\"     // for textord_test_x, textord_test_y, texto...\n#include \"ocrblock.h\"    // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)\n#include \"ocrrow.h\"      // for ROW, ROW_IT, ROW_LIST, tweak_row_base...\n#include \"params.h\"      // for DoubleParam, BoolParam, IntParam\n#include \"pdblock.h\"     // for PDBLK\n#include \"points.h\"      // for FCOORD, ICOORD\n#include \"polyblk.h\"     // for POLY_BLOCK\n#include \"quadratc.h\"    // for 
QUAD_COEFFS\n#include \"quspline.h\"    // for QSPLINE, tweak_row_baseline\n#include \"rect.h\"        // for TBOX\n#include \"scrollview.h\"  // for ScrollView, ScrollView::WHITE\n#include \"statistc.h\"    // for STATS\n#include \"stepblob.h\"    // for C_BLOB_IT, C_BLOB, C_BLOB_LIST\n#include \"textord.h\"     // for Textord, WordWithBox, WordGrid, WordS...\n#include \"tprintf.h\"     // for tprintf\n#include \"werd.h\"        // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP\n\n#include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate\n\n#include <cfloat>  // for FLT_MAX\n#include <cmath>   // for ceil, floor, M_PI\n#include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t\n#include <memory>\n\nnamespace tesseract {\n\n#define MAX_NEAREST_DIST 600 // for block skew stats\n\n/**********************************************************************\n * SetBlobStrokeWidth\n *\n * Set the horizontal and vertical stroke widths in the blob.\n **********************************************************************/\nvoid SetBlobStrokeWidth(Image pix, BLOBNBOX *blob) {\n  // Cut the blob rectangle into a Pix.\n  int pix_height = pixGetHeight(pix);\n  const TBOX &box = blob->bounding_box();\n  int width = box.width();\n  int height = box.height();\n  Box *blob_pix_box = boxCreate(box.left(), pix_height - box.top(), width, height);\n  Image pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr);\n  boxDestroy(&blob_pix_box);\n  Image dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);\n  pix_blob.destroy();\n  // Compute the stroke widths.\n  uint32_t *data = pixGetData(dist_pix);\n  int wpl = pixGetWpl(dist_pix);\n  // Horizontal width of stroke.\n  STATS h_stats(0, width);\n  for (int y = 0; y < height; ++y) {\n    uint32_t *pixels = data + y * wpl;\n    int prev_pixel = 0;\n    int pixel = GET_DATA_BYTE(pixels, 0);\n    for (int x = 1; x < width; ++x) {\n      int next_pixel = GET_DATA_BYTE(pixels, x);\n      // We are looking for a pixel that is 
equal to its vertical neighbours,\n      // yet greater than its left neighbour.\n      if (prev_pixel < pixel && (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&\n          (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {\n        if (pixel > next_pixel) {\n          // Single local max, so an odd width.\n          h_stats.add(pixel * 2 - 1, 1);\n        } else if (pixel == next_pixel && x + 1 < width && pixel > GET_DATA_BYTE(pixels, x + 1)) {\n          // Double local max, so an even width.\n          h_stats.add(pixel * 2, 1);\n        }\n      }\n      prev_pixel = pixel;\n      pixel = next_pixel;\n    }\n  }\n  // Vertical width of stroke.\n  STATS v_stats(0, height);\n  for (int x = 0; x < width; ++x) {\n    int prev_pixel = 0;\n    int pixel = GET_DATA_BYTE(data, x);\n    for (int y = 1; y < height; ++y) {\n      uint32_t *pixels = data + y * wpl;\n      int next_pixel = GET_DATA_BYTE(pixels, x);\n      // We are looking for a pixel that is equal to its horizontal neighbours,\n      // yet greater than its upper neighbour.\n      if (prev_pixel < pixel && (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&\n          (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {\n        if (pixel > next_pixel) {\n          // Single local max, so an odd width.\n          v_stats.add(pixel * 2 - 1, 1);\n        } else if (pixel == next_pixel && y + 1 < height &&\n                   pixel > GET_DATA_BYTE(pixels + wpl, x)) {\n          // Double local max, so an even width.\n          v_stats.add(pixel * 2, 1);\n        }\n      }\n      prev_pixel = pixel;\n      pixel = next_pixel;\n    }\n  }\n  dist_pix.destroy();\n  // Store the horizontal and vertical width in the blob, keeping both\n  // widths if there is enough information, otherwise only the one with\n  // the most samples.\n  // If there are insufficient samples, store zero, rather than using\n  // 2*area/perimeter, as the numbers that gives do not match the 
numbers\n  // from the distance method.\n  if (h_stats.get_total() >= (width + height) / 4) {\n    blob->set_horz_stroke_width(h_stats.ile(0.5f));\n    if (v_stats.get_total() >= (width + height) / 4) {\n      blob->set_vert_stroke_width(v_stats.ile(0.5f));\n    } else {\n      blob->set_vert_stroke_width(0.0f);\n    }\n  } else {\n    if (v_stats.get_total() >= (width + height) / 4 || v_stats.get_total() > h_stats.get_total()) {\n      blob->set_horz_stroke_width(0.0f);\n      blob->set_vert_stroke_width(v_stats.ile(0.5f));\n    } else {\n      blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) : 0.0f);\n      blob->set_vert_stroke_width(0.0f);\n    }\n  }\n}\n\n/**********************************************************************\n * assign_blobs_to_blocks2\n *\n * Make a list of TO_BLOCKs for portrait and landscape orientation.\n **********************************************************************/\n\nvoid assign_blobs_to_blocks2(Image pix,\n                             BLOCK_LIST *blocks,           // blocks to process\n                             TO_BLOCK_LIST *port_blocks) { // output list\n  BLOCK_IT block_it = blocks;\n  C_BLOB_IT blob_it;       // iterator\n  BLOBNBOX_IT port_box_it; // iterator\n                           // destination iterator\n  TO_BLOCK_IT port_block_it = port_blocks;\n\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    auto block = block_it.data();\n    auto port_block = new TO_BLOCK(block);\n\n    // Convert the good outlines to block->blob_list\n    port_box_it.set_to_list(&port_block->blobs);\n    blob_it.set_to_list(block->blob_list());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      auto blob = blob_it.extract();\n      auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.\n      newblob->set_owns_cblob(true);\n      SetBlobStrokeWidth(pix, newblob);\n      port_box_it.add_after_then_move(newblob);\n    }\n\n    // Put 
the rejected outlines in block->noise_blobs, which allows them to\n    // be reconsidered and sorted back into rows and recover outlines mistakenly\n    // rejected.\n    port_box_it.set_to_list(&port_block->noise_blobs);\n    blob_it.set_to_list(block->reject_blobs());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      auto blob = blob_it.extract();\n      auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.\n      newblob->set_owns_cblob(true);\n      SetBlobStrokeWidth(pix, newblob);\n      port_box_it.add_after_then_move(newblob);\n    }\n\n    port_block_it.add_after_then_move(port_block);\n  }\n}\n\n/**********************************************************************\n * find_components\n *\n * Find the C_OUTLINEs of the connected components in each block, put them\n * in C_BLOBs, and filter them by size, putting the different size\n * grades on different lists in the matching TO_BLOCK in to_blocks.\n **********************************************************************/\n\nvoid Textord::find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {\n  int width = pixGetWidth(pix);\n  int height = pixGetHeight(pix);\n  if (width > INT16_MAX || height > INT16_MAX) {\n    tprintf(\"Input image too large! 
(%d, %d)\\n\", width, height);\n    return; // Can't handle it.\n  }\n\n  BLOCK_IT block_it(blocks); // iterator\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    BLOCK *block = block_it.data();\n    if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) {\n      extract_edges(pix, block);\n    }\n  }\n\n  assign_blobs_to_blocks2(pix, blocks, to_blocks);\n  ICOORD page_tr(width, height);\n  filter_blobs(page_tr, to_blocks, !textord_test_landscape);\n}\n\n/**********************************************************************\n * filter_blobs\n *\n * Sort the blobs into sizes in all the blocks for later work.\n **********************************************************************/\n\nvoid Textord::filter_blobs(ICOORD page_tr,        // top right\n                           TO_BLOCK_LIST *blocks, // output list\n                           bool testing_on) {     // for plotting\n  TO_BLOCK_IT block_it = blocks;                  // destination iterator\n  TO_BLOCK *block;                                // created block\n\n#ifndef GRAPHICS_DISABLED\n  if (to_win != nullptr) {\n    to_win->Clear();\n  }\n#endif // !GRAPHICS_DISABLED\n\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block = block_it.data();\n    block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs,\n                                          &block->large_blobs);\n    if (block->line_size == 0) {\n      block->line_size = 1;\n    }\n    block->line_spacing =\n        block->line_size *\n        (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction +\n         2 * tesseract::CCStruct::kAscenderFraction) /\n        tesseract::CCStruct::kXHeightFraction;\n    block->line_size *= textord_min_linesize;\n    block->max_blob_size = block->line_size * textord_excess_blobsize;\n\n#ifndef GRAPHICS_DISABLED\n    if (textord_show_blobs && testing_on) 
{\n      if (to_win == nullptr) {\n        create_to_win(page_tr);\n      }\n      block->plot_graded_blobs(to_win);\n    }\n    if (textord_show_boxes && testing_on) {\n      if (to_win == nullptr) {\n        create_to_win(page_tr);\n      }\n      plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE);\n      plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE);\n      plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE);\n      plot_box_list(to_win, &block->blobs, ScrollView::WHITE);\n    }\n#endif // !GRAPHICS_DISABLED\n  }\n}\n\n/**********************************************************************\n * filter_noise_blobs\n *\n * Move small blobs to a separate list.\n **********************************************************************/\n\nfloat Textord::filter_noise_blobs(BLOBNBOX_LIST *src_list,     // original list\n                                  BLOBNBOX_LIST *noise_list,   // noise list\n                                  BLOBNBOX_LIST *small_list,   // small blobs\n                                  BLOBNBOX_LIST *large_list) { // large blobs\n  int16_t height;                                              // height of blob\n  int16_t width;                                               // of blob\n  BLOBNBOX *blob;                                              // current blob\n  float initial_x;                                             // first guess\n  BLOBNBOX_IT src_it = src_list;                               // iterators\n  BLOBNBOX_IT noise_it = noise_list;\n  BLOBNBOX_IT small_it = small_list;\n  BLOBNBOX_IT large_it = large_list;\n  STATS size_stats(0, MAX_NEAREST_DIST - 1);\n  // blob heights\n  float min_y; // size limits\n  float max_y;\n  float max_x;\n  float max_height; // of good blobs\n\n  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {\n    blob = src_it.data();\n    if (blob->bounding_box().height() < textord_max_noise_size) {\n      noise_it.add_after_then_move(src_it.extract());\n  
  } else if (blob->enclosed_area() >= blob->bounding_box().height() *\n                                            blob->bounding_box().width() *\n                                            textord_noise_area_ratio) {\n      small_it.add_after_then_move(src_it.extract());\n    }\n  }\n  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {\n    size_stats.add(src_it.data()->bounding_box().height(), 1);\n  }\n  initial_x = size_stats.ile(textord_initialx_ile);\n  max_y = ceil(initial_x *\n               (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction +\n                2 * tesseract::CCStruct::kAscenderFraction) /\n               tesseract::CCStruct::kXHeightFraction);\n  min_y = std::floor(initial_x / 2);\n  max_x = ceil(initial_x * textord_width_limit);\n  small_it.move_to_first();\n  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {\n    height = small_it.data()->bounding_box().height();\n    if (height > max_y) {\n      large_it.add_after_then_move(small_it.extract());\n    } else if (height >= min_y) {\n      src_it.add_after_then_move(small_it.extract());\n    }\n  }\n  size_stats.clear();\n  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {\n    height = src_it.data()->bounding_box().height();\n    width = src_it.data()->bounding_box().width();\n    if (height < min_y) {\n      small_it.add_after_then_move(src_it.extract());\n    } else if (height > max_y || width > max_x) {\n      large_it.add_after_then_move(src_it.extract());\n    } else {\n      size_stats.add(height, 1);\n    }\n  }\n  max_height = size_stats.ile(textord_initialasc_ile);\n  //      tprintf(\"max_y=%g, min_y=%g, initial_x=%g, max_height=%g,\",\n  //              max_y,min_y,initial_x,max_height);\n  max_height *= tesseract::CCStruct::kXHeightCapRatio;\n  if (max_height > initial_x) {\n    initial_x = max_height;\n  }\n  //      tprintf(\" ret=%g\\n\",initial_x);\n  return 
initial_x;\n}\n\n// Fixes the block so it obeys all the rules:\n// Must have at least one ROW.\n// Must have at least one WERD.\n// WERDs contain a fake blob.\nvoid Textord::cleanup_nontext_block(BLOCK *block) {\n  // Non-text blocks must contain at least one row.\n  ROW_IT row_it(block->row_list());\n  if (row_it.empty()) {\n    const TBOX &box = block->pdblk.bounding_box();\n    float height = box.height();\n    int32_t xstarts[2] = {box.left(), box.right()};\n    double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};\n    ROW *row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1);\n    row_it.add_after_then_move(row);\n  }\n  // Each row must contain at least one word.\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    ROW *row = row_it.data();\n    WERD_IT w_it(row->word_list());\n    if (w_it.empty()) {\n      // Make a fake blob to put in the word.\n      TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box();\n      C_BLOB *blob = C_BLOB::FakeBlob(box);\n      C_BLOB_LIST blobs;\n      C_BLOB_IT blob_it(&blobs);\n      blob_it.add_after_then_move(blob);\n      WERD *word = new WERD(&blobs, 0, nullptr);\n      w_it.add_after_then_move(word);\n    }\n    // Each word must contain a fake blob.\n    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {\n      WERD *word = w_it.data();\n      // Just assert that this is true, as it would be useful to find\n      // out why it isn't.\n      ASSERT_HOST(!word->cblob_list()->empty());\n    }\n    row->recalc_bounding_box();\n  }\n}\n\n/**********************************************************************\n * cleanup_blocks\n *\n * Delete empty blocks, rows from the page.\n **********************************************************************/\n\nvoid Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {\n  BLOCK_IT block_it = blocks; // iterator\n  ROW_IT row_it;              // row 
iterator\n\n  int num_rows = 0;\n  int num_rows_all = 0;\n  int num_blocks = 0;\n  int num_blocks_all = 0;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    BLOCK *block = block_it.data();\n    if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {\n      cleanup_nontext_block(block);\n      continue;\n    }\n    num_rows = 0;\n    num_rows_all = 0;\n    if (clean_noise) {\n      row_it.set_to_list(block->row_list());\n      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n        ROW *row = row_it.data();\n        ++num_rows_all;\n        clean_small_noise_from_words(row);\n        if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) ||\n            row->word_list()->empty()) {\n          delete row_it.extract(); // lose empty row.\n        } else {\n          if (textord_noise_rejwords) {\n            clean_noise_from_words(row_it.data());\n          }\n          if (textord_blshift_maxshift >= 0) {\n            tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction);\n          }\n          ++num_rows;\n        }\n      }\n    }\n    if (block->row_list()->empty()) {\n      delete block_it.extract(); // Lose empty text blocks.\n    } else {\n      ++num_blocks;\n    }\n    ++num_blocks_all;\n    if (textord_noise_debug) {\n      tprintf(\"cleanup_blocks: # rows = %d / %d\\n\", num_rows, num_rows_all);\n    }\n  }\n  if (textord_noise_debug) {\n    tprintf(\"cleanup_blocks: # blocks = %d / %d\\n\", num_blocks, num_blocks_all);\n  }\n}\n\n/**********************************************************************\n * clean_noise_from_row\n *\n * Move blobs of words from rows of garbage into the reject blobs list.\n **********************************************************************/\n\nbool Textord::clean_noise_from_row( // remove empties\n    ROW *row                        // row to clean\n) {\n  bool testing_on;\n  TBOX 
blob_box;            // bounding box\n  C_BLOB *blob;             // current blob\n  C_OUTLINE *outline;       // current outline\n  WERD *word;               // current word\n  int32_t blob_size;        // biggest size\n  int32_t trans_count = 0;  // no of transitions\n  int32_t trans_threshold;  // noise tolerance\n  int32_t dot_count;        // small objects\n  int32_t norm_count;       // normal objects\n  int32_t super_norm_count; // real char-like\n                            // words of row\n  WERD_IT word_it = row->word_list();\n  C_BLOB_IT blob_it;   // blob iterator\n  C_OUTLINE_IT out_it; // outline iterator\n\n  testing_on = textord_test_y > row->base_line(textord_test_x) && textord_show_blobs &&\n               textord_test_y < row->base_line(textord_test_x) + row->x_height();\n  dot_count = 0;\n  norm_count = 0;\n  super_norm_count = 0;\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word = word_it.data(); // current word\n                           // blobs in word\n    blob_it.set_to_list(word->cblob_list());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      blob = blob_it.data();\n      if (!word->flag(W_DONT_CHOP)) {\n        // get outlines\n        out_it.set_to_list(blob->out_list());\n        for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n          outline = out_it.data();\n          blob_box = outline->bounding_box();\n          blob_size = blob_box.width() > blob_box.height() ? 
blob_box.width() : blob_box.height();\n          if (blob_size < textord_noise_sizelimit * row->x_height()) {\n            dot_count++; // count small outlines\n          }\n          if (!outline->child()->empty() &&\n              blob_box.height() < (1 + textord_noise_syfract) * row->x_height() &&\n              blob_box.height() > (1 - textord_noise_syfract) * row->x_height() &&\n              blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() &&\n              blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) {\n            super_norm_count++; // count small outlines\n          }\n        }\n      } else {\n        super_norm_count++;\n      }\n      blob_box = blob->bounding_box();\n      blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();\n      if (blob_size >= textord_noise_sizelimit * row->x_height() &&\n          blob_size < row->x_height() * 2) {\n        trans_threshold = blob_size / textord_noise_sizefraction;\n        trans_count = blob->count_transitions(trans_threshold);\n        if (trans_count < textord_noise_translimit) {\n          norm_count++;\n        }\n      } else if (blob_box.height() > row->x_height() * 2 &&\n                 (!word_it.at_first() || !blob_it.at_first())) {\n        dot_count += 2;\n      }\n      if (testing_on) {\n        tprintf(\"Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\\n\", blob_box.left(),\n                blob_box.bottom(), blob_box.right(), blob_box.top(), blob->out_list()->length(),\n                trans_count, blob_box.bottom() - row->base_line(blob_box.left()));\n      }\n    }\n  }\n  // TODO: check whether `&& super_norm_count < textord_noise_sncount`should always be added here.\n  bool rejected = dot_count > norm_count * textord_noise_normratio &&\n                  dot_count > 2;\n  if (textord_noise_debug) {\n    tprintf(\"Row ending at (%d,%g):\", blob_box.right(), row->base_line(blob_box.right()));\n    tprintf(\" R=%g, 
dc=%d, nc=%d, %s\\n\",\n            norm_count > 0 ? static_cast<float>(dot_count) / norm_count : 9999, dot_count,\n            norm_count,\n            rejected? \"REJECTED\": \"ACCEPTED\");\n  }\n  return super_norm_count < textord_noise_sncount && rejected;\n}\n\n/**********************************************************************\n * clean_noise_from_words\n *\n * Move blobs of words from rows of garbage into the reject blobs list.\n **********************************************************************/\n\nvoid Textord::clean_noise_from_words( // remove empties\n    ROW *row                          // row to clean\n) {\n  TBOX blob_box;           // bounding box\n  C_BLOB *blob;            // current blob\n  C_OUTLINE *outline;      // current outline\n  WERD *word;              // current word\n  int32_t blob_size;       // biggest size\n  int32_t trans_count;     // no of transitions\n  int32_t trans_threshold; // noise tolerance\n  int32_t dot_count;       // small objects\n  int32_t norm_count;      // normal objects\n  int32_t dud_words;       // number discarded\n  int32_t ok_words;        // number remaining\n  int32_t word_index;      // current word\n                           // words of row\n  WERD_IT word_it = row->word_list();\n  C_BLOB_IT blob_it;   // blob iterator\n  C_OUTLINE_IT out_it; // outline iterator\n\n  ok_words = word_it.length();\n  if (ok_words == 0 || textord_no_rejects) {\n    return;\n  }\n  // was it chucked\n  std::vector<int8_t> word_dud(ok_words);\n  dud_words = 0;\n  ok_words = 0;\n  word_index = 0;\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word = word_it.data(); // current word\n    dot_count = 0;\n    norm_count = 0;\n    // blobs in word\n    blob_it.set_to_list(word->cblob_list());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      blob = blob_it.data();\n      if (!word->flag(W_DONT_CHOP)) {\n        // get outlines\n        
out_it.set_to_list(blob->out_list());\n        for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n          outline = out_it.data();\n          blob_box = outline->bounding_box();\n          blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();\n          if (blob_size < textord_noise_sizelimit * row->x_height()) {\n            dot_count++; // count small outlines\n          }\n          if (!outline->child()->empty() &&\n              blob_box.height() < (1 + textord_noise_syfract) * row->x_height() &&\n              blob_box.height() > (1 - textord_noise_syfract) * row->x_height() &&\n              blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() &&\n              blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) {\n            norm_count++; // count small outlines\n          }\n        }\n      } else {\n        norm_count++;\n      }\n      blob_box = blob->bounding_box();\n      blob_size = blob_box.width() > blob_box.height() ? 
blob_box.width() : blob_box.height();\n      if (blob_size >= textord_noise_sizelimit * row->x_height() &&\n          blob_size < row->x_height() * 2) {\n        trans_threshold = blob_size / textord_noise_sizefraction;\n        trans_count = blob->count_transitions(trans_threshold);\n        if (trans_count < textord_noise_translimit) {\n          norm_count++;\n        }\n      } else if (blob_box.height() > row->x_height() * 2 &&\n                 (!word_it.at_first() || !blob_it.at_first())) {\n        dot_count += 2;\n      }\n    }\n    if (dot_count > 2 && !word->flag(W_REP_CHAR)) {\n      if (dot_count > norm_count * textord_noise_normratio * 2) {\n        word_dud[word_index] = 2;\n      } else if (dot_count > norm_count * textord_noise_normratio) {\n        word_dud[word_index] = 1;\n      } else {\n        word_dud[word_index] = 0;\n      }\n    } else {\n      word_dud[word_index] = 0;\n    }\n    if (word_dud[word_index] == 2) {\n      dud_words++;\n    } else {\n      ok_words++;\n    }\n    word_index++;\n  }\n\n  word_index = 0;\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) {\n      word = word_it.data(); // Current word.\n      // Previously we threw away the entire word.\n      // Now just aggressively throw all small blobs into the reject list, where\n      // the classifier can decide whether they are actually needed.\n      word->CleanNoise(textord_noise_sizelimit * row->x_height());\n    }\n    word_index++;\n  }\n}\n\n// Remove outlines that are a tiny fraction in either width or height\n// of the word height.\nvoid Textord::clean_small_noise_from_words(ROW *row) {\n  WERD_IT word_it(row->word_list());\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    WERD *word = word_it.data();\n    int min_size = static_cast<int>(textord_noise_hfract * word->bounding_box().height() + 0.5);\n    
C_BLOB_IT blob_it(word->cblob_list());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      C_BLOB *blob = blob_it.data();\n      C_OUTLINE_IT out_it(blob->out_list());\n      for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n        C_OUTLINE *outline = out_it.data();\n        outline->RemoveSmallRecursive(min_size, &out_it);\n      }\n      if (blob->out_list()->empty()) {\n        delete blob_it.extract();\n      }\n    }\n    if (word->cblob_list()->empty()) {\n      if (!word_it.at_last()) {\n        // The next word is no longer a fuzzy non space if it was before,\n        // since the word before is about to be deleted.\n        WERD *next_word = word_it.data_relative(1);\n        if (next_word->flag(W_FUZZY_NON)) {\n          next_word->set_flag(W_FUZZY_NON, false);\n        }\n      }\n      delete word_it.extract();\n    }\n  }\n}\n\n// Local struct to hold a group of blocks.\nstruct BlockGroup {\n  BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}\n  explicit BlockGroup(BLOCK *block)\n      : bounding_box(block->pdblk.bounding_box())\n      , rotation(block->re_rotation())\n      , angle(block->re_rotation().angle())\n      , min_xheight(block->x_height()) {\n    blocks.push_back(block);\n  }\n  // Union of block bounding boxes.\n  TBOX bounding_box;\n  // Common rotation of the blocks.\n  FCOORD rotation;\n  // Angle of rotation.\n  float angle;\n  // Min xheight of the blocks.\n  float min_xheight;\n  // Collection of borrowed pointers to the blocks in the group.\n  std::vector<BLOCK *> blocks;\n};\n\n// Groups blocks by rotation, then, for each group, makes a WordGrid and calls\n// TransferDiacriticsToWords to copy the diacritic blobs to the most\n// appropriate words in the group of blocks. 
Source blobs are not touched.\nvoid Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks) {\n  // Angle difference larger than this is too much to consider equal.\n  // They should only be in multiples of M_PI/2 anyway.\n  const double kMaxAngleDiff = 0.01; // About 0.6 degrees.\n  std::vector<std::unique_ptr<BlockGroup>> groups;\n  BLOCK_IT bk_it(blocks);\n  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {\n    BLOCK *block = bk_it.data();\n    if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {\n      continue;\n    }\n    // Linear search of the groups to find a matching rotation.\n    float block_angle = block->re_rotation().angle();\n    int best_g = 0;\n    float best_angle_diff = FLT_MAX;\n    for (const auto &group : groups) {\n      double angle_diff = std::fabs(block_angle - group->angle);\n      if (angle_diff > M_PI) {\n        angle_diff = fabs(angle_diff - 2.0 * M_PI);\n      }\n      if (angle_diff < best_angle_diff) {\n        best_angle_diff = angle_diff;\n        best_g = &group - &groups[0];\n      }\n    }\n    if (best_angle_diff > kMaxAngleDiff) {\n      groups.push_back(std::make_unique<BlockGroup>(block));\n    } else {\n      groups[best_g]->blocks.push_back(block);\n      groups[best_g]->bounding_box += block->pdblk.bounding_box();\n      float x_height = block->x_height();\n      if (x_height < groups[best_g]->min_xheight) {\n        groups[best_g]->min_xheight = x_height;\n      }\n    }\n  }\n  // Now process each group of blocks.\n  std::vector<std::unique_ptr<WordWithBox>> word_ptrs;\n  for (const auto &group : groups) {\n    if (group->bounding_box.null_box()) {\n      continue;\n    }\n    WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),\n                       group->bounding_box.topright());\n    for (auto b : group->blocks) {\n      ROW_IT row_it(b->row_list());\n      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); 
row_it.forward()) {\n        ROW *row = row_it.data();\n        // Put the words of the row into the grid.\n        WERD_IT w_it(row->word_list());\n        for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {\n          WERD *word = w_it.data();\n          auto box_word = std::make_unique<WordWithBox>(word);\n          word_grid.InsertBBox(true, true, box_word.get());\n          // Save the pointer where it will be auto-deleted.\n          word_ptrs.emplace_back(std::move(box_word));\n        }\n      }\n    }\n    FCOORD rotation = group->rotation;\n    // Make it a forward rotation that will transform blob coords to block.\n    rotation.set_y(-rotation.y());\n    TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);\n  }\n}\n\n// Places a copy of blobs that are near a word (after applying rotation to the\n// blob) in the most appropriate word, unless there is doubt, in which case a\n// blob can end up in two words. Source blobs are not touched.\nvoid Textord::TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation,\n                                        WordGrid *word_grid) {\n  WordSearch ws(word_grid);\n  BLOBNBOX_IT b_it(diacritic_blobs);\n  // Apply rotation to each blob before finding the nearest words. The rotation\n  // allows us to only consider above/below placement and not left/right on\n  // vertical text, because all text is horizontal here.\n  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n    BLOBNBOX *blobnbox = b_it.data();\n    TBOX blob_box = blobnbox->bounding_box();\n    blob_box.rotate(rotation);\n    ws.StartRectSearch(blob_box);\n    // Above/below refer to word position relative to diacritic. 
Since some\n    // scripts eg Kannada/Telugu habitually put diacritics below words, and\n    // others eg Thai/Vietnamese/Latin put most diacritics above words, try\n    // for both if there isn't much in it.\n    WordWithBox *best_above_word = nullptr;\n    WordWithBox *best_below_word = nullptr;\n    int best_above_distance = 0;\n    int best_below_distance = 0;\n    for (WordWithBox *word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) {\n      if (word->word()->flag(W_REP_CHAR)) {\n        continue;\n      }\n      TBOX word_box = word->true_bounding_box();\n      int x_distance = blob_box.x_gap(word_box);\n      int y_distance = blob_box.y_gap(word_box);\n      if (x_distance > 0) {\n        // Arbitrarily divide x-distance by 2 if there is a major y overlap,\n        // and the word is to the left of the diacritic. If the\n        // diacritic is a dropped broken character between two words, this will\n        // help send all the pieces to a single word, instead of splitting them\n        // over the 2 words.\n        if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) {\n          x_distance /= 2;\n        }\n        y_distance += x_distance;\n      }\n      if (word_box.y_middle() > blob_box.y_middle() &&\n          (best_above_word == nullptr || y_distance < best_above_distance)) {\n        best_above_word = word;\n        best_above_distance = y_distance;\n      }\n      if (word_box.y_middle() <= blob_box.y_middle() &&\n          (best_below_word == nullptr || y_distance < best_below_distance)) {\n        best_below_word = word;\n        best_below_distance = y_distance;\n      }\n    }\n    bool above_good = best_above_word != nullptr &&\n                      (best_below_word == nullptr ||\n                       best_above_distance < best_below_distance + blob_box.height());\n    bool below_good = best_below_word != nullptr && best_below_word != best_above_word &&\n                      (best_above_word 
== nullptr ||\n                       best_below_distance < best_above_distance + blob_box.height());\n    if (below_good) {\n      C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob());\n      copied_blob->rotate(rotation);\n      // Put the blob into the word's reject blobs list.\n      C_BLOB_IT blob_it(best_below_word->RejBlobs());\n      blob_it.add_to_end(copied_blob);\n    }\n    if (above_good) {\n      C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob());\n      copied_blob->rotate(rotation);\n      // Put the blob into the word's reject blobs list.\n      C_BLOB_IT blob_it(best_above_word->RejBlobs());\n      blob_it.add_to_end(copied_blob);\n    }\n  }\n}\n\n/**********************************************************************\n * tweak_row_baseline\n *\n * Shift baseline to fit the blobs more accurately where they are\n * close enough.\n **********************************************************************/\n\nvoid tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction) {\n  TBOX blob_box;      // bounding box\n  C_BLOB *blob;       // current blob\n  WERD *word;         // current word\n  int32_t blob_count; // no of blobs\n  int32_t src_index;  // source segment\n  int32_t dest_index; // destination segment\n  float ydiff;        // baseline error\n  float x_centre;     // centre of blob\n                      // words of row\n  WERD_IT word_it = row->word_list();\n  C_BLOB_IT blob_it; // blob iterator\n\n  blob_count = 0;\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word = word_it.data(); // current word\n                           // get total blobs\n    blob_count += word->cblob_list()->length();\n  }\n  if (blob_count == 0) {\n    return;\n  }\n  // spline segments\n  std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1);\n  // spline coeffs\n  std::vector<double> coeffs((blob_count + row->baseline.segments) * 3);\n\n  src_index = 0;\n  dest_index = 0;\n  
xstarts[0] = row->baseline.xcoords[0];\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word = word_it.data(); // current word\n                           // blobs in word\n    blob_it.set_to_list(word->cblob_list());\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      blob = blob_it.data();\n      blob_box = blob->bounding_box();\n      x_centre = (blob_box.left() + blob_box.right()) / 2.0;\n      ydiff = blob_box.bottom() - row->base_line(x_centre);\n      if (ydiff < 0) {\n        ydiff = -ydiff / row->x_height();\n      } else {\n        ydiff = ydiff / row->x_height();\n      }\n      if (ydiff < blshift_maxshift && blob_box.height() / row->x_height() > blshift_xfraction) {\n        if (xstarts[dest_index] >= x_centre) {\n          xstarts[dest_index] = blob_box.left();\n        }\n        coeffs[dest_index * 3] = 0;\n        coeffs[dest_index * 3 + 1] = 0;\n        coeffs[dest_index * 3 + 2] = blob_box.bottom();\n        // shift it\n        dest_index++;\n        xstarts[dest_index] = blob_box.right() + 1;\n      } else {\n        if (xstarts[dest_index] <= x_centre) {\n          while (row->baseline.xcoords[src_index + 1] <= x_centre &&\n                 src_index < row->baseline.segments - 1) {\n            if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) {\n              coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;\n              coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;\n              coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;\n              dest_index++;\n              xstarts[dest_index] = row->baseline.xcoords[src_index + 1];\n            }\n            src_index++;\n          }\n          coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;\n          coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;\n          coeffs[dest_index * 3 + 2] = 
row->baseline.quadratics[src_index].c;\n          dest_index++;\n          xstarts[dest_index] = row->baseline.xcoords[src_index + 1];\n        }\n      }\n    }\n  }\n  while (src_index < row->baseline.segments &&\n         row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) {\n    src_index++;\n  }\n  while (src_index < row->baseline.segments) {\n    coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;\n    coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;\n    coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;\n    dest_index++;\n    src_index++;\n    xstarts[dest_index] = row->baseline.xcoords[src_index];\n  }\n  // turn to spline\n  row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/tordmain.h",
    "content": "/**********************************************************************\n * File:        tordmain.h  (Formerly textordp.h)\n * Description: C++ top level textord code.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TORDMAIN_H\n#define TORDMAIN_H\n\n#include \"blobbox.h\"\n#include \"blobs.h\"\n#include \"ocrblock.h\"\n#include \"params.h\"\n\nstruct Pix;\n\nnamespace tesseract {\n\nclass Tesseract;\n\nvoid SetBlobStrokeWidth(Image pix, BLOBNBOX *blob);\nvoid assign_blobs_to_blocks2(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks);\n\nvoid tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/tospace.cpp",
    "content": "// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n/**********************************************************************\n * tospace.cpp\n *\n * Compute fuzzy word spacing thresholds for each row.\n * I.e. set :   max_nonspace\n *              space_threshold\n *              min_space\n *              kern_size\n *              space_size\n * for each row.\n * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE\n *\n * Note: functions in this file were originally not members of any\n * class or enclosed by any namespace. 
Now they are all static members\n * of the Textord class.\n *\n **********************************************************************/\n\n#include \"drawtord.h\"\n#include \"statistc.h\"\n#include \"textord.h\"\n#include \"tovars.h\"\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <algorithm>\n#include <cmath>\n#include <memory>\n\n#define MAXSPACING 128 /*max expected spacing in pix */\n\nnamespace tesseract {\nvoid Textord::to_spacing(ICOORD page_tr,       // topright of page\n                         TO_BLOCK_LIST *blocks // blocks on page\n) {\n  TO_BLOCK_IT block_it; // iterator\n  TO_BLOCK *block;      // current block;\n  TO_ROW *row;          // current row\n  int block_index;      // block number\n  int row_index;        // row number\n  // estimated width of real spaces for whole block\n  int16_t block_space_gap_width;\n  // estimated width of non space gaps for whole block\n  int16_t block_non_space_gap_width;\n  bool old_text_ord_proportional; // old fixed/prop result\n\n  block_it.set_to_list(blocks);\n  block_index = 1;\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block = block_it.data();\n    std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk\n    block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,\n                        block_non_space_gap_width);\n    // Make sure relative values of block-level space and non-space gap\n    // widths are reasonable. The ratio of 1:3 is also used in\n    // block_spacing_stats, to correct the block_space_gap_width.\n    // Useful for arabic and hindi, when the non-space gap width is\n    // often over-estimated and should not be trusted. 
A similar ratio\n    // is found in block_spacing_stats.\n    if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&\n        block_non_space_gap_width > block_space_gap_width / 3) {\n      block_non_space_gap_width = block_space_gap_width / 3;\n    }\n    // row iterator\n    TO_ROW_IT row_it(block->get_rows());\n    row_index = 1;\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      row = row_it.data();\n      if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) {\n        if ((tosp_debug_level > 0) && !old_text_ord_proportional) {\n          tprintf(\"Block %d Row %d: Now Proportional\\n\", block_index, row_index);\n        }\n        row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,\n                          block_non_space_gap_width);\n      } else {\n        if ((tosp_debug_level > 0) && old_text_ord_proportional) {\n          tprintf(\"Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\\n\", block_index,\n                  row_index, row->pitch_decision, row->fixed_pitch);\n        }\n      }\n#ifndef GRAPHICS_DISABLED\n      if (textord_show_initial_words) {\n        plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);\n      }\n#endif\n      row_index++;\n    }\n    block_index++;\n  }\n}\n\n/*************************************************************************\n * block_spacing_stats()\n *************************************************************************/\n\nvoid Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,\n                                  int16_t &block_space_gap_width,    // resulting estimate\n                                  int16_t &block_non_space_gap_width // resulting estimate\n) {\n  TO_ROW *row;         // current row\n  BLOBNBOX_IT blob_it; // iterator\n\n  STATS centre_to_centre_stats(0, MAXSPACING - 1);\n  // DEBUG USE ONLY\n  STATS all_gap_stats(0, 
MAXSPACING - 1);\n  STATS space_gap_stats(0, MAXSPACING - 1);\n  int16_t minwidth = MAXSPACING; // narrowest blob\n  TBOX blob_box;\n  TBOX prev_blob_box;\n  int16_t centre_to_centre;\n  int16_t gap_width;\n  float real_space_threshold;\n  float iqr_centre_to_centre; // DEBUG USE ONLY\n  float iqr_all_gap_stats;    // DEBUG USE ONLY\n  int32_t end_of_row;\n  int32_t row_length;\n\n  // row iterator\n  TO_ROW_IT row_it(block->get_rows());\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    if (!row->blob_list()->empty() &&\n        (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||\n         (row->pitch_decision == PITCH_CORR_PROP))) {\n      blob_it.set_to_list(row->blob_list());\n      blob_it.mark_cycle_pt();\n      end_of_row = blob_it.data_relative(-1)->bounding_box().right();\n      if (tosp_use_pre_chopping) {\n        blob_box = box_next_pre_chopped(&blob_it);\n      } else if (tosp_stats_use_xht_gaps) {\n        blob_box = reduced_box_next(row, &blob_it);\n      } else {\n        blob_box = box_next(&blob_it);\n      }\n      row_length = end_of_row - blob_box.left();\n      if (blob_box.width() < minwidth) {\n        minwidth = blob_box.width();\n      }\n      prev_blob_box = blob_box;\n      while (!blob_it.cycled_list()) {\n        if (tosp_use_pre_chopping) {\n          blob_box = box_next_pre_chopped(&blob_it);\n        } else if (tosp_stats_use_xht_gaps) {\n          blob_box = reduced_box_next(row, &blob_it);\n        } else {\n          blob_box = box_next(&blob_it);\n        }\n        if (blob_box.width() < minwidth) {\n          minwidth = blob_box.width();\n        }\n        int16_t left = prev_blob_box.right();\n        int16_t right = blob_box.left();\n        gap_width = right - left;\n        if (!ignore_big_gap(row, row_length, gapmap, left, right)) {\n          all_gap_stats.add(gap_width, 1);\n\n          centre_to_centre = (right + blob_box.right() - 
(prev_blob_box.left() + left)) / 2;\n          // DEBUG\n          centre_to_centre_stats.add(centre_to_centre, 1);\n          // DEBUG\n        }\n        prev_blob_box = blob_box;\n      }\n    }\n  }\n\n  // Inadequate samples\n  if (all_gap_stats.get_total() <= 1) {\n    block_non_space_gap_width = minwidth;\n    block_space_gap_width = -1; // No est. space width\n                                // DEBUG\n    old_text_ord_proportional = true;\n  } else {\n    /* For debug only ..... */\n    iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);\n    iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);\n    old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;\n    /* .......For debug only */\n\n    /*\nThe median of the gaps is used as an estimate of the NON-SPACE gap width.\nThis RELIES on the assumption that there are more gaps WITHIN words than\nBETWEEN words in a block\n\nNow try to estimate the width of a real space for all real spaces in the\nblock. 
Do this by using a crude threshold to ignore \"narrow\" gaps, then\nfind the median of the \"wide\" gaps and use this.\n*/\n    block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));\n    // median gap\n\n    row_it.set_to_list(block->get_rows());\n    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n      row = row_it.data();\n      if (!row->blob_list()->empty() &&\n          (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||\n           (row->pitch_decision == PITCH_CORR_PROP))) {\n        real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,\n                                        tosp_init_guess_xht_mult * row->xheight);\n        blob_it.set_to_list(row->blob_list());\n        blob_it.mark_cycle_pt();\n        end_of_row = blob_it.data_relative(-1)->bounding_box().right();\n        if (tosp_use_pre_chopping) {\n          blob_box = box_next_pre_chopped(&blob_it);\n        } else if (tosp_stats_use_xht_gaps) {\n          blob_box = reduced_box_next(row, &blob_it);\n        } else {\n          blob_box = box_next(&blob_it);\n        }\n        row_length = blob_box.left() - end_of_row;\n        prev_blob_box = blob_box;\n        while (!blob_it.cycled_list()) {\n          if (tosp_use_pre_chopping) {\n            blob_box = box_next_pre_chopped(&blob_it);\n          } else if (tosp_stats_use_xht_gaps) {\n            blob_box = reduced_box_next(row, &blob_it);\n          } else {\n            blob_box = box_next(&blob_it);\n          }\n          int16_t left = prev_blob_box.right();\n          int16_t right = blob_box.left();\n          gap_width = right - left;\n          if ((gap_width > real_space_threshold) &&\n              !ignore_big_gap(row, row_length, gapmap, left, right)) {\n            /*\nIf tosp_use_cert_spaces is enabled, the estimate of the space gap is\nrestricted to obvious spaces - those wider than half the xht or\nthose with wide blobs on 
both sides - i.e not things that are\nsuspect 1's or punctuation that is sometimes widely spaced.\n*/\n            if (!tosp_block_use_cert_spaces ||\n                (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||\n                ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&\n                 (!tosp_narrow_blobs_not_cert ||\n                  (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||\n                (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {\n              space_gap_stats.add(gap_width, 1);\n            }\n          }\n          prev_blob_box = blob_box;\n        }\n      }\n    }\n    // Inadequate samples\n    if (space_gap_stats.get_total() <= 2) {\n      block_space_gap_width = -1; // No est. space width\n    } else {\n      block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),\n                                       static_cast<int16_t>(3 * block_non_space_gap_width));\n    }\n  }\n}\n\n/*************************************************************************\n * row_spacing_stats()\n * Set values for min_space, max_non_space based on row stats only\n * If failure - return 0 values.\n *************************************************************************/\nvoid Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,\n                                int16_t block_space_gap_width,    // estimate for block\n                                int16_t block_non_space_gap_width // estimate for block\n) {\n  // iterator\n  BLOBNBOX_IT blob_it = row->blob_list();\n  STATS all_gap_stats(0, MAXSPACING - 1);\n  STATS cert_space_gap_stats(0, MAXSPACING - 1);\n  STATS all_space_gap_stats(0, MAXSPACING - 1);\n  STATS small_gap_stats(0, MAXSPACING - 1);\n  TBOX blob_box;\n  TBOX prev_blob_box;\n  int16_t gap_width;\n  int16_t real_space_threshold = 0;\n  int16_t max = 0;\n  int16_t large_gap_count = 0;\n  bool suspected_table;\n  
bool good_block_space_estimate = block_space_gap_width > 0;\n  int32_t end_of_row;\n  int32_t row_length = 0;\n  float sane_space;\n  int32_t sane_threshold;\n\n  /* Collect first pass stats for row */\n\n  if (!good_block_space_estimate) {\n    block_space_gap_width = int16_t(std::floor(row->xheight / 2));\n  }\n  if (!row->blob_list()->empty()) {\n    if (tosp_threshold_bias1 > 0) {\n      real_space_threshold =\n          block_non_space_gap_width +\n          int16_t(floor(0.5 + tosp_threshold_bias1 *\n                                  (block_space_gap_width - block_non_space_gap_width)));\n    } else {\n      real_space_threshold = // Old TO method\n          (block_space_gap_width + block_non_space_gap_width) / 2;\n    }\n    blob_it.set_to_list(row->blob_list());\n    blob_it.mark_cycle_pt();\n    end_of_row = blob_it.data_relative(-1)->bounding_box().right();\n    if (tosp_use_pre_chopping) {\n      blob_box = box_next_pre_chopped(&blob_it);\n    } else if (tosp_stats_use_xht_gaps) {\n      blob_box = reduced_box_next(row, &blob_it);\n    } else {\n      blob_box = box_next(&blob_it);\n    }\n    row_length = end_of_row - blob_box.left();\n    prev_blob_box = blob_box;\n    while (!blob_it.cycled_list()) {\n      if (tosp_use_pre_chopping) {\n        blob_box = box_next_pre_chopped(&blob_it);\n      } else if (tosp_stats_use_xht_gaps) {\n        blob_box = reduced_box_next(row, &blob_it);\n      } else {\n        blob_box = box_next(&blob_it);\n      }\n      int16_t left = prev_blob_box.right();\n      int16_t right = blob_box.left();\n      gap_width = right - left;\n      if (ignore_big_gap(row, row_length, gapmap, left, right)) {\n        large_gap_count++;\n      } else {\n        if (gap_width >= real_space_threshold) {\n          if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||\n              ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&\n               (!tosp_narrow_blobs_not_cert ||\n             
   (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||\n              (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {\n            cert_space_gap_stats.add(gap_width, 1);\n          }\n          all_space_gap_stats.add(gap_width, 1);\n        } else {\n          small_gap_stats.add(gap_width, 1);\n        }\n        all_gap_stats.add(gap_width, 1);\n      }\n      prev_blob_box = blob_box;\n    }\n  }\n  suspected_table = (large_gap_count > 1) ||\n                    ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));\n\n  /* Now determine row kern size, space size and threshold */\n\n  if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||\n      ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&\n       cert_space_gap_stats.get_total() > 0)) {\n    old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,\n                  block_space_gap_width, block_non_space_gap_width);\n  } else {\n    if (!tosp_recovery_isolated_row_stats ||\n        !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {\n      if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {\n        tprintf(\"B:%d R:%d -- Inadequate certain spaces.\\n\", block_idx, row_idx);\n      }\n      if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {\n        // Use block default\n        row->space_size = block_space_gap_width;\n        if (all_gap_stats.get_total() > tosp_redo_kern_limit) {\n          row->kern_size = all_gap_stats.median();\n        } else {\n          row->kern_size = block_non_space_gap_width;\n        }\n        row->space_threshold =\n            int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));\n      } else {\n        old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,\n                      block_space_gap_width, block_non_space_gap_width);\n      }\n    }\n  }\n\n  
if (tosp_improve_thresh && !suspected_table) {\n    improve_row_threshold(row, &all_gap_stats);\n  }\n\n  /* Now lets try to be careful not to do anything silly with tables when we\nare ignoring big gaps*/\n  if (tosp_sanity_method == 0) {\n    if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {\n      if (tosp_debug_level > 5) {\n        tprintf(\"B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\\n\", block_idx, row_idx,\n                row->kern_size, row->space_threshold, row->space_size);\n      }\n      row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);\n      row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);\n    }\n  } else if (tosp_sanity_method == 1) {\n    sane_space = row->space_size;\n    /* NEVER let space size get too close to kern size */\n    if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||\n        ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {\n      if (good_block_space_estimate &&\n          (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {\n        sane_space = block_space_gap_width;\n      } else {\n        sane_space =\n            std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),\n                     row->xheight / 2.0f);\n      }\n      if (tosp_debug_level > 5) {\n        tprintf(\"B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\\n\", block_idx, row_idx,\n                row->kern_size, row->space_threshold, row->space_size, sane_space);\n      }\n      row->space_size = sane_space;\n      row->space_threshold =\n          int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));\n    }\n    /* NEVER let threshold get VERY far away from kern */\n    sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));\n    if (row->space_threshold > sane_threshold) {\n      if 
(tosp_debug_level > 5) {\n        tprintf(\"B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\\n\", block_idx, row_idx,\n                row->kern_size, row->space_threshold, row->space_size, sane_threshold);\n      }\n      row->space_threshold = sane_threshold;\n      if (row->space_size <= sane_threshold) {\n        row->space_size = row->space_threshold + 1.0f;\n      }\n    }\n    /* Beware of tables - there may be NO spaces */\n    if (suspected_table) {\n      sane_space =\n          std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);\n      sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));\n\n      if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {\n        if (tosp_debug_level > 5) {\n          tprintf(\"B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\\n\", block_idx, row_idx,\n                  row->kern_size, row->space_threshold, row->space_size);\n        }\n        // the minimum sane value\n        row->space_threshold = static_cast<int32_t>(sane_space);\n        row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);\n      }\n    }\n  }\n\n  /* Now lets try to put some error limits on the threshold */\n\n  if (tosp_old_to_method) {\n    /* Old textord made a space if gap >= threshold */\n    // NO FUZZY SPACES YET\n    row->max_nonspace = row->space_threshold;\n    // NO FUZZY SPACES       YET\n    row->min_space = row->space_threshold + 1;\n  } else {\n    /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */\n    row->min_space =\n        std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));\n    if (row->min_space <= row->space_threshold) {\n      // Don't be silly\n      row->min_space = row->space_threshold + 1;\n    }\n    /*\nLets try to guess the max certain kern gap by looking at the cluster of\nkerns for the row. 
The row is proportional so the kerns should cluster\ntightly at the bottom of the distribution. We also expect most gaps to be\nkerns. Find the maximum of the kern piles between 0 and twice the kern\nestimate. Piles before the first one with less than 1/10 the maximum\nnumber of samples can be taken as certain kerns.\n\n  Of course, there are some cases where the kern peak and space peaks merge,\n  so we will put an UPPER limit on the max certain kern gap of some fraction\n  below the threshold.\n*/\n\n    // upper bound\n    int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);\n\n    // default\n    row->max_nonspace = max_max_nonspace;\n    for (int32_t index = 0; index <= max_max_nonspace; index++) {\n      if (all_gap_stats.pile_count(index) > max) {\n        max = all_gap_stats.pile_count(index);\n      }\n      if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {\n        row->max_nonspace = index;\n        break;\n      }\n    }\n  }\n\n  /* Yet another algorithm - simpler this time - just choose a fraction of the\nthreshold to space range */\n\n  if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {\n    row->min_space = std::max(\n        row->min_space, static_cast<int32_t>(ceil(row->space_threshold +\n                                                  tosp_fuzzy_sp_fraction *\n                                                      (row->space_size - row->space_threshold))));\n  }\n\n  /* Ensure that ANY space less than some multiplier times the kern size is\nfuzzy.  In tables there is a risk of erroneously setting a small space size\nwhen there are no real spaces. Sometimes tables have text squashed into\ncolumns so that the kn->sp ratio is small anyway - this means that we can't\nuse this to force a wider separation - hence we rely on context to join any\ndubious breaks. 
*/\n\n  if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {\n    row->min_space = std::max(\n        row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));\n  }\n\n  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {\n    row->max_nonspace = static_cast<int32_t>(floor(\n        0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));\n  }\n  if (row->max_nonspace > row->space_threshold) {\n    // Don't be silly\n    row->max_nonspace = row->space_threshold;\n  }\n\n  if (tosp_debug_level > 5) {\n    tprintf(\n        \"B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) \"\n        \"Sp:%3.2f\\n\",\n        block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,\n        real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,\n        row->min_space, row->space_size);\n  }\n  if (tosp_debug_level > 10) {\n    tprintf(\n        \"row->kern_size = %3.2f, row->space_size = %3.2f, \"\n        \"row->space_threshold = %d\\n\",\n        row->kern_size, row->space_size, row->space_threshold);\n  }\n}\n\nvoid Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,\n                            STATS *small_gap_stats,\n                            int16_t block_space_gap_width,    // estimate for block\n                            int16_t block_non_space_gap_width // estimate for block\n) {\n  /* First, estimate row space size */\n  /* Old to condition was > 2 */\n  if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) {\n    // Adequate samples\n    /* Set space size to median of spaces BUT limits it if it seems wildly out\n     */\n    row->space_size = space_gap_stats->median();\n    if (row->space_size > block_space_gap_width * 1.5) {\n      if (tosp_old_to_bug_fix) {\n        row->space_size = block_space_gap_width * 1.5;\n      } 
else {\n        // BUG??? should be *1.5\n        row->space_size = block_space_gap_width;\n      }\n    }\n    if (row->space_size < (block_non_space_gap_width * 2) + 1) {\n      row->space_size = (block_non_space_gap_width * 2) + 1;\n    }\n  }\n  // Only 1 or 2 samples\n  else if (space_gap_stats->get_total() >= 1) {\n    // hence mean not median\n    row->space_size = space_gap_stats->mean();\n    if (row->space_size > block_space_gap_width * 1.5) {\n      if (tosp_old_to_bug_fix) {\n        row->space_size = block_space_gap_width * 1.5;\n      } else {\n        // BUG??? should be *1.5\n        row->space_size = block_space_gap_width;\n      }\n    }\n    if (row->space_size < (block_non_space_gap_width * 3) + 1) {\n      row->space_size = (block_non_space_gap_width * 3) + 1;\n    }\n  } else {\n    // Use block default\n    row->space_size = block_space_gap_width;\n  }\n\n  /* Next, estimate row kern size */\n  if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {\n    row->kern_size = small_gap_stats->median();\n  } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {\n    row->kern_size = all_gap_stats->median();\n  } else { // old TO -SAME FOR ALL ROWS\n    row->kern_size = block_non_space_gap_width;\n  }\n\n  /* Finally, estimate row space threshold */\n  if (tosp_threshold_bias2 > 0) {\n    row->space_threshold = int32_t(\n        floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));\n  } else {\n    /*\n  NOTE old text ord uses (space_size + kern_size + 1)/2  as the threshold\nand holds this in a float. 
The use is with a >= test\nNEW textord uses an integer threshold and a > test\nIt comes to the same thing.\n  (Though there is a difference in that old textor has integer space_size\n  and kern_size.)\n*/\n    row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));\n  }\n\n  // Apply the same logic and ratios as in row_spacing_stats to\n  // restrict relative values of the row's space_size, kern_size, and\n  // space_threshold\n  if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&\n      ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||\n       ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) {\n    if (row->kern_size > 2.5) {\n      row->kern_size = row->space_size / tosp_min_sane_kn_sp;\n    }\n    row->space_threshold =\n        int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));\n  }\n}\n\n/*************************************************************************\n * isolated_row_stats()\n * Set values for min_space, max_non_space based on row stats only\n *************************************************************************/\nbool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,\n                                 bool suspected_table, int16_t block_idx, int16_t row_idx) {\n  float kern_estimate;\n  float crude_threshold_estimate;\n  int16_t small_gaps_count;\n  int16_t total;\n  // iterator\n  BLOBNBOX_IT blob_it = row->blob_list();\n  STATS cert_space_gap_stats(0, MAXSPACING - 1);\n  STATS all_space_gap_stats(0, MAXSPACING - 1);\n  STATS small_gap_stats(0, MAXSPACING - 1);\n  TBOX blob_box;\n  TBOX prev_blob_box;\n  int16_t gap_width;\n  int32_t end_of_row;\n  int32_t row_length;\n\n  kern_estimate = all_gap_stats->median();\n  crude_threshold_estimate =\n      std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);\n  small_gaps_count =\n      
stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));\n  total = all_gap_stats->get_total();\n\n  if ((total <= tosp_redo_kern_limit) ||\n      ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||\n      (total - small_gaps_count < 1)) {\n    if (tosp_debug_level > 5) {\n      tprintf(\"B:%d R:%d -- Can't do isolated row stats.\\n\", block_idx, row_idx);\n    }\n    return false;\n  }\n  blob_it.set_to_list(row->blob_list());\n  blob_it.mark_cycle_pt();\n  end_of_row = blob_it.data_relative(-1)->bounding_box().right();\n  if (tosp_use_pre_chopping) {\n    blob_box = box_next_pre_chopped(&blob_it);\n  } else if (tosp_stats_use_xht_gaps) {\n    blob_box = reduced_box_next(row, &blob_it);\n  } else {\n    blob_box = box_next(&blob_it);\n  }\n  row_length = end_of_row - blob_box.left();\n  prev_blob_box = blob_box;\n  while (!blob_it.cycled_list()) {\n    if (tosp_use_pre_chopping) {\n      blob_box = box_next_pre_chopped(&blob_it);\n    } else if (tosp_stats_use_xht_gaps) {\n      blob_box = reduced_box_next(row, &blob_it);\n    } else {\n      blob_box = box_next(&blob_it);\n    }\n    int16_t left = prev_blob_box.right();\n    int16_t right = blob_box.left();\n    gap_width = right - left;\n    if (!ignore_big_gap(row, row_length, gapmap, left, right) &&\n        (gap_width > crude_threshold_estimate)) {\n      if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||\n          ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&\n           (!tosp_narrow_blobs_not_cert ||\n            (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||\n          (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {\n        cert_space_gap_stats.add(gap_width, 1);\n      }\n      all_space_gap_stats.add(gap_width, 1);\n    }\n    if (gap_width < crude_threshold_estimate) {\n      small_gap_stats.add(gap_width, 1);\n    }\n\n    prev_blob_box = blob_box;\n  }\n  if 
(cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {\n    // median\n    row->space_size = cert_space_gap_stats.median();\n  } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {\n    // to avoid spaced\n    row->space_size = cert_space_gap_stats.mean();\n  //      1's in tables\n  } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {\n    // median\n    row->space_size = all_space_gap_stats.median();\n  } else {\n    row->space_size = all_space_gap_stats.mean();\n  }\n\n  if (tosp_only_small_gaps_for_kern) {\n    row->kern_size = small_gap_stats.median();\n  } else {\n    row->kern_size = all_gap_stats->median();\n  }\n  row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));\n  /* Sanity check */\n  if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||\n      (row->space_threshold <= 0)) {\n    if (tosp_debug_level > 5) {\n      tprintf(\"B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\\n\", block_idx, row_idx,\n              row->kern_size, row->space_threshold, row->space_size);\n    }\n    row->kern_size = 0.0f;\n    row->space_threshold = 0;\n    row->space_size = 0.0f;\n    return false;\n  }\n\n  if (tosp_debug_level > 5) {\n    tprintf(\"B:%d R:%d -- Isolated row stats: %f %d %f\\n\", block_idx, row_idx, row->kern_size,\n            row->space_threshold, row->space_size);\n  }\n  return true;\n}\n\nint16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {\n  int16_t index;\n  int16_t total = 0;\n\n  for (index = 0; index < threshold; index++) {\n    total += stats->pile_count(index);\n  }\n  return total;\n}\n\n/*************************************************************************\n * improve_row_threshold()\n *    Try to recognise a \"normal line\" -\n *           > 25 gaps\n *     &&    space > 3 * kn  && space > 10\n *              (I.e. 
reasonably large space and kn:sp ratio)\n *     &&    > 3/4 # gaps < kn + (sp - kn)/3\n *              (I.e. most gaps are well away from space estimate)\n *     &&    a gap of max(3, (sp - kn) / 3) empty histogram positions is found\n *           somewhere in the histogram between kn and sp\n *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies\n *          NO!!!!! the bristol line has \"11\" with a gap of 12 between the\n *1's!!! try moving the default threshold to within this band but leave the\n *          fuzzy limit calculation as at present.\n *************************************************************************/\nvoid Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {\n  float sp = row->space_size;\n  float kn = row->kern_size;\n  int16_t reqd_zero_width = 0;\n  int16_t zero_width = 0;\n  int16_t zero_start = 0;\n  int16_t index = 0;\n\n  if (tosp_debug_level > 10) {\n    tprintf(\"Improve row threshold 0\");\n  }\n  if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||\n      (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) <\n       (0.75 * all_gap_stats->get_total()))) {\n    return;\n  }\n  if (tosp_debug_level > 10) {\n    tprintf(\" 1\");\n  }\n  /*\nLook for the first region of all 0's in the histogram which is wider than\nmax(3, (sp - kn) / 3) and starts between kn and sp. 
If found, and current\nthreshold is not within it, move the threshold so that is just inside it.\n*/\n  reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5));\n  if (reqd_zero_width < 3) {\n    reqd_zero_width = 3;\n  }\n\n  for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {\n    if (all_gap_stats->pile_count(index) == 0) {\n      if (zero_width == 0) {\n        zero_start = index;\n      }\n      zero_width++;\n    } else {\n      if (zero_width >= reqd_zero_width) {\n        break;\n      } else {\n        zero_width = 0;\n      }\n    }\n  }\n  index--;\n  if (tosp_debug_level > 10) {\n    tprintf(\" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n\", reqd_zero_width,\n            zero_width, zero_start, row->space_threshold);\n  }\n  if ((zero_width < reqd_zero_width) ||\n      ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {\n    return;\n  }\n  if (tosp_debug_level > 10) {\n    tprintf(\" 2\");\n  }\n  if (row->space_threshold < zero_start) {\n    if (tosp_debug_level > 5) {\n      tprintf(\"Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\\n\", kn, sp, zero_start,\n              index, row->space_threshold, zero_start);\n    }\n    row->space_threshold = zero_start;\n  }\n  if (row->space_threshold > index) {\n    if (tosp_debug_level > 5) {\n      tprintf(\"Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\\n\", kn, sp, zero_start,\n              index, row->space_threshold, index);\n    }\n    row->space_threshold = index;\n  }\n}\n\n/**********************************************************************\n * make_prop_words\n *\n * Convert a TO_ROW to a ROW.\n **********************************************************************/\nROW *Textord::make_prop_words(TO_ROW *row,    // row to make\n                              FCOORD rotation // for drawing\n) {\n  bool bol; // start of line\n  /* prev_ values are for start of word being built. 
non prev_ values are for\nthe gap between the word being built and the next one. */\n  bool prev_fuzzy_sp;     // probably space\n  bool prev_fuzzy_non;    // probably not\n  uint8_t prev_blanks;    // in front of word\n  bool fuzzy_sp = false;  // probably space\n  bool fuzzy_non = false; // probably not\n  uint8_t blanks = 0;     // in front of word\n  bool prev_gap_was_a_space = false;\n  bool break_at_next_gap = false;\n  ROW *real_row; // output row\n  C_OUTLINE_IT cout_it;\n  C_BLOB_LIST cblobs;\n  C_BLOB_IT cblob_it = &cblobs;\n  WERD_LIST words;\n  WERD *word; // new word\n  int32_t next_rep_char_word_right = INT32_MAX;\n  float repetition_spacing; // gap between repetitions\n  int32_t xstarts[2];       // row ends\n  int32_t prev_x;           // end of prev blob\n  BLOBNBOX_IT box_it;       // iterator\n  TBOX prev_blob_box;\n  TBOX next_blob_box;\n  int16_t prev_gap = INT16_MAX;\n  int16_t current_gap = INT16_MAX;\n  int16_t next_gap = INT16_MAX;\n  int16_t prev_within_xht_gap = INT16_MAX;\n  int16_t current_within_xht_gap = INT16_MAX;\n  int16_t next_within_xht_gap = INT16_MAX;\n  int16_t word_count = 0;\n\n  // repeated char words\n  WERD_IT rep_char_it(&(row->rep_words));\n  if (!rep_char_it.empty()) {\n    next_rep_char_word_right = rep_char_it.data()->bounding_box().right();\n  }\n\n  prev_x = -INT16_MAX;\n  cblob_it.set_to_list(&cblobs);\n  box_it.set_to_list(row->blob_list());\n  // new words\n  WERD_IT word_it(&words);\n  bol = true;\n  prev_blanks = 0;\n  prev_fuzzy_sp = false;\n  prev_fuzzy_non = false;\n  if (!box_it.empty()) {\n    xstarts[0] = box_it.data()->bounding_box().left();\n    if (xstarts[0] > next_rep_char_word_right) {\n      /* We need to insert a repeated char word at the start of the row */\n      word = rep_char_it.extract();\n      word_it.add_after_then_move(word);\n      /* Set spaces before repeated char word */\n      word->set_flag(W_BOL, true);\n      bol = false;\n      word->set_blanks(0);\n      // NO uncertainty\n    
  word->set_flag(W_FUZZY_SP, false);\n      word->set_flag(W_FUZZY_NON, false);\n      xstarts[0] = word->bounding_box().left();\n      /* Set spaces after repeated char word (and leave current word set) */\n      repetition_spacing = find_mean_blob_spacing(word);\n      current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;\n      current_within_xht_gap = current_gap;\n      if (current_gap > tosp_rep_space * repetition_spacing) {\n        prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));\n        if (prev_blanks < 1) {\n          prev_blanks = 1;\n        }\n      } else {\n        prev_blanks = 0;\n      }\n      if (tosp_debug_level > 5) {\n        tprintf(\"Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  \",\n                box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),\n                repetition_spacing, current_gap);\n      }\n      prev_fuzzy_sp = false;\n      prev_fuzzy_non = false;\n      if (rep_char_it.empty()) {\n        next_rep_char_word_right = INT32_MAX;\n      } else {\n        rep_char_it.forward();\n        next_rep_char_word_right = rep_char_it.data()->bounding_box().right();\n      }\n    }\n\n    peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);\n    do {\n      auto bblob = box_it.data();\n      auto blob_box = bblob->bounding_box();\n      if (bblob->joined_to_prev()) {\n        auto cblob = bblob->remove_cblob();\n        if (cblob != nullptr) {\n          cout_it.set_to_list(cblob_it.data()->out_list());\n          cout_it.move_to_last();\n          cout_it.add_list_after(cblob->out_list());\n          delete cblob;\n        }\n      } else {\n        auto cblob = bblob->cblob();\n        if (cblob != nullptr) {\n          bblob->set_owns_cblob(false);\n          cblob_it.add_after_then_move(cblob);\n        }\n        prev_x = blob_box.right();\n      }\n      box_it.forward(); // next one\n      bblob = 
box_it.data();\n      blob_box = bblob->bounding_box();\n\n      if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {\n        /* Real Blob - not multiple outlines or pre-chopped */\n        prev_gap = current_gap;\n        prev_within_xht_gap = current_within_xht_gap;\n        prev_blob_box = next_blob_box;\n        current_gap = next_gap;\n        current_within_xht_gap = next_within_xht_gap;\n        peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);\n\n        int16_t prev_gap_arg = prev_gap;\n        int16_t next_gap_arg = next_gap;\n        if (tosp_only_use_xht_gaps) {\n          prev_gap_arg = prev_within_xht_gap;\n          next_gap_arg = next_within_xht_gap;\n        }\n        // Decide if a word-break should be inserted\n        if (blob_box.left() > next_rep_char_word_right ||\n            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,\n                              current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,\n                              fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||\n            box_it.at_first()) {\n          /* Form a new word out of the blobs collected */\n          word = new WERD(&cblobs, prev_blanks, nullptr);\n          word_count++;\n          word_it.add_after_then_move(word);\n          if (bol) {\n            word->set_flag(W_BOL, true);\n            bol = false;\n          }\n          if (prev_fuzzy_sp) {\n            // probably space\n            word->set_flag(W_FUZZY_SP, true);\n          } else if (prev_fuzzy_non) {\n            word->set_flag(W_FUZZY_NON, true);\n          }\n          // probably not\n\n          if (blob_box.left() > next_rep_char_word_right) {\n            /* We need to insert a repeated char word */\n            word = rep_char_it.extract();\n            word_it.add_after_then_move(word);\n\n            /* Set spaces before repeated char word */\n            repetition_spacing = 
find_mean_blob_spacing(word);\n            current_gap = word->bounding_box().left() - prev_x;\n            current_within_xht_gap = current_gap;\n            if (current_gap > tosp_rep_space * repetition_spacing) {\n              blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));\n              if (blanks < 1) {\n                blanks = 1;\n              }\n            } else {\n              blanks = 0;\n            }\n            if (tosp_debug_level > 5) {\n              tprintf(\"Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);\",\n                      word->bounding_box().left(), word->bounding_box().bottom(),\n                      repetition_spacing, current_gap, blanks);\n            }\n            word->set_blanks(blanks);\n            // NO uncertainty\n            word->set_flag(W_FUZZY_SP, false);\n            word->set_flag(W_FUZZY_NON, false);\n\n            /* Set spaces after repeated char word (and leave current word set)\n             */\n            current_gap = blob_box.left() - next_rep_char_word_right;\n            if (current_gap > tosp_rep_space * repetition_spacing) {\n              blanks = static_cast<uint8_t>(current_gap / row->space_size);\n              if (blanks < 1) {\n                blanks = 1;\n              }\n            } else {\n              blanks = 0;\n            }\n            if (tosp_debug_level > 5) {\n              tprintf(\" Rgap:%d (%d blanks)\\n\", current_gap, blanks);\n            }\n            fuzzy_sp = false;\n            fuzzy_non = false;\n\n            if (rep_char_it.empty()) {\n              next_rep_char_word_right = INT32_MAX;\n            } else {\n              rep_char_it.forward();\n              next_rep_char_word_right = rep_char_it.data()->bounding_box().right();\n            }\n          }\n\n          if (box_it.at_first() && rep_char_it.empty()) {\n            // at end of line\n            word->set_flag(W_EOL, true);\n            xstarts[1] = prev_x;\n       
   } else {\n            prev_blanks = blanks;\n            prev_fuzzy_sp = fuzzy_sp;\n            prev_fuzzy_non = fuzzy_non;\n          }\n        }\n      }\n    } while (!box_it.at_first()); // until back at start\n\n    /* Insert any further repeated char words */\n    while (!rep_char_it.empty()) {\n      word = rep_char_it.extract();\n      word_it.add_after_then_move(word);\n\n      /* Set spaces before repeated char word */\n      repetition_spacing = find_mean_blob_spacing(word);\n      current_gap = word->bounding_box().left() - prev_x;\n      if (current_gap > tosp_rep_space * repetition_spacing) {\n        blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));\n        if (blanks < 1) {\n          blanks = 1;\n        }\n      } else {\n        blanks = 0;\n      }\n      if (tosp_debug_level > 5) {\n        tprintf(\"Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\\n\",\n                word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,\n                current_gap, blanks);\n      }\n      word->set_blanks(blanks);\n      // NO uncertainty\n      word->set_flag(W_FUZZY_SP, false);\n      word->set_flag(W_FUZZY_NON, false);\n      prev_x = word->bounding_box().right();\n      if (rep_char_it.empty()) {\n        // at end of line\n        word->set_flag(W_EOL, true);\n        xstarts[1] = prev_x;\n      } else {\n        rep_char_it.forward();\n      }\n    }\n    real_row =\n        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));\n    word_it.set_to_list(real_row->word_list());\n    // put words in row\n    word_it.add_list_after(&words);\n    real_row->recalc_bounding_box();\n\n    if (tosp_debug_level > 4) {\n      tprintf(\"Row: Made %d words in row ((%d,%d)(%d,%d))\\n\", word_count,\n              real_row->bounding_box().left(), real_row->bounding_box().bottom(),\n              real_row->bounding_box().right(), 
real_row->bounding_box().top());\n    }\n    return real_row;\n  }\n  return nullptr;\n}\n\n/**********************************************************************\n * make_blob_words\n *\n * Converts words into blobs so that each blob is a single character.\n *  Used for chopper test.\n **********************************************************************/\nROW *Textord::make_blob_words(TO_ROW *row,    // row to make\n                              FCOORD rotation // for drawing\n) {\n  bool bol;      // start of line\n  ROW *real_row; // output row\n  C_OUTLINE_IT cout_it;\n  C_BLOB_LIST cblobs;\n  C_BLOB_IT cblob_it = &cblobs;\n  WERD_LIST words;\n  WERD *word;         // new word\n  BLOBNBOX_IT box_it; // iterator\n  int16_t word_count = 0;\n\n  cblob_it.set_to_list(&cblobs);\n  box_it.set_to_list(row->blob_list());\n  // new words\n  WERD_IT word_it(&words);\n  bol = true;\n  if (!box_it.empty()) {\n    do {\n      auto bblob = box_it.data();\n      auto blob_box = bblob->bounding_box();\n      if (bblob->joined_to_prev()) {\n        auto cblob = bblob->remove_cblob();\n        if (cblob != nullptr) {\n          cout_it.set_to_list(cblob_it.data()->out_list());\n          cout_it.move_to_last();\n          cout_it.add_list_after(cblob->out_list());\n          delete cblob;\n        }\n      } else {\n        auto cblob = bblob->cblob();\n        if (cblob != nullptr) {\n          bblob->set_owns_cblob(false);\n          cblob_it.add_after_then_move(cblob);\n        }\n      }\n      box_it.forward(); // next one\n      bblob = box_it.data();\n      blob_box = bblob->bounding_box();\n\n      if (!bblob->joined_to_prev() && !cblobs.empty()) {\n        word = new WERD(&cblobs, 1, nullptr);\n        word_count++;\n        word_it.add_after_then_move(word);\n        if (bol) {\n          word->set_flag(W_BOL, true);\n          bol = false;\n        }\n        if (box_it.at_first()) { // at end of line\n          word->set_flag(W_EOL, true);\n        }\n      }\n    
} while (!box_it.at_first()); // until back at start\n    /* Setup the row with created words. */\n    real_row =\n        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));\n    word_it.set_to_list(real_row->word_list());\n    // put words in row\n    word_it.add_list_after(&words);\n    real_row->recalc_bounding_box();\n    if (tosp_debug_level > 4) {\n      tprintf(\"Row:Made %d words in row ((%d,%d)(%d,%d))\\n\", word_count,\n              real_row->bounding_box().left(), real_row->bounding_box().bottom(),\n              real_row->bounding_box().right(), real_row->bounding_box().top());\n    }\n    return real_row;\n  }\n  return nullptr;\n}\n\nbool Textord::make_a_word_break(TO_ROW *row,   // row being made\n                                TBOX blob_box, // for next_blob // how many blanks?\n                                int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,\n                                int16_t within_xht_current_gap, TBOX next_blob_box,\n                                int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,\n                                bool &prev_gap_was_a_space, bool &break_at_next_gap) {\n  bool space;\n  int16_t current_gap;\n  float fuzzy_sp_to_kn_limit;\n\n  if (break_at_next_gap) {\n    break_at_next_gap = false;\n    return true;\n  }\n  /* Inhibit using the reduced gap if\n  The kerning is large - chars are not kerned and reducing \"f\"s can cause\n  erroneous blanks\nOR  The real gap is less than 0\nOR  The real gap is less than the kerning estimate\n*/\n  if ((row->kern_size > tosp_large_kerning * row->xheight) ||\n      ((tosp_dont_fool_with_small_kerns >= 0) &&\n       (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {\n    // Ignore the difference\n    within_xht_current_gap = real_current_gap;\n  }\n\n  if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {\n    current_gap = within_xht_current_gap;\n  } else {\n    
current_gap = real_current_gap;\n  }\n\n  if (tosp_old_to_method) {\n    // Boring old method\n    space = current_gap > row->max_nonspace;\n    if (space && (current_gap < INT16_MAX)) {\n      if (current_gap < row->min_space) {\n        if (current_gap > row->space_threshold) {\n          blanks = 1;\n          fuzzy_sp = true;\n          fuzzy_non = false;\n        } else {\n          blanks = 0;\n          fuzzy_sp = false;\n          fuzzy_non = true;\n        }\n      } else {\n        if (row->space_size == 0.0f) {\n          // Avoid FP division by 0.\n          blanks = 1;\n        } else {\n          blanks = static_cast<uint8_t>(current_gap / row->space_size);\n          if (blanks < 1) {\n            blanks = 1;\n          }\n        }\n        fuzzy_sp = false;\n        fuzzy_non = false;\n      }\n    }\n    return space;\n  } else {\n    /* New exciting heuristic method */\n    if (prev_blob_box.null_box()) { // Beginning of row\n      prev_gap_was_a_space = true;\n    }\n\n    // Default as old TO\n    space = current_gap > row->space_threshold;\n\n    /* Set defaults for the word break in case we find one.  Currently there are\nno fuzzy spaces. Depending on the reliability of the different heuristics\nwe may need to set PARTICULAR spaces to fuzzy or not. 
The values will ONLY\nbe used if the function returns true - ie the word is to be broken.\n*/\n    int num_blanks = current_gap;\n    if (row->space_size > 1.0f) {\n      num_blanks = IntCastRounded(current_gap / row->space_size);\n    }\n    blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));\n    fuzzy_sp = false;\n    fuzzy_non = false;\n    /*\nIf xht measure causes gap to flip one of the 3 thresholds act accordingly -\ndespite any other heuristics - the MINIMUM action is to pass a fuzzy kern to\ncontext.\n*/\n    if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&\n        (within_xht_current_gap > row->max_nonspace)) {\n      space = true;\n      fuzzy_non = true;\n#ifndef GRAPHICS_DISABLED\n      mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n               next_gap);\n#endif\n    } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&\n               (within_xht_current_gap > row->space_threshold)) {\n      space = true;\n      if (tosp_flip_fuzz_kn_to_sp) {\n        fuzzy_sp = true;\n      } else {\n        fuzzy_non = true;\n      }\n#ifndef GRAPHICS_DISABLED\n      mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n               next_gap);\n#endif\n    } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&\n               (within_xht_current_gap >= row->min_space)) {\n      space = true;\n#ifndef GRAPHICS_DISABLED\n      mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n               next_gap);\n#endif\n    } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&\n               suspected_punct_blob(row, blob_box)) {\n      break_at_next_gap = true;\n    }\n    /* Now continue with normal heuristics */\n    else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {\n      /* Heuristics to turn 
dubious spaces to kerns */\n      if (tosp_pass_wide_fuzz_sp_to_context > 0) {\n        fuzzy_sp_to_kn_limit =\n            row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);\n      } else {\n        fuzzy_sp_to_kn_limit = 99999.0f;\n      }\n\n      /* If current gap is significantly smaller than the previous space the\nother side of a narrow blob then this gap is a kern. */\n      if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&\n          (current_gap <= tosp_gap_factor * prev_gap)) {\n        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {\n          if (tosp_flip_fuzz_sp_to_kn) {\n            fuzzy_non = true;\n          } else {\n            fuzzy_sp = true;\n          }\n        } else {\n          space = false;\n        }\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      }\n      /* If current gap not much bigger than the previous kern the other side of\na narrow blob then this gap is a kern as well */\n      else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&\n               !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {\n        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {\n          if (tosp_flip_fuzz_sp_to_kn) {\n            fuzzy_non = true;\n          } else {\n            fuzzy_sp = true;\n          }\n        } else {\n          space = false;\n        }\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&\n                 (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {\n        if ((tosp_all_flips_fuzzy) || (current_gap > 
fuzzy_sp_to_kn_limit)) {\n          if (tosp_flip_fuzz_sp_to_kn) {\n            fuzzy_non = true;\n          } else {\n            fuzzy_sp = true;\n          }\n        } else {\n          space = false;\n        }\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&\n                 (next_gap <= row->space_threshold) &&\n                 (current_gap * tosp_gap_factor <= next_gap)) {\n        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {\n          if (tosp_flip_fuzz_sp_to_kn) {\n            fuzzy_non = true;\n          } else {\n            fuzzy_sp = true;\n          }\n        } else {\n          space = false;\n        }\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||\n                  ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {\n        fuzzy_sp = true;\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      }\n    } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {\n      /* Heuristics to turn dubious kerns to spaces */\n      /* TRIED THIS BUT IT MADE THINGS WORSE\n    if (prev_gap == INT16_MAX)\n      prev_gap = 0;  // start of row\n    if (next_gap == INT16_MAX)\n      next_gap = 0;  // end of row\n*/\n      if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&\n          (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&\n          wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {\n        space = true;\n        
/*\ntosp_flip_caution is an attempt to stop the default changing in cases\nwhere there is a large difference between the kern and space estimates.\n  See problem in 'chiefs' where \"have\" gets split in the quotation.\n*/\n        if ((tosp_flip_fuzz_kn_to_sp) &&\n            ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {\n          fuzzy_sp = true;\n        } else {\n          fuzzy_non = true;\n        }\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&\n                 current_gap > 5 && // Rule 9 handles small gap, big ratio.\n                 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&\n                 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&\n                 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {\n        space = true;\n        fuzzy_non = true;\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&\n                 (next_blob_box.width() > 0) &&\n                 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&\n                 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&\n                                              !suspected_punct_blob(row, next_blob_box)))) {\n        space = true;\n        fuzzy_non = true;\n#ifndef GRAPHICS_DISABLED\n        mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),\n                 next_gap);\n#endif\n      }\n    }\n    if (tosp_debug_level > 10) {\n      tprintf(\n          \"word break = %d current_gap = 
%d, prev_gap = %d, \"\n          \"next_gap = %d\\n\",\n          space ? 1 : 0, current_gap, prev_gap, next_gap);\n    }\n    prev_gap_was_a_space = space && !(fuzzy_non);\n    return space;\n  }\n}\n\nbool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {\n  bool result;\n  result =\n      ((blob_box.width() <= tosp_narrow_fraction * row->xheight) ||\n       ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio));\n  return result;\n}\n\nbool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {\n  bool result;\n  if (tosp_wide_fraction > 0) {\n    if (tosp_wide_aspect_ratio > 0) {\n      result =\n          ((blob_box.width() >= tosp_wide_fraction * row->xheight) &&\n           ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio));\n    } else {\n      result = (blob_box.width() >= tosp_wide_fraction * row->xheight);\n    }\n  } else {\n    result = !narrow_blob(row, blob_box);\n  }\n  return result;\n}\n\nbool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {\n  bool result;\n  float baseline;\n  float blob_x_centre;\n  /* Find baseline of centre of blob */\n  blob_x_centre = (box.right() + box.left()) / 2.0;\n  baseline = row->baseline.y(blob_x_centre);\n\n  result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) ||\n           (box.bottom() > baseline + row->xheight / 2.0);\n  return result;\n}\n\nvoid Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,\n                               int16_t &next_gap, int16_t &next_within_xht_gap) {\n  TBOX next_reduced_blob_box;\n  TBOX bit_beyond;\n  BLOBNBOX_IT reduced_box_it = box_it;\n\n  next_blob_box = box_next(&box_it);\n  next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);\n  if (box_it.at_first()) {\n    next_gap = INT16_MAX;\n    next_within_xht_gap = INT16_MAX;\n  } else {\n    bit_beyond = box_it.data()->bounding_box();\n    next_gap = bit_beyond.left() - 
next_blob_box.right();\n    bit_beyond = reduced_box_next(row, &reduced_box_it);\n    next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\nvoid Textord::mark_gap(TBOX blob,    // blob following gap\n                       int16_t rule, // heuristic id\n                       int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,\n                       int16_t next_blob_width, int16_t next_gap) {\n  ScrollView::Color col; // of ellipse marking flipped gap\n\n  switch (rule) {\n    case 1:\n      col = ScrollView::RED;\n      break;\n    case 2:\n      col = ScrollView::CYAN;\n      break;\n    case 3:\n      col = ScrollView::GREEN;\n      break;\n    case 4:\n      col = ScrollView::BLACK;\n      break;\n    case 5:\n      col = ScrollView::MAGENTA;\n      break;\n    case 6:\n      col = ScrollView::BLUE;\n      break;\n\n    case 7:\n      col = ScrollView::WHITE;\n      break;\n    case 8:\n      col = ScrollView::YELLOW;\n      break;\n    case 9:\n      col = ScrollView::BLACK;\n      break;\n\n    case 20:\n      col = ScrollView::CYAN;\n      break;\n    case 21:\n      col = ScrollView::GREEN;\n      break;\n    case 22:\n      col = ScrollView::MAGENTA;\n      break;\n    default:\n      col = ScrollView::BLACK;\n  }\n  if (textord_show_initial_words) {\n    to_win->Pen(col);\n    /*  if (rule < 20)\n    //interior_style(to_win, INT_SOLID, false);\n  else\n    //interior_style(to_win, INT_HOLLOW, true);*/\n    // x radius\n    to_win->Ellipse(current_gap / 2.0f,\n                    blob.height() / 2.0f, // y radius\n                                          // x centre\n                    blob.left() - current_gap / 2.0f,\n                    // y centre\n                    blob.bottom() + blob.height() / 2.0f);\n  }\n  if (tosp_debug_level > 5) {\n    tprintf(\"  (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\\n\", blob.left() - current_gap / 2,\n            blob.bottom(), rule, prev_gap, 
prev_blob_width, current_gap, next_blob_width, next_gap);\n  }\n}\n#endif\n\nfloat Textord::find_mean_blob_spacing(WERD *word) {\n  C_BLOB_IT cblob_it;\n  TBOX blob_box;\n  int32_t gap_sum = 0;\n  int16_t gap_count = 0;\n  int16_t prev_right;\n\n  cblob_it.set_to_list(word->cblob_list());\n  if (!cblob_it.empty()) {\n    cblob_it.mark_cycle_pt();\n    prev_right = cblob_it.data()->bounding_box().right();\n    // first blob\n    cblob_it.forward();\n    for (; !cblob_it.cycled_list(); cblob_it.forward()) {\n      blob_box = cblob_it.data()->bounding_box();\n      gap_sum += blob_box.left() - prev_right;\n      gap_count++;\n      prev_right = blob_box.right();\n    }\n  }\n  if (gap_count > 0) {\n    return (gap_sum / static_cast<float>(gap_count));\n  } else {\n    return 0.0f;\n  }\n}\n\nbool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,\n                             int16_t right) {\n  int16_t gap = right - left + 1;\n\n  if (tosp_ignore_big_gaps > 999) {\n    return false; // Don't ignore\n  }\n  if (tosp_ignore_big_gaps > 0) {\n    return (gap > tosp_ignore_big_gaps * row->xheight);\n  }\n  if (gap > tosp_ignore_very_big_gaps * row->xheight) {\n    return true;\n  }\n  if (tosp_ignore_big_gaps == 0) {\n    if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) {\n      return true;\n    }\n    if ((gap > 1.75 * row->xheight) &&\n        ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) {\n      return true;\n    }\n  } else {\n    /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table\n     */\n    if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) {\n      return true;\n    }\n  }\n  return false;\n}\n\n/**********************************************************************\n * reduced_box_next\n *\n * Compute the bounding box of this blob with merging of x overlaps\n * but no pre-chopping.\n * Then move the iterator on to the start of the 
next blob.\n * DON'T reduce the box for small things - eg punctuation.\n **********************************************************************/\nTBOX Textord::reduced_box_next(TO_ROW *row,    // current row\n                               BLOBNBOX_IT *it // iterator to blobds\n) {\n  BLOBNBOX *blob;             // current blob\n  BLOBNBOX *head_blob;        // place to store box\n  TBOX full_box;              // full blob boundg box\n  TBOX reduced_box;           // box of significant part\n  int16_t left_above_xht;     // ABOVE xht left limit\n  int16_t new_left_above_xht; // ABOVE xht left limit\n\n  blob = it->data();\n  if (blob->red_box_set()) {\n    reduced_box = blob->reduced_box();\n    do {\n      it->forward();\n      blob = it->data();\n    } while (blob->cblob() == nullptr || blob->joined_to_prev());\n    return reduced_box;\n  }\n  head_blob = blob;\n  full_box = blob->bounding_box();\n  reduced_box = reduced_box_for_blob(blob, row, &left_above_xht);\n  do {\n    it->forward();\n    blob = it->data();\n    if (blob->cblob() == nullptr) {\n      // was pre-chopped\n      full_box += blob->bounding_box();\n    } else if (blob->joined_to_prev()) {\n      reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht);\n      left_above_xht = std::min(left_above_xht, new_left_above_xht);\n    }\n  }\n  // until next real blob\n  while (blob->cblob() == nullptr || blob->joined_to_prev());\n\n  if ((reduced_box.width() > 0) &&\n      ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&\n      (reduced_box.height() > 0.7 * row->xheight)) {\n#ifndef GRAPHICS_DISABLED\n    if (textord_show_initial_words) {\n      reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);\n    }\n#endif\n  } else {\n    reduced_box = full_box;\n  }\n  head_blob->set_reduced_box(reduced_box);\n  return reduced_box;\n}\n\n/*************************************************************************\n * reduced_box_for_blob()\n * Find box 
for blob which is the same height and y position as the whole blob,\n * but whose left limit is the left most position of the blob ABOVE the\n * baseline and whose right limit is the right most position of the blob BELOW\n * the xheight.\n *\n *\n * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on\n *         \"home\".  Perhaps we need something which say if the width ABOVE the\n *         xht alone includes the whole of the reduced width, then use the full\n *         blob box - Might still fail on italic F\n *\n *         Alternatively we could be a little less severe and only reduce the\n *         left and right edges by half the difference between the full box and\n *         the reduced box.\n *\n * NOTE that we need to rotate all the coordinates as\n * find_blob_limits finds the y min and max within a specified x band\n *************************************************************************/\nTBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {\n  float baseline;\n  float blob_x_centre;\n  float left_limit;\n  float right_limit;\n  float junk;\n  TBOX blob_box;\n\n  /* Find baseline of centre of blob */\n\n  blob_box = blob->bounding_box();\n  blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0;\n  baseline = row->baseline.y(blob_x_centre);\n\n  /*\nFind LH limit of blob ABOVE the xht. 
This is so that we can detect certain\ncaps ht chars which should NOT have their box reduced: T, Y, V, W etc\n*/\n  left_limit = static_cast<float>(INT32_MAX);\n  junk = static_cast<float>(-INT32_MAX);\n  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX),\n                     left_limit, junk);\n  if (left_limit > junk) {\n    *left_above_xht = INT16_MAX; // No area above xht\n  } else {\n    *left_above_xht = static_cast<int16_t>(std::floor(left_limit));\n  }\n  /*\nFind reduced LH limit of blob - the left extent of the region ABOVE the\nbaseline.\n*/\n  left_limit = static_cast<float>(INT32_MAX);\n  junk = static_cast<float>(-INT32_MAX);\n  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk);\n\n  if (left_limit > junk) {\n    return TBOX(); // no area within xht so return empty box\n  }\n  /*\nFind reduced RH limit of blob - the right extent of the region BELOW the xht.\n*/\n  junk = static_cast<float>(INT32_MAX);\n  right_limit = static_cast<float>(-INT32_MAX);\n  find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk,\n                     right_limit);\n  if (junk > right_limit) {\n    return TBOX(); // no area within xht so return empty box\n  }\n\n  return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()),\n              ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top()));\n}\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/tovars.cpp",
    "content": "/**********************************************************************\n * File:        tovars.cpp  (Formerly to_vars.c)\n * Description: Variables used by textord.\n * Author:    Ray Smith\n * Created:   Tue Aug 24 16:55:02 BST 1993\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"tovars.h\"\n#include \"params.h\"\n\nnamespace tesseract {\n\nBOOL_VAR(textord_show_initial_words, false, \"Display separate words\");\nBOOL_VAR(textord_blocksall_fixed, false, \"Moan about prop blocks\");\nBOOL_VAR(textord_blocksall_prop, false, \"Moan about fixed pitch blocks\");\nINT_VAR(textord_dotmatrix_gap, 3, \"Max pixel gap for broken pixed pitch\");\nINT_VAR(textord_debug_block, 0, \"Block to do debug on\");\nINT_VAR(textord_pitch_range, 2, \"Max range test on pitch\");\ndouble_VAR(textord_wordstats_smooth_factor, 0.05, \"Smoothing gap stats\");\ndouble_VAR(textord_words_maxspace, 4.0, \"Multiple of xheight\");\ndouble_VAR(textord_words_default_maxspace, 3.5, \"Max believable third space\");\ndouble_VAR(textord_words_default_minspace, 0.6, \"Fraction of xheight\");\ndouble_VAR(textord_words_min_minspace, 0.3, \"Fraction of xheight\");\ndouble_VAR(textord_words_default_nonspace, 0.2, \"Fraction of xheight\");\ndouble_VAR(textord_words_initial_lower, 0.25, \"Max initial cluster 
size\");\ndouble_VAR(textord_words_initial_upper, 0.15, \"Min initial cluster spacing\");\ndouble_VAR(textord_words_minlarge, 0.75, \"Fraction of valid gaps needed\");\ndouble_VAR(textord_words_pitchsd_threshold, 0.040, \"Pitch sync threshold\");\ndouble_VAR(textord_words_def_fixed, 0.016, \"Threshold for definite fixed\");\ndouble_VAR(textord_words_def_prop, 0.090, \"Threshold for definite prop\");\nINT_VAR(textord_words_veto_power, 5, \"Rows required to outvote a veto\");\ndouble_VAR(textord_pitch_rowsimilarity, 0.08, \"Fraction of xheight for sameness\");\nBOOL_VAR(textord_pitch_scalebigwords, false, \"Scale scores on big words\");\ndouble_VAR(words_initial_lower, 0.5, \"Max initial cluster size\");\ndouble_VAR(words_initial_upper, 0.15, \"Min initial cluster spacing\");\ndouble_VAR(words_default_prop_nonspace, 0.25, \"Fraction of xheight\");\ndouble_VAR(words_default_fixed_space, 0.75, \"Fraction of xheight\");\ndouble_VAR(words_default_fixed_limit, 0.6, \"Allowed size variance\");\ndouble_VAR(textord_words_definite_spread, 0.30, \"Non-fuzzy spacing region\");\ndouble_VAR(textord_spacesize_ratioprop, 2.0, \"Min ratio space/nonspace\");\ndouble_VAR(textord_fpiqr_ratio, 1.5, \"Pitch IQR/Gap IQR threshold\");\ndouble_VAR(textord_max_pitch_iqr, 0.20, \"Xh fraction noise in pitch\");\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/tovars.h",
    "content": "/**********************************************************************\n * File:        tovars.h  (Formerly to_vars.h)\n * Description: Variables used by textord.\n * Author:    Ray Smith\n * Created:   Tue Aug 24 16:55:02 BST 1993\n *\n * (C) Copyright 1993, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TOVARS_H\n#define TOVARS_H\n\n#include \"params.h\"\n\nnamespace tesseract {\n\nextern BOOL_VAR_H(textord_show_initial_words);\nextern BOOL_VAR_H(textord_blocksall_fixed);\nextern BOOL_VAR_H(textord_blocksall_prop);\nextern INT_VAR_H(textord_dotmatrix_gap);\nextern INT_VAR_H(textord_debug_block);\nextern INT_VAR_H(textord_pitch_range);\nextern double_VAR_H(textord_wordstats_smooth_factor);\nextern double_VAR_H(textord_words_maxspace);\nextern double_VAR_H(textord_words_default_maxspace);\nextern double_VAR_H(textord_words_default_minspace);\nextern double_VAR_H(textord_words_min_minspace);\nextern double_VAR_H(textord_words_default_nonspace);\nextern double_VAR_H(textord_words_initial_lower);\nextern double_VAR_H(textord_words_initial_upper);\nextern double_VAR_H(textord_words_minlarge);\nextern double_VAR_H(textord_words_pitchsd_threshold);\nextern double_VAR_H(textord_words_def_fixed);\nextern double_VAR_H(textord_words_def_prop);\nextern INT_VAR_H(textord_words_veto_power);\nextern 
double_VAR_H(textord_pitch_rowsimilarity);\nextern BOOL_VAR_H(textord_pitch_scalebigwords);\nextern double_VAR_H(words_initial_lower);\nextern double_VAR_H(words_initial_upper);\nextern double_VAR_H(words_default_prop_nonspace);\nextern double_VAR_H(words_default_fixed_space);\nextern double_VAR_H(words_default_fixed_limit);\nextern double_VAR_H(textord_words_definite_spread);\nextern double_VAR_H(textord_spacesize_ratioprop);\nextern double_VAR_H(textord_fpiqr_ratio);\nextern double_VAR_H(textord_max_pitch_iqr);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/underlin.cpp",
    "content": "/**********************************************************************\n * File:        underlin.cpp  (Formerly undrline.c)\n * Description: Code to chop blobs apart from underlines.\n * Author:      Ray Smith\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"underlin.h\"\n\nnamespace tesseract {\n\ndouble_VAR(textord_underline_offset, 0.1, \"Fraction of x to ignore\");\nBOOL_VAR(textord_restore_underlines, true, \"Chop underlines & put back\");\n\n/**********************************************************************\n * restore_underlined_blobs\n *\n * Find underlined blobs and put them back in the row.\n **********************************************************************/\n\nvoid restore_underlined_blobs( // get chop points\n    TO_BLOCK *block            // block to do\n) {\n  int16_t chop_coord;        // chop boundary\n  TBOX blob_box;             // of underline\n  BLOBNBOX *u_line;          // underline bit\n  TO_ROW *row;               // best row for blob\n  ICOORDELT_LIST chop_cells; // blobs to cut out\n                             // real underlines\n  BLOBNBOX_LIST residual_underlines;\n  C_OUTLINE_LIST left_coutlines;\n  C_OUTLINE_LIST right_coutlines;\n  ICOORDELT_IT cell_it = &chop_cells;\n  // under lines\n  BLOBNBOX_IT under_it = &block->underlines;\n  BLOBNBOX_IT ru_it = 
&residual_underlines;\n\n  if (block->get_rows()->empty()) {\n    return; // Don't crash if there are no rows.\n  }\n  for (under_it.mark_cycle_pt(); !under_it.cycled_list(); under_it.forward()) {\n    u_line = under_it.extract();\n    blob_box = u_line->bounding_box();\n    row = most_overlapping_row(block->get_rows(), u_line);\n    if (row == nullptr) {\n      return; // Don't crash if there is no row.\n    }\n    find_underlined_blobs(u_line, &row->baseline, row->xheight,\n                          row->xheight * textord_underline_offset, &chop_cells);\n    cell_it.set_to_list(&chop_cells);\n    for (cell_it.mark_cycle_pt(); !cell_it.cycled_list(); cell_it.forward()) {\n      chop_coord = cell_it.data()->x();\n      if (cell_it.data()->y() - chop_coord > textord_fp_chop_error + 1) {\n        split_to_blob(u_line, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines,\n                      &right_coutlines);\n        if (!left_coutlines.empty()) {\n          ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines)));\n        }\n        chop_coord = cell_it.data()->y();\n        split_to_blob(nullptr, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines,\n                      &right_coutlines);\n        if (!left_coutlines.empty()) {\n          row->insert_blob(new BLOBNBOX(new C_BLOB(&left_coutlines)));\n        }\n        u_line = nullptr; // no more blobs to add\n      }\n      delete cell_it.extract();\n    }\n    if (!right_coutlines.empty()) {\n      split_to_blob(nullptr, blob_box.right(), textord_fp_chop_error + 0.5, &left_coutlines,\n                    &right_coutlines);\n      if (!left_coutlines.empty()) {\n        ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines)));\n      }\n    }\n    delete u_line;\n  }\n  if (!ru_it.empty()) {\n    ru_it.move_to_first();\n    for (ru_it.mark_cycle_pt(); !ru_it.cycled_list(); ru_it.forward()) {\n      under_it.add_after_then_move(ru_it.extract());\n    }\n  
}\n}\n\n/**********************************************************************\n * most_overlapping_row\n *\n * Return the row which most overlaps the blob.\n **********************************************************************/\n\nTO_ROW *most_overlapping_row( // find best row\n    TO_ROW_LIST *rows,        // list of rows\n    BLOBNBOX *blob            // blob to place\n) {\n  int16_t x = (blob->bounding_box().left() + blob->bounding_box().right()) / 2;\n  TO_ROW_IT row_it = rows; // row iterator\n  TO_ROW *row;             // current row\n  TO_ROW *best_row;        // output row\n  float overlap;           // of blob & row\n  float bestover;          // best overlap\n\n  best_row = nullptr;\n  bestover = static_cast<float>(-INT32_MAX);\n  if (row_it.empty()) {\n    return nullptr;\n  }\n  row = row_it.data();\n  row_it.mark_cycle_pt();\n  while (row->baseline.y(x) + row->descdrop > blob->bounding_box().top() && !row_it.cycled_list()) {\n    best_row = row;\n    bestover = blob->bounding_box().top() - row->baseline.y(x) + row->descdrop;\n    row_it.forward();\n    row = row_it.data();\n  }\n  while (row->baseline.y(x) + row->xheight + row->ascrise >= blob->bounding_box().bottom() &&\n         !row_it.cycled_list()) {\n    overlap = row->baseline.y(x) + row->xheight + row->ascrise;\n    if (blob->bounding_box().top() < overlap) {\n      overlap = blob->bounding_box().top();\n    }\n    if (blob->bounding_box().bottom() > row->baseline.y(x) + row->descdrop) {\n      overlap -= blob->bounding_box().bottom();\n    } else {\n      overlap -= row->baseline.y(x) + row->descdrop;\n    }\n    if (overlap > bestover) {\n      bestover = overlap;\n      best_row = row;\n    }\n    row_it.forward();\n    row = row_it.data();\n  }\n  if (bestover < 0 &&\n      row->baseline.y(x) + row->xheight + row->ascrise - blob->bounding_box().bottom() > bestover) {\n    best_row = row;\n  }\n  return 
best_row;\n}\n\n/**********************************************************************\n * find_underlined_blobs\n *\n * Find the start and end coords of blobs in the underline.\n **********************************************************************/\n\nvoid find_underlined_blobs(    // get chop points\n    BLOBNBOX *u_line,          // underlined unit\n    QSPLINE *baseline,         // actual baseline\n    float xheight,             // height of line\n    float baseline_offset,     // amount to shrinke it\n    ICOORDELT_LIST *chop_cells // places to chop\n) {\n  ICOORD blob_chop; // sides of blob\n  TBOX blob_box = u_line->bounding_box();\n  // cell iterator\n  ICOORDELT_IT cell_it = chop_cells;\n  STATS upper_proj(blob_box.left(), blob_box.right());\n  STATS middle_proj(blob_box.left(), blob_box.right());\n  STATS lower_proj(blob_box.left(), blob_box.right());\n  C_OUTLINE_IT out_it; // outlines of blob\n\n  ASSERT_HOST(u_line->cblob() != nullptr);\n\n  out_it.set_to_list(u_line->cblob()->out_list());\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, &lower_proj,\n                                   &middle_proj, &upper_proj);\n  }\n\n  for (auto x = blob_box.left(); x < blob_box.right(); x++) {\n    if (middle_proj.pile_count(x) > 0) {\n      auto y = x + 1;\n      for (; y < blob_box.right() && middle_proj.pile_count(y) > 0; y++) {\n        ;\n      }\n      blob_chop = ICOORD(x, y);\n      cell_it.add_after_then_move(new ICOORDELT(blob_chop));\n      x = y;\n    }\n  }\n}\n\n/**********************************************************************\n * vertical_cunderline_projection\n *\n * Compute the vertical projection of an outline from its outlines\n * and add to the given STATS.\n **********************************************************************/\n\nvoid vertical_cunderline_projection( // project outlines\n    C_OUTLINE *outline,             
 // outline to project\n    QSPLINE *baseline,               // actual baseline\n    float xheight,                   // height of line\n    float baseline_offset,           // amount to shrinke it\n    STATS *lower_proj,               // below baseline\n    STATS *middle_proj,              // centre region\n    STATS *upper_proj                // top region\n) {\n  ICOORD pos;               // current point\n  ICOORD step;              // edge step\n  int16_t lower_y, upper_y; // region limits\n  C_OUTLINE_IT out_it = outline->child();\n\n  pos = outline->start_pos();\n  int16_t length = outline->pathlength();\n  for (int16_t stepindex = 0; stepindex < length; stepindex++) {\n    step = outline->step(stepindex);\n    if (step.x() > 0) {\n      lower_y = static_cast<int16_t>(floor(baseline->y(pos.x()) + baseline_offset + 0.5));\n      upper_y = static_cast<int16_t>(floor(baseline->y(pos.x()) + baseline_offset + xheight + 0.5));\n      if (pos.y() >= lower_y) {\n        lower_proj->add(pos.x(), -lower_y);\n        if (pos.y() >= upper_y) {\n          middle_proj->add(pos.x(), lower_y - upper_y);\n          upper_proj->add(pos.x(), upper_y - pos.y());\n        } else {\n          middle_proj->add(pos.x(), lower_y - pos.y());\n        }\n      } else {\n        lower_proj->add(pos.x(), -pos.y());\n      }\n    } else if (step.x() < 0) {\n      lower_y = static_cast<int16_t>(floor(baseline->y(pos.x() - 1) + baseline_offset + 0.5));\n      upper_y =\n          static_cast<int16_t>(floor(baseline->y(pos.x() - 1) + baseline_offset + xheight + 0.5));\n      if (pos.y() >= lower_y) {\n        lower_proj->add(pos.x() - 1, lower_y);\n        if (pos.y() >= upper_y) {\n          middle_proj->add(pos.x() - 1, upper_y - lower_y);\n          upper_proj->add(pos.x() - 1, pos.y() - upper_y);\n        } else {\n          middle_proj->add(pos.x() - 1, pos.y() - lower_y);\n        }\n      } else {\n        lower_proj->add(pos.x() - 1, pos.y());\n      }\n    }\n    pos += step;\n  
}\n\n  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {\n    vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, lower_proj,\n                                   middle_proj, upper_proj);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/underlin.h",
    "content": "/**********************************************************************\n * File:        underlin.h  (Formerly undrline.h)\n * Description: Code to chop blobs apart from underlines.\n * Author:      Ray Smith\n *\n * (C) Copyright 1994, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef UNDERLIN_H\n#define UNDERLIN_H\n\n#include \"fpchop.h\"\n\nnamespace tesseract {\n\nextern double_VAR_H(textord_underline_offset);\nextern BOOL_VAR_H(textord_restore_underlines);\nvoid restore_underlined_blobs( // get chop points\n    TO_BLOCK *block            // block to do\n);\nTO_ROW *most_overlapping_row( // find best row\n    TO_ROW_LIST *rows,        // list of rows\n    BLOBNBOX *blob            // blob to place\n);\nvoid find_underlined_blobs(    // get chop points\n    BLOBNBOX *u_line,          // underlined unit\n    QSPLINE *baseline,         // actual baseline\n    float xheight,             // height of line\n    float baseline_offset,     // amount to shrinke it\n    ICOORDELT_LIST *chop_cells // places to chop\n);\nvoid vertical_cunderline_projection( // project outlines\n    C_OUTLINE *outline,              // outline to project\n    QSPLINE *baseline,               // actual baseline\n    float xheight,                   // height of line\n    float baseline_offset,           // amount to shrinke it\n    STATS *lower_proj,             
  // below baseline\n    STATS *middle_proj,              // centre region\n    STATS *upper_proj                // top region\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/wordseg.cpp",
    "content": "/**********************************************************************\n * File:        wordseg.cpp  (Formerly wspace.c)\n * Description: Code to segment the blobs into words.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"wordseg.h\"\n\n#include <cmath>\n\n#include \"blobbox.h\"\n#include \"cjkpitch.h\"\n#include \"drawtord.h\"\n#include \"fpchop.h\"\n#include \"makerow.h\"\n#include \"pitsync1.h\"\n#include \"statistc.h\"\n#include \"textord.h\"\n#include \"topitch.h\"\n#include \"tovars.h\"\n\nnamespace tesseract {\n\nBOOL_VAR(textord_force_make_prop_words, false, \"Force proportional word segmentation on all rows\");\nBOOL_VAR(textord_chopper_test, false, \"Chopper is being tested.\");\n\n#define BLOCK_STATS_CLUSTERS 10\n\n/**\n * @name make_single_word\n *\n * For each row, arrange the blobs into one word. 
There is no fixed\n * pitch detection.\n */\n\nvoid make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {\n  TO_ROW_IT to_row_it(rows);\n  ROW_IT row_it(real_rows);\n  for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {\n    TO_ROW *row = to_row_it.data();\n    // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready\n    // to create the word.\n    C_BLOB_LIST cblobs;\n    C_BLOB_IT cblob_it(&cblobs);\n    BLOBNBOX_IT box_it(row->blob_list());\n    for (; !box_it.empty(); box_it.forward()) {\n      BLOBNBOX *bblob = box_it.extract();\n      if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {\n        auto cblob = bblob->remove_cblob();\n        if (cblob != nullptr) {\n          C_OUTLINE_IT cout_it(cblob_it.data()->out_list());\n          cout_it.move_to_last();\n          cout_it.add_list_after(cblob->out_list());\n          delete cblob;\n        }\n      } else {\n        auto cblob = bblob->remove_cblob();\n        if (cblob != nullptr) {\n          cblob_it.add_after_then_move(cblob);\n        }\n      }\n      delete bblob;\n    }\n    // Convert the TO_ROW to a ROW.\n    ROW *real_row =\n        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));\n    WERD_IT word_it(real_row->word_list());\n    WERD *word = new WERD(&cblobs, 0, nullptr);\n    word->set_flag(W_BOL, true);\n    word->set_flag(W_EOL, true);\n    word->set_flag(W_DONT_CHOP, one_blob);\n    word_it.add_after_then_move(word);\n    real_row->recalc_bounding_box();\n    row_it.add_after_then_move(real_row);\n  }\n}\n\n/**\n * make_words\n *\n * Arrange the blobs into words.\n */\nvoid make_words(tesseract::Textord *textord,\n                ICOORD page_tr,               // top right\n                float gradient,               // page skew\n                BLOCK_LIST *blocks,           // block list\n                TO_BLOCK_LIST *port_blocks) { // output list\n  TO_BLOCK_IT 
block_it;                       // iterator\n  TO_BLOCK *block;                            // current block\n\n  if (textord->use_cjk_fp_model()) {\n    compute_fixed_pitch_cjk(page_tr, port_blocks);\n  } else {\n    compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),\n                        !bool(textord_test_landscape));\n  }\n  textord->to_spacing(page_tr, port_blocks);\n  block_it.set_to_list(port_blocks);\n  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {\n    block = block_it.data();\n    make_real_words(textord, block, FCOORD(1.0f, 0.0f));\n  }\n}\n\n/**\n * @name set_row_spaces\n *\n * Set the min_space and max_nonspace members of the row so that\n * the blobs can be arranged into words.\n */\n\nvoid set_row_spaces( // find space sizes\n    TO_BLOCK *block, // block to do\n    FCOORD rotation, // for drawing\n    bool testing_on  // correct orientation\n) {\n  TO_ROW *row; // current row\n  TO_ROW_IT row_it = block->get_rows();\n\n  if (row_it.empty()) {\n    return; // empty block\n  }\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    if (row->fixed_pitch == 0) {\n      row->min_space = static_cast<int32_t>(\n          ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));\n      row->max_nonspace = static_cast<int32_t>(\n          floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));\n      if (testing_on && textord_show_initial_words) {\n        tprintf(\"Assigning defaults %d non, %d space to row at %g\\n\", row->max_nonspace,\n                row->min_space, row->intercept());\n      }\n      row->space_threshold = (row->max_nonspace + row->min_space) / 2;\n      row->space_size = row->pr_space;\n      row->kern_size = row->pr_nonsp;\n    }\n#ifndef GRAPHICS_DISABLED\n    if (textord_show_initial_words && testing_on) {\n      plot_word_decisions(to_win, 
static_cast<int16_t>(row->fixed_pitch), row);\n    }\n#endif\n  }\n}\n\n/**\n * @name row_words\n *\n * Compute the max nonspace and min space for the row.\n */\n\nint32_t row_words(    // compute space size\n    TO_BLOCK *block,  // block it came from\n    TO_ROW *row,      // row to operate on\n    int32_t maxwidth, // max expected space size\n    FCOORD rotation,  // for drawing\n    bool testing_on   // for debug\n) {\n  bool testing_row;      // contains testpt\n  bool prev_valid;       // if decent size\n  int32_t prev_x;        // end of prev blob\n  int32_t cluster_count; // no of clusters\n  int32_t gap_index;     // which cluster\n  int32_t smooth_factor; // for smoothing stats\n  BLOBNBOX *blob;        // current blob\n  float lower, upper;    // clustering parameters\n  float gaps[3];         // gap clusers\n  ICOORD testpt;\n  TBOX blob_box; // bounding box\n                 // iterator\n  BLOBNBOX_IT blob_it = row->blob_list();\n  STATS gap_stats(0, maxwidth - 1);\n  STATS cluster_stats[4]; // clusters\n\n  testpt = ICOORD(textord_test_x, textord_test_y);\n  smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);\n  //      if (testing_on)\n  //              tprintf(\"Row smooth factor=%d\\n\",smooth_factor);\n  prev_valid = false;\n  prev_x = -INT32_MAX;\n  testing_row = false;\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    blob = blob_it.data();\n    blob_box = blob->bounding_box();\n    if (blob_box.contains(testpt)) {\n      testing_row = true;\n    }\n    gap_stats.add(blob_box.width(), 1);\n  }\n  gap_stats.clear();\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    blob = blob_it.data();\n    if (!blob->joined_to_prev()) {\n      blob_box = blob->bounding_box();\n      if (prev_valid && blob_box.left() - prev_x < maxwidth) {\n        gap_stats.add(blob_box.left() - prev_x, 1);\n      }\n      prev_valid = true;\n      prev_x = 
blob_box.right();\n    }\n  }\n  if (gap_stats.get_total() == 0) {\n    row->min_space = 0; // no evidence\n    row->max_nonspace = 0;\n    return 0;\n  }\n  gap_stats.smooth(smooth_factor);\n  lower = row->xheight * textord_words_initial_lower;\n  upper = row->xheight * textord_words_initial_upper;\n  cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);\n  while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {\n    // shrink gap\n    upper = (upper * 3 + lower) / 4;\n    lower = (lower * 3 + upper) / 4;\n    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);\n  }\n  if (cluster_count < 2) {\n    row->min_space = 0; // no evidence\n    row->max_nonspace = 0;\n    return 0;\n  }\n  for (gap_index = 0; gap_index < cluster_count; gap_index++) {\n    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);\n  }\n  // get medians\n  if (cluster_count > 2) {\n    if (testing_on && textord_show_initial_words) {\n      tprintf(\"Row at %g has 3 sizes of gap:%g,%g,%g\\n\", row->intercept(),\n              cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));\n    }\n    lower = gaps[0];\n    if (gaps[1] > lower) {\n      upper = gaps[1]; // prefer most frequent\n      if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {\n        upper = gaps[2];\n      }\n    } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {\n      upper = gaps[2];\n    } else if (lower >= block->xheight * textord_words_min_minspace) {\n      upper = lower; // not nice\n      lower = gaps[1];\n      if (testing_on && textord_show_initial_words) {\n        tprintf(\"Had to switch most common from lower to upper!!\\n\");\n        gap_stats.print();\n      }\n    } else {\n      row->min_space = 0; // no evidence\n      row->max_nonspace = 0;\n      return 0;\n    }\n  } else {\n    if (gaps[1] < gaps[0]) {\n      if 
(testing_on && textord_show_initial_words) {\n        tprintf(\"Had to switch most common from lower to upper!!\\n\");\n        gap_stats.print();\n      }\n      lower = gaps[1];\n      upper = gaps[0];\n    } else {\n      upper = gaps[1];\n      lower = gaps[0];\n    }\n  }\n  if (upper < block->xheight * textord_words_min_minspace) {\n    row->min_space = 0; // no evidence\n    row->max_nonspace = 0;\n    return 0;\n  }\n  if (upper * 3 < block->min_space * 2 + block->max_nonspace ||\n      lower * 3 > block->min_space * 2 + block->max_nonspace) {\n    if (testing_on && textord_show_initial_words) {\n      tprintf(\"Disagreement between block and row at %g!!\\n\", row->intercept());\n      tprintf(\"Lower=%g, upper=%g, Stats:\\n\", lower, upper);\n      gap_stats.print();\n    }\n  }\n  row->min_space =\n      static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));\n  row->max_nonspace =\n      static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));\n  row->space_threshold = (row->max_nonspace + row->min_space) / 2;\n  row->space_size = upper;\n  row->kern_size = lower;\n  if (testing_on && textord_show_initial_words) {\n    if (testing_row) {\n      tprintf(\"GAP STATS\\n\");\n      gap_stats.print();\n      tprintf(\"SPACE stats\\n\");\n      cluster_stats[2].print_summary();\n      tprintf(\"NONSPACE stats\\n\");\n      cluster_stats[1].print_summary();\n    }\n    tprintf(\"Row at %g has minspace=%d(%g), max_non=%d(%g)\\n\", row->intercept(), row->min_space,\n            upper, row->max_nonspace, lower);\n  }\n  return cluster_stats[2].get_total();\n}\n\n/**\n * @name row_words2\n *\n * Compute the max nonspace and min space for the row.\n */\n\nint32_t row_words2(   // compute space size\n    TO_BLOCK *block,  // block it came from\n    TO_ROW *row,      // row to operate on\n    int32_t maxwidth, // max expected space size\n    FCOORD rotation,  // for drawing\n    bool testing_on   // for 
debug\n) {\n  bool prev_valid;       // if decent size\n  bool this_valid;       // current blob big enough\n  int32_t prev_x;        // end of prev blob\n  int32_t min_width;     // min interesting width\n  int32_t valid_count;   // good gaps\n  int32_t total_count;   // total gaps\n  int32_t cluster_count; // no of clusters\n  int32_t prev_count;    // previous cluster_count\n  int32_t gap_index;     // which cluster\n  int32_t smooth_factor; // for smoothing stats\n  BLOBNBOX *blob;        // current blob\n  float lower, upper;    // clustering parameters\n  ICOORD testpt;\n  TBOX blob_box; // bounding box\n                 // iterator\n  BLOBNBOX_IT blob_it = row->blob_list();\n  STATS gap_stats(0, maxwidth - 1);\n  // gap sizes\n  float gaps[BLOCK_STATS_CLUSTERS];\n  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];\n  // clusters\n\n  testpt = ICOORD(textord_test_x, textord_test_y);\n  smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);\n  //      if (testing_on)\n  //              tprintf(\"Row smooth factor=%d\\n\",smooth_factor);\n  prev_valid = false;\n  prev_x = -INT16_MAX;\n  const bool testing_row = false;\n  // min blob size\n  min_width = static_cast<int32_t>(block->pr_space);\n  total_count = 0;\n  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n    blob = blob_it.data();\n    if (!blob->joined_to_prev()) {\n      blob_box = blob->bounding_box();\n      this_valid = blob_box.width() >= min_width;\n      if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {\n        gap_stats.add(blob_box.left() - prev_x, 1);\n      }\n      total_count++; // count possibles\n      prev_x = blob_box.right();\n      prev_valid = this_valid;\n    }\n  }\n  valid_count = gap_stats.get_total();\n  if (valid_count < total_count * textord_words_minlarge) {\n    gap_stats.clear();\n    prev_x = -INT16_MAX;\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      
blob = blob_it.data();\n      if (!blob->joined_to_prev()) {\n        blob_box = blob->bounding_box();\n        if (blob_box.left() - prev_x < maxwidth) {\n          gap_stats.add(blob_box.left() - prev_x, 1);\n        }\n        prev_x = blob_box.right();\n      }\n    }\n  }\n  if (gap_stats.get_total() == 0) {\n    row->min_space = 0; // no evidence\n    row->max_nonspace = 0;\n    return 0;\n  }\n\n  cluster_count = 0;\n  lower = block->xheight * words_initial_lower;\n  upper = block->xheight * words_initial_upper;\n  gap_stats.smooth(smooth_factor);\n  do {\n    prev_count = cluster_count;\n    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,\n                                      BLOCK_STATS_CLUSTERS, cluster_stats);\n  } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);\n  if (cluster_count < 1) {\n    row->min_space = 0;\n    row->max_nonspace = 0;\n    return 0;\n  }\n  for (gap_index = 0; gap_index < cluster_count; gap_index++) {\n    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);\n  }\n  // get medians\n  if (testing_on) {\n    tprintf(\"cluster_count=%d:\", cluster_count);\n    for (gap_index = 0; gap_index < cluster_count; gap_index++) {\n      tprintf(\" %g(%d)\", gaps[gap_index], cluster_stats[gap_index + 1].get_total());\n    }\n    tprintf(\"\\n\");\n  }\n\n  // Try to find proportional non-space and space for row.\n  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;\n       gap_index++) {\n    ;\n  }\n  if (gap_index < cluster_count) {\n    lower = gaps[gap_index]; // most frequent below\n  } else {\n    if (testing_on) {\n      tprintf(\"No cluster below block threshold!, using default=%g\\n\", block->pr_nonsp);\n    }\n    lower = block->pr_nonsp;\n  }\n  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;\n       gap_index++) {\n    ;\n  }\n  if (gap_index < cluster_count) {\n    upper = gaps[gap_index]; // 
most frequent above\n  } else {\n    if (testing_on) {\n      tprintf(\"No cluster above block threshold!, using default=%g\\n\", block->pr_space);\n    }\n    upper = block->pr_space;\n  }\n  row->min_space =\n      static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));\n  row->max_nonspace =\n      static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));\n  row->space_threshold = (row->max_nonspace + row->min_space) / 2;\n  row->space_size = upper;\n  row->kern_size = lower;\n  if (testing_on) {\n    if (testing_row) {\n      tprintf(\"GAP STATS\\n\");\n      gap_stats.print();\n      tprintf(\"SPACE stats\\n\");\n      cluster_stats[2].print_summary();\n      tprintf(\"NONSPACE stats\\n\");\n      cluster_stats[1].print_summary();\n    }\n    tprintf(\"Row at %g has minspace=%d(%g), max_non=%d(%g)\\n\", row->intercept(), row->min_space,\n            upper, row->max_nonspace, lower);\n  }\n  return 1;\n}\n\n/**\n * @name make_real_words\n *\n * Convert a TO_BLOCK to a BLOCK.\n */\n\nvoid make_real_words(tesseract::Textord *textord,\n                     TO_BLOCK *block, // block to do\n                     FCOORD rotation  // for drawing\n) {\n  TO_ROW *row; // current row\n  TO_ROW_IT row_it = block->get_rows();\n  ROW *real_row = nullptr; // output row\n  ROW_IT real_row_it = block->block->row_list();\n\n  if (row_it.empty()) {\n    return; // empty block\n  }\n  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {\n    row = row_it.data();\n    if (row->blob_list()->empty() && !row->rep_words.empty()) {\n      real_row = make_rep_words(row, block);\n    } else if (!row->blob_list()->empty()) {\n      // In a fixed pitch document, some lines may be detected as fixed pitch\n      // while others don't, and will go through different path.\n      // For non-space delimited language like CJK, fixed pitch chop always\n      // leave the entire line as one word.  
We can force consistent chopping\n      // with force_make_prop_words flag.\n      POLY_BLOCK *pb = block->block->pdblk.poly_block();\n      if (textord_chopper_test) {\n        real_row = textord->make_blob_words(row, rotation);\n      } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||\n                 row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) {\n        real_row = textord->make_prop_words(row, rotation);\n      } else if (row->pitch_decision == PITCH_DEF_FIXED ||\n                 row->pitch_decision == PITCH_CORR_FIXED) {\n        real_row = fixed_pitch_words(row, rotation);\n      } else {\n        ASSERT_HOST(false);\n      }\n    }\n    if (real_row != nullptr) {\n      // put row in block\n      real_row_it.add_after_then_move(real_row);\n    }\n  }\n  block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),\n                          static_cast<int16_t>(block->space_size),\n                          static_cast<int16_t>(block->fixed_pitch));\n  block->block->check_pitch();\n}\n\n/**\n * @name make_rep_words\n *\n * Fabricate a real row from only the repeated blob words.\n * Get the xheight from the block as it may be more meaningful.\n */\n\nROW *make_rep_words( // make a row\n    TO_ROW *row,     // row to convert\n    TO_BLOCK *block  // block it lives in\n) {\n  ROW *real_row; // output row\n  TBOX word_box; // bounding box\n                 // iterator\n  WERD_IT word_it = &row->rep_words;\n\n  if (word_it.empty()) {\n    return nullptr;\n  }\n  word_box = word_it.data()->bounding_box();\n  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {\n    word_box += word_it.data()->bounding_box();\n  }\n  row->xheight = block->xheight;\n  real_row =\n      new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));\n  word_it.set_to_list(real_row->word_list());\n  // put words in row\n  
word_it.add_list_after(&row->rep_words);\n  real_row->recalc_bounding_box();\n  return real_row;\n}\n\n/**\n * @name make_real_word\n *\n * Construct a WERD from a given number of adjacent entries in a\n * list of BLOBNBOXs.\n */\n\nWERD *make_real_word(BLOBNBOX_IT *box_it, // iterator\n                     int32_t blobcount,   // no of blobs to use\n                     bool bol,            // start of line\n                     uint8_t blanks       // no of blanks\n) {\n  C_OUTLINE_IT cout_it;\n  C_BLOB_LIST cblobs;\n  C_BLOB_IT cblob_it = &cblobs;\n\n  for (int blobindex = 0; blobindex < blobcount; blobindex++) {\n    auto bblob = box_it->extract();\n    if (bblob->joined_to_prev()) {\n      auto cblob = bblob->remove_cblob();\n      if (cblob != nullptr) {\n        cout_it.set_to_list(cblob_it.data()->out_list());\n        cout_it.move_to_last();\n        cout_it.add_list_after(cblob->out_list());\n        delete cblob;\n      }\n    } else {\n      auto cblob = bblob->remove_cblob();\n      if (cblob != nullptr) {\n        cblob_it.add_after_then_move(cblob);\n      }\n    }\n    delete bblob;\n    box_it->forward(); // next one\n  }\n\n  if (blanks < 1) {\n    blanks = 1;\n  }\n\n  auto word = new WERD(&cblobs, blanks, nullptr);\n\n  if (bol) {\n    word->set_flag(W_BOL, true);\n  }\n  if (box_it->at_first()) {\n    word->set_flag(W_EOL, true); // at end of line\n  }\n\n  return word;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/textord/wordseg.h",
    "content": "/**********************************************************************\n * File:        wordseg.h  (Formerly wspace.h)\n * Description: Code to segment the blobs into words.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef WORDSEG_H\n#define WORDSEG_H\n\n#include \"blobbox.h\"\n#include \"params.h\"\n#include \"textord.h\"\n\nnamespace tesseract {\nclass Tesseract;\n\nextern BOOL_VAR_H(textord_force_make_prop_words);\nextern BOOL_VAR_H(textord_chopper_test);\n\nvoid make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows);\nvoid make_words(tesseract::Textord *textord,\n                ICOORD page_tr,              // top right\n                float gradient,              // page skew\n                BLOCK_LIST *blocks,          // block list\n                TO_BLOCK_LIST *port_blocks); // output list\nvoid set_row_spaces(                         // find space sizes\n    TO_BLOCK *block,                         // block to do\n    FCOORD rotation,                         // for drawing\n    bool testing_on                          // correct orientation\n);\nint32_t row_words(    // compute space size\n    TO_BLOCK *block,  // block it came from\n    TO_ROW *row,      // row to operate on\n    int32_t maxwidth, // max expected space size\n    FCOORD rotation,  // 
for drawing\n    bool testing_on   // for debug\n);\nint32_t row_words2(   // compute space size\n    TO_BLOCK *block,  // block it came from\n    TO_ROW *row,      // row to operate on\n    int32_t maxwidth, // max expected space size\n    FCOORD rotation,  // for drawing\n    bool testing_on   // for debug\n);\nvoid make_real_words(tesseract::Textord *textord,\n                     TO_BLOCK *block, // block to do\n                     FCOORD rotation  // for drawing\n);\nROW *make_rep_words( // make a row\n    TO_ROW *row,     // row to convert\n    TO_BLOCK *block  // block it lives in\n);\nWERD *make_real_word(    // make a WERD\n    BLOBNBOX_IT *box_it, // iterator\n    int32_t blobcount,   // no of blobs to use\n    bool bol,            // start of line\n    uint8_t blanks       // no of blanks\n);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/textord/workingpartset.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        workingpartset.cpp\n// Description: Class to hold a working set of partitions of the page\n//              during construction of text/image regions.\n// Author:      Ray Smith\n// Created:     Tue Ocr 28 17:21:01 PDT 2008\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"workingpartset.h\"\n#include \"colpartition.h\"\n\nnamespace tesseract {\n\n// Add the partition to this WorkingPartSet. 
Partitions are generally\n// stored in the order in which they are received, but if the partition\n// has a SingletonPartner, make sure that it stays with its partner.\nvoid WorkingPartSet::AddPartition(ColPartition *part) {\n  ColPartition *partner = part->SingletonPartner(true);\n  if (partner != nullptr) {\n    ASSERT_HOST(partner->SingletonPartner(false) == part);\n  }\n  if (latest_part_ == nullptr || partner == nullptr) {\n    // This partition goes at the end of the list\n    part_it_.move_to_last();\n  } else if (latest_part_->SingletonPartner(false) != part) {\n    // Reposition the iterator to the correct partner, or at the end.\n    for (part_it_.move_to_first(); !part_it_.at_last() && part_it_.data() != partner;\n         part_it_.forward()) {\n      ;\n    }\n  }\n  part_it_.add_after_then_move(part);\n  latest_part_ = part;\n}\n\n// Make blocks out of any partitions in this WorkingPartSet, and append\n// them to the end of the blocks list. bleft, tright and resolution give\n// the bounds and resolution of the source image, so that blocks can be\n// made to fit in the bounds.\n// All ColPartitions go in the used_parts list, as they need to be kept\n// around, but are no longer needed.\nvoid WorkingPartSet::ExtractCompletedBlocks(const ICOORD &bleft, const ICOORD &tright,\n                                            int resolution, ColPartition_LIST *used_parts,\n                                            BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {\n  MakeBlocks(bleft, tright, resolution, used_parts);\n  BLOCK_IT block_it(blocks);\n  block_it.move_to_last();\n  block_it.add_list_after(&completed_blocks_);\n  TO_BLOCK_IT to_block_it(to_blocks);\n  to_block_it.move_to_last();\n  to_block_it.add_list_after(&to_blocks_);\n}\n\n// Insert the given blocks at the front of the completed_blocks_ list so\n// they can be kept in the correct reading order.\nvoid WorkingPartSet::InsertCompletedBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {\n  BLOCK_IT 
block_it(&completed_blocks_);\n  block_it.add_list_before(blocks);\n  TO_BLOCK_IT to_block_it(&to_blocks_);\n  to_block_it.add_list_before(to_blocks);\n}\n\n// Make a block using lines parallel to the given vector that fit between\n// the min and max coordinates specified by the ColPartitions.\n// Construct a block from the given list of partitions.\nvoid WorkingPartSet::MakeBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution,\n                                ColPartition_LIST *used_parts) {\n  part_it_.move_to_first();\n  while (!part_it_.empty()) {\n    // Gather a list of ColPartitions in block_parts that will be split\n    // by linespacing into smaller blocks.\n    ColPartition_LIST block_parts;\n    ColPartition_IT block_it(&block_parts);\n    ColPartition *next_part = nullptr;\n    bool text_block = false;\n    do {\n      ColPartition *part = part_it_.extract();\n      if (part->blob_type() == BRT_UNKNOWN || (part->IsTextType() && part->type() != PT_TABLE)) {\n        text_block = true;\n      }\n      part->set_working_set(nullptr);\n      part_it_.forward();\n      block_it.add_after_then_move(part);\n      next_part = part->SingletonPartner(false);\n      if (part_it_.empty() || next_part != part_it_.data()) {\n        // Sequences of partitions can get split by titles.\n        next_part = nullptr;\n      }\n      // Merge adjacent blocks that are of the same type and let the\n      // linespacing determine the real boundaries.\n      if (next_part == nullptr && !part_it_.empty()) {\n        ColPartition *next_block_part = part_it_.data();\n        const TBOX &part_box = part->bounding_box();\n        const TBOX &next_box = next_block_part->bounding_box();\n\n        // In addition to the same type, the next box must not be above the\n        // current box, nor (if image) too far below.\n        PolyBlockType type = part->type(), next_type = next_block_part->type();\n        if (ColPartition::TypesSimilar(type, next_type) && 
!part->IsLineType() &&\n            !next_block_part->IsLineType() && next_box.bottom() <= part_box.top() &&\n            (text_block || part_box.bottom() <= next_box.top())) {\n          next_part = next_block_part;\n        }\n      }\n    } while (!part_it_.empty() && next_part != nullptr);\n    if (!text_block) {\n      TO_BLOCK *to_block = ColPartition::MakeBlock(bleft, tright, &block_parts, used_parts);\n      if (to_block != nullptr) {\n        TO_BLOCK_IT to_block_it(&to_blocks_);\n        to_block_it.add_to_end(to_block);\n        BLOCK_IT block_it(&completed_blocks_);\n        block_it.add_to_end(to_block->block);\n      }\n    } else {\n      // Further sub-divide text blocks where linespacing changes.\n      ColPartition::LineSpacingBlocks(bleft, tright, resolution, &block_parts, used_parts,\n                                      &completed_blocks_, &to_blocks_);\n    }\n  }\n  part_it_.set_to_list(&part_set_);\n  latest_part_ = nullptr;\n  ASSERT_HOST(completed_blocks_.length() == to_blocks_.length());\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/textord/workingpartset.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        workingpartset.h\n// Description: Class to hold a working set of partitions of the page\n//              during construction of text/image regions.\n// Author:      Ray Smith\n// Created:     Tue Ocr 28 17:21:01 PDT 2008\n//\n// (C) Copyright 2008, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TEXTORD_WORKINGPARSET_H_\n#define TESSERACT_TEXTORD_WORKINGPARSET_H_\n\n#include \"blobbox.h\"      // For TO_BLOCK_LIST and BLOCK_LIST.\n#include \"colpartition.h\" // For ColPartition_LIST.\n\nnamespace tesseract {\n\n// WorkingPartSet holds a working set of ColPartitions during transformation\n// from the grid-based storage to regions in logical reading order, and is\n// therefore only used during construction of the regions.\nclass WorkingPartSet : public ELIST<WorkingPartSet>::LINK {\npublic:\n  explicit WorkingPartSet(ColPartition *column)\n      : column_(column), latest_part_(nullptr), part_it_(&part_set_) {}\n\n  // Simple accessors.\n  ColPartition *column() const {\n    return column_;\n  }\n  void set_column(ColPartition *col) {\n    column_ = col;\n  }\n\n  // Add the partition to this WorkingPartSet. 
Partitions are generally\n  // stored in the order in which they are received, but if the partition\n  // has a SingletonPartner, make sure that it stays with its partner.\n  void AddPartition(ColPartition *part);\n\n  // Make blocks out of any partitions in this WorkingPartSet, and append\n  // them to the end of the blocks list. bleft, tright and resolution give\n  // the bounds and resolution of the source image, so that blocks can be\n  // made to fit in the bounds.\n  // All ColPartitions go in the used_parts list, as they need to be kept\n  // around, but are no longer needed.\n  void ExtractCompletedBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution,\n                              ColPartition_LIST *used_parts, BLOCK_LIST *blocks,\n                              TO_BLOCK_LIST *to_blocks);\n\n  // Insert the given blocks at the front of the completed_blocks_ list so\n  // they can be kept in the correct reading order.\n  void InsertCompletedBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);\n\nprivate:\n  // Convert the part_set_ into blocks, starting a new block at a break\n  // in partnerships, or a change in linespacing (for text).\n  void MakeBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution,\n                  ColPartition_LIST *used_parts);\n\n  // The column that this working set applies to. Used by the caller.\n  ColPartition *column_;\n  // The most recently added partition.\n  ColPartition *latest_part_;\n  // All the partitions in the block that is currently under construction.\n  ColPartition_LIST part_set_;\n  // Iterator on part_set_ pointing to the most recent addition.\n  ColPartition_IT part_it_;\n  // The blocks that have been made so far and belong before the current block.\n  BLOCK_LIST completed_blocks_;\n  TO_BLOCK_LIST to_blocks_;\n};\n\nELISTIZEH(WorkingPartSet)\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TEXTORD_WORKINGPARSET_H_\n"
  },
  {
    "path": "src/training/CMakeLists.txt",
    "content": "#\n# tesseract training tools\n#\nif(NOT ${CMAKE_VERSION} VERSION_LESS \"3.12.0\")\n  cmake_policy(SET CMP0074 NEW)\nendif()\n\n# Include build optimizations\ninclude(BuildOptimizations)\n\nif(SW_BUILD)\n  set(ICU_FOUND 1)\nelse() # NOT SW_BUILD\n  find_package(PkgConfig)\nendif()\n\n# experimental\n# If PkgConfig is not present training tools will not be build,\n# so it does not make sense to set ICU.\nif(MSVC\n   AND PKG_CONFIG_FOUND\n   AND NOT SW_BUILD\n   AND NOT USE_SYSTEM_ICU)\n  include(CheckTypeSize)\n  check_type_size(\"void *\" SIZEOF_VOID_P)\n\n  if(SIZEOF_VOID_P EQUAL 8)\n    set(X64 1)\n    set(ARCH_NAME 64)\n  elseif(SIZEOF_VOID_P EQUAL 4)\n    set(X86 1)\n    set(ARCH_NAME 32)\n  else()\n    message(FATAL_ERROR \"Cannot determine target architecture\")\n  endif()\n\n  set(ICU_DIR \"${CMAKE_CURRENT_BINARY_DIR}/icu\")\n  set(ICU_ARCHIVE \"${ICU_DIR}/icu${ARCH_NAME}.zip\")\n\n  if(X86)\n    set(ICU_HASH 45167a240b60e36b59a87eda23490ce4)\n  else()\n    set(ICU_HASH 480c72491576c048de1218c3c5519399)\n  endif()\n\n  message(STATUS \"Downloading latest ICU binaries\")\n  set(COMPILER \"msvc10\")\n  set(ICU_URL \"https://github.com/unicode-org/icu/releases/download\")\n  set(ICU_R \"56-1\")\n  set(ICU_V \"56_1\")\n  file(\n    DOWNLOAD\n    \"${ICU_URL}/release-${ICU_R}/icu4c-${ICU_V}-Win${ARCH_NAME}-${COMPILER}.zip\"\n    \"${ICU_ARCHIVE}\"\n    SHOW_PROGRESS\n    INACTIVITY_TIMEOUT 300 # seconds\n    EXPECTED_HASH MD5=${ICU_HASH})\n  execute_process(\n    COMMAND ${CMAKE_COMMAND} -E tar xz \"${ICU_ARCHIVE}\"\n    WORKING_DIRECTORY \"${ICU_DIR}\"\n    RESULT_VARIABLE __result)\n  if(NOT __result EQUAL 0)\n    message(FATAL_ERROR \"error ${__result}\")\n  endif()\n\n  set(ICU_ROOT ${ICU_DIR}/icu)\nendif()\n# experimental\n\nif(NOT SW_BUILD)\n  if(PKG_CONFIG_FOUND)\n    pkg_check_modules(ICU REQUIRED IMPORTED_TARGET icu-uc icu-i18n)\n  else()\n    find_package(ICU 52.1 COMPONENTS uc i18n)\n  endif()\n  if(ICU_FOUND)\n    message(\">> 
ICU_FOUND ${ICU_FOUND} ${ICU_VERSION} ${ICU_LIBRARIES} ${ICU_INCLUDE_DIRS}\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${ICU_CXX_FLAGS}\")\n  else()\n    message(\">> ICU not found!\")\n  endif()\nendif()\n\n\n# ##############################################################################\n# LIBRARY common_training\n# ##############################################################################\n\nset(COMMON_TRAINING_SRC\n    common/commandlineflags.cpp\n    common/commandlineflags.h\n    common/commontraining.cpp\n    common/commontraining.h\n    common/ctc.cpp\n    common/ctc.h\n    common/networkbuilder.cpp\n    common/networkbuilder.h)\n\nif(NOT DISABLED_LEGACY_ENGINE)\n  list(\n    APPEND\n    COMMON_TRAINING_SRC\n    common/errorcounter.cpp\n    common/errorcounter.h\n    common/intfeaturedist.cpp\n    common/intfeaturedist.h\n    common/intfeaturemap.cpp\n    common/intfeaturemap.h\n    common/mastertrainer.cpp\n    common/mastertrainer.h\n    common/sampleiterator.cpp\n    common/sampleiterator.h\n    common/trainingsampleset.cpp\n    common/trainingsampleset.h)\nendif()\n\nadd_library(common_training ${COMMON_TRAINING_SRC})\ntarget_include_directories(common_training PUBLIC common\n                                                  ${CMAKE_CURRENT_BINARY_DIR})\ntarget_link_libraries(common_training PUBLIC libtesseract)\n\n# Apply modern build optimizations\napply_training_optimizations(common_training)\ninstall(\n  TARGETS common_training\n  RUNTIME DESTINATION bin\n  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}\n  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})\ngenerate_export_header(common_training EXPORT_MACRO_NAME\n                       TESS_COMMON_TRAINING_API)\nif (MSVC AND BUILD_SHARED_LIBS)\n  install(FILES $<TARGET_PDB_FILE:common_training> DESTINATION bin OPTIONAL)\nendif()\nproject_group(common_training \"Training Tools\")\n\n# ##############################################################################\n# EXECUTABLE ambiguous_words\n# 
##############################################################################\n\nif(NOT DISABLED_LEGACY_ENGINE)\n  add_executable(ambiguous_words ambiguous_words.cpp)\n  target_link_libraries(ambiguous_words common_training)\n  project_group(ambiguous_words \"Training Tools\")\n  install(\n    TARGETS ambiguous_words\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:ambiguous_words> DESTINATION bin OPTIONAL)\n  endif()\nendif()\n\n# ##############################################################################\n# EXECUTABLE classifier_tester\n# ##############################################################################\n\nif(NOT DISABLED_LEGACY_ENGINE)\n  add_executable(classifier_tester classifier_tester.cpp)\n  target_link_libraries(classifier_tester common_training)\n  project_group(classifier_tester \"Training Tools\")\n  install(\n    TARGETS classifier_tester\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:classifier_tester> DESTINATION bin OPTIONAL)\n  endif()\nendif()\n\n# ##############################################################################\n# EXECUTABLE combine_tessdata\n# ##############################################################################\n\nadd_executable(combine_tessdata combine_tessdata.cpp)\ntarget_link_libraries(combine_tessdata common_training)\nproject_group(combine_tessdata \"Training Tools\")\ninstall(\n  TARGETS combine_tessdata\n  RUNTIME DESTINATION bin\n  LIBRARY DESTINATION lib\n  ARCHIVE DESTINATION lib)\nif (MSVC)\n  install(FILES $<TARGET_PDB_FILE:combine_tessdata> DESTINATION bin OPTIONAL)\nendif()\n\n# ##############################################################################\n# EXECUTABLE cntraining\n# ##############################################################################\n\nif(NOT DISABLED_LEGACY_ENGINE)\n  
add_executable(cntraining cntraining.cpp)\n  target_link_libraries(cntraining common_training)\n  project_group(cntraining \"Training Tools\")\n  install(\n    TARGETS cntraining\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:cntraining> DESTINATION bin OPTIONAL)\n  endif()\nendif()\n\n# ##############################################################################\n# EXECUTABLE dawg2wordlist\n# ##############################################################################\n\nadd_executable(dawg2wordlist dawg2wordlist.cpp)\ntarget_link_libraries(dawg2wordlist common_training)\nproject_group(dawg2wordlist \"Training Tools\")\ninstall(\n  TARGETS dawg2wordlist\n  RUNTIME DESTINATION bin\n  LIBRARY DESTINATION lib\n  ARCHIVE DESTINATION lib)\nif (MSVC)\n  install(FILES $<TARGET_PDB_FILE:dawg2wordlist> DESTINATION bin OPTIONAL)\nendif()\n\n# ##############################################################################\n# EXECUTABLE mftraining\n# ##############################################################################\n\nif(NOT DISABLED_LEGACY_ENGINE)\n  add_executable(mftraining mftraining.cpp mergenf.cpp mergenf.h)\n  target_link_libraries(mftraining common_training)\n  project_group(mftraining \"Training Tools\")\n  install(\n    TARGETS mftraining\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:mftraining> DESTINATION bin OPTIONAL)\n  endif()\nendif()\n\n# ##############################################################################\n# EXECUTABLE shapeclustering\n# ##############################################################################\n\nif(NOT DISABLED_LEGACY_ENGINE)\n  add_executable(shapeclustering shapeclustering.cpp)\n  target_link_libraries(shapeclustering common_training)\n  project_group(shapeclustering \"Training Tools\")\n  install(\n    TARGETS shapeclustering\n 
   RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n     install(FILES $<TARGET_PDB_FILE:shapeclustering> DESTINATION bin OPTIONAL)\n  endif()\nendif()\n\n# ##############################################################################\n# EXECUTABLE wordlist2dawg\n# ##############################################################################\n\nadd_executable(wordlist2dawg wordlist2dawg.cpp)\ntarget_link_libraries(wordlist2dawg common_training)\nproject_group(wordlist2dawg \"Training Tools\")\ninstall(\n  TARGETS wordlist2dawg\n  RUNTIME DESTINATION bin\n  LIBRARY DESTINATION lib\n  ARCHIVE DESTINATION lib)\nif (MSVC)\n  install(FILES $<TARGET_PDB_FILE:wordlist2dawg> DESTINATION bin OPTIONAL)\nendif()\n\nif(ICU_FOUND)\n  if(NOT SW_BUILD)\n    include_directories(${ICU_INCLUDE_DIRS})\n  endif()\n\n  # ############################################################################\n  # LIBRARY unicharset_training\n  # ############################################################################\n\n  file(GLOB unicharset_training_src unicharset/*)\n\n  add_library(unicharset_training ${unicharset_training_src})\n  if(SW_BUILD)\n    target_link_libraries(unicharset_training\n                          PUBLIC common_training org.sw.demo.unicode.icu.i18n)\n  else()\n    if(PKG_CONFIG_FOUND)\n      target_link_libraries(unicharset_training PUBLIC common_training PkgConfig::ICU)\n    else()\n      target_link_libraries(unicharset_training PUBLIC common_training ${ICU_LIBRARIES})\n    endif()\n  endif()\n  target_include_directories(unicharset_training\n                             PUBLIC unicharset ${CMAKE_CURRENT_BINARY_DIR})\n  install(\n    TARGETS unicharset_training\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}\n    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})\n  if (MSVC AND BUILD_SHARED_LIBS)\n    install(FILES $<TARGET_PDB_FILE:unicharset_training> DESTINATION bin OPTIONAL)\n  endif()\n  
generate_export_header(unicharset_training EXPORT_MACRO_NAME\n                         TESS_UNICHARSET_TRAINING_API)\n  project_group(unicharset_training \"Training Tools\")\n\n  # ############################################################################\n  # EXECUTABLE combine_lang_model\n  # ############################################################################\n\n  add_executable(combine_lang_model combine_lang_model.cpp)\n  target_link_libraries(combine_lang_model unicharset_training)\n  project_group(combine_lang_model \"Training Tools\")\n  install(\n    TARGETS combine_lang_model\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:combine_lang_model> DESTINATION bin OPTIONAL)\n  endif()\n\n  # ############################################################################\n  # EXECUTABLE lstmeval\n  # ############################################################################\n\n  add_executable(lstmeval lstmeval.cpp)\n  target_link_libraries(lstmeval unicharset_training)\n  project_group(lstmeval \"Training Tools\")\n  install(\n    TARGETS lstmeval\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:lstmeval> DESTINATION bin OPTIONAL)\n  endif()\n\n  # ############################################################################\n  # EXECUTABLE lstmtraining\n  # ############################################################################\n\n  add_executable(lstmtraining lstmtraining.cpp)\n  target_link_libraries(lstmtraining unicharset_training)\n  project_group(lstmtraining \"Training Tools\")\n  install(\n    TARGETS lstmtraining\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:lstmtraining> DESTINATION bin OPTIONAL)\n  endif()\n\n  # 
############################################################################\n  # EXECUTABLE merge_unicharsets\n  # ############################################################################\n\n  add_executable(merge_unicharsets merge_unicharsets.cpp)\n  target_link_libraries(merge_unicharsets common_training)\n  project_group(merge_unicharsets \"Training Tools\")\n  install(\n    TARGETS merge_unicharsets\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:merge_unicharsets> DESTINATION bin OPTIONAL)\n  endif()\n\n  # ############################################################################\n  # EXECUTABLE set_unicharset_properties\n  # ############################################################################\n\n  add_executable(set_unicharset_properties set_unicharset_properties.cpp)\n  target_link_libraries(set_unicharset_properties unicharset_training)\n  project_group(set_unicharset_properties \"Training Tools\")\n  install(\n    TARGETS set_unicharset_properties\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:set_unicharset_properties> DESTINATION bin OPTIONAL)\n  endif()\n\n  # ############################################################################\n  # EXECUTABLE unicharset_extractor\n  # ############################################################################\n\n  add_executable(unicharset_extractor unicharset_extractor.cpp)\n  target_compile_features(unicharset_extractor PRIVATE cxx_std_17)\n  target_link_libraries(unicharset_extractor unicharset_training)\n  project_group(unicharset_extractor \"Training Tools\")\n  install(\n    TARGETS unicharset_extractor\n    RUNTIME DESTINATION bin\n    LIBRARY DESTINATION lib\n    ARCHIVE DESTINATION lib)\n  if (MSVC)\n    install(FILES $<TARGET_PDB_FILE:unicharset_extractor> DESTINATION bin OPTIONAL)\n  endif()\n\n  # 
############################################################################\n\n  if(PKG_CONFIG_FOUND OR SW_BUILD)\n\n    if(PKG_CONFIG_FOUND)\n      pkg_check_modules(\n        PANGO\n        REQUIRED\n        IMPORTED_TARGET\n        pango>=1.38.0\n        cairo\n        pangoft2\n        pangocairo\n        fontconfig)\n    endif()\n\n    # ##########################################################################\n    # LIBRARY pango_training\n    # ##########################################################################\n\n    file(GLOB pango_training_src pango/*)\n\n    add_library(pango_training ${pango_training_src})\n    target_link_libraries(pango_training PUBLIC unicharset_training)\n    if(SW_BUILD)\n      target_link_libraries(pango_training\n                            PUBLIC org.sw.demo.gnome.pango.pangocairo)\n    else()\n      if(PKG_CONFIG_FOUND)\n        target_include_directories(pango_training BEFORE\n                                   PUBLIC ${PANGO_INCLUDE_DIRS})\n        target_compile_definitions(pango_training PUBLIC -DPANGO_ENABLE_ENGINE)\n        target_link_libraries(pango_training PUBLIC PkgConfig::PANGO)\n      endif()\n    endif()\n    target_include_directories(pango_training\n                               PUBLIC pango ${CMAKE_CURRENT_BINARY_DIR})\n    generate_export_header(pango_training EXPORT_MACRO_NAME\n                           TESS_PANGO_TRAINING_API)\n    project_group(pango_training \"Training Tools\")\n\n    # ##########################################################################\n    # EXECUTABLE text2image\n    # ##########################################################################\n\n    set(TEXT2IMAGE_SRC text2image.cpp degradeimage.cpp degradeimage.h)\n\n    add_executable(text2image ${TEXT2IMAGE_SRC})\n    target_link_libraries(text2image pango_training)\n    project_group(text2image \"Training Tools\")\n    install(\n      TARGETS text2image\n      RUNTIME DESTINATION bin\n      LIBRARY DESTINATION 
lib\n      ARCHIVE DESTINATION lib)\n    if (MSVC)\n      install(FILES $<TARGET_PDB_FILE:text2image> DESTINATION bin OPTIONAL)\n    endif()\n  endif()\nendif(ICU_FOUND)\n\n# ##############################################################################\n"
  },
  {
    "path": "src/training/ambiguous_words.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ambiguous_words.cpp\n// Description: A program that takes a text file with a list of words as\n//              input (one per line) and outputs a file with the words\n//              that were found in the dictionary followed by the words\n//              that are ambiguous to them.\n// Author:      Rika Antonova\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"dict.h\"\n#include \"tesseractclass.h\"\n\n#include <tesseract/baseapi.h>\n#include \"helpers.h\"\n\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n\n  // Parse input arguments.\n  if (argc > 1 && (!strcmp(argv[1], \"-v\") || !strcmp(argv[1], \"--version\"))) {\n    printf(\"%s\\n\", tesseract::TessBaseAPI::Version());\n    return EXIT_SUCCESS;\n  } else if (argc != 4 && (argc != 6 || strcmp(argv[1], \"-l\") != 0)) {\n    printf(\n        \"Usage: %s -v | --version | %s [-l lang] tessdata_dir wordlist_file\"\n        \" output_ambiguous_wordlist_file\\n\",\n        argv[0], argv[0]);\n    return EXIT_FAILURE;\n  }\n  int argv_offset = 0;\n  std::string lang;\n  if (argc == 6) {\n    lang = argv[2];\n    argv_offset = 2;\n  } else {\n    lang = \"eng\";\n  }\n  const char *tessdata_dir 
= argv[++argv_offset];\n  const char *input_file_str = argv[++argv_offset];\n  const char *output_file_str = argv[++argv_offset];\n\n  // Initialize Tesseract.\n  tesseract::TessBaseAPI api;\n  std::vector<std::string> vars_vec;\n  std::vector<std::string> vars_values;\n  vars_vec.emplace_back(\"output_ambig_words_file\");\n  vars_values.emplace_back(output_file_str);\n  api.Init(tessdata_dir, lang.c_str(), tesseract::OEM_TESSERACT_ONLY, nullptr, 0, &vars_vec,\n           &vars_values, false);\n  tesseract::Dict &dict = api.tesseract()->getDict();\n  FILE *input_file = fopen(input_file_str, \"rb\");\n  if (input_file == nullptr) {\n    tesseract::tprintf(\"Failed to open input wordlist file %s\\n\", input_file_str);\n    return EXIT_FAILURE;\n  }\n  char str[CHARS_PER_LINE];\n\n  // Read word list and call Dict::NoDangerousAmbig() for each word\n  // to record ambiguities in the output file.\n  while (fgets(str, CHARS_PER_LINE, input_file) != nullptr) {\n    tesseract::chomp_string(str); // remove newline\n    tesseract::WERD_CHOICE word(str, dict.getUnicharset());\n    dict.NoDangerousAmbig(&word, nullptr, false, nullptr);\n  }\n  // Clean up.\n  fclose(input_file);\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "src/training/classifier_tester.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n//  Filename: classifier_tester.cpp\n//  Purpose:  Tests a character classifier on data as formatted for training,\n//            but doesn't have to be the same as the training data.\n//  Author:   Ray Smith\n\n#include <tesseract/baseapi.h>\n#include <algorithm>\n#include <cstdio>\n#include \"commontraining.h\"\n#include \"mastertrainer.h\"\n#include \"params.h\"\n#include \"tessclassifier.h\"\n#include \"tesseractclass.h\"\n\nusing namespace tesseract;\n\nstatic STRING_PARAM_FLAG(classifier, \"\", \"Classifier to test\");\nstatic STRING_PARAM_FLAG(lang, \"eng\", \"Language to test\");\nstatic STRING_PARAM_FLAG(tessdata_dir, \"\", \"Directory of traineddata files\");\n\nenum ClassifierName { CN_PRUNER, CN_FULL, CN_COUNT };\n\nstatic const char *names[] = {\"pruner\", \"full\"};\n\nstatic tesseract::ShapeClassifier *InitializeClassifier(const char *classifier_name,\n                                                        const UNICHARSET &unicharset, int argc,\n                                                        char **argv, tesseract::TessBaseAPI **api) {\n  // Decode the classifier string.\n  ClassifierName classifier = CN_COUNT;\n  for (int c = 0; c < CN_COUNT; ++c) {\n    if (strcmp(classifier_name, names[c]) == 0) {\n      classifier = static_cast<ClassifierName>(c);\n      break;\n    }\n  }\n  if 
(classifier == CN_COUNT) {\n    fprintf(stderr, \"Invalid classifier name:%s\\n\", FLAGS_classifier.c_str());\n    return nullptr;\n  }\n\n  // We need to initialize tesseract to test.\n  *api = new tesseract::TessBaseAPI;\n  tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY;\n  tesseract::Tesseract *tesseract = nullptr;\n  tesseract::Classify *classify = nullptr;\n  if (classifier == CN_PRUNER || classifier == CN_FULL) {\n    if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(), engine_mode) < 0) {\n      fprintf(stderr, \"Tesseract initialization failed!\\n\");\n      return nullptr;\n    }\n    tesseract = const_cast<tesseract::Tesseract *>((*api)->tesseract());\n    classify = static_cast<tesseract::Classify *>(tesseract);\n    if (classify->shape_table() == nullptr) {\n      fprintf(stderr, \"Tesseract must contain a ShapeTable!\\n\");\n      return nullptr;\n    }\n  }\n  tesseract::ShapeClassifier *shape_classifier = nullptr;\n\n  if (classifier == CN_PRUNER) {\n    shape_classifier = new tesseract::TessClassifier(true, classify);\n  } else if (classifier == CN_FULL) {\n    shape_classifier = new tesseract::TessClassifier(false, classify);\n  }\n  tprintf(\"Testing classifier %s:\\n\", classifier_name);\n  return shape_classifier;\n}\n\n// This program has complex setup requirements, so here is some help:\n// Two different modes, tr files and serialized mastertrainer.\n// From tr files:\n//   classifier_tester -U unicharset -F font_properties -X xheights\n//     -classifier x -lang lang [-output_trainer trainer] *.tr\n// From a serialized trainer:\n//  classifier_tester -input_trainer trainer [-lang lang] -classifier x\n//\n// In the first case, the unicharset must be the unicharset from within\n// the classifier under test, and the font_properties and xheights files must\n// match the files used during training.\n// In the second case, the trainer file must have been prepared from\n// some previous run of shapeclustering, 
mftraining, or classifier_tester\n// using the same conditions as above, ie matching unicharset/font_properties.\n//\n// Available values of classifier (x above) are:\n// pruner   : Tesseract class pruner only.\n// full     : Tesseract full classifier.\n//            with an input trainer.)\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n  ParseArguments(&argc, &argv);\n  std::string file_prefix;\n  auto trainer = tesseract::LoadTrainingData(argv + 1, false, nullptr, file_prefix);\n  tesseract::TessBaseAPI *api;\n  // Decode the classifier string.\n  tesseract::ShapeClassifier *shape_classifier =\n      InitializeClassifier(FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);\n  if (shape_classifier == nullptr) {\n    fprintf(stderr, \"Classifier init failed!:%s\\n\", FLAGS_classifier.c_str());\n    return EXIT_FAILURE;\n  }\n\n  // We want to test junk as well if it is available.\n  // trainer->IncludeJunk();\n  // We want to test with replicated samples too.\n  trainer->ReplicateAndRandomizeSamplesIfRequired();\n\n  trainer->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR,\n                                   std::max(3, static_cast<int>(FLAGS_debug_level)), false,\n                                   shape_classifier, nullptr);\n  delete shape_classifier;\n  delete api;\n\n  return EXIT_SUCCESS;\n} /* main */\n"
  },
  {
    "path": "src/training/cntraining.cpp",
    "content": "/******************************************************************************\n **  Filename:  cntraining.cpp\n **  Purpose:  Generates a normproto and pffmtable.\n **  Author:    Dan Johnson\n **  Revisment:  Christy Russon\n **\n **  (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n\n/*----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------*/\n#include <tesseract/unichar.h>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#include \"cluster.h\"\n#include \"clusttool.h\"\n#include \"commontraining.h\"\n#include \"featdefs.h\"\n#include \"ocrfeatures.h\"\n#include \"oldlist.h\"\n\n#define PROGRAM_FEATURE_TYPE \"cn\"\n\nusing namespace tesseract;\n\n/*----------------------------------------------------------------------------\n          Private Function Prototypes\n----------------------------------------------------------------------------*/\n\nstatic void WriteNormProtos(const char *Directory, LIST LabeledProtoList,\n                            const FEATURE_DESC_STRUCT *feature_desc);\n\nstatic void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,\n                        bool 
WriteInsigProtos);\n\n/*----------------------------------------------------------------------------\n          Global Data Definitions and Declarations\n----------------------------------------------------------------------------*/\n/* global variable to hold configuration parameters to control clustering */\n//-M 0.025   -B 0.05   -I 0.8   -C 1e-3\nstatic const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0};\n\n/*----------------------------------------------------------------------------\n              Public Code\n----------------------------------------------------------------------------*/\n\n/**\n* This program reads in a text file consisting of feature\n* samples from a training page in the following format:\n* @verbatim\n   FontName CharName NumberOfFeatureTypes(N)\n      FeatureTypeName1 NumberOfFeatures(M)\n         Feature1\n         ...\n         FeatureM\n      FeatureTypeName2 NumberOfFeatures(M)\n         Feature1\n         ...\n         FeatureM\n      ...\n      FeatureTypeNameN NumberOfFeatures(M)\n         Feature1\n         ...\n         FeatureM\n   FontName CharName ...\n@endverbatim\n* It then appends these samples into a separate file for each\n* character.  The name of the file is\n*\n*   DirectoryName/FontName/CharName.FeatureTypeName\n*\n* The DirectoryName can be specified via a command\n* line argument.  If not specified, it defaults to the\n* current directory.  The format of the resulting files is:\n* @verbatim\n   NumberOfFeatures(M)\n      Feature1\n      ...\n      FeatureM\n   NumberOfFeatures(M)\n   ...\n@endverbatim\n* The output files each have a header which describes the\n* type of feature which the file contains.  This header is\n* in the format required by the clusterer.  
A command line\n* argument can also be used to specify that only the first\n* N samples of each class should be used.\n* @param argc  number of command line arguments\n* @param argv  array of command line arguments\n* @return 0 on success\n*/\nint main(int argc, char *argv[]) {\n  tesseract::CheckSharedLibraryVersion();\n\n  // Set the global Config parameters before parsing the command line.\n  Config = CNConfig;\n\n  LIST CharList = NIL_LIST;\n  CLUSTERER *Clusterer = nullptr;\n  LIST ProtoList = NIL_LIST;\n  LIST NormProtoList = NIL_LIST;\n  LIST pCharList;\n  LABELEDLIST CharSample;\n  FEATURE_DEFS_STRUCT FeatureDefs;\n  InitFeatureDefs(&FeatureDefs);\n\n  ParseArguments(&argc, &argv);\n#if !defined(NDEBUG)\n  int num_fonts = 0;\n#endif\n  for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) {\n    printf(\"Reading %s ...\\n\", PageName);\n    FILE *TrainingPage = fopen(PageName, \"rb\");\n    ASSERT_HOST(TrainingPage);\n    if (TrainingPage) {\n      ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList);\n      fclose(TrainingPage);\n#if !defined(NDEBUG)\n      ++num_fonts;\n#endif\n    }\n  }\n  printf(\"Clustering ...\\n\");\n  // To allow an individual font to form a separate cluster,\n  // reduce the min samples:\n  // Config.MinSamples = 0.5 / num_fonts;\n  pCharList = CharList;\n  // The norm protos will count the source protos, so we keep them here in\n  // freeable_protos, so they can be freed later.\n  std::vector<LIST> freeable_protos;\n  iterate(pCharList) {\n    // Cluster\n    CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node());\n    Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);\n    if (Clusterer == nullptr) { // To avoid a SIGSEGV\n      fprintf(stderr, \"Error: nullptr clusterer!\\n\");\n      return EXIT_FAILURE;\n    }\n    float SavedMinSamples = Config.MinSamples;\n    // To disable the tendency to produce a single cluster for 
all fonts,\n    // make MagicSamples an impossible to achieve number:\n    // Config.MagicSamples = CharSample->SampleCount * 10;\n    Config.MagicSamples = CharSample->SampleCount;\n    while (Config.MinSamples > 0.001) {\n      ProtoList = ClusterSamples(Clusterer, &Config);\n      if (NumberOfProtos(ProtoList, true, false) > 0) {\n        break;\n      } else {\n        Config.MinSamples *= 0.95;\n        printf(\n            \"0 significant protos for %s.\"\n            \" Retrying clustering with MinSamples = %f%%\\n\",\n            CharSample->Label.c_str(), Config.MinSamples);\n      }\n    }\n    Config.MinSamples = SavedMinSamples;\n    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);\n    freeable_protos.push_back(ProtoList);\n    FreeClusterer(Clusterer);\n  }\n  FreeTrainingSamples(CharList);\n  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);\n  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]);\n  FreeNormProtoList(NormProtoList);\n  for (auto &freeable_proto : freeable_protos) {\n    FreeProtoList(&freeable_proto);\n  }\n  printf(\"\\n\");\n  return EXIT_SUCCESS;\n} // main\n\n/*----------------------------------------------------------------------------\n              Private Code\n----------------------------------------------------------------------------*/\n\n/*----------------------------------------------------------------------------*/\n/**\n * This routine writes the specified samples into files which\n * are organized according to the font name and character name\n * of the samples.\n * @param Directory  directory to place sample files into\n * @param LabeledProtoList List of labeled protos\n * @param feature_desc Description of the features\n */\nstatic void WriteNormProtos(const char *Directory, LIST LabeledProtoList,\n                            const FEATURE_DESC_STRUCT *feature_desc) {\n  FILE *File;\n  LABELEDLIST LabeledProto;\n  int N;\n\n  std::string 
Filename = \"\";\n  if (Directory != nullptr && Directory[0] != '\\0') {\n    Filename += Directory;\n    Filename += \"/\";\n  }\n  Filename += \"normproto\";\n  printf(\"\\nWriting %s ...\", Filename.c_str());\n  File = fopen(Filename.c_str(), \"wb\");\n  ASSERT_HOST(File);\n  fprintf(File, \"%0d\\n\", feature_desc->NumParams);\n  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);\n  iterate(LabeledProtoList) {\n    LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node());\n    N = NumberOfProtos(LabeledProto->List, true, false);\n    if (N < 1) {\n      printf(\n          \"\\nError! Not enough protos for %s: %d protos\"\n          \" (%d significant protos\"\n          \", %d insignificant protos)\\n\",\n          LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false),\n          NumberOfProtos(LabeledProto->List, false, true));\n      exit(1);\n    }\n    fprintf(File, \"\\n%s %d\\n\", LabeledProto->Label.c_str(), N);\n    WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);\n  }\n  fclose(File);\n\n} // WriteNormProtos\n\n/*-------------------------------------------------------------------------*/\n\nstatic void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,\n                        bool WriteInsigProtos) {\n  PROTOTYPE *Proto;\n\n  // write prototypes\n  iterate(ProtoList) {\n    Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());\n    if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) {\n      WritePrototype(File, N, Proto);\n    }\n  }\n} // WriteProtos\n"
  },
  {
    "path": "src/training/combine_lang_model.cpp",
    "content": "// Copyright 2017 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n// Purpose: Program to generate a traineddata file that can be used to train an\n//          LSTM-based neural network model from a unicharset and an optional\n//          set of wordlists. Eliminates the need to run\n//          set_unicharset_properties, wordlist2dawg, some non-existent binary\n//          to generate the recoder, and finally combine_tessdata.\n\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"commandlineflags.h\"\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"lang_model_helpers.h\"\n#include \"tesserrstream.h\"  // for tesserr\n#include \"tprintf.h\"\n#include \"unicharset_training_utils.h\"\n\nusing namespace tesseract;\n\nstatic STRING_PARAM_FLAG(input_unicharset, \"\",\n                         \"Filename with unicharset to complete and use in encoding\");\nstatic STRING_PARAM_FLAG(script_dir, \"\", \"Directory name for input script unicharsets\");\nstatic STRING_PARAM_FLAG(words, \"\", \"File listing words to use for the system dictionary\");\nstatic STRING_PARAM_FLAG(puncs, \"\", \"File listing punctuation patterns\");\nstatic STRING_PARAM_FLAG(numbers, \"\", \"File listing number patterns\");\nstatic STRING_PARAM_FLAG(output_dir, \"\", \"Root directory for output files\");\nstatic STRING_PARAM_FLAG(version_str, \"\", \"Version string to add to traineddata file\");\nstatic STRING_PARAM_FLAG(lang, 
\"\", \"Name of language being processed\");\nstatic BOOL_PARAM_FLAG(lang_is_rtl, false, \"True if lang being processed is written right-to-left\");\nstatic BOOL_PARAM_FLAG(pass_through_recoder, false,\n                       \"If true, the recoder is a simple pass-through of the \"\n                       \"unicharset. Otherwise, potentially a compression of it\");\n\nint main(int argc, char **argv) {\n  // Sets properties on the input unicharset file, and writes:\n  //   rootdir/lang/lang.charset_size=ddd.txt\n  //   rootdir/lang/lang.traineddata\n  //   rootdir/lang/lang.unicharset\n  // If the 3 word lists are provided, the dawgs are also added\n  // to the traineddata file.\n  // The output unicharset and charset_size files are just for\n  // human readability.\n  tesseract::CheckSharedLibraryVersion();\n  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);\n\n  // If these reads fail, we get a warning message and an empty list of words.\n  std::vector<std::string> words = split(tesseract::ReadFile(FLAGS_words.c_str()), '\\n');\n  std::vector<std::string> puncs = split(tesseract::ReadFile(FLAGS_puncs.c_str()), '\\n');\n  std::vector<std::string> numbers = split(tesseract::ReadFile(FLAGS_numbers.c_str()), '\\n');\n  // Load the input unicharset\n  UNICHARSET unicharset;\n  if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {\n    tprintf(\"Failed to load unicharset from %s\\n\", FLAGS_input_unicharset.c_str());\n    return EXIT_FAILURE;\n  }\n  tesserr << \"Loaded unicharset of size \" << unicharset.size()\n          << \" from file \" << FLAGS_input_unicharset.c_str() << '\\n';\n\n  // Set unichar properties\n  tprintf(\"Setting unichar properties\\n\");\n  tesseract::SetupBasicProperties(/*report_errors*/ true,\n                                  /*decompose (NFD)*/ false, &unicharset);\n  tprintf(\"Setting script properties\\n\");\n  tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);\n  // Combine everything 
into a traineddata file.\n  return tesseract::CombineLangModel(unicharset, FLAGS_script_dir.c_str(),\n                                     FLAGS_version_str.c_str(), FLAGS_output_dir.c_str(),\n                                     FLAGS_lang.c_str(), FLAGS_pass_through_recoder, words, puncs,\n                                     numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,\n                                     /*writer*/ nullptr);\n}\n"
  },
  {
    "path": "src/training/combine_tessdata.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        combine_tessdata.cpp\n// Description: Creates a unified traineddata file from several\n//              data files produced by the training process.\n// Author:      Daria Antonova\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"lstmrecognizer.h\"\n#include \"tessdatamanager.h\"\n\n#include <cerrno>\n#include <iostream> // std::cout\n\nusing namespace tesseract;\n\nstatic int list_components(TessdataManager &tm, const char *filename) {\n  // Initialize TessdataManager with the data in the given traineddata file.\n  if (filename != nullptr && !tm.Init(filename)) {\n    tprintf(\"Failed to read %s\\n\", filename);\n    return EXIT_FAILURE;\n  }\n  tm.Directory();\n  return EXIT_SUCCESS;\n}\n\nstatic int list_network(TessdataManager &tm, const char *filename) {\n  if (filename != nullptr && !tm.Init(filename)) {\n    tprintf(\"Failed to read %s\\n\", filename);\n    return EXIT_FAILURE;\n  }\n  tesseract::TFile fp;\n  if (tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {\n    tesseract::LSTMRecognizer recognizer;\n    if (!recognizer.DeSerialize(&tm, &fp)) {\n      tprintf(\"Failed to deserialize LSTM in %s!\\n\", filename);\n      return EXIT_FAILURE;\n    }\n    std::cout << 
\"LSTM: network=\" << recognizer.GetNetwork()\n              << \", int_mode=\" << recognizer.IsIntMode()\n              << \", recoding=\" << recognizer.IsRecoding()\n              << \", iteration=\" << recognizer.training_iteration()\n              << \", sample_iteration=\" << recognizer.sample_iteration()\n              << \", null_char=\" << recognizer.null_char()\n              << \", learning_rate=\" << recognizer.learning_rate()\n              << \", momentum=\" << recognizer.GetMomentum()\n              << \", adam_beta=\" << recognizer.GetAdamBeta() << '\\n';\n\n    std::cout << \"Layer Learning Rates: \";\n    auto layers = recognizer.EnumerateLayers();\n    for (const auto &id : layers) {\n      auto layer = recognizer.GetLayer(id);\n      std::cout << id << \"(\" << layer->name() << \")\"\n                << \"=\" << recognizer.GetLayerLearningRate(id)\n                << (layers[layers.size() - 1] != id ? \", \" : \"\");\n    }\n    std::cout << \"\\n\";\n  }\n  return EXIT_SUCCESS;\n}\n\n// Main program to combine/extract/overwrite tessdata components\n// in [lang].traineddata files.\n//\n// To combine all the individual tessdata components (unicharset, DAWGs,\n// classifier templates, ambiguities, language configs) located at, say,\n// /home/$USER/temp/eng.* run:\n//\n//   combine_tessdata /home/$USER/temp/eng.\n//\n// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata\n//\n// Specify option -e if you would like to extract individual components\n// from a combined traineddata file. 
For example, to extract language config\n// file and the unicharset from tessdata/eng.traineddata run:\n//\n//   combine_tessdata -e tessdata/eng.traineddata\n//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset\n//\n// The desired config file and unicharset will be written to\n// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset\n//\n// Specify option -o to overwrite individual components of the given\n// [lang].traineddata file. For example, to overwrite language config\n// and unichar ambiguities files in tessdata/eng.traineddata use:\n//\n//   combine_tessdata -o tessdata/eng.traineddata\n//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs\n//\n// As a result, tessdata/eng.traineddata will contain the new language config\n// and unichar ambigs, plus all the original DAWGs, classifier teamples, etc.\n//\n// Note: the file names of the files to extract to and to overwrite from should\n// have the appropriate file suffixes (extensions) indicating their tessdata\n// component type (.unicharset for the unicharset, .unicharambigs for unichar\n// ambigs, etc). 
See k*FileSuffix variable in ccutil/tessdatamanager.h.\n//\n// Specify option -u to unpack all the components to the specified path:\n//\n// combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.\n//\n// This will create  /home/$USER/temp/eng.* files with individual tessdata\n// components from tessdata/eng.traineddata.\n//\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n\n  int i;\n  tesseract::TessdataManager tm;\n  if (argc > 1 && (!strcmp(argv[1], \"-v\") || !strcmp(argv[1], \"--version\"))) {\n    printf(\"%s\\n\", tesseract::TessBaseAPI::Version());\n    return EXIT_SUCCESS;\n  } else if (argc == 2) {\n    printf(\"Combining tessdata files\\n\");\n    std::string lang = argv[1];\n    char *last = &argv[1][strlen(argv[1]) - 1];\n    if (*last != '.') {\n      lang += '.';\n    }\n    std::string output_file = lang;\n    output_file += kTrainedDataSuffix;\n    if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {\n      printf(\"Error combining tessdata files into %s\\n\", output_file.c_str());\n    } else {\n      printf(\"Output %s created successfully.\\n\", output_file.c_str());\n    }\n  } else if (argc >= 4 &&\n             (strcmp(argv[1], \"-e\") == 0 || strcmp(argv[1], \"-u\") == 0)) {\n    // Initialize TessdataManager with the data in the given traineddata file.\n    if (!tm.Init(argv[2])) {\n      tprintf(\"Failed to read %s\\n\", argv[2]);\n      return EXIT_FAILURE;\n    }\n    printf(\"Extracting tessdata components from %s\\n\", argv[2]);\n    if (strcmp(argv[1], \"-e\") == 0) {\n      for (i = 3; i < argc; ++i) {\n        errno = 0;\n        if (tm.ExtractToFile(argv[i])) {\n          printf(\"Wrote %s\\n\", argv[i]);\n        } else if (errno == 0) {\n          printf(\n              \"Not extracting %s, since this component\"\n              \" is not present\\n\",\n              argv[i]);\n          return EXIT_FAILURE;\n        } else {\n          printf(\"Error, could not extract %s: 
%s\\n\", argv[i], strerror(errno));\n          return EXIT_FAILURE;\n        }\n      }\n    } else { // extract all the components\n      for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {\n        std::string filename = argv[3];\n        char *last = &argv[3][strlen(argv[3]) - 1];\n        if (*last != '.') {\n          filename += '.';\n        }\n        filename += tesseract::kTessdataFileSuffixes[i];\n        errno = 0;\n        if (tm.ExtractToFile(filename.c_str())) {\n          printf(\"Wrote %s\\n\", filename.c_str());\n        } else if (errno != 0) {\n          printf(\"Error, could not extract %s: %s\\n\", filename.c_str(),\n                 strerror(errno));\n          return EXIT_FAILURE;\n        }\n      }\n    }\n  } else if (argc >= 4 && strcmp(argv[1], \"-o\") == 0) {\n    // Rename the current traineddata file to a temporary name.\n    const char *new_traineddata_filename = argv[2];\n    std::string traineddata_filename = new_traineddata_filename;\n    traineddata_filename += \".__tmp__\";\n    if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {\n      tprintf(\"Failed to create a temporary file %s\\n\",\n              traineddata_filename.c_str());\n      return EXIT_FAILURE;\n    }\n\n    // Initialize TessdataManager with the data in the given traineddata file.\n    tm.Init(traineddata_filename.c_str());\n\n    // Write the updated traineddata file.\n    tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3);\n  } else if (argc == 3 && strcmp(argv[1], \"-c\") == 0) {\n    if (!tm.Init(argv[2])) {\n      tprintf(\"Failed to read %s\\n\", argv[2]);\n      return EXIT_FAILURE;\n    }\n    tesseract::TFile fp;\n    if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {\n      tprintf(\"No LSTM Component found in %s!\\n\", argv[2]);\n      return EXIT_FAILURE;\n    }\n    tesseract::LSTMRecognizer recognizer;\n    if (!recognizer.DeSerialize(&tm, &fp)) {\n      tprintf(\"Failed to deserialize LSTM in 
%s!\\n\", argv[2]);\n      return EXIT_FAILURE;\n    }\n    recognizer.ConvertToInt();\n    std::vector<char> lstm_data;\n    fp.OpenWrite(&lstm_data);\n    ASSERT_HOST(recognizer.Serialize(&tm, &fp));\n    tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],\n                      lstm_data.size());\n    if (!tm.SaveFile(argv[2], nullptr)) {\n      tprintf(\"Failed to write modified traineddata:%s!\\n\", argv[2]);\n      return EXIT_FAILURE;\n    }\n  } else if (argc == 3 && strcmp(argv[1], \"-d\") == 0) {\n    return list_components(tm, argv[2]);\n  } else if (argc == 3 && strcmp(argv[1], \"-l\") == 0) {\n    return list_network(tm, argv[2]);\n  } else if (argc == 3 && strcmp(argv[1], \"-dl\") == 0) {\n    int result = list_components(tm, argv[2]);\n    if (result == EXIT_SUCCESS) {\n      result = list_network(tm, nullptr);\n    }\n    return result;\n  } else if (argc == 3 && strcmp(argv[1], \"-ld\") == 0) {\n    int result = list_network(tm, argv[2]);\n    if (result == EXIT_SUCCESS) {\n      result = list_components(tm, nullptr);\n    }\n    return result;\n  } else {\n    printf(\n        \"Usage for combining tessdata components:\\n\"\n        \"  %s language_data_path_prefix\\n\"\n        \"  (e.g. %s tessdata/eng.)\\n\\n\",\n        argv[0], argv[0]);\n    printf(\n        \"Usage for extracting tessdata components:\\n\"\n        \"  %s -e traineddata_file [output_component_file...]\\n\"\n        \"  (e.g. %s -e eng.traineddata eng.unicharset)\\n\\n\",\n        argv[0], argv[0]);\n    printf(\n        \"Usage for overwriting tessdata components:\\n\"\n        \"  %s -o traineddata_file [input_component_file...]\\n\"\n        \"  (e.g. %s -o eng.traineddata eng.unicharset)\\n\\n\",\n        argv[0], argv[0]);\n    printf(\n        \"Usage for unpacking all tessdata components:\\n\"\n        \"  %s -u traineddata_file output_path_prefix\\n\"\n        \"  (e.g. 
%s -u eng.traineddata tmp/eng.)\\n\\n\",\n        argv[0], argv[0]);\n    printf(\n        \"Usage for listing the network information\\n\"\n        \"  %s -l traineddata_file\\n\"\n        \"  (e.g. %s -l eng.traineddata)\\n\\n\",\n        argv[0], argv[0]);\n    printf(\n        \"Usage for listing directory of components:\\n\"\n        \"  %s -d traineddata_file\\n\\n\",\n        argv[0]);\n    printf(\n        \"NOTE: Above two flags may combined as -dl or -ld to get both outputs\"\n        );\n    printf(\n        \"Usage for compacting LSTM component to int:\\n\"\n        \"  %s -c traineddata_file\\n\",\n        argv[0]);\n    return EXIT_FAILURE;\n  }\n  tm.Directory();\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "src/training/common/commandlineflags.cpp",
    "content": "// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"commandlineflags.h\"\n#include <tesseract/baseapi.h> // TessBaseAPI::Version\n#include <cmath>               // for std::isnan, NAN\n#include <locale>              // for std::locale::classic\n#include <sstream>             // for std::stringstream\n#include <vector>              // for std::vector\n#include \"errcode.h\"\n#include \"tprintf.h\" // for tprintf\n\nnamespace tesseract {\nstatic bool IntFlagExists(const char *flag_name, int32_t *value) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<IntParam *> empty;\n  auto *p =\n      ParamUtils::FindParam<IntParam>(full_flag_name.c_str(), GlobalParams()->int_params, empty);\n  if (p == nullptr) {\n    return false;\n  }\n  *value = (int32_t)(*p);\n  return true;\n}\n\nstatic bool DoubleFlagExists(const char *flag_name, double *value) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<DoubleParam *> empty;\n  auto *p = ParamUtils::FindParam<DoubleParam>(full_flag_name.c_str(),\n                                               GlobalParams()->double_params, empty);\n  if (p == nullptr) {\n    return false;\n  }\n  *value = static_cast<double>(*p);\n  return true;\n}\n\nstatic bool BoolFlagExists(const char *flag_name, bool *value) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<BoolParam *> empty;\n  auto *p =\n     
 ParamUtils::FindParam<BoolParam>(full_flag_name.c_str(), GlobalParams()->bool_params, empty);\n  if (p == nullptr) {\n    return false;\n  }\n  *value = bool(*p);\n  return true;\n}\n\nstatic bool StringFlagExists(const char *flag_name, const char **value) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<StringParam *> empty;\n  auto *p = ParamUtils::FindParam<StringParam>(full_flag_name.c_str(),\n                                               GlobalParams()->string_params, empty);\n  *value = (p != nullptr) ? p->c_str() : nullptr;\n  return p != nullptr;\n}\n\nstatic void SetIntFlagValue(const char *flag_name, const int32_t new_val) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<IntParam *> empty;\n  auto *p =\n      ParamUtils::FindParam<IntParam>(full_flag_name.c_str(), GlobalParams()->int_params, empty);\n  ASSERT_HOST(p != nullptr);\n  p->set_value(new_val);\n}\n\nstatic void SetDoubleFlagValue(const char *flag_name, const double new_val) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<DoubleParam *> empty;\n  auto *p = ParamUtils::FindParam<DoubleParam>(full_flag_name.c_str(),\n                                               GlobalParams()->double_params, empty);\n  ASSERT_HOST(p != nullptr);\n  p->set_value(new_val);\n}\n\nstatic void SetBoolFlagValue(const char *flag_name, const bool new_val) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<BoolParam *> empty;\n  auto *p =\n      ParamUtils::FindParam<BoolParam>(full_flag_name.c_str(), GlobalParams()->bool_params, empty);\n  ASSERT_HOST(p != nullptr);\n  p->set_value(new_val);\n}\n\nstatic void SetStringFlagValue(const char *flag_name, const char *new_val) {\n  std::string full_flag_name(\"FLAGS_\");\n  full_flag_name += flag_name;\n  std::vector<StringParam *> empty;\n  auto *p = 
ParamUtils::FindParam<StringParam>(full_flag_name.c_str(),\n                                               GlobalParams()->string_params, empty);\n  ASSERT_HOST(p != nullptr);\n  p->set_value(std::string(new_val));\n}\n\nstatic bool SafeAtoi(const char *str, int *val) {\n  char *endptr = nullptr;\n  *val = strtol(str, &endptr, 10);\n  return endptr != nullptr && *endptr == '\\0';\n}\n\nstatic bool SafeAtod(const char *str, double *val) {\n  double d = NAN;\n  std::stringstream stream(str);\n  // Use \"C\" locale for reading double value.\n  stream.imbue(std::locale::classic());\n  stream >> d;\n  *val = 0;\n  bool success = !std::isnan(d);\n  if (success) {\n    *val = d;\n  }\n  return success;\n}\n\nstatic void PrintCommandLineFlags() {\n  const char *kFlagNamePrefix = \"FLAGS_\";\n  const int kFlagNamePrefixLen = strlen(kFlagNamePrefix);\n  for (auto &param : GlobalParams()->int_params) {\n    if (!strncmp(param->name_str(), kFlagNamePrefix, kFlagNamePrefixLen)) {\n      printf(\"  --%s  %s  (type:int default:%d)\\n\",\n             param->name_str() + kFlagNamePrefixLen,\n             param->info_str(), int32_t(*param));\n    }\n  }\n  for (auto &param : GlobalParams()->double_params) {\n    if (!strncmp(param->name_str(), kFlagNamePrefix,\n                 kFlagNamePrefixLen)) {\n      printf(\"  --%s  %s  (type:double default:%g)\\n\",\n             param->name_str() + kFlagNamePrefixLen,\n             param->info_str(),\n             static_cast<double>(*param));\n    }\n  }\n  for (auto &param : GlobalParams()->bool_params) {\n    if (!strncmp(param->name_str(), kFlagNamePrefix, kFlagNamePrefixLen)) {\n      printf(\"  --%s  %s  (type:bool default:%s)\\n\",\n             param->name_str() + kFlagNamePrefixLen,\n             param->info_str(),\n             bool(*param) ? 
\"true\" : \"false\");\n    }\n  }\n  for (auto &param : GlobalParams()->string_params) {\n    if (!strncmp(param->name_str(), kFlagNamePrefix,\n                 kFlagNamePrefixLen)) {\n      printf(\"  --%s  %s  (type:string default:%s)\\n\",\n             param->name_str() + kFlagNamePrefixLen,\n             param->info_str(),\n             param->c_str());\n    }\n  }\n}\n\nvoid ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags) {\n  if (*argc == 1) {\n    printf(\"USAGE: %s\\n\", usage);\n    PrintCommandLineFlags();\n    exit(0);\n  }\n\n  if (*argc > 1 && (!strcmp((*argv)[1], \"-v\") || !strcmp((*argv)[1], \"--version\"))) {\n    printf(\"%s\\n\", TessBaseAPI::Version());\n    exit(0);\n  }\n\n  int i;\n  for (i = 1; i < *argc; ++i) {\n    const char *current_arg = (*argv)[i];\n    // If argument does not start with a hyphen then break.\n    if (current_arg[0] != '-') {\n      break;\n    }\n    // Position current_arg after startings hyphens. 
We treat a sequence of\n    // one or two consecutive hyphens identically.\n    ++current_arg;\n    if (current_arg[0] == '-') {\n      ++current_arg;\n    }\n    // If this is asking for usage, print the help message and abort.\n    if (!strcmp(current_arg, \"help\")) {\n      printf(\"Usage:\\n  %s [OPTION ...]\\n\\n\", usage);\n      PrintCommandLineFlags();\n      exit(0);\n    }\n    // Find the starting position of the value if it was specified in this\n    // string.\n    const char *equals_position = strchr(current_arg, '=');\n    const char *rhs = nullptr;\n    if (equals_position != nullptr) {\n      rhs = equals_position + 1;\n    }\n    // Extract the flag name.\n    std::string lhs;\n    if (equals_position == nullptr) {\n      lhs = current_arg;\n    } else {\n      lhs.assign(current_arg, equals_position - current_arg);\n    }\n    if (!lhs.length()) {\n      tprintf(\"ERROR: Bad argument: %s\\n\", (*argv)[i]);\n      exit(1);\n    }\n\n    // Find the flag name in the list of global flags.\n    // int32_t flag\n    int32_t int_val;\n    if (IntFlagExists(lhs.c_str(), &int_val)) {\n      if (rhs != nullptr) {\n        if (!strlen(rhs)) {\n          // Bad input of the format --int_flag=\n          tprintf(\"ERROR: Bad argument: %s\\n\", (*argv)[i]);\n          exit(1);\n        }\n        if (!SafeAtoi(rhs, &int_val)) {\n          tprintf(\"ERROR: Could not parse int from %s in flag %s\\n\", rhs, (*argv)[i]);\n          exit(1);\n        }\n      } else {\n        // We need to parse the next argument\n        if (i + 1 >= *argc) {\n          tprintf(\"ERROR: Could not find value argument for flag %s\\n\", lhs.c_str());\n          exit(1);\n        } else {\n          ++i;\n          if (!SafeAtoi((*argv)[i], &int_val)) {\n            tprintf(\"ERROR: Could not parse int32_t from %s\\n\", (*argv)[i]);\n            exit(1);\n          }\n        }\n      }\n      SetIntFlagValue(lhs.c_str(), int_val);\n      continue;\n    }\n\n    // double flag\n    
double double_val;\n    if (DoubleFlagExists(lhs.c_str(), &double_val)) {\n      if (rhs != nullptr) {\n        if (!strlen(rhs)) {\n          // Bad input of the format --double_flag=\n          tprintf(\"ERROR: Bad argument: %s\\n\", (*argv)[i]);\n          exit(1);\n        }\n        if (!SafeAtod(rhs, &double_val)) {\n          tprintf(\"ERROR: Could not parse double from %s in flag %s\\n\", rhs, (*argv)[i]);\n          exit(1);\n        }\n      } else {\n        // We need to parse the next argument\n        if (i + 1 >= *argc) {\n          tprintf(\"ERROR: Could not find value argument for flag %s\\n\", lhs.c_str());\n          exit(1);\n        } else {\n          ++i;\n          if (!SafeAtod((*argv)[i], &double_val)) {\n            tprintf(\"ERROR: Could not parse double from %s\\n\", (*argv)[i]);\n            exit(1);\n          }\n        }\n      }\n      SetDoubleFlagValue(lhs.c_str(), double_val);\n      continue;\n    }\n\n    // Bool flag. Allow input forms --flag (equivalent to --flag=true),\n    // --flag=false, --flag=true, --flag=0 and --flag=1\n    bool bool_val;\n    if (BoolFlagExists(lhs.c_str(), &bool_val)) {\n      if (rhs == nullptr) {\n        // --flag form\n        bool_val = true;\n      } else {\n        if (!strlen(rhs)) {\n          // Bad input of the format --bool_flag=\n          tprintf(\"ERROR: Bad argument: %s\\n\", (*argv)[i]);\n          exit(1);\n        }\n        if (!strcmp(rhs, \"false\") || !strcmp(rhs, \"0\")) {\n          bool_val = false;\n        } else if (!strcmp(rhs, \"true\") || !strcmp(rhs, \"1\")) {\n          bool_val = true;\n        } else {\n          tprintf(\"ERROR: Could not parse bool from flag %s\\n\", (*argv)[i]);\n          exit(1);\n        }\n      }\n      SetBoolFlagValue(lhs.c_str(), bool_val);\n      continue;\n    }\n\n    // string flag\n    const char *string_val;\n    if (StringFlagExists(lhs.c_str(), &string_val)) {\n      if (rhs != nullptr) {\n        string_val = rhs;\n      } else 
{\n        // Pick the next argument\n        if (i + 1 >= *argc) {\n          tprintf(\"ERROR: Could not find string value for flag %s\\n\", lhs.c_str());\n          exit(1);\n        } else {\n          string_val = (*argv)[++i];\n        }\n      }\n      SetStringFlagValue(lhs.c_str(), string_val);\n      continue;\n    }\n\n    // Flag was not found. Exit with an error message.\n    tprintf(\"ERROR: Non-existent flag %s\\n\", (*argv)[i]);\n    exit(1);\n  } // for each argv\n  if (remove_flags) {\n    (*argv)[i - 1] = (*argv)[0];\n    (*argv) += (i - 1);\n    (*argc) -= (i - 1);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/common/commandlineflags.h",
    "content": "/**********************************************************************\n * File:        commandlineflags.h\n * Description: Header file for commandline flag parsing.\n * Author:      Ranjith Unnikrishnan\n *\n * (C) Copyright 2013, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n#ifndef TESSERACT_TRAINING_COMMANDLINEFLAGS_H_\n#define TESSERACT_TRAINING_COMMANDLINEFLAGS_H_\n\n#include \"export.h\"\n#include \"params.h\"\n\n#include <cstdlib>\n\n#define INT_PARAM_FLAG(name, val, comment) INT_VAR(FLAGS_##name, val, comment)\n#define DECLARE_INT_PARAM_FLAG(name) extern INT_VAR_H(FLAGS_##name)\n#define DOUBLE_PARAM_FLAG(name, val, comment) double_VAR(FLAGS_##name, val, comment)\n#define DECLARE_DOUBLE_PARAM_FLAG(name) extern double_VAR_H(FLAGS_##name)\n#define BOOL_PARAM_FLAG(name, val, comment) BOOL_VAR(FLAGS_##name, val, comment)\n#define DECLARE_BOOL_PARAM_FLAG(name) extern BOOL_VAR_H(FLAGS_##name)\n#define STRING_PARAM_FLAG(name, val, comment) STRING_VAR(FLAGS_##name, val, comment)\n#define DECLARE_STRING_PARAM_FLAG(name) extern STRING_VAR_H(FLAGS_##name)\n\nnamespace tesseract {\n\n// Flags from commontraining.cpp\n// Command line arguments for font_properties, xheights and 
unicharset.\nTESS_COMMON_TRAINING_API\nDECLARE_INT_PARAM_FLAG(debug_level);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(D);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(F);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(O);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(U);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(X);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(fonts_dir);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(output_trainer);\nTESS_COMMON_TRAINING_API\nDECLARE_STRING_PARAM_FLAG(test_ch);\n\n// Parse commandline flags and values. Prints the usage string and exits on\n// input of --help or --version.\n//\n// If remove_flags is true, the argv pointer is advanced so that (*argv)[1]\n// points to the first non-flag argument, (*argv)[0] points to the same string\n// as before, and argc is decremented to reflect the new shorter length of argv.\n// eg. If the input *argv is\n// { \"program\", \"--foo=4\", \"--bar=true\", \"file1\", \"file2\" } with *argc = 5, the\n// output *argv is { \"program\", \"file1\", \"file2\" } with *argc = 3\nTESS_COMMON_TRAINING_API\nvoid ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags);\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_COMMANDLINEFLAGS_H_\n"
  },
  {
    "path": "src/training/common/commontraining.cpp",
    "content": "// Copyright 2008 Google Inc. All Rights Reserved.\n// Author: scharron@google.com (Samuel Charron)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#define _USE_MATH_DEFINES // for M_PI\n\n#include \"commontraining.h\"\n\n#ifdef DISABLED_LEGACY_ENGINE\n\n#  include \"params.h\"\n#  include \"tprintf.h\"\n\nnamespace tesseract {\n\nINT_PARAM_FLAG(debug_level, 0, \"Level of Trainer debugging\");\nINT_PARAM_FLAG(load_images, 0, \"Load images with tr files\");\nSTRING_PARAM_FLAG(configfile, \"\", \"File to load more configs from\");\nSTRING_PARAM_FLAG(D, \"\", \"Directory to write output files to\");\nSTRING_PARAM_FLAG(F, \"font_properties\", \"File listing font properties\");\nSTRING_PARAM_FLAG(X, \"\", \"File listing font xheights\");\nSTRING_PARAM_FLAG(U, \"unicharset\", \"File to load unicharset from\");\nSTRING_PARAM_FLAG(O, \"\", \"File to write unicharset to\");\nSTRING_PARAM_FLAG(output_trainer, \"\", \"File to write trainer to\");\nSTRING_PARAM_FLAG(test_ch, \"\", \"UTF8 test character string\");\nSTRING_PARAM_FLAG(fonts_dir, \"\",\n                  \"If empty it uses system default. 
Otherwise it overrides \"\n                  \"system default font location\");\nSTRING_PARAM_FLAG(fontconfig_tmpdir, \"/tmp\", \"Overrides fontconfig default temporary dir\");\n\n/**\n * This routine parses the command line arguments that were\n * passed to the program and uses them to set relevant\n * training-related global parameters.\n *\n * Globals:\n * - Config  current clustering parameters\n * @param argc number of command line arguments to parse\n * @param argv command line arguments\n * @note Exceptions: Illegal options terminate the program.\n */\nvoid ParseArguments(int *argc, char ***argv) {\n  std::string usage;\n  if (*argc) {\n    usage += (*argv)[0];\n    usage += \" -v | --version | \";\n    usage += (*argv)[0];\n  }\n  usage += \" [.tr files ...]\";\n  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);\n}\n\n} // namespace tesseract.\n\n#else\n\n#  include <allheaders.h>\n#  include \"ccutil.h\"\n#  include \"classify.h\"\n#  include \"cluster.h\"\n#  include \"clusttool.h\"\n#  include \"featdefs.h\"\n#  include \"fontinfo.h\"\n#  include \"intfeaturespace.h\"\n#  include \"mastertrainer.h\"\n#  include \"mf.h\"\n#  include \"oldlist.h\"\n#  include \"params.h\"\n#  include \"shapetable.h\"\n#  include \"tessdatamanager.h\"\n#  include \"tprintf.h\"\n#  include \"unicity_table.h\"\n\nnamespace tesseract {\n\n// Global Variables.\n\n// global variable to hold configuration parameters to control clustering\n// -M 0.625   -B 0.05   -I 1.0   -C 1e-6.\nCLUSTERCONFIG Config = {elliptical, 0.625, 0.05, 1.0, 1e-6, 0};\nFEATURE_DEFS_STRUCT feature_defs;\nstatic CCUtil ccutil;\n\nINT_PARAM_FLAG(debug_level, 0, \"Level of Trainer debugging\");\nstatic INT_PARAM_FLAG(load_images, 0, \"Load images with tr files\");\nstatic STRING_PARAM_FLAG(configfile, \"\", \"File to load more configs from\");\nSTRING_PARAM_FLAG(D, \"\", \"Directory to write output files to\");\nSTRING_PARAM_FLAG(F, \"font_properties\", \"File listing font 
properties\");\nSTRING_PARAM_FLAG(X, \"\", \"File listing font xheights\");\nSTRING_PARAM_FLAG(U, \"unicharset\", \"File to load unicharset from\");\nSTRING_PARAM_FLAG(O, \"\", \"File to write unicharset to\");\nSTRING_PARAM_FLAG(output_trainer, \"\", \"File to write trainer to\");\nSTRING_PARAM_FLAG(test_ch, \"\", \"UTF8 test character string\");\nSTRING_PARAM_FLAG(fonts_dir, \"\", \"\");\nSTRING_PARAM_FLAG(fontconfig_tmpdir, \"\", \"\");\nstatic DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,\n                         \"Min number of samples per proto as % of total\");\nstatic DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,\n                         \"Max percentage of samples in a cluster which have more\"\n                         \" than 1 feature in that cluster\");\nstatic DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,\n                         \"Desired independence between dimensions\");\nstatic DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,\n                         \"Desired confidence in prototypes created\");\n\n/**\n * This routine parses the command line arguments that were\n * passed to the program and uses them to set relevant\n * training-related global parameters.\n *\n * Globals:\n * - Config  current clustering parameters\n * @param argc number of command line arguments to parse\n * @param argv command line arguments\n */\nvoid ParseArguments(int *argc, char ***argv) {\n  std::string usage;\n  if (*argc) {\n    usage += (*argv)[0];\n    usage += \" -v | --version | \";\n    usage += (*argv)[0];\n  }\n  usage += \" [.tr files ...]\";\n  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);\n  // Set some global values based on the flags.\n  Config.MinSamples =\n      std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));\n  Config.MaxIllegal = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));\n  
Config.Independence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));\n  Config.Confidence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));\n  // Set additional parameters from config file if specified.\n  if (!FLAGS_configfile.empty()) {\n    tesseract::ParamUtils::ReadParamsFile(\n        FLAGS_configfile.c_str(), tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY, ccutil.params());\n  }\n}\n\n// Helper loads shape table from the given file.\nShapeTable *LoadShapeTable(const std::string &file_prefix) {\n  ShapeTable *shape_table = nullptr;\n  std::string shape_table_file = file_prefix;\n  shape_table_file += kShapeTableFileSuffix;\n  TFile shape_fp;\n  if (shape_fp.Open(shape_table_file.c_str(), nullptr)) {\n    shape_table = new ShapeTable;\n    if (!shape_table->DeSerialize(&shape_fp)) {\n      delete shape_table;\n      shape_table = nullptr;\n      tprintf(\"Error: Failed to read shape table %s\\n\", shape_table_file.c_str());\n    } else {\n      int num_shapes = shape_table->NumShapes();\n      tprintf(\"Read shape table %s of %d shapes\\n\", shape_table_file.c_str(), num_shapes);\n    }\n  } else {\n    tprintf(\"Warning: No shape table file present: %s\\n\", shape_table_file.c_str());\n  }\n  return shape_table;\n}\n\n// Helper to write the shape_table.\nvoid WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table) {\n  std::string shape_table_file = file_prefix;\n  shape_table_file += kShapeTableFileSuffix;\n  FILE *fp = fopen(shape_table_file.c_str(), \"wb\");\n  if (fp != nullptr) {\n    if (!shape_table.Serialize(fp)) {\n      fprintf(stderr, \"Error writing shape table: %s\\n\", shape_table_file.c_str());\n    }\n    fclose(fp);\n  } else {\n    fprintf(stderr, \"Error creating shape table: %s\\n\", shape_table_file.c_str());\n  }\n}\n\n/**\n * Creates a MasterTrainer and loads the training data into it:\n * Initializes feature_defs and IntegerFX.\n * Loads the shape_table if shape_table 
!= nullptr.\n * Loads initial unicharset from -U command-line option.\n * If FLAGS_T is set, loads the majority of data from there, else:\n *  - Loads font info from -F option.\n *  - Loads xheights from -X option.\n *  - Loads samples from .tr files in remaining command-line args.\n *  - Deletes outliers and computes canonical samples.\n *  - If FLAGS_output_trainer is set, saves the trainer for future use.\n *    TODO: Who uses that? There is currently no code which reads it.\n * Computes canonical and cloud features.\n * If shape_table is not nullptr, but failed to load, make a fake flat one,\n * as shape clustering was not run.\n */\nstd::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,\n                                                ShapeTable **shape_table, std::string &file_prefix) {\n  InitFeatureDefs(&feature_defs);\n  InitIntegerFX();\n  file_prefix = \"\";\n  if (!FLAGS_D.empty()) {\n    file_prefix += FLAGS_D.c_str();\n    file_prefix += \"/\";\n  }\n  // If we are shape clustering (nullptr shape_table) or we successfully load\n  // a shape_table written by a previous shape clustering, then\n  // shape_analysis will be true, meaning that the MasterTrainer will replace\n  // some members of the unicharset with their fragments.\n  bool shape_analysis = false;\n  if (shape_table != nullptr) {\n    *shape_table = LoadShapeTable(file_prefix);\n    if (*shape_table != nullptr) {\n      shape_analysis = true;\n    }\n  } else {\n    shape_analysis = true;\n  }\n  auto trainer = std::make_unique<MasterTrainer>(NM_CHAR_ANISOTROPIC, shape_analysis, replication,\n                                                 FLAGS_debug_level);\n  IntFeatureSpace fs;\n  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);\n  trainer->LoadUnicharset(FLAGS_U.c_str());\n  // Get basic font information from font_properties.\n  if (!FLAGS_F.empty()) {\n    if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {\n      return {};\n    }\n  }\n 
 if (!FLAGS_X.empty()) {\n    if (!trainer->LoadXHeights(FLAGS_X.c_str())) {\n      return {};\n    }\n  }\n  trainer->SetFeatureSpace(fs);\n  // Load training data from .tr files in filelist (terminated by nullptr).\n  for (const char *page_name = *filelist++; page_name != nullptr; page_name = *filelist++) {\n    tprintf(\"Reading %s ...\\n\", page_name);\n    trainer->ReadTrainingSamples(page_name, feature_defs, false);\n\n    // If there is a file with [lang].[fontname].exp[num].fontinfo present,\n    // read font spacing information in to fontinfo_table.\n    int pagename_len = strlen(page_name);\n    char *fontinfo_file_name = new char[pagename_len + 7];\n    strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove \"tr\"\n    strcpy(fontinfo_file_name + pagename_len - 2, \"fontinfo\"); // +\"fontinfo\"\n    trainer->AddSpacingInfo(fontinfo_file_name);\n    delete[] fontinfo_file_name;\n\n    // Load the images into memory if required by the classifier.\n    if (FLAGS_load_images) {\n      std::string image_name = page_name;\n      // Chop off the tr and replace with tif. 
Extension must be tif!\n      image_name.resize(image_name.length() - 2);\n      image_name += \"tif\";\n      trainer->LoadPageImages(image_name.c_str());\n    }\n  }\n  trainer->PostLoadCleanup();\n  // Write the master trainer if required.\n  if (!FLAGS_output_trainer.empty()) {\n    FILE *fp = fopen(FLAGS_output_trainer.c_str(), \"wb\");\n    if (fp == nullptr) {\n      tprintf(\"Can't create saved trainer data!\\n\");\n    } else {\n      trainer->Serialize(fp);\n      fclose(fp);\n    }\n  }\n  trainer->PreTrainingSetup();\n  if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {\n    fprintf(stderr, \"Failed to save unicharset to file %s\\n\", FLAGS_O.c_str());\n    return {};\n  }\n\n  if (shape_table != nullptr) {\n    // If we previously failed to load a shapetable, then shape clustering\n    // wasn't run so make a flat one now.\n    if (*shape_table == nullptr) {\n      *shape_table = new ShapeTable;\n      trainer->SetupFlatShapeTable(*shape_table);\n      tprintf(\"Flat shape table summary: %s\\n\", (*shape_table)->SummaryStr().c_str());\n    }\n    (*shape_table)->set_unicharset(trainer->unicharset());\n  }\n  return trainer;\n}\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine searches through a list of labeled lists to find\n * a list with the specified label.  
If a matching labeled list\n * cannot be found, nullptr is returned.\n * @param List list to search\n * @param Label label to search for\n * @return Labeled list with the specified label or nullptr.\n * @note Globals: none\n */\nLABELEDLIST FindList(LIST List, const std::string &Label) {\n  LABELEDLIST LabeledList;\n\n  iterate(List) {\n    LabeledList = reinterpret_cast<LABELEDLIST>(List->first_node());\n    if (LabeledList->Label == Label) {\n      return (LabeledList);\n    }\n  }\n  return (nullptr);\n\n} /* FindList */\n\n/*---------------------------------------------------------------------------*/\n// TODO(rays) This is now used only by cntraining. Convert cntraining to use\n// the new method or get rid of it entirely.\n/**\n * This routine reads training samples from a file and\n * places them into a data structure which organizes the\n * samples by FontName and CharName.  It then returns this\n * data structure.\n * @param file open text file to read samples from\n * @param feature_definitions\n * @param feature_name\n * @param max_samples\n * @param unicharset\n * @param training_samples\n */\nvoid ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name,\n                         int max_samples, UNICHARSET *unicharset, FILE *file,\n                         LIST *training_samples) {\n  char buffer[2048];\n  char unichar[UNICHAR_LEN + 1];\n  LABELEDLIST char_sample;\n  FEATURE_SET feature_samples;\n  uint32_t feature_type = ShortNameToFeatureType(feature_definitions, feature_name);\n\n  // Zero out the font_sample_count for all the classes.\n  LIST it = *training_samples;\n  iterate(it) {\n    char_sample = reinterpret_cast<LABELEDLIST>(it->first_node());\n    char_sample->font_sample_count = 0;\n  }\n\n  while (fgets(buffer, 2048, file) != nullptr) {\n    if (buffer[0] == '\\n') {\n      continue;\n    }\n\n    sscanf(buffer, \"%*s %s\", unichar);\n    if (unicharset != nullptr && 
!unicharset->contains_unichar(unichar)) {\n      unicharset->unichar_insert(unichar);\n      if (unicharset->size() > MAX_NUM_CLASSES) {\n        tprintf(\n            \"Error: Size of unicharset in training is \"\n            \"greater than MAX_NUM_CLASSES\\n\");\n        exit(1);\n      }\n    }\n    char_sample = FindList(*training_samples, unichar);\n    if (char_sample == nullptr) {\n      char_sample = new LABELEDLISTNODE(unichar);\n      *training_samples = push(*training_samples, char_sample);\n    }\n    auto char_desc = ReadCharDescription(feature_definitions, file);\n    feature_samples = char_desc->FeatureSets[feature_type];\n    if (char_sample->font_sample_count < max_samples || max_samples <= 0) {\n      char_sample->List = push(char_sample->List, feature_samples);\n      char_sample->SampleCount++;\n      char_sample->font_sample_count++;\n    } else {\n      delete feature_samples;\n    }\n    for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {\n      if (feature_type != i) {\n        delete char_desc->FeatureSets[i];\n      }\n      char_desc->FeatureSets[i] = nullptr;\n    }\n    delete char_desc;\n  }\n} // ReadTrainingSamples\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine deallocates all of the space allocated to\n * the specified list of training samples.\n * @param CharList list of all fonts in document\n */\nvoid FreeTrainingSamples(LIST CharList) {\n  LABELEDLIST char_sample;\n  FEATURE_SET FeatureSet;\n  LIST FeatureList;\n\n  LIST nodes = CharList;\n  iterate(CharList) { /* iterate through all of the fonts */\n    char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());\n    FeatureList = char_sample->List;\n    iterate(FeatureList) { /* iterate through all of the classes */\n      FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());\n      delete FeatureSet;\n    }\n    FreeLabeledList(char_sample);\n  }\n  destroy(nodes);\n} /* 
FreeTrainingSamples */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine deallocates all of the memory consumed by\n * a labeled list.  It does not free any memory which may be\n * consumed by the items in the list.\n * @param LabeledList labeled list to be freed\n * @note Globals: none\n */\nvoid FreeLabeledList(LABELEDLIST LabeledList) {\n  destroy(LabeledList->List);\n  delete LabeledList;\n} /* FreeLabeledList */\n\n/*---------------------------------------------------------------------------*/\n/**\n * This routine reads samples from a LABELEDLIST and enters\n * those samples into a clusterer data structure.  This\n * data structure is then returned to the caller.\n * @param char_sample: LABELEDLIST that holds all the feature information for a\n * @param FeatureDefs\n * @param program_feature_type\n * given character.\n * @return Pointer to new clusterer data structure.\n * @note Globals: None\n */\nCLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample,\n                              const char *program_feature_type) {\n  uint16_t N;\n  CLUSTERER *Clusterer;\n  LIST FeatureList = nullptr;\n  FEATURE_SET FeatureSet = nullptr;\n\n  int32_t desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);\n  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;\n  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);\n\n  FeatureList = char_sample->List;\n  uint32_t CharID = 0;\n  std::vector<float> Sample;\n  iterate(FeatureList) {\n    FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());\n    for (int i = 0; i < FeatureSet->MaxNumFeatures; i++) {\n      if (Sample.empty()) {\n        Sample.resize(N);\n      }\n      for (int j = 0; j < N; j++) {\n        Sample[j] = FeatureSet->Features[i]->Params[j];\n      }\n      MakeSample(Clusterer, &Sample[0], CharID);\n    }\n    CharID++;\n  }\n  return Clusterer;\n\n} /* 
SetUpForClustering */\n\n/*------------------------------------------------------------------------*/\nvoid MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer,\n                              CLUSTERCONFIG *clusterconfig) {\n  PROTOTYPE *Prototype;\n  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;\n\n  LIST pProtoList = ProtoList;\n  iterate(pProtoList) {\n    Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());\n    if (Prototype->Significant || Prototype->Merged) {\n      continue;\n    }\n    float best_dist = 0.125;\n    PROTOTYPE *best_match = nullptr;\n    // Find the nearest alive prototype.\n    LIST list_it = ProtoList;\n    iterate(list_it) {\n      auto *test_p = reinterpret_cast<PROTOTYPE *>(list_it->first_node());\n      if (test_p != Prototype && !test_p->Merged) {\n        float dist = ComputeDistance(Clusterer->SampleSize, Clusterer->ParamDesc, &Prototype->Mean[0],\n                                     &test_p->Mean[0]);\n        if (dist < best_dist) {\n          best_match = test_p;\n          best_dist = dist;\n        }\n      }\n    }\n    if (best_match != nullptr && !best_match->Significant) {\n      if (debug) {\n        auto bestMatchNumSamples = best_match->NumSamples;\n        auto prototypeNumSamples = Prototype->NumSamples;\n        tprintf(\"Merging red clusters (%d+%d) at %g,%g and %g,%g\\n\", bestMatchNumSamples,\n                prototypeNumSamples, best_match->Mean[0], best_match->Mean[1], Prototype->Mean[0],\n                Prototype->Mean[1]);\n      }\n      best_match->NumSamples =\n          MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, best_match->NumSamples,\n                        Prototype->NumSamples, &best_match->Mean[0], &best_match->Mean[0], &Prototype->Mean[0]);\n      Prototype->NumSamples = 0;\n      Prototype->Merged = true;\n    } else if (best_match != nullptr) {\n      if (debug) {\n        tprintf(\"Red proto at %g,%g matched a green one at 
%g,%g\\n\", Prototype->Mean[0],\n                Prototype->Mean[1], best_match->Mean[0], best_match->Mean[1]);\n      }\n      Prototype->Merged = true;\n    }\n  }\n  // Mark significant those that now have enough samples.\n  int min_samples = static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);\n  pProtoList = ProtoList;\n  iterate(pProtoList) {\n    Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());\n    // Process insignificant protos that do not match a green one\n    if (!Prototype->Significant && Prototype->NumSamples >= min_samples && !Prototype->Merged) {\n      if (debug) {\n        tprintf(\"Red proto at %g,%g becoming green\\n\", Prototype->Mean[0], Prototype->Mean[1]);\n      }\n      Prototype->Significant = true;\n    }\n  }\n} /* MergeInsignificantProtos */\n\n/*-----------------------------------------------------------------------------*/\nvoid CleanUpUnusedData(LIST ProtoList) {\n  PROTOTYPE *Prototype;\n\n  iterate(ProtoList) {\n    Prototype = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());\n    delete[] Prototype->Variance.Elliptical;\n    Prototype->Variance.Elliptical = nullptr;\n    delete[] Prototype->Magnitude.Elliptical;\n    Prototype->Magnitude.Elliptical = nullptr;\n    delete[] Prototype->Weight.Elliptical;\n    Prototype->Weight.Elliptical = nullptr;\n  }\n}\n\n/*------------------------------------------------------------------------*/\nLIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)\n\n{\n  LIST NewProtoList = NIL_LIST;\n  auto pProtoList = ProtoList;\n  iterate(pProtoList) {\n    auto Proto = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());\n    if ((Proto->Significant && KeepSigProtos) || (!Proto->Significant && KeepInsigProtos)) {\n      auto NewProto = new PROTOTYPE;\n      NewProto->Mean = Proto->Mean;\n      NewProto->Significant = Proto->Significant;\n      NewProto->Style = Proto->Style;\n      NewProto->NumSamples = 
Proto->NumSamples;\n      NewProto->Cluster = nullptr;\n      NewProto->Distrib.clear();\n\n      if (Proto->Variance.Elliptical != nullptr) {\n        NewProto->Variance.Elliptical = new float[N];\n        for (int i = 0; i < N; i++) {\n          NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];\n        }\n      } else {\n        NewProto->Variance.Elliptical = nullptr;\n      }\n      //---------------------------------------------\n      if (Proto->Magnitude.Elliptical != nullptr) {\n        NewProto->Magnitude.Elliptical = new float[N];\n        for (int i = 0; i < N; i++) {\n          NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];\n        }\n      } else {\n        NewProto->Magnitude.Elliptical = nullptr;\n      }\n      //------------------------------------------------\n      if (Proto->Weight.Elliptical != nullptr) {\n        NewProto->Weight.Elliptical = new float[N];\n        for (int i = 0; i < N; i++) {\n          NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];\n        }\n      } else {\n        NewProto->Weight.Elliptical = nullptr;\n      }\n\n      NewProto->TotalMagnitude = Proto->TotalMagnitude;\n      NewProto->LogMagnitude = Proto->LogMagnitude;\n      NewProtoList = push_last(NewProtoList, NewProto);\n    }\n  }\n  FreeProtoList(&ProtoList);\n  return (NewProtoList);\n} /* RemoveInsignificantProtos */\n\n/*----------------------------------------------------------------------------*/\nMERGE_CLASS FindClass(LIST List, const std::string &Label) {\n  MERGE_CLASS MergeClass;\n\n  iterate(List) {\n    MergeClass = reinterpret_cast<MERGE_CLASS>(List->first_node());\n    if (MergeClass->Label == Label) {\n      return (MergeClass);\n    }\n  }\n  return (nullptr);\n\n} /* FindClass */\n\n/*-----------------------------------------------------------------------------*/\n/**\n * This routine deallocates all of the space allocated to\n * the specified list of training samples.\n * @param ClassList 
list of all fonts in document\n */\nvoid FreeLabeledClassList(LIST ClassList) {\n  MERGE_CLASS MergeClass;\n\n  LIST nodes = ClassList;\n  iterate(ClassList) /* iterate through all of the fonts */\n  {\n    MergeClass = reinterpret_cast<MERGE_CLASS>(ClassList->first_node());\n    FreeClass(MergeClass->Class);\n    delete MergeClass;\n  }\n  destroy(nodes);\n\n} /* FreeLabeledClassList */\n\n/* SetUpForFloat2Int */\nCLASS_STRUCT *SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList) {\n  MERGE_CLASS MergeClass;\n  CLASS_TYPE Class;\n  int NumProtos;\n  int NumConfigs;\n  int NumWords;\n  int i, j;\n  float Values[3];\n  PROTO_STRUCT *NewProto;\n  PROTO_STRUCT *OldProto;\n  BIT_VECTOR NewConfig;\n  BIT_VECTOR OldConfig;\n\n  //  printf(\"Float2Int ...\\n\");\n\n  auto *float_classes = new CLASS_STRUCT[unicharset.size()];\n  iterate(LabeledClassList) {\n    UnicityTable<int> font_set;\n    MergeClass = reinterpret_cast<MERGE_CLASS>(LabeledClassList->first_node());\n    Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label.c_str())];\n    NumProtos = MergeClass->Class->NumProtos;\n    NumConfigs = MergeClass->Class->NumConfigs;\n    font_set.move(&MergeClass->Class->font_set);\n    Class->NumProtos = NumProtos;\n    Class->MaxNumProtos = NumProtos;\n    Class->Prototypes.resize(NumProtos);\n    for (i = 0; i < NumProtos; i++) {\n      NewProto = ProtoIn(Class, i);\n      OldProto = ProtoIn(MergeClass->Class, i);\n      Values[0] = OldProto->X;\n      Values[1] = OldProto->Y;\n      Values[2] = OldProto->Angle;\n      Normalize(Values);\n      NewProto->X = OldProto->X;\n      NewProto->Y = OldProto->Y;\n      NewProto->Length = OldProto->Length;\n      NewProto->Angle = OldProto->Angle;\n      NewProto->A = Values[0];\n      NewProto->B = Values[1];\n      NewProto->C = Values[2];\n    }\n\n    Class->NumConfigs = NumConfigs;\n    Class->MaxNumConfigs = NumConfigs;\n    Class->font_set.move(&font_set);\n    
Class->Configurations.resize(NumConfigs);\n    NumWords = WordsInVectorOfSize(NumProtos);\n    for (i = 0; i < NumConfigs; i++) {\n      NewConfig = NewBitVector(NumProtos);\n      OldConfig = MergeClass->Class->Configurations[i];\n      for (j = 0; j < NumWords; j++) {\n        NewConfig[j] = OldConfig[j];\n      }\n      Class->Configurations[i] = NewConfig;\n    }\n  }\n  return float_classes;\n} // SetUpForFloat2Int\n\n/*--------------------------------------------------------------------------*/\nvoid Normalize(float *Values) {\n  float Slope;\n  float Intercept;\n  float Normalizer;\n\n  Slope = tan(Values[2] * 2 * M_PI);\n  Intercept = Values[1] - Slope * Values[0];\n  Normalizer = 1 / sqrt(Slope * Slope + 1.0);\n\n  Values[0] = Slope * Normalizer;\n  Values[1] = -Normalizer;\n  Values[2] = Intercept * Normalizer;\n} // Normalize\n\n/*-------------------------------------------------------------------------*/\nvoid FreeNormProtoList(LIST CharList)\n\n{\n  LABELEDLIST char_sample;\n\n  LIST nodes = CharList;\n  iterate(CharList) /* iterate through all of the fonts */\n  {\n    char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());\n    FreeLabeledList(char_sample);\n  }\n  destroy(nodes);\n\n} // FreeNormProtoList\n\n/*---------------------------------------------------------------------------*/\nvoid AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName) {\n  auto LabeledProtoList = new LABELEDLISTNODE(CharName.c_str());\n  iterate(ProtoList) {\n    auto Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());\n    LabeledProtoList->List = push(LabeledProtoList->List, Proto);\n  }\n  *NormProtoList = push(*NormProtoList, LabeledProtoList);\n}\n\n/*---------------------------------------------------------------------------*/\nint NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos) {\n  int N = 0;\n  iterate(ProtoList) {\n    auto *Proto = reinterpret_cast<PROTOTYPE 
*>(ProtoList->first_node());\n    if ((Proto->Significant && CountSigProtos) || (!Proto->Significant && CountInsigProtos)) {\n      N++;\n    }\n  }\n  return (N);\n}\n\n} // namespace tesseract.\n\n#endif // def DISABLED_LEGACY_ENGINE\n"
  },
  {
    "path": "src/training/common/commontraining.h",
    "content": "// Copyright 2008 Google Inc. All Rights Reserved.\n// Author: scharron@google.com (Samuel Charron)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_TRAINING_COMMONTRAINING_H_\n#define TESSERACT_TRAINING_COMMONTRAINING_H_\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"commandlineflags.h\"\n#include \"export.h\"\n#include \"tprintf.h\"\n\n#include <tesseract/baseapi.h>\n\n#include <memory>\n\nnamespace tesseract {\n\nTESS_COMMON_TRAINING_API\nvoid ParseArguments(int *argc, char ***argv);\n\n// Check whether the shared tesseract library is the right one.\n// This function must be inline because otherwise it would be part of\n// the shared library, so it could not compare the versions.\nstatic inline void CheckSharedLibraryVersion() {\n#ifdef HAVE_CONFIG_H\n  if (!!strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version())) {\n    tprintf(\n        \"ERROR: shared library version mismatch (was %s, expected %s\\n\"\n        \"Did you use a wrong shared tesseract library?\\n\",\n        TessBaseAPI::Version(), TESSERACT_VERSION_STR);\n    exit(1);\n  }\n#endif\n}\n\n} // namespace tesseract\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n#  include \"cluster.h\"\n#  include \"featdefs.h\"\n#  include \"intproto.h\"\n#  include \"oldlist.h\"\n\nnamespace tesseract {\n\nclass Classify;\nclass MasterTrainer;\nclass ShapeTable;\n\n//////////////////////////////////////////////////////////////////////////////\n// Globals 
///////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////\n\nTESS_COMMON_TRAINING_API\nextern FEATURE_DEFS_STRUCT feature_defs;\n\n// Must be defined in the file that \"implements\" commonTraining facilities.\nTESS_COMMON_TRAINING_API\nextern CLUSTERCONFIG Config;\n\n//////////////////////////////////////////////////////////////////////////////\n// Structs ///////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////\nstruct LABELEDLISTNODE {\n  /// This constructor allocates a new, empty labeled list and gives\n  /// it the specified label.\n  /// @param Label label for new list\n  LABELEDLISTNODE(const char *label) : Label(label) {\n  }\n  std::string Label;\n  int SampleCount = 0;\n  int font_sample_count = 0;\n  LIST List = nullptr;\n};\nusing LABELEDLIST = LABELEDLISTNODE *;\n\nstruct MERGE_CLASS_NODE {\n  MERGE_CLASS_NODE(const char * label) : Label(label), Class(NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS)) {\n  }\n  std::string Label;\n  int NumMerged[MAX_NUM_PROTOS];\n  tesseract::CLASS_TYPE Class;\n};\nusing MERGE_CLASS = MERGE_CLASS_NODE *;\n\n//////////////////////////////////////////////////////////////////////////////\n// Functions /////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////\n\n// Helper loads shape table from the given file.\nShapeTable *LoadShapeTable(const std::string &file_prefix);\n// Helper to write the shape_table.\nTESS_COMMON_TRAINING_API\nvoid WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table);\n\n// Creates a MasterTraininer and loads the training data into it:\n// Initializes feature_defs and IntegerFX.\n// Loads the shape_table if shape_table != nullptr.\n// Loads initial unicharset from -U command-line option.\n// If FLAGS_input_trainer is 
set, loads the majority of data from there, else:\n//   Loads font info from -F option.\n//   Loads xheights from -X option.\n//   Loads samples from .tr files in remaining command-line args.\n//   Deletes outliers and computes canonical samples.\n//   If FLAGS_output_trainer is set, saves the trainer for future use.\n// Computes canonical and cloud features.\n// If shape_table is not nullptr, but failed to load, make a fake flat one,\n// as shape clustering was not run.\nTESS_COMMON_TRAINING_API\nstd::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,\n                                                ShapeTable **shape_table, std::string &file_prefix);\n\nLABELEDLIST FindList(tesseract::LIST List, const std::string &Label);\n\nTESS_COMMON_TRAINING_API\nvoid ReadTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &feature_defs,\n                         const char *feature_name, int max_samples,\n                         tesseract::UNICHARSET *unicharset, FILE *file,\n                         tesseract::LIST *training_samples);\n\nvoid WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory,\n                          tesseract::LIST CharList, const char *program_feature_type);\n\nTESS_COMMON_TRAINING_API\nvoid FreeTrainingSamples(tesseract::LIST CharList);\n\nTESS_COMMON_TRAINING_API\nvoid FreeLabeledList(LABELEDLIST LabeledList);\n\nTESS_COMMON_TRAINING_API\nvoid FreeLabeledClassList(tesseract::LIST ClassListList);\n\nTESS_COMMON_TRAINING_API\ntesseract::CLUSTERER *SetUpForClustering(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs,\n                                         LABELEDLIST CharSample, const char *program_feature_type);\n\nTESS_COMMON_TRAINING_API\ntesseract::LIST RemoveInsignificantProtos(tesseract::LIST ProtoList, bool KeepSigProtos,\n                                          bool KeepInsigProtos, int N);\n\nTESS_COMMON_TRAINING_API\nvoid CleanUpUnusedData(tesseract::LIST 
ProtoList);\n\nTESS_COMMON_TRAINING_API\nvoid MergeInsignificantProtos(tesseract::LIST ProtoList, const char *label,\n                              tesseract::CLUSTERER *Clusterer, tesseract::CLUSTERCONFIG *Config);\n\nTESS_COMMON_TRAINING_API\nMERGE_CLASS FindClass(tesseract::LIST List, const std::string &Label);\n\nTESS_COMMON_TRAINING_API\ntesseract::CLASS_STRUCT *SetUpForFloat2Int(const tesseract::UNICHARSET &unicharset,\n                                           tesseract::LIST LabeledClassList);\n\nvoid Normalize(float *Values);\n\nTESS_COMMON_TRAINING_API\nvoid FreeNormProtoList(tesseract::LIST CharList);\n\nTESS_COMMON_TRAINING_API\nvoid AddToNormProtosList(tesseract::LIST *NormProtoList, tesseract::LIST ProtoList, const std::string &CharName);\n\nTESS_COMMON_TRAINING_API\nint NumberOfProtos(tesseract::LIST ProtoList, bool CountSigProtos, bool CountInsigProtos);\n\nvoid allocNormProtos();\n\n} // namespace tesseract\n\n#endif // def DISABLED_LEGACY_ENGINE\n\n#endif // TESSERACT_TRAINING_COMMONTRAINING_H_\n"
  },
  {
    "path": "src/training/common/ctc.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ctc.cpp\n// Description: Slightly improved standard CTC to compute the targets.\n// Author:      Ray Smith\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"ctc.h\"\n\n#include \"matrix.h\"\n#include \"network.h\"\n#include \"networkio.h\"\n#include \"scrollview.h\"\n\n#include <algorithm>\n#include <cfloat> // for FLT_MAX\n#include <cmath>\n#include <memory>\n\nnamespace tesseract {\n\n// Magic constants that keep CTC stable.\n// Minimum probability limit for softmax input to ctc_loss.\nconst float CTC::kMinProb_ = 1e-12;\n// Maximum absolute argument to exp().\nconst double CTC::kMaxExpArg_ = 80.0;\n// Minimum probability for total prob in time normalization.\nconst double CTC::kMinTotalTimeProb_ = 1e-8;\n// Minimum probability for total prob in final normalization.\nconst double CTC::kMinTotalFinalProb_ = 1e-6;\n\n// Builds a target using CTC. 
Slightly improved as follows:\n// Includes normalizations and clipping for stability.\n// labels should be pre-padded with nulls everywhere.\n// labels can be longer than the time sequence, but the total number of\n// essential labels (non-null plus nulls between equal labels) must not exceed\n// the number of timesteps in outputs.\n// outputs is the output of the network, and should have already been\n// normalized with NormalizeProbs.\n// On return targets is filled with the computed targets.\n// Returns false if there is insufficient time for the labels.\n/* static */\nbool CTC::ComputeCTCTargets(const std::vector<int> &labels, int null_char,\n                            const GENERIC_2D_ARRAY<float> &outputs, NetworkIO *targets) {\n  std::unique_ptr<CTC> ctc(new CTC(labels, null_char, outputs));\n  if (!ctc->ComputeLabelLimits()) {\n    return false; // Not enough time.\n  }\n  // Generate simple targets purely from the truth labels by spreading them\n  // evenly over time.\n  GENERIC_2D_ARRAY<float> simple_targets;\n  ctc->ComputeSimpleTargets(&simple_targets);\n  // Add the simple targets as a starter bias to the network outputs.\n  float bias_fraction = ctc->CalculateBiasFraction();\n  simple_targets *= bias_fraction;\n  ctc->outputs_ += simple_targets;\n  NormalizeProbs(&ctc->outputs_);\n  // Run regular CTC on the biased outputs.\n  // Run forward and backward\n  GENERIC_2D_ARRAY<double> log_alphas, log_betas;\n  ctc->Forward(&log_alphas);\n  ctc->Backward(&log_betas);\n  // Normalize and come out of log space with a clipped softmax over time.\n  log_alphas += log_betas;\n  ctc->NormalizeSequence(&log_alphas);\n  ctc->LabelsToClasses(log_alphas, targets);\n  NormalizeProbs(targets);\n  return true;\n}\n\nCTC::CTC(const std::vector<int> &labels, int null_char, const GENERIC_2D_ARRAY<float> &outputs)\n    : labels_(labels), outputs_(outputs), null_char_(null_char) {\n  num_timesteps_ = outputs.dim1();\n  num_classes_ = outputs.dim2();\n  num_labels_ = 
labels_.size();\n}\n\n// Computes vectors of min and max label index for each timestep, based on\n// whether skippability of nulls makes it possible to complete a valid path.\nbool CTC::ComputeLabelLimits() {\n  min_labels_.clear();\n  min_labels_.resize(num_timesteps_, 0);\n  max_labels_.clear();\n  max_labels_.resize(num_timesteps_, 0);\n  int min_u = num_labels_ - 1;\n  if (labels_[min_u] == null_char_) {\n    --min_u;\n  }\n  for (int t = num_timesteps_ - 1; t >= 0; --t) {\n    min_labels_[t] = min_u;\n    if (min_u > 0) {\n      --min_u;\n      if (labels_[min_u] == null_char_ && min_u > 0 && labels_[min_u + 1] != labels_[min_u - 1]) {\n        --min_u;\n      }\n    }\n  }\n  int max_u = labels_[0] == null_char_;\n  for (int t = 0; t < num_timesteps_; ++t) {\n    max_labels_[t] = max_u;\n    if (max_labels_[t] < min_labels_[t]) {\n      return false; // Not enough room.\n    }\n    if (max_u + 1 < num_labels_) {\n      ++max_u;\n      if (labels_[max_u] == null_char_ && max_u + 1 < num_labels_ &&\n          labels_[max_u + 1] != labels_[max_u - 1]) {\n        ++max_u;\n      }\n    }\n  }\n  return true;\n}\n\n// Computes targets based purely on the labels by spreading the labels evenly\n// over the available timesteps.\nvoid CTC::ComputeSimpleTargets(GENERIC_2D_ARRAY<float> *targets) const {\n  // Initialize all targets to zero.\n  targets->Resize(num_timesteps_, num_classes_, 0.0f);\n  std::vector<float> half_widths;\n  std::vector<int> means;\n  ComputeWidthsAndMeans(&half_widths, &means);\n  for (int l = 0; l < num_labels_; ++l) {\n    int label = labels_[l];\n    float left_half_width = half_widths[l];\n    float right_half_width = left_half_width;\n    int mean = means[l];\n    if (label == null_char_) {\n      if (!NeededNull(l)) {\n        if ((l > 0 && mean == means[l - 1]) || (l + 1 < num_labels_ && mean == means[l + 1])) {\n          continue; // Drop overlapping null.\n        }\n      }\n      // Make sure that no space is left unoccupied and 
that non-nulls always\n      // peak at 1 by stretching nulls to meet their neighbors.\n      if (l > 0) {\n        left_half_width = mean - means[l - 1];\n      }\n      if (l + 1 < num_labels_) {\n        right_half_width = means[l + 1] - mean;\n      }\n    }\n    if (mean >= 0 && mean < num_timesteps_) {\n      targets->put(mean, label, 1.0f);\n    }\n    for (int offset = 1; offset < left_half_width && mean >= offset; ++offset) {\n      float prob = 1.0f - offset / left_half_width;\n      if (mean - offset < num_timesteps_ && prob > targets->get(mean - offset, label)) {\n        targets->put(mean - offset, label, prob);\n      }\n    }\n    for (int offset = 1; offset < right_half_width && mean + offset < num_timesteps_; ++offset) {\n      float prob = 1.0f - offset / right_half_width;\n      if (mean + offset >= 0 && prob > targets->get(mean + offset, label)) {\n        targets->put(mean + offset, label, prob);\n      }\n    }\n  }\n}\n\n// Computes mean positions and half widths of the simple targets by spreading\n// the labels evenly over the available timesteps.\nvoid CTC::ComputeWidthsAndMeans(std::vector<float> *half_widths, std::vector<int> *means) const {\n  // Count the number of labels of each type, in regexp terms, counts plus\n  // (non-null or necessary null, which must occur at least once) and star\n  // (optional null).\n  int num_plus = 0, num_star = 0;\n  for (int i = 0; i < num_labels_; ++i) {\n    if (labels_[i] != null_char_ || NeededNull(i)) {\n      ++num_plus;\n    } else {\n      ++num_star;\n    }\n  }\n  // Compute the size for each type. 
If there is enough space for everything\n  // to have size>=1, then all are equal, otherwise plus_size=1 and star gets\n  // whatever is left-over.\n  float plus_size = 1.0f, star_size = 0.0f;\n  float total_floating = num_plus + num_star;\n  if (total_floating <= num_timesteps_) {\n    plus_size = star_size = num_timesteps_ / total_floating;\n  } else if (num_star > 0) {\n    star_size = static_cast<float>(num_timesteps_ - num_plus) / num_star;\n  }\n  // Set the width and compute the mean of each.\n  float mean_pos = 0.0f;\n  for (int i = 0; i < num_labels_; ++i) {\n    float half_width;\n    if (labels_[i] != null_char_ || NeededNull(i)) {\n      half_width = plus_size / 2.0f;\n    } else {\n      half_width = star_size / 2.0f;\n    }\n    mean_pos += half_width;\n    means->push_back(static_cast<int>(mean_pos));\n    mean_pos += half_width;\n    half_widths->push_back(half_width);\n  }\n}\n\n// Helper returns the index of the highest probability label at timestep t.\nstatic int BestLabel(const GENERIC_2D_ARRAY<float> &outputs, int t) {\n  int result = 0;\n  int num_classes = outputs.dim2();\n  const float *outputs_t = outputs[t];\n  for (int c = 1; c < num_classes; ++c) {\n    if (outputs_t[c] > outputs_t[result]) {\n      result = c;\n    }\n  }\n  return result;\n}\n\n// Calculates and returns a suitable fraction of the simple targets to add\n// to the network outputs.\nfloat CTC::CalculateBiasFraction() {\n  // Compute output labels via basic decoding.\n  std::vector<int> output_labels;\n  for (int t = 0; t < num_timesteps_; ++t) {\n    int label = BestLabel(outputs_, t);\n    while (t + 1 < num_timesteps_ && BestLabel(outputs_, t + 1) == label) {\n      ++t;\n    }\n    if (label != null_char_) {\n      output_labels.push_back(label);\n    }\n  }\n  // Simple bag of labels error calculation.\n  std::vector<int> truth_counts(num_classes_);\n  std::vector<int> output_counts(num_classes_);\n  for (int l = 0; l < num_labels_; ++l) {\n    
++truth_counts[labels_[l]];\n  }\n  for (auto l : output_labels) {\n    ++output_counts[l];\n  }\n  // Count the number of true and false positive non-nulls and truth labels.\n  int true_pos = 0, false_pos = 0, total_labels = 0;\n  for (int c = 0; c < num_classes_; ++c) {\n    if (c == null_char_) {\n      continue;\n    }\n    int truth_count = truth_counts[c];\n    int ocr_count = output_counts[c];\n    if (truth_count > 0) {\n      total_labels += truth_count;\n      if (ocr_count > truth_count) {\n        true_pos += truth_count;\n        false_pos += ocr_count - truth_count;\n      } else {\n        true_pos += ocr_count;\n      }\n    }\n    // We don't need to count classes that don't exist in the truth as\n    // false positives, because they don't affect CTC at all.\n  }\n  if (total_labels == 0) {\n    return 0.0f;\n  }\n  return exp(std::max(true_pos - false_pos, 1) * std::log(kMinProb_) / total_labels);\n}\n\n// Given ln(x) and ln(y), returns ln(x + y), using:\n// ln(x + y) = ln(y) + ln(1 + exp(ln(y) - ln(x)), ensuring that ln(x) is the\n// bigger number to maximize precision.\nstatic double LogSumExp(double ln_x, double ln_y) {\n  if (ln_x >= ln_y) {\n    return ln_x + log1p(exp(ln_y - ln_x));\n  } else {\n    return ln_y + log1p(exp(ln_x - ln_y));\n  }\n}\n\n// Runs the forward CTC pass, filling in log_probs.\nvoid CTC::Forward(GENERIC_2D_ARRAY<double> *log_probs) const {\n  log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);\n  log_probs->put(0, 0, log(outputs_(0, labels_[0])));\n  if (labels_[0] == null_char_) {\n    log_probs->put(0, 1, log(outputs_(0, labels_[1])));\n  }\n  for (int t = 1; t < num_timesteps_; ++t) {\n    const float *outputs_t = outputs_[t];\n    for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {\n      // Continuing the same label.\n      double log_sum = log_probs->get(t - 1, u);\n      // Change from previous label.\n      if (u > 0) {\n        log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 1));\n      }\n   
   // Skip the null if allowed.\n      if (u >= 2 && labels_[u - 1] == null_char_ && labels_[u] != labels_[u - 2]) {\n        log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 2));\n      }\n      // Add in the log prob of the current label.\n      double label_prob = outputs_t[labels_[u]];\n      log_sum += log(label_prob);\n      log_probs->put(t, u, log_sum);\n    }\n  }\n}\n\n// Runs the backward CTC pass, filling in log_probs.\nvoid CTC::Backward(GENERIC_2D_ARRAY<double> *log_probs) const {\n  log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);\n  log_probs->put(num_timesteps_ - 1, num_labels_ - 1, 0.0);\n  if (labels_[num_labels_ - 1] == null_char_) {\n    log_probs->put(num_timesteps_ - 1, num_labels_ - 2, 0.0);\n  }\n  for (int t = num_timesteps_ - 2; t >= 0; --t) {\n    const float *outputs_tp1 = outputs_[t + 1];\n    for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {\n      // Continuing the same label.\n      double log_sum = log_probs->get(t + 1, u) + std::log(outputs_tp1[labels_[u]]);\n      // Change from previous label.\n      if (u + 1 < num_labels_) {\n        double prev_prob = outputs_tp1[labels_[u + 1]];\n        log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 1) + log(prev_prob));\n      }\n      // Skip the null if allowed.\n      if (u + 2 < num_labels_ && labels_[u + 1] == null_char_ && labels_[u] != labels_[u + 2]) {\n        double skip_prob = outputs_tp1[labels_[u + 2]];\n        log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 2) + log(skip_prob));\n      }\n      log_probs->put(t, u, log_sum);\n    }\n  }\n}\n\n// Normalizes and brings probs out of log space with a softmax over time.\nvoid CTC::NormalizeSequence(GENERIC_2D_ARRAY<double> *probs) const {\n  double max_logprob = probs->Max();\n  for (int u = 0; u < num_labels_; ++u) {\n    double total = 0.0;\n    for (int t = 0; t < num_timesteps_; ++t) {\n      // Separate impossible path from unlikely probs.\n      double prob = probs->get(t, u);\n      if 
(prob > -FLT_MAX) {\n        prob = ClippedExp(prob - max_logprob);\n      } else {\n        prob = 0.0;\n      }\n      total += prob;\n      probs->put(t, u, prob);\n    }\n    // Note that although this is a probability distribution over time and\n    // therefore should sum to 1, it is important to allow some labels to be\n    // all zero, (or at least tiny) as it is necessary to skip some blanks.\n    if (total < kMinTotalTimeProb_) {\n      total = kMinTotalTimeProb_;\n    }\n    for (int t = 0; t < num_timesteps_; ++t) {\n      probs->put(t, u, probs->get(t, u) / total);\n    }\n  }\n}\n\n// For each timestep computes the max prob for each class over all\n// instances of the class in the labels_, and sets the targets to\n// the max observed prob.\nvoid CTC::LabelsToClasses(const GENERIC_2D_ARRAY<double> &probs, NetworkIO *targets) const {\n  // For each timestep compute the max prob for each class over all\n  // instances of the class in the labels_.\n  for (int t = 0; t < num_timesteps_; ++t) {\n    float *targets_t = targets->f(t);\n    std::vector<double> class_probs(num_classes_);\n    for (int u = 0; u < num_labels_; ++u) {\n      double prob = probs(t, u);\n      // Note that although Graves specifies sum over all labels of the same\n      // class, we need to allow skipped blanks to go to zero, so they don't\n      // interfere with the non-blanks, so max is better than sum.\n      if (prob > class_probs[labels_[u]]) {\n        class_probs[labels_[u]] = prob;\n      }\n      //         class_probs[labels_[u]] += prob;\n    }\n    int best_class = 0;\n    for (int c = 0; c < num_classes_; ++c) {\n      targets_t[c] = class_probs[c];\n      if (class_probs[c] > class_probs[best_class]) {\n        best_class = c;\n      }\n    }\n  }\n}\n\n// Normalizes the probabilities such that no target has a prob below min_prob,\n// and, provided that the initial total is at least min_total_prob, then all\n// probs will sum to 1, otherwise to sum/min_total_prob. 
The maximum output\n// probability is thus 1 - (num_classes-1)*min_prob.\n/* static */\nvoid CTC::NormalizeProbs(GENERIC_2D_ARRAY<float> *probs) {\n  int num_timesteps = probs->dim1();\n  int num_classes = probs->dim2();\n  for (int t = 0; t < num_timesteps; ++t) {\n    float *probs_t = (*probs)[t];\n    // Compute the total and clip that to prevent amplification of noise.\n    double total = 0.0;\n    for (int c = 0; c < num_classes; ++c) {\n      total += probs_t[c];\n    }\n    if (total < kMinTotalFinalProb_) {\n      total = kMinTotalFinalProb_;\n    }\n    // Compute the increased total as a result of clipping.\n    double increment = 0.0;\n    for (int c = 0; c < num_classes; ++c) {\n      double prob = probs_t[c] / total;\n      if (prob < kMinProb_) {\n        increment += kMinProb_ - prob;\n      }\n    }\n    // Now normalize with clipping. Any additional clipping is negligible.\n    total += increment;\n    for (int c = 0; c < num_classes; ++c) {\n      float prob = probs_t[c] / total;\n      probs_t[c] = std::max(prob, kMinProb_);\n    }\n  }\n}\n\n// Returns true if the label at index is a needed null.\nbool CTC::NeededNull(int index) const {\n  return labels_[index] == null_char_ && index > 0 && index + 1 < num_labels_ &&\n         labels_[index + 1] == labels_[index - 1];\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/common/ctc.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        ctc.h\n// Description: Slightly improved standard CTC to compute the targets.\n// Author:      Ray Smith\n// Created:     Wed Jul 13 15:17:06 PDT 2016\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_CTC_H_\n#define TESSERACT_LSTM_CTC_H_\n\n#include \"export.h\"\n#include \"network.h\"\n#include \"networkio.h\"\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n// Class to encapsulate CTC and simple target generation.\nclass TESS_COMMON_TRAINING_API CTC {\npublic:\n  // Normalizes the probabilities such that no target has a prob below min_prob,\n  // and, provided that the initial total is at least min_total_prob, then all\n  // probs will sum to 1, otherwise to sum/min_total_prob. The maximum output\n  // probability is thus 1 - (num_classes-1)*min_prob.\n  static void NormalizeProbs(NetworkIO *probs) {\n    NormalizeProbs(probs->mutable_float_array());\n  }\n\n  // Builds a target using CTC. Slightly improved as follows:\n  // Includes normalizations and clipping for stability.\n  // labels should be pre-padded with nulls wherever desired, but they don't\n  // have to be between all labels. 
Allows for multi-label codes with no\n  // nulls between.\n  // labels can be longer than the time sequence, but the total number of\n  // essential labels (non-null plus nulls between equal labels) must not exceed\n  // the number of timesteps in outputs.\n  // outputs is the output of the network, and should have already been\n  // normalized with NormalizeProbs.\n  // On return targets is filled with the computed targets.\n  // Returns false if there is insufficient time for the labels.\n  static bool ComputeCTCTargets(const std::vector<int> &truth_labels, int null_char,\n                                const GENERIC_2D_ARRAY<float> &outputs, NetworkIO *targets);\n\nprivate:\n  // Constructor is private as the instance only holds information specific to\n  // the current labels, outputs etc, and is built by the static function.\n  CTC(const std::vector<int> &labels, int null_char, const GENERIC_2D_ARRAY<float> &outputs);\n\n  // Computes vectors of min and max label index for each timestep, based on\n  // whether skippability of nulls makes it possible to complete a valid path.\n  bool ComputeLabelLimits();\n  // Computes targets based purely on the labels by spreading the labels evenly\n  // over the available timesteps.\n  void ComputeSimpleTargets(GENERIC_2D_ARRAY<float> *targets) const;\n  // Computes mean positions and half widths of the simple targets by spreading\n  // the labels even over the available timesteps.\n  void ComputeWidthsAndMeans(std::vector<float> *half_widths, std::vector<int> *means) const;\n  // Calculates and returns a suitable fraction of the simple targets to add\n  // to the network outputs.\n  float CalculateBiasFraction();\n  // Runs the forward CTC pass, filling in log_probs.\n  void Forward(GENERIC_2D_ARRAY<double> *log_probs) const;\n  // Runs the backward CTC pass, filling in log_probs.\n  void Backward(GENERIC_2D_ARRAY<double> *log_probs) const;\n  // Normalizes and brings probs out of log space with a softmax over time.\n  
void NormalizeSequence(GENERIC_2D_ARRAY<double> *probs) const;\n  // For each timestep computes the max prob for each class over all\n  // instances of the class in the labels_, and sets the targets to\n  // the max observed prob.\n  void LabelsToClasses(const GENERIC_2D_ARRAY<double> &probs, NetworkIO *targets) const;\n  // Normalizes the probabilities such that no target has a prob below min_prob,\n  // and, provided that the initial total is at least min_total_prob, then all\n  // probs will sum to 1, otherwise to sum/min_total_prob. The maximum output\n  // probability is thus 1 - (num_classes-1)*min_prob.\n  static void NormalizeProbs(GENERIC_2D_ARRAY<float> *probs);\n  // Returns true if the label at index is a needed null.\n  bool NeededNull(int index) const;\n  // Returns exp(clipped(x)), clipping x to a reasonable range to prevent over/\n  // underflow.\n  static double ClippedExp(double x) {\n    if (x < -kMaxExpArg_) {\n      return exp(-kMaxExpArg_);\n    }\n    if (x > kMaxExpArg_) {\n      return exp(kMaxExpArg_);\n    }\n    return exp(x);\n  }\n\n  // Minimum probability limit for softmax input to ctc_loss.\n  static const float kMinProb_;\n  // Maximum absolute argument to exp().\n  static const double kMaxExpArg_;\n  // Minimum probability for total prob in time normalization.\n  static const double kMinTotalTimeProb_;\n  // Minimum probability for total prob in final normalization.\n  static const double kMinTotalFinalProb_;\n\n  // The truth label indices that are to be matched to outputs_.\n  const std::vector<int> &labels_;\n  // The network outputs.\n  GENERIC_2D_ARRAY<float> outputs_;\n  // The null or \"blank\" label.\n  int null_char_;\n  // Number of timesteps in outputs_.\n  int num_timesteps_;\n  // Number of classes in outputs_.\n  int num_classes_;\n  // Number of labels in labels_.\n  int num_labels_;\n  // Min and max valid label indices for each timestep.\n  std::vector<int> min_labels_;\n  std::vector<int> max_labels_;\n};\n\n} // 
namespace tesseract\n\n#endif // TESSERACT_LSTM_CTC_H_\n"
  },
  {
    "path": "src/training/common/errorcounter.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"errorcounter.h\"\n\n#include \"fontinfo.h\"\n#include \"sampleiterator.h\"\n#include \"shapeclassifier.h\"\n#include \"shapetable.h\"\n#include \"tesserrstream.h\"\n#include \"trainingsample.h\"\n#include \"trainingsampleset.h\"\n#include \"unicity_table.h\"\n\n#include <algorithm>\n#include <ctime>\n\nnamespace tesseract {\n\n// Difference in result rating to be thought of as an \"equal\" choice.\nconst double kRatingEpsilon = 1.0 / 32;\n\n// Tests a classifier, computing its error rate.\n// See errorcounter.h for description of arguments.\n// Iterates over the samples, calling the classifier in normal/silent mode.\n// If the classifier makes a CT_UNICHAR_TOPN_ERR error, and the appropriate\n// report_level is set (4 or greater), it will then call the classifier again\n// with a debug flag and a keep_this argument to find out what is going on.\ndouble ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_level,\n                                      CountTypes boosting_mode, const FontInfoTable &fontinfo_table,\n                                      const std::vector<Image > &page_images, SampleIterator *it,\n                                 
     double *unichar_error, double *scaled_error,\n                                      std::string *fonts_report) {\n  const int fontsize = it->sample_set()->NumFonts();\n  ErrorCounter counter(classifier->GetUnicharset(), fontsize);\n  std::vector<UnicharRating> results;\n\n  clock_t total_time = 0;\n  if (report_level > 1) {\n    total_time = clock();\n  }\n  unsigned total_samples = 0;\n  double unscaled_error = 0.0;\n  // Set a number of samples on which to run the classify debug mode.\n  int error_samples = report_level > 3 ? report_level * report_level : 0;\n  // Iterate over all the samples, accumulating errors.\n  for (it->Begin(); !it->AtEnd(); it->Next()) {\n    TrainingSample *mutable_sample = it->MutableSample();\n    int page_index = mutable_sample->page_num();\n    Image page_pix =\n        0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr;\n    // No debug, no keep this.\n    classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID, &results);\n    bool debug_it = false;\n    int correct_id = mutable_sample->class_id();\n    if (counter.unicharset_.has_special_codes() &&\n        (correct_id == UNICHAR_SPACE || correct_id == UNICHAR_JOINED ||\n         correct_id == UNICHAR_BROKEN)) {\n      // This is junk so use the special counter.\n      debug_it = counter.AccumulateJunk(report_level > 3, results, mutable_sample);\n    } else {\n      debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode, fontinfo_table, results,\n                                          mutable_sample);\n    }\n    if (debug_it && error_samples > 0) {\n      // Running debug, keep the correct answer, and debug the classifier.\n      tprintf(\"Error on sample %d: %s Classifier debug output:\\n\", it->GlobalSampleIndex(),\n              it->sample_set()->SampleToString(*mutable_sample).c_str());\n#ifndef GRAPHICS_DISABLED\n      classifier->DebugDisplay(*mutable_sample, page_pix, correct_id);\n#endif\n 
     --error_samples;\n    }\n    ++total_samples;\n  }\n  // Create the appropriate error report.\n  unscaled_error = counter.ReportErrors(report_level, boosting_mode, fontinfo_table, *it,\n                                        unichar_error, fonts_report);\n  if (scaled_error != nullptr) {\n    *scaled_error = counter.scaled_error_;\n  }\n  if (report_level > 1 && total_samples > 0) {\n    // It is useful to know the time in microseconds/char.\n    total_time = 1000 * (clock() - total_time) / CLOCKS_PER_SEC;\n    tesserr << \"Errors computed in \" << total_time << \"  ms at \"\n            << 1000 * total_time / total_samples << \" μs/char\\n\";\n  }\n  return unscaled_error;\n}\n\n// Tests a pair of classifiers, debugging errors of the new against the old.\n// See errorcounter.h for description of arguments.\n// Iterates over the samples, calling the classifiers in normal/silent mode.\n// If the new_classifier makes a boosting_mode error that the old_classifier\n// does not, it will then call the new_classifier again with a debug flag\n// and a keep_this argument to find out what is going on.\nvoid ErrorCounter::DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier,\n                                  CountTypes boosting_mode, const FontInfoTable &fontinfo_table,\n                                  const std::vector<Image > &page_images, SampleIterator *it) {\n  int fontsize = it->sample_set()->NumFonts();\n  ErrorCounter old_counter(old_classifier->GetUnicharset(), fontsize);\n  ErrorCounter new_counter(new_classifier->GetUnicharset(), fontsize);\n  std::vector<UnicharRating> results;\n\n#if !defined(NDEBUG)\n  int total_samples = 0;\n#endif\n  int error_samples = 25;\n  int total_new_errors = 0;\n  // Iterate over all the samples, accumulating errors.\n  for (it->Begin(); !it->AtEnd(); it->Next()) {\n    TrainingSample *mutable_sample = it->MutableSample();\n    int page_index = mutable_sample->page_num();\n    Image page_pix =\n      
  0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr;\n    // No debug, no keep this.\n    old_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID,\n                                          &results);\n    int correct_id = mutable_sample->class_id();\n    if (correct_id != 0 && !old_counter.AccumulateErrors(true, boosting_mode, fontinfo_table,\n                                                         results, mutable_sample)) {\n      // old classifier was correct, check the new one.\n      new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID,\n                                            &results);\n      if (correct_id != 0 && new_counter.AccumulateErrors(true, boosting_mode, fontinfo_table,\n                                                          results, mutable_sample)) {\n        tprintf(\"New Error on sample %d: Classifier debug output:\\n\", it->GlobalSampleIndex());\n        ++total_new_errors;\n        new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 1, correct_id, &results);\n        if (results.size() > 0 && error_samples > 0) {\n#ifndef GRAPHICS_DISABLED\n          new_classifier->DebugDisplay(*mutable_sample, page_pix, correct_id);\n#endif\n          --error_samples;\n        }\n      }\n    }\n#if !defined(NDEBUG)\n    ++total_samples;\n#endif\n  }\n  tprintf(\"Total new errors = %d\\n\", total_new_errors);\n}\n\n// Constructor is private. 
Only anticipated use of ErrorCounter is via\n// the static ComputeErrorRate.\nErrorCounter::ErrorCounter(const UNICHARSET &unicharset, int fontsize)\n    : scaled_error_(0.0)\n    , rating_epsilon_(kRatingEpsilon)\n    , unichar_counts_(unicharset.size(), unicharset.size(), 0)\n    , ok_score_hist_(0, 101)\n    , bad_score_hist_(0, 101)\n    , unicharset_(unicharset) {\n  Counts empty_counts;\n  font_counts_.clear();\n  font_counts_.resize(fontsize, empty_counts);\n  multi_unichar_counts_.clear();\n  multi_unichar_counts_.resize(unicharset.size(), 0);\n}\n\n// Accumulates the errors from the classifier results on a single sample.\n// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.\n// boosting_mode selects the type of error to be used for boosting and the\n// is_error_ member of sample is set according to whether the required type\n// of error occurred. The font_table provides access to font properties\n// for error counting and shape_table is used to understand the relationship\n// between unichar_ids and shape_ids in the results\nbool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,\n                                    const FontInfoTable &font_table,\n                                    const std::vector<UnicharRating> &results,\n                                    TrainingSample *sample) {\n  int num_results = results.size();\n  int answer_actual_rank = -1;\n  int font_id = sample->font_id();\n  int unichar_id = sample->class_id();\n  sample->set_is_error(false);\n  if (num_results == 0) {\n    // Reject. We count rejects as a separate category, but still mark the\n    // sample as an error in case any training module wants to use that to\n    // improve the classifier.\n    sample->set_is_error(true);\n    ++font_counts_[font_id].n[CT_REJECT];\n  } else {\n    // Find rank of correct unichar answer, using rating_epsilon_ to allow\n    // different answers to score as equal. 
(Ignoring the font.)\n    int epsilon_rank = 0;\n    int answer_epsilon_rank = -1;\n    int num_top_answers = 0;\n    double prev_rating = results[0].rating;\n    bool joined = false;\n    bool broken = false;\n    int res_index = 0;\n    while (res_index < num_results) {\n      if (results[res_index].rating < prev_rating - rating_epsilon_) {\n        ++epsilon_rank;\n        prev_rating = results[res_index].rating;\n      }\n      if (results[res_index].unichar_id == unichar_id && answer_epsilon_rank < 0) {\n        answer_epsilon_rank = epsilon_rank;\n        answer_actual_rank = res_index;\n      }\n      if (results[res_index].unichar_id == UNICHAR_JOINED && unicharset_.has_special_codes()) {\n        joined = true;\n      } else if (results[res_index].unichar_id == UNICHAR_BROKEN &&\n                 unicharset_.has_special_codes()) {\n        broken = true;\n      } else if (epsilon_rank == 0) {\n        ++num_top_answers;\n      }\n      ++res_index;\n    }\n    if (answer_actual_rank != 0) {\n      // Correct result is not absolute top.\n      ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR];\n      if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) {\n        sample->set_is_error(true);\n      }\n    }\n    if (answer_epsilon_rank == 0) {\n      ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK];\n      // Unichar OK, but count if multiple unichars.\n      if (num_top_answers > 1) {\n        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];\n        ++multi_unichar_counts_[unichar_id];\n      }\n      // Check to see if any font in the top choice has attributes that match.\n      // TODO(rays) It is easy to add counters for individual font attributes\n      // here if we want them.\n      if (font_table.SetContainsFontProperties(font_id, results[answer_actual_rank].fonts)) {\n        // Font attributes were matched.\n        // Check for multiple properties.\n        if (font_table.SetContainsMultipleFontProperties(results[answer_actual_rank].fonts)) {\n          
++font_counts_[font_id].n[CT_OK_MULTI_FONT];\n        }\n      } else {\n        // Font attributes weren't matched.\n        ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];\n      }\n    } else {\n      // This is a top unichar error.\n      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];\n      if (boosting_mode == CT_UNICHAR_TOP1_ERR) {\n        sample->set_is_error(true);\n      }\n      // Count maps from unichar id to wrong unichar id.\n      ++unichar_counts_(unichar_id, results[0].unichar_id);\n      if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {\n        // It is also a 2nd choice unichar error.\n        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];\n        if (boosting_mode == CT_UNICHAR_TOP2_ERR) {\n          sample->set_is_error(true);\n        }\n      }\n      if (answer_epsilon_rank < 0) {\n        // It is also a top-n choice unichar error.\n        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];\n        if (boosting_mode == CT_UNICHAR_TOPN_ERR) {\n          sample->set_is_error(true);\n        }\n        answer_epsilon_rank = epsilon_rank;\n      }\n    }\n    // Compute mean number of return values and mean rank of correct answer.\n    font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;\n    font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank;\n    if (joined) {\n      ++font_counts_[font_id].n[CT_OK_JOINED];\n    }\n    if (broken) {\n      ++font_counts_[font_id].n[CT_OK_BROKEN];\n    }\n  }\n  // If it was an error for boosting then sum the weight.\n  if (sample->is_error()) {\n    scaled_error_ += sample->weight();\n    if (debug) {\n      tprintf(\"%d results for char %s font %d :\", num_results,\n              unicharset_.id_to_unichar(unichar_id), font_id);\n      for (int i = 0; i < num_results; ++i) {\n        tprintf(\" %.3f : %s\\n\", results[i].rating,\n                unicharset_.id_to_unichar(results[i].unichar_id));\n      }\n      return true;\n    }\n    int percent = 0;\n    if (num_results > 0) {\n      
percent = IntCastRounded(results[0].rating * 100);\n    }\n    bad_score_hist_.add(percent, 1);\n  } else {\n    int percent = 0;\n    if (answer_actual_rank >= 0) {\n      percent = IntCastRounded(results[answer_actual_rank].rating * 100);\n    }\n    ok_score_hist_.add(percent, 1);\n  }\n  return false;\n}\n\n// Accumulates counts for junk. Counts only whether the junk was correctly\n// rejected or not.\nbool ErrorCounter::AccumulateJunk(bool debug, const std::vector<UnicharRating> &results,\n                                  TrainingSample *sample) {\n  // For junk we accept no answer, or an explicit shape answer matching the\n  // class id of the sample.\n  const int num_results = results.size();\n  const int font_id = sample->font_id();\n  const int unichar_id = sample->class_id();\n  int percent = 0;\n  if (num_results > 0) {\n    percent = IntCastRounded(results[0].rating * 100);\n  }\n  if (num_results > 0 && results[0].unichar_id != unichar_id) {\n    // This is a junk error.\n    ++font_counts_[font_id].n[CT_ACCEPTED_JUNK];\n    sample->set_is_error(true);\n    // It counts as an error for boosting too so sum the weight.\n    scaled_error_ += sample->weight();\n    bad_score_hist_.add(percent, 1);\n    return debug;\n  } else {\n    // Correctly rejected.\n    ++font_counts_[font_id].n[CT_REJECTED_JUNK];\n    sample->set_is_error(false);\n    ok_score_hist_.add(percent, 1);\n  }\n  return false;\n}\n\n// Creates a report of the error rate. The report_level controls the detail\n// that is reported to stderr via tprintf:\n// 0   -> no output.\n// >=1 -> bottom-line error rate.\n// >=3 -> font-level error rate.\n// boosting_mode determines the return value. 
 It selects which (un-weighted)\n// error rate to return.\n// The fontinfo_table from MasterTrainer provides the names of fonts.\n// The it determines the current subset of the training samples.\n// If not nullptr, the top-choice unichar error rate is saved in unichar_error.\n// If not nullptr, the report string is saved in fonts_report.\n// (Ignoring report_level).\ndouble ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,\n                                  const FontInfoTable &fontinfo_table, const SampleIterator &it,\n                                  double *unichar_error, std::string *fonts_report) {\n  // Compute totals over all the fonts and report individual font results\n  // when required.\n  Counts totals;\n  int fontsize = font_counts_.size();\n  for (int f = 0; f < fontsize; ++f) {\n    // Accumulate counts over fonts.\n    totals += font_counts_[f];\n    std::string font_report;\n    if (ReportString(false, font_counts_[f], font_report)) {\n      if (fonts_report != nullptr) {\n        *fonts_report += fontinfo_table.at(f).name;\n        *fonts_report += \": \";\n        *fonts_report += font_report;\n        *fonts_report += \"\\n\";\n      }\n      if (report_level > 2) {\n        // Report individual font error rates.\n        tprintf(\"%s: %s\\n\", fontinfo_table.at(f).name, font_report.c_str());\n      }\n    }\n  }\n  // Report the totals.\n  std::string total_report;\n  bool any_results = ReportString(true, totals, total_report);\n  if (fonts_report != nullptr && fonts_report->empty()) {\n    // Make sure we return something even if there were no samples.\n    *fonts_report = \"NoSamplesFound: \";\n    *fonts_report += total_report;\n    *fonts_report += \"\\n\";\n  }\n  if (report_level > 0) {\n    // Report the totals.\n    if (any_results) {\n      tprintf(\"TOTAL Scaled Err=%.4g%%, %s\\n\", scaled_error_ * 100.0, total_report.c_str());\n    }\n    // Report the worst substitution error only 
for now.\n    if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) {\n      int charsetsize = unicharset_.size();\n      int worst_uni_id = 0;\n      int worst_result_id = 0;\n      int worst_err = 0;\n      for (int u = 0; u < charsetsize; ++u) {\n        for (int v = 0; v < charsetsize; ++v) {\n          if (unichar_counts_(u, v) > worst_err) {\n            worst_err = unichar_counts_(u, v);\n            worst_uni_id = u;\n            worst_result_id = v;\n          }\n        }\n      }\n      if (worst_err > 0) {\n        tprintf(\"Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\\n\", worst_uni_id,\n                unicharset_.id_to_unichar(worst_uni_id), unicharset_.id_to_unichar(worst_result_id),\n                worst_err, totals.n[CT_UNICHAR_TOP1_ERR],\n                100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]);\n      }\n    }\n    tprintf(\"Multi-unichar shape use:\\n\");\n    for (int u = 0; u < multi_unichar_counts_.size(); ++u) {\n      if (multi_unichar_counts_[u] > 0) {\n        tprintf(\"%d multiple answers for unichar: %s\\n\", multi_unichar_counts_[u],\n                unicharset_.id_to_unichar(u));\n      }\n    }\n    tprintf(\"OK Score histogram:\\n\");\n    ok_score_hist_.print();\n    tprintf(\"ERROR Score histogram:\\n\");\n    bad_score_hist_.print();\n  }\n\n  double rates[CT_SIZE];\n  if (!ComputeRates(totals, rates)) {\n    return 0.0;\n  }\n  // Set output values if asked for.\n  if (unichar_error != nullptr) {\n    *unichar_error = rates[CT_UNICHAR_TOP1_ERR];\n  }\n  return rates[boosting_mode];\n}\n\n// Sets the report string to a combined human and machine-readable report\n// string of the error rates.\n// Returns false if there is no data, leaving report unchanged, unless\n// even_if_empty is true.\nbool ErrorCounter::ReportString(bool even_if_empty, const Counts &counts, std::string &report) {\n  // Compute the error rates.\n  double rates[CT_SIZE];\n  if (!ComputeRates(counts, rates) && !even_if_empty) {\n    return false;\n  }\n  // 
Using %.4g%%, the length of the output string should exactly match the\n  // length of the format string, but in case of overflow, allow for +eddd\n  // on each number.\n  const int kMaxExtraLength = 5; // Length of +eddd.\n  // Keep this format string and the snprintf in sync with the CountTypes enum.\n  const char format_str[] =\n      \"Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] \"\n      \"Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, \"\n      \"FontAttr=%.4g%%, Multi=%.4g%%, \"\n      \"Answers=%.3g, Rank=%.3g, \"\n      \"OKjunk=%.4g%%, Badjunk=%.4g%%\";\n  constexpr size_t max_str_len = sizeof(format_str) + kMaxExtraLength * (CT_SIZE - 1) + 1;\n  char formatted_str[max_str_len];\n  snprintf(formatted_str, max_str_len, format_str, rates[CT_UNICHAR_TOP1_ERR] * 100.0,\n           rates[CT_UNICHAR_TOP2_ERR] * 100.0, rates[CT_UNICHAR_TOPN_ERR] * 100.0,\n           rates[CT_UNICHAR_TOPTOP_ERR] * 100.0, rates[CT_OK_MULTI_UNICHAR] * 100.0,\n           rates[CT_OK_JOINED] * 100.0, rates[CT_OK_BROKEN] * 100.0, rates[CT_REJECT] * 100.0,\n           rates[CT_FONT_ATTR_ERR] * 100.0, rates[CT_OK_MULTI_FONT] * 100.0, rates[CT_NUM_RESULTS],\n           rates[CT_RANK], 100.0 * rates[CT_REJECTED_JUNK], 100.0 * rates[CT_ACCEPTED_JUNK]);\n  report = formatted_str;\n  // Now append each field of counts with a tab in front so the result can\n  // be loaded into a spreadsheet.\n  for (int ct : counts.n) {\n    report += \"\\t\" + std::to_string(ct);\n  }\n  return true;\n}\n\n// Computes the error rates and returns in rates which is an array of size\n// CT_SIZE. 
Returns false if there is no data, leaving rates unchanged.\nbool ErrorCounter::ComputeRates(const Counts &counts, double rates[CT_SIZE]) {\n  const int ok_samples =\n      counts.n[CT_UNICHAR_TOP_OK] + counts.n[CT_UNICHAR_TOP1_ERR] + counts.n[CT_REJECT];\n  const int junk_samples = counts.n[CT_REJECTED_JUNK] + counts.n[CT_ACCEPTED_JUNK];\n  // Compute rates for normal chars.\n  double denominator = static_cast<double>(std::max(ok_samples, 1));\n  for (int ct = 0; ct <= CT_RANK; ++ct) {\n    rates[ct] = counts.n[ct] / denominator;\n  }\n  // Compute rates for junk.\n  denominator = static_cast<double>(std::max(junk_samples, 1));\n  for (int ct = CT_REJECTED_JUNK; ct <= CT_ACCEPTED_JUNK; ++ct) {\n    rates[ct] = counts.n[ct] / denominator;\n  }\n  return ok_samples != 0 || junk_samples != 0;\n}\n\nErrorCounter::Counts::Counts() {\n  memset(n, 0, sizeof(n[0]) * CT_SIZE);\n}\n// Adds other into this for computing totals.\nvoid ErrorCounter::Counts::operator+=(const Counts &other) {\n  for (int ct = 0; ct < CT_SIZE; ++ct) {\n    n[ct] += other.n[ct];\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/common/errorcounter.h",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_\n#define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_\n\n#include \"matrix.h\"\n#include \"statistc.h\"\n\nstruct Pix;\n\nnamespace tesseract {\n\ntemplate <typename T>\nclass UnicityTable;\nstruct FontInfo;\nclass FontInfoTable;\nclass SampleIterator;\nclass ShapeClassifier;\nclass TrainingSample;\nstruct UnicharRating;\n\n// Enumeration of the different types of error count.\n// Error counts work as follows:\n//\n// Ground truth is a valid unichar-id / font-id pair:\n//        Number of classifier answers?\n//          0                       >0\n//     CT_REJECT          unichar-id matches top shape?\n//     __________             yes!                      no\n//                   CT_UNICHAR_TOP_OK           CT_UNICHAR_TOP1_ERR\n//      Top shape-id has multiple unichars?   2nd shape unichar id matches?\n//            yes!              no              yes!              no\n//      CT_OK_MULTI_UNICHAR     |              _____    CT_UNICHAR_TOP2_ERR\n//             Font attributes match?                 Any unichar-id matches?\n//              yes!              no                  yes!        
no\n//      CT_FONT_ATTR_OK   CT_FONT_ATTR_ERR          ______  CT_UNICHAR_TOPN_ERR\n//                |       __________________                 _________________\n//      Top shape-id has multiple font attrs?\n//            yes!              no\n//      CT_OK_MULTI_FONT\n//      _____________________________\n//\n// Note that multiple counts may be activated for a single sample!\n//\n// Ground truth is for a fragment/n-gram that is NOT in the unicharset.\n// This is called junk and is expected to be rejected:\n//        Number of classifier answers?\n//          0                       >0\n//     CT_REJECTED_JUNK     CT_ACCEPTED_JUNK\n//\n// Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores\n// the mean rank of the correct result, counting from 0, and with an error\n// receiving the number of answers as the correct rank.\n//\n// Keep in sync with the ReportString function.\nenum CountTypes {\n  CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id.\n  // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of\n  // kRatingEpsilon from the first result in each group. 
The real top choice\n  // is measured using TOPTOP.\n  CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.\n  CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.\n  CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.\n  CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct.\n  CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.\n  CT_OK_JOINED,          // Top shape id is correct but marked joined.\n  CT_OK_BROKEN,          // Top shape id is correct but marked broken.\n  CT_REJECT,             // Classifier hates this.\n  CT_FONT_ATTR_ERR,      // Top unichar OK, but font attributes incorrect.\n  CT_OK_MULTI_FONT,      // CT_FONT_ATTR_OK but there are multiple font attrs.\n  CT_NUM_RESULTS,        // Number of answers produced.\n  CT_RANK,               // Rank of correct answer.\n  CT_REJECTED_JUNK,      // Junk that was correctly rejected.\n  CT_ACCEPTED_JUNK,      // Junk that was incorrectly classified otherwise.\n\n  CT_SIZE // Number of types for array sizing.\n};\n\n// Class to encapsulate all the functionality and sub-structures required\n// to count errors for an isolated character classifier (ShapeClassifier).\nclass ErrorCounter {\npublic:\n  // Computes and returns the unweighted boosting_mode error rate of the given\n  // classifier. 
Can be used for testing, or inside an iterative training\n  // system, including one that uses boosting.\n  // report_levels:\n  // 0 = no output.\n  // 1 = bottom-line error rate.\n  // 2 = bottom-line error rate + time.\n  // 3 = font-level error rate + time.\n  // 4 = list of all errors + short classifier debug output on 16 errors.\n  // 5 = list of all errors + short classifier debug output on 25 errors.\n  // * The boosting_mode determines which error type is used for computing the\n  //   scaled_error output, and setting the is_error flag in the samples.\n  // * The fontinfo_table is used to get string font names for the debug\n  //   output, and also to count font attributes errors.\n  // * The page_images vector may contain a Pix* (which may be nullptr) for each\n  //   page index assigned to the samples.\n  // * The it provides encapsulated iteration over some sample set.\n  // * The outputs unichar_error, scaled_error and totals_report are all\n  //   optional.\n  // * If not nullptr, unichar error gets the top1 unichar error rate.\n  // * Scaled_error gets the error chosen by boosting_mode weighted by the\n  //   weights on the samples.\n  // * Fonts_report gets a string summarizing the error rates for each font in\n  //   both human-readable form and as a tab-separated list of error counts.\n  //   The human-readable form is all before the first tab.\n  // * The return value is the un-weighted version of the scaled_error.\n  static double ComputeErrorRate(ShapeClassifier *classifier, int report_level,\n                                 CountTypes boosting_mode, const FontInfoTable &fontinfo_table,\n                                 const std::vector<Image > &page_images, SampleIterator *it,\n                                 double *unichar_error, double *scaled_error, std::string *fonts_report);\n  // Tests a pair of classifiers, debugging errors of the new against the old.\n  // See errorcounter.h for description of arguments.\n  // Iterates over the 
samples, calling the classifiers in normal/silent mode.\n  // If the new_classifier makes a boosting_mode error that the old_classifier\n  // does not, and the appropriate, it will then call the new_classifier again\n  // with a debug flag and a keep_this argument to find out what is going on.\n  static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier,\n                             CountTypes boosting_mode, const FontInfoTable &fontinfo_table,\n                             const std::vector<Image > &page_images, SampleIterator *it);\n\nprivate:\n  // Simple struct to hold an array of counts.\n  struct Counts {\n    Counts();\n    // Adds other into this for computing totals.\n    void operator+=(const Counts &other);\n\n    int n[CT_SIZE];\n  };\n\n  // Constructor is private. Only anticipated use of ErrorCounter is via\n  // the static ComputeErrorRate.\n  ErrorCounter(const UNICHARSET &unicharset, int fontsize);\n  ~ErrorCounter() = default;\n\n  // Accumulates the errors from the classifier results on a single sample.\n  // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.\n  // boosting_mode selects the type of error to be used for boosting and the\n  // is_error_ member of sample is set according to whether the required type\n  // of error occurred. The font_table provides access to font properties\n  // for error counting and shape_table is used to understand the relationship\n  // between unichar_ids and shape_ids in the results\n  bool AccumulateErrors(bool debug, CountTypes boosting_mode, const FontInfoTable &font_table,\n                        const std::vector<UnicharRating> &results, TrainingSample *sample);\n\n  // Accumulates counts for junk. Counts only whether the junk was correctly\n  // rejected or not.\n  bool AccumulateJunk(bool debug, const std::vector<UnicharRating> &results,\n                      TrainingSample *sample);\n\n  // Creates a report of the error rate. 
The report_level controls the detail\n  // that is reported to stderr via tprintf:\n  // 0   -> no output.\n  // >=1 -> bottom-line error rate.\n  // >=3 -> font-level error rate.\n  // boosting_mode determines the return value. It selects which (un-weighted)\n  // error rate to return.\n  // The fontinfo_table from MasterTrainer provides the names of fonts.\n  // The it determines the current subset of the training samples.\n  // If not nullptr, the top-choice unichar error rate is saved in\n  // unichar_error. If not nullptr, the report string is saved in fonts_report.\n  // (Ignoring report_level).\n  double ReportErrors(int report_level, CountTypes boosting_mode,\n                      const FontInfoTable &fontinfo_table, const SampleIterator &it,\n                      double *unichar_error, std::string *fonts_report);\n\n  // Sets the report string to a combined human and machine-readable report\n  // string of the error rates.\n  // Returns false if there is no data, leaving report unchanged, unless\n  // even_if_empty is true.\n  static bool ReportString(bool even_if_empty, const Counts &counts, std::string &report);\n\n  // Computes the error rates and returns in rates which is an array of size\n  // CT_SIZE. 
Returns false if there is no data, leaving rates unchanged.\n  static bool ComputeRates(const Counts &counts, double rates[CT_SIZE]);\n\n  // Total scaled error used by boosting algorithms.\n  double scaled_error_;\n  // Difference in result rating to be thought of as an \"equal\" choice.\n  double rating_epsilon_;\n  // Vector indexed by font_id from the samples of error accumulators.\n  std::vector<Counts> font_counts_;\n  // Counts of the results that map each unichar_id (from samples) to an\n  // incorrect shape_id.\n  GENERIC_2D_ARRAY<int> unichar_counts_;\n  // Count of the number of times each shape_id occurs, is correct, and multi-\n  // unichar.\n  std::vector<int> multi_unichar_counts_;\n  // Histogram of scores (as percent) for correct answers.\n  STATS ok_score_hist_;\n  // Histogram of scores (as percent) for incorrect answers.\n  STATS bad_score_hist_;\n  // Unicharset for printing character ids in results.\n  const UNICHARSET &unicharset_;\n};\n\n} // namespace tesseract.\n\n#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */\n"
  },
  {
    "path": "src/training/common/export.h",
    "content": "#pragma once\n\n#ifdef CMAKE_BUILD\n#  include <common_training_export.h>\n#endif\n"
  },
  {
    "path": "src/training/common/intfeaturedist.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        intfeaturedist.cpp\n// Description: Fast set-difference-based feature distance calculator.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"intfeaturedist.h\"\n#include \"intfeaturemap.h\"\n\nnamespace tesseract {\n\nIntFeatureDist::IntFeatureDist()\n    : size_(0)\n    , total_feature_weight_(0.0)\n    , feature_map_(nullptr)\n    , features_(nullptr)\n    , features_delta_one_(nullptr)\n    , features_delta_two_(nullptr) {}\n\nIntFeatureDist::~IntFeatureDist() {\n  Clear();\n}\n\n// Initialize the table to the given size of feature space.\nvoid IntFeatureDist::Init(const IntFeatureMap *feature_map) {\n  size_ = feature_map->sparse_size();\n  Clear();\n  feature_map_ = feature_map;\n  features_ = new bool[size_];\n  features_delta_one_ = new bool[size_];\n  features_delta_two_ = new bool[size_];\n  memset(features_, false, size_ * sizeof(features_[0]));\n  memset(features_delta_one_, false, size_ * sizeof(features_delta_one_[0]));\n  memset(features_delta_two_, false, size_ * sizeof(features_delta_two_[0]));\n  total_feature_weight_ = 0.0;\n}\n\n// Setup the map for the given indexed_features that have been indexed by\n// feature_map.\nvoid IntFeatureDist::Set(const 
std::vector<int> &indexed_features, int canonical_count,\n                         bool value) {\n  total_feature_weight_ = canonical_count;\n  for (int f : indexed_features) {\n    features_[f] = value;\n    for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {\n      if (dir == 0) {\n        continue;\n      }\n      const int mapped_f = feature_map_->OffsetFeature(f, dir);\n      if (mapped_f >= 0) {\n        features_delta_one_[mapped_f] = value;\n        for (int dir2 = -kNumOffsetMaps; dir2 <= kNumOffsetMaps; ++dir2) {\n          if (dir2 == 0) {\n            continue;\n          }\n          const int mapped_f2 = feature_map_->OffsetFeature(mapped_f, dir2);\n          if (mapped_f2 >= 0) {\n            features_delta_two_[mapped_f2] = value;\n          }\n        }\n      }\n    }\n  }\n}\n\n// Compute the distance between the given feature vector and the last\n// Set feature vector.\ndouble IntFeatureDist::FeatureDistance(const std::vector<int> &features) const {\n  const int num_test_features = features.size();\n  const double denominator = total_feature_weight_ + num_test_features;\n  double misses = denominator;\n  for (int i = 0; i < num_test_features; ++i) {\n    const int index = features[i];\n    const double weight = 1.0;\n    if (features_[index]) {\n      // A perfect match.\n      misses -= 2.0 * weight;\n    } else if (features_delta_one_[index]) {\n      misses -= 1.5 * weight;\n    } else if (features_delta_two_[index]) {\n      // A near miss.\n      misses -= 1.0 * weight;\n    }\n  }\n  return misses / denominator;\n}\n\n// Compute the distance between the given feature vector and the last\n// Set feature vector.\ndouble IntFeatureDist::DebugFeatureDistance(const std::vector<int> &features) const {\n  const int num_test_features = features.size();\n  const double denominator = total_feature_weight_ + num_test_features;\n  double misses = denominator;\n  for (int i = 0; i < num_test_features; ++i) {\n    const int index = 
features[i];\n    const double weight = 1.0;\n    INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(features[i]);\n    tprintf(\"Testing feature weight %g:\", weight);\n    f.print();\n    if (features_[index]) {\n      // A perfect match.\n      misses -= 2.0 * weight;\n      tprintf(\"Perfect hit\\n\");\n    } else if (features_delta_one_[index]) {\n      misses -= 1.5 * weight;\n      tprintf(\"-1 hit\\n\");\n    } else if (features_delta_two_[index]) {\n      // A near miss.\n      misses -= 1.0 * weight;\n      tprintf(\"-2 hit\\n\");\n    } else {\n      tprintf(\"Total miss\\n\");\n    }\n  }\n  tprintf(\"Features present:\");\n  for (int i = 0; i < size_; ++i) {\n    if (features_[i]) {\n      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);\n      f.print();\n    }\n  }\n  tprintf(\"\\nMinus one features:\");\n  for (int i = 0; i < size_; ++i) {\n    if (features_delta_one_[i]) {\n      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);\n      f.print();\n    }\n  }\n  tprintf(\"\\nMinus two features:\");\n  for (int i = 0; i < size_; ++i) {\n    if (features_delta_two_[i]) {\n      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);\n      f.print();\n    }\n  }\n  tprintf(\"\\n\");\n  return misses / denominator;\n}\n\n// Clear all data.\nvoid IntFeatureDist::Clear() {\n  delete[] features_;\n  features_ = nullptr;\n  delete[] features_delta_one_;\n  features_delta_one_ = nullptr;\n  delete[] features_delta_two_;\n  features_delta_two_ = nullptr;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/common/intfeaturedist.h",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        intfeaturedist.h\n// Description: Fast set-difference-based feature distance calculator.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CLASSIFY_INTFEATUREDIST_H_\n#define TESSERACT_CLASSIFY_INTFEATUREDIST_H_\n\n#include <vector>\n\nnamespace tesseract {\n\nclass IntFeatureMap;\n\n// Feature distance calculator designed to provide a fast distance calculation\n// based on set difference between a given feature set and many other feature\n// sets in turn.\n// Representation of a feature set as an array of bools that are sparsely\n// true, and companion arrays that allow fast feature set distance\n// calculations with allowance of offsets in position.\n// Init is expensive, so for greatest efficiency, to re-initialize for a new\n// feature set, use Set(..., false) on the SAME feature set as was used to\n// setup with Set(..., true), to return to its initialized state before\n// reuse with Set(..., true) on a new feature set.\nclass IntFeatureDist {\npublic:\n  IntFeatureDist();\n  ~IntFeatureDist();\n\n  // Initialize the bool array to the given size of feature space.\n  // The feature_map is just borrowed, and must exist for the entire\n  // lifetime of the 
IntFeatureDist.\n  void Init(const IntFeatureMap *feature_map);\n\n  // Setup the map for the given indexed_features that have been indexed by\n  // feature_map. After use, use Set(..., false) to reset to the initial state\n  // as this is faster than calling Init for sparse spaces.\n  void Set(const std::vector<int> &indexed_features, int canonical_count, bool value);\n\n  // Compute the distance between the given feature vector and the last\n  // Set feature vector.\n  double FeatureDistance(const std::vector<int> &features) const;\n  double DebugFeatureDistance(const std::vector<int> &features) const;\n\nprivate:\n  // Clear all data.\n  void Clear();\n\n  // Size of the indexed feature space.\n  int size_;\n  // Total weight of features currently stored in the maps.\n  double total_feature_weight_;\n  // Pointer to IntFeatureMap given at Init to find offset features.\n  const IntFeatureMap *feature_map_;\n  // Array of bools indicating presence of a feature.\n  bool *features_;\n  // Array indicating the presence of a feature offset by one unit.\n  bool *features_delta_one_;\n  // Array indicating the presence of a feature offset by two units.\n  bool *features_delta_two_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_CLASSIFY_INTFEATUREDIST_H_\n"
  },
  {
    "path": "src/training/common/intfeaturemap.cpp",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        intfeaturemap.cpp\n// Description: Encapsulation of IntFeatureSpace with IndexMapBiDi\n//              to provide a subspace mapping and fast feature lookup.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"intfeaturemap.h\"\n\n#include \"intfeaturespace.h\"\n#include \"intfx.h\"\n// These includes do not exist yet, but will be coming soon.\n//#include \"sampleiterator.h\"\n//#include \"trainingsample.h\"\n//#include \"trainingsampleset.h\"\n\nnamespace tesseract {\n\nconst int kMaxOffsetDist = 32;\n\nIntFeatureMap::IntFeatureMap() : mapping_changed_(true), compact_size_(0) {\n  for (int dir = 0; dir < kNumOffsetMaps; ++dir) {\n    offset_plus_[dir] = nullptr;\n    offset_minus_[dir] = nullptr;\n  }\n}\n\nIntFeatureMap::~IntFeatureMap() {\n  Clear();\n}\n\n// Pseudo-accessors.\nint IntFeatureMap::IndexFeature(const INT_FEATURE_STRUCT &f) const {\n  return feature_space_.Index(f);\n}\nint IntFeatureMap::MapFeature(const INT_FEATURE_STRUCT &f) const {\n  return feature_map_.SparseToCompact(feature_space_.Index(f));\n}\nint IntFeatureMap::MapIndexFeature(int index_feature) const {\n  return feature_map_.SparseToCompact(index_feature);\n}\nINT_FEATURE_STRUCT 
IntFeatureMap::InverseIndexFeature(int index_feature) const {\n  return feature_space_.PositionFromIndex(index_feature);\n}\nINT_FEATURE_STRUCT IntFeatureMap::InverseMapFeature(int map_feature) const {\n  int index = feature_map_.CompactToSparse(map_feature);\n  return feature_space_.PositionFromIndex(index);\n}\nvoid IntFeatureMap::DeleteMapFeature(int map_feature) {\n  feature_map_.Merge(-1, map_feature);\n  mapping_changed_ = true;\n}\nbool IntFeatureMap::IsMapFeatureDeleted(int map_feature) const {\n  return feature_map_.IsCompactDeleted(map_feature);\n}\n\n// Copies the given feature_space and uses it as the index feature map\n// from INT_FEATURE_STRUCT.\nvoid IntFeatureMap::Init(const IntFeatureSpace &feature_space) {\n  feature_space_ = feature_space;\n  mapping_changed_ = false;\n  int sparse_size = feature_space_.Size();\n  feature_map_.Init(sparse_size, true);\n  feature_map_.Setup();\n  compact_size_ = feature_map_.CompactSize();\n  // Initialize look-up tables if needed.\n  FCOORD dir = FeatureDirection(0);\n  if (dir.x() == 0.0f && dir.y() == 0.0f) {\n    InitIntegerFX();\n  }\n  // Compute look-up tables to generate offset features.\n  for (int dir = 0; dir < kNumOffsetMaps; ++dir) {\n    delete[] offset_plus_[dir];\n    delete[] offset_minus_[dir];\n    offset_plus_[dir] = new int[sparse_size];\n    offset_minus_[dir] = new int[sparse_size];\n  }\n  for (int dir = 1; dir <= kNumOffsetMaps; ++dir) {\n    for (int i = 0; i < sparse_size; ++i) {\n      int offset_index = ComputeOffsetFeature(i, dir);\n      offset_plus_[dir - 1][i] = offset_index;\n      offset_index = ComputeOffsetFeature(i, -dir);\n      offset_minus_[dir - 1][i] = offset_index;\n    }\n  }\n}\n\n// Helper to return an offset index feature. In this context an offset\n// feature with a dir of +/-1 is a feature of a similar direction,\n// but shifted perpendicular to the direction of the feature. 
An offset\n// feature with a dir of +/-2 is feature at the same position, but rotated\n// by +/- one [compact] quantum. Returns the index of the generated offset\n// feature, or -1 if it doesn't exist. Dir should be in\n// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.\n// A dir of 0 is an identity transformation.\n// Both input and output are from the index(sparse) feature space, not\n// the mapped/compact feature space, but the offset feature is the minimum\n// distance moved from the input to guarantee that it maps to the next\n// available quantum in the mapped/compact space.\nint IntFeatureMap::OffsetFeature(int index_feature, int dir) const {\n  if (dir > 0 && dir <= kNumOffsetMaps) {\n    return offset_plus_[dir - 1][index_feature];\n  } else if (dir < 0 && -dir <= kNumOffsetMaps) {\n    return offset_minus_[-dir - 1][index_feature];\n  } else if (dir == 0) {\n    return index_feature;\n  } else {\n    return -1;\n  }\n}\n\n//#define EXPERIMENT_ON\n#ifdef EXPERIMENT_ON // This code is commented out as SampleIterator and\n// TrainingSample are not reviewed/checked in yet, but these functions are a\n// useful indicator of how an IntFeatureMap is setup.\n\n// Computes the features used by the subset of samples defined by\n// the iterator and sets up the feature mapping.\n// Returns the size of the compacted feature space.\nint IntFeatureMap::FindNZFeatureMapping(SampleIterator *it) {\n  feature_map_.Init(feature_space_.Size(), false);\n  int total_samples = 0;\n  for (it->Begin(); !it->AtEnd(); it->Next()) {\n    const TrainingSample &sample = it->GetSample();\n    std::vector<int> features;\n    feature_space_.IndexAndSortFeatures(sample.features(), sample.num_features(), &features);\n    int num_features = features.size();\n    for (int f = 0; f < num_features; ++f)\n      feature_map_.SetMap(features[f], true);\n    ++total_samples;\n  }\n  feature_map_.Setup();\n  compact_size_ = feature_map_.CompactSize();\n  mapping_changed_ = 
true;\n  FinalizeMapping(it);\n  tprintf(\"%d non-zero features found in %d samples\\n\", compact_size_, total_samples);\n  return compact_size_;\n}\n#endif\n\n// After deleting some features, finish setting up the mapping, and map\n// all the samples. Returns the size of the compacted feature space.\nint IntFeatureMap::FinalizeMapping(SampleIterator *it) {\n  if (mapping_changed_) {\n    feature_map_.CompleteMerges();\n    compact_size_ = feature_map_.CompactSize();\n#ifdef EXPERIMENT_ON\n    it->MapSampleFeatures(*this);\n#endif\n    mapping_changed_ = false;\n  }\n  return compact_size_;\n}\n\n// Prints the map features from the set in human-readable form.\nvoid IntFeatureMap::DebugMapFeatures(const std::vector<int> &map_features) const {\n  for (int map_feature : map_features) {\n    INT_FEATURE_STRUCT f = InverseMapFeature(map_feature);\n    f.print();\n  }\n}\n\nvoid IntFeatureMap::Clear() {\n  for (int dir = 0; dir < kNumOffsetMaps; ++dir) {\n    delete[] offset_plus_[dir];\n    delete[] offset_minus_[dir];\n    offset_plus_[dir] = nullptr;\n    offset_minus_[dir] = nullptr;\n  }\n}\n\n// Helper to compute an offset index feature. In this context an offset\n// feature with a dir of +/-1 is a feature of a similar direction,\n// but shifted perpendicular to the direction of the feature. An offset\n// feature with a dir of +/-2 is feature at the same position, but rotated\n// by +/- one [compact] quantum. Returns the index of the generated offset\n// feature, or -1 if it doesn't exist. 
Dir should be in\n// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.\n// A dir of 0 is an identity transformation.\n// Both input and output are from the index(sparse) feature space, not\n// the mapped/compact feature space, but the offset feature is the minimum\n// distance moved from the input to guarantee that it maps to the next\n// available quantum in the mapped/compact space.\nint IntFeatureMap::ComputeOffsetFeature(int index_feature, int dir) const {\n  INT_FEATURE_STRUCT f = InverseIndexFeature(index_feature);\n  ASSERT_HOST(IndexFeature(f) == index_feature);\n  if (dir == 0) {\n    return index_feature;\n  } else if (dir == 1 || dir == -1) {\n    FCOORD feature_dir = FeatureDirection(f.Theta);\n    FCOORD rotation90(0.0f, 1.0f);\n    feature_dir.rotate(rotation90);\n    // Find the nearest existing feature.\n    for (int m = 1; m < kMaxOffsetDist; ++m) {\n      double x_pos = f.X + feature_dir.x() * (m * dir);\n      double y_pos = f.Y + feature_dir.y() * (m * dir);\n      int x = IntCastRounded(x_pos);\n      int y = IntCastRounded(y_pos);\n      if (x >= 0 && x <= UINT8_MAX && y >= 0 && y <= UINT8_MAX) {\n        INT_FEATURE_STRUCT offset_f;\n        offset_f.X = x;\n        offset_f.Y = y;\n        offset_f.Theta = f.Theta;\n        int offset_index = IndexFeature(offset_f);\n        if (offset_index != index_feature && offset_index >= 0) {\n          return offset_index; // Found one.\n        }\n      } else {\n        return -1; // Hit the edge of feature space.\n      }\n    }\n  } else if (dir == 2 || dir == -2) {\n    // Find the nearest existing index_feature.\n    for (int m = 1; m < kMaxOffsetDist; ++m) {\n      int theta = f.Theta + m * dir / 2;\n      INT_FEATURE_STRUCT offset_f;\n      offset_f.X = f.X;\n      offset_f.Y = f.Y;\n      offset_f.Theta = Modulo(theta, 256);\n      int offset_index = IndexFeature(offset_f);\n      if (offset_index != index_feature && offset_index >= 0) {\n        return offset_index; // 
Found one.\n      }\n    }\n  }\n  return -1; // Nothing within the max distance.\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/common/intfeaturemap.h",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        intfeaturemap.h\n// Description: Encapsulation of IntFeatureSpace with IndexMapBiDi\n//              to provide a subspace mapping and fast feature lookup.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H_\n#define TESSERACT_CLASSIFY_INTFEATUREMAP_H_\n\n#include \"export.h\"\n#include \"indexmapbidi.h\"\n#include \"intfeaturespace.h\"\n#include \"intproto.h\"\n\nnamespace tesseract {\n\nclass SampleIterator;\n\n// Number of positive and negative offset maps.\nstatic const int kNumOffsetMaps = 2;\n\n// Class to map a feature space defined by INT_FEATURE_STRUCT to a compact\n// down-sampled subspace of actually used features.\n// The IntFeatureMap copes with 2 stages of transformation:\n// The first step is down-sampling (re-quantization) and converting to a\n// single index value from the 3-D input:\n//   INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and\n// the second is a feature-space compaction to map only the feature indices\n// that are actually used. 
This saves space in classifiers that are built\n// using the mapped feature space.\n//   index (sparse) feature <-> map (compact) feature via IndexMapBiDi.\n// Although the transformations are reversible, the inverses are lossy and do\n// not return the exact input INT_FEATURE_STRUCT, due to the many->one nature\n// of both transformations.\nclass TESS_COMMON_TRAINING_API IntFeatureMap {\npublic:\n  IntFeatureMap();\n  ~IntFeatureMap();\n\n  // Accessors.\n  int sparse_size() const {\n    return feature_space_.Size();\n  }\n  int compact_size() const {\n    return compact_size_;\n  }\n  const IntFeatureSpace &feature_space() const {\n    return feature_space_;\n  }\n  const IndexMapBiDi &feature_map() const {\n    return feature_map_;\n  }\n\n  // Pseudo-accessors.\n  int IndexFeature(const INT_FEATURE_STRUCT &f) const;\n  int MapFeature(const INT_FEATURE_STRUCT &f) const;\n  int MapIndexFeature(int index_feature) const;\n  INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const;\n  INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const;\n  void DeleteMapFeature(int map_feature);\n  bool IsMapFeatureDeleted(int map_feature) const;\n\n  // Copies the given feature_space and uses it as the index feature map\n  // from INT_FEATURE_STRUCT.\n  void Init(const IntFeatureSpace &feature_space);\n\n  // Helper to return an offset index feature. In this context an offset\n  // feature with a dir of +/-1 is a feature of a similar direction,\n  // but shifted perpendicular to the direction of the feature. An offset\n  // feature with a dir of +/-2 is feature at the same position, but rotated\n  // by +/- one [compact] quantum. Returns the index of the generated offset\n  // feature, or -1 if it doesn't exist. 
Dir should be in\n  // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.\n  // A dir of 0 is an identity transformation.\n  // Both input and output are from the index(sparse) feature space, not\n  // the mapped/compact feature space, but the offset feature is the minimum\n  // distance moved from the input to guarantee that it maps to the next\n  // available quantum in the mapped/compact space.\n  int OffsetFeature(int index_feature, int dir) const;\n\n  // Computes the features used by the subset of samples defined by\n  // the iterator and sets up the feature mapping.\n  // Returns the size of the compacted feature space.\n  int FindNZFeatureMapping(SampleIterator *it);\n\n  // After deleting some features, finish setting up the mapping, and map\n  // all the samples. Returns the size of the compacted feature space.\n  int FinalizeMapping(SampleIterator *it);\n\n  // Indexes the given array of features to a vector of sorted indices.\n  void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features,\n                            std::vector<int> *sorted_features) const {\n    feature_space_.IndexAndSortFeatures(features, num_features, sorted_features);\n  }\n  // Maps the given array of index/sparse features to an array of map/compact\n  // features.\n  // Assumes the input is sorted. The output indices are sorted and uniqued.\n  // Returns the number of \"missed\" features, being features that\n  // don't map to the compact feature space.\n  int MapIndexedFeatures(const std::vector<int> &index_features,\n                         std::vector<int> *map_features) const {\n    return feature_map_.MapFeatures(index_features, map_features);\n  }\n\n  // Prints the map features from the set in human-readable form.\n  void DebugMapFeatures(const std::vector<int> &map_features) const;\n\nprivate:\n  void Clear();\n\n  // Helper to compute an offset index feature. 
In this context an offset\n  // feature with a dir of +/-1 is a feature of a similar direction,\n  // but shifted perpendicular to the direction of the feature. An offset\n  // feature with a dir of +/-2 is feature at the same position, but rotated\n  // by +/- one [compact] quantum. Returns the index of the generated offset\n  // feature, or -1 if it doesn't exist. Dir should be in\n  // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.\n  // A dir of 0 is an identity transformation.\n  // Both input and output are from the index(sparse) feature space, not\n  // the mapped/compact feature space, but the offset feature is the minimum\n  // distance moved from the input to guarantee that it maps to the next\n  // available quantum in the mapped/compact space.\n  int ComputeOffsetFeature(int index_feature, int dir) const;\n\n  // True if the mapping has changed since it was last finalized.\n  bool mapping_changed_;\n  // Size of the compacted feature space, after unused features are removed.\n  int compact_size_;\n  // Feature space quantization definition and indexing from INT_FEATURE_STRUCT.\n  IntFeatureSpace feature_space_;\n  // Mapping from indexed feature space to the compacted space with unused\n  // features mapping to -1.\n  IndexMapBiDi feature_map_;\n  // Index tables to map a feature index to the corresponding feature after a\n  // shift perpendicular to the feature direction, or a rotation in place.\n  // An entry of -1 indicates that there is no corresponding feature.\n  // Array of arrays of size feature_space_.Size() owned by this class.\n  int *offset_plus_[kNumOffsetMaps];\n  int *offset_minus_[kNumOffsetMaps];\n\n  // Don't use default copy and assign!\n  IntFeatureMap(const IntFeatureMap &);\n  void operator=(const IntFeatureMap &);\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H_\n"
  },
  {
    "path": "src/training/common/mastertrainer.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        mastertrainer.cpp\n// Description: Trainer to build the MasterClassifier.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <allheaders.h>\n#include <cmath>\n#include <ctime>\n#include \"boxread.h\"\n#include \"classify.h\"\n#include \"errorcounter.h\"\n#include \"featdefs.h\"\n#include \"mastertrainer.h\"\n#include \"sampleiterator.h\"\n#include \"shapeclassifier.h\"\n#include \"shapetable.h\"\n#ifndef GRAPHICS_DISABLED\n#  include \"svmnode.h\"\n#endif\n\n#include \"scanutils.h\"\n\nnamespace tesseract {\n\n// Constants controlling clustering. 
With a low kMinClusteredShapes and a high\n// kMaxUnicharsPerCluster, then kFontMergeDistance is the only limiting factor.\n// Min number of shapes in the output.\nconst int kMinClusteredShapes = 1;\n// Max number of unichars in any individual cluster.\nconst int kMaxUnicharsPerCluster = 2000;\n// Mean font distance below which to merge fonts and unichars.\nconst float kFontMergeDistance = 0.025;\n\nMasterTrainer::MasterTrainer(NormalizationMode norm_mode, bool shape_analysis,\n                             bool replicate_samples, int debug_level)\n    : norm_mode_(norm_mode),\n      samples_(fontinfo_table_),\n      junk_samples_(fontinfo_table_),\n      verify_samples_(fontinfo_table_),\n      charsetsize_(0),\n      enable_shape_analysis_(shape_analysis),\n      enable_replication_(replicate_samples),\n      fragments_(nullptr),\n      prev_unichar_id_(-1),\n      debug_level_(debug_level) {}\n\nMasterTrainer::~MasterTrainer() {\n  delete[] fragments_;\n  for (auto &page_image : page_images_) {\n    page_image.destroy();\n  }\n}\n\n// WARNING! Serialize/DeSerialize are only partial, providing\n// enough data to get the samples back and display them.\n// Writes to the given file. 
Returns false in case of error.\nbool MasterTrainer::Serialize(FILE *fp) const {\n  uint32_t value = norm_mode_;\n  if (!tesseract::Serialize(fp, &value)) {\n    return false;\n  }\n  if (!unicharset_.save_to_file(fp)) {\n    return false;\n  }\n  if (!feature_space_.Serialize(fp)) {\n    return false;\n  }\n  if (!samples_.Serialize(fp)) {\n    return false;\n  }\n  if (!junk_samples_.Serialize(fp)) {\n    return false;\n  }\n  if (!verify_samples_.Serialize(fp)) {\n    return false;\n  }\n  if (!master_shapes_.Serialize(fp)) {\n    return false;\n  }\n  if (!flat_shapes_.Serialize(fp)) {\n    return false;\n  }\n  if (!fontinfo_table_.Serialize(fp)) {\n    return false;\n  }\n  if (!tesseract::Serialize(fp, xheights_)) {\n    return false;\n  }\n  return true;\n}\n\n// Load an initial unicharset, or set one up if the file cannot be read.\nvoid MasterTrainer::LoadUnicharset(const char *filename) {\n  if (!unicharset_.load_from_file(filename)) {\n    tprintf(\n        \"Failed to load unicharset from file %s\\n\"\n        \"Building unicharset for training from scratch...\\n\",\n        filename);\n    unicharset_.clear();\n    UNICHARSET initialized;\n    // Add special characters, as they were removed by the clear, but the\n    // default constructor puts them in.\n    unicharset_.AppendOtherUnicharset(initialized);\n  }\n  charsetsize_ = unicharset_.size();\n  delete[] fragments_;\n  fragments_ = new int[charsetsize_];\n  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);\n  samples_.LoadUnicharset(filename);\n  junk_samples_.LoadUnicharset(filename);\n  verify_samples_.LoadUnicharset(filename);\n}\n\n// Reads the samples and their features from the given .tr format file,\n// adding them to the trainer with the font_id from the content of the file.\n// See mftraining.cpp for a description of the file format.\n// If verification, then these are verification samples, not training.\nvoid MasterTrainer::ReadTrainingSamples(const char *page_name,\n            
                            const FEATURE_DEFS_STRUCT &feature_defs,\n                                        bool verification) {\n  char buffer[2048];\n  const int int_feature_type =\n      ShortNameToFeatureType(feature_defs, kIntFeatureType);\n  const int micro_feature_type =\n      ShortNameToFeatureType(feature_defs, kMicroFeatureType);\n  const int cn_feature_type =\n      ShortNameToFeatureType(feature_defs, kCNFeatureType);\n  const int geo_feature_type =\n      ShortNameToFeatureType(feature_defs, kGeoFeatureType);\n\n  FILE *fp = fopen(page_name, \"rb\");\n  if (fp == nullptr) {\n    tprintf(\"Failed to open tr file: %s\\n\", page_name);\n    return;\n  }\n  tr_filenames_.emplace_back(page_name);\n  while (fgets(buffer, sizeof(buffer), fp) != nullptr) {\n    if (buffer[0] == '\\n') {\n      continue;\n    }\n\n    char *space = strchr(buffer, ' ');\n    if (space == nullptr) {\n      tprintf(\"Bad format in tr file, reading fontname, unichar\\n\");\n      continue;\n    }\n    *space++ = '\\0';\n    int font_id = GetFontInfoId(buffer);\n    if (font_id < 0) {\n      font_id = 0;\n    }\n    int page_number;\n    std::string unichar;\n    TBOX bounding_box;\n    if (!ParseBoxFileStr(space, &page_number, unichar, &bounding_box)) {\n      tprintf(\"Bad format in tr file, reading box coords\\n\");\n      continue;\n    }\n    auto char_desc = ReadCharDescription(feature_defs, fp);\n    auto *sample = new TrainingSample;\n    sample->set_font_id(font_id);\n    sample->set_page_num(page_number + page_images_.size());\n    sample->set_bounding_box(bounding_box);\n    sample->ExtractCharDesc(int_feature_type, micro_feature_type,\n                            cn_feature_type, geo_feature_type, char_desc);\n    AddSample(verification, unichar.c_str(), sample);\n    delete char_desc;\n  }\n  charsetsize_ = unicharset_.size();\n  fclose(fp);\n}\n\n// Adds the given single sample to the trainer, setting the classid\n// appropriately from the given unichar_str.\nvoid 
MasterTrainer::AddSample(bool verification, const char *unichar,\n                              TrainingSample *sample) {\n  if (verification) {\n    verify_samples_.AddSample(unichar, sample);\n    prev_unichar_id_ = -1;\n  } else if (unicharset_.contains_unichar(unichar)) {\n    if (prev_unichar_id_ >= 0) {\n      fragments_[prev_unichar_id_] = -1;\n    }\n    prev_unichar_id_ = samples_.AddSample(unichar, sample);\n    if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0) {\n      flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());\n    }\n  } else {\n    const int junk_id = junk_samples_.AddSample(unichar, sample);\n    if (prev_unichar_id_ >= 0) {\n      CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar);\n      if (frag != nullptr && frag->is_natural()) {\n        if (fragments_[prev_unichar_id_] == 0) {\n          fragments_[prev_unichar_id_] = junk_id;\n        } else if (fragments_[prev_unichar_id_] != junk_id) {\n          fragments_[prev_unichar_id_] = -1;\n        }\n      }\n      delete frag;\n    }\n    prev_unichar_id_ = -1;\n  }\n}\n\n// Loads all pages from the given tif filename and append to page_images_.\n// Must be called after ReadTrainingSamples, as the current number of images\n// is used as an offset for page numbers in the samples.\nvoid MasterTrainer::LoadPageImages(const char *filename) {\n  size_t offset = 0;\n  int page;\n  Image pix;\n  for (page = 0;; page++) {\n    pix = pixReadFromMultipageTiff(filename, &offset);\n    if (!pix) {\n      break;\n    }\n    page_images_.push_back(pix);\n    if (!offset) {\n      break;\n    }\n  }\n  tprintf(\"Loaded %d page images from %s\\n\", page, filename);\n}\n\n// Cleans up the samples after initial load from the tr files, and prior to\n// saving the MasterTrainer:\n// Remaps fragmented chars if running shape analysis.\n// Sets up the samples appropriately for class/fontwise access.\n// Deletes outlier samples.\nvoid MasterTrainer::PostLoadCleanup() {\n  
if (debug_level_ > 0) {\n    tprintf(\"PostLoadCleanup...\\n\");\n  }\n  if (enable_shape_analysis_) {\n    ReplaceFragmentedSamples();\n  }\n  SampleIterator sample_it;\n  sample_it.Init(nullptr, nullptr, true, &verify_samples_);\n  sample_it.NormalizeSamples();\n  verify_samples_.OrganizeByFontAndClass();\n\n  samples_.IndexFeatures(feature_space_);\n  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness\n  // against current training.\n  //  samples_.DeleteOutliers(feature_space_, debug_level_ > 0);\n  samples_.OrganizeByFontAndClass();\n  if (debug_level_ > 0) {\n    tprintf(\"ComputeCanonicalSamples...\\n\");\n  }\n  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);\n}\n\n// Gets the samples ready for training. Use after both\n// ReadTrainingSamples+PostLoadCleanup or DeSerialize.\n// Re-indexes the features and computes canonical and cloud features.\nvoid MasterTrainer::PreTrainingSetup() {\n  if (debug_level_ > 0) {\n    tprintf(\"PreTrainingSetup...\\n\");\n  }\n  samples_.IndexFeatures(feature_space_);\n  samples_.ComputeCanonicalFeatures();\n  if (debug_level_ > 0) {\n    tprintf(\"ComputeCloudFeatures...\\n\");\n  }\n  samples_.ComputeCloudFeatures(feature_space_.Size());\n}\n\n// Sets up the master_shapes_ table, which tells which fonts should stay\n// together until they get to a leaf node classifier.\nvoid MasterTrainer::SetupMasterShapes() {\n  tprintf(\"Building master shape table\\n\");\n  const int num_fonts = samples_.NumFonts();\n\n  ShapeTable char_shapes_begin_fragment(samples_.unicharset());\n  ShapeTable char_shapes_end_fragment(samples_.unicharset());\n  ShapeTable char_shapes(samples_.unicharset());\n  for (int c = 0; c < samples_.charsetsize(); ++c) {\n    ShapeTable shapes(samples_.unicharset());\n    for (int f = 0; f < num_fonts; ++f) {\n      if (samples_.NumClassSamples(f, c, true) > 0) {\n        shapes.AddShape(c, f);\n      }\n    }\n    ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, 
&shapes);\n\n    const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);\n\n    if (fragment == nullptr) {\n      char_shapes.AppendMasterShapes(shapes, nullptr);\n    } else if (fragment->is_beginning()) {\n      char_shapes_begin_fragment.AppendMasterShapes(shapes, nullptr);\n    } else if (fragment->is_ending()) {\n      char_shapes_end_fragment.AppendMasterShapes(shapes, nullptr);\n    } else {\n      char_shapes.AppendMasterShapes(shapes, nullptr);\n    }\n  }\n  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,\n                &char_shapes_begin_fragment);\n  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, nullptr);\n  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,\n                &char_shapes_end_fragment);\n  char_shapes.AppendMasterShapes(char_shapes_end_fragment, nullptr);\n  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,\n                &char_shapes);\n  master_shapes_.AppendMasterShapes(char_shapes, nullptr);\n  tprintf(\"Master shape_table:%s\\n\", master_shapes_.SummaryStr().c_str());\n}\n\n// Adds the junk_samples_ to the main samples_ set. 
Junk samples are initially\n// fragments and n-grams (all incorrectly segmented characters).\n// Various training functions may result in incorrectly segmented characters\n// being added to the unicharset of the main samples, perhaps because they\n// form a \"radical\" decomposition of some (Indic) grapheme, or because they\n// just look the same as a real character (like rn/m)\n// This function moves all the junk samples, to the main samples_ set, but\n// desirable junk, being any sample for which the unichar already exists in\n// the samples_ unicharset gets the unichar-ids re-indexed to match, but\n// anything else gets re-marked as unichar_id 0 (space character) to identify\n// it as junk to the error counter.\nvoid MasterTrainer::IncludeJunk() {\n  // Get ids of fragments in junk_samples_ that replace the dead chars.\n  const UNICHARSET &junk_set = junk_samples_.unicharset();\n  const UNICHARSET &sample_set = samples_.unicharset();\n  int num_junks = junk_samples_.num_samples();\n  tprintf(\"Moving %d junk samples to master sample set.\\n\", num_junks);\n  for (int s = 0; s < num_junks; ++s) {\n    TrainingSample *sample = junk_samples_.mutable_sample(s);\n    int junk_id = sample->class_id();\n    const char *junk_utf8 = junk_set.id_to_unichar(junk_id);\n    int sample_id = sample_set.unichar_to_id(junk_utf8);\n    if (sample_id == INVALID_UNICHAR_ID) {\n      sample_id = 0;\n    }\n    sample->set_class_id(sample_id);\n    junk_samples_.extract_sample(s);\n    samples_.AddSample(sample_id, sample);\n  }\n  junk_samples_.DeleteDeadSamples();\n  samples_.OrganizeByFontAndClass();\n}\n\n// Replicates the samples and perturbs them if the enable_replication_ flag\n// is set. 
MUST be used after the last call to OrganizeByFontAndClass on\n// the training samples, ie after IncludeJunk if it is going to be used, as\n// OrganizeByFontAndClass will eat the replicated samples into the regular\n// samples.\nvoid MasterTrainer::ReplicateAndRandomizeSamplesIfRequired() {\n  if (enable_replication_) {\n    if (debug_level_ > 0) {\n      tprintf(\"ReplicateAndRandomize...\\n\");\n    }\n    verify_samples_.ReplicateAndRandomizeSamples();\n    samples_.ReplicateAndRandomizeSamples();\n    samples_.IndexFeatures(feature_space_);\n  }\n}\n\n// Loads the basic font properties file into fontinfo_table_.\n// Returns false on failure.\nbool MasterTrainer::LoadFontInfo(const char *filename) {\n  FILE *fp = fopen(filename, \"rb\");\n  if (fp == nullptr) {\n    fprintf(stderr, \"Failed to load font_properties from %s\\n\", filename);\n    return false;\n  }\n  int italic, bold, fixed, serif, fraktur;\n  while (!feof(fp)) {\n    FontInfo fontinfo;\n    char *font_name = new char[1024];\n    fontinfo.name = font_name;\n    fontinfo.properties = 0;\n    fontinfo.universal_id = 0;\n    if (tfscanf(fp, \"%1024s %i %i %i %i %i\\n\", font_name, &italic, &bold,\n                &fixed, &serif, &fraktur) != 6) {\n      delete[] font_name;\n      continue;\n    }\n    fontinfo.properties = (italic << 0) + (bold << 1) + (fixed << 2) +\n                          (serif << 3) + (fraktur << 4);\n    if (fontinfo_table_.get_index(fontinfo) < 0) {\n      // fontinfo not in table.\n      fontinfo_table_.push_back(fontinfo);\n    } else {\n      delete[] font_name;\n    }\n  }\n  fclose(fp);\n  return true;\n}\n\n// Loads the xheight font properties file into xheights_.\n// Returns false on failure.\nbool MasterTrainer::LoadXHeights(const char *filename) {\n  tprintf(\"fontinfo table is of size %d\\n\", fontinfo_table_.size());\n  xheights_.clear();\n  xheights_.resize(fontinfo_table_.size(), -1);\n  if (filename == nullptr) {\n    return true;\n  }\n  FILE *f = 
fopen(filename, \"rb\");\n  if (f == nullptr) {\n    fprintf(stderr, \"Failed to load font xheights from %s\\n\", filename);\n    return false;\n  }\n  tprintf(\"Reading x-heights from %s ...\\n\", filename);\n  FontInfo fontinfo;\n  fontinfo.properties = 0; // Not used to lookup in the table.\n  fontinfo.universal_id = 0;\n  char buffer[1024];\n  int xht;\n  int total_xheight = 0;\n  int xheight_count = 0;\n  while (!feof(f)) {\n    if (tfscanf(f, \"%1023s %d\\n\", buffer, &xht) != 2) {\n      continue;\n    }\n    buffer[1023] = '\\0';\n    fontinfo.name = buffer;\n    auto fontinfo_id = fontinfo_table_.get_index(fontinfo);\n    if (fontinfo_id < 0) {\n      // fontinfo not in table.\n      continue;\n    }\n    xheights_[fontinfo_id] = xht;\n    total_xheight += xht;\n    ++xheight_count;\n  }\n  if (xheight_count == 0) {\n    fprintf(stderr, \"No valid xheights in %s!\\n\", filename);\n    fclose(f);\n    return false;\n  }\n  int mean_xheight = DivRounded(total_xheight, xheight_count);\n  for (size_t i = 0; i < fontinfo_table_.size(); ++i) {\n    if (xheights_[i] < 0) {\n      xheights_[i] = mean_xheight;\n    }\n  }\n  fclose(f);\n  return true;\n} // LoadXHeights\n\n// Reads spacing stats from filename and adds them to fontinfo_table.\nbool MasterTrainer::AddSpacingInfo(const char *filename) {\n  FILE *fontinfo_file = fopen(filename, \"rb\");\n  if (fontinfo_file == nullptr) {\n    return true; // We silently ignore missing files!\n  }\n  // Find the fontinfo_id.\n  int fontinfo_id = GetBestMatchingFontInfoId(filename);\n  if (fontinfo_id < 0) {\n    tprintf(\"No font found matching fontinfo filename %s\\n\", filename);\n    fclose(fontinfo_file);\n    return false;\n  }\n  tprintf(\"Reading spacing from %s for font %d...\\n\", filename, fontinfo_id);\n  // TODO(rays) scale should probably be a double, but keep as an int for now\n  // to duplicate current behavior.\n  int scale = kBlnXHeight / xheights_[fontinfo_id];\n  int num_unichars;\n  char 
uch[UNICHAR_LEN];\n  char kerned_uch[UNICHAR_LEN];\n  int x_gap, x_gap_before, x_gap_after, num_kerned;\n  ASSERT_HOST(tfscanf(fontinfo_file, \"%d\\n\", &num_unichars) == 1);\n  FontInfo *fi = &fontinfo_table_.at(fontinfo_id);\n  fi->init_spacing(unicharset_.size());\n  FontSpacingInfo *spacing = nullptr;\n  for (int l = 0; l < num_unichars; ++l) {\n    if (tfscanf(fontinfo_file, \"%s %d %d %d\", uch, &x_gap_before, &x_gap_after,\n                &num_kerned) != 4) {\n      tprintf(\"Bad format of font spacing file %s\\n\", filename);\n      fclose(fontinfo_file);\n      return false;\n    }\n    bool valid = unicharset_.contains_unichar(uch);\n    if (valid) {\n      spacing = new FontSpacingInfo();\n      spacing->x_gap_before = static_cast<int16_t>(x_gap_before * scale);\n      spacing->x_gap_after = static_cast<int16_t>(x_gap_after * scale);\n    }\n    for (int k = 0; k < num_kerned; ++k) {\n      if (tfscanf(fontinfo_file, \"%s %d\", kerned_uch, &x_gap) != 2) {\n        tprintf(\"Bad format of font spacing file %s\\n\", filename);\n        fclose(fontinfo_file);\n        delete spacing;\n        return false;\n      }\n      if (!valid || !unicharset_.contains_unichar(kerned_uch)) {\n        continue;\n      }\n      spacing->kerned_unichar_ids.push_back(\n          unicharset_.unichar_to_id(kerned_uch));\n      spacing->kerned_x_gaps.push_back(static_cast<int16_t>(x_gap * scale));\n    }\n    if (valid) {\n      fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);\n    }\n  }\n  fclose(fontinfo_file);\n  return true;\n}\n\n// Returns the font id corresponding to the given font name.\n// Returns -1 if the font cannot be found.\nint MasterTrainer::GetFontInfoId(const char *font_name) {\n  FontInfo fontinfo;\n  // We are only borrowing the string, so it is OK to const cast it.\n  fontinfo.name = const_cast<char *>(font_name);\n  fontinfo.properties = 0; // Not used to lookup in the table\n  fontinfo.universal_id = 0;\n  return 
fontinfo_table_.get_index(fontinfo);\n}\n// Returns the font_id of the closest matching font name to the given\n// filename. It is assumed that a substring of the filename will match\n// one of the fonts. If more than one is matched, the longest is returned.\nint MasterTrainer::GetBestMatchingFontInfoId(const char *filename) {\n  int fontinfo_id = -1;\n  int best_len = 0;\n  for (size_t f = 0; f < fontinfo_table_.size(); ++f) {\n    if (strstr(filename, fontinfo_table_.at(f).name) != nullptr) {\n      int len = strlen(fontinfo_table_.at(f).name);\n      // Use the longest matching length in case a substring of a font matched.\n      if (len > best_len) {\n        best_len = len;\n        fontinfo_id = f;\n      }\n    }\n  }\n  return fontinfo_id;\n}\n\n// Sets up a flat shapetable with one shape per class/font combination.\nvoid MasterTrainer::SetupFlatShapeTable(ShapeTable *shape_table) {\n  // To exactly mimic the results of the previous implementation, the shapes\n  // must be clustered in order the fonts arrived, and reverse order of the\n  // characters within each font.\n  // Get a list of the fonts in the order they appeared.\n  std::vector<int> active_fonts;\n  int num_shapes = flat_shapes_.NumShapes();\n  for (int s = 0; s < num_shapes; ++s) {\n    int font = flat_shapes_.GetShape(s)[0].font_ids[0];\n    unsigned f = 0;\n    for (f = 0; f < active_fonts.size(); ++f) {\n      if (active_fonts[f] == font) {\n        break;\n      }\n    }\n    if (f == active_fonts.size()) {\n      active_fonts.push_back(font);\n    }\n  }\n  // For each font in order, add all the shapes with that font in reverse order.\n  int num_fonts = active_fonts.size();\n  for (int f = 0; f < num_fonts; ++f) {\n    for (int s = num_shapes - 1; s >= 0; --s) {\n      int font = flat_shapes_.GetShape(s)[0].font_ids[0];\n      if (font == active_fonts[f]) {\n        shape_table->AddShape(flat_shapes_.GetShape(s));\n      }\n    }\n  }\n}\n\n// Sets up a Clusterer for mftraining on a 
single shape_id.\n// Call FreeClusterer on the return value after use.\nCLUSTERER *MasterTrainer::SetupForClustering(\n    const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs,\n    int shape_id, int *num_samples) {\n  int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);\n  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;\n  ASSERT_HOST(num_params == (int)MicroFeatureParameter::MFCount);\n  CLUSTERER *clusterer = MakeClusterer(\n      num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);\n\n  // We want to iterate over the samples of just the one shape.\n  IndexMapBiDi shape_map;\n  shape_map.Init(shape_table.NumShapes(), false);\n  shape_map.SetMap(shape_id, true);\n  shape_map.Setup();\n  // Reverse the order of the samples to match the previous behavior.\n  std::vector<const TrainingSample *> sample_ptrs;\n  SampleIterator it;\n  it.Init(&shape_map, &shape_table, false, &samples_);\n  for (it.Begin(); !it.AtEnd(); it.Next()) {\n    sample_ptrs.push_back(&it.GetSample());\n  }\n  uint32_t sample_id = 0;\n  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {\n    const TrainingSample *sample = sample_ptrs[i];\n    uint32_t num_features = sample->num_micro_features();\n    for (uint32_t f = 0; f < num_features; ++f) {\n      MakeSample(clusterer, sample->micro_features()[f].data(), sample_id);\n    }\n    ++sample_id;\n  }\n  *num_samples = sample_id;\n  return clusterer;\n}\n\n// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp\n// to the given inttemp_file, and the corresponding pffmtable.\n// The unicharset is the original encoding of graphemes, and shape_set should\n// match the size of the shape_table, and may possibly be totally fake.\nvoid MasterTrainer::WriteInttempAndPFFMTable(const UNICHARSET &unicharset,\n                                             const UNICHARSET &shape_set,\n                                             const ShapeTable &shape_table,\n             
                                CLASS_STRUCT *float_classes,\n                                             const char *inttemp_file,\n                                             const char *pffmtable_file) {\n  auto *classify = new tesseract::Classify();\n  // Move the fontinfo table to classify.\n  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());\n  INT_TEMPLATES_STRUCT *int_templates =\n      classify->CreateIntTemplates(float_classes, shape_set);\n  FILE *fp = fopen(inttemp_file, \"wb\");\n  if (fp == nullptr) {\n    tprintf(\"Error, failed to open file \\\"%s\\\"\\n\", inttemp_file);\n  } else {\n    classify->WriteIntTemplates(fp, int_templates, shape_set);\n    fclose(fp);\n  }\n  // Now write pffmtable. This is complicated by the fact that the adaptive\n  // classifier still wants one indexed by unichar-id, but the static\n  // classifier needs one indexed by its shape class id.\n  // We put the shapetable_cutoffs in a vector, and compute the\n  // unicharset cutoffs along the way.\n  std::vector<uint16_t> shapetable_cutoffs;\n  std::vector<uint16_t> unichar_cutoffs(unicharset.size());\n  /* then write out each class */\n  for (unsigned i = 0; i < int_templates->NumClasses; ++i) {\n    INT_CLASS_STRUCT *Class = ClassForClassId(int_templates, i);\n    // Todo: Test with min instead of max\n    // int MaxLength = LengthForConfigId(Class, 0);\n    uint16_t max_length = 0;\n    for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {\n      // Todo: Test with min instead of max\n      // if (LengthForConfigId (Class, config_id) < MaxLength)\n      uint16_t length = Class->ConfigLengths[config_id];\n      if (length > max_length) {\n        max_length = Class->ConfigLengths[config_id];\n      }\n      int shape_id = float_classes[i].font_set.at(config_id);\n      const Shape &shape = shape_table.GetShape(shape_id);\n      for (int c = 0; c < shape.size(); ++c) {\n        int unichar_id = shape[c].unichar_id;\n        if (length > 
unichar_cutoffs[unichar_id]) {\n          unichar_cutoffs[unichar_id] = length;\n        }\n      }\n    }\n    shapetable_cutoffs.push_back(max_length);\n  }\n  fp = fopen(pffmtable_file, \"wb\");\n  if (fp == nullptr) {\n    tprintf(\"Error, failed to open file \\\"%s\\\"\\n\", pffmtable_file);\n  } else {\n    tesseract::Serialize(fp, shapetable_cutoffs);\n    for (size_t c = 0; c < unicharset.size(); ++c) {\n      const char *unichar = unicharset.id_to_unichar(c);\n      if (strcmp(unichar, \" \") == 0) {\n        unichar = \"NULL\";\n      }\n      fprintf(fp, \"%s %d\\n\", unichar, unichar_cutoffs[c]);\n    }\n    fclose(fp);\n  }\n  delete int_templates;\n  delete classify;\n}\n\n// Generate debug output relating to the canonical distance between the\n// two given UTF8 grapheme strings.\nvoid MasterTrainer::DebugCanonical(const char *unichar_str1,\n                                   const char *unichar_str2) {\n  int class_id1 = unicharset_.unichar_to_id(unichar_str1);\n  int class_id2 = unicharset_.unichar_to_id(unichar_str2);\n  if (class_id2 == INVALID_UNICHAR_ID) {\n    class_id2 = class_id1;\n  }\n  if (class_id1 == INVALID_UNICHAR_ID) {\n    tprintf(\"No unicharset entry found for %s\\n\", unichar_str1);\n    return;\n  } else {\n    tprintf(\"Font ambiguities for unichar %d = %s and %d = %s\\n\", class_id1,\n            unichar_str1, class_id2, unichar_str2);\n  }\n  int num_fonts = samples_.NumFonts();\n  const IntFeatureMap &feature_map = feature_map_;\n  // Iterate the fonts to get the similarity with other fonst of the same\n  // class.\n  tprintf(\"      \");\n  for (int f = 0; f < num_fonts; ++f) {\n    if (samples_.NumClassSamples(f, class_id2, false) == 0) {\n      continue;\n    }\n    tprintf(\"%6d\", f);\n  }\n  tprintf(\"\\n\");\n  for (int f1 = 0; f1 < num_fonts; ++f1) {\n    // Map the features of the canonical_sample.\n    if (samples_.NumClassSamples(f1, class_id1, false) == 0) {\n      continue;\n    }\n    tprintf(\"%4d  \", f1);\n   
 for (int f2 = 0; f2 < num_fonts; ++f2) {\n      if (samples_.NumClassSamples(f2, class_id2, false) == 0) {\n        continue;\n      }\n      float dist =\n          samples_.ClusterDistance(f1, class_id1, f2, class_id2, feature_map);\n      tprintf(\" %5.3f\", dist);\n    }\n    tprintf(\"\\n\");\n  }\n  // Build a fake ShapeTable containing all the sample types.\n  ShapeTable shapes(unicharset_);\n  for (int f = 0; f < num_fonts; ++f) {\n    if (samples_.NumClassSamples(f, class_id1, true) > 0) {\n      shapes.AddShape(class_id1, f);\n    }\n    if (class_id1 != class_id2 &&\n        samples_.NumClassSamples(f, class_id2, true) > 0) {\n      shapes.AddShape(class_id2, f);\n    }\n  }\n}\n\n#ifndef GRAPHICS_DISABLED\n// Debugging for cloud/canonical features.\n// Displays a Features window containing:\n// If unichar_str2 is in the unicharset, and canonical_font is non-negative,\n// displays the canonical features of the char/font combination in red.\n// If unichar_str1 is in the unicharset, and cloud_font is non-negative,\n// displays the cloud feature of the char/font combination in green.\n// The canonical features are drawn first to show which ones have no\n// matches in the cloud features.\n// Until the features window is destroyed, each click in the features window\n// will display the samples that have that feature in a separate window.\nvoid MasterTrainer::DisplaySamples(const char *unichar_str1, int cloud_font,\n                                   const char *unichar_str2,\n                                   int canonical_font) {\n  const IntFeatureMap &feature_map = feature_map_;\n  const IntFeatureSpace &feature_space = feature_map.feature_space();\n  ScrollView *f_window = CreateFeatureSpaceWindow(\"Features\", 100, 500);\n  ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? 
baseline : character,\n                          f_window);\n  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);\n  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {\n    const TrainingSample *sample =\n        samples_.GetCanonicalSample(canonical_font, class_id2);\n    for (uint32_t f = 0; f < sample->num_features(); ++f) {\n      RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);\n    }\n  }\n  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);\n  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {\n    const BitVector &cloud = samples_.GetCloudFeatures(cloud_font, class_id1);\n    for (int f = 0; f < cloud.size(); ++f) {\n      if (cloud[f]) {\n        INT_FEATURE_STRUCT feature = feature_map.InverseIndexFeature(f);\n        RenderIntFeature(f_window, &feature, ScrollView::GREEN);\n      }\n    }\n  }\n  f_window->Update();\n  ScrollView *s_window = CreateFeatureSpaceWindow(\"Samples\", 100, 500);\n  SVEventType ev_type;\n  do {\n    // Wait until a click or popup event.\n    auto ev = f_window->AwaitEvent(SVET_ANY);\n    ev_type = ev->type;\n    if (ev_type == SVET_CLICK) {\n      int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);\n      if (feature_index >= 0) {\n        // Iterate samples and display those with the feature.\n        Shape shape;\n        shape.AddToShape(class_id1, cloud_font);\n        s_window->Clear();\n        samples_.DisplaySamplesWithFeature(feature_index, shape, feature_space,\n                                           ScrollView::GREEN, s_window);\n        s_window->Update();\n      }\n    }\n  } while (ev_type != SVET_DESTROY);\n}\n#endif // !GRAPHICS_DISABLED\n\nvoid MasterTrainer::TestClassifierVOld(bool replicate_samples,\n                                       ShapeClassifier *test_classifier,\n                                       ShapeClassifier *old_classifier) {\n  SampleIterator sample_it;\n  sample_it.Init(nullptr, nullptr, 
replicate_samples, &samples_);\n  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,\n                               CT_UNICHAR_TOPN_ERR, fontinfo_table_,\n                               page_images_, &sample_it);\n}\n\n// Tests the given test_classifier on the internal samples.\n// See TestClassifier for details.\nvoid MasterTrainer::TestClassifierOnSamples(CountTypes error_mode,\n                                            int report_level,\n                                            bool replicate_samples,\n                                            ShapeClassifier *test_classifier,\n                                            std::string *report_string) {\n  TestClassifier(error_mode, report_level, replicate_samples, &samples_,\n                 test_classifier, report_string);\n}\n\n// Tests the given test_classifier on the given samples.\n// error_mode indicates what counts as an error.\n// report_levels:\n// 0 = no output.\n// 1 = bottom-line error rate.\n// 2 = bottom-line error rate + time.\n// 3 = font-level error rate + time.\n// 4 = list of all errors + short classifier debug output on 16 errors.\n// 5 = list of all errors + short classifier debug output on 25 errors.\n// If replicate_samples is true, then the test is run on an extended test\n// sample including replicated and systematically perturbed samples.\n// If report_string is non-nullptr, a summary of the results for each font\n// is appended to the report_string.\ndouble MasterTrainer::TestClassifier(CountTypes error_mode, int report_level,\n                                     bool replicate_samples,\n                                     TrainingSampleSet *samples,\n                                     ShapeClassifier *test_classifier,\n                                     std::string *report_string) {\n  SampleIterator sample_it;\n  sample_it.Init(nullptr, nullptr, replicate_samples, samples);\n  if (report_level > 0) {\n    int num_samples = 0;\n    for (sample_it.Begin(); 
!sample_it.AtEnd(); sample_it.Next()) {\n      ++num_samples;\n    }\n    tprintf(\"Iterator has charset size of %d/%d, %d shapes, %d samples\\n\",\n            sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),\n            test_classifier->GetShapeTable()->NumShapes(), num_samples);\n    tprintf(\"Testing %sREPLICATED:\\n\", replicate_samples ? \"\" : \"NON-\");\n  }\n  double unichar_error = 0.0;\n  ErrorCounter::ComputeErrorRate(test_classifier, report_level, error_mode,\n                                 fontinfo_table_, page_images_, &sample_it,\n                                 &unichar_error, nullptr, report_string);\n  return unichar_error;\n}\n\n// Returns the average (in some sense) distance between the two given\n// shapes, which may contain multiple fonts and/or unichars.\nfloat MasterTrainer::ShapeDistance(const ShapeTable &shapes, int s1, int s2) {\n  const IntFeatureMap &feature_map = feature_map_;\n  const Shape &shape1 = shapes.GetShape(s1);\n  const Shape &shape2 = shapes.GetShape(s2);\n  int num_chars1 = shape1.size();\n  int num_chars2 = shape2.size();\n  float dist_sum = 0.0f;\n  int dist_count = 0;\n  if (num_chars1 > 1 || num_chars2 > 1) {\n    // In the multi-char case try to optimize the calculation by computing\n    // distances between characters of matching font where possible.\n    for (int c1 = 0; c1 < num_chars1; ++c1) {\n      for (int c2 = 0; c2 < num_chars2; ++c2) {\n        dist_sum +=\n            samples_.UnicharDistance(shape1[c1], shape2[c2], true, feature_map);\n        ++dist_count;\n      }\n    }\n  } else {\n    // In the single unichar case, there is little alternative, but to compute\n    // the squared-order distance between pairs of fonts.\n    dist_sum =\n        samples_.UnicharDistance(shape1[0], shape2[0], false, feature_map);\n    ++dist_count;\n  }\n  return dist_sum / dist_count;\n}\n\n// Replaces samples that are always fragmented with the corresponding\n// fragment samples.\nvoid 
MasterTrainer::ReplaceFragmentedSamples() {\n  if (fragments_ == nullptr) {\n    return;\n  }\n  // Remove samples that are replaced by fragments. Each class that was\n  // always naturally fragmented should be replaced by its fragments.\n  int num_samples = samples_.num_samples();\n  for (int s = 0; s < num_samples; ++s) {\n    TrainingSample *sample = samples_.mutable_sample(s);\n    if (fragments_[sample->class_id()] > 0) {\n      samples_.KillSample(sample);\n    }\n  }\n  samples_.DeleteDeadSamples();\n\n  // Get ids of fragments in junk_samples_ that replace the dead chars.\n  const UNICHARSET &frag_set = junk_samples_.unicharset();\n#if 0\n  // TODO(rays) The original idea was to replace only graphemes that were\n  // always naturally fragmented, but that left a lot of the Indic graphemes\n  // out. Determine whether we can go back to that idea now that spacing\n  // is fixed in the training images, or whether this code is obsolete.\n  bool* good_junk = new bool[frag_set.size()];\n  memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());\n  for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {\n    int frag_ch = fragments_[dead_ch];\n    if (frag_ch <= 0) continue;\n    const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);\n    CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);\n    // Mark the chars for all parts of the fragment as good in good_junk.\n    for (int part = 0; part < frag->get_total(); ++part) {\n      frag->set_pos(part);\n      int good_ch = frag_set.unichar_to_id(frag->to_string().c_str());\n      if (good_ch != INVALID_UNICHAR_ID)\n        good_junk[good_ch] = true;  // We want this one.\n    }\n    delete frag;\n  }\n#endif\n  // For now just use all the junk that was from natural fragments.\n  // Get samples of fragments in junk_samples_ that replace the dead chars.\n  int num_junks = junk_samples_.num_samples();\n  for (int s = 0; s < num_junks; ++s) {\n    TrainingSample *sample = 
junk_samples_.mutable_sample(s);\n    int junk_id = sample->class_id();\n    const char *frag_utf8 = frag_set.id_to_unichar(junk_id);\n    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);\n    if (frag != nullptr && frag->is_natural()) {\n      junk_samples_.extract_sample(s);\n      samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);\n    }\n    delete frag;\n  }\n  junk_samples_.DeleteDeadSamples();\n  junk_samples_.OrganizeByFontAndClass();\n  samples_.OrganizeByFontAndClass();\n  unicharset_.clear();\n  unicharset_.AppendOtherUnicharset(samples_.unicharset());\n  // delete [] good_junk;\n  // Fragments_ no longer needed?\n  delete[] fragments_;\n  fragments_ = nullptr;\n}\n\n// Runs a hierarchical agglomerative clustering to merge shapes in the given\n// shape_table, while satisfying the given constraints:\n// * End with at least min_shapes left in shape_table,\n// * No shape shall have more than max_shape_unichars in it,\n// * Don't merge shapes where the distance between them exceeds max_dist.\nconst float kInfiniteDist = 999.0f;\nvoid MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,\n                                  float max_dist, ShapeTable *shapes) {\n  int num_shapes = shapes->NumShapes();\n  int max_merges = num_shapes - min_shapes;\n  // TODO: avoid new / delete.\n  auto *shape_dists = new std::vector<ShapeDist>[num_shapes];\n  float min_dist = kInfiniteDist;\n  int min_s1 = 0;\n  int min_s2 = 0;\n  tprintf(\"Computing shape distances...\");\n  for (int s1 = 0; s1 < num_shapes; ++s1) {\n    for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {\n      ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));\n      shape_dists[s1].push_back(dist);\n      if (dist.distance < min_dist) {\n        min_dist = dist.distance;\n        min_s1 = s1;\n        min_s2 = s2;\n      }\n    }\n    tprintf(\" %d\", s1);\n  }\n  tprintf(\"\\n\");\n  int num_merged = 0;\n  while (num_merged < max_merges && min_dist < max_dist) {\n  
  tprintf(\"Distance = %f: \", min_dist);\n    int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);\n    shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;\n    if (num_unichars > max_shape_unichars) {\n      tprintf(\"Merge of %d and %d with %d would exceed max of %d unichars\\n\",\n              min_s1, min_s2, num_unichars, max_shape_unichars);\n    } else {\n      shapes->MergeShapes(min_s1, min_s2);\n      shape_dists[min_s2].clear();\n      ++num_merged;\n\n      for (int s = 0; s < min_s1; ++s) {\n        if (!shape_dists[s].empty()) {\n          shape_dists[s][min_s1 - s - 1].distance =\n              ShapeDistance(*shapes, s, min_s1);\n          shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;\n        }\n      }\n      for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {\n        if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist) {\n          shape_dists[min_s1][s2 - min_s1 - 1].distance =\n              ShapeDistance(*shapes, min_s1, s2);\n        }\n      }\n      for (int s = min_s1 + 1; s < min_s2; ++s) {\n        if (!shape_dists[s].empty()) {\n          shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;\n        }\n      }\n    }\n    min_dist = kInfiniteDist;\n    for (int s1 = 0; s1 < num_shapes; ++s1) {\n      for (unsigned i = 0; i < shape_dists[s1].size(); ++i) {\n        if (shape_dists[s1][i].distance < min_dist) {\n          min_dist = shape_dists[s1][i].distance;\n          min_s1 = s1;\n          min_s2 = s1 + 1 + i;\n        }\n      }\n    }\n  }\n  tprintf(\"Stopped with %d merged, min dist %f\\n\", num_merged, min_dist);\n  delete[] shape_dists;\n  if (debug_level_ > 1) {\n    for (int s1 = 0; s1 < num_shapes; ++s1) {\n      if (shapes->MasterDestinationIndex(s1) == s1) {\n        tprintf(\"Master shape:%s\\n\", shapes->DebugStr(s1).c_str());\n      }\n    }\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/common/mastertrainer.h",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        mastertrainer.h\n// Description: Trainer to build the MasterClassifier.\n// Author:      Ray Smith\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TRAINING_MASTERTRAINER_H_\n#define TESSERACT_TRAINING_MASTERTRAINER_H_\n\n#include \"export.h\"\n\n#include \"classify.h\"\n#include \"cluster.h\"\n#include \"elst.h\"\n#include \"errorcounter.h\"\n#include \"featdefs.h\"\n#include \"fontinfo.h\"\n#include \"indexmapbidi.h\"\n#include \"intfeaturemap.h\"\n#include \"intfeaturespace.h\"\n#include \"intfx.h\"\n#include \"intmatcher.h\"\n#include \"params.h\"\n#include \"shapetable.h\"\n#include \"trainingsample.h\"\n#include \"trainingsampleset.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\nclass ShapeClassifier;\n\n// Simple struct to hold the distance between two shapes during clustering.\nstruct ShapeDist {\n  ShapeDist() : shape1(0), shape2(0), distance(0.0f) {}\n  ShapeDist(int s1, int s2, float dist) : shape1(s1), shape2(s2), distance(dist) {}\n\n  // Sort operator to sort in ascending order of distance.\n  bool operator<(const ShapeDist &other) const {\n    return distance < other.distance;\n  }\n\n  int shape1;\n  int 
shape2;\n  float distance;\n};\n\n// Class to encapsulate training processes that use the TrainingSampleSet.\n// Initially supports shape clustering and mftrainining.\n// Other important features of the MasterTrainer are conditioning the data\n// by outlier elimination, replication with perturbation, and serialization.\nclass TESS_COMMON_TRAINING_API MasterTrainer {\npublic:\n  MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples,\n                int debug_level);\n  ~MasterTrainer();\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n\n  // Loads an initial unicharset, or sets one up if the file cannot be read.\n  void LoadUnicharset(const char *filename);\n\n  // Sets the feature space definition.\n  void SetFeatureSpace(const IntFeatureSpace &fs) {\n    feature_space_ = fs;\n    feature_map_.Init(fs);\n  }\n\n  // Reads the samples and their features from the given file,\n  // adding them to the trainer with the font_id from the content of the file.\n  // If verification, then these are verification samples, not training.\n  void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs,\n                           bool verification);\n\n  // Adds the given single sample to the trainer, setting the classid\n  // appropriately from the given unichar_str.\n  void AddSample(bool verification, const char *unichar_str, TrainingSample *sample);\n\n  // Loads all pages from the given tif filename and append to page_images_.\n  // Must be called after ReadTrainingSamples, as the current number of images\n  // is used as an offset for page numbers in the samples.\n  void LoadPageImages(const char *filename);\n\n  // Cleans up the samples after initial load from the tr files, and prior to\n  // saving the MasterTrainer:\n  // Remaps fragmented chars if running shape analysis.\n  // Sets up the samples appropriately for class/fontwise access.\n  // Deletes outlier 
samples.\n  void PostLoadCleanup();\n\n  // Gets the samples ready for training. Use after both\n  // ReadTrainingSamples+PostLoadCleanup or DeSerialize.\n  // Re-indexes the features and computes canonical and cloud features.\n  void PreTrainingSetup();\n\n  // Sets up the master_shapes_ table, which tells which fonts should stay\n  // together until they get to a leaf node classifier.\n  void SetupMasterShapes();\n\n  // Adds the junk_samples_ to the main samples_ set. Junk samples are initially\n  // fragments and n-grams (all incorrectly segmented characters).\n  // Various training functions may result in incorrectly segmented characters\n  // being added to the unicharset of the main samples, perhaps because they\n  // form a \"radical\" decomposition of some (Indic) grapheme, or because they\n  // just look the same as a real character (like rn/m)\n  // This function moves all the junk samples, to the main samples_ set, but\n  // desirable junk, being any sample for which the unichar already exists in\n  // the samples_ unicharset gets the unichar-ids re-indexed to match, but\n  // anything else gets re-marked as unichar_id 0 (space character) to identify\n  // it as junk to the error counter.\n  void IncludeJunk();\n\n  // Replicates the samples and perturbs them if the enable_replication_ flag\n  // is set. 
MUST be used after the last call to OrganizeByFontAndClass on\n  // the training samples, ie after IncludeJunk if it is going to be used, as\n  // OrganizeByFontAndClass will eat the replicated samples into the regular\n  // samples.\n  void ReplicateAndRandomizeSamplesIfRequired();\n\n  // Loads the basic font properties file into fontinfo_table_.\n  // Returns false on failure.\n  bool LoadFontInfo(const char *filename);\n\n  // Loads the xheight font properties file into xheights_.\n  // Returns false on failure.\n  bool LoadXHeights(const char *filename);\n\n  // Reads spacing stats from filename and adds them to fontinfo_table.\n  // Returns false on failure.\n  bool AddSpacingInfo(const char *filename);\n\n  // Returns the font id corresponding to the given font name.\n  // Returns -1 if the font cannot be found.\n  int GetFontInfoId(const char *font_name);\n  // Returns the font_id of the closest matching font name to the given\n  // filename. It is assumed that a substring of the filename will match\n  // one of the fonts. 
If more than one is matched, the longest is returned.\n  int GetBestMatchingFontInfoId(const char *filename);\n\n  // Returns the filename of the tr file corresponding to the command-line\n  // argument with the given index.\n  const std::string &GetTRFileName(int index) const {\n    return tr_filenames_[index];\n  }\n\n  // Sets up a flat shapetable with one shape per class/font combination.\n  void SetupFlatShapeTable(ShapeTable *shape_table);\n\n  // Sets up a Clusterer for mftraining on a single shape_id.\n  // Call FreeClusterer on the return value after use.\n  CLUSTERER *SetupForClustering(const ShapeTable &shape_table,\n                                const FEATURE_DEFS_STRUCT &feature_defs, int shape_id,\n                                int *num_samples);\n\n  // Writes the given float_classes (produced by SetupForFloat2Int) as inttemp\n  // to the given inttemp_file, and the corresponding pffmtable.\n  // The unicharset is the original encoding of graphemes, and shape_set should\n  // match the size of the shape_table, and may possibly be totally fake.\n  void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set,\n                                const ShapeTable &shape_table, CLASS_STRUCT *float_classes,\n                                const char *inttemp_file, const char *pffmtable_file);\n\n  const UNICHARSET &unicharset() const {\n    return samples_.unicharset();\n  }\n  TrainingSampleSet *GetSamples() {\n    return &samples_;\n  }\n  const ShapeTable &master_shapes() const {\n    return master_shapes_;\n  }\n\n  // Generates debug output relating to the canonical distance between the\n  // two given UTF8 grapheme strings.\n  void DebugCanonical(const char *unichar_str1, const char *unichar_str2);\n#ifndef GRAPHICS_DISABLED\n  // Debugging for cloud/canonical features.\n  // Displays a Features window containing:\n  // If unichar_str2 is in the unicharset, and canonical_font is non-negative,\n  // displays the canonical 
features of the char/font combination in red.\n  // If unichar_str1 is in the unicharset, and cloud_font is non-negative,\n  // displays the cloud feature of the char/font combination in green.\n  // The canonical features are drawn first to show which ones have no\n  // matches in the cloud features.\n  // Until the features window is destroyed, each click in the features window\n  // will display the samples that have that feature in a separate window.\n  void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2,\n                      int canonical_font);\n#endif // !GRAPHICS_DISABLED\n\n  void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier,\n                          ShapeClassifier *old_classifier);\n\n  // Tests the given test_classifier on the internal samples.\n  // See TestClassifier for details.\n  void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples,\n                               ShapeClassifier *test_classifier, std::string *report_string);\n  // Tests the given test_classifier on the given samples\n  // error_mode indicates what counts as an error.\n  // report_levels:\n  // 0 = no output.\n  // 1 = bottom-line error rate.\n  // 2 = bottom-line error rate + time.\n  // 3 = font-level error rate + time.\n  // 4 = list of all errors + short classifier debug output on 16 errors.\n  // 5 = list of all errors + short classifier debug output on 25 errors.\n  // If replicate_samples is true, then the test is run on an extended test\n  // sample including replicated and systematically perturbed samples.\n  // If report_string is non-nullptr, a summary of the results for each font\n  // is appended to the report_string.\n  double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples,\n                        TrainingSampleSet *samples, ShapeClassifier *test_classifier,\n                        std::string *report_string);\n\n  // Returns 
the average (in some sense) distance between the two given\n  // shapes, which may contain multiple fonts and/or unichars.\n  // This function is public to facilitate testing.\n  float ShapeDistance(const ShapeTable &shapes, int s1, int s2);\n\nprivate:\n  // Replaces samples that are always fragmented with the corresponding\n  // fragment samples.\n  void ReplaceFragmentedSamples();\n\n  // Runs a hierarchical agglomerative clustering to merge shapes in the given\n  // shape_table, while satisfying the given constraints:\n  // * End with at least min_shapes left in shape_table,\n  // * No shape shall have more than max_shape_unichars in it,\n  // * Don't merge shapes where the distance between them exceeds max_dist.\n  void ClusterShapes(int min_shapes, int max_shape_unichars, float max_dist,\n                     ShapeTable *shape_table);\n\nprivate:\n  NormalizationMode norm_mode_;\n  // Character set we are training for.\n  UNICHARSET unicharset_;\n  // Original feature space. Subspace mapping is contained in feature_map_.\n  IntFeatureSpace feature_space_;\n  TrainingSampleSet samples_;\n  TrainingSampleSet junk_samples_;\n  TrainingSampleSet verify_samples_;\n  // Master shape table defines what fonts stay together until the leaves.\n  ShapeTable master_shapes_;\n  // Flat shape table has each unichar/font id pair in a separate shape.\n  ShapeTable flat_shapes_;\n  // Font metrics gathered from multiple files.\n  FontInfoTable fontinfo_table_;\n  // Array of xheights indexed by font ids in fontinfo_table_;\n  std::vector<int32_t> xheights_;\n\n  // Non-serialized data initialized by other means or used temporarily\n  // during loading of training samples.\n  // Number of different class labels in unicharset_.\n  int charsetsize_;\n  // Flag to indicate that we are running shape analysis and need fragments\n  // fixing.\n  bool enable_shape_analysis_;\n  // Flag to indicate that sample replication is required.\n  bool enable_replication_;\n  // Array of 
classids of fragments that replace the correctly segmented chars.\n  int *fragments_;\n  // Classid of previous correctly segmented sample that was added.\n  int prev_unichar_id_;\n  // Debug output control.\n  int debug_level_;\n  // Feature map used to construct reduced feature spaces for compact\n  // classifiers.\n  IntFeatureMap feature_map_;\n  // Vector of Pix pointers used for classifiers that need the image.\n  // Indexed by page_num_ in the samples.\n  // These images are owned by the trainer and need to be pixDestroyed.\n  std::vector<Image > page_images_;\n  // Vector of filenames of loaded tr files.\n  std::vector<std::string> tr_filenames_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TRAINING_MASTERTRAINER_H_\n"
  },
  {
    "path": "src/training/common/networkbuilder.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        networkbuilder.cpp\n// Description: Class to parse the network description language and\n//              build a corresponding network.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"networkbuilder.h\"\n\n#include \"convolve.h\"\n#include \"fullyconnected.h\"\n#include \"input.h\"\n#include \"lstm.h\"\n#include \"maxpool.h\"\n#include \"network.h\"\n#include \"parallel.h\"\n#include \"reconfig.h\"\n#include \"reversed.h\"\n#include \"series.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\n// Builds a network with a network_spec in the network description\n// language, to recognize a character set of num_outputs size.\n// If append_index is non-negative, then *network must be non-null and the\n// given network_spec will be appended to *network AFTER append_index, with\n// the top of the input *network discarded.\n// Note that network_spec is call by value to allow a non-const char* pointer\n// into the string for BuildFromString.\n// net_flags control network behavior according to the NetworkFlags enum.\n// The resulting network is returned via **network.\n// Returns false if something failed.\nbool NetworkBuilder::InitNetwork(int num_outputs, const char *network_spec, int append_index,\n              
                   int net_flags, float weight_range, TRand *randomizer,\n                                 Network **network) {\n  NetworkBuilder builder(num_outputs);\n  Series *bottom_series = nullptr;\n  StaticShape input_shape;\n  if (append_index >= 0) {\n    // Split the current network after the given append_index.\n    ASSERT_HOST(*network != nullptr && (*network)->type() == NT_SERIES);\n    auto *series = static_cast<Series *>(*network);\n    Series *top_series = nullptr;\n    series->SplitAt(append_index, &bottom_series, &top_series);\n    if (bottom_series == nullptr || top_series == nullptr) {\n      tprintf(\"Yikes! Splitting current network failed!!\\n\");\n      return false;\n    }\n    input_shape = bottom_series->OutputShape(input_shape);\n    delete top_series;\n  }\n  *network = builder.BuildFromString(input_shape, &network_spec);\n  if (*network == nullptr) {\n    return false;\n  }\n  (*network)->SetNetworkFlags(net_flags);\n  (*network)->InitWeights(weight_range, randomizer);\n  (*network)->SetupNeedsBackprop(false);\n  if (bottom_series != nullptr) {\n    bottom_series->AppendSeries(*network);\n    *network = bottom_series;\n  }\n  (*network)->CacheXScaleFactor((*network)->XScaleFactor());\n  return true;\n}\n\n// Helper skips whitespace.\nstatic void SkipWhitespace(const char **str) {\n  while (**str == ' ' || **str == '\\t' || **str == '\\n') {\n    ++*str;\n  }\n}\n\n// Parses the given string and returns a network according to the network\n// description language in networkbuilder.h\nNetwork *NetworkBuilder::BuildFromString(const StaticShape &input_shape, const char **str) {\n  SkipWhitespace(str);\n  char code_ch = **str;\n  if (code_ch == '[') {\n    return ParseSeries(input_shape, nullptr, str);\n  }\n  if (input_shape.depth() == 0) {\n    // There must be an input at this point.\n    return ParseInput(str);\n  }\n  switch (code_ch) {\n    case '(':\n      return ParseParallel(input_shape, str);\n    case 'R':\n      return 
ParseR(input_shape, str);\n    case 'S':\n      return ParseS(input_shape, str);\n    case 'C':\n      return ParseC(input_shape, str);\n    case 'M':\n      return ParseM(input_shape, str);\n    case 'L':\n      return ParseLSTM(input_shape, str);\n    case 'F':\n      return ParseFullyConnected(input_shape, str);\n    case 'O':\n      return ParseOutput(input_shape, str);\n    default:\n      tprintf(\"Invalid network spec:%s\\n\", *str);\n  }\n  return nullptr;\n}\n\n// Parses an input specification and returns the result, which may include a\n// series.\nNetwork *NetworkBuilder::ParseInput(const char **str) {\n  // There must be an input at this point.\n  int length = 0;\n  int batch, height, width, depth;\n  int num_converted = sscanf(*str, \"%d,%d,%d,%d%n\", &batch, &height, &width, &depth, &length);\n  StaticShape shape;\n  shape.SetShape(batch, height, width, depth);\n  // num_converted may or may not include the length.\n  if (num_converted != 4 && num_converted != 5) {\n    tprintf(\"Must specify an input layer as the first layer, not %s!!\\n\", *str);\n    return nullptr;\n  }\n  *str += length;\n  auto *input = new Input(\"Input\", shape);\n  // We want to allow [<input>rest of net... or <input>[rest of net... 
so we\n  // have to check explicitly for '[' here.\n  SkipWhitespace(str);\n  if (**str == '[') {\n    return ParseSeries(shape, input, str);\n  }\n  return input;\n}\n\n// Parses a sequential series of networks, defined by [<net><net>...].\nNetwork *NetworkBuilder::ParseSeries(const StaticShape &input_shape, Input *input_layer,\n                                     const char **str) {\n  StaticShape shape = input_shape;\n  auto *series = new Series(\"Series\");\n  ++*str;\n  if (input_layer != nullptr) {\n    series->AddToStack(input_layer);\n    shape = input_layer->OutputShape(shape);\n  }\n  Network *network = nullptr;\n  while (**str != '\\0' && **str != ']' && (network = BuildFromString(shape, str)) != nullptr) {\n    shape = network->OutputShape(shape);\n    series->AddToStack(network);\n  }\n  if (**str != ']') {\n    tprintf(\"Missing ] at end of [Series]!\\n\");\n    delete series;\n    return nullptr;\n  }\n  ++*str;\n  return series;\n}\n\n// Parses a parallel set of networks, defined by (<net><net>...).\nNetwork *NetworkBuilder::ParseParallel(const StaticShape &input_shape, const char **str) {\n  auto *parallel = new Parallel(\"Parallel\", NT_PARALLEL);\n  ++*str;\n  Network *network = nullptr;\n  while (**str != '\\0' && **str != ')' &&\n         (network = BuildFromString(input_shape, str)) != nullptr) {\n    parallel->AddToStack(network);\n  }\n  if (**str != ')') {\n    tprintf(\"Missing ) at end of (Parallel)!\\n\");\n    delete parallel;\n    return nullptr;\n  }\n  ++*str;\n  return parallel;\n}\n\n// Parses a network that begins with 'R'.\nNetwork *NetworkBuilder::ParseR(const StaticShape &input_shape, const char **str) {\n  char dir = (*str)[1];\n  if (dir == 'x' || dir == 'y') {\n    std::string name = \"Reverse\";\n    name += dir;\n    *str += 2;\n    Network *network = BuildFromString(input_shape, str);\n    if (network == nullptr) {\n      return nullptr;\n    }\n    auto *rev = new Reversed(name, dir == 'y' ? 
NT_YREVERSED : NT_XREVERSED);\n    rev->SetNetwork(network);\n    return rev;\n  }\n  char *end;\n  int replicas = strtol(*str + 1, &end, 10);\n  *str = end;\n  if (replicas <= 0) {\n    tprintf(\"Invalid R spec!:%s\\n\", end);\n    return nullptr;\n  }\n  auto *parallel = new Parallel(\"Replicated\", NT_REPLICATED);\n  const char *str_copy = *str;\n  for (int i = 0; i < replicas; ++i) {\n    str_copy = *str;\n    Network *network = BuildFromString(input_shape, &str_copy);\n    if (network == nullptr) {\n      tprintf(\"Invalid replicated network!\\n\");\n      delete parallel;\n      return nullptr;\n    }\n    parallel->AddToStack(network);\n  }\n  *str = str_copy;\n  return parallel;\n}\n\n// Parses a network that begins with 'S'.\nNetwork *NetworkBuilder::ParseS(const StaticShape &input_shape, const char **str) {\n  char *end;\n  int y = strtol(*str + 1, &end, 10);\n  *str = end;\n  if (**str == ',') {\n    int x = strtol(*str + 1, &end, 10);\n    *str = end;\n    if (y <= 0 || x <= 0) {\n      tprintf(\"Invalid S spec!:%s\\n\", *str);\n      return nullptr;\n    }\n    return new Reconfig(\"Reconfig\", input_shape.depth(), x, y);\n  } else if (**str == '(') {\n    // TODO(rays) Add Generic reshape.\n    tprintf(\"Generic reshape not yet implemented!!\\n\");\n    return nullptr;\n  }\n  tprintf(\"Invalid S spec!:%s\\n\", *str);\n  return nullptr;\n}\n\n// Helper returns the fully-connected type for the character code.\nstatic NetworkType NonLinearity(char func) {\n  switch (func) {\n    case 's':\n      return NT_LOGISTIC;\n    case 't':\n      return NT_TANH;\n    case 'r':\n      return NT_RELU;\n    case 'l':\n      return NT_LINEAR;\n    case 'm':\n      return NT_SOFTMAX;\n    case 'p':\n      return NT_POSCLIP;\n    case 'n':\n      return NT_SYMCLIP;\n    default:\n      return NT_NONE;\n  }\n}\n\n// Parses a network that begins with 'C'.\nNetwork *NetworkBuilder::ParseC(const StaticShape &input_shape, const char **str) {\n  NetworkType type = 
NonLinearity((*str)[1]);\n  if (type == NT_NONE) {\n    tprintf(\"Invalid nonlinearity on C-spec!: %s\\n\", *str);\n    return nullptr;\n  }\n  int y = 0, x = 0, d = 0;\n  char *end;\n  if ((y = strtol(*str + 2, &end, 10)) <= 0 || *end != ',' ||\n      (x = strtol(end + 1, &end, 10)) <= 0 || *end != ',' || (d = strtol(end + 1, &end, 10)) <= 0) {\n    tprintf(\"Invalid C spec!:%s\\n\", end);\n    return nullptr;\n  }\n  *str = end;\n  if (x == 1 && y == 1) {\n    // No actual convolution. Just a FullyConnected on the current depth, to\n    // be slid over all batch,y,x.\n    return new FullyConnected(\"Conv1x1\", input_shape.depth(), d, type);\n  }\n  auto *series = new Series(\"ConvSeries\");\n  auto *convolve = new Convolve(\"Convolve\", input_shape.depth(), x / 2, y / 2);\n  series->AddToStack(convolve);\n  StaticShape fc_input = convolve->OutputShape(input_shape);\n  series->AddToStack(new FullyConnected(\"ConvNL\", fc_input.depth(), d, type));\n  return series;\n}\n\n// Parses a network that begins with 'M'.\nNetwork *NetworkBuilder::ParseM(const StaticShape &input_shape, const char **str) {\n  int y = 0, x = 0;\n  char *end;\n  if ((*str)[1] != 'p' || (y = strtol(*str + 2, &end, 10)) <= 0 || *end != ',' ||\n      (x = strtol(end + 1, &end, 10)) <= 0) {\n    tprintf(\"Invalid Mp spec!:%s\\n\", *str);\n    return nullptr;\n  }\n  *str = end;\n  return new Maxpool(\"Maxpool\", input_shape.depth(), x, y);\n}\n\n// Parses an LSTM network, either individual, bi- or quad-directional.\nNetwork *NetworkBuilder::ParseLSTM(const StaticShape &input_shape, const char **str) {\n  bool two_d = false;\n  NetworkType type = NT_LSTM;\n  const char *spec_start = *str;\n  int chars_consumed = 1;\n  int num_outputs = 0;\n  char key = (*str)[chars_consumed], dir = 'f', dim = 'x';\n  if (key == 'S') {\n    type = NT_LSTM_SOFTMAX;\n    num_outputs = num_softmax_outputs_;\n    ++chars_consumed;\n  } else if (key == 'E') {\n    type = NT_LSTM_SOFTMAX_ENCODED;\n    num_outputs = 
num_softmax_outputs_;\n    ++chars_consumed;\n  } else if (key == '2' &&\n             (((*str)[2] == 'x' && (*str)[3] == 'y') || ((*str)[2] == 'y' && (*str)[3] == 'x'))) {\n    chars_consumed = 4;\n    dim = (*str)[3];\n    two_d = true;\n  } else if (key == 'f' || key == 'r' || key == 'b') {\n    dir = key;\n    dim = (*str)[2];\n    if (dim != 'x' && dim != 'y') {\n      tprintf(\"Invalid dimension (x|y) in L Spec!:%s\\n\", *str);\n      return nullptr;\n    }\n    chars_consumed = 3;\n    if ((*str)[chars_consumed] == 's') {\n      ++chars_consumed;\n      type = NT_LSTM_SUMMARY;\n    }\n  } else {\n    tprintf(\"Invalid direction (f|r|b) in L Spec!:%s\\n\", *str);\n    return nullptr;\n  }\n  char *end;\n  int num_states = strtol(*str + chars_consumed, &end, 10);\n  if (num_states <= 0) {\n    tprintf(\"Invalid number of states in L Spec!:%s\\n\", *str);\n    return nullptr;\n  }\n  *str = end;\n  Network *lstm = nullptr;\n  if (two_d) {\n    lstm = BuildLSTMXYQuad(input_shape.depth(), num_states);\n  } else {\n    if (num_outputs == 0) {\n      num_outputs = num_states;\n    }\n    std::string name(spec_start, *str - spec_start);\n    lstm = new LSTM(name, input_shape.depth(), num_states, num_outputs, false, type);\n    if (dir != 'f') {\n      auto *rev = new Reversed(\"RevLSTM\", NT_XREVERSED);\n      rev->SetNetwork(lstm);\n      lstm = rev;\n    }\n    if (dir == 'b') {\n      name += \"LTR\";\n      auto *parallel = new Parallel(\"BidiLSTM\", NT_PAR_RL_LSTM);\n      parallel->AddToStack(\n          new LSTM(name, input_shape.depth(), num_states, num_outputs, false, type));\n      parallel->AddToStack(lstm);\n      lstm = parallel;\n    }\n  }\n  if (dim == 'y') {\n    auto *rev = new Reversed(\"XYTransLSTM\", NT_XYTRANSPOSE);\n    rev->SetNetwork(lstm);\n    lstm = rev;\n  }\n  return lstm;\n}\n\n// Builds a set of 4 lstms with x and y reversal, running in true parallel.\nNetwork *NetworkBuilder::BuildLSTMXYQuad(int num_inputs, int num_states) {\n  auto 
*parallel = new Parallel(\"2DLSTMQuad\", NT_PAR_2D_LSTM);\n  parallel->AddToStack(new LSTM(\"L2DLTRDown\", num_inputs, num_states, num_states, true, NT_LSTM));\n  auto *rev = new Reversed(\"L2DLTRXRev\", NT_XREVERSED);\n  rev->SetNetwork(new LSTM(\"L2DRTLDown\", num_inputs, num_states, num_states, true, NT_LSTM));\n  parallel->AddToStack(rev);\n  rev = new Reversed(\"L2DRTLYRev\", NT_YREVERSED);\n  rev->SetNetwork(new LSTM(\"L2DRTLUp\", num_inputs, num_states, num_states, true, NT_LSTM));\n  auto *rev2 = new Reversed(\"L2DXRevU\", NT_XREVERSED);\n  rev2->SetNetwork(rev);\n  parallel->AddToStack(rev2);\n  rev = new Reversed(\"L2DXRevY\", NT_YREVERSED);\n  rev->SetNetwork(new LSTM(\"L2DLTRDown\", num_inputs, num_states, num_states, true, NT_LSTM));\n  parallel->AddToStack(rev);\n  return parallel;\n}\n\n// Helper builds a truly (0-d) fully connected layer of the given type.\nstatic Network *BuildFullyConnected(const StaticShape &input_shape, NetworkType type,\n                                    const std::string &name, int depth) {\n  if (input_shape.height() == 0 || input_shape.width() == 0) {\n    tprintf(\"Fully connected requires positive height and width, had %d,%d\\n\", input_shape.height(),\n            input_shape.width());\n    return nullptr;\n  }\n  int input_size = input_shape.height() * input_shape.width();\n  int input_depth = input_size * input_shape.depth();\n  Network *fc = new FullyConnected(name, input_depth, depth, type);\n  if (input_size > 1) {\n    auto *series = new Series(\"FCSeries\");\n    series->AddToStack(\n        new Reconfig(\"FCReconfig\", input_shape.depth(), input_shape.width(), input_shape.height()));\n    series->AddToStack(fc);\n    fc = series;\n  }\n  return fc;\n}\n\n// Parses a Fully connected network.\nNetwork *NetworkBuilder::ParseFullyConnected(const StaticShape &input_shape, const char **str) {\n  const char *spec_start = *str;\n  NetworkType type = NonLinearity((*str)[1]);\n  if (type == NT_NONE) {\n    
tprintf(\"Invalid nonlinearity on F-spec!: %s\\n\", *str);\n    return nullptr;\n  }\n  char *end;\n  int depth = strtol(*str + 2, &end, 10);\n  if (depth <= 0) {\n    tprintf(\"Invalid F spec!:%s\\n\", *str);\n    return nullptr;\n  }\n  *str = end;\n  std::string name(spec_start, *str - spec_start);\n  return BuildFullyConnected(input_shape, type, name, depth);\n}\n\n// Parses an Output spec.\nNetwork *NetworkBuilder::ParseOutput(const StaticShape &input_shape, const char **str) {\n  char dims_ch = (*str)[1];\n  if (dims_ch != '0' && dims_ch != '1' && dims_ch != '2') {\n    tprintf(\"Invalid dims (2|1|0) in output spec!:%s\\n\", *str);\n    return nullptr;\n  }\n  char type_ch = (*str)[2];\n  if (type_ch != 'l' && type_ch != 's' && type_ch != 'c') {\n    tprintf(\"Invalid output type (l|s|c) in output spec!:%s\\n\", *str);\n    return nullptr;\n  }\n  char *end;\n  int depth = strtol(*str + 3, &end, 10);\n  if (depth != num_softmax_outputs_) {\n    tprintf(\"Warning: given outputs %d not equal to unicharset of %d.\\n\", depth,\n            num_softmax_outputs_);\n    depth = num_softmax_outputs_;\n  }\n  *str = end;\n  NetworkType type = NT_SOFTMAX;\n  if (type_ch == 'l') {\n    type = NT_LOGISTIC;\n  } else if (type_ch == 's') {\n    type = NT_SOFTMAX_NO_CTC;\n  }\n  if (dims_ch == '0') {\n    // Same as standard fully connected.\n    return BuildFullyConnected(input_shape, type, \"Output\", depth);\n  } else if (dims_ch == '2') {\n    // We don't care if x and/or y are variable.\n    return new FullyConnected(\"Output2d\", input_shape.depth(), depth, type);\n  }\n  // For 1-d y has to be fixed, and if not 1, moved to depth.\n  if (input_shape.height() == 0) {\n    tprintf(\"Fully connected requires fixed height!\\n\");\n    return nullptr;\n  }\n  int input_size = input_shape.height();\n  int input_depth = input_size * input_shape.depth();\n  Network *fc = new FullyConnected(\"Output\", input_depth, depth, type);\n  if (input_size > 1) {\n    auto *series = new 
Series(\"FCSeries\");\n    series->AddToStack(new Reconfig(\"FCReconfig\", input_shape.depth(), 1, input_shape.height()));\n    series->AddToStack(fc);\n    fc = series;\n  }\n  return fc;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/common/networkbuilder.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        networkbuilder.h\n// Description: Class to parse the network description language and\n//              build a corresponding network.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_NETWORKBUILDER_H_\n#define TESSERACT_LSTM_NETWORKBUILDER_H_\n\n#include \"export.h\"\n#include \"static_shape.h\"\n#include \"stridemap.h\"\n\nclass UNICHARSET;\n\nnamespace tesseract {\n\nclass Input;\nclass Network;\nclass Parallel;\nclass TRand;\n\nclass TESS_COMMON_TRAINING_API NetworkBuilder {\npublic:\n  explicit NetworkBuilder(int num_softmax_outputs) : num_softmax_outputs_(num_softmax_outputs) {}\n\n  // Builds a network with a network_spec in the network description\n  // language, to recognize a character set of num_outputs size.\n  // If append_index is non-negative, then *network must be non-null and the\n  // given network_spec will be appended to *network AFTER append_index, with\n  // the top of the input *network discarded.\n  // Note that network_spec is call by value to allow a non-const char* pointer\n  // into the string for BuildFromString.\n  // net_flags control network behavior according to the NetworkFlags enum.\n  // The resulting network is returned via **network.\n  // Returns false if 
something failed.\n  static bool InitNetwork(int num_outputs, const char *network_spec, int append_index,\n                          int net_flags, float weight_range, TRand *randomizer, Network **network);\n\n  // Parses the given string and returns a network according to the following\n  // language:\n  //  ============ Syntax of description below: ============\n  // <d> represents a number.\n  // <net> represents any single network element, including (recursively) a\n  //   [...] series or (...) parallel construct.\n  // (s|t|r|l|m) (regex notation) represents a single required letter.\n  // NOTE THAT THROUGHOUT, x and y are REVERSED from conventional mathematics,\n  // to use the same convention as Tensor Flow. The reason TF adopts this\n  // convention is to eliminate the need to transpose images on input, since\n  // adjacent memory locations in images increase x and then y, while adjacent\n  // memory locations in tensors in TF, and NetworkIO in tesseract increase the\n  // rightmost index first, then the next-left and so-on, like C arrays.\n  // ============ INPUTS ============\n  // <b>,<h>,<w>,<d> A batch of b images with height h, width w, and depth d.\n  //   b, h and/or w may be zero, to indicate variable size. Some network layer\n  //   (summarizing LSTM) must be used to make a variable h known.\n  //   d may be 1 for greyscale, 3 for color.\n  // NOTE that throughout the constructed network, the inputs/outputs are all of\n  // the same [batch,height,width,depth] dimensions, even if a different size.\n  // ============ PLUMBING ============\n  // [...] Execute ... networks in series (layers).\n  // (...) Execute ... 
networks in parallel, with their output depths added.\n  // R<d><net> Execute d replicas of net in parallel, with their output depths\n  //   added.\n  // Rx<net> Execute <net> with x-dimension reversal.\n  // Ry<net> Execute <net> with y-dimension reversal.\n  // S<y>,<x> Rescale 2-D input by shrink factor x,y, rearranging the data by\n  //   increasing the depth of the input by factor xy.\n  // Mp<y>,<x> Maxpool the input, reducing the size by an (x,y) rectangle.\n  // ============ FUNCTIONAL UNITS ============\n  // C(s|t|r|l|m)<y>,<x>,<d> Convolves using a (x,y) window, with no shrinkage,\n  //   random infill, producing d outputs, then applies a non-linearity:\n  //   s: Sigmoid, t: Tanh, r: Relu, l: Linear, m: Softmax.\n  // F(s|t|r|l|m)<d> Truly fully-connected with s|t|r|l|m non-linearity and d\n  //   outputs. Connects to every x,y,depth position of the input, reducing\n  //   height, width to 1, producing a single <d> vector as the output.\n  //   Input height and width must be constant.\n  //   For a sliding-window linear or non-linear map that connects just to the\n  //   input depth, and leaves the input image size as-is, use a 1x1 convolution\n  //   eg. 
Cr1,1,64 instead of Fr64.\n  // L(f|r|b)(x|y)[s]<n> LSTM cell with n states/outputs.\n  //   The LSTM must have one of:\n  //    f runs the LSTM forward only.\n  //    r runs the LSTM reversed only.\n  //    b runs the LSTM bidirectionally.\n  //   It will operate on either the x- or y-dimension, treating the other\n  //     dimension independently (as if part of the batch).\n  //   s (optional) summarizes the output in the requested dimension,\n  //     outputting only the final step, collapsing the dimension to a\n  //     single element.\n  // LS<n> Forward-only LSTM cell in the x-direction, with built-in Softmax.\n  // LE<n> Forward-only LSTM cell in the x-direction, with built-in softmax,\n  //       with binary Encoding.\n  // L2xy<n> Full 2-d LSTM operating in quad-directions (bidi in x and y) and\n  //   all the output depths added.\n  // ============ OUTPUTS ============\n  // The network description must finish with an output specification:\n  // O(2|1|0)(l|s|c)<n> output layer with n classes\n  //  2 (heatmap) Output is a 2-d vector map of the input (possibly at\n  //    different scale).\n  //  1 (sequence) Output is a 1-d sequence of vector values.\n  //  0 (category) Output is a 0-d single vector value.\n  //  l uses a logistic non-linearity on the output, allowing multiple\n  //    hot elements in any output vector value.\n  //  s uses a softmax non-linearity, with one-hot output in each value.\n  //  c uses a softmax with CTC. Can only be used with s (sequence).\n  //  NOTE1: Only O1s and O1c are currently supported.\n  //  NOTE2: n is totally ignored, and for compatibility purposes only. 
The\n  //         output number of classes is obtained automatically from the\n  //         unicharset.\n  Network *BuildFromString(const StaticShape &input_shape, const char **str);\n\nprivate:\n  // Parses an input specification and returns the result, which may include a\n  // series.\n  Network *ParseInput(const char **str);\n  // Parses a sequential series of networks, defined by [<net><net>...].\n  Network *ParseSeries(const StaticShape &input_shape, Input *input_layer, const char **str);\n  // Parses a parallel set of networks, defined by (<net><net>...).\n  Network *ParseParallel(const StaticShape &input_shape, const char **str);\n  // Parses a network that begins with 'R'.\n  Network *ParseR(const StaticShape &input_shape, const char **str);\n  // Parses a network that begins with 'S'.\n  Network *ParseS(const StaticShape &input_shape, const char **str);\n  // Parses a network that begins with 'C'.\n  Network *ParseC(const StaticShape &input_shape, const char **str);\n  // Parses a network that begins with 'M'.\n  Network *ParseM(const StaticShape &input_shape, const char **str);\n  // Parses an LSTM network, either individual, bi- or quad-directional.\n  Network *ParseLSTM(const StaticShape &input_shape, const char **str);\n  // Builds a set of 4 lstms with x and y reversal, running in true parallel.\n  static Network *BuildLSTMXYQuad(int num_inputs, int num_states);\n  // Parses a Fully connected network.\n  Network *ParseFullyConnected(const StaticShape &input_shape, const char **str);\n  // Parses an Output spec.\n  Network *ParseOutput(const StaticShape &input_shape, const char **str);\n\nprivate:\n  int num_softmax_outputs_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_NETWORKBUILDER_H_\n"
  },
  {
    "path": "src/training/common/sampleiterator.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"sampleiterator.h\"\n\n#include \"intfeaturemap.h\"\n\n#include \"indexmapbidi.h\"\n#include \"shapetable.h\"\n#include \"trainingsample.h\"\n#include \"trainingsampleset.h\"\n\nnamespace tesseract {\n\n// ================== SampleIterator Implementation =================\n\nSampleIterator::SampleIterator()\n    : charset_map_(nullptr)\n    , shape_table_(nullptr)\n    , sample_set_(nullptr)\n    , randomize_(false)\n    , owned_shape_table_(nullptr) {\n  num_shapes_ = 0;\n  Begin();\n}\n\nSampleIterator::~SampleIterator() {\n  Clear();\n}\n\nvoid SampleIterator::Clear() {\n  delete owned_shape_table_;\n  owned_shape_table_ = nullptr;\n}\n\n// See class comment for arguments.\nvoid SampleIterator::Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table,\n                          bool randomize, TrainingSampleSet *sample_set) {\n  Clear();\n  charset_map_ = charset_map;\n  shape_table_ = shape_table;\n  sample_set_ = sample_set;\n  randomize_ = randomize;\n  if (shape_table_ == nullptr && charset_map_ != nullptr) {\n    // The caller wishes to iterate by class. 
The easiest way to do this\n    // is to create a dummy shape_table_ that we will own.\n    int num_fonts = sample_set_->NumFonts();\n    owned_shape_table_ = new ShapeTable(sample_set_->unicharset());\n    int charsetsize = sample_set_->unicharset().size();\n    for (int c = 0; c < charsetsize; ++c) {\n      // We always add a shape for each character to keep the index in sync\n      // with the unichar_id.\n      int shape_id = owned_shape_table_->AddShape(c, 0);\n      for (int f = 1; f < num_fonts; ++f) {\n        if (sample_set_->NumClassSamples(f, c, true) > 0) {\n          owned_shape_table_->AddToShape(shape_id, c, f);\n        }\n      }\n    }\n    shape_table_ = owned_shape_table_;\n  }\n  if (shape_table_ != nullptr) {\n    num_shapes_ = shape_table_->NumShapes();\n  } else {\n    num_shapes_ = randomize ? sample_set_->num_samples() : sample_set_->num_raw_samples();\n  }\n  Begin();\n}\n\n// Iterator functions designed for use with a simple for loop:\n// for (it.Begin(); !it.AtEnd(); it.Next()) {\n//   const TrainingSample& sample = it.GetSample();\n// }\nvoid SampleIterator::Begin() {\n  shape_index_ = -1;\n  shape_char_index_ = 0;\n  num_shape_chars_ = 0;\n  shape_font_index_ = 0;\n  num_shape_fonts_ = 0;\n  sample_index_ = 0;\n  num_samples_ = 0;\n  // Find the first indexable sample.\n  Next();\n}\n\nbool SampleIterator::AtEnd() const {\n  return shape_index_ >= num_shapes_;\n}\n\nconst TrainingSample &SampleIterator::GetSample() const {\n  if (shape_table_ != nullptr) {\n    const UnicharAndFonts *shape_entry = GetShapeEntry();\n    int char_id = shape_entry->unichar_id;\n    int font_id = shape_entry->font_ids[shape_font_index_];\n    return *sample_set_->GetSample(font_id, char_id, sample_index_);\n  } else {\n    return *sample_set_->GetSample(shape_index_);\n  }\n}\n\nTrainingSample *SampleIterator::MutableSample() const {\n  if (shape_table_ != nullptr) {\n    const UnicharAndFonts *shape_entry = GetShapeEntry();\n    int char_id = 
shape_entry->unichar_id;\n    int font_id = shape_entry->font_ids[shape_font_index_];\n    return sample_set_->MutableSample(font_id, char_id, sample_index_);\n  } else {\n    return sample_set_->mutable_sample(shape_index_);\n  }\n}\n\n// Returns the total index (from the original set of samples) of the current\n// sample.\nint SampleIterator::GlobalSampleIndex() const {\n  if (shape_table_ != nullptr) {\n    const UnicharAndFonts *shape_entry = GetShapeEntry();\n    int char_id = shape_entry->unichar_id;\n    int font_id = shape_entry->font_ids[shape_font_index_];\n    return sample_set_->GlobalSampleIndex(font_id, char_id, sample_index_);\n  } else {\n    return shape_index_;\n  }\n}\n\n// Returns the index of the current sample in compact charset space, so\n// in a 2-class problem between x and y, the returned indices will all be\n// 0 or 1, and have nothing to do with the unichar_ids.\n// If the charset_map_ is nullptr, then this is equal to GetSparseClassID().\nint SampleIterator::GetCompactClassID() const {\n  return charset_map_ != nullptr ? charset_map_->SparseToCompact(shape_index_) : GetSparseClassID();\n}\n// Returns the index of the current sample in sparse charset space, so\n// in a 2-class problem between x and y, the returned indices will all be\n// x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids\n// with a shape_table_.\nint SampleIterator::GetSparseClassID() const {\n  return shape_table_ != nullptr ? shape_index_ : GetSample().class_id();\n}\n\n// Moves on to the next indexable sample. 
If the end is reached, leaves\n// the state such that AtEnd() is true.\nvoid SampleIterator::Next() {\n  if (shape_table_ != nullptr) {\n    // Next sample in this class/font combination.\n    ++sample_index_;\n    if (sample_index_ < num_samples_) {\n      return;\n    }\n    // Next font in this class in this shape.\n    sample_index_ = 0;\n    do {\n      ++shape_font_index_;\n      if (shape_font_index_ >= num_shape_fonts_) {\n        // Next unichar in this shape.\n        shape_font_index_ = 0;\n        ++shape_char_index_;\n        if (shape_char_index_ >= num_shape_chars_) {\n          // Find the next shape that is mapped in the charset_map_.\n          shape_char_index_ = 0;\n          do {\n            ++shape_index_;\n          } while (shape_index_ < num_shapes_ && charset_map_ != nullptr &&\n                   charset_map_->SparseToCompact(shape_index_) < 0);\n          if (shape_index_ >= num_shapes_) {\n            return; // The end.\n          }\n          num_shape_chars_ = shape_table_->GetShape(shape_index_).size();\n        }\n      }\n      const UnicharAndFonts *shape_entry = GetShapeEntry();\n      num_shape_fonts_ = shape_entry->font_ids.size();\n      int char_id = shape_entry->unichar_id;\n      int font_id = shape_entry->font_ids[shape_font_index_];\n      num_samples_ = sample_set_->NumClassSamples(font_id, char_id, randomize_);\n    } while (num_samples_ == 0);\n  } else {\n    // We are just iterating over the samples.\n    ++shape_index_;\n  }\n}\n\n// Returns the size of the compact charset space.\nint SampleIterator::CompactCharsetSize() const {\n  return charset_map_ != nullptr ? charset_map_->CompactSize() : SparseCharsetSize();\n}\n\n// Returns the size of the sparse charset space.\nint SampleIterator::SparseCharsetSize() const {\n  return charset_map_ != nullptr\n             ? charset_map_->SparseSize()\n             : (shape_table_ != nullptr ? 
shape_table_->NumShapes() : sample_set_->charsetsize());\n}\n\n// Sets the mapped_features_ from the features using the provided\n// feature_map.\nstatic void MapFeatures(TrainingSample &s, const IntFeatureMap &feature_map) {\n  std::vector<int> indexed_features;\n  feature_map.feature_space().IndexAndSortFeatures(s.features(), s.num_features(),\n                                                   &indexed_features);\n  feature_map.MapIndexedFeatures(indexed_features, &s.mapped_features_);\n  s.features_are_indexed_ = false;\n  s.features_are_mapped_ = true;\n}\n\n// Apply the supplied feature_space/feature_map transform to all samples\n// accessed by this iterator.\nvoid SampleIterator::MapSampleFeatures(const IntFeatureMap &feature_map) {\n  for (Begin(); !AtEnd(); Next()) {\n    TrainingSample *sample = MutableSample();\n    MapFeatures(*sample, feature_map);\n  }\n}\n\n// Adjust the weights of all the samples to be uniform in the given charset.\n// Returns the number of samples in the iterator.\nint SampleIterator::UniformSamples() {\n  int num_good_samples = 0;\n  for (Begin(); !AtEnd(); Next()) {\n    TrainingSample *sample = MutableSample();\n    sample->set_weight(1.0);\n    ++num_good_samples;\n  }\n  NormalizeSamples();\n  return num_good_samples;\n}\n\n// Normalize the weights of all the samples in the charset_map so they sum\n// to 1. 
Returns the minimum assigned sample weight.\ndouble SampleIterator::NormalizeSamples() {\n  double total_weight = 0.0;\n  for (Begin(); !AtEnd(); Next()) {\n    const TrainingSample &sample = GetSample();\n    total_weight += sample.weight();\n  }\n  // Normalize samples.\n  double min_assigned_sample_weight = 1.0;\n  if (total_weight > 0.0) {\n    for (Begin(); !AtEnd(); Next()) {\n      TrainingSample *sample = MutableSample();\n      double weight = sample->weight() / total_weight;\n      if (weight < min_assigned_sample_weight) {\n        min_assigned_sample_weight = weight;\n      }\n      sample->set_weight(weight);\n    }\n  }\n  return min_assigned_sample_weight;\n}\n\n// Helper returns the current UnicharAndFont shape_entry.\nconst UnicharAndFonts *SampleIterator::GetShapeEntry() const {\n  const Shape &shape = shape_table_->GetShape(shape_index_);\n  return &shape[shape_char_index_];\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/common/sampleiterator.h",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_\n#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_\n\nnamespace tesseract {\n\nclass IndexMapBiDi;\nclass IntFeatureMap;\nclass ShapeTable;\nclass TrainingSample;\nclass TrainingSampleSet;\nstruct UnicharAndFonts;\n\n// Iterator class to encapsulate the complex iteration involved in getting\n// all samples of all shapes needed for a classification problem.\n//\n// =====INPUTS TO Init FUNCTION=====\n// The charset_map defines a subset of the sample_set classes (with a nullptr\n// shape_table, or the shape_table classes if not nullptr.)\n//\n// The shape_table (if not nullptr) defines the mapping from shapes to\n// font_id/class_id pairs. 
Each shape is a list of unichar_id and font lists.\n//\n// The sample_set holds the samples and provides indexed access to samples\n// of font_id/class_id pairs.\n//\n// If randomize is true, the samples are perturbed slightly, but the\n// perturbation is guaranteed to be the same for multiple identical\n// iterations.\n//\n// =====DIFFERENT COMBINATIONS OF INPUTS=====\n// nullptr shape_table:\n// Without a shape_table, everything works in UNICHAR_IDs.\n//\n// nullptr shape_table, nullptr charset_map:\n// Iterations simply run over the samples in the order the samples occur in the\n// input files.\n// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.\n//\n// nullptr shape_table, non-nullptr charset_map:\n// When shape_table is nullptr, the charset_map indexes unichar_ids directly,\n// and an iteration returns all samples of all chars in the charset_map, which\n// is a subset of the full unicharset.\n// The iteration will be in groups of the same unichar_id, in the order\n// defined by the charset_map.\n// GetCompactClassID returns the charset_map index of a sample, and\n// GetSparseClassID returns the sample UNICHAR_ID.\n//\n// Non-nullptr shape_table:\n// With a shape_table, samples are grouped according to the shape_table, so\n// multiple UNICHAR_IDs and fonts may be grouped together, and everything\n// works in shape_ids.\n//\n// Non-nullptr shape_table, nullptr charset_map.\n// Iterations simply run over the samples in the order of shape_id.\n// GetCompactClassID and GetSparseClassID both return the shape_id.\n// (If you want the unichar_id or font_id, the sample still has them.)\n//\n// Non-nullptr shape_table, non-nullptr charset_map.\n// When shape_table is not nullptr, the charset_map indexes and subsets shapes\n// in the shape_table, and iterations will be in shape_table order, not\n// charset_map order.\n// GetCompactClassID returns the charset_map index of a shape, and\n// GetSparseClassID returns the shape_id.\n//\n// =====What 
is SampleIterator good for?=====\n// Inside a classifier training module, the SampleIterator has abstracted away\n// all the different modes above.\n// Use the following iteration to train your classifier:\n// for (it.Begin(); !it.AtEnd(); it.Next()) {\n//   const TrainingSample& sample = it.GetSample();\n//   int class_id = it.GetCompactClassID();\n// Your classifier may or may not be dealing with a shape_table, and may be\n// dealing with some subset of the character/shape set. It doesn't need to\n// know and shouldn't care. It is just learning shapes with compact class ids\n// in the range [0, it.CompactCharsetSize()).\nclass SampleIterator {\npublic:\n  SampleIterator();\n  ~SampleIterator();\n\n  void Clear();\n\n  // See class comment for arguments.\n  void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize,\n            TrainingSampleSet *sample_set);\n\n  // Iterator functions designed for use with a simple for loop:\n  // for (it.Begin(); !it.AtEnd(); it.Next()) {\n  //   const TrainingSample& sample = it.GetSample();\n  //   int class_id = it.GetCompactClassID();\n  //   ...\n  // }\n  void Begin();\n  bool AtEnd() const;\n  const TrainingSample &GetSample() const;\n  TrainingSample *MutableSample() const;\n  // Returns the total index (from the original set of samples) of the current\n  // sample.\n  int GlobalSampleIndex() const;\n  // Returns the index of the current sample in compact charset space, so\n  // in a 2-class problem between x and y, the returned indices will all be\n  // 0 or 1, and have nothing to do with the unichar_ids.\n  // If the charset_map_ is nullptr, then this is equal to GetSparseClassID().\n  int GetCompactClassID() const;\n  // Returns the index of the current sample in sparse charset space, so\n  // in a 2-class problem between x and y, the returned indices will all be\n  // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids\n  // with a shape_table_.\n  int 
GetSparseClassID() const;\n  // Moves on to the next indexable sample. If the end is reached, leaves\n  // the state such that AtEnd() is true.\n  void Next();\n\n  // Returns the size of the compact charset space.\n  int CompactCharsetSize() const;\n  // Returns the size of the sparse charset space.\n  int SparseCharsetSize() const;\n\n  const IndexMapBiDi &charset_map() const {\n    return *charset_map_;\n  }\n  const ShapeTable *shape_table() const {\n    return shape_table_;\n  }\n  // Sample set operations.\n  const TrainingSampleSet *sample_set() const {\n    return sample_set_;\n  }\n\n  // A set of functions that do something to all the samples accessed by the\n  // iterator, as it is currently setup.\n\n  // Apply the supplied feature_space/feature_map transform to all samples\n  // accessed by this iterator.\n  void MapSampleFeatures(const IntFeatureMap &feature_map);\n\n  // Adjust the weights of all the samples to be uniform in the given charset.\n  // Returns the number of samples in the iterator.\n  int UniformSamples();\n\n  // Normalize the weights of all the samples defined by the iterator so they\n  // sum to 1. Returns the minimum assigned sample weight.\n  double NormalizeSamples();\n\nprivate:\n  // Helper returns the current UnicharAndFont shape_entry.\n  const UnicharAndFonts *GetShapeEntry() const;\n\n  // Map to subset the actual charset space.\n  const IndexMapBiDi *charset_map_;\n  // Shape table to recombine character classes into shapes\n  const ShapeTable *shape_table_;\n  // The samples to iterate over.\n  TrainingSampleSet *sample_set_;\n  // Flag to control randomizing the sample features.\n  bool randomize_;\n  // Shape table owned by this used to iterate character classes.\n  ShapeTable *owned_shape_table_;\n\n  // Top-level iteration. 
Shape index in sparse charset_map space.\n  int shape_index_;\n  int num_shapes_;\n  // Index to the character class within a shape.\n  int shape_char_index_;\n  int num_shape_chars_;\n  // Index to the font within a shape/class pair.\n  int shape_font_index_;\n  int num_shape_fonts_;\n  // The lowest level iteration. sample_index_/num_samples_ counts samples\n  // in the current shape/class/font combination.\n  int sample_index_;\n  int num_samples_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_\n"
  },
  {
    "path": "src/training/common/trainingsampleset.cpp",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <algorithm>\n\n#include <allheaders.h>\n#include \"boxread.h\"\n#include \"fontinfo.h\"\n//#include \"helpers.h\"\n#include \"indexmapbidi.h\"\n#include \"intfeaturedist.h\"\n#include \"intfeaturemap.h\"\n#include \"intfeaturespace.h\"\n#include \"shapetable.h\"\n#include \"tesserrstream.h\"  // for tesserr\n#include \"trainingsample.h\"\n#include \"trainingsampleset.h\"\n#include \"unicity_table.h\"\n\nnamespace tesseract {\n\nconst int kTestChar = -1; // 37;\n// Max number of distances to compute the squared way\nconst int kSquareLimit = 25;\n// Prime numbers for subsampling distances.\nconst int kPrime1 = 17;\nconst int kPrime2 = 13;\n\nTrainingSampleSet::FontClassInfo::FontClassInfo()\n    : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {}\n\n// Writes to the given file. 
Returns false in case of error.\nbool TrainingSampleSet::FontClassInfo::Serialize(FILE *fp) const {\n  if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) {\n    return false;\n  }\n  if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) {\n    return false;\n  }\n  if (!::tesseract::Serialize(fp, samples)) {\n    return false;\n  }\n  return true;\n}\n// Reads from the given file. Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE *fp) {\n  if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) {\n    return false;\n  }\n  if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) {\n    return false;\n  }\n  if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) {\n    return false;\n  }\n  if (!::tesseract::DeSerialize(swap, fp, samples)) {\n    return false;\n  }\n  if (swap) {\n    ReverseN(&num_raw_samples, sizeof(num_raw_samples));\n    ReverseN(&canonical_sample, sizeof(canonical_sample));\n    ReverseN(&canonical_dist, sizeof(canonical_dist));\n  }\n  return true;\n}\n\nTrainingSampleSet::TrainingSampleSet(const FontInfoTable &font_table)\n    : num_raw_samples_(0)\n    , unicharset_size_(0)\n    , font_class_array_(nullptr)\n    , fontinfo_table_(font_table) {}\n\nTrainingSampleSet::~TrainingSampleSet() {\n  for (auto sample : samples_) {\n    delete sample;\n  }\n  delete font_class_array_;\n}\n\n// Writes to the given file. 
Returns false in case of error.\nbool TrainingSampleSet::Serialize(FILE *fp) const {\n  if (!tesseract::Serialize(fp, samples_)) {\n    return false;\n  }\n  if (!unicharset_.save_to_file(fp)) {\n    return false;\n  }\n  if (!font_id_map_.Serialize(fp)) {\n    return false;\n  }\n  int8_t not_null = font_class_array_ != nullptr;\n  if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) {\n    return false;\n  }\n  if (not_null) {\n    if (!font_class_array_->SerializeClasses(fp)) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// Reads from the given file. Returns false in case of error.\n// If swap is true, assumes a big/little-endian swap is needed.\nbool TrainingSampleSet::DeSerialize(bool swap, FILE *fp) {\n  if (!tesseract::DeSerialize(swap, fp, samples_)) {\n    return false;\n  }\n  num_raw_samples_ = samples_.size();\n  if (!unicharset_.load_from_file(fp)) {\n    return false;\n  }\n  if (!font_id_map_.DeSerialize(swap, fp)) {\n    return false;\n  }\n  delete font_class_array_;\n  font_class_array_ = nullptr;\n  int8_t not_null;\n  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) {\n    return false;\n  }\n  if (not_null) {\n    FontClassInfo empty;\n    font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(1, 1, empty);\n    if (!font_class_array_->DeSerializeClasses(swap, fp)) {\n      return false;\n    }\n  }\n  unicharset_size_ = unicharset_.size();\n  return true;\n}\n\n// Load an initial unicharset, or set one up if the file cannot be read.\nvoid TrainingSampleSet::LoadUnicharset(const char *filename) {\n  if (!unicharset_.load_from_file(filename)) {\n    tprintf(\n        \"Failed to load unicharset from file %s\\n\"\n        \"Building unicharset from scratch...\\n\",\n        filename);\n    unicharset_.clear();\n    // Add special characters as they were removed by the clear.\n    UNICHARSET empty;\n    unicharset_.AppendOtherUnicharset(empty);\n  }\n  unicharset_size_ = unicharset_.size();\n}\n\n// Adds a character sample to this 
sample set.\n// If the unichar is not already in the local unicharset, it is added.\n// Returns the unichar_id of the added sample, from the local unicharset.\nint TrainingSampleSet::AddSample(const char *unichar, TrainingSample *sample) {\n  if (!unicharset_.contains_unichar(unichar)) {\n    unicharset_.unichar_insert(unichar);\n    if (unicharset_.size() > MAX_NUM_CLASSES) {\n      tprintf(\n          \"Error: Size of unicharset in TrainingSampleSet::AddSample is \"\n          \"greater than MAX_NUM_CLASSES\\n\");\n      return -1;\n    }\n  }\n  UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);\n  AddSample(char_id, sample);\n  return char_id;\n}\n\n// Adds a character sample to this sample set with the given unichar_id,\n// which must correspond to the local unicharset (in this).\nvoid TrainingSampleSet::AddSample(int unichar_id, TrainingSample *sample) {\n  sample->set_class_id(unichar_id);\n  samples_.push_back(sample);\n  num_raw_samples_ = samples_.size();\n  unicharset_size_ = unicharset_.size();\n}\n\n// Returns the number of samples for the given font,class pair.\n// If randomize is true, returns the number of samples accessible\n// with randomizing on. 
(Increases the number of samples if small.)\n// OrganizeByFontAndClass must have been already called.\nint TrainingSampleSet::NumClassSamples(int font_id, int class_id, bool randomize) const {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  if (font_id < 0 || class_id < 0 || font_id >= font_id_map_.SparseSize() ||\n      class_id >= unicharset_size_) {\n    // There are no samples because the font or class doesn't exist.\n    return 0;\n  }\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  if (font_index < 0) {\n    return 0; // The font has no samples.\n  }\n  if (randomize) {\n    return (*font_class_array_)(font_index, class_id).samples.size();\n  } else {\n    return (*font_class_array_)(font_index, class_id).num_raw_samples;\n  }\n}\n\n// Gets a sample by its index.\nconst TrainingSample *TrainingSampleSet::GetSample(int index) const {\n  return samples_[index];\n}\n\n// Gets a sample by its font, class, index.\n// OrganizeByFontAndClass must have been already called.\nconst TrainingSample *TrainingSampleSet::GetSample(int font_id, int class_id, int index) const {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  if (font_index < 0) {\n    return nullptr;\n  }\n  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];\n  return samples_[sample_index];\n}\n\n// Get a sample by its font, class, index. 
Does not randomize.\n// OrganizeByFontAndClass must have been already called.\nTrainingSample *TrainingSampleSet::MutableSample(int font_id, int class_id, int index) {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  if (font_index < 0) {\n    return nullptr;\n  }\n  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];\n  return samples_[sample_index];\n}\n\n// Returns a string debug representation of the given sample:\n// font, unichar_str, bounding box, page.\nstd::string TrainingSampleSet::SampleToString(const TrainingSample &sample) const {\n  std::string boxfile_str;\n  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()), sample.bounding_box(),\n                 sample.page_num(), boxfile_str);\n  return std::string(fontinfo_table_.at(sample.font_id()).name) + \" \" + boxfile_str;\n}\n\n// Gets the combined set of features used by all the samples of the given\n// font/class combination.\nconst BitVector &TrainingSampleSet::GetCloudFeatures(int font_id, int class_id) const {\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  ASSERT_HOST(font_index >= 0);\n  return (*font_class_array_)(font_index, class_id).cloud_features;\n}\n// Gets the indexed features of the canonical sample of the given\n// font/class combination.\nconst std::vector<int> &TrainingSampleSet::GetCanonicalFeatures(int font_id, int class_id) const {\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  ASSERT_HOST(font_index >= 0);\n  return (*font_class_array_)(font_index, class_id).canonical_features;\n}\n\n// Returns the distance between the given UniCharAndFonts pair.\n// If matched_fonts, only matching fonts, are considered, unless that yields\n// the empty set.\n// OrganizeByFontAndClass must have been already called.\nfloat TrainingSampleSet::UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2,\n                                         bool matched_fonts, 
const IntFeatureMap &feature_map) {\n  int num_fonts1 = uf1.font_ids.size();\n  int c1 = uf1.unichar_id;\n  int num_fonts2 = uf2.font_ids.size();\n  int c2 = uf2.unichar_id;\n  double dist_sum = 0.0;\n  int dist_count = 0;\n  const bool debug = false;\n  if (matched_fonts) {\n    // Compute distances only where fonts match.\n    for (int i = 0; i < num_fonts1; ++i) {\n      int f1 = uf1.font_ids[i];\n      for (int j = 0; j < num_fonts2; ++j) {\n        int f2 = uf2.font_ids[j];\n        if (f1 == f2) {\n          dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);\n          ++dist_count;\n        }\n      }\n    }\n  } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {\n    // Small enough sets to compute all the distances.\n    for (int i = 0; i < num_fonts1; ++i) {\n      int f1 = uf1.font_ids[i];\n      for (int j = 0; j < num_fonts2; ++j) {\n        int f2 = uf2.font_ids[j];\n        dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);\n        if (debug) {\n          tprintf(\"Cluster dist %d %d %d %d = %g\\n\", f1, c1, f2, c2,\n                  ClusterDistance(f1, c1, f2, c2, feature_map));\n        }\n        ++dist_count;\n      }\n    }\n  } else {\n    // Subsample distances, using the largest set once, and stepping through\n    // the smaller set so as to ensure that all the pairs are different.\n    int increment = kPrime1 != num_fonts2 ? 
kPrime1 : kPrime2;\n    int index = 0;\n    int num_samples = std::max(num_fonts1, num_fonts2);\n    for (int i = 0; i < num_samples; ++i, index += increment) {\n      int f1 = uf1.font_ids[i % num_fonts1];\n      int f2 = uf2.font_ids[index % num_fonts2];\n      if (debug) {\n        tprintf(\"Cluster dist %d %d %d %d = %g\\n\", f1, c1, f2, c2,\n                ClusterDistance(f1, c1, f2, c2, feature_map));\n      }\n      dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);\n      ++dist_count;\n    }\n  }\n  if (dist_count == 0) {\n    if (matched_fonts) {\n      return UnicharDistance(uf1, uf2, false, feature_map);\n    }\n    return 0.0f;\n  }\n  return dist_sum / dist_count;\n}\n\n// Returns the distance between the given pair of font/class pairs.\n// Finds in cache or computes and caches.\n// OrganizeByFontAndClass must have been already called.\nfloat TrainingSampleSet::ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2,\n                                         const IntFeatureMap &feature_map) {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_index1 = font_id_map_.SparseToCompact(font_id1);\n  int font_index2 = font_id_map_.SparseToCompact(font_id2);\n  if (font_index1 < 0 || font_index2 < 0) {\n    return 0.0f;\n  }\n  FontClassInfo &fc_info = (*font_class_array_)(font_index1, class_id1);\n  if (font_id1 == font_id2) {\n    // Special case cache for speed.\n    if (fc_info.unichar_distance_cache.empty()) {\n      fc_info.unichar_distance_cache.resize(unicharset_size_, -1.0f);\n    }\n    if (fc_info.unichar_distance_cache[class_id2] < 0) {\n      // Distance has to be calculated.\n      float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);\n      fc_info.unichar_distance_cache[class_id2] = result;\n      // Copy to the symmetric cache entry.\n      FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);\n      if (fc_info2.unichar_distance_cache.empty()) {\n      
  fc_info2.unichar_distance_cache.resize(unicharset_size_, -1.0f);\n      }\n      fc_info2.unichar_distance_cache[class_id1] = result;\n    }\n    return fc_info.unichar_distance_cache[class_id2];\n  } else if (class_id1 == class_id2) {\n    // Another special-case cache for equal class-id.\n    if (fc_info.font_distance_cache.empty()) {\n      fc_info.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f);\n    }\n    if (fc_info.font_distance_cache[font_index2] < 0) {\n      // Distance has to be calculated.\n      float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);\n      fc_info.font_distance_cache[font_index2] = result;\n      // Copy to the symmetric cache entry.\n      FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);\n      if (fc_info2.font_distance_cache.empty()) {\n        fc_info2.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f);\n      }\n      fc_info2.font_distance_cache[font_index1] = result;\n    }\n    return fc_info.font_distance_cache[font_index2];\n  }\n  // Both font and class are different. Linear search for class_id2/font_id2\n  // in what is a hopefully short list of distances.\n  size_t cache_index = 0;\n  while (cache_index < fc_info.distance_cache.size() &&\n         (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||\n          fc_info.distance_cache[cache_index].font_id != font_id2)) {\n    ++cache_index;\n  }\n  if (cache_index == fc_info.distance_cache.size()) {\n    // Distance has to be calculated.\n    float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);\n    FontClassDistance fc_dist = {class_id2, font_id2, result};\n    fc_info.distance_cache.push_back(fc_dist);\n    // Copy to the symmetric cache entry. 
We know it isn't there already, as\n    // we always copy to the symmetric entry.\n    FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);\n    fc_dist.unichar_id = class_id1;\n    fc_dist.font_id = font_id1;\n    fc_info2.distance_cache.push_back(fc_dist);\n  }\n  return fc_info.distance_cache[cache_index].distance;\n}\n\n// Computes the distance between the given pair of font/class pairs.\nfloat TrainingSampleSet::ComputeClusterDistance(int font_id1, int class_id1, int font_id2,\n                                                int class_id2,\n                                                const IntFeatureMap &feature_map) const {\n  int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2, feature_map, false);\n  dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1, feature_map, false);\n  int denominator = GetCanonicalFeatures(font_id1, class_id1).size();\n  denominator += GetCanonicalFeatures(font_id2, class_id2).size();\n  return static_cast<float>(dist) / denominator;\n}\n\n// Helper to add a feature and its near neighbors to the good_features.\n// levels indicates how many times to compute the offset features of what is\n// already there. 
This is done by iteration rather than recursion.\nstatic void AddNearFeatures(const IntFeatureMap &feature_map, int f, int levels,\n                            std::vector<int> *good_features) {\n  int prev_num_features = 0;\n  good_features->push_back(f);\n  int num_features = 1;\n  for (int level = 0; level < levels; ++level) {\n    for (int i = prev_num_features; i < num_features; ++i) {\n      int feature = (*good_features)[i];\n      for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {\n        if (dir == 0) {\n          continue;\n        }\n        int f1 = feature_map.OffsetFeature(feature, dir);\n        if (f1 >= 0) {\n          good_features->push_back(f1);\n        }\n      }\n    }\n    prev_num_features = num_features;\n    num_features = good_features->size();\n  }\n}\n\n// Returns the number of canonical features of font/class 2 for which\n// neither the feature nor any of its near neighbors occurs in the cloud\n// of font/class 1. Each such feature is a reliable separation between\n// the classes, ASSUMING that the canonical sample is sufficiently\n// representative that every sample has a feature near that particular\n// feature. 
To check that this is so on the fly would be prohibitively\n// expensive, but it might be possible to pre-qualify the canonical features\n// to include only those for which this assumption is true.\n// ComputeCanonicalFeatures and ComputeCloudFeatures must have been called\n// first, or the results will be nonsense.\nint TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2,\n                                         const IntFeatureMap &feature_map, bool thorough) const {\n  int result = 0;\n  const TrainingSample *sample2 = GetCanonicalSample(font_id2, class_id2);\n  if (sample2 == nullptr) {\n    return 0; // There are no canonical features.\n  }\n  const std::vector<int> &canonical2 = GetCanonicalFeatures(font_id2, class_id2);\n  const BitVector &cloud1 = GetCloudFeatures(font_id1, class_id1);\n  if (cloud1.empty()) {\n    return canonical2.size(); // There are no cloud features.\n  }\n\n  // Find a canonical2 feature that is not in cloud1.\n  for (int feature : canonical2) {\n    if (cloud1[feature]) {\n      continue;\n    }\n    // Gather the near neighbours of f.\n    std::vector<int> good_features;\n    AddNearFeatures(feature_map, feature, 1, &good_features);\n    // Check that none of the good_features are in the cloud.\n    bool found = false;\n    for (auto good_f : good_features) {\n      if (cloud1[good_f]) {\n        found = true;\n        break;\n      }\n    }\n    if (found) {\n      continue; // Found one in the cloud.\n    }\n    ++result;\n  }\n  return result;\n}\n\n// Returns the total index of the requested sample.\n// OrganizeByFontAndClass must have been already called.\nint TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id, int index) const {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  if (font_index < 0) {\n    return -1;\n  }\n  return (*font_class_array_)(font_index, class_id).samples[index];\n}\n\n// Gets the canonical 
sample for the given font, class pair.\n// ComputeCanonicalSamples must have been called first.\nconst TrainingSample *TrainingSampleSet::GetCanonicalSample(int font_id, int class_id) const {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  if (font_index < 0) {\n    return nullptr;\n  }\n  const int sample_index = (*font_class_array_)(font_index, class_id).canonical_sample;\n  return sample_index >= 0 ? samples_[sample_index] : nullptr;\n}\n\n// Gets the max distance for the given canonical sample.\n// ComputeCanonicalSamples must have been called first.\nfloat TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_index = font_id_map_.SparseToCompact(font_id);\n  if (font_index < 0) {\n    return 0.0f;\n  }\n  if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0) {\n    return (*font_class_array_)(font_index, class_id).canonical_dist;\n  } else {\n    return 0.0f;\n  }\n}\n\n// Generates indexed features for all samples with the supplied feature_space.\nvoid TrainingSampleSet::IndexFeatures(const IntFeatureSpace &feature_space) {\n  for (auto &sample : samples_) {\n    sample->IndexFeatures(feature_space);\n  }\n}\n\n// Marks the given sample index for deletion.\n// Deletion is actually completed by DeleteDeadSamples.\nvoid TrainingSampleSet::KillSample(TrainingSample *sample) {\n  sample->set_sample_index(-1);\n}\n\n// Deletes all samples with zero features marked by KillSample.\nvoid TrainingSampleSet::DeleteDeadSamples() {\n  using namespace std::placeholders; // for _1\n  for (auto &&it = samples_.begin(); it < samples_.end();) {\n    if (*it == nullptr || (*it)->class_id() < 0) {\n      samples_.erase(it);\n      delete *it;\n    } else {\n      ++it;\n    }\n  }\n  num_raw_samples_ = samples_.size();\n  // Samples must be re-organized now we have deleted a few.\n}\n\n// Construct an array to access the samples 
by font,class pair.\nvoid TrainingSampleSet::OrganizeByFontAndClass() {\n  // Font indexes are sparse, so we used a map to compact them, so we can\n  // have an efficient 2-d array of fonts and character classes.\n  SetupFontIdMap();\n  int compact_font_size = font_id_map_.CompactSize();\n  // Get a 2-d array of generic vectors.\n  delete font_class_array_;\n  FontClassInfo empty;\n  font_class_array_ =\n      new GENERIC_2D_ARRAY<FontClassInfo>(compact_font_size, unicharset_size_, empty);\n  for (size_t s = 0; s < samples_.size(); ++s) {\n    int font_id = samples_[s]->font_id();\n    int class_id = samples_[s]->class_id();\n    if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {\n      tesserr << \"Font id = \" << font_id << '/' << font_id_map_.SparseSize()\n              << \", class id = \" << class_id << '/' << unicharset_size_\n              << \" on sample \" << s << '\\n';\n    }\n    ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());\n    ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);\n    int font_index = font_id_map_.SparseToCompact(font_id);\n    (*font_class_array_)(font_index, class_id).samples.push_back(s);\n  }\n  // Set the num_raw_samples member of the FontClassInfo, to set the boundary\n  // between the raw samples and the replicated ones.\n  for (int f = 0; f < compact_font_size; ++f) {\n    for (int c = 0; c < unicharset_size_; ++c) {\n      (*font_class_array_)(f, c).num_raw_samples = (*font_class_array_)(f, c).samples.size();\n    }\n  }\n  // This is the global number of samples and also marks the boundary between\n  // real and replicated samples.\n  num_raw_samples_ = samples_.size();\n}\n\n// Constructs the font_id_map_ which maps real font_ids (sparse) to a compact\n// index for the font_class_array_.\nvoid TrainingSampleSet::SetupFontIdMap() {\n  // Number of samples for each font_id.\n  std::vector<int> font_counts;\n  for (auto &sample : samples_) {\n    const int font_id = sample->font_id();\n    
while (font_id >= font_counts.size()) {\n      font_counts.push_back(0);\n    }\n    ++font_counts[font_id];\n  }\n  font_id_map_.Init(font_counts.size(), false);\n  for (size_t f = 0; f < font_counts.size(); ++f) {\n    font_id_map_.SetMap(f, font_counts[f] > 0);\n  }\n  font_id_map_.Setup();\n}\n\n// Finds the sample for each font, class pair that has least maximum\n// distance to all the other samples of the same font, class.\n// OrganizeByFontAndClass must have been already called.\nvoid TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap &map, bool debug) {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  IntFeatureDist f_table;\n  if (debug) {\n    tprintf(\"feature table size %d\\n\", map.sparse_size());\n  }\n  f_table.Init(&map);\n  int worst_s1 = 0;\n  int worst_s2 = 0;\n  double global_worst_dist = 0.0;\n  // Compute distances independently for each font and char index.\n  int font_size = font_id_map_.CompactSize();\n  for (int font_index = 0; font_index < font_size; ++font_index) {\n    int font_id = font_id_map_.CompactToSparse(font_index);\n    for (int c = 0; c < unicharset_size_; ++c) {\n      int samples_found = 0;\n      FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);\n      if (fcinfo.samples.empty() || (kTestChar >= 0 && c != kTestChar)) {\n        fcinfo.canonical_sample = -1;\n        fcinfo.canonical_dist = 0.0f;\n        if (debug) {\n          tprintf(\"Skipping class %d\\n\", c);\n        }\n        continue;\n      }\n      // The canonical sample will be the one with the min_max_dist, which\n      // is the sample with the lowest maximum distance to all other samples.\n      double min_max_dist = 2.0;\n      // We keep track of the farthest apart pair (max_s1, max_s2) which\n      // are max_max_dist apart, so we can see how bad the variability is.\n      double max_max_dist = 0.0;\n      int max_s1 = 0;\n      int max_s2 = 0;\n      fcinfo.canonical_sample = fcinfo.samples[0];\n      fcinfo.canonical_dist = 
0.0f;\n      for (auto s1 : fcinfo.samples) {\n        const std::vector<int> &features1 = samples_[s1]->indexed_features();\n        f_table.Set(features1, features1.size(), true);\n        double max_dist = 0.0;\n        // Run the full squared-order search for similar samples. It is still\n        // reasonably fast because f_table.FeatureDistance is fast, but we\n        // may have to reconsider if we start playing with too many samples\n        // of a single char/font.\n        for (int s2 : fcinfo.samples) {\n          if (samples_[s2]->class_id() != c || samples_[s2]->font_id() != font_id || s2 == s1) {\n            continue;\n          }\n          std::vector<int> features2 = samples_[s2]->indexed_features();\n          double dist = f_table.FeatureDistance(features2);\n          if (dist > max_dist) {\n            max_dist = dist;\n            if (dist > max_max_dist) {\n              max_max_dist = dist;\n              max_s1 = s1;\n              max_s2 = s2;\n            }\n          }\n        }\n        // Using Set(..., false) is far faster than re initializing, due to\n        // the sparseness of the feature space.\n        f_table.Set(features1, features1.size(), false);\n        samples_[s1]->set_max_dist(max_dist);\n        ++samples_found;\n        if (max_dist < min_max_dist) {\n          fcinfo.canonical_sample = s1;\n          fcinfo.canonical_dist = max_dist;\n        }\n        UpdateRange(max_dist, &min_max_dist, &max_max_dist);\n      }\n      if (max_max_dist > global_worst_dist) {\n        // Keep a record of the worst pair over all characters/fonts too.\n        global_worst_dist = max_max_dist;\n        worst_s1 = max_s1;\n        worst_s2 = max_s2;\n      }\n      if (debug) {\n        tprintf(\n            \"Found %d samples of class %d=%s, font %d, \"\n            \"dist range [%g, %g], worst pair= %s, %s\\n\",\n            samples_found, c, unicharset_.debug_str(c).c_str(), font_index, min_max_dist,\n            max_max_dist, 
SampleToString(*samples_[max_s1]).c_str(),\n            SampleToString(*samples_[max_s2]).c_str());\n      }\n    }\n  }\n  if (debug) {\n    tprintf(\"Global worst dist = %g, between sample %d and %d\\n\", global_worst_dist, worst_s1,\n            worst_s2);\n  }\n}\n\n// Replicates the samples to a minimum frequency defined by\n// 2 * kSampleRandomSize, or for larger counts duplicates all samples.\n// After replication, the replicated samples are perturbed slightly, but\n// in a predictable and repeatable way.\n// Use after OrganizeByFontAndClass().\nvoid TrainingSampleSet::ReplicateAndRandomizeSamples() {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_size = font_id_map_.CompactSize();\n  for (int font_index = 0; font_index < font_size; ++font_index) {\n    for (int c = 0; c < unicharset_size_; ++c) {\n      FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);\n      int sample_count = fcinfo.samples.size();\n      int min_samples = 2 * std::max(kSampleRandomSize, sample_count);\n      if (sample_count > 0 && sample_count < min_samples) {\n        int base_count = sample_count;\n        for (int base_index = 0; sample_count < min_samples; ++sample_count) {\n          int src_index = fcinfo.samples[base_index++];\n          if (base_index >= base_count) {\n            base_index = 0;\n          }\n          TrainingSample *sample =\n              samples_[src_index]->RandomizedCopy(sample_count % kSampleRandomSize);\n          int sample_index = samples_.size();\n          sample->set_sample_index(sample_index);\n          samples_.push_back(sample);\n          fcinfo.samples.push_back(sample_index);\n        }\n      }\n    }\n  }\n}\n\n// Caches the indexed features of the canonical samples.\n// ComputeCanonicalSamples must have been already called.\n// TODO(rays) see note on ReliablySeparable and try restricting the\n// canonical features to those that truly represent all samples.\nvoid TrainingSampleSet::ComputeCanonicalFeatures() {\n  
ASSERT_HOST(font_class_array_ != nullptr);\n  const int font_size = font_id_map_.CompactSize();\n  for (int font_index = 0; font_index < font_size; ++font_index) {\n    const int font_id = font_id_map_.CompactToSparse(font_index);\n    for (int c = 0; c < unicharset_size_; ++c) {\n      int num_samples = NumClassSamples(font_id, c, false);\n      if (num_samples == 0) {\n        continue;\n      }\n      const TrainingSample *sample = GetCanonicalSample(font_id, c);\n      FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);\n      fcinfo.canonical_features = sample->indexed_features();\n    }\n  }\n}\n\n// Computes the combined set of features used by all the samples of each\n// font/class combination. Use after ReplicateAndRandomizeSamples.\nvoid TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {\n  ASSERT_HOST(font_class_array_ != nullptr);\n  int font_size = font_id_map_.CompactSize();\n  for (int font_index = 0; font_index < font_size; ++font_index) {\n    int font_id = font_id_map_.CompactToSparse(font_index);\n    for (int c = 0; c < unicharset_size_; ++c) {\n      int num_samples = NumClassSamples(font_id, c, false);\n      if (num_samples == 0) {\n        continue;\n      }\n      FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);\n      fcinfo.cloud_features.Init(feature_space_size);\n      for (int s = 0; s < num_samples; ++s) {\n        const TrainingSample *sample = GetSample(font_id, c, s);\n        const std::vector<int> &sample_features = sample->indexed_features();\n        for (int sample_feature : sample_features) {\n          fcinfo.cloud_features.SetBit(sample_feature);\n        }\n      }\n    }\n  }\n}\n\n// Adds all fonts of the given class to the shape.\nvoid TrainingSampleSet::AddAllFontsForClass(int class_id, Shape *shape) const {\n  for (int f = 0; f < font_id_map_.CompactSize(); ++f) {\n    const int font_id = font_id_map_.CompactToSparse(f);\n    shape->AddToShape(class_id, font_id);\n  }\n}\n\n#ifndef 
GRAPHICS_DISABLED\n\n// Display the samples with the given indexed feature that also match\n// the given shape.\nvoid TrainingSampleSet::DisplaySamplesWithFeature(int f_index, const Shape &shape,\n                                                  const IntFeatureSpace &space,\n                                                  ScrollView::Color color,\n                                                  ScrollView *window) const {\n  for (int s = 0; s < num_raw_samples(); ++s) {\n    const TrainingSample *sample = GetSample(s);\n    if (shape.ContainsUnichar(sample->class_id())) {\n      std::vector<int> indexed_features;\n      space.IndexAndSortFeatures(sample->features(), sample->num_features(), &indexed_features);\n      for (int indexed_feature : indexed_features) {\n        if (indexed_feature == f_index) {\n          sample->DisplayFeatures(color, window);\n        }\n      }\n    }\n  }\n}\n\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/common/trainingsampleset.h",
    "content": "// Copyright 2010 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H_\n#define TESSERACT_TRAINING_TRAININGSAMPLESET_H_\n\n#include \"bitvector.h\"\n#include \"indexmapbidi.h\"\n#include \"matrix.h\"\n#include \"shapetable.h\"\n#include \"trainingsample.h\"\n\nnamespace tesseract {\n\nclass UNICHARSET;\nstruct FontInfo;\nclass FontInfoTable;\nclass IntFeatureMap;\nclass IntFeatureSpace;\nclass TrainingSample;\nstruct UnicharAndFonts;\n\n// Collection of TrainingSample used for training or testing a classifier.\n// Provides several useful methods to operate on the collection as a whole,\n// including outlier detection and deletion, providing access by font and\n// class, finding the canonical sample, finding the \"cloud\" features (OR of\n// all features in all samples), replication of samples, caching of distance\n// metrics.\nclass TrainingSampleSet {\npublic:\n  explicit TrainingSampleSet(const FontInfoTable &fontinfo_table);\n  ~TrainingSampleSet();\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(FILE *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  // If swap is true, assumes a big/little-endian swap is needed.\n  bool DeSerialize(bool swap, FILE *fp);\n\n  // Accessors\n  int num_samples() const {\n    return samples_.size();\n  }\n  int num_raw_samples() const {\n    return num_raw_samples_;\n  }\n  int NumFonts() const {\n    return font_id_map_.SparseSize();\n  }\n  const UNICHARSET &unicharset() const {\n    return unicharset_;\n  }\n  int charsetsize() const {\n    return unicharset_size_;\n  }\n  const FontInfoTable &fontinfo_table() const {\n    return fontinfo_table_;\n  }\n\n  // Loads an initial unicharset, or sets one up if the file cannot be read.\n  void LoadUnicharset(const char *filename);\n\n  // Adds a character sample to this sample set.\n  // If the unichar is not already in the local unicharset, it is added.\n  // Returns the unichar_id of the added sample, from the local unicharset.\n  int AddSample(const char *unichar, TrainingSample *sample);\n  // Adds a character sample to this sample set with the given unichar_id,\n  // which must correspond to the local unicharset (in this).\n  void AddSample(int unichar_id, TrainingSample *sample);\n\n  // Returns the number of samples for the given font,class pair.\n  // If randomize is true, returns the number of samples accessible\n  // with randomizing on. (Increases the number of samples if small.)\n  // OrganizeByFontAndClass must have been already called.\n  int NumClassSamples(int font_id, int class_id, bool randomize) const;\n\n  // Gets a sample by its index.\n  const TrainingSample *GetSample(int index) const;\n\n  // Gets a sample by its font, class, index.\n  // OrganizeByFontAndClass must have been already called.\n  const TrainingSample *GetSample(int font_id, int class_id, int index) const;\n\n  // Get a sample by its font, class, index. 
Does not randomize.\n  // OrganizeByFontAndClass must have been already called.\n  TrainingSample *MutableSample(int font_id, int class_id, int index);\n\n  // Returns a string debug representation of the given sample:\n  // font, unichar_str, bounding box, page.\n  std::string SampleToString(const TrainingSample &sample) const;\n\n  // Gets the combined set of features used by all the samples of the given\n  // font/class combination.\n  const BitVector &GetCloudFeatures(int font_id, int class_id) const;\n  // Gets the indexed features of the canonical sample of the given\n  // font/class combination.\n  const std::vector<int> &GetCanonicalFeatures(int font_id, int class_id) const;\n\n  // Returns the distance between the given UniCharAndFonts pair.\n  // If matched_fonts, only matching fonts, are considered, unless that yields\n  // the empty set.\n  // OrganizeByFontAndClass must have been already called.\n  float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts,\n                        const IntFeatureMap &feature_map);\n\n  // Returns the distance between the given pair of font/class pairs.\n  // Finds in cache or computes and caches.\n  // OrganizeByFontAndClass must have been already called.\n  float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2,\n                        const IntFeatureMap &feature_map);\n\n  // Computes the distance between the given pair of font/class pairs.\n  float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2,\n                               const IntFeatureMap &feature_map) const;\n\n  // Returns the number of canonical features of font/class 2 for which\n  // neither the feature nor any of its near neighbors occurs in the cloud\n  // of font/class 1. 
Each such feature is a reliable separation between\n  // the classes, ASSUMING that the canonical sample is sufficiently\n  // representative that every sample has a feature near that particular\n  // feature. To check that this is so on the fly would be prohibitively\n  // expensive, but it might be possible to pre-qualify the canonical features\n  // to include only those for which this assumption is true.\n  // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called\n  // first, or the results will be nonsense.\n  int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2,\n                        const IntFeatureMap &feature_map, bool thorough) const;\n\n  // Returns the total index of the requested sample.\n  // OrganizeByFontAndClass must have been already called.\n  int GlobalSampleIndex(int font_id, int class_id, int index) const;\n\n  // Gets the canonical sample for the given font, class pair.\n  // ComputeCanonicalSamples must have been called first.\n  const TrainingSample *GetCanonicalSample(int font_id, int class_id) const;\n  // Gets the max distance for the given canonical sample.\n  // ComputeCanonicalSamples must have been called first.\n  float GetCanonicalDist(int font_id, int class_id) const;\n\n  // Returns a mutable pointer to the sample with the given index.\n  TrainingSample *mutable_sample(int index) {\n    return samples_[index];\n  }\n  // Gets ownership of the sample with the given index, removing it from this.\n  TrainingSample *extract_sample(int index) {\n    TrainingSample *sample = samples_[index];\n    samples_[index] = nullptr;\n    return sample;\n  }\n\n  // Generates indexed features for all samples with the supplied feature_space.\n  void IndexFeatures(const IntFeatureSpace &feature_space);\n\n  // Marks the given sample for deletion.\n  // Deletion is actually completed by DeleteDeadSamples.\n  void KillSample(TrainingSample *sample);\n\n  // Deletes all samples with a negative sample 
index marked by KillSample.\n  // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass\n  // must be called after as the samples have been renumbered.\n  void DeleteDeadSamples();\n\n  // Construct an array to access the samples by font,class pair.\n  void OrganizeByFontAndClass();\n\n  // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact\n  // index for the font_class_array_.\n  void SetupFontIdMap();\n\n  // Finds the sample for each font, class pair that has least maximum\n  // distance to all the other samples of the same font, class.\n  // OrganizeByFontAndClass must have been already called.\n  void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug);\n\n  // Replicates the samples to a minimum frequency defined by\n  // 2 * kSampleRandomSize, or for larger counts duplicates all samples.\n  // After replication, the replicated samples are perturbed slightly, but\n  // in a predictable and repeatable way.\n  // Use after OrganizeByFontAndClass().\n  void ReplicateAndRandomizeSamples();\n\n  // Caches the indexed features of the canonical samples.\n  // ComputeCanonicalSamples must have been already called.\n  void ComputeCanonicalFeatures();\n  // Computes the combined set of features used by all the samples of each\n  // font/class combination. 
Use after ReplicateAndRandomizeSamples.\n  void ComputeCloudFeatures(int feature_space_size);\n\n  // Adds all fonts of the given class to the shape.\n  void AddAllFontsForClass(int class_id, Shape *shape) const;\n\n  // Display the samples with the given indexed feature that also match\n  // the given shape.\n  void DisplaySamplesWithFeature(int f_index, const Shape &shape,\n                                 const IntFeatureSpace &feature_space, ScrollView::Color color,\n                                 ScrollView *window) const;\n\nprivate:\n  // Struct to store a triplet of unichar, font, distance in the distance cache.\n  struct FontClassDistance {\n    int unichar_id;\n    int font_id; // Real font id.\n    float distance;\n  };\n  // Simple struct to store information related to each font/class combination.\n  struct FontClassInfo {\n    FontClassInfo();\n\n    // Writes to the given file. Returns false in case of error.\n    bool Serialize(FILE *fp) const;\n    // Reads from the given file. Returns false in case of error.\n    // If swap is true, assumes a big/little-endian swap is needed.\n    bool DeSerialize(bool swap, FILE *fp);\n\n    // Number of raw samples.\n    int32_t num_raw_samples;\n    // Index of the canonical sample.\n    int32_t canonical_sample;\n    // Max distance of the canonical sample from any other.\n    float canonical_dist;\n    // Sample indices for the samples, including replicated.\n    std::vector<int32_t> samples;\n\n    // Non-serialized cache data.\n    // Indexed features of the canonical sample.\n    std::vector<int> canonical_features;\n    // The mapped features of all the samples.\n    BitVector cloud_features;\n\n    // Caches for ClusterDistance.\n    // Caches for other fonts but matching this unichar. -1 indicates not set.\n    // Indexed by compact font index from font_id_map_.\n    std::vector<float> font_distance_cache;\n    // Caches for other unichars but matching this font. 
-1 indicates not set.\n    std::vector<float> unichar_distance_cache;\n    // Cache for the rest (non matching font and unichar.)\n    // A cache of distances computed by ReliablySeparable.\n    std::vector<FontClassDistance> distance_cache;\n  };\n\n  std::vector<TrainingSample *> samples_;\n  // Number of samples before replication/randomization.\n  int num_raw_samples_;\n  // Character set we are training for.\n  UNICHARSET unicharset_;\n  // Character set size to which the 2-d arrays below refer.\n  int unicharset_size_;\n  // Map to allow the font_class_array_ below to be compact.\n  // The sparse space is the real font_id, used in samples_ .\n  // The compact space is an index to font_class_array_\n  IndexMapBiDi font_id_map_;\n  // A 2-d array of FontClassInfo holding information related to each\n  // (font_id, class_id) pair.\n  GENERIC_2D_ARRAY<FontClassInfo> *font_class_array_;\n\n  // Reference to the fontinfo_table_ in MasterTrainer. Provides names\n  // for font_ids in the samples. Not serialized!\n  const FontInfoTable &fontinfo_table_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TRAINING_TRAININGSAMPLESET_H_\n"
  },
  {
    "path": "src/training/dawg2wordlist.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        dawg2wordlist.cpp\n// Description: Program to create a word list from a DAWG and unicharset.\n// Author:      David Eger\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"dawg.h\"\n#include \"trie.h\"\n#include \"unicharset.h\"\n\n#include \"serialis.h\"\n\nusing namespace tesseract;\n\nstatic std::unique_ptr<tesseract::Dawg> LoadSquishedDawg(const UNICHARSET &unicharset, const char *filename) {\n  const int kDictDebugLevel = 1;\n  tesseract::TFile dawg_file;\n  if (!dawg_file.Open(filename, nullptr)) {\n    tprintf(\"Could not open %s for reading.\\n\", filename);\n    return nullptr;\n  }\n  tprintf(\"Loading word list from %s\\n\", filename);\n  auto retval = std::make_unique<tesseract::SquishedDawg>(tesseract::DAWG_TYPE_WORD, \"eng\",\n                                                          SYSTEM_DAWG_PERM, kDictDebugLevel);\n  if (!retval->Load(&dawg_file)) {\n    tprintf(\"Could not read %s\\n\", filename);\n    return nullptr;\n  }\n  tprintf(\"Word list loaded.\\n\");\n  return retval;\n}\n\nclass WordOutputter {\npublic:\n  WordOutputter(FILE *file) : file_(file) {}\n  void output_word(const char *word) {\n    fprintf(file_, \"%s\\n\", word);\n  }\n\nprivate:\n  
FILE *file_;\n};\n\n// returns 0 if successful.\nstatic int WriteDawgAsWordlist(const UNICHARSET &unicharset, const tesseract::Dawg *dawg,\n                               const char *outfile_name) {\n  FILE *out = fopen(outfile_name, \"wb\");\n  if (out == nullptr) {\n    tprintf(\"Could not open %s for writing.\\n\", outfile_name);\n    return EXIT_FAILURE;\n  }\n  WordOutputter outputter(out);\n  using namespace std::placeholders; // for _1\n  dawg->iterate_words(unicharset, std::bind(&WordOutputter::output_word, &outputter, _1));\n  return fclose(out);\n}\n\nint main(int argc, char *argv[]) {\n  tesseract::CheckSharedLibraryVersion();\n\n  if (argc > 1 && (!strcmp(argv[1], \"-v\") || !strcmp(argv[1], \"--version\"))) {\n    printf(\"%s\\n\", tesseract::TessBaseAPI::Version());\n    return 0;\n  } else if (argc != 4) {\n    tprintf(\"Print all the words in a given dawg.\\n\");\n    tprintf(\n        \"Usage: %s -v | --version | %s <unicharset> <dawgfile> \"\n        \"<wordlistfile>\\n\",\n        argv[0], argv[0]);\n    return EXIT_FAILURE;\n  }\n  const char *unicharset_file = argv[1];\n  const char *dawg_file = argv[2];\n  const char *wordlist_file = argv[3];\n  UNICHARSET unicharset;\n  if (!unicharset.load_from_file(unicharset_file)) {\n    tprintf(\"Error loading unicharset from %s.\\n\", unicharset_file);\n    return EXIT_FAILURE;\n  }\n  auto dict = LoadSquishedDawg(unicharset, dawg_file);\n  if (dict == nullptr) {\n    tprintf(\"Error loading dictionary from %s.\\n\", dawg_file);\n    return EXIT_FAILURE;\n  }\n  int retval = WriteDawgAsWordlist(unicharset, dict.get(), wordlist_file);\n  return retval;\n}\n"
  },
  {
    "path": "src/training/degradeimage.cpp",
    "content": "/**********************************************************************\n * File:        degradeimage.cpp\n * Description: Function to degrade an image (usually of text) as if it\n *              has been printed and then scanned.\n * Authors:     Ray Smith\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#include \"degradeimage.h\"\n\n#include <allheaders.h> // from leptonica\n#include <cstdlib>\n#include \"helpers.h\" // For TRand.\n#include \"rect.h\"\n\nnamespace tesseract {\n\n// A randomized perspective distortion can be applied to synthetic input.\n// The perspective distortion comes from leptonica, which uses 2 sets of 4\n// corners to determine the distortion. There are random values for each of\n// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead\n// defined in terms of a single shear value. 
This reduces the degrees of\n// freedom enough to make the distortion more realistic than it would otherwise\n// be if all 8 coordinates could move independently.\n// One additional factor is used for the color of the pixels that don't exist\n// in the source image.\n// Name for each of the randomizing factors.\nenum FactorNames {\n  FN_INCOLOR,\n  FN_Y0,\n  FN_Y1,\n  FN_Y2,\n  FN_Y3,\n  FN_X0,\n  FN_X1,\n  FN_SHEAR,\n  // x2 = x1 - shear\n  // x3 = x0 + shear\n  FN_NUM_FACTORS\n};\n\n// Rotation is +/- kRotationRange radians.\nconst float kRotationRange = 0.02f;\n// Number of grey levels to shift by for each exposure step.\nconst int kExposureFactor = 16;\n// Salt and pepper noise is +/- kSaltnPepper.\nconst int kSaltnPepper = 5;\n// Min sum of width + height on which to operate the ramp.\nconst int kMinRampSize = 1000;\n\n// Degrade the pix as if by a print/copy/scan cycle with exposure > 0\n// corresponding to darkening on the copier and <0 lighter and 0 not copied.\n// Exposures in [-2,2] are most useful, with -3 and 3 being extreme.\n// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the\n// pix is rotated by *rotation else it is randomly rotated and *rotation is\n// modified.\n//\n// HOW IT WORKS:\n// Most of the process is really dictated by the fact that the minimum\n// available convolution is 3X3, which is too big really to simulate a\n// good quality print/scan process. (2X2 would be better.)\n// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the\n// images generally biased to being too light, so most of the work is to make\n// them darker. 3 levels of thickening/darkening are achieved with 2 dilations,\n// (using a greyscale erosion) one heavy (by being before convolution) and one\n// light (after convolution).\n// With no dilation, after covolution, the images are so light that a heavy\n// constant offset is required to make the 0 image look reasonable. 
A simple\n// constant offset multiple of exposure to undo this value is enough to achieve\n// all the required lighting. This gives the advantage that exposure level 1\n// with a single dilation gives a good impression of the broken-yet-too-dark\n// problem that is often seen in scans.\n// A small random rotation gives some varying greyscale values on the edges,\n// and some random salt and pepper noise on top helps to realistically jaggy-up\n// the edges.\n// Finally a greyscale ramp provides a continuum of effects between exposure\n// levels.\nImage DegradeImage(Image input, int exposure, TRand *randomizer, float *rotation) {\n  Image pix = pixConvertTo8(input, false);\n  input.destroy();\n  input = pix;\n  int width = pixGetWidth(input);\n  int height = pixGetHeight(input);\n\n  if (exposure >= 2) {\n    // An erosion simulates the spreading darkening of a dark copy.\n    // This is backwards to binary morphology,\n    // see http://www.leptonica.com/grayscale-morphology.html\n    pix = input;\n    input = pixErodeGray(pix, 3, 3);\n    pix.destroy();\n  }\n  // A convolution is essential to any mode as no scanner produces an\n  // image as sharp as the electronic image.\n  pix = pixBlockconv(input, 1, 1);\n  input.destroy();\n  // A small random rotation helps to make the edges jaggy in a realistic way.\n  if (rotation != nullptr) {\n    float radians_clockwise = 0.0f;\n    if (*rotation) {\n      radians_clockwise = *rotation;\n    } else if (randomizer != nullptr) {\n      radians_clockwise = randomizer->SignedRand(kRotationRange);\n    }\n\n    input = pixRotate(pix, radians_clockwise, L_ROTATE_AREA_MAP, L_BRING_IN_WHITE, 0, 0);\n    // Rotate the boxes to match.\n    *rotation = radians_clockwise;\n    pix.destroy();\n  } else {\n    input = pix;\n  }\n\n  if (exposure >= 3 || exposure == 1) {\n    // Erosion after the convolution is not as heavy as before, so it is\n    // good for level 1 and in addition as a level 3.\n    // This is backwards to binary 
morphology,\n    // see http://www.leptonica.com/grayscale-morphology.html\n    pix = input;\n    input = pixErodeGray(pix, 3, 3);\n    pix.destroy();\n  }\n  // The convolution really needed to be 2x2 to be realistic enough, but\n  // we only have 3x3, so we have to bias the image darker or lose thin\n  // strokes.\n  int erosion_offset = 0;\n  // For light and 0 exposure, there is no dilation, so compensate for the\n  // convolution with a big darkening bias which is undone for lighter\n  // exposures.\n  if (exposure <= 0) {\n    erosion_offset = -3 * kExposureFactor;\n  }\n  // Add in a general offset of the greyscales for the exposure level so\n  // a threshold of 128 gives a reasonable binary result.\n  erosion_offset -= exposure * kExposureFactor;\n  // Add a gradual fade over the page and a small amount of salt and pepper\n  // noise to simulate noise in the sensor/paper fibres and varying\n  // illumination.\n  l_uint32 *data = pixGetData(input);\n  for (int y = 0; y < height; ++y) {\n    for (int x = 0; x < width; ++x) {\n      int pixel = GET_DATA_BYTE(data, x);\n      if (randomizer != nullptr) {\n        pixel += randomizer->IntRand() % (kSaltnPepper * 2 + 1) - kSaltnPepper;\n      }\n      if (height + width > kMinRampSize) {\n        pixel -= (2 * x + y) * 32 / (height + width);\n      }\n      pixel += erosion_offset;\n      if (pixel < 0) {\n        pixel = 0;\n      }\n      if (pixel > 255) {\n        pixel = 255;\n      }\n      SET_DATA_BYTE(data, x, pixel);\n    }\n    data += pixGetWpl(input);\n  }\n  return input;\n}\n\n// Creates and returns a Pix distorted by various means according to the bool\n// flags. If boxes is not nullptr, the boxes are resized/positioned according to\n// any spatial distortion and also by the integer reduction factor box_scale\n// so they will match what the network will output.\n// Returns nullptr on error. 
The returned Pix must be pixDestroyed.\nImage PrepareDistortedPix(const Image pix, bool perspective, bool invert, bool white_noise,\n                         bool smooth_noise, bool blur, int box_reduction, TRand *randomizer,\n                         std::vector<TBOX> *boxes) {\n  Image distorted = pix.copy();\n  // Things to do to synthetic training data.\n  if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) {\n    // TODO(rays) Cook noise in a more thread-safe manner than rand().\n    // Attempt to make the sequences reproducible.\n    srand(randomizer->IntRand());\n    Image pixn = pixAddGaussianNoise(distorted, 8.0);\n    distorted.destroy();\n    if (smooth_noise) {\n      distorted = pixBlockconv(pixn, 1, 1);\n      pixn.destroy();\n    } else {\n      distorted = pixn;\n    }\n  }\n  if (blur && randomizer->SignedRand(1.0) > 0.0) {\n    Image blurred = pixBlockconv(distorted, 1, 1);\n    distorted.destroy();\n    distorted = blurred;\n  }\n  if (perspective) {\n    GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes);\n  }\n  if (boxes != nullptr) {\n    for (auto &b : *boxes) {\n      b.scale(1.0f / box_reduction);\n      if (b.width() <= 0) {\n        b.set_right(b.left() + 1);\n      }\n    }\n  }\n  if (invert && randomizer->SignedRand(1.0) < -0) {\n    pixInvert(distorted, distorted);\n  }\n  return distorted;\n}\n\n// Distorts anything that has a non-null pointer with the same pseudo-random\n// perspective distortion. Width and height only need to be set if there\n// is no pix. 
If there is a pix, then they will be taken from there.\nvoid GeneratePerspectiveDistortion(int width, int height, TRand *randomizer, Image *pix,\n                                   std::vector<TBOX> *boxes) {\n  if (pix != nullptr && *pix != nullptr) {\n    width = pixGetWidth(*pix);\n    height = pixGetHeight(*pix);\n  }\n  float *im_coeffs = nullptr;\n  float *box_coeffs = nullptr;\n  l_int32 incolor = ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs);\n  if (pix != nullptr && *pix != nullptr) {\n    // Transform the image.\n    Image transformed = pixProjective(*pix, im_coeffs, incolor);\n    if (transformed == nullptr) {\n      tprintf(\"Projective transformation failed!!\\n\");\n      return;\n    }\n    pix->destroy();\n    *pix = transformed;\n  }\n  if (boxes != nullptr) {\n    // Transform the boxes.\n    for (auto &b : *boxes) {\n      int x1, y1, x2, y2;\n      const TBOX &box = b;\n      projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1, &y1);\n      projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(), &x2, &y2);\n      TBOX new_box1(x1, height - y2, x2, height - y1);\n      projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(), &x1, &y1);\n      projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2, &y2);\n      TBOX new_box2(x1, height - y1, x2, height - y2);\n      b = new_box1.bounding_union(new_box2);\n    }\n  }\n  lept_free(im_coeffs);\n  lept_free(box_coeffs);\n}\n\n// Computes the coefficients of a randomized projective transformation.\n// The image transform requires backward transformation coefficient, and the\n// box transform the forward coefficients.\n// Returns the incolor arg to pixProjective.\nint ProjectiveCoeffs(int width, int height, TRand *randomizer, float **im_coeffs,\n                     float **box_coeffs) {\n  // Setup \"from\" points.\n  Pta *src_pts = ptaCreate(4);\n  ptaAddPt(src_pts, 0.0f, 0.0f);\n  ptaAddPt(src_pts, width, 
0.0f);\n  ptaAddPt(src_pts, width, height);\n  ptaAddPt(src_pts, 0.0f, height);\n  // Extract factors from pseudo-random sequence.\n  float factors[FN_NUM_FACTORS];\n  float shear = 0.0f; // Shear is signed.\n  for (int i = 0; i < FN_NUM_FACTORS; ++i) {\n    // Everything is squared to make wild values rarer.\n    if (i == FN_SHEAR) {\n      // Shear is signed.\n      shear = randomizer->SignedRand(0.5 / 3.0);\n      shear = shear >= 0.0 ? shear * shear : -shear * shear;\n      // Keep the sheared points within the original rectangle.\n      if (shear < -factors[FN_X0]) {\n        shear = -factors[FN_X0];\n      }\n      if (shear > factors[FN_X1]) {\n        shear = factors[FN_X1];\n      }\n      factors[i] = shear;\n    } else if (i != FN_INCOLOR) {\n      factors[i] = fabs(randomizer->SignedRand(1.0));\n      if (i <= FN_Y3) {\n        factors[i] *= 5.0 / 8.0;\n      } else {\n        factors[i] *= 0.5;\n      }\n      factors[i] *= factors[i];\n    }\n  }\n  // Setup \"to\" points.\n  Pta *dest_pts = ptaCreate(4);\n  ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height);\n  ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height);\n  ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width, (1 - factors[FN_Y2]) * height);\n  ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width, (1 - factors[FN_Y3]) * height);\n  getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs);\n  getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs);\n  ptaDestroy(&src_pts);\n  ptaDestroy(&dest_pts);\n  return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/degradeimage.h",
    "content": "/**********************************************************************\n * File:        degradeimage.h\n * Description: Function to degrade an image (usually of text) as if it\n *              has been printed and then scanned.\n * Authors:     Ray Smith\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_\n#define TESSERACT_TRAINING_DEGRADEIMAGE_H_\n\n#include <allheaders.h>\n#include \"helpers.h\" // For TRand.\n#include \"rect.h\"\n\nnamespace tesseract {\n\n// Degrade the pix as if by a print/copy/scan cycle with exposure > 0\n// corresponding to darkening on the copier and <0 lighter and 0 not copied.\n// If rotation is not nullptr, the clockwise rotation in radians is saved there.\n// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)\n// The input image is destroyed and a different image returned.\nImage DegradeImage(Image input, int exposure, TRand *randomizer, float *rotation);\n\n// Creates and returns a Pix distorted by various means according to the bool\n// flags. If boxes is not nullptr, the boxes are resized/positioned according to\n// any spatial distortion and also by the integer reduction factor box_scale\n// so they will match what the network will output.\n// Returns nullptr on error. 
The returned Pix must be pixDestroyed.\nImage PrepareDistortedPix(const Image pix, bool perspective, bool invert, bool white_noise,\n                         bool smooth_noise, bool blur, int box_reduction, TRand *randomizer,\n                         std::vector<TBOX> *boxes);\n// Distorts anything that has a non-null pointer with the same pseudo-random\n// perspective distortion. Width and height only need to be set if there\n// is no pix. If there is a pix, then they will be taken from there.\nvoid GeneratePerspectiveDistortion(int width, int height, TRand *randomizer, Image *pix,\n                                   std::vector<TBOX> *boxes);\n// Computes the coefficients of a randomized projective transformation.\n// The image transform requires backward transformation coefficient, and the\n// box transform the forward coefficients.\n// Returns the incolor arg to pixProjective.\nint ProjectiveCoeffs(int width, int height, TRand *randomizer, float **im_coeffs,\n                     float **box_coeffs);\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_\n"
  },
  {
    "path": "src/training/lstmeval.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmeval.cpp\n// Description: Evaluation program for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"commontraining.h\"\n#include \"lstmtester.h\"\n#include \"tprintf.h\"\n\nusing namespace tesseract;\n\nstatic STRING_PARAM_FLAG(model, \"\", \"Name of model file (training or recognition)\");\nstatic STRING_PARAM_FLAG(traineddata, \"\",\n                         \"If model is a training checkpoint, then traineddata must \"\n                         \"be the traineddata file that was given to the trainer\");\nstatic STRING_PARAM_FLAG(eval_listfile, \"\", \"File listing sample files in lstmf training format.\");\nstatic INT_PARAM_FLAG(max_image_MB, 2000, \"Max memory to use for images.\");\nstatic INT_PARAM_FLAG(verbosity, 1, \"Amount of diagnosting information to output (0-2).\");\n\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n  ParseArguments(&argc, &argv);\n  if (FLAGS_model.empty()) {\n    tprintf(\"Must provide a --model!\\n\");\n    return EXIT_FAILURE;\n  }\n  if (FLAGS_eval_listfile.empty()) {\n    tprintf(\"Must provide a --eval_listfile!\\n\");\n    return EXIT_FAILURE;\n  }\n  tesseract::TessdataManager mgr;\n  if (!mgr.Init(FLAGS_model.c_str())) 
{\n    if (FLAGS_traineddata.empty()) {\n      tprintf(\"Must supply --traineddata to eval a training checkpoint!\\n\");\n      return EXIT_FAILURE;\n    }\n    tprintf(\"%s is not a recognition model, trying training checkpoint...\\n\", FLAGS_model.c_str());\n    if (!mgr.Init(FLAGS_traineddata.c_str())) {\n      tprintf(\"Failed to load language model from %s!\\n\", FLAGS_traineddata.c_str());\n      return EXIT_FAILURE;\n    }\n    std::vector<char> model_data;\n    if (!tesseract::LoadDataFromFile(FLAGS_model.c_str(), &model_data)) {\n      tprintf(\"Failed to load model from: %s\\n\", FLAGS_model.c_str());\n      return EXIT_FAILURE;\n    }\n    mgr.OverwriteEntry(tesseract::TESSDATA_LSTM, &model_data[0], model_data.size());\n  }\n  tesseract::LSTMTester tester(static_cast<int64_t>(FLAGS_max_image_MB) * 1048576);\n  if (!tester.LoadAllEvalData(FLAGS_eval_listfile.c_str())) {\n    tprintf(\"Failed to load eval data from: %s\\n\", FLAGS_eval_listfile.c_str());\n    return EXIT_FAILURE;\n  }\n  double errs = 0.0;\n  std::string result = tester.RunEvalSync(0, &errs, mgr,\n                                          /*training_stage (irrelevant)*/ 0, FLAGS_verbosity);\n  tprintf(\"%s\\n\", result.c_str());\n  return EXIT_SUCCESS;\n} /* main */\n"
  },
  {
    "path": "src/training/lstmtraining.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmtraining.cpp\n// Description: Training program for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include <cerrno>\n#include <locale> // for std::locale::classic\n#if defined(__USE_GNU)\n#  include <cfenv> // for feenableexcept\n#endif\n#include \"commontraining.h\"\n#include \"fileio.h\" // for LoadFileLinesToStrings\n#include \"lstmtester.h\"\n#include \"lstmtrainer.h\"\n#include \"params.h\"\n#include \"tprintf.h\"\n#include \"unicharset_training_utils.h\"\n\nusing namespace tesseract;\n\nstatic INT_PARAM_FLAG(debug_interval, 0, \"How often to display the alignment.\");\nstatic STRING_PARAM_FLAG(net_spec, \"\", \"Network specification\");\nstatic INT_PARAM_FLAG(net_mode, 192, \"Controls network behavior.\");\nstatic INT_PARAM_FLAG(perfect_sample_delay, 0, \"How many imperfect samples between perfect ones.\");\nstatic DOUBLE_PARAM_FLAG(target_error_rate, 0.01, \"Final error rate in percent.\");\nstatic DOUBLE_PARAM_FLAG(weight_range, 0.1, \"Range of initial random weights.\");\nstatic DOUBLE_PARAM_FLAG(learning_rate, 10.0e-4, \"Weight factor for new deltas.\");\nstatic BOOL_PARAM_FLAG(reset_learning_rate, false,\n                       \"Resets all stored learning rates to the value specified 
by --learning_rate.\");\nstatic DOUBLE_PARAM_FLAG(momentum, 0.5, \"Decay factor for repeating deltas.\");\nstatic DOUBLE_PARAM_FLAG(adam_beta, 0.999, \"Decay factor for repeating deltas.\");\nstatic INT_PARAM_FLAG(max_image_MB, 6000, \"Max memory to use for images.\");\nstatic STRING_PARAM_FLAG(continue_from, \"\", \"Existing model to extend\");\nstatic STRING_PARAM_FLAG(model_output, \"lstmtrain\", \"Basename for output models\");\nstatic STRING_PARAM_FLAG(train_listfile, \"\",\n                         \"File listing training files in lstmf training format.\");\nstatic STRING_PARAM_FLAG(eval_listfile, \"\", \"File listing eval files in lstmf training format.\");\n#if defined(__USE_GNU)\nstatic BOOL_PARAM_FLAG(debug_float, false, \"Raise error on certain float errors.\");\n#endif\nstatic BOOL_PARAM_FLAG(stop_training, false, \"Just convert the training model to a runtime model.\");\nstatic BOOL_PARAM_FLAG(convert_to_int, false, \"Convert the recognition model to an integer model.\");\nstatic BOOL_PARAM_FLAG(sequential_training, false,\n                       \"Use the training files sequentially instead of round-robin.\");\nstatic INT_PARAM_FLAG(append_index, -1,\n                      \"Index in continue_from Network at which to\"\n                      \" attach the new network defined by net_spec\");\nstatic BOOL_PARAM_FLAG(debug_network, false, \"Get info on distribution of weight values\");\nstatic INT_PARAM_FLAG(max_iterations, 0, \"If set, exit after this many iterations\");\nstatic STRING_PARAM_FLAG(traineddata, \"\", \"Combined Dawgs/Unicharset/Recoder for language model\");\nstatic STRING_PARAM_FLAG(old_traineddata, \"\",\n                         \"When changing the character set, this specifies the old\"\n                         \" character set that is to be replaced\");\nstatic BOOL_PARAM_FLAG(randomly_rotate, false,\n                       \"Train OSD and randomly turn training samples upside-down\");\n\n// Number of training images to train 
between calls to MaintainCheckpoints.\nconst int kNumPagesPerBatch = 100;\n\n// Apart from command-line flags, input is a collection of lstmf files, that\n// were previously created using tesseract with the lstm.train config file.\n// The program iterates over the inputs, feeding the data to the network,\n// until the error rate reaches a specified target or max_iterations is reached.\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n  ParseArguments(&argc, &argv);\n#if defined(__USE_GNU)\n  if (FLAGS_debug_float) {\n    // Raise SIGFPE for unwanted floating point calculations.\n    feenableexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_INVALID);\n  }\n#endif\n  if (FLAGS_model_output.empty()) {\n    tprintf(\"Must provide a --model_output!\\n\");\n    return EXIT_FAILURE;\n  }\n  if (FLAGS_traineddata.empty()) {\n    tprintf(\"Must provide a --traineddata see training documentation\\n\");\n    return EXIT_FAILURE;\n  }\n\n  // Check write permissions.\n  std::string test_file = FLAGS_model_output;\n  test_file += \"_wtest\";\n  FILE *f = fopen(test_file.c_str(), \"wb\");\n  if (f != nullptr) {\n    fclose(f);\n    if (remove(test_file.c_str()) != 0) {\n      tprintf(\"Error, failed to remove %s: %s\\n\", test_file.c_str(), strerror(errno));\n      return EXIT_FAILURE;\n    }\n  } else {\n    tprintf(\"Error, model output cannot be written: %s\\n\", strerror(errno));\n    return EXIT_FAILURE;\n  }\n\n  // Setup the trainer.\n  std::string checkpoint_file = FLAGS_model_output;\n  checkpoint_file += \"_checkpoint\";\n  std::string checkpoint_bak = checkpoint_file + \".bak\";\n  tesseract::LSTMTrainer trainer(FLAGS_model_output, checkpoint_file,\n                                 FLAGS_debug_interval,\n                                 static_cast<int64_t>(FLAGS_max_image_MB) * 1048576);\n  if (!trainer.InitCharSet(FLAGS_traineddata.c_str())) {\n    tprintf(\"Error, failed to read %s\\n\", FLAGS_traineddata.c_str());\n    return EXIT_FAILURE;\n  
}\n\n  // Reading something from an existing model doesn't require many flags,\n  // so do it now and exit.\n  if (FLAGS_stop_training || FLAGS_debug_network) {\n    if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str(), nullptr)) {\n      tprintf(\"Failed to read continue from: %s\\n\", FLAGS_continue_from.c_str());\n      return EXIT_FAILURE;\n    }\n    if (FLAGS_debug_network) {\n      trainer.DebugNetwork();\n    } else {\n      if (FLAGS_convert_to_int) {\n        trainer.ConvertToInt();\n      }\n      if (!trainer.SaveTraineddata(FLAGS_model_output.c_str())) {\n        tprintf(\"Failed to write recognition model : %s\\n\", FLAGS_model_output.c_str());\n      }\n    }\n    return EXIT_SUCCESS;\n  }\n\n  // Get the list of files to process.\n  if (FLAGS_train_listfile.empty()) {\n    tprintf(\"Must supply a list of training filenames! --train_listfile\\n\");\n    return EXIT_FAILURE;\n  }\n  std::vector<std::string> filenames;\n  if (!tesseract::LoadFileLinesToStrings(FLAGS_train_listfile.c_str(), &filenames)) {\n    tprintf(\"Failed to load list of training filenames from %s\\n\", FLAGS_train_listfile.c_str());\n    return EXIT_FAILURE;\n  }\n\n  // Checkpoints always take priority if they are available.\n  if (trainer.TryLoadingCheckpoint(checkpoint_file.c_str(), nullptr) ||\n      trainer.TryLoadingCheckpoint(checkpoint_bak.c_str(), nullptr)) {\n    tprintf(\"Successfully restored trainer from %s\\n\", checkpoint_file.c_str());\n  } else {\n    if (!FLAGS_continue_from.empty()) {\n      // Load a past model file to improve upon.\n      if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str(),\n                                        FLAGS_append_index >= 0 ? 
FLAGS_continue_from.c_str()\n                                                                : FLAGS_old_traineddata.c_str())) {\n        tprintf(\"Failed to continue from: %s\\n\", FLAGS_continue_from.c_str());\n        return EXIT_FAILURE;\n      }\n      tprintf(\"Continuing from %s\\n\", FLAGS_continue_from.c_str());\n      if (FLAGS_reset_learning_rate) {\n        trainer.SetLearningRate(FLAGS_learning_rate);\n        tprintf(\"Set learning rate to %f\\n\", static_cast<float>(FLAGS_learning_rate));\n      }\n      trainer.InitIterations();\n    }\n    if (FLAGS_continue_from.empty() || FLAGS_append_index >= 0) {\n      if (FLAGS_append_index >= 0) {\n        tprintf(\"Appending a new network to an old one!!\");\n        if (FLAGS_continue_from.empty()) {\n          tprintf(\"Must set --continue_from for appending!\\n\");\n          return EXIT_FAILURE;\n        }\n      }\n      // We are initializing from scratch.\n      if (!trainer.InitNetwork(FLAGS_net_spec.c_str(), FLAGS_append_index, FLAGS_net_mode,\n                               FLAGS_weight_range, FLAGS_learning_rate, FLAGS_momentum,\n                               FLAGS_adam_beta)) {\n        tprintf(\"Failed to create network from spec: %s\\n\", FLAGS_net_spec.c_str());\n        return EXIT_FAILURE;\n      }\n      trainer.set_perfect_delay(FLAGS_perfect_sample_delay);\n    }\n  }\n  if (!trainer.LoadAllTrainingData(\n          filenames,\n          FLAGS_sequential_training ? 
tesseract::CS_SEQUENTIAL : tesseract::CS_ROUND_ROBIN,\n          FLAGS_randomly_rotate)) {\n    tprintf(\"Load of images failed!!\\n\");\n    return EXIT_FAILURE;\n  }\n\n  tesseract::LSTMTester tester(static_cast<int64_t>(FLAGS_max_image_MB) * 1048576);\n  tesseract::TestCallback tester_callback = nullptr;\n  if (!FLAGS_eval_listfile.empty()) {\n    using namespace std::placeholders; // for _1, _2, _3...\n    if (!tester.LoadAllEvalData(FLAGS_eval_listfile.c_str())) {\n      tprintf(\"Failed to load eval data from: %s\\n\", FLAGS_eval_listfile.c_str());\n      return EXIT_FAILURE;\n    }\n    tester_callback = std::bind(&tesseract::LSTMTester::RunEvalAsync, &tester, _1, _2, _3, _4);\n  }\n\n  int max_iterations = FLAGS_max_iterations;\n  if (max_iterations < 0) {\n    // A negative value is interpreted as epochs\n    max_iterations = filenames.size() * (-max_iterations);\n  } else if (max_iterations == 0) {\n    // \"Infinite\" iterations.\n    max_iterations = INT_MAX;\n  }\n\n  do {\n    // Train a few.\n    int iteration = trainer.training_iteration();\n    for (int target_iteration = iteration + kNumPagesPerBatch;\n         iteration < target_iteration && iteration < max_iterations;\n         iteration = trainer.training_iteration()) {\n      trainer.TrainOnLine(&trainer, false);\n    }\n    std::stringstream log_str;\n    log_str.imbue(std::locale::classic());\n    trainer.MaintainCheckpoints(tester_callback, log_str);\n    tprintf(\"%s\\n\", log_str.str().c_str());\n  } while (trainer.best_error_rate() > FLAGS_target_error_rate &&\n           (trainer.training_iteration() < max_iterations));\n  tprintf(\"Finished! Selected model with minimal training error rate (BCER) = %g\\n\",\n          trainer.best_error_rate());\n  return EXIT_SUCCESS;\n} /* main */\n"
  },
  {
    "path": "src/training/merge_unicharsets.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        merge_unicharsets.cpp\n// Description: Simple tool to merge two or more unicharsets.\n// Author:      Ray Smith\n//\n// (C) Copyright 2015, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"unicharset.h\"\n\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n\n  if (argc > 1 && (!strcmp(argv[1], \"-v\") || !strcmp(argv[1], \"--version\"))) {\n    printf(\"%s\\n\", tesseract::TessBaseAPI::Version());\n    return EXIT_SUCCESS;\n  } else if (argc < 4) {\n    // Print usage\n    printf(\n        \"Usage: %s -v | --version |\\n\"\n        \"       %s unicharset-in-1 ... 
unicharset-in-n unicharset-out\\n\",\n        argv[0], argv[0]);\n    return EXIT_FAILURE;\n  }\n\n  tesseract::UNICHARSET input_unicharset, result_unicharset;\n  for (int arg = 1; arg < argc - 1; ++arg) {\n    // Load the input unicharset\n    if (input_unicharset.load_from_file(argv[arg])) {\n      printf(\"Loaded unicharset of size %zu from file %s\\n\", input_unicharset.size(), argv[arg]);\n      result_unicharset.AppendOtherUnicharset(input_unicharset);\n    } else {\n      printf(\"Failed to load unicharset from file %s!!\\n\", argv[arg]);\n      return EXIT_FAILURE;\n    }\n  }\n\n  // Save the combined unicharset.\n  if (result_unicharset.save_to_file(argv[argc - 1])) {\n    printf(\"Wrote unicharset file %s.\\n\", argv[argc - 1]);\n  } else {\n    printf(\"Cannot save unicharset file %s.\\n\", argv[argc - 1]);\n    return EXIT_FAILURE;\n  }\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "src/training/mergenf.cpp",
    "content": "/******************************************************************************\n**  Filename:    MergeNF.c\n**  Purpose:     Program for merging similar nano-feature protos\n**  Author:      Dan Johnson\n**\n** (c) Copyright Hewlett-Packard Company, 1988.\n** Licensed under the Apache License, Version 2.0 (the \"License\");\n** you may not use this file except in compliance with the License.\n** You may obtain a copy of the License at\n** http://www.apache.org/licenses/LICENSE-2.0\n** Unless required by applicable law or agreed to in writing, software\n** distributed under the License is distributed on an \"AS IS\" BASIS,\n** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n** See the License for the specific language governing permissions and\n** limitations under the License.\n******************************************************************************/\n\n#define _USE_MATH_DEFINES // for M_PI\n#include <algorithm>\n#include <cfloat> // for FLT_MAX\n#include <cmath>  // for M_PI\n#include <cstdio>\n#include <cstring>\n\n#include \"cluster.h\"\n#include \"clusttool.h\"\n#include \"featdefs.h\"\n#include \"intproto.h\"\n#include \"mergenf.h\"\n#include \"ocrfeatures.h\"\n#include \"oldlist.h\"\n#include \"params.h\"\n#include \"protos.h\"\n\nusing namespace tesseract;\n\n/*-------------------once in subfeat---------------------------------*/\nstatic double_VAR(training_angle_match_scale, 1.0, \"Angle Match Scale ...\");\n\nstatic double_VAR(training_similarity_midpoint, 0.0075, \"Similarity Midpoint ...\");\n\nstatic double_VAR(training_similarity_curl, 2.0, \"Similarity Curl ...\");\n\n/*-----------------------------once in\n * fasttrain----------------------------------*/\nstatic double_VAR(training_tangent_bbox_pad, 0.5, \"Tangent bounding box pad ...\");\n\nstatic double_VAR(training_orthogonal_bbox_pad, 2.5, \"Orthogonal bounding box pad ...\");\n\nstatic double_VAR(training_angle_pad, 45.0, \"Angle pad 
...\");\n\n/**\n * Compare protos p1 and p2 and return an estimate of the\n * worst evidence rating that will result for any part of p1\n * that is compared to p2.  In other words, if p1 were broken\n * into pico-features and each pico-feature was matched to p2,\n * what is the worst evidence rating that will be achieved for\n * any pico-feature.\n *\n * @param p1, p2    protos to be compared\n *\n * Globals: none\n *\n * @return Worst possible result when matching p1 to p2.\n */\nfloat CompareProtos(PROTO_STRUCT *p1, PROTO_STRUCT *p2) {\n  float WorstEvidence = WORST_EVIDENCE;\n  float Evidence;\n  float Angle, Length;\n\n  /* if p1 and p2 are not close in length, don't let them match */\n  Length = std::fabs(p1->Length - p2->Length);\n  if (Length > MAX_LENGTH_MISMATCH) {\n    return (0.0);\n  }\n\n  /* create a dummy pico-feature to be used for comparisons */\n  auto Feature = new FEATURE_STRUCT(&PicoFeatDesc);\n  Feature->Params[PicoFeatDir] = p1->Angle;\n\n  /* convert angle to radians */\n  Angle = p1->Angle * 2.0 * M_PI;\n\n  /* find distance from center of p1 to 1/2 picofeat from end */\n  Length = p1->Length / 2.0 - GetPicoFeatureLength() / 2.0;\n  if (Length < 0) {\n    Length = 0;\n  }\n\n  /* set the dummy pico-feature at one end of p1 and match it to p2 */\n  Feature->Params[PicoFeatX] = p1->X + std::cos(Angle) * Length;\n  Feature->Params[PicoFeatY] = p1->Y + std::sin(Angle) * Length;\n  if (DummyFastMatch(Feature, p2)) {\n    Evidence = SubfeatureEvidence(Feature, p2);\n    if (Evidence < WorstEvidence) {\n      WorstEvidence = Evidence;\n    }\n  } else {\n    delete Feature;\n    return 0.0;\n  }\n\n  /* set the dummy pico-feature at the other end of p1 and match it to p2 */\n  Feature->Params[PicoFeatX] = p1->X - std::cos(Angle) * Length;\n  Feature->Params[PicoFeatY] = p1->Y - std::sin(Angle) * Length;\n  if (DummyFastMatch(Feature, p2)) {\n    Evidence = SubfeatureEvidence(Feature, p2);\n    if (Evidence < WorstEvidence) {\n      WorstEvidence = 
Evidence;\n    }\n  } else {\n    delete Feature;\n    return 0.0;\n  }\n\n  delete Feature;\n  return (WorstEvidence);\n\n} /* CompareProtos */\n\n/**\n * This routine computes a proto which is the weighted\n * average of protos p1 and p2.  The new proto is returned\n * in MergedProto.\n *\n * @param p1, p2    protos to be merged\n * @param w1, w2    weight of each proto\n * @param MergedProto place to put resulting merged proto\n */\nvoid ComputeMergedProto(PROTO_STRUCT *p1, PROTO_STRUCT *p2, float w1, float w2, PROTO_STRUCT *MergedProto) {\n  float TotalWeight;\n\n  TotalWeight = w1 + w2;\n  w1 /= TotalWeight;\n  w2 /= TotalWeight;\n\n  MergedProto->X = p1->X * w1 + p2->X * w2;\n  MergedProto->Y = p1->Y * w1 + p2->Y * w2;\n  MergedProto->Length = p1->Length * w1 + p2->Length * w2;\n  MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;\n  FillABC(MergedProto);\n} /* ComputeMergedProto */\n\n/**\n * This routine searches through all of the prototypes in\n * Class and returns the id of the proto which would provide\n * the best approximation of Prototype.  
If no close\n * approximation can be found, NO_PROTO is returned.\n *\n * @param Class   class to search for matching old proto in\n * @param NumMerged # of protos merged into each proto of Class\n * @param  Prototype new proto to find match for\n *\n * Globals: none\n *\n * @return Id of closest proto in Class or NO_PROTO.\n */\nint FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype) {\n  PROTO_STRUCT NewProto;\n  PROTO_STRUCT MergedProto;\n  int Pid;\n  PROTO_STRUCT *Proto;\n  int BestProto;\n  float BestMatch;\n  float Match, OldMatch, NewMatch;\n\n  MakeNewFromOld(&NewProto, Prototype);\n\n  BestProto = NO_PROTO;\n  BestMatch = WORST_MATCH_ALLOWED;\n  for (Pid = 0; Pid < Class->NumProtos; Pid++) {\n    Proto = ProtoIn(Class, Pid);\n    ComputeMergedProto(Proto, &NewProto, static_cast<float>(NumMerged[Pid]), 1.0, &MergedProto);\n    OldMatch = CompareProtos(Proto, &MergedProto);\n    NewMatch = CompareProtos(&NewProto, &MergedProto);\n    Match = std::min(OldMatch, NewMatch);\n    if (Match > BestMatch) {\n      BestProto = Pid;\n      BestMatch = Match;\n    }\n  }\n  return BestProto;\n} /* FindClosestExistingProto */\n\n/**\n * This fills in the fields of the New proto based on the\n * fields of the Old proto.\n *\n * @param New new proto to be filled in\n * @param Old old proto to be converted\n *\n *  Globals: none\n */\nvoid MakeNewFromOld(PROTO_STRUCT *New, PROTOTYPE *Old) {\n  New->X = CenterX(Old->Mean);\n  New->Y = CenterY(Old->Mean);\n  New->Length = LengthOf(Old->Mean);\n  New->Angle = OrientationOf(Old->Mean);\n  FillABC(New);\n} /* MakeNewFromOld */\n\n/*-------------------once in subfeat---------------------------------*/\n\n/**\n * @name SubfeatureEvidence\n *\n * Compare a feature to a prototype. 
Print the result.\n */\nfloat SubfeatureEvidence(FEATURE Feature, PROTO_STRUCT *Proto) {\n  float Distance;\n  float Dangle;\n\n  Dangle = Proto->Angle - Feature->Params[PicoFeatDir];\n  if (Dangle < -0.5) {\n    Dangle += 1.0;\n  }\n  if (Dangle > 0.5) {\n    Dangle -= 1.0;\n  }\n  Dangle *= training_angle_match_scale;\n\n  Distance =\n      Proto->A * Feature->Params[PicoFeatX] + Proto->B * Feature->Params[PicoFeatY] + Proto->C;\n\n  return (EvidenceOf(Distance * Distance + Dangle * Dangle));\n}\n\n/**\n * @name EvidenceOf\n *\n * Return the new type of evidence number corresponding to this\n * distance value.  This number is no longer based on the chi squared\n * approximation.  The equation that represents the transform is:\n *       1 / (1 + (sim / midpoint) ^ curl)\n */\ndouble EvidenceOf(double Similarity) {\n  Similarity /= training_similarity_midpoint;\n\n  if (training_similarity_curl == 3) {\n    Similarity = Similarity * Similarity * Similarity;\n  } else if (training_similarity_curl == 2) {\n    Similarity = Similarity * Similarity;\n  } else {\n    Similarity = pow(Similarity, training_similarity_curl);\n  }\n\n  return (1.0 / (1.0 + Similarity));\n}\n\n/**\n * This routine returns true if Feature would be matched\n * by a fast match table built from Proto.\n *\n * @param Feature   feature to be \"fast matched\" to proto\n * @param Proto   proto being \"fast matched\" against\n *\n * Globals:\n * - training_tangent_bbox_pad    bounding box pad tangent to proto\n * - training_orthogonal_bbox_pad bounding box pad orthogonal to proto\n *\n * @return true if feature could match Proto.\n */\nbool DummyFastMatch(FEATURE Feature, PROTO_STRUCT *Proto) {\n  FRECT BoundingBox;\n  float MaxAngleError;\n  float AngleError;\n\n  MaxAngleError = training_angle_pad / 360.0;\n  AngleError = std::fabs(Proto->Angle - Feature->Params[PicoFeatDir]);\n  if (AngleError > 0.5) {\n    AngleError = 1.0 - AngleError;\n  }\n\n  if (AngleError > MaxAngleError) {\n    return 
false;\n  }\n\n  ComputePaddedBoundingBox(Proto, training_tangent_bbox_pad * GetPicoFeatureLength(),\n                           training_orthogonal_bbox_pad * GetPicoFeatureLength(), &BoundingBox);\n\n  return PointInside(&BoundingBox, Feature->Params[PicoFeatX], Feature->Params[PicoFeatY]);\n} /* DummyFastMatch */\n\n/**\n * This routine computes a bounding box that encloses the\n * specified proto along with some padding.  The\n * amount of padding is specified as separate distances\n * in the tangential and orthogonal directions.\n *\n * @param Proto   proto to compute bounding box for\n * @param TangentPad  amount of pad to add in direction of segment\n * @param OrthogonalPad amount of pad to add orthogonal to segment\n * @param[out] BoundingBox place to put results\n */\nvoid ComputePaddedBoundingBox(PROTO_STRUCT *Proto, float TangentPad, float OrthogonalPad,\n                              FRECT *BoundingBox) {\n  float Length = Proto->Length / 2.0 + TangentPad;\n  float Angle = Proto->Angle * 2.0 * M_PI;\n  float CosOfAngle = fabs(std::cos(Angle));\n  float SinOfAngle = fabs(std::sin(Angle));\n\n  float Pad = std::max(CosOfAngle * Length, SinOfAngle * OrthogonalPad);\n  BoundingBox->MinX = Proto->X - Pad;\n  BoundingBox->MaxX = Proto->X + Pad;\n\n  Pad = std::max(SinOfAngle * Length, CosOfAngle * OrthogonalPad);\n  BoundingBox->MinY = Proto->Y - Pad;\n  BoundingBox->MaxY = Proto->Y + Pad;\n\n} /* ComputePaddedBoundingBox */\n\n/**\n * Return true if point (X,Y) is inside of Rectangle.\n *\n * Globals: none\n *\n * @return true if point (X,Y) is inside of Rectangle.\n */\nbool PointInside(FRECT *Rectangle, float X, float Y) {\n  return (X >= Rectangle->MinX) && (X <= Rectangle->MaxX) && (Y >= Rectangle->MinY) &&\n         (Y <= Rectangle->MaxY);\n} /* PointInside */\n"
  },
  {
    "path": "src/training/mergenf.h",
    "content": "/******************************************************************************\n ** Filename:   MergeNF.c\n ** Purpose:    Program for merging similar nano-feature protos\n ** Author:     Dan Johnson\n **\n ** (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *****************************************************************************/\n\n#ifndef TESSERACT_TRAINING_MERGENF_H_\n#define TESSERACT_TRAINING_MERGENF_H_\n\n/**----------------------------------------------------------------------------\n     Include Files and Type Defines\n----------------------------------------------------------------------------**/\n#include \"cluster.h\"\n#include \"ocrfeatures.h\"\n#include \"picofeat.h\"\n#include \"protos.h\"\n\n#define WORST_MATCH_ALLOWED (0.9)\n#define WORST_EVIDENCE (1.0)\n#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength())\n\n#define PROTO_SUFFIX \".mf.p\"\n#define CONFIG_SUFFIX \".cl\"\n#define NO_PROTO (-1)\n#define XPOSITION 0\n#define YPOSITION 1\n#define MFLENGTH 2\n#define ORIENTATION 3\n\nstruct FRECT {\n  float MinX, MaxX, MinY, MaxY;\n};\n\n/**----------------------------------------------------------------------------\n      Public Macros\n----------------------------------------------------------------------------**/\n#define CenterX(M) ((M)[XPOSITION])\n#define CenterY(M) ((M)[YPOSITION])\n#define LengthOf(M) ((M)[MFLENGTH])\n#define OrientationOf(M) 
((M)[ORIENTATION])\n\n/**----------------------------------------------------------------------------\n     Public Function Prototypes\n----------------------------------------------------------------------------**/\nfloat CompareProtos(tesseract::PROTO_STRUCT *p1, tesseract::PROTO_STRUCT *p2);\n\nvoid ComputeMergedProto(tesseract::PROTO_STRUCT *p1, tesseract::PROTO_STRUCT *p2, float w1, float w2,\n                        tesseract::PROTO_STRUCT *MergedProto);\n\nint FindClosestExistingProto(tesseract::CLASS_TYPE Class, int NumMerged[],\n                             tesseract::PROTOTYPE *Prototype);\n\nvoid MakeNewFromOld(tesseract::PROTO_STRUCT *New, tesseract::PROTOTYPE *Old);\n\nfloat SubfeatureEvidence(tesseract::FEATURE Feature, tesseract::PROTO_STRUCT *Proto);\n\ndouble EvidenceOf(double Similarity);\n\nbool DummyFastMatch(tesseract::FEATURE Feature, tesseract::PROTO_STRUCT *Proto);\n\nvoid ComputePaddedBoundingBox(tesseract::PROTO_STRUCT *Proto, float TangentPad, float OrthogonalPad,\n                              FRECT *BoundingBox);\n\nbool PointInside(FRECT *Rectangle, float X, float Y);\n\n#endif // TESSERACT_TRAINING_MERGENF_H_\n"
  },
  {
    "path": "src/training/mftraining.cpp",
    "content": "/******************************************************************************\n ** Filename:   mftraining.c\n ** Purpose:    Separates training pages into files for each character.\n **             Strips from files only the features and there parameters of\n **             the feature type mf.\n ** Author:     Dan Johnson\n ** Revisment:  Christy Russon\n **\n **  (c) Copyright Hewlett-Packard Company, 1988.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n ******************************************************************************/\n/*----------------------------------------------------------------------------\n          Include Files and Type Defines\n----------------------------------------------------------------------------*/\n\n#define _USE_MATH_DEFINES // for M_PI\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <cmath> // for M_PI\n#include <cstdio>\n#include <cstring>\n\n#include \"classify.h\"\n#include \"cluster.h\"\n#include \"clusttool.h\"\n#include \"commontraining.h\"\n#include \"featdefs.h\"\n#include \"fontinfo.h\"\n#include \"indexmapbidi.h\"\n#include \"intproto.h\"\n#include \"mastertrainer.h\"\n#include \"mergenf.h\"\n#include \"mf.h\"\n#include \"ocrfeatures.h\"\n#include \"oldlist.h\"\n#include \"protos.h\"\n#include \"shapetable.h\"\n#include \"tprintf.h\"\n#include \"unicity_table.h\"\n\nusing namespace 
tesseract;\n\n/*----------------------------------------------------------------------------\n            Public Code\n-----------------------------------------------------------------------------*/\n#ifndef GRAPHICS_DISABLED\nstatic void DisplayProtoList(const char *ch, LIST protolist) {\n  auto window = std::make_unique<ScrollView>(\"Char samples\", 50, 200, 520, 520, 260, 260, true);\n  LIST proto = protolist;\n  iterate(proto) {\n    auto *prototype = reinterpret_cast<PROTOTYPE *>(proto->first_node());\n    if (prototype->Significant) {\n      window->Pen(ScrollView::GREEN);\n    } else if (prototype->NumSamples == 0) {\n      window->Pen(ScrollView::BLUE);\n    } else if (prototype->Merged) {\n      window->Pen(ScrollView::MAGENTA);\n    } else {\n      window->Pen(ScrollView::RED);\n    }\n    float x = CenterX(prototype->Mean);\n    float y = CenterY(prototype->Mean);\n    double angle = OrientationOf(prototype->Mean) * 2 * M_PI;\n    auto dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);\n    auto dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);\n    window->SetCursor((x - dx) * 256, (y - dy) * 256);\n    window->DrawTo((x + dx) * 256, (y + dy) * 256);\n    auto prototypeNumSamples = prototype->NumSamples;\n    if (prototype->Significant) {\n      tprintf(\"Green proto at (%g,%g)+(%g,%g) %d samples\\n\", x, y, dx, dy, prototypeNumSamples);\n    } else if (prototype->NumSamples > 0 && !prototype->Merged) {\n      tprintf(\"Red proto at (%g,%g)+(%g,%g) %d samples\\n\", x, y, dx, dy, prototypeNumSamples);\n    }\n  }\n  window->Update();\n}\n#endif // !GRAPHICS_DISABLED\n\n// Helper to run clustering on a single config.\n// Mostly copied from the old mftraining, but with renamed variables.\nstatic LIST ClusterOneConfig(int shape_id, const char *class_label, LIST mf_classes,\n                             const ShapeTable &shape_table, MasterTrainer *trainer) {\n  int num_samples;\n  CLUSTERER *clusterer =\n      
trainer->SetupForClustering(shape_table, feature_defs, shape_id, &num_samples);\n  Config.MagicSamples = num_samples;\n  LIST proto_list = ClusterSamples(clusterer, &Config);\n  CleanUpUnusedData(proto_list);\n\n  // Merge protos where reasonable to make more of them significant by\n  // representing almost all samples of the class/font.\n  MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);\n#ifndef GRAPHICS_DISABLED\n  if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) {\n    DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);\n  }\n#endif // !GRAPHICS_DISABLED\n  // Delete the protos that will not be used in the inttemp output file.\n  proto_list = RemoveInsignificantProtos(proto_list, true, false, clusterer->SampleSize);\n  FreeClusterer(clusterer);\n  MERGE_CLASS merge_class = FindClass(mf_classes, class_label);\n  if (merge_class == nullptr) {\n    merge_class = new MERGE_CLASS_NODE(class_label);\n    mf_classes = push(mf_classes, merge_class);\n  }\n  int config_id = AddConfigToClass(merge_class->Class);\n  merge_class->Class->font_set.push_back(shape_id);\n  LIST proto_it = proto_list;\n  iterate(proto_it) {\n    auto *prototype = reinterpret_cast<PROTOTYPE *>(proto_it->first_node());\n    // See if proto can be approximated by existing proto.\n    int p_id = FindClosestExistingProto(merge_class->Class, merge_class->NumMerged, prototype);\n    if (p_id == NO_PROTO) {\n      // Need to make a new proto, as it doesn't match anything.\n      p_id = AddProtoToClass(merge_class->Class);\n      MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);\n      merge_class->NumMerged[p_id] = 1;\n    } else {\n      PROTO_STRUCT dummy_proto;\n      MakeNewFromOld(&dummy_proto, prototype);\n      // Merge with the similar proto.\n      ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,\n                         static_cast<float>(merge_class->NumMerged[p_id]), 1.0,\n                         ProtoIn(merge_class->Class, 
p_id));\n      merge_class->NumMerged[p_id]++;\n    }\n    AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);\n  }\n  FreeProtoList(&proto_list);\n  return mf_classes;\n}\n\n// Helper to setup the config map.\n// Setup an index mapping from the shapes in the shape table to the classes\n// that will be trained. In keeping with the original design, each shape\n// with the same list of unichars becomes a different class and the configs\n// represent the different combinations of fonts.\nstatic void SetupConfigMap(ShapeTable *shape_table, IndexMapBiDi *config_map) {\n  int num_configs = shape_table->NumShapes();\n  config_map->Init(num_configs, true);\n  config_map->Setup();\n  for (int c1 = 0; c1 < num_configs; ++c1) {\n    // Only process ids that are not already merged.\n    if (config_map->SparseToCompact(c1) == c1) {\n      Shape *shape1 = shape_table->MutableShape(c1);\n      // Find all the subsequent shapes that are equal.\n      for (int c2 = c1 + 1; c2 < num_configs; ++c2) {\n        if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {\n          config_map->Merge(c1, c2);\n        }\n      }\n    }\n  }\n  config_map->CompleteMerges();\n}\n\n/**\n * This program reads in a text file consisting of feature\n * samples from a training page in the following format:\n * @verbatim\n      FontName UTF8-char-str xmin ymin xmax ymax page-number\n       NumberOfFeatureTypes(N)\n         FeatureTypeName1 NumberOfFeatures(M)\n            Feature1\n            ...\n            FeatureM\n         FeatureTypeName2 NumberOfFeatures(M)\n            Feature1\n            ...\n            FeatureM\n         ...\n         FeatureTypeNameN NumberOfFeatures(M)\n            Feature1\n            ...\n            FeatureM\n      FontName CharName ...\n    @endverbatim\n * The result of this program is a binary inttemp file used by\n * the OCR engine.\n * @param  argc  number of command line arguments\n * @param  argv  array of command line arguments\n * 
@return 0 if no error occurred\n */\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n\n  ParseArguments(&argc, &argv);\n\n  ShapeTable *shape_table = nullptr;\n  std::string file_prefix;\n  // Load the training data.\n  auto trainer = tesseract::LoadTrainingData(argv + 1, false, &shape_table, file_prefix);\n  if (trainer == nullptr) {\n    return EXIT_FAILURE; // Failed.\n  }\n\n  // Setup an index mapping from the shapes in the shape table to the classes\n  // that will be trained. In keeping with the original design, each shape\n  // with the same list of unichars becomes a different class and the configs\n  // represent the different combinations of fonts.\n  IndexMapBiDi config_map;\n  SetupConfigMap(shape_table, &config_map);\n\n  WriteShapeTable(file_prefix, *shape_table);\n  // If the shape_table is flat, then either we didn't run shape clustering, or\n  // it did nothing, so we just output the trainer's unicharset.\n  // Otherwise shape_set will hold a fake unicharset with an entry for each\n  // shape in the shape table, and we will output that instead.\n  UNICHARSET shape_set;\n  const UNICHARSET *unicharset = &trainer->unicharset();\n  // If we ran shapeclustering (and it worked) then at least one shape will\n  // have multiple unichars, so we have to build a fake unicharset.\n  if (shape_table->AnyMultipleUnichars()) {\n    unicharset = &shape_set;\n    // Now build a fake unicharset for the compact shape space to keep the\n    // output modules happy that we are doing things correctly.\n    int num_shapes = config_map.CompactSize();\n    for (int s = 0; s < num_shapes; ++s) {\n      char shape_label[14];\n      snprintf(shape_label, sizeof(shape_label), \"sh%04d\", s);\n      shape_set.unichar_insert(shape_label);\n    }\n  }\n\n  // Now train each config separately.\n  int num_configs = shape_table->NumShapes();\n  LIST mf_classes = NIL_LIST;\n  for (int s = 0; s < num_configs; ++s) {\n    int unichar_id, font_id;\n    if 
(unicharset == &shape_set) {\n      // Using fake unichar_ids from the config_map/shape_set.\n      unichar_id = config_map.SparseToCompact(s);\n    } else {\n      // Get the real unichar_id from the shape table/unicharset.\n      shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);\n    }\n    const char *class_label = unicharset->id_to_unichar(unichar_id);\n    mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer.get());\n  }\n  std::string inttemp_file = file_prefix;\n  inttemp_file += \"inttemp\";\n  std::string pffmtable_file = std::move(file_prefix);\n  pffmtable_file += \"pffmtable\";\n  CLASS_STRUCT *float_classes = SetUpForFloat2Int(*unicharset, mf_classes);\n  // Now write the inttemp and pffmtable.\n  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes,\n                                    inttemp_file.c_str(), pffmtable_file.c_str());\n  for (size_t c = 0; c < unicharset->size(); ++c) {\n    FreeClassFields(&float_classes[c]);\n  }\n  delete[] float_classes;\n  FreeLabeledClassList(mf_classes);\n  delete shape_table;\n  printf(\"Done!\\n\");\n  if (!FLAGS_test_ch.empty()) {\n    // If we are displaying debug window(s), wait for the user to look at them.\n    printf(\"Hit return to exit...\\n\");\n    while (getchar() != '\\n') {\n      ;\n    }\n  }\n  return EXIT_SUCCESS;\n} /* main */\n"
  },
  {
    "path": "src/training/pango/boxchar.cpp",
    "content": "/**********************************************************************\n * File:        boxchar.cpp\n * Description: Simple class to associate a Tesseract classification unit with\n *              its bounding box so that the boxes can be rotated as the image\n *              is rotated for degradation.  Also includes routines to output\n *              the character-tagged boxes to a boxfile.\n * Author:      Ray Smith\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#include \"boxchar.h\"\n\n#include \"fileio.h\"\n#include \"normstrngs.h\"\n#include \"tesserrstream.h\"  // for tesserr\n#include \"tprintf.h\"\n#include \"unicharset.h\"\n#include \"unicode/uchar.h\" // from libicu\n\n#include <algorithm>\n#include <cstddef>\n#include <vector>\n\n// Absolute Ratio of dx:dy or dy:dx to be a newline.\nconst int kMinNewlineRatio = 5;\n\nnamespace tesseract {\n\nBoxChar::BoxChar(const char *utf8_str, int len)\n    : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}\n\nBoxChar::~BoxChar() {\n  boxDestroy(&box_);\n}\n\nvoid BoxChar::AddBox(int x, int y, int width, int height) {\n  box_ = boxCreate(x, y, width, height);\n}\n\n// Increments *num_rtl and *num_ltr according to the directionality of\n// characters in the box.\nvoid BoxChar::GetDirection(int *num_rtl, int *num_ltr) const {\n  // Convert the unichar to UTF32 
representation\n  std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str());\n  if (uni_vector.empty()) {\n    tprintf(\"Illegal utf8 in boxchar string:%s = \", ch_.c_str());\n    for (char c : ch_) {\n      tprintf(\" 0x%x\", c);\n    }\n    tprintf(\"\\n\");\n    return;\n  }\n  for (char32 ch : uni_vector) {\n    UCharDirection dir = u_charDirection(ch);\n    if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || dir == U_RIGHT_TO_LEFT_ISOLATE) {\n      ++*num_rtl;\n    } else if ((dir == U_ARABIC_NUMBER) ||\n               (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) {\n      ++*num_ltr;\n    }\n  }\n}\n\n// Reverses the order of unicodes within the box. If Pango generates a\n// ligature, these will get reversed on output, so reverse now.\nvoid BoxChar::ReverseUnicodesInBox() {\n  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str());\n  std::reverse(unicodes.begin(), unicodes.end());\n  ch_ = UNICHAR::UTF32ToUTF8(unicodes);\n}\n\n/* static */\nvoid BoxChar::TranslateBoxes(int xshift, int yshift, std::vector<BoxChar *> *boxes) {\n  for (auto &boxe : *boxes) {\n    Box *box = boxe->box_;\n    if (box != nullptr) {\n      box->x += xshift;\n      box->y += yshift;\n    }\n  }\n}\n\n// Prepares for writing the boxes to a file by inserting newlines, spaces,\n// and re-ordering so the boxes are strictly left-to-right.\n/* static */\nvoid BoxChar::PrepareToWrite(std::vector<BoxChar *> *boxes) {\n  bool rtl_rules = ContainsMostlyRTL(*boxes);\n  bool vertical_rules = MostlyVertical(*boxes);\n  InsertNewlines(rtl_rules, vertical_rules, boxes);\n  InsertSpaces(rtl_rules, vertical_rules, boxes);\n  for (size_t i = 0; i < boxes->size(); ++i) {\n    if ((*boxes)[i]->box_ == nullptr) {\n      tesserr << \"Null box at index \" << i << '\\n';\n    }\n  }\n  if (rtl_rules) {\n    ReorderRTLText(boxes);\n  }\n}\n\n// Inserts newline (tab) characters into the vector at newline positions.\n/* static */\nvoid BoxChar::InsertNewlines(bool 
rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) {\n  size_t prev_i = SIZE_MAX;\n  int max_shift = 0;\n  for (size_t i = 0; i < boxes->size(); ++i) {\n    Box *box = (*boxes)[i]->box_;\n    if (box == nullptr) {\n      if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) {\n        // Erase null boxes at the start of a line and after another null box.\n        do {\n          delete (*boxes)[i];\n          boxes->erase(boxes->begin() + i);\n          if (i == 0) {\n            break;\n          }\n        } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr);\n      }\n      continue;\n    }\n    if (prev_i != SIZE_MAX) {\n      Box *prev_box = (*boxes)[prev_i]->box_;\n      int shift = box->x - prev_box->x;\n      if (vertical_rules) {\n        shift = box->y - prev_box->y;\n      } else if (rtl_rules) {\n        shift = -shift;\n      }\n      if (-shift > max_shift) {\n        // This is a newline. Since nothing cares about the size of the box,\n        // except the out-of-bounds checker, minimize the chance of creating\n        // a box outside the image by making the width and height 1.\n        int width = 1;\n        int height = 1;\n        int x = prev_box->x + prev_box->w;\n        int y = prev_box->y;\n        if (vertical_rules) {\n          x = prev_box->x;\n          y = prev_box->y + prev_box->h;\n        } else if (rtl_rules) {\n          x = prev_box->x - width;\n          if (x < 0) {\n            tprintf(\"prev x = %d, width=%d\\n\", prev_box->x, width);\n            x = 0;\n          }\n        }\n        if (prev_i + 1 == i) {\n          // New character needed.\n          auto *new_box = new BoxChar(\"\\t\", 1);\n          new_box->AddBox(x, y, width, height);\n          new_box->page_ = (*boxes)[i]->page_;\n          boxes->insert(boxes->begin() + i, new_box);\n          ++i;\n        } else {\n          (*boxes)[i - 1]->AddBox(x, y, width, height);\n          (*boxes)[i - 1]->ch_ = \"\\t\";\n        }\n 
       max_shift = 0;\n      } else if (shift > max_shift) {\n        max_shift = shift;\n      }\n    }\n    prev_i = i;\n  }\n}\n\n// Converts nullptr boxes to space characters, with appropriate bounding boxes.\n/* static */\nvoid BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) {\n  // After InsertNewlines, any remaining null boxes are not newlines, and are\n  // singletons, so add a box to each remaining null box.\n  for (size_t i = 1; i + 1 < boxes->size(); ++i) {\n    Box *box = (*boxes)[i]->box_;\n    if (box == nullptr) {\n      Box *prev = (*boxes)[i - 1]->box_;\n      Box *next = (*boxes)[i + 1]->box_;\n      ASSERT_HOST(prev != nullptr && next != nullptr);\n      int top = std::min(prev->y, next->y);\n      int bottom = std::max(prev->y + prev->h, next->y + next->h);\n      int left = prev->x + prev->w;\n      int right = next->x;\n      if (vertical_rules) {\n        top = prev->y + prev->h;\n        bottom = next->y;\n        left = std::min(prev->x, next->x);\n        right = std::max(prev->x + prev->w, next->x + next->w);\n      } else if (rtl_rules) {\n        // With RTL we have to account for BiDi.\n        // Right becomes the min left of all prior boxes back to the first\n        // space or newline.\n        right = prev->x;\n        left = next->x + next->w;\n        for (int j = i - 2; j >= 0 && (*boxes)[j]->ch_ != \" \" && (*boxes)[j]->ch_ != \"\\t\"; --j) {\n          prev = (*boxes)[j]->box_;\n          ASSERT_HOST(prev != nullptr);\n          if (prev->x < right) {\n            right = prev->x;\n          }\n        }\n        // Left becomes the max right of all next boxes forward to the first\n        // space or newline.\n        for (size_t j = i + 2;\n             j < boxes->size() && (*boxes)[j]->box_ != nullptr && (*boxes)[j]->ch_ != \"\\t\"; ++j) {\n          next = (*boxes)[j]->box_;\n          if (next->x + next->w > left) {\n            left = next->x + next->w;\n          }\n        }\n 
     }\n      // Italic and stylized characters can produce negative spaces, which\n      // Leptonica doesn't like, so clip to a positive size.\n      if (right <= left) {\n        right = left + 1;\n      }\n      if (bottom <= top) {\n        bottom = top + 1;\n      }\n      (*boxes)[i]->AddBox(left, top, right - left, bottom - top);\n      (*boxes)[i]->ch_ = \" \";\n    }\n  }\n}\n\n// Reorders text in a right-to-left script in left-to-right order.\n/* static */\nvoid BoxChar::ReorderRTLText(std::vector<BoxChar *> *boxes) {\n  // Ideally we need the inverse of the algorithm used by ResultIterator.\n  // For now, let's try a sort that reverses original positions for RTL\n  // characters, otherwise by x-position. This should be much closer to\n  // correct than just sorting by x-position.\n  size_t num_boxes = boxes->size();\n  for (size_t i = 0; i < num_boxes; ++i) {\n    int num_rtl = 0, num_ltr = 0;\n    (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);\n    if (num_rtl > num_ltr) {\n      (*boxes)[i]->set_rtl_index(i);\n      (*boxes)[i]->ReverseUnicodesInBox();\n    }\n  }\n  BoxCharPtrSort sorter;\n  size_t end = 0;\n  for (size_t start = 0; start < boxes->size(); start = end + 1) {\n    end = start + 1;\n    while (end < boxes->size() && (*boxes)[end]->ch_ != \"\\t\") {\n      ++end;\n    }\n    std::sort(boxes->begin() + start, boxes->begin() + end, sorter);\n  }\n}\n\n// Returns true if the vector contains mostly RTL characters.\n/* static */\nbool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar *> &boxes) {\n  int num_rtl = 0, num_ltr = 0;\n  for (auto boxe : boxes) {\n    boxe->GetDirection(&num_rtl, &num_ltr);\n  }\n  return num_rtl > num_ltr;\n}\n\n// Returns true if the text is mostly laid out vertically.\n/* static */\nbool BoxChar::MostlyVertical(const std::vector<BoxChar *> &boxes) {\n  int64_t total_dx = 0, total_dy = 0;\n  for (size_t i = 1; i < boxes.size(); ++i) {\n    if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr &&\n     
   boxes[i - 1]->page_ == boxes[i]->page_) {\n      int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;\n      int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;\n      if (abs(dx) > abs(dy) * kMinNewlineRatio || abs(dy) > abs(dx) * kMinNewlineRatio) {\n        total_dx += static_cast<int64_t>(dx) * dx;\n        total_dy += static_cast<int64_t>(dy) * dy;\n      }\n    }\n  }\n  return total_dy > total_dx;\n}\n\n// Returns the total length of all the strings in the boxes.\n/* static */\nint BoxChar::TotalByteLength(const std::vector<BoxChar *> &boxes) {\n  int total_length = 0;\n  for (auto boxe : boxes) {\n    total_length += boxe->ch_.size();\n  }\n  return total_length;\n}\n\n// Rotate the boxes in [start_box, end_box) by the given rotation.\n// The rotation is in radians clockwise about the given center.\n/* static */\nvoid BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box,\n                          std::vector<BoxChar *> *boxes) {\n  Boxa *orig = boxaCreate(0);\n  for (int i = start_box; i < end_box; ++i) {\n    Box *box = (*boxes)[i]->box_;\n    if (box) {\n      boxaAddBox(orig, box, L_CLONE);\n    }\n  }\n  Boxa *rotated = boxaRotate(orig, xcenter, ycenter, rotation);\n  boxaDestroy(&orig);\n  for (int i = start_box, box_ind = 0; i < end_box; ++i) {\n    if ((*boxes)[i]->box_) {\n      boxDestroy(&((*boxes)[i]->box_));\n      (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);\n    }\n  }\n  boxaDestroy(&rotated);\n}\n\nconst int kMaxLineLength = 1024;\n/* static */\nvoid BoxChar::WriteTesseractBoxFile(const std::string &filename, int height,\n                                    const std::vector<BoxChar *> &boxes) {\n  std::string output = GetTesseractBoxStr(height, boxes);\n  File::WriteStringToFileOrDie(output, filename);\n}\n\n/* static */\nstd::string BoxChar::GetTesseractBoxStr(int height, const std::vector<BoxChar *> &boxes) {\n  std::string output;\n  char buffer[kMaxLineLength];\n  for (auto boxe : 
boxes) {\n    const Box *box = boxe->box_;\n    if (box == nullptr) {\n      tprintf(\"Error: Call PrepareToWrite before WriteTesseractBoxFile!!\\n\");\n      return \"\";\n    }\n    int nbytes = snprintf(buffer, kMaxLineLength, \"%s %d %d %d %d %d\\n\", boxe->ch_.c_str(), box->x,\n                          height - box->y - box->h, box->x + box->w, height - box->y, boxe->page_);\n    output.append(buffer, nbytes);\n  }\n  return output;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/pango/boxchar.h",
    "content": "/**********************************************************************\n * File:        boxchar.h\n * Description: Simple class to associate a Tesseract classification unit with\n *              its bounding box so that the boxes can be rotated as the image\n *              is rotated for degradation.  Also includes routines to output\n *              the character-tagged boxes to a boxfile.\n * Author:      Ray Smith\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_TRAINING_BOXCHAR_H_\n#define TESSERACT_TRAINING_BOXCHAR_H_\n\n#include <string>\n#include <vector>\n\n#include <allheaders.h>   // for Leptonica API\n#if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || LIBLEPT_MAJOR_VERSION > 1\n#include <pix_internal.h> // for fast access to Box geometry\n#endif\n#include <tesseract/export.h>\n\nnamespace tesseract {\n\nclass BoxChar {\npublic:\n  BoxChar(const char *utf8_str, int len);\n\n  ~BoxChar();\n\n  // Accessors.\n  const std::string &ch() const {\n    return ch_;\n  }\n  const Box *box() const {\n    return box_;\n  }\n  const int &page() const {\n    return page_;\n  }\n  void set_rtl_index(int index) {\n    rtl_index_ = index;\n  }\n  const int &rtl_index() const {\n    return rtl_index_;\n  }\n\n  // Set the box_ member.\n  void AddBox(int x, int y, int width, int height);\n\n  void 
set_page(int page) {\n    page_ = page;\n  }\n\n  std::string *mutable_ch() {\n    return &ch_;\n  }\n  Box *mutable_box() {\n    return box_;\n  }\n\n  // Sort function for sorting by left edge of box. Note that this will not\n  // work properly until after InsertNewlines and InsertSpaces.\n  bool operator<(const BoxChar &other) const {\n    if (box_ == nullptr) {\n      return true;\n    }\n    if (other.box_ == nullptr) {\n      return false;\n    }\n    return box_->x < other.box_->x;\n  }\n  // Increments *num_rtl and *num_ltr according to the directionality of\n  // characters in the box.\n  void GetDirection(int *num_rtl, int *num_ltr) const;\n  // Reverses the order of unicodes within the box. If Pango generates a\n  // ligature, these will get reversed on output, so reverse now.\n  void ReverseUnicodesInBox();\n\n  static void TranslateBoxes(int xshift, int yshift, std::vector<BoxChar *> *boxes);\n\n  // Prepares for writing the boxes to a file by inserting newlines, spaces,\n  // and re-ordering so the boxes are strictly left-to-right.\n  static void PrepareToWrite(std::vector<BoxChar *> *boxes);\n  // Inserts newline (tab) characters into the vector at newline positions.\n  static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes);\n  // Converts nullptr boxes to space characters, with appropriate bounding\n  // boxes.\n  static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes);\n  // Reorders text in a right-to-left script in left-to-right order.\n  static void ReorderRTLText(std::vector<BoxChar *> *boxes);\n  // Returns true if the vector contains mostly RTL characters.\n  static bool ContainsMostlyRTL(const std::vector<BoxChar *> &boxes);\n  // Returns true if the text is mostly laid out vertically.\n  static bool MostlyVertical(const std::vector<BoxChar *> &boxes);\n\n  // Returns the total length of all the strings in the boxes.\n  static int TotalByteLength(const 
std::vector<BoxChar *> &boxes);\n\n  // Rotate the vector of boxes between start and end by the given rotation.\n  // The rotation is in radians clockwise about the given center.\n  static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box,\n                          std::vector<BoxChar *> *boxes);\n\n  // Create a tesseract box file from the vector of boxes. The image height\n  // is needed to convert to tesseract coordinates.\n  static void WriteTesseractBoxFile(const std::string &name, int height,\n                                    const std::vector<BoxChar *> &boxes);\n  // Gets the tesseract box file as a string from the vector of boxes.\n  // The image height is needed to convert to tesseract coordinates.\n  static std::string GetTesseractBoxStr(int height, const std::vector<BoxChar *> &boxes);\n\nprivate:\n  std::string ch_;\n  Box *box_;\n  int page_;\n  // If the box is an RTL character, contains the original position in the\n  // array of boxes (before reversal), otherwise -1.\n  int rtl_index_;\n};\n\n// Sort predicate to sort a vector of BoxChar*.\nstruct BoxCharPtrSort {\n  bool operator()(const BoxChar *box1, const BoxChar *box2) const {\n    if (box1->rtl_index() >= 0 && box2->rtl_index() >= 0) {\n      return box2->rtl_index() < box1->rtl_index();\n    }\n    return *box1 < *box2;\n  }\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_BOXCHAR_H_\n"
  },
  {
    "path": "src/training/pango/export.h",
    "content": "#pragma once\n\n#ifdef CMAKE_BUILD\n#  include <pango_training_export.h>\n#endif\n"
  },
  {
    "path": "src/training/pango/ligature_table.cpp",
    "content": "/**********************************************************************\n * File:        ligature_table.cpp\n * Description: Class for adding and removing optional latin ligatures,\n *              conditional on codepoint support by a specified font\n *              (if specified).\n * Author:      Ranjith Unnikrishnan\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#include \"ligature_table.h\"\n\n#include <tesseract/unichar.h>\n#include \"pango_font_info.h\"\n#include \"tlog.h\"\n#include \"unicharset.h\"\n#include \"unicode/errorcode.h\" // from libicu\n#include \"unicode/normlzr.h\"   // from libicu\n#include \"unicode/unistr.h\"    // from libicu\n#include \"unicode/utypes.h\"    // from libicu\n\n#include <utility>\n\nnamespace tesseract {\n\nstatic std::string EncodeAsUTF8(const char32 ch32) {\n  UNICHAR uni_ch(ch32);\n  return std::string(uni_ch.utf8(), uni_ch.utf8_len());\n}\n\n// Range of optional latin ligature characters in Unicode to build ligatures\n// from. 
Note that this range does not contain the custom ligatures that we\n// encode in the private use area.\nconst int kMinLigature = 0xfb00;\nconst int kMaxLigature = 0xfb17; // Don't put the wide Hebrew letters in.\n\n/* static */\nstd::unique_ptr<LigatureTable> LigatureTable::instance_;\n\n/* static */\nLigatureTable *LigatureTable::Get() {\n  if (instance_ == nullptr) {\n    instance_.reset(new LigatureTable());\n    instance_->Init();\n  }\n  return instance_.get();\n}\n\nLigatureTable::LigatureTable()\n    : min_lig_length_(0), max_lig_length_(0), min_norm_length_(0), max_norm_length_(0) {}\n\nvoid LigatureTable::Init() {\n  if (norm_to_lig_table_.empty()) {\n    for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {\n      // For each char in the range, convert to utf8, nfc normalize, and if\n      // the strings are different put the both mappings in the hash_maps.\n      std::string lig8 = EncodeAsUTF8(lig);\n      icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));\n      icu::UnicodeString normed8_result;\n      icu::ErrorCode status;\n      icu::Normalizer::normalize(unicode_lig8, UNORM_NFC, 0, normed8_result, status);\n      std::string normed8;\n      normed8_result.toUTF8String(normed8);\n      int lig_length = lig8.length();\n      int norm_length = normed8.size();\n      if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {\n        norm_to_lig_table_[normed8] = lig8;\n        lig_to_norm_table_[lig8] = std::move(normed8);\n        if (min_lig_length_ == 0 || lig_length < min_lig_length_) {\n          min_lig_length_ = lig_length;\n        }\n        if (lig_length > max_lig_length_) {\n          max_lig_length_ = lig_length;\n        }\n        if (min_norm_length_ == 0 || norm_length < min_norm_length_) {\n          min_norm_length_ = norm_length;\n        }\n        if (norm_length > max_norm_length_) {\n          max_norm_length_ = norm_length;\n        }\n      }\n    }\n    // Add custom extra ligatures.\n    for (int i = 0; 
UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {\n      norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] = UNICHARSET::kCustomLigatures[i][1];\n      int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);\n      if (min_norm_length_ == 0 || norm_length < min_norm_length_) {\n        min_norm_length_ = norm_length;\n      }\n      if (norm_length > max_norm_length_) {\n        max_norm_length_ = norm_length;\n      }\n\n      lig_to_norm_table_[UNICHARSET::kCustomLigatures[i][1]] = UNICHARSET::kCustomLigatures[i][0];\n    }\n  }\n}\n\nstd::string LigatureTable::RemoveLigatures(const std::string &str) const {\n  std::string result;\n  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());\n  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());\n  char tmp[5];\n  int len;\n  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {\n    len = it.get_utf8(tmp);\n    tmp[len] = '\\0';\n    auto lig_it = lig_to_norm_table_.find(tmp);\n    if (lig_it != lig_to_norm_table_.end()) {\n      result += lig_it->second;\n    } else {\n      result += tmp;\n    }\n  }\n  return result;\n}\n\nstd::string LigatureTable::RemoveCustomLigatures(const std::string &str) const {\n  std::string result;\n  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());\n  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());\n  char tmp[5];\n  int len;\n  int norm_ind;\n  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {\n    len = it.get_utf8(tmp);\n    tmp[len] = '\\0';\n    norm_ind = -1;\n    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) {\n      if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) {\n        norm_ind = i;\n      }\n    }\n    if (norm_ind >= 0) {\n      result += UNICHARSET::kCustomLigatures[norm_ind][0];\n    } else {\n      result += tmp;\n    }\n  }\n  return result;\n}\n\nstd::string 
LigatureTable::AddLigatures(const std::string &str, const PangoFontInfo *font) const {\n  std::string result;\n  int len = str.size();\n  int step = 0;\n  int i = 0;\n  for (i = 0; i < len - min_norm_length_ + 1; i += step) {\n    step = 0;\n    for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {\n      if (i + liglen <= len) {\n        std::string lig_cand = str.substr(i, liglen);\n        auto it = norm_to_lig_table_.find(lig_cand);\n        if (it != norm_to_lig_table_.end()) {\n          tlog(3, \"Considering %s -> %s\\n\", lig_cand.c_str(), it->second.c_str());\n          if (font) {\n            // Test for renderability.\n            if (!font->CanRenderString(it->second.data(), it->second.length())) {\n              continue; // Not renderable\n            }\n          }\n          // Found a match so convert it.\n          step = liglen;\n          result += it->second;\n          tlog(2, \"Substituted %s -> %s\\n\", lig_cand.c_str(), it->second.c_str());\n          break;\n        }\n      }\n    }\n    if (step == 0) {\n      result += str[i];\n      step = 1;\n    }\n  }\n  result += str.substr(i, len - i);\n  return result;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/pango/ligature_table.h",
    "content": "/**********************************************************************\n * File:        ligature_table.h\n * Description: Class for adding and removing optional latin ligatures,\n *              conditional on codepoint support by a specified font\n *              (if specified).\n * Author:      Ranjith Unnikrishnan\n * Created:     Mon Nov 18 2013\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TRAININGDATA_LIGATURE_TABLE_H_\n#define TRAININGDATA_LIGATURE_TABLE_H_\n\n#include \"export.h\"\n\n#include <memory>\n#include <string>\n#include <unordered_map>\n\nnamespace tesseract {\n\nclass PangoFontInfo; // defined in pango_font_info.h\n\n// Map to substitute strings for ligatures.\nusing LigHash = std::unordered_map<std::string, std::string>;\n\nclass TESS_PANGO_TRAINING_API LigatureTable {\npublic:\n  // Get a static instance of this class.\n  static LigatureTable *Get();\n\n  // Convert the utf8 string so that ligaturizable sequences, such as \"fi\" get\n  // replaced by the (utf8 code for) appropriate ligature characters. 
Only do so\n  // if the corresponding ligature character is renderable in the current font.\n  std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const;\n  // Remove all ligatures.\n  std::string RemoveLigatures(const std::string &str) const;\n  // Remove only custom ligatures (eg. \"ct\") encoded in the private-use-area.\n  std::string RemoveCustomLigatures(const std::string &str) const;\n\n  const LigHash &norm_to_lig_table() const {\n    return norm_to_lig_table_;\n  }\n  const LigHash &lig_to_norm_table() const {\n    return lig_to_norm_table_;\n  }\n\nprotected:\n  LigatureTable();\n  // Initialize the hash tables mapping between ligature strings and the\n  // corresponding ligature characters.\n  void Init();\n\n  static std::unique_ptr<LigatureTable> instance_;\n  LigHash norm_to_lig_table_;\n  LigHash lig_to_norm_table_;\n  int min_lig_length_;\n  int max_lig_length_;\n  int min_norm_length_;\n  int max_norm_length_;\n\nprivate:\n  LigatureTable(const LigatureTable &) = delete;\n  void operator=(const LigatureTable &) = delete;\n};\n\n} // namespace tesseract\n\n#endif // OCR_TRAININGDATA_TYPESETTING_LIGATURE_TABLE_H_\n"
  },
  {
    "path": "src/training/pango/pango_font_info.cpp",
    "content": "/**********************************************************************\n * File:        pango_font_info.cpp\n * Description: Font-related objects and helper functions\n * Author:      Ranjith Unnikrishnan\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#if (defined __CYGWIN__)\n// workaround for stdlib.h and putenv\n#  undef __STRICT_ANSI__\n#endif\n\n#include \"commandlineflags.h\"\n#include \"fileio.h\"\n#include \"normstrngs.h\"\n#include \"pango_font_info.h\"\n#include \"tlog.h\"\n\n#include <tesseract/unichar.h>\n\n#include \"pango/pango.h\"\n#include \"pango/pangocairo.h\"\n#include \"pango/pangofc-font.h\"\n\n#include <algorithm>\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n\n#ifndef _MSC_VER\n#  include <sys/param.h>\n#endif\n\n#define DISABLE_HEAP_LEAK_CHECK\n\nusing namespace tesseract;\n\nnamespace tesseract {\n\n// Default assumed output resolution. 
Required only for providing font metrics\n// in pixels.\nconst int kDefaultResolution = 300;\n\nstd::string PangoFontInfo::fonts_dir_;\nstd::string PangoFontInfo::cache_dir_;\n\nstatic PangoGlyph get_glyph(PangoFont *font, gunichar wc) {\n#if PANGO_VERSION_CHECK(1, 44, 0)\n  // pango_font_get_hb_font requires Pango 1.44 or newer.\n  hb_font_t *hb_font = pango_font_get_hb_font(font);\n  hb_codepoint_t glyph;\n  hb_font_get_nominal_glyph(hb_font, wc, &glyph);\n#else\n  // Use deprecated pango_fc_font_get_glyph for older Pango versions.\n  PangoGlyph glyph = pango_fc_font_get_glyph(PANGO_FC_FONT(font), wc);\n#endif\n  return glyph;\n}\n\nPangoFontInfo::PangoFontInfo() : desc_(nullptr), resolution_(kDefaultResolution) {\n  Clear();\n}\n\nPangoFontInfo::PangoFontInfo(const std::string &desc)\n    : desc_(nullptr), resolution_(kDefaultResolution) {\n  if (!ParseFontDescriptionName(desc)) {\n    tprintf(\"ERROR: Could not parse %s\\n\", desc.c_str());\n    Clear();\n  }\n}\n\nvoid PangoFontInfo::Clear() {\n  font_size_ = 0;\n  family_name_.clear();\n  font_type_ = UNKNOWN;\n  if (desc_) {\n    pango_font_description_free(desc_);\n    desc_ = nullptr;\n  }\n}\n\nPangoFontInfo::~PangoFontInfo() {\n  pango_font_description_free(desc_);\n}\n\nstd::string PangoFontInfo::DescriptionName() const {\n  if (!desc_) {\n    return \"\";\n  }\n  char *desc_str = pango_font_description_to_string(desc_);\n  std::string desc_name(desc_str);\n  g_free(desc_str);\n  return desc_name;\n}\n\n// If not already initialized, initializes FontConfig by setting its\n// environment variable and creating a fonts.conf file that points to the\n// FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.\n/* static */\nvoid PangoFontInfo::SoftInitFontConfig() {\n  if (fonts_dir_.empty()) {\n    HardInitFontConfig(FLAGS_fonts_dir.c_str(), FLAGS_fontconfig_tmpdir.c_str());\n  }\n}\n\n// Re-initializes font config, whether or not already initialized.\n// If already initialized, any existing cache is 
deleted, just to be sure.\n/* static */\nvoid PangoFontInfo::HardInitFontConfig(const char *fonts_dir, const char *cache_dir) {\n  if (!cache_dir_.empty()) {\n    File::DeleteMatchingFiles(File::JoinPath(cache_dir_.c_str(), \"*cache-?\").c_str());\n  }\n  const int MAX_FONTCONF_FILESIZE = 1024;\n  char fonts_conf_template[MAX_FONTCONF_FILESIZE];\n  cache_dir_ = cache_dir;\n  fonts_dir_ = fonts_dir;\n  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,\n           \"<?xml version=\\\"1.0\\\"?>\\n\"\n           \"<!DOCTYPE fontconfig SYSTEM \\\"fonts.dtd\\\">\\n\"\n           \"<fontconfig>\\n\"\n           \"<dir>%s</dir>\\n\"\n           \"<cachedir>%s</cachedir>\\n\"\n           \"<config></config>\\n\"\n           \"</fontconfig>\\n\",\n           fonts_dir, cache_dir);\n  std::string fonts_conf_file = File::JoinPath(cache_dir, \"fonts.conf\");\n  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);\n#ifdef _WIN32\n  std::string env(\"FONTCONFIG_PATH=\");\n  env.append(cache_dir);\n  _putenv(env.c_str());\n  _putenv(\"LANG=en_US.utf8\");\n#else\n  setenv(\"FONTCONFIG_PATH\", cache_dir, true);\n  // Fix the locale so that the reported font names are consistent.\n  setenv(\"LANG\", \"en_US.utf8\", true);\n#endif // _WIN32\n\n  if (FcInitReinitialize() != FcTrue) {\n    tprintf(\"FcInitiReinitialize failed!!\\n\");\n  }\n  FontUtils::ReInit();\n  // Clear Pango's font cache too.\n  pango_cairo_font_map_set_default(nullptr);\n}\n\nstatic void ListFontFamilies(PangoFontFamily ***families, int *n_families) {\n  PangoFontInfo::SoftInitFontConfig();\n  PangoFontMap *font_map = pango_cairo_font_map_get_default();\n  DISABLE_HEAP_LEAK_CHECK;\n  pango_font_map_list_families(font_map, families, n_families);\n}\n\nbool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {\n  Clear();\n  const char *family = pango_font_description_get_family(desc);\n  if (!family) {\n    char *desc_str = pango_font_description_to_string(desc);\n    
tprintf(\"WARNING: Could not parse family name from description: '%s'\\n\", desc_str);\n    g_free(desc_str);\n    return false;\n  }\n  family_name_ = std::string(family);\n  desc_ = pango_font_description_copy(desc);\n\n  // Set font size in points\n  font_size_ = pango_font_description_get_size(desc);\n  if (!pango_font_description_get_size_is_absolute(desc)) {\n    font_size_ /= PANGO_SCALE;\n  }\n\n  return true;\n}\n\nbool PangoFontInfo::ParseFontDescriptionName(const std::string &name) {\n  PangoFontDescription *desc = pango_font_description_from_string(name.c_str());\n  bool success = ParseFontDescription(desc);\n  pango_font_description_free(desc);\n  return success;\n}\n\n// Returns the PangoFont structure corresponding to the closest available font\n// in the font map. Note that if the font is wholly missing, this could\n// correspond to a completely different font family and face.\nPangoFont *PangoFontInfo::ToPangoFont() const {\n  SoftInitFontConfig();\n  PangoFontMap *font_map = pango_cairo_font_map_get_default();\n  PangoContext *context = pango_context_new();\n  pango_cairo_context_set_resolution(context, resolution_);\n  pango_context_set_font_map(context, font_map);\n  PangoFont *font = nullptr;\n  {\n    DISABLE_HEAP_LEAK_CHECK;\n    font = pango_font_map_load_font(font_map, context, desc_);\n  }\n  g_object_unref(context);\n  return font;\n}\n\nbool PangoFontInfo::CoversUTF8Text(const char *utf8_text, int byte_length) const {\n  PangoFont *font = ToPangoFont();\n  if (font == nullptr) {\n    // Font not found.\n    return false;\n  }\n  PangoCoverage *coverage = pango_font_get_coverage(font, nullptr);\n  for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);\n       it != UNICHAR::end(utf8_text, byte_length); ++it) {\n    if (IsWhitespace(*it) || pango_is_zero_width(*it)) {\n      continue;\n    }\n    if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {\n      char tmp[5];\n      int len = it.get_utf8(tmp);\n      
tmp[len] = '\\0';\n      tlog(2, \"'%s' (U+%x) not covered by font\\n\", tmp, *it);\n#if PANGO_VERSION_CHECK(1, 50, 4)\n      g_object_unref(coverage);\n#else\n      pango_coverage_unref(coverage);\n#endif\n      g_object_unref(font);\n      return false;\n    }\n  }\n#if PANGO_VERSION_CHECK(1, 50, 4)\n  g_object_unref(coverage);\n#else\n  pango_coverage_unref(coverage);\n#endif\n  g_object_unref(font);\n  return true;\n}\n\n// This variant of strncpy permits src and dest to overlap. It will copy the\n// first byte first.\nstatic char *my_strnmove(char *dest, const char *src, size_t n) {\n  char *ret = dest;\n\n  // Copy characters until n reaches zero or the src byte is a nul.\n  do {\n    *dest = *src;\n    --n;\n    ++dest;\n    ++src;\n  } while (n && src[0]);\n\n  // If we reached a nul byte and there are more 'n' left, zero them out.\n  while (n) {\n    *dest = '\\0';\n    --n;\n    ++dest;\n  }\n  return ret;\n}\n\nint PangoFontInfo::DropUncoveredChars(std::string *utf8_text) const {\n  int num_dropped_chars = 0;\n  PangoFont *font = ToPangoFont();\n  if (font == nullptr) {\n    // Font not found, drop all characters.\n    num_dropped_chars = utf8_text->length();\n    utf8_text->clear();\n    return num_dropped_chars;\n  }\n  PangoCoverage *coverage = pango_font_get_coverage(font, nullptr);\n  // Maintain two iterators that point into the string. 
For space efficiency, we\n  // will repeatedly copy one covered UTF8 character from one to the other, and\n  // at the end resize the string to the right length.\n  char *out = const_cast<char *>(utf8_text->c_str());\n  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_text->c_str(), utf8_text->length());\n  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_text->c_str(), utf8_text->length());\n  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {\n    // Skip bad utf-8.\n    if (!it.is_legal()) {\n      ++it; // One suitable error message will still be issued.\n      continue;\n    }\n    int unicode = *it;\n    int utf8_len = it.utf8_len();\n    const char *utf8_char = it.utf8_data();\n    // Move it forward before the data gets modified.\n    ++it;\n    if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&\n        pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {\n      if (TLOG_IS_ON(2)) {\n        UNICHAR unichar(unicode);\n        char *str = unichar.utf8_str();\n        tlog(2, \"'%s' (U+%x) not covered by font\\n\", str, unicode);\n        delete[] str;\n      }\n      ++num_dropped_chars;\n      continue;\n    }\n    my_strnmove(out, utf8_char, utf8_len);\n    out += utf8_len;\n  }\n#if PANGO_VERSION_CHECK(1, 50, 4)\n  g_object_unref(coverage);\n#else\n  pango_coverage_unref(coverage);\n#endif\n  g_object_unref(font);\n  utf8_text->resize(out - utf8_text->c_str());\n  return num_dropped_chars;\n}\n\nbool PangoFontInfo::GetSpacingProperties(const std::string &utf8_char, int *x_bearing,\n                                         int *x_advance) const {\n  // Convert to equivalent PangoFont structure\n  PangoFont *font = ToPangoFont();\n  if (!font) {\n    return false;\n  }\n  // Find the glyph index in the font for the supplied utf8 character.\n  int total_advance = 0;\n  int min_bearing = 0;\n  // Handle multi-unicode strings by reporting the left-most position of the\n  // x-bearing, and right-most 
position of the x-advance if the string were to\n  // be rendered.\n  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(), utf8_char.length());\n  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(), utf8_char.length());\n  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {\n    PangoGlyph glyph_index = get_glyph(font, *it);\n    if (!glyph_index) {\n      // Glyph for given unicode character doesn't exist in font.\n      g_object_unref(font);\n      return false;\n    }\n    // Find the ink glyph extents for the glyph\n    PangoRectangle ink_rect, logical_rect;\n    pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);\n    pango_extents_to_pixels(&ink_rect, nullptr);\n    pango_extents_to_pixels(&logical_rect, nullptr);\n\n    int bearing = total_advance + PANGO_LBEARING(ink_rect);\n    if (it == it_begin || bearing < min_bearing) {\n      min_bearing = bearing;\n    }\n    total_advance += PANGO_RBEARING(logical_rect);\n  }\n  *x_bearing = min_bearing;\n  *x_advance = total_advance;\n  g_object_unref(font);\n  return true;\n}\n\nbool PangoFontInfo::CanRenderString(const char *utf8_word, int len) const {\n  std::vector<std::string> graphemes;\n  return CanRenderString(utf8_word, len, &graphemes);\n}\n\nbool PangoFontInfo::CanRenderString(const char *utf8_word, int len,\n                                    std::vector<std::string> *graphemes) const {\n  if (graphemes) {\n    graphemes->clear();\n  }\n  // We check for font coverage of the text first, as otherwise Pango could\n  // (undesirably) fall back to another font that does have the required\n  // coverage.\n  if (!CoversUTF8Text(utf8_word, len)) {\n    return false;\n  }\n  // U+25CC dotted circle character that often (but not always) gets rendered\n  // when there is an illegal grapheme sequence.\n  const char32 kDottedCircleGlyph = 9676;\n  bool bad_glyph = false;\n  PangoFontMap *font_map = pango_cairo_font_map_get_default();\n  
PangoContext *context = pango_context_new();\n  pango_context_set_font_map(context, font_map);\n  PangoLayout *layout;\n  {\n    // Pango is not releasing the cached layout.\n    DISABLE_HEAP_LEAK_CHECK;\n    layout = pango_layout_new(context);\n  }\n  if (desc_) {\n    pango_layout_set_font_description(layout, desc_);\n  } else {\n    PangoFontDescription *desc = pango_font_description_from_string(DescriptionName().c_str());\n    pango_layout_set_font_description(layout, desc);\n    pango_font_description_free(desc);\n  }\n  pango_layout_set_text(layout, utf8_word, len);\n  PangoLayoutIter *run_iter = nullptr;\n  { // Fontconfig caches some information here that is not freed before exit.\n    DISABLE_HEAP_LEAK_CHECK;\n    run_iter = pango_layout_get_iter(layout);\n  }\n  do {\n    PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter);\n    if (!run) {\n      tlog(2, \"Found end of line nullptr run marker\\n\");\n      continue;\n    }\n    PangoGlyph dotted_circle_glyph;\n    PangoFont *font = run->item->analysis.font;\n\n    dotted_circle_glyph = get_glyph(font, kDottedCircleGlyph);\n\n    if (TLOG_IS_ON(2)) {\n      PangoFontDescription *desc = pango_font_describe(font);\n      char *desc_str = pango_font_description_to_string(desc);\n      tlog(2, \"Desc of font in run: %s\\n\", desc_str);\n      g_free(desc_str);\n      pango_font_description_free(desc);\n    }\n\n    PangoGlyphItemIter cluster_iter;\n    gboolean have_cluster;\n    for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, run, utf8_word);\n         have_cluster && !bad_glyph;\n         have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {\n      const int start_byte_index = cluster_iter.start_index;\n      const int end_byte_index = cluster_iter.end_index;\n      int start_glyph_index = cluster_iter.start_glyph;\n      int end_glyph_index = cluster_iter.end_glyph;\n      std::string cluster_text =\n          std::string(utf8_word + start_byte_index, 
end_byte_index - start_byte_index);\n      if (graphemes) {\n        graphemes->push_back(cluster_text);\n      }\n      if (IsUTF8Whitespace(cluster_text.c_str())) {\n        tlog(2, \"Skipping whitespace\\n\");\n        continue;\n      }\n      if (TLOG_IS_ON(2)) {\n        printf(\"start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d \", start_byte_index,\n               end_byte_index, start_glyph_index, end_glyph_index);\n      }\n      for (int i = start_glyph_index, step = (end_glyph_index > start_glyph_index) ? 1 : -1;\n           !bad_glyph && i != end_glyph_index; i += step) {\n        const bool unknown_glyph =\n            (cluster_iter.glyph_item->glyphs->glyphs[i].glyph & PANGO_GLYPH_UNKNOWN_FLAG);\n        const bool illegal_glyph =\n            (cluster_iter.glyph_item->glyphs->glyphs[i].glyph == dotted_circle_glyph);\n        bad_glyph = unknown_glyph || illegal_glyph;\n        if (TLOG_IS_ON(2)) {\n          printf(\"(%d=%d)\", cluster_iter.glyph_item->glyphs->glyphs[i].glyph, bad_glyph ? 1 : 0);\n        }\n      }\n      if (TLOG_IS_ON(2)) {\n        printf(\"  '%s'\\n\", cluster_text.c_str());\n      }\n      if (bad_glyph)\n        tlog(1, \"Found illegal glyph!\\n\");\n    }\n  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));\n\n  pango_layout_iter_free(run_iter);\n  g_object_unref(context);\n  g_object_unref(layout);\n  if (bad_glyph && graphemes) {\n    graphemes->clear();\n  }\n  return !bad_glyph;\n}\n\n// ------------------------ FontUtils ------------------------------------\nstd::vector<std::string> FontUtils::available_fonts_; // cache list\n\n// Returns whether the specified font description is available in the fonts\n// directory.\n//\n// The generated list of font families and faces includes \"synthesized\" font\n// faces that are not truly loadable. 
Pango versions >=1.18 have a\n// pango_font_face_is_synthesized method that can be used to prune the list.\n// Until then, we are restricted to using a hack where we try to load the font\n// from the font_map, and then check what we loaded to see if it has the\n// description we expected. If it is not, then the font is deemed unavailable.\n//\n// TODO: This function reports also some not synthesized fonts as not available\n// e.g. 'Bitstream Charter Medium Italic', 'LMRoman17', so we need this hack\n// until  other solution is found.\n/* static */\nbool FontUtils::IsAvailableFont(const char *input_query_desc, std::string *best_match) {\n  std::string query_desc(input_query_desc);\n  PangoFontDescription *desc = pango_font_description_from_string(query_desc.c_str());\n  PangoFont *selected_font = nullptr;\n  {\n    PangoFontInfo::SoftInitFontConfig();\n    PangoFontMap *font_map = pango_cairo_font_map_get_default();\n    PangoContext *context = pango_context_new();\n    pango_context_set_font_map(context, font_map);\n    {\n      DISABLE_HEAP_LEAK_CHECK;\n      selected_font = pango_font_map_load_font(font_map, context, desc);\n    }\n    g_object_unref(context);\n  }\n  if (selected_font == nullptr) {\n    pango_font_description_free(desc);\n    tlog(4, \"** Font '%s' failed to load from font map!\\n\", input_query_desc);\n    return false;\n  }\n  PangoFontDescription *selected_desc = pango_font_describe(selected_font);\n\n  bool equal = pango_font_description_equal(desc, selected_desc);\n  tlog(3, \"query weight = %d \\t selected weight =%d\\n\", pango_font_description_get_weight(desc),\n       pango_font_description_get_weight(selected_desc));\n\n  char *selected_desc_str = pango_font_description_to_string(selected_desc);\n  tlog(2, \"query_desc: '%s' Selected: '%s'\\n\", query_desc.c_str(), selected_desc_str);\n  if (!equal && best_match != nullptr) {\n    *best_match = selected_desc_str;\n    // Clip the ending ' 0' if there is one. 
It seems that, if there is no\n    // point size on the end of the fontname, then Pango always appends ' 0'.\n    auto len = best_match->size();\n    if (len > 2 && best_match->at(len - 1) == '0' && best_match->at(len - 2) == ' ') {\n      best_match->resize(len - 2);\n    }\n  }\n  g_free(selected_desc_str);\n  pango_font_description_free(selected_desc);\n  g_object_unref(selected_font);\n  pango_font_description_free(desc);\n  if (!equal)\n    tlog(4, \"** Font '%s' failed pango_font_description_equal!\\n\", input_query_desc);\n  return equal;\n}\n\nstatic bool ShouldIgnoreFontFamilyName(const char *query) {\n  static const char *kIgnoredFamilyNames[] = {\"Sans\", \"Serif\", \"Monospace\", nullptr};\n  const char **list = kIgnoredFamilyNames;\n  for (; *list != nullptr; ++list) {\n    if (!strcmp(*list, query)) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Outputs description names of available fonts.\n/* static */\nconst std::vector<std::string> &FontUtils::ListAvailableFonts() {\n  if (!available_fonts_.empty()) {\n    return available_fonts_;\n  }\n\n  PangoFontFamily **families = nullptr;\n  int n_families = 0;\n  ListFontFamilies(&families, &n_families);\n  for (int i = 0; i < n_families; ++i) {\n    const char *family_name = pango_font_family_get_name(families[i]);\n    tlog(2, \"Listing family %s\\n\", family_name);\n    if (ShouldIgnoreFontFamilyName(family_name)) {\n      continue;\n    }\n\n    int n_faces;\n    PangoFontFace **faces = nullptr;\n    pango_font_family_list_faces(families[i], &faces, &n_faces);\n    for (int j = 0; j < n_faces; ++j) {\n      PangoFontDescription *desc = pango_font_face_describe(faces[j]);\n      char *desc_str = pango_font_description_to_string(desc);\n      // \"synthesized\" font faces that are not truly loadable, so we skip it\n      if (!pango_font_face_is_synthesized(faces[j]) && IsAvailableFont(desc_str)) {\n        available_fonts_.emplace_back(desc_str);\n      }\n      
pango_font_description_free(desc);\n      g_free(desc_str);\n    }\n    g_free(faces);\n  }\n  g_free(families);\n  std::sort(available_fonts_.begin(), available_fonts_.end());\n  return available_fonts_;\n}\n\n// Utilities written to be backward compatible with StringRender\n\n/* static */\nint FontUtils::FontScore(const std::unordered_map<char32, int64_t> &ch_map,\n                         const std::string &fontname, int *raw_score, std::vector<bool> *ch_flags) {\n  PangoFontInfo font_info;\n  if (!font_info.ParseFontDescriptionName(fontname)) {\n    tprintf(\"ERROR: Could not parse %s\\n\", fontname.c_str());\n  }\n  PangoFont *font = font_info.ToPangoFont();\n  PangoCoverage *coverage = nullptr;\n  if (font != nullptr) {\n    coverage = pango_font_get_coverage(font, nullptr);\n  }\n  if (ch_flags) {\n    ch_flags->clear();\n    ch_flags->reserve(ch_map.size());\n  }\n  *raw_score = 0;\n  int ok_chars = 0;\n  for (auto &&it : ch_map) {\n    bool covered =\n        (coverage != nullptr) && (IsWhitespace(it.first) ||\n                                  (pango_coverage_get(coverage, it.first) == PANGO_COVERAGE_EXACT));\n    if (covered) {\n      ++(*raw_score);\n      ok_chars += it.second;\n    }\n    if (ch_flags) {\n      ch_flags->push_back(covered);\n    }\n  }\n#if PANGO_VERSION_CHECK(1, 50, 4)\n  g_object_unref(coverage);\n#else\n  pango_coverage_unref(coverage);\n#endif\n  g_object_unref(font);\n  return ok_chars;\n}\n\n/* static */\nstd::string FontUtils::BestFonts(const std::unordered_map<char32, int64_t> &ch_map,\n                                 std::vector<std::pair<const char *, std::vector<bool>>> *fonts) {\n  const double kMinOKFraction = 0.99;\n  // Weighted fraction of characters that must be renderable in a font to make\n  // it OK even if the raw count is not good.\n  const double kMinWeightedFraction = 0.99995;\n\n  fonts->clear();\n  std::vector<std::vector<bool>> font_flags;\n  std::vector<int> font_scores;\n  std::vector<int> raw_scores;\n  
int most_ok_chars = 0;\n  int best_raw_score = 0;\n  const std::vector<std::string> &font_names = FontUtils::ListAvailableFonts();\n  for (const auto &font_name : font_names) {\n    std::vector<bool> ch_flags;\n    int raw_score = 0;\n    int ok_chars = FontScore(ch_map, font_name, &raw_score, &ch_flags);\n    most_ok_chars = std::max(ok_chars, most_ok_chars);\n    best_raw_score = std::max(raw_score, best_raw_score);\n\n    font_flags.push_back(ch_flags);\n    font_scores.push_back(ok_chars);\n    raw_scores.push_back(raw_score);\n  }\n\n  // Now select the fonts with a score above a threshold fraction\n  // of both the raw and weighted best scores. To prevent bogus fonts being\n  // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of\n  // BOTH weighted and raw scores.\n  // In low character-count scripts, the issue is more getting enough fonts,\n  // when only 1 or 2 might have all those rare dingbats etc in them, so we\n  // allow a font with a very high weighted (coverage) score\n  // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.\n  int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);\n  int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);\n  int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);\n\n  std::string font_list;\n  for (unsigned i = 0; i < font_names.size(); ++i) {\n    int score = font_scores[i];\n    int raw_score = raw_scores[i];\n    if ((score >= least_good_enough && raw_score >= least_raw_enough) || score >= override_enough) {\n      fonts->push_back(std::make_pair(font_names[i].c_str(), font_flags[i]));\n      tlog(1, \"OK font %s = %.4f%%, raw = %d = %.2f%%\\n\", font_names[i].c_str(),\n           100.0 * score / most_ok_chars, raw_score, 100.0 * raw_score / best_raw_score);\n      font_list += font_names[i];\n      font_list += \"\\n\";\n    } else if (score >= least_good_enough || raw_score >= least_raw_enough) {\n      
tlog(1, \"Runner-up font %s = %.4f%%, raw = %d = %.2f%%\\n\", font_names[i].c_str(),\n           100.0 * score / most_ok_chars, raw_score, 100.0 * raw_score / best_raw_score);\n    }\n  }\n  return font_list;\n}\n\n/* static */\nbool FontUtils::SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name,\n                           std::vector<std::string> *graphemes) {\n  return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name, graphemes);\n}\n\n/* static */\nbool FontUtils::SelectFont(const char *utf8_word, const int utf8_len,\n                           const std::vector<std::string> &all_fonts, std::string *font_name,\n                           std::vector<std::string> *graphemes) {\n  if (font_name) {\n    font_name->clear();\n  }\n  if (graphemes) {\n    graphemes->clear();\n  }\n  for (const auto &all_font : all_fonts) {\n    PangoFontInfo font;\n    std::vector<std::string> found_graphemes;\n    ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_font), \"Could not parse font desc name %s\\n\",\n                    all_font.c_str());\n    if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {\n      if (graphemes) {\n        graphemes->swap(found_graphemes);\n      }\n      if (font_name) {\n        *font_name = all_font;\n      }\n      return true;\n    }\n  }\n  return false;\n}\n\n// PangoFontInfo is reinitialized, so clear the static list of fonts.\n/* static */\nvoid FontUtils::ReInit() {\n  available_fonts_.clear();\n}\n\n// Print info about used font backend\n/* static */\nvoid FontUtils::PangoFontTypeInfo() {\n  PangoFontMap *font_map = pango_cairo_font_map_get_default();\n  if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==\n      CAIRO_FONT_TYPE_TOY) {\n    printf(\"Using CAIRO_FONT_TYPE_TOY.\\n\");\n  } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==\n             CAIRO_FONT_TYPE_FT) {\n    printf(\"Using 
CAIRO_FONT_TYPE_FT.\\n\");\n  } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==\n             CAIRO_FONT_TYPE_WIN32) {\n    printf(\"Using CAIRO_FONT_TYPE_WIN32.\\n\");\n  } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==\n             CAIRO_FONT_TYPE_QUARTZ) {\n    printf(\"Using CAIRO_FONT_TYPE_QUARTZ.\\n\");\n  } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==\n             CAIRO_FONT_TYPE_USER) {\n    printf(\"Using CAIRO_FONT_TYPE_USER.\\n\");\n  } else if (!font_map) {\n    printf(\"Cannot create pango cairo font map!\\n\");\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/pango/pango_font_info.h",
    "content": "/**********************************************************************\n * File:        pango_font_info.h\n * Description: Font-related objects and helper functions\n * Author:      Ranjith Unnikrishnan\n * Created:     Mon Nov 18 2013\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_\n#define TESSERACT_TRAINING_PANGO_FONT_INFO_H_\n\n#include \"export.h\"\n\n#include \"commandlineflags.h\"\n\n#include \"pango/pango-font.h\"\n#include \"pango/pango.h\"\n#include \"pango/pangocairo.h\"\n\n#include <string>\n#include <unordered_map>\n#include <utility>\n#include <vector>\n\nusing char32 = signed int;\n\nnamespace tesseract {\n\n// Data holder class for a font, intended to avoid having to work with Pango or\n// FontConfig-specific objects directly.\nclass TESS_PANGO_TRAINING_API PangoFontInfo {\npublic:\n  enum FontTypeEnum {\n    UNKNOWN,\n    SERIF,\n    SANS_SERIF,\n    DECORATIVE,\n  };\n  PangoFontInfo();\n  ~PangoFontInfo();\n  // Initialize from parsing a font description name, defined as a string of the\n  // format:\n  //   \"FamilyName [FaceName] [PointSize]\"\n  // where a missing FaceName implies the default regular face.\n  // eg. 
\"Arial Italic 12\", \"Verdana\"\n  //\n  // FaceName is a combination of:\n  //   [StyleName] [Variant] [Weight] [Stretch]\n  // with (all optional) Pango-defined values of:\n  // StyleName: Oblique, Italic\n  // Variant  : Small-Caps\n  // Weight   : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy\n  // Stretch  : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed,\n  //            Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded.\n  explicit PangoFontInfo(const std::string &name);\n  bool ParseFontDescriptionName(const std::string &name);\n\n  // Returns true if the font has codepoint coverage for the specified text.\n  bool CoversUTF8Text(const char *utf8_text, int byte_length) const;\n  // Modifies string to remove unicode points that are not covered by the\n  // font. Returns the number of characters dropped.\n  int DropUncoveredChars(std::string *utf8_text) const;\n\n  // Returns true if the entire string can be rendered by the font with full\n  // character coverage and no unknown glyph or dotted-circle glyph\n  // substitutions on encountering a badly formed unicode sequence.\n  // If true, returns individual graphemes. Any whitespace characters in the\n  // original string are also included in the list.\n  bool CanRenderString(const char *utf8_word, int len, std::vector<std::string> *graphemes) const;\n  bool CanRenderString(const char *utf8_word, int len) const;\n\n  // Retrieves the x_bearing and x_advance for the given utf8 character in the\n  // font. 
Returns false if the glyph for the character could not be found in\n  // the font.\n  // Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html\n  bool GetSpacingProperties(const std::string &utf8_char, int *x_bearing, int *x_advance) const;\n\n  // If not already initialized, initializes FontConfig by setting its\n  // environment variable and creating a fonts.conf file that points to the\n  // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.\n  static void SoftInitFontConfig();\n  // Re-initializes font config, whether or not already initialized.\n  // If already initialized, any existing cache is deleted, just to be sure.\n  static void HardInitFontConfig(const char *fonts_dir, const char *cache_dir);\n\n  // Accessors\n  std::string DescriptionName() const;\n  // Font Family name eg. \"Arial\"\n  const std::string &family_name() const {\n    return family_name_;\n  }\n  // Size in points (1/72\"), rounded to the nearest integer.\n  int font_size() const {\n    return font_size_;\n  }\n  FontTypeEnum font_type() const {\n    return font_type_;\n  }\n\n  int resolution() const {\n    return resolution_;\n  }\n  void set_resolution(const int resolution) {\n    resolution_ = resolution;\n  }\n\nprivate:\n  friend class FontUtils;\n  void Clear();\n  bool ParseFontDescription(const PangoFontDescription *desc);\n  // Returns the PangoFont structure corresponding to the closest available font\n  // in the font map.\n  PangoFont *ToPangoFont() const;\n\n  // Font properties set automatically from parsing the font description name.\n  std::string family_name_;\n  int font_size_;\n  FontTypeEnum font_type_;\n  // The Pango description that was used to initialize the instance.\n  PangoFontDescription *desc_;\n  // Default output resolution to assume for GetSpacingProperties() and any\n  // other methods that return pixel values.\n  int resolution_;\n  // Fontconfig operates through an environment variable, so it intrinsically\n  // cannot be 
thread-friendly, but you can serialize multiple independent\n  // font configurations by calling HardInitFontConfig(fonts_dir, cache_dir).\n  // These hold the last initialized values set by HardInitFontConfig or\n  // the first call to SoftInitFontConfig.\n  // Directory to be scanned for font files.\n  static std::string fonts_dir_;\n  // Directory to store the cache of font information. (Can be the same as\n  // fonts_dir_)\n  static std::string cache_dir_;\n\nprivate:\n  PangoFontInfo(const PangoFontInfo &) = delete;\n  void operator=(const PangoFontInfo &) = delete;\n};\n\n// Static utility methods for querying font availability and font-selection\n// based on codepoint coverage.\nclass TESS_PANGO_TRAINING_API FontUtils {\npublic:\n  // Returns true if the font of the given description name is available in the\n  // target directory specified by --fonts_dir\n  static bool IsAvailableFont(const char *font_desc) {\n    return IsAvailableFont(font_desc, nullptr);\n  }\n  // Returns true if the font of the given description name is available in the\n  // target directory specified by --fonts_dir. If false is returned, and\n  // best_match is not nullptr, the closest matching font is returned there.\n  static bool IsAvailableFont(const char *font_desc, std::string *best_match);\n  // Outputs description names of available fonts.\n  static const std::vector<std::string> &ListAvailableFonts();\n\n  // Picks font among available fonts that covers and can render the given word,\n  // and returns the font description name and the decomposition of the word to\n  // graphemes. Returns false if no suitable font was found.\n  static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name,\n                         std::vector<std::string> *graphemes);\n\n  // Picks font among all_fonts that covers and can render the given word,\n  // and returns the font description name and the decomposition of the word to\n  // graphemes. 
Returns false if no suitable font was found.\n  static bool SelectFont(const char *utf8_word, const int utf8_len,\n                         const std::vector<std::string> &all_fonts, std::string *font_name,\n                         std::vector<std::string> *graphemes);\n\n  // NOTE: The following utilities were written to be backward compatible with\n  // StringRender.\n\n  // BestFonts returns a font name and a bit vector of the characters it\n  // can render for the fonts that score within some fraction of the best\n  // font on the characters in the given hash map.\n  // In the flags vector, each flag is set according to whether the\n  // corresponding character (in order of iterating ch_map) can be rendered.\n  // The return string is a list of the acceptable fonts that were used.\n  static std::string BestFonts(const std::unordered_map<char32, int64_t> &ch_map,\n                               std::vector<std::pair<const char *, std::vector<bool>>> *font_flag);\n\n  // FontScore returns the weighted renderability score of the given\n  // hash map character table in the given font. The unweighted score\n  // is also returned in raw_score.\n  // The values in the bool vector ch_flags correspond to whether the\n  // corresponding character (in order of iterating ch_map) can be rendered.\n  static int FontScore(const std::unordered_map<char32, int64_t> &ch_map,\n                       const std::string &fontname, int *raw_score, std::vector<bool> *ch_flags);\n\n  // PangoFontInfo is reinitialized, so clear the static list of fonts.\n  static void ReInit();\n  static void PangoFontTypeInfo();\n\nprivate:\n  static std::vector<std::string> available_fonts_; // cache list\n};\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_PANGO_FONT_INFO_H_\n"
  },
  {
    "path": "src/training/pango/stringrenderer.cpp",
    "content": "/**********************************************************************\n * File:        stringrenderer.cpp\n * Description: Class for rendering UTF-8 text to an image, and retrieving\n *              bounding boxes around each grapheme cluster.\n * Author:      Ranjith Unnikrishnan\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#include \"stringrenderer.h\"\n\n#include <allheaders.h> // from leptonica\n#include \"boxchar.h\"\n#include \"helpers.h\" // for TRand\n#include \"ligature_table.h\"\n#include \"normstrngs.h\"\n#include \"tlog.h\"\n\n#include <tesseract/unichar.h>\n\n#include \"pango/pango-font.h\"\n#include \"pango/pango-glyph-item.h\"\n#include \"unicode/uchar.h\" // from libicu\n\n#include <algorithm>\n#include <cassert>\n#include <cstdio>\n#include <cstring>\n#include <map>\n#include <utility>\n#include <vector>\n\n#define DISABLE_HEAP_LEAK_CHECK\n\nnamespace tesseract {\n\nstatic const int kDefaultOutputResolution = 300;\n\n// Word joiner (U+2060) inserted after letters in ngram mode, as per\n// recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at\n// hyphens and other non-alpha characters.\nstatic const char *kWordJoinerUTF8 = \"\\u2060\";\n\nstatic bool IsCombiner(int ch) {\n  const int char_type = u_charType(ch);\n  return ((char_type == U_NON_SPACING_MARK) || (char_type == U_ENCLOSING_MARK) 
||\n          (char_type == U_COMBINING_SPACING_MARK));\n}\n\nstatic std::string EncodeAsUTF8(const char32 ch32) {\n  UNICHAR uni_ch(ch32);\n  return std::string(uni_ch.utf8(), uni_ch.utf8_len());\n}\n\n// Returns true with probability 'prob'.\nstatic bool RandBool(const double prob, TRand *rand) {\n  if (prob == 1.0) {\n    return true;\n  }\n  if (prob == 0.0) {\n    return false;\n  }\n  return rand->UnsignedRand(1.0) < prob;\n}\n\n/* static */\nstatic Image CairoARGB32ToPixFormat(cairo_surface_t *surface) {\n  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {\n    printf(\"Unexpected surface format %d\\n\", cairo_image_surface_get_format(surface));\n    return nullptr;\n  }\n  const int width = cairo_image_surface_get_width(surface);\n  const int height = cairo_image_surface_get_height(surface);\n  Image pix = pixCreate(width, height, 32);\n  int byte_stride = cairo_image_surface_get_stride(surface);\n\n  for (int i = 0; i < height; ++i) {\n    memcpy(reinterpret_cast<unsigned char *>(pixGetData(pix) + i * pixGetWpl(pix)) + 1,\n           cairo_image_surface_get_data(surface) + i * byte_stride,\n           byte_stride - ((i == height - 1) ? 
1 : 0));\n  }\n  return pix;\n}\n\nStringRenderer::StringRenderer(const std::string &font_desc, int page_width, int page_height)\n    : font_(font_desc)\n    , page_width_(page_width)\n    , page_height_(page_height)\n    , h_margin_(50)\n    , v_margin_(50)\n    , pen_color_{0.0, 0.0, 0.0}\n    , char_spacing_(0)\n    , leading_(0)\n    , vertical_text_(false)\n    , gravity_hint_strong_(false)\n    , render_fullwidth_latin_(false)\n    , underline_start_prob_(0)\n    , underline_continuation_prob_(0)\n    , underline_style_(PANGO_UNDERLINE_SINGLE)\n    , drop_uncovered_chars_(true)\n    , strip_unrenderable_words_(false)\n    , add_ligatures_(false)\n    , output_word_boxes_(false)\n    , surface_(nullptr)\n    , cr_(nullptr)\n    , layout_(nullptr)\n    , start_box_(0)\n    , page_(0)\n    , box_padding_(0)\n    , page_boxes_(nullptr)\n    , total_chars_(0)\n    , font_index_(0)\n    , last_offset_(0) {\n  set_resolution(kDefaultOutputResolution);\n  set_font(font_desc);\n}\n\nbool StringRenderer::set_font(const std::string &desc) {\n  bool success = font_.ParseFontDescriptionName(desc);\n  font_.set_resolution(resolution_);\n  return success;\n}\n\nvoid StringRenderer::set_resolution(const int resolution) {\n  resolution_ = resolution;\n  font_.set_resolution(resolution);\n}\n\nvoid StringRenderer::set_underline_start_prob(const double frac) {\n  underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0);\n}\n\nvoid StringRenderer::set_underline_continuation_prob(const double frac) {\n  underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0);\n}\n\nStringRenderer::~StringRenderer() {\n  ClearBoxes();\n  FreePangoCairo();\n}\n\nvoid StringRenderer::InitPangoCairo() {\n  FreePangoCairo();\n  surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_, page_height_);\n  cr_ = cairo_create(surface_);\n  {\n    DISABLE_HEAP_LEAK_CHECK;\n    layout_ = pango_cairo_create_layout(cr_);\n  }\n\n  if (vertical_text_) {\n    PangoContext *context = 
pango_layout_get_context(layout_);\n    pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);\n    if (gravity_hint_strong_) {\n      pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);\n    }\n    pango_layout_context_changed(layout_);\n  }\n\n  SetLayoutProperties();\n}\n\nvoid StringRenderer::SetLayoutProperties() {\n  std::string font_desc = font_.DescriptionName();\n  // Specify the font via a description name\n  PangoFontDescription *desc = pango_font_description_from_string(font_desc.c_str());\n  // Assign the font description to the layout\n  pango_layout_set_font_description(layout_, desc);\n  pango_font_description_free(desc); // free the description\n  pango_cairo_context_set_resolution(pango_layout_get_context(layout_), resolution_);\n\n  int max_width = page_width_ - 2 * h_margin_;\n  int max_height = page_height_ - 2 * v_margin_;\n  tlog(3, \"max_width = %d, max_height = %d\\n\", max_width, max_height);\n  if (vertical_text_) {\n    using std::swap;\n    swap(max_width, max_height);\n  }\n  pango_layout_set_width(layout_, max_width * PANGO_SCALE);\n  // Ultra-wide Thai strings need to wrap at char level.\n  pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);\n\n  // Adjust character spacing\n  PangoAttrList *attr_list = pango_attr_list_new();\n  if (char_spacing_) {\n    PangoAttribute *spacing_attr = pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);\n    spacing_attr->start_index = 0;\n    spacing_attr->end_index = static_cast<guint>(-1);\n    pango_attr_list_change(attr_list, spacing_attr);\n  }\n\n  if (add_ligatures_) {\n    set_features(\"liga, clig, dlig, hlig\");\n    PangoAttribute *feature_attr = pango_attr_font_features_new(features_.c_str());\n    pango_attr_list_change(attr_list, feature_attr);\n  }\n\n  pango_layout_set_attributes(layout_, attr_list);\n  pango_attr_list_unref(attr_list);\n  // Adjust line spacing\n  if (leading_) {\n    pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);\n  
}\n}\n\nvoid StringRenderer::FreePangoCairo() {\n  if (layout_) {\n    g_object_unref(layout_);\n    layout_ = nullptr;\n  }\n  if (cr_) {\n    cairo_destroy(cr_);\n    cr_ = nullptr;\n  }\n  if (surface_) {\n    cairo_surface_destroy(surface_);\n    surface_ = nullptr;\n  }\n}\n\nvoid StringRenderer::SetWordUnderlineAttributes(const std::string &page_text) {\n  if (underline_start_prob_ == 0) {\n    return;\n  }\n  PangoAttrList *attr_list = pango_layout_get_attributes(layout_);\n\n  const char *text = page_text.c_str();\n  size_t offset = 0;\n  TRand rand;\n  bool started_underline = false;\n  PangoAttribute *und_attr = nullptr;\n\n  while (offset < page_text.length()) {\n    offset += SpanUTF8Whitespace(text + offset);\n    if (offset == page_text.length()) {\n      break;\n    }\n\n    int word_start = offset;\n    int word_len = SpanUTF8NotWhitespace(text + offset);\n    offset += word_len;\n    if (started_underline) {\n      // Should we continue the underline to the next word?\n      if (RandBool(underline_continuation_prob_, &rand)) {\n        // Continue the current underline to this word.\n        und_attr->end_index = word_start + word_len;\n      } else {\n        // Otherwise end the current underline attribute at the end of the\n        // previous word.\n        pango_attr_list_insert(attr_list, und_attr);\n        started_underline = false;\n        und_attr = nullptr;\n      }\n    }\n    if (!started_underline && RandBool(underline_start_prob_, &rand)) {\n      // Start a new underline attribute\n      und_attr = pango_attr_underline_new(underline_style_);\n      und_attr->start_index = word_start;\n      und_attr->end_index = word_start + word_len;\n      started_underline = true;\n    }\n  }\n  // Finish the current underline attribute at the end of the page.\n  if (started_underline) {\n    und_attr->end_index = page_text.length();\n    pango_attr_list_insert(attr_list, und_attr);\n  }\n}\n\n// Returns offset in utf8 bytes to first page.\nint 
StringRenderer::FindFirstPageBreakOffset(const char *text, int text_length) {\n  if (!text_length) {\n    return 0;\n  }\n  const int max_height = (page_height_ - 2 * v_margin_);\n  const int max_width = (page_width_ - 2 * h_margin_);\n  const int max_layout_height = vertical_text_ ? max_width : max_height;\n\n  UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);\n  const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);\n  const int kMaxUnicodeBufLength = 15000;\n  for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i) {\n    ;\n  }\n  int buf_length = it.utf8_data() - text;\n  tlog(1, \"len = %d  buf_len = %d\\n\", text_length, buf_length);\n  pango_layout_set_text(layout_, text, buf_length);\n\n  PangoLayoutIter *line_iter = nullptr;\n  { // Fontconfig caches some info here that is not freed before exit.\n    DISABLE_HEAP_LEAK_CHECK;\n    line_iter = pango_layout_get_iter(layout_);\n  }\n  bool first_page = true;\n  int page_top = 0;\n  int offset = buf_length;\n  do {\n    // Get bounding box of the current line\n    PangoRectangle line_ink_rect;\n    pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr);\n    pango_extents_to_pixels(&line_ink_rect, nullptr);\n    PangoLayoutLine *line = pango_layout_iter_get_line_readonly(line_iter);\n    if (first_page) {\n      page_top = line_ink_rect.y;\n      first_page = false;\n    }\n    int line_bottom = line_ink_rect.y + line_ink_rect.height;\n    if (line_bottom - page_top > max_layout_height) {\n      offset = line->start_index;\n      tlog(1, \"Found offset = %d\\n\", offset);\n      break;\n    }\n  } while (pango_layout_iter_next_line(line_iter));\n  pango_layout_iter_free(line_iter);\n  return offset;\n}\n\nconst std::vector<BoxChar *> &StringRenderer::GetBoxes() const {\n  return boxchars_;\n}\n\nBoxa *StringRenderer::GetPageBoxes() const {\n  return page_boxes_;\n}\n\nvoid StringRenderer::RotatePageBoxes(float rotation) {\n  
BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2, start_box_, boxchars_.size(),\n                       &boxchars_);\n}\n\nvoid StringRenderer::ClearBoxes() {\n  for (auto &boxchar : boxchars_) {\n    delete boxchar;\n  }\n  boxchars_.clear();\n  boxaDestroy(&page_boxes_);\n}\n\nstd::string StringRenderer::GetBoxesStr() {\n  BoxChar::PrepareToWrite(&boxchars_);\n  return BoxChar::GetTesseractBoxStr(page_height_, boxchars_);\n}\n\nvoid StringRenderer::WriteAllBoxes(const std::string &filename) {\n  BoxChar::PrepareToWrite(&boxchars_);\n  BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);\n}\n\n// Returns cluster strings in logical order.\nbool StringRenderer::GetClusterStrings(std::vector<std::string> *cluster_text) {\n  std::map<int, std::string> start_byte_to_text;\n  PangoLayoutIter *run_iter = pango_layout_get_iter(layout_);\n  const char *full_text = pango_layout_get_text(layout_);\n  do {\n    PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter);\n    if (!run) {\n      // End of line nullptr run marker\n      tlog(2, \"Found end of line marker\\n\");\n      continue;\n    }\n    PangoGlyphItemIter cluster_iter;\n    gboolean have_cluster;\n    for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, run, full_text);\n         have_cluster; have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {\n      const int start_byte_index = cluster_iter.start_index;\n      const int end_byte_index = cluster_iter.end_index;\n      std::string text =\n          std::string(full_text + start_byte_index, end_byte_index - start_byte_index);\n      if (IsUTF8Whitespace(text.c_str())) {\n        tlog(2, \"Found whitespace\\n\");\n        text = \" \";\n      }\n      tlog(2, \"start_byte=%d end_byte=%d : '%s'\\n\", start_byte_index, end_byte_index, text.c_str());\n      if (add_ligatures_) {\n        // Make sure the output box files have ligatured text in case the font\n        // decided to use an unmapped 
glyph.\n        text = LigatureTable::Get()->AddLigatures(text, nullptr);\n      }\n      start_byte_to_text[start_byte_index] = std::move(text);\n    }\n  } while (pango_layout_iter_next_run(run_iter));\n  pango_layout_iter_free(run_iter);\n\n  cluster_text->clear();\n  for (auto it = start_byte_to_text.begin(); it != start_byte_to_text.end(); ++it) {\n    cluster_text->push_back(it->second);\n  }\n  return !cluster_text->empty();\n}\n\n// Merges an array of BoxChars into words based on the identification of\n// BoxChars containing the space character as inter-word separators.\n//\n// Sometimes two adjacent characters in the sequence may be detected as lying on\n// different lines based on their spatial positions. This may be the result of a\n// newline character at end of the last word on a line in the source text, or of\n// a discretionary line-break created by Pango at intra-word locations like\n// hyphens. When this is detected the word is split at that location into\n// multiple BoxChars. 
Otherwise, each resulting BoxChar will contain a word and\n// its bounding box.\nstatic void MergeBoxCharsToWords(std::vector<BoxChar *> *boxchars) {\n  std::vector<BoxChar *> result;\n  bool started_word = false;\n  for (auto &boxchar : *boxchars) {\n    if (boxchar->ch() == \" \" || boxchar->box() == nullptr) {\n      result.push_back(boxchar);\n      boxchar = nullptr;\n      started_word = false;\n      continue;\n    }\n\n    if (!started_word) {\n      // Begin new word\n      started_word = true;\n      result.push_back(boxchar);\n      boxchar = nullptr;\n    } else {\n      BoxChar *last_boxchar = result.back();\n      // Compute bounding box union\n      const Box *box = boxchar->box();\n      Box *last_box = last_boxchar->mutable_box();\n      int left = std::min(last_box->x, box->x);\n      int right = std::max(last_box->x + last_box->w, box->x + box->w);\n      int top = std::min(last_box->y, box->y);\n      int bottom = std::max(last_box->y + last_box->h, box->y + box->h);\n      // Conclude that the word was broken to span multiple lines based on the\n      // size of the merged bounding box in relation to those of the individual\n      // characters seen so far.\n      if (right - left > last_box->w + 5 * box->w) {\n        tlog(1, \"Found line break after '%s'\", last_boxchar->ch().c_str());\n        // Insert a fake interword space and start a new word with the current\n        // boxchar.\n        result.push_back(new BoxChar(\" \", 1));\n        result.push_back(boxchar);\n        boxchar = nullptr;\n        continue;\n      }\n      // Append to last word\n      last_boxchar->mutable_ch()->append(boxchar->ch());\n      last_box->x = left;\n      last_box->w = right - left;\n      last_box->y = top;\n      last_box->h = bottom - top;\n      delete boxchar;\n      boxchar = nullptr;\n    }\n  }\n  boxchars->swap(result);\n}\n\nvoid StringRenderer::ComputeClusterBoxes() {\n  const char *text = pango_layout_get_text(layout_);\n  PangoLayoutIter 
*cluster_iter = pango_layout_get_iter(layout_);\n\n  // Do a first pass to store cluster start indexes.\n  std::vector<int> cluster_start_indices;\n  do {\n    cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));\n    tlog(3, \"Added %d\\n\", cluster_start_indices.back());\n  } while (pango_layout_iter_next_cluster(cluster_iter));\n  pango_layout_iter_free(cluster_iter);\n  cluster_start_indices.push_back(strlen(text));\n  tlog(3, \"Added last index %d\\n\", cluster_start_indices.back());\n  // Sort the indices and create a map from start to end indices.\n  std::sort(cluster_start_indices.begin(), cluster_start_indices.end());\n  std::map<int, int> cluster_start_to_end_index;\n  for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {\n    cluster_start_to_end_index[cluster_start_indices[i]] = cluster_start_indices[i + 1];\n  }\n\n  // Iterate again to compute cluster boxes and their text with the obtained\n  // cluster extent information.\n  cluster_iter = pango_layout_get_iter(layout_);\n  // Store BoxChars* sorted by their byte start positions\n  std::map<int, BoxChar *> start_byte_to_box;\n  do {\n    PangoRectangle cluster_rect;\n    pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr);\n    pango_extents_to_pixels(&cluster_rect, nullptr);\n    const int start_byte_index = pango_layout_iter_get_index(cluster_iter);\n    const int end_byte_index = cluster_start_to_end_index[start_byte_index];\n    std::string cluster_text =\n        std::string(text + start_byte_index, end_byte_index - start_byte_index);\n    if (!cluster_text.empty() && cluster_text[0] == '\\n') {\n      tlog(2, \"Skipping newlines at start of text.\\n\");\n      continue;\n    }\n    if (!cluster_rect.width || !cluster_rect.height || IsUTF8Whitespace(cluster_text.c_str())) {\n      tlog(2, \"Skipping whitespace with boxdim (%d,%d) '%s'\\n\", cluster_rect.width,\n           cluster_rect.height, cluster_text.c_str());\n      auto *boxchar = 
new BoxChar(\" \", 1);\n      boxchar->set_page(page_);\n      start_byte_to_box[start_byte_index] = boxchar;\n      continue;\n    }\n    // Prepare a boxchar for addition at this byte position.\n    tlog(2, \"[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\\n\", cluster_rect.x, cluster_rect.y,\n         cluster_rect.width, cluster_rect.height, start_byte_index, end_byte_index,\n         cluster_text.c_str());\n    ASSERT_HOST_MSG(cluster_rect.width, \"cluster_text:%s  start_byte_index:%d\\n\",\n                    cluster_text.c_str(), start_byte_index);\n    ASSERT_HOST_MSG(cluster_rect.height, \"cluster_text:%s  start_byte_index:%d\\n\",\n                    cluster_text.c_str(), start_byte_index);\n    if (box_padding_) {\n      cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);\n      cluster_rect.width += 2 * box_padding_;\n      cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);\n      cluster_rect.height += 2 * box_padding_;\n    }\n    if (add_ligatures_) {\n      // Make sure the output box files have ligatured text in case the font\n      // decided to use an unmapped glyph.\n      cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr);\n    }\n    auto *boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());\n    boxchar->set_page(page_);\n    boxchar->AddBox(cluster_rect.x, cluster_rect.y, cluster_rect.width, cluster_rect.height);\n    start_byte_to_box[start_byte_index] = boxchar;\n  } while (pango_layout_iter_next_cluster(cluster_iter));\n  pango_layout_iter_free(cluster_iter);\n\n  // There is a subtle bug in the cluster text reported by the PangoLayoutIter\n  // on ligatured characters (eg. The word \"Lam-Aliph\" in arabic). 
To work\n  // around this, we use text reported using the PangoGlyphIter which is\n  // accurate.\n  // TODO(ranjith): Revisit whether this is still needed in newer versions of\n  // pango.\n  std::vector<std::string> cluster_text;\n  if (GetClusterStrings(&cluster_text)) {\n    ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());\n    int ind = 0;\n    for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it, ++ind) {\n      it->second->mutable_ch()->swap(cluster_text[ind]);\n    }\n  }\n\n  // Append to the boxchars list in byte order.\n  std::vector<BoxChar *> page_boxchars;\n  page_boxchars.reserve(start_byte_to_box.size());\n  std::string last_ch;\n  for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) {\n    if (it->second->ch() == kWordJoinerUTF8) {\n      // Skip zero-width joiner characters (ZWJs) here.\n      delete it->second;\n    } else {\n      page_boxchars.push_back(it->second);\n    }\n  }\n  CorrectBoxPositionsToLayout(&page_boxchars);\n\n  if (render_fullwidth_latin_) {\n    for (auto &it : start_byte_to_box) {\n      // Convert fullwidth Latin characters to their halfwidth forms.\n      std::string half(ConvertFullwidthLatinToBasicLatin(it.second->ch()));\n      it.second->mutable_ch()->swap(half);\n    }\n  }\n\n  // Merge the character boxes into word boxes if we are rendering n-grams.\n  if (output_word_boxes_) {\n    MergeBoxCharsToWords(&page_boxchars);\n  }\n\n  boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());\n\n  // Compute the page bounding box\n  Box *page_box = nullptr;\n  Boxa *all_boxes = nullptr;\n  for (auto &page_boxchar : page_boxchars) {\n    if (page_boxchar->box() == nullptr) {\n      continue;\n    }\n    if (all_boxes == nullptr) {\n      all_boxes = boxaCreate(0);\n    }\n    boxaAddBox(all_boxes, page_boxchar->mutable_box(), L_CLONE);\n  }\n  if (all_boxes != nullptr) {\n    boxaGetExtent(all_boxes, nullptr, nullptr, &page_box);\n    
boxaDestroy(&all_boxes);\n    if (page_boxes_ == nullptr) {\n      page_boxes_ = boxaCreate(0);\n    }\n    boxaAddBox(page_boxes_, page_box, L_INSERT);\n  }\n}\n\nvoid StringRenderer::CorrectBoxPositionsToLayout(std::vector<BoxChar *> *boxchars) {\n  if (vertical_text_) {\n    const double rotation = -pango_gravity_to_rotation(\n        pango_context_get_base_gravity(pango_layout_get_context(layout_)));\n    BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars);\n    BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_, 0, boxchars->size(),\n                         boxchars);\n  } else {\n    BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);\n  }\n}\n\nint StringRenderer::StripUnrenderableWords(std::string *utf8_text) const {\n  std::string output_text;\n  std::string unrenderable_words;\n  const char *text = utf8_text->c_str();\n  size_t offset = 0;\n  int num_dropped = 0;\n  while (offset < utf8_text->length()) {\n    int space_len = SpanUTF8Whitespace(text + offset);\n    output_text.append(text + offset, space_len);\n    offset += space_len;\n    if (offset == utf8_text->length()) {\n      break;\n    }\n\n    int word_len = SpanUTF8NotWhitespace(text + offset);\n    if (font_.CanRenderString(text + offset, word_len)) {\n      output_text.append(text + offset, word_len);\n    } else {\n      ++num_dropped;\n      unrenderable_words.append(text + offset, word_len);\n      unrenderable_words.append(\" \");\n    }\n    offset += word_len;\n  }\n  utf8_text->swap(output_text);\n\n  if (num_dropped > 0) {\n    tprintf(\"Stripped %d unrenderable word(s): '%s'\\n\", num_dropped, unrenderable_words.c_str());\n  }\n  return num_dropped;\n}\n\nint StringRenderer::RenderToGrayscaleImage(const char *text, int text_length, Image *pix) {\n  Image orig_pix = nullptr;\n  int offset = RenderToImage(text, text_length, &orig_pix);\n  if (orig_pix) {\n    *pix = pixConvertTo8(orig_pix, false);\n    orig_pix.destroy();\n  }\n  return 
offset;\n}\n\nint StringRenderer::RenderToBinaryImage(const char *text, int text_length, int threshold,\n                                        Image *pix) {\n  Image orig_pix = nullptr;\n  int offset = RenderToImage(text, text_length, &orig_pix);\n  if (orig_pix) {\n    Image gray_pix = pixConvertTo8(orig_pix, false);\n    orig_pix.destroy();\n    *pix = pixThresholdToBinary(gray_pix, threshold);\n    gray_pix.destroy();\n  } else {\n    *pix = orig_pix;\n  }\n  return offset;\n}\n\n// Add word joiner (WJ) characters between adjacent non-space characters except\n// immediately before a combiner.\n/* static */\nstd::string StringRenderer::InsertWordJoiners(const std::string &text) {\n  std::string out_str;\n  const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(), text.length());\n  for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length()); it != it_end;\n       ++it) {\n    // Add the symbol to the output string.\n    out_str.append(it.utf8_data(), it.utf8_len());\n    // Check the next symbol.\n    UNICHAR::const_iterator next_it = it;\n    ++next_it;\n    bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');\n    bool next_char_is_combiner = (next_it == it_end) ? 
false : IsCombiner(*next_it);\n    if (*it != ' ' && *it != '\\n' && !next_char_is_boundary && !next_char_is_combiner) {\n      out_str += kWordJoinerUTF8;\n    }\n  }\n  return out_str;\n}\n\n// Convert halfwidth Basic Latin characters to their fullwidth forms.\nstd::string StringRenderer::ConvertBasicLatinToFullwidthLatin(const std::string &str) {\n  std::string full_str;\n  const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());\n  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) {\n    // Convert printable and non-space 7-bit ASCII characters to\n    // their fullwidth forms.\n    if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {\n      // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.\n      char32 full_char = *it + 0xFEE0;\n      full_str.append(EncodeAsUTF8(full_char));\n    } else {\n      full_str.append(it.utf8_data(), it.utf8_len());\n    }\n  }\n  return full_str;\n}\n\n// Convert fullwidth Latin characters to their halfwidth forms.\nstd::string StringRenderer::ConvertFullwidthLatinToBasicLatin(const std::string &str) {\n  std::string half_str;\n  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());\n  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) {\n    char32 half_char = FullwidthToHalfwidth(*it);\n    // Convert fullwidth Latin characters to their halfwidth forms\n    // only if halfwidth forms are printable and non-space 7-bit ASCII.\n    if (IsInterchangeValid7BitAscii(half_char) && isprint(half_char) && !isspace(half_char)) {\n      half_str.append(EncodeAsUTF8(half_char));\n    } else {\n      half_str.append(it.utf8_data(), it.utf8_len());\n    }\n  }\n  return half_str;\n}\n\n// Returns offset to end of text substring rendered in this method.\nint StringRenderer::RenderToImage(const char *text, int text_length, Image *pix) {\n  if (pix && *pix) {\n    pix->destroy();\n  
}\n  InitPangoCairo();\n\n  const int page_offset = FindFirstPageBreakOffset(text, text_length);\n  if (!page_offset) {\n    return 0;\n  }\n  start_box_ = boxchars_.size();\n\n  if (!vertical_text_) {\n    // Translate by the specified margin\n    cairo_translate(cr_, h_margin_, v_margin_);\n  } else {\n    // Vertical text rendering is achieved by a two-step process of first\n    // performing regular horizontal layout with character orientation set to\n    // EAST, and then translating and rotating the layout before rendering onto\n    // the desired image surface. The settings required for the former step are\n    // done within InitPangoCairo().\n    //\n    // Translate to the top-right margin of page\n    cairo_translate(cr_, page_width_ - h_margin_, v_margin_);\n    // Rotate the layout\n    double rotation = -pango_gravity_to_rotation(\n        pango_context_get_base_gravity(pango_layout_get_context(layout_)));\n    tlog(2, \"Rotating by %f radians\\n\", rotation);\n    cairo_rotate(cr_, rotation);\n    pango_cairo_update_layout(cr_, layout_);\n  }\n  std::string page_text(text, page_offset);\n  if (render_fullwidth_latin_) {\n    // Convert Basic Latin to their fullwidth forms.\n    page_text = ConvertBasicLatinToFullwidthLatin(page_text);\n  }\n  if (strip_unrenderable_words_) {\n    StripUnrenderableWords(&page_text);\n  }\n  if (drop_uncovered_chars_ && !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {\n    int num_dropped = font_.DropUncoveredChars(&page_text);\n    if (num_dropped) {\n      tprintf(\"WARNING: Dropped %d uncovered characters\\n\", num_dropped);\n    }\n  }\n  if (add_ligatures_) {\n    // Add ligatures wherever possible, including custom ligatures.\n    page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);\n  }\n  if (underline_start_prob_ > 0) {\n    SetWordUnderlineAttributes(page_text);\n  }\n\n  pango_layout_set_text(layout_, page_text.c_str(), page_text.length());\n\n  if (pix) {\n    // Set a white 
background for the target image surface.\n    cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white\n    // Fill the surface with the active colour (if you don't do this, you will\n    // be given a surface with a transparent background to draw on)\n    cairo_paint(cr_);\n    // Set the ink color to black\n    cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);\n    // If the target surface or transformation properties of the cairo instance\n    // have changed, update the pango layout to reflect this\n    pango_cairo_update_layout(cr_, layout_);\n    {\n      DISABLE_HEAP_LEAK_CHECK; // for Fontconfig\n      // Draw the pango layout onto the cairo surface\n      pango_cairo_show_layout(cr_, layout_);\n    }\n    *pix = CairoARGB32ToPixFormat(surface_);\n  }\n  ComputeClusterBoxes();\n  FreePangoCairo();\n  // Update internal state variables.\n  ++page_;\n  return page_offset;\n}\n\n// Render a string to an image, returning it as an 8 bit pix.  Behaves as\n// RenderString, except that it ignores the font set at construction and works\n// through all the fonts, returning 0 until they are exhausted, at which point\n// it returns the value it should have returned all along, but no pix this time.\n// Fonts that don't contain a given proportion of the characters in the string\n// get skipped.\n// Fonts that work each get rendered and the font name gets added\n// to the image.\n// NOTE that no boxes are produced by this function.\n//\n// Example usage: To render a null terminated char-array \"txt\"\n//\n// int offset = 0;\n// do {\n//   Image pix;\n//   offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,\n//                                            strlen(txt + offset), nullptr,\n//                                            &pix);\n//   ...\n// } while (offset < strlen(text));\n//\nint StringRenderer::RenderAllFontsToImage(double min_coverage, const char *text, int text_length,\n                                
          std::string *font_used, Image *image) {\n  *image = nullptr;\n  // Select a suitable font to render the title with.\n  const char kTitleTemplate[] = \"%s : %d hits = %.2f%%, raw = %d = %.2f%%\";\n  std::string title_font;\n  if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate), &title_font, nullptr)) {\n    tprintf(\"WARNING: Could not find a font to render image title with!\\n\");\n    title_font = \"Arial\";\n  }\n  title_font += \" 8\";\n  tlog(1, \"Selected title font: %s\\n\", title_font.c_str());\n  if (font_used) {\n    font_used->clear();\n  }\n\n  std::string orig_font = font_.DescriptionName();\n  if (char_map_.empty()) {\n    total_chars_ = 0;\n    // Fill the hash table and use that for computing which fonts to use.\n    for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);\n         it != UNICHAR::end(text, text_length); ++it) {\n      ++total_chars_;\n      ++char_map_[*it];\n    }\n    tprintf(\"Total chars = %d\\n\", total_chars_);\n  }\n  const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts();\n\n  for (size_t i = font_index_; i < all_fonts.size(); ++i) {\n    ++font_index_;\n    int raw_score = 0;\n    int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr);\n    if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {\n      set_font(all_fonts[i]);\n      int offset = RenderToBinaryImage(text, text_length, 128, image);\n      ClearBoxes(); // Get rid of them as they are garbage.\n      const int kMaxTitleLength = 1024;\n      char title[kMaxTitleLength];\n      snprintf(title, kMaxTitleLength, kTitleTemplate, all_fonts[i].c_str(), ok_chars,\n               100.0 * ok_chars / total_chars_, raw_score, 100.0 * raw_score / char_map_.size());\n      tprintf(\"%s\\n\", title);\n      // This is a good font! 
Store the offset to return once we've tried all\n      // the fonts.\n      if (offset) {\n        last_offset_ = offset;\n        if (font_used) {\n          *font_used = all_fonts[i];\n        }\n      }\n      // Add the font to the image.\n      set_font(title_font);\n      v_margin_ /= 8;\n      Image title_image = nullptr;\n      RenderToBinaryImage(title, strlen(title), 128, &title_image);\n      *image |= title_image;\n      title_image.destroy();\n\n      v_margin_ *= 8;\n      set_font(orig_font);\n      // We return the real offset only after cycling through the list of fonts.\n      return 0;\n    } else {\n      tprintf(\"Font %s failed with %d hits = %.2f%%\\n\", all_fonts[i].c_str(), ok_chars,\n              100.0 * ok_chars / total_chars_);\n    }\n  }\n  font_index_ = 0;\n  char_map_.clear();\n  return last_offset_ == 0 ? -1 : last_offset_;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/pango/stringrenderer.h",
    "content": "/**********************************************************************\n * File:        stringrenderer.h\n * Description: Class for rendering UTF-8 text to an image, and retrieving\n *              bounding boxes around each grapheme cluster.\n *\n *              Instances are created using a font description string\n *              (eg. \"Arial Italic 12\"; see pango_font_info.h for the format)\n *              and the page dimensions. Other renderer properties such as\n *              spacing, ligaturization, as well a preprocessing behavior such\n *              as removal of unrenderable words and a special n-gram mode may\n *              be set using respective set_* methods.\n *\n * Author:      Ranjith Unnikrishnan\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_TRAINING_STRINGRENDERER_H_\n#define TESSERACT_TRAINING_STRINGRENDERER_H_\n\n#include \"export.h\"\n\n#include \"pango/pango-layout.h\"\n#include \"pango/pangocairo.h\"\n#include \"pango_font_info.h\"\n\n#include \"image.h\"\n\n#include <string>\n#include <unordered_map>\n#include <vector>\n\nstruct Boxa;\nstruct Pix;\n\nnamespace tesseract {\n\nclass BoxChar;\n\nclass TESS_PANGO_TRAINING_API StringRenderer {\npublic:\n  StringRenderer(const std::string &font_desc, int page_width, int page_height);\n  ~StringRenderer();\n\n  // Renders the text 
with the chosen font and returns the byte offset up to\n  // which the text could be rendered so as to fit the specified page\n  // dimensions.\n  int RenderToImage(const char *text, int text_length, Image *pix);\n  int RenderToGrayscaleImage(const char *text, int text_length, Image *pix);\n  int RenderToBinaryImage(const char *text, int text_length, int threshold, Image *pix);\n  // Renders a line of text with all available fonts that were able to render\n  // at least min_coverage fraction of the input text. Use 1.0 to require that\n  // a font be able to render all the text.\n  int RenderAllFontsToImage(double min_coverage, const char *text, int text_length,\n                            std::string *font_used, Image *pix);\n\n  bool set_font(const std::string &desc);\n  // Char spacing is in PIXELS!!!!.\n  void set_char_spacing(int char_spacing) {\n    char_spacing_ = char_spacing;\n  }\n  void set_leading(int leading) {\n    leading_ = leading;\n  }\n  void set_resolution(const int resolution);\n  void set_vertical_text(bool vertical_text) {\n    vertical_text_ = vertical_text;\n  }\n  void set_gravity_hint_strong(bool gravity_hint_strong) {\n    gravity_hint_strong_ = gravity_hint_strong;\n  }\n  void set_render_fullwidth_latin(bool render_fullwidth_latin) {\n    render_fullwidth_latin_ = render_fullwidth_latin;\n  }\n  // Sets the probability (value in [0, 1]) of starting to render a word with an\n  // underline. 
This implementation consider words to be space-delimited\n  // sequences of characters.\n  void set_underline_start_prob(const double frac);\n  // Set the probability (value in [0, 1]) of continuing a started underline to\n  // the next word.\n  void set_underline_continuation_prob(const double frac);\n  void set_underline_style(const PangoUnderline style) {\n    underline_style_ = style;\n  }\n  void set_features(const char *features) {\n    features_ = features;\n  }\n  void set_page(int page) {\n    page_ = page;\n  }\n  void set_box_padding(int val) {\n    box_padding_ = val;\n  }\n  void set_drop_uncovered_chars(bool val) {\n    drop_uncovered_chars_ = val;\n  }\n  void set_strip_unrenderable_words(bool val) {\n    strip_unrenderable_words_ = val;\n  }\n  void set_output_word_boxes(bool val) {\n    output_word_boxes_ = val;\n  }\n  // Before rendering the string, replace latin characters with their optional\n  // ligatured forms (such as \"fi\", \"ffi\" etc.) if the font_ covers those\n  // unicodes.\n  void set_add_ligatures(bool add_ligatures) {\n    add_ligatures_ = add_ligatures;\n  }\n  // Set the rgb value of the text ink. 
Values range in [0, 1.0]\n  void set_pen_color(double r, double g, double b) {\n    pen_color_[0] = r;\n    pen_color_[1] = g;\n    pen_color_[2] = b;\n  }\n  void set_h_margin(const int h_margin) {\n    h_margin_ = h_margin;\n  }\n  void set_v_margin(const int v_margin) {\n    v_margin_ = v_margin;\n  }\n  const PangoFontInfo &font() const {\n    return font_;\n  }\n  int h_margin() const {\n    return h_margin_;\n  }\n  int v_margin() const {\n    return v_margin_;\n  }\n\n  // Get the boxchars of all clusters rendered thus far (or since the last call\n  // to ClearBoxes()).\n  const std::vector<BoxChar *> &GetBoxes() const;\n  // Get the rendered page bounding boxes of all pages created thus far (or\n  // since last call to ClearBoxes()).\n  Boxa *GetPageBoxes() const;\n\n  // Rotate the boxes on the most recent page by the given rotation.\n  void RotatePageBoxes(float rotation);\n  // Delete all boxes.\n  void ClearBoxes();\n  // Returns the boxes in a boxfile string.\n  std::string GetBoxesStr();\n  // Writes the boxes to a boxfile.\n  void WriteAllBoxes(const std::string &filename);\n  // Removes space-delimited words from the string that are not renderable by\n  // the current font and returns the count of such words.\n  int StripUnrenderableWords(std::string *utf8_text) const;\n\n  // Insert a Word Joiner symbol (U+2060) between adjacent characters, excluding\n  // spaces and combining types, in each word before rendering to ensure words\n  // are not broken across lines. 
The output boxchars will not contain the\n  // joiner.\n  static std::string InsertWordJoiners(const std::string &text);\n\n  // Helper functions to convert fullwidth Latin and halfwidth Basic Latin.\n  static std::string ConvertBasicLatinToFullwidthLatin(const std::string &text);\n  static std::string ConvertFullwidthLatinToBasicLatin(const std::string &text);\n\nprotected:\n  // Init and free local renderer objects.\n  void InitPangoCairo();\n  void FreePangoCairo();\n  // Set rendering properties.\n  void SetLayoutProperties();\n  void SetWordUnderlineAttributes(const std::string &page_text);\n  // Compute bounding boxes around grapheme clusters.\n  void ComputeClusterBoxes();\n  void CorrectBoxPositionsToLayout(std::vector<BoxChar *> *boxchars);\n  bool GetClusterStrings(std::vector<std::string> *cluster_text);\n  int FindFirstPageBreakOffset(const char *text, int text_length);\n\n  PangoFontInfo font_;\n  // Page properties\n  int page_width_, page_height_, h_margin_, v_margin_;\n  // Text rendering properties\n  double pen_color_[3];\n  int char_spacing_;\n  int leading_, resolution_;\n  bool vertical_text_;\n  bool gravity_hint_strong_;\n  bool render_fullwidth_latin_;\n  double underline_start_prob_;\n  double underline_continuation_prob_;\n  PangoUnderline underline_style_;\n  std::string features_;\n  // Text filtering options\n  bool drop_uncovered_chars_;\n  bool strip_unrenderable_words_;\n  bool add_ligatures_;\n  bool output_word_boxes_;\n  // Pango and cairo specific objects\n  cairo_surface_t *surface_;\n  cairo_t *cr_;\n  PangoLayout *layout_;\n  // Internal state of current page number, updated on successive calls to\n  // RenderToImage()\n  int start_box_;\n  int page_;\n  // Boxes and associated text for all pages rendered with RenderToImage() since\n  // the last call to ClearBoxes().\n  std::vector<BoxChar *> boxchars_;\n  int box_padding_;\n  // Bounding boxes for pages since the last call to ClearBoxes().\n  Boxa *page_boxes_;\n\n  // 
Objects cached for subsequent calls to RenderAllFontsToImage()\n  std::unordered_map<char32, int64_t> char_map_; // Time-saving char histogram.\n  int total_chars_;                              // Number in the string to be rendered.\n  unsigned int font_index_;                      // Index of next font to use in font list.\n  int last_offset_;                              // Offset returned from last successful rendering\n\nprivate:\n  StringRenderer(const StringRenderer &) = delete;\n  void operator=(const StringRenderer &) = delete;\n};\n} // namespace tesseract\n\n#endif // THIRD_PARTY_TESSERACT_TRAINING_STRINGRENDERER_H_\n"
  },
  {
    "path": "src/training/pango/tlog.cpp",
    "content": "/**********************************************************************\n * File:        tlog.cpp\n * Description: Variant of printf with logging level controllable by a\n *              commandline flag.\n * Author:      Ranjith Unnikrishnan\n * Created:     Wed Nov 20 2013\n *\n * (C) Copyright 2013, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include \"tlog.h\"\n\nusing namespace tesseract;\n\nINT_PARAM_FLAG(tlog_level, 0, \"Minimum logging level for tlog() output\");\n"
  },
  {
    "path": "src/training/pango/tlog.h",
    "content": "/**********************************************************************\n * File:        tlog.h\n * Description: Variant of printf with logging level controllable by a\n *              commandline flag.\n * Author:      Ranjith Unnikrishnan\n * Created:     Wed Nov 20 2013\n *\n * (C) Copyright 2013, Google Inc.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n#ifndef TESSERACT_TRAINING_TLOG_H_\n#define TESSERACT_TRAINING_TLOG_H_\n\n#include \"export.h\"\n\n#include \"commandlineflags.h\"\n#include \"errcode.h\"\n#include \"tprintf.h\"\n\nTESS_PANGO_TRAINING_API\nDECLARE_INT_PARAM_FLAG(tlog_level);\n\n// Variant guarded by the numeric logging level parameter FLAGS_tlog_level\n// (default 0).  Code using ParseCommandLineFlags() can control its value using\n// the --tlog_level commandline argument. Otherwise it must be specified in a\n// config file like other params.\n#define tlog(level, ...)             \\\n  {                                  \\\n    if (FLAGS_tlog_level >= level) { \\\n      tprintf(__VA_ARGS__);          \\\n    }                                \\\n  }\n\n#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level)\n\n#endif // TESSERACT_TRAINING_TLOG_H_\n"
  },
  {
    "path": "src/training/set_unicharset_properties.cpp",
    "content": "// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n// This program reads a unicharset file, puts the result in a UNICHARSET\n// object, fills it with properties about the unichars it contains and writes\n// the result back to a file.\n\n#include \"commandlineflags.h\"\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"tprintf.h\"\n#include \"unicharset_training_utils.h\"\n\nusing namespace tesseract;\n\n// The directory that is searched for universal script unicharsets.\nstatic STRING_PARAM_FLAG(script_dir, \"\", \"Directory name for input script unicharsets/xheights\");\n\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);\n\n  // Check validity of input flags.\n  if (FLAGS_U.empty() || FLAGS_O.empty()) {\n    tprintf(\"Specify both input and output unicharsets!\\n\");\n    return EXIT_FAILURE;\n  }\n  if (FLAGS_script_dir.empty()) {\n    tprintf(\"Must specify a script_dir!\\n\");\n    return EXIT_FAILURE;\n  }\n\n  tesseract::SetPropertiesForInputFile(FLAGS_script_dir.c_str(), FLAGS_U.c_str(), FLAGS_O.c_str(),\n                                       FLAGS_X.c_str());\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "src/training/shapeclustering.cpp",
    "content": "// Copyright 2011 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n//  Filename: shapeclustering.cpp\n//  Purpose:  Generates a master shape table to merge similarly-shaped\n//            training data of whole, partial or multiple characters.\n//  Author:   Ray Smith\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"commontraining.h\"\n#include \"mastertrainer.h\"\n#include \"params.h\"\n\nusing namespace tesseract;\n\nstatic INT_PARAM_FLAG(display_cloud_font, -1, \"Display cloud of this font, canonical_class1\");\nstatic INT_PARAM_FLAG(display_canonical_font, -1,\n                      \"Display canonical sample of this font, canonical_class2\");\nstatic STRING_PARAM_FLAG(canonical_class1, \"\", \"Class to show ambigs for\");\nstatic STRING_PARAM_FLAG(canonical_class2, \"\", \"Class to show ambigs for\");\n\n// Loads training data, if requested displays debug information, otherwise\n// creates the master shape table by shape clustering and writes it to a file.\n// If FLAGS_display_cloud_font is set, then the cloud features of\n// FLAGS_canonical_class1/FLAGS_display_cloud_font are shown in green ON TOP\n// OF the red canonical features of FLAGS_canonical_class2/\n// FLAGS_display_canonical_font, so as to show which canonical features are\n// NOT in the cloud.\n// Otherwise, if FLAGS_canonical_class1 is set, prints a table of font-wise\n// cluster 
distances between FLAGS_canonical_class1 and FLAGS_canonical_class2.\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n\n  ParseArguments(&argc, &argv);\n\n  std::string file_prefix;\n  auto trainer = tesseract::LoadTrainingData(argv + 1, false, nullptr, file_prefix);\n\n  if (!trainer) {\n    return EXIT_FAILURE;\n  }\n\n  if (FLAGS_display_cloud_font >= 0) {\n#ifndef GRAPHICS_DISABLED\n    trainer->DisplaySamples(FLAGS_canonical_class1.c_str(), FLAGS_display_cloud_font,\n                            FLAGS_canonical_class2.c_str(), FLAGS_display_canonical_font);\n#endif // !GRAPHICS_DISABLED\n    return EXIT_SUCCESS;\n  } else if (!FLAGS_canonical_class1.empty()) {\n    trainer->DebugCanonical(FLAGS_canonical_class1.c_str(), FLAGS_canonical_class2.c_str());\n    return EXIT_SUCCESS;\n  }\n  trainer->SetupMasterShapes();\n  WriteShapeTable(file_prefix, trainer->master_shapes());\n\n  return EXIT_SUCCESS;\n} /* main */\n"
  },
  {
    "path": "src/training/text2image.cpp",
    "content": "/**********************************************************************\n * File:        text2image.cpp\n * Description: Program to generate OCR training pages. Given a text file it\n *              outputs an image with a given font and degradation.\n *\n *              Note that since the results depend on the fonts available on\n *              your system, running the code on a different machine, or\n *              different OS, or even at a different time on the same machine,\n *              may produce different fonts even if --font is given explicitly.\n *              To see names of available fonts, use --list_available_fonts with\n *              the appropriate --fonts_dir path.\n *              Specifying --use_only_legacy_fonts will restrict the available\n *              fonts to those listed in legacy_fonts.h\n * Authors:     Ranjith Unnikrishnan, Ray Smith\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#include \"boxchar.h\"\n#include \"commandlineflags.h\"\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"degradeimage.h\"\n#include \"errcode.h\"\n#include \"fileio.h\"\n#include \"helpers.h\"\n#include \"normstrngs.h\"\n#include \"stringrenderer.h\"\n#include \"tlog.h\"\n#include \"unicharset.h\"\n\n#include <allheaders.h> // from leptonica\n\n#include <algorithm>\n#include 
<cstdlib>\n#include <cstring>\n#include <iostream>\n#include <map>\n#include <random>\n#include <string>\n#include <utility>\n#include <vector>\n\n#ifdef _MSC_VER\n#  define putenv(s) _putenv(s)\n#endif\n\nusing namespace tesseract;\n\n// A number with which to initialize the random number generator.\nconst int kRandomSeed = 0x18273645;\n\n// The text input file.\nstatic STRING_PARAM_FLAG(text, \"\", \"File name of text input to process\");\n\n// The text output file.\nstatic STRING_PARAM_FLAG(outputbase, \"\", \"Basename for output image/box file\");\n\n// Degrade the rendered image to mimic scanner quality.\nstatic BOOL_PARAM_FLAG(degrade_image, true,\n                       \"Degrade rendered image with speckle noise, dilation/erosion \"\n                       \"and rotation\");\n\n// Rotate the rendered image to have more realistic glyph borders\nstatic BOOL_PARAM_FLAG(rotate_image, true, \"Rotate the image in a random way.\");\n\n// Degradation to apply to the image.\nstatic INT_PARAM_FLAG(exposure, 0, \"Exposure level in photocopier\");\n\n// Distort the rendered image by various means according to the bool flags.\nstatic BOOL_PARAM_FLAG(distort_image, false, \"Degrade rendered image with noise, blur, invert.\");\n\n// Distortion to apply to the image.\nstatic BOOL_PARAM_FLAG(invert, true, \"Invert the image\");\n\n// Distortion to apply to the image.\nstatic BOOL_PARAM_FLAG(white_noise, true, \"Add  Gaussian Noise\");\n\n// Distortion to apply to the image.\nstatic BOOL_PARAM_FLAG(smooth_noise, true, \"Smoothen Noise\");\n\n// Distortion to apply to the image.\nstatic BOOL_PARAM_FLAG(blur, true, \"Blur the image\");\n\n#if 0\n\n// Distortion to apply to the image.\nstatic BOOL_PARAM_FLAG(perspective, false, \"Generate Perspective Distortion\");\n\n// Distortion to apply to the image.\nstatic INT_PARAM_FLAG(box_reduction, 0, \"Integer reduction factor box_scale\");\n\n#endif\n\n// Output image resolution.\nstatic INT_PARAM_FLAG(resolution, 300, \"Pixels per 
inch\");\n\n// Width of output image (in pixels).\nstatic INT_PARAM_FLAG(xsize, 3600, \"Width of output image\");\n\n// Max height of output image (in pixels).\nstatic INT_PARAM_FLAG(ysize, 4800, \"Height of output image\");\n\n// Max number of pages to produce.\nstatic INT_PARAM_FLAG(max_pages, 0, \"Maximum number of pages to output (0=unlimited)\");\n\n// Margin around text (in pixels).\nstatic INT_PARAM_FLAG(margin, 100, \"Margin round edges of image\");\n\n// Size of text (in points).\nstatic INT_PARAM_FLAG(ptsize, 12, \"Size of printed text\");\n\n// Inter-character space (in ems).\nstatic DOUBLE_PARAM_FLAG(char_spacing, 0, \"Inter-character space in ems\");\n\n// Sets the probability (value in [0, 1]) of starting to render a word with an\n// underline. Words are assumed to be space-delimited.\nstatic DOUBLE_PARAM_FLAG(underline_start_prob, 0,\n                         \"Fraction of words to underline (value in [0,1])\");\n// Set the probability (value in [0, 1]) of continuing a started underline to\n// the next word.\nstatic DOUBLE_PARAM_FLAG(underline_continuation_prob, 0,\n                         \"Fraction of words to underline (value in [0,1])\");\n\n// Inter-line space (in pixels).\nstatic INT_PARAM_FLAG(leading, 12, \"Inter-line space (in pixels)\");\n\n// Layout and glyph orientation on rendering.\nstatic STRING_PARAM_FLAG(writing_mode, \"horizontal\",\n                         \"Specify one of the following writing\"\n                         \" modes.\\n\"\n                         \"'horizontal' : Render regular horizontal text. (default)\\n\"\n                         \"'vertical' : Render vertical text. Glyph orientation is\"\n                         \" selected by Pango.\\n\"\n                         \"'vertical-upright' : Render vertical text. 
Glyph \"\n                         \" orientation is set to be upright.\");\n\nstatic INT_PARAM_FLAG(box_padding, 0, \"Padding around produced bounding boxes\");\n\nstatic BOOL_PARAM_FLAG(strip_unrenderable_words, true,\n                       \"Remove unrenderable words from source text\");\n\n// Font name.\nstatic STRING_PARAM_FLAG(font, \"Arial\", \"Font description name to use\");\n\nstatic BOOL_PARAM_FLAG(ligatures, false, \"Rebuild and render ligatures\");\n\nstatic BOOL_PARAM_FLAG(find_fonts, false, \"Search for all fonts that can render the text\");\nstatic BOOL_PARAM_FLAG(render_per_font, true,\n                       \"If find_fonts==true, render each font to its own image. \"\n                       \"Image filenames are of the form output_name.font_name.tif\");\nstatic DOUBLE_PARAM_FLAG(min_coverage, 1.0,\n                         \"If find_fonts==true, the minimum coverage the font has of \"\n                         \"the characters in the text file to include it, between \"\n                         \"0 and 1.\");\n\nstatic BOOL_PARAM_FLAG(list_available_fonts, false, \"List available fonts and quit.\");\n\nstatic BOOL_PARAM_FLAG(render_ngrams, false,\n                       \"Put each space-separated entity from the\"\n                       \" input file into one bounding box. The ngrams in the input\"\n                       \" file will be randomly permuted before rendering (so that\"\n                       \" there is sufficient variety of characters on each line).\");\n\nstatic BOOL_PARAM_FLAG(output_word_boxes, false,\n                       \"Output word bounding boxes instead of character boxes. \"\n                       \"This is used for Cube training, and implied by \"\n                       \"--render_ngrams.\");\n\nstatic STRING_PARAM_FLAG(unicharset_file, \"\",\n                         \"File with characters in the unicharset. 
If --render_ngrams\"\n                         \" is true and --unicharset_file is specified, ngrams with\"\n                         \" characters that are not in unicharset will be omitted\");\n\nstatic BOOL_PARAM_FLAG(bidirectional_rotation, false, \"Rotate the generated characters both ways.\");\n\nstatic BOOL_PARAM_FLAG(only_extract_font_properties, false,\n                       \"Assumes that the input file contains a list of ngrams. Renders\"\n                       \" each ngram, extracts spacing properties and records them in\"\n                       \" output_base/[font_name].fontinfo file.\");\n\n// Use these flags to output zero-padded, square individual character images\nstatic BOOL_PARAM_FLAG(output_individual_glyph_images, false,\n                       \"If true also outputs individual character images\");\nstatic INT_PARAM_FLAG(glyph_resized_size, 0,\n                      \"Each glyph is square with this side length in pixels\");\nstatic INT_PARAM_FLAG(glyph_num_border_pixels_to_pad, 0,\n                      \"Final_size=glyph_resized_size+2*glyph_num_border_pixels_to_pad\");\n\nnamespace tesseract {\n\nstruct SpacingProperties {\n  SpacingProperties() : x_gap_before(0), x_gap_after(0) {}\n  SpacingProperties(int b, int a) : x_gap_before(b), x_gap_after(a) {}\n  // These values are obtained from FT_Glyph_Metrics struct\n  // used by the FreeType font engine.\n  int x_gap_before; // horizontal x bearing\n  int x_gap_after;  // horizontal advance - x_gap_before - width\n  std::map<std::string, int> kerned_x_gaps;\n};\n\nstatic bool IsWhitespaceBox(const BoxChar *boxchar) {\n  return (boxchar->box() == nullptr || SpanUTF8Whitespace(boxchar->ch().c_str()));\n}\n\nstatic std::string StringReplace(const std::string &in, const std::string &oldsub,\n                                 const std::string &newsub) {\n  std::string out;\n  size_t start_pos = 0, pos;\n  while ((pos = in.find(oldsub, start_pos)) != std::string::npos) {\n    out.append(in.data() 
+ start_pos, pos - start_pos);\n    out.append(newsub.data(), newsub.length());\n    start_pos = pos + oldsub.length();\n  }\n  out.append(in.data() + start_pos, in.length() - start_pos);\n  return out;\n}\n\n// Assumes that each word (whitespace-separated entity) in text is a bigram.\n// Renders the bigrams and calls FontInfo::GetSpacingProperties() to\n// obtain spacing information. Produces the output .fontinfo file with a line\n// per unichar of the form:\n// unichar space_before space_after kerned1 kerned_space1 kerned2 ...\n// For example, if unichar \"A\" has spacing of 0 pixels before and -1 pixels\n// after, is kerned with \"V\" resulting in spacing of \"AV\" to be -7 and kerned\n// with \"T\", such that \"AT\" has spacing of -5, the entry/line for unichar \"A\"\n// in .fontinfo file will be:\n// A 0 -1 T -5 V -7\nstatic void ExtractFontProperties(const std::string &utf8_text, StringRenderer *render,\n                                  const std::string &output_base) {\n  std::map<std::string, SpacingProperties> spacing_map;\n  std::map<std::string, SpacingProperties>::iterator spacing_map_it0;\n  std::map<std::string, SpacingProperties>::iterator spacing_map_it1;\n  int x_bearing, x_advance;\n  int len = utf8_text.length();\n  int offset = 0;\n  const char *text = utf8_text.c_str();\n  while (offset < len) {\n    offset += render->RenderToImage(text + offset, strlen(text + offset), nullptr);\n    const std::vector<BoxChar *> &boxes = render->GetBoxes();\n\n    // If the page break split a bigram, correct the offset so we try the bigram\n    // on the next iteration.\n    if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) &&\n        IsWhitespaceBox(boxes[boxes.size() - 2])) {\n      if (boxes.size() > 3) {\n        tprintf(\"WARNING: Adjusting to bad page break after '%s%s'\\n\",\n                boxes[boxes.size() - 4]->ch().c_str(), boxes[boxes.size() - 3]->ch().c_str());\n      }\n      offset -= boxes[boxes.size() - 1]->ch().size();\n    
}\n\n    for (size_t b = 0; b < boxes.size(); b += 2) {\n      while (b < boxes.size() && IsWhitespaceBox(boxes[b])) {\n        ++b;\n      }\n      if (b + 1 >= boxes.size()) {\n        break;\n      }\n      const std::string &ch0 = boxes[b]->ch();\n      // We encountered a ligature. This happens in at least two scenarios:\n      // One is when the rendered bigram forms a grapheme cluster (eg. the\n      // second character in the bigram is a combining vowel), in which case we\n      // correctly output only one bounding box.\n      // A second far less frequent case is when caused some fonts like 'DejaVu\n      // Sans Ultra-Light' force Pango to render a ligatured character even if\n      // the input consists of the separated characters.  NOTE(ranjith): As per\n      // behdad@ this is not currently controllable at the level of the Pango\n      // API.\n      // The most frequent of all is a single character \"word\" made by the CJK\n      // segmenter.\n      // Safeguard against these cases here by just skipping the bigram.\n      if (IsWhitespaceBox(boxes[b + 1])) {\n        continue;\n      }\n      int xgap = (boxes[b + 1]->box()->x - (boxes[b]->box()->x + boxes[b]->box()->w));\n      spacing_map_it0 = spacing_map.find(ch0);\n      int ok_count = 0;\n      if (spacing_map_it0 == spacing_map.end() &&\n          render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) {\n        spacing_map[ch0] = SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b]->box()->w);\n        spacing_map_it0 = spacing_map.find(ch0);\n        ++ok_count;\n      }\n      const std::string &ch1 = boxes[b + 1]->ch();\n      tlog(3, \"%s%s\\n\", ch0.c_str(), ch1.c_str());\n      spacing_map_it1 = spacing_map.find(ch1);\n      if (spacing_map_it1 == spacing_map.end() &&\n          render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) {\n        spacing_map[ch1] =\n            SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b + 1]->box()->w);\n  
      spacing_map_it1 = spacing_map.find(ch1);\n        ++ok_count;\n      }\n      if (ok_count == 2 &&\n          xgap != (spacing_map_it0->second.x_gap_after + spacing_map_it1->second.x_gap_before)) {\n        spacing_map_it0->second.kerned_x_gaps[ch1] = xgap;\n      }\n    }\n    render->ClearBoxes();\n  }\n  std::string output_string;\n  const int kBufSize = 1024;\n  char buf[kBufSize];\n  snprintf(buf, kBufSize, \"%d\\n\", static_cast<int>(spacing_map.size()));\n  output_string.append(buf);\n  std::map<std::string, SpacingProperties>::const_iterator spacing_map_it;\n  for (spacing_map_it = spacing_map.begin(); spacing_map_it != spacing_map.end();\n       ++spacing_map_it) {\n    snprintf(buf, kBufSize, \"%s %d %d %d\", spacing_map_it->first.c_str(),\n             spacing_map_it->second.x_gap_before, spacing_map_it->second.x_gap_after,\n             static_cast<int>(spacing_map_it->second.kerned_x_gaps.size()));\n    output_string.append(buf);\n    std::map<std::string, int>::const_iterator kern_it;\n    for (kern_it = spacing_map_it->second.kerned_x_gaps.begin();\n         kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) {\n      snprintf(buf, kBufSize, \" %s %d\", kern_it->first.c_str(), kern_it->second);\n      output_string.append(buf);\n    }\n    output_string.append(\"\\n\");\n  }\n  File::WriteStringToFileOrDie(output_string, output_base + \".fontinfo\");\n}\n\nstatic bool MakeIndividualGlyphs(Image pix, const std::vector<BoxChar *> &vbox,\n                                 const int input_tiff_page) {\n  // If checks fail, return false without exiting text2image\n  if (!pix) {\n    tprintf(\"ERROR: MakeIndividualGlyphs(): Input Pix* is nullptr\\n\");\n    return false;\n  } else if (FLAGS_glyph_resized_size <= 0) {\n    tprintf(\"ERROR: --glyph_resized_size must be positive\\n\");\n    return false;\n  } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) {\n    tprintf(\"ERROR: --glyph_num_border_pixels_to_pad must be 0 or 
positive\\n\");\n    return false;\n  }\n\n  const int n_boxes = vbox.size();\n  int n_boxes_saved = 0;\n  int current_tiff_page = 0;\n  int y_previous = 0;\n  static int glyph_count = 0;\n  for (int i = 0; i < n_boxes; i++) {\n    // Get one bounding box\n    Box *b = vbox[i]->mutable_box();\n    if (!b) {\n      continue;\n    }\n    const int x = b->x;\n    const int y = b->y;\n    const int w = b->w;\n    const int h = b->h;\n    // Check present tiff page (for multipage tiff)\n    if (y < y_previous - pixGetHeight(pix) / 10) {\n      tprintf(\"ERROR: Wrap-around encountered, at i=%d\\n\", i);\n      current_tiff_page++;\n    }\n    if (current_tiff_page < input_tiff_page) {\n      continue;\n    } else if (current_tiff_page > input_tiff_page) {\n      break;\n    }\n    // Check box validity\n    if (x < 0 || y < 0 || (x + w - 1) >= pixGetWidth(pix) || (y + h - 1) >= pixGetHeight(pix)) {\n      tprintf(\n          \"ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d\"\n          \" (x=%d, y=%d, w=%d, h=%d\\n)\",\n          i, x, y, w, h);\n      continue;\n    } else if (w < FLAGS_glyph_num_border_pixels_to_pad &&\n               h < FLAGS_glyph_num_border_pixels_to_pad) {\n      tprintf(\"ERROR: Input image too small to be a character, at i=%d\\n\", i);\n      continue;\n    }\n    // Crop the boxed character\n    Image pix_glyph = pixClipRectangle(pix, b, nullptr);\n    if (!pix_glyph) {\n      tprintf(\"ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\\n\", i);\n      continue;\n    }\n    // Resize to square\n    Image pix_glyph_sq =\n        pixScaleToSize(pix_glyph, FLAGS_glyph_resized_size, FLAGS_glyph_resized_size);\n    if (!pix_glyph_sq) {\n      tprintf(\"ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\\n\", i);\n      continue;\n    }\n    // Zero-pad\n    Image pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq, FLAGS_glyph_num_border_pixels_to_pad, 0);\n    if (!pix_glyph_sq_pad) {\n      tprintf(\"ERROR: MakeIndividualGlyphs(): 
Failed to zero-pad, at i=%d\\n\", i);\n      continue;\n    }\n    // Write out\n    Image pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false);\n    char filename[1024];\n    snprintf(filename, 1024, \"%s_%d.jpg\", FLAGS_outputbase.c_str(), glyph_count++);\n    if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) {\n      tprintf(\n          \"ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s,\"\n          \" at i=%d\\n\",\n          filename, i);\n      continue;\n    }\n\n    pix_glyph.destroy();\n    pix_glyph_sq.destroy();\n    pix_glyph_sq_pad.destroy();\n    pix_glyph_sq_pad_8.destroy();\n    n_boxes_saved++;\n    y_previous = y;\n  }\n  if (n_boxes_saved == 0) {\n    return false;\n  } else {\n    tprintf(\"Total number of characters saved = %d\\n\", n_boxes_saved);\n    return true;\n  }\n}\n} // namespace tesseract\n\nusing tesseract::DegradeImage;\nusing tesseract::ExtractFontProperties;\nusing tesseract::File;\nusing tesseract::FontUtils;\nusing tesseract::SpanUTF8NotWhitespace;\nusing tesseract::SpanUTF8Whitespace;\nusing tesseract::StringRenderer;\n\nstatic int Main() {\n  if (FLAGS_list_available_fonts) {\n    const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts();\n    for (unsigned int i = 0; i < all_fonts.size(); ++i) {\n      // Remove trailing comma: pango-font-description-to-string adds a comma\n      // to some fonts.\n      // See https://github.com/tesseract-ocr/tesseract/issues/408\n      std::string font_name(all_fonts[i].c_str());\n      if (font_name.back() == ',') {\n        font_name.pop_back();\n      }\n      printf(\"%3u: %s\\n\", i, font_name.c_str());\n      ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()),\n                      \"Font %s is unrecognized.\\n\", all_fonts[i].c_str());\n    }\n    return EXIT_SUCCESS;\n  }\n\n  // Check validity of input flags.\n  if (FLAGS_text.empty()) {\n    tprintf(\"'--text' option is missing!\\n\");\n    return EXIT_FAILURE;\n  }\n  if 
(FLAGS_outputbase.empty()) {\n    tprintf(\"'--outputbase' option is missing!\\n\");\n    return EXIT_FAILURE;\n  }\n  if (!FLAGS_unicharset_file.empty() && !FLAGS_render_ngrams) {\n    tprintf(\"Use '--unicharset_file' only if '--render_ngrams' is set.\\n\");\n    return EXIT_FAILURE;\n  }\n\n  std::string font_name = FLAGS_font.c_str();\n  if (!FLAGS_find_fonts && !FontUtils::IsAvailableFont(font_name.c_str())) {\n    font_name += ',';\n    std::string pango_name;\n    if (!FontUtils::IsAvailableFont(font_name.c_str(), &pango_name)) {\n      tprintf(\"Could not find font named '%s'.\\n\", FLAGS_font.c_str());\n      if (!pango_name.empty()) {\n        tprintf(\"Pango suggested font '%s'.\\n\", pango_name.c_str());\n      }\n      tprintf(\"Please correct --font arg.\\n\");\n      return EXIT_FAILURE;\n    }\n  }\n\n  if (FLAGS_render_ngrams) {\n    FLAGS_output_word_boxes = true;\n  }\n\n  char font_desc_name[1024];\n  snprintf(font_desc_name, 1024, \"%s %d\", font_name.c_str(), static_cast<int>(FLAGS_ptsize));\n\n  StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize);\n  render.set_add_ligatures(FLAGS_ligatures);\n  render.set_leading(FLAGS_leading);\n  render.set_resolution(FLAGS_resolution);\n  render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize);\n  render.set_h_margin(FLAGS_margin);\n  render.set_v_margin(FLAGS_margin);\n  render.set_output_word_boxes(FLAGS_output_word_boxes);\n  render.set_box_padding(FLAGS_box_padding);\n  render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words);\n  render.set_underline_start_prob(FLAGS_underline_start_prob);\n  render.set_underline_continuation_prob(FLAGS_underline_continuation_prob);\n\n  // Set text rendering orientation and their forms.\n  if (FLAGS_writing_mode == \"horizontal\") {\n    // Render regular horizontal text (default).\n    render.set_vertical_text(false);\n    render.set_gravity_hint_strong(false);\n    render.set_render_fullwidth_latin(false);\n  } else if (FLAGS_writing_mode 
== \"vertical\") {\n    // Render vertical text. Glyph orientation is selected by Pango.\n    render.set_vertical_text(true);\n    render.set_gravity_hint_strong(false);\n    render.set_render_fullwidth_latin(false);\n  } else if (FLAGS_writing_mode == \"vertical-upright\") {\n    // Render vertical text. Glyph orientation is set to be upright.\n    // Also Basic Latin characters are converted to their fullwidth forms\n    // on rendering, since fullwidth Latin characters are well designed to fit\n    // vertical text lines, while .box files store halfwidth Basic Latin\n    // unichars.\n    render.set_vertical_text(true);\n    render.set_gravity_hint_strong(true);\n    render.set_render_fullwidth_latin(true);\n  } else {\n    tprintf(\"Invalid writing mode: %s\\n\", FLAGS_writing_mode.c_str());\n    return EXIT_FAILURE;\n  }\n\n  std::string src_utf8;\n  // This c_str is NOT redundant!\n  if (!File::ReadFileToString(FLAGS_text.c_str(), &src_utf8)) {\n    tprintf(\"Failed to read file: %s\\n\", FLAGS_text.c_str());\n    return EXIT_FAILURE;\n  }\n\n  // Remove the unicode mark if present.\n  if (strncmp(src_utf8.c_str(), \"\\xef\\xbb\\xbf\", 3) == 0) {\n    src_utf8.erase(0, 3);\n  }\n  tlog(1, \"Render string of size %zu\\n\", src_utf8.length());\n\n  if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) {\n    // Try to preserve behavior of old text2image by expanding inter-word\n    // spaces by a factor of 4.\n    const std::string kSeparator = FLAGS_render_ngrams ? \"    \" : \" \";\n    // Also restrict the number of characters per line to try and avoid\n    // line-breaking in the middle of words like \"-A\", \"R$\" etc. which are\n    // otherwise allowed by the standard unicode line-breaking rules.\n    const unsigned int kCharsPerLine = (FLAGS_ptsize > 20) ? 
50 : 100;\n    std::string rand_utf8;\n    UNICHARSET unicharset;\n    if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() &&\n        !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) {\n      tprintf(\"Failed to load unicharset from file %s\\n\", FLAGS_unicharset_file.c_str());\n      return EXIT_FAILURE;\n    }\n\n    // If we are rendering ngrams that will be OCRed later, shuffle them so that\n    // tesseract does not have difficulties finding correct baseline, word\n    // spaces, etc.\n    const char *str8 = src_utf8.c_str();\n    int len = src_utf8.length();\n    int step;\n    std::vector<std::pair<int, int>> offsets;\n    int offset = SpanUTF8Whitespace(str8);\n    while (offset < len) {\n      step = SpanUTF8NotWhitespace(str8 + offset);\n      offsets.emplace_back(offset, step);\n      offset += step;\n      offset += SpanUTF8Whitespace(str8 + offset);\n    }\n    if (FLAGS_render_ngrams) {\n      std::seed_seq seed{kRandomSeed};\n      std::mt19937 random_gen(seed);\n      std::shuffle(offsets.begin(), offsets.end(), random_gen);\n    }\n\n    for (size_t i = 0, line = 1; i < offsets.size(); ++i) {\n      const char *curr_pos = str8 + offsets[i].first;\n      int ngram_len = offsets[i].second;\n      // Skip words that contain characters not in found in unicharset.\n      std::string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);\n      if (!FLAGS_unicharset_file.empty() &&\n          !unicharset.encodable_string(cleaned.c_str(), nullptr)) {\n        continue;\n      }\n      rand_utf8.append(curr_pos, ngram_len);\n      if (rand_utf8.length() > line * kCharsPerLine) {\n        rand_utf8.append(\" \\n\");\n        ++line;\n        if (line & 0x1) {\n          rand_utf8.append(kSeparator);\n        }\n      } else {\n        rand_utf8.append(kSeparator);\n      }\n    }\n    tlog(1, \"Rendered ngram string of size %zu\\n\", rand_utf8.length());\n    src_utf8.swap(rand_utf8);\n  }\n  if (FLAGS_only_extract_font_properties) {\n   
 tprintf(\"Extracting font properties only\\n\");\n    ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str());\n    tprintf(\"Done!\\n\");\n    return EXIT_SUCCESS;\n  }\n\n  int im = 0;\n  std::vector<float> page_rotation;\n  const char *to_render_utf8 = src_utf8.c_str();\n\n  tesseract::TRand randomizer;\n  randomizer.set_seed(kRandomSeed);\n  std::vector<std::string> font_names;\n  // We use a two pass mechanism to rotate images in both direction.\n  // The first pass(0) will rotate the images in random directions and\n  // the second pass(1) will mirror those rotations.\n  int num_pass = FLAGS_bidirectional_rotation ? 2 : 1;\n  for (int pass = 0; pass < num_pass; ++pass) {\n    int page_num = 0;\n    std::string font_used;\n    for (size_t offset = 0;\n         offset < strlen(to_render_utf8) && (FLAGS_max_pages == 0 || page_num < FLAGS_max_pages);\n         ++im, ++page_num) {\n      tlog(1, \"Starting page %d\\n\", im);\n      Image pix = nullptr;\n      if (FLAGS_find_fonts) {\n        offset += render.RenderAllFontsToImage(FLAGS_min_coverage, to_render_utf8 + offset,\n                                               strlen(to_render_utf8 + offset), &font_used, &pix);\n      } else {\n        offset +=\n            render.RenderToImage(to_render_utf8 + offset, strlen(to_render_utf8 + offset), &pix);\n      }\n      if (pix != nullptr) {\n        float rotation = 0;\n        if (pass == 1) {\n          // Pass 2, do mirror rotation.\n          rotation = -1 * page_rotation[page_num];\n        }\n        if (FLAGS_degrade_image) {\n          pix = DegradeImage(pix, FLAGS_exposure, &randomizer,\n                             FLAGS_rotate_image ? 
&rotation : nullptr);\n        }\n        if (FLAGS_distort_image) {\n          // TODO: perspective is set to false and box_reduction to 1.\n          pix = PrepareDistortedPix(pix, false, FLAGS_invert, FLAGS_white_noise, FLAGS_smooth_noise,\n                                    FLAGS_blur, 1, &randomizer, nullptr);\n        }\n        render.RotatePageBoxes(rotation);\n\n        if (pass == 0) {\n          // Pass 1, rotate randomly and store the rotation..\n          page_rotation.push_back(rotation);\n        }\n\n        Image gray_pix = pixConvertTo8(pix, false);\n        pix.destroy();\n        Image binary = pixThresholdToBinary(gray_pix, 128);\n        gray_pix.destroy();\n        char tiff_name[1024];\n        if (FLAGS_find_fonts) {\n          if (FLAGS_render_per_font) {\n            std::string fontname_for_file = tesseract::StringReplace(font_used, \" \", \"_\");\n            snprintf(tiff_name, 1024, \"%s.%s.tif\", FLAGS_outputbase.c_str(),\n                     fontname_for_file.c_str());\n            pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, \"w\");\n            tprintf(\"Rendered page %d to file %s\\n\", im, tiff_name);\n          } else {\n            font_names.push_back(font_used);\n          }\n        } else {\n          snprintf(tiff_name, 1024, \"%s.tif\", FLAGS_outputbase.c_str());\n          pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? 
\"w\" : \"a\");\n          tprintf(\"Rendered page %d to file %s\\n\", im, tiff_name);\n        }\n        // Make individual glyphs\n        if (FLAGS_output_individual_glyph_images) {\n          if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) {\n            tprintf(\"ERROR: Individual glyphs not saved\\n\");\n          }\n        }\n        binary.destroy();\n      }\n      if (FLAGS_find_fonts && offset != 0) {\n        // We just want a list of names, or some sample images so we don't need\n        // to render more than the first page of the text.\n        break;\n      }\n    }\n  }\n  if (!FLAGS_find_fonts) {\n    std::string box_name = FLAGS_outputbase.c_str();\n    box_name += \".box\";\n    render.WriteAllBoxes(box_name);\n  } else if (!FLAGS_render_per_font && !font_names.empty()) {\n    std::string filename = FLAGS_outputbase.c_str();\n    filename += \".fontlist.txt\";\n    FILE *fp = fopen(filename.c_str(), \"wb\");\n    if (fp == nullptr) {\n      tprintf(\"Failed to create output font list %s\\n\", filename.c_str());\n    } else {\n      for (auto &font_name : font_names) {\n        fprintf(fp, \"%s\\n\", font_name.c_str());\n      }\n      fclose(fp);\n    }\n  }\n\n  return EXIT_SUCCESS;\n}\n\nint main(int argc, char **argv) {\n  // Respect environment variable. 
could be:\n  // fc (fontconfig), win32, and coretext\n  // If not set force fontconfig for Mac OS.\n  // See https://github.com/tesseract-ocr/tesseract/issues/736\n  char *backend;\n  backend = getenv(\"PANGOCAIRO_BACKEND\");\n  if (backend == nullptr) {\n    static char envstring[] = \"PANGOCAIRO_BACKEND=fc\";\n    putenv(envstring);\n  } else {\n    printf(\n        \"Using '%s' as pango cairo backend based on environment \"\n        \"variable.\\n\",\n        backend);\n  }\n  tesseract::CheckSharedLibraryVersion();\n  if (argc > 1) {\n    if ((strcmp(argv[1], \"-v\") == 0) || (strcmp(argv[1], \"--version\") == 0)) {\n      FontUtils::PangoFontTypeInfo();\n      printf(\"Pango version: %s\\n\", pango_version_string());\n    }\n  }\n  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);\n  return Main();\n}\n"
  },
  {
    "path": "src/training/unicharset/export.h",
    "content": "#pragma once\n\n#ifdef CMAKE_BUILD\n#  include <unicharset_training_export.h>\n#endif\n"
  },
  {
    "path": "src/training/unicharset/fileio.cpp",
    "content": "/**********************************************************************\n * File:        fileio.cpp\n * Description: File I/O utilities.\n * Author:      Samuel Charron\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n * use this file except in compliance with the License.  You may obtain a copy\n * of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required\n * by applicable law or agreed to in writing, software distributed under the\n * License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n * OF ANY KIND, either express or implied.  See the License for the specific\n * language governing permissions and limitations under the License.\n *\n **********************************************************************/\n\n#ifdef _WIN32\n#  ifndef unlink\n#    include <io.h>\n#  endif\n#else\n#  include <glob.h>\n#  include <unistd.h>\n#endif\n\n#include <cerrno>\n#include <cstdio>\n#include <cstdlib>\n#include <string>\n\n#include \"errcode.h\"\n#include \"fileio.h\"\n#include \"host.h\" // includes windows.h for BOOL, ...\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n///////////////////////////////////////////////////////////////////////////////\n// File::\n///////////////////////////////////////////////////////////////////////////////\nFILE *File::Open(const std::string &filename, const std::string &mode) {\n  return fopen(filename.c_str(), mode.c_str());\n}\n\nFILE *File::OpenOrDie(const std::string &filename, const std::string &mode) {\n  FILE *stream = fopen(filename.c_str(), mode.c_str());\n  if (stream == nullptr) {\n    tprintf(\"Unable to open '%s' in mode '%s': %s\\n\", filename.c_str(), mode.c_str(),\n            strerror(errno));\n  }\n  return stream;\n}\n\nvoid File::WriteStringToFileOrDie(const std::string &str, const std::string &filename) {\n  FILE *stream = fopen(filename.c_str(), \"wb\");\n  if (stream == nullptr) 
{\n    tprintf(\"Unable to open '%s' for writing: %s\\n\", filename.c_str(), strerror(errno));\n    return;\n  }\n  fputs(str.c_str(), stream);\n  ASSERT_HOST(fclose(stream) == 0);\n}\n\nbool File::Readable(const std::string &filename) {\n  FILE *stream = fopen(filename.c_str(), \"rb\");\n  if (stream == nullptr) {\n    return false;\n  }\n  fclose(stream);\n  return true;\n}\n\nbool File::ReadFileToString(const std::string &filename, std::string *out) {\n  FILE *stream = File::Open(filename, \"rb\");\n  if (stream == nullptr) {\n    return false;\n  }\n  InputBuffer in(stream);\n  *out = \"\";\n  in.Read(out);\n  return in.CloseFile();\n}\n\nstd::string File::JoinPath(const std::string &prefix, const std::string &suffix) {\n  return (prefix.empty() || prefix[prefix.size() - 1] == '/') ? prefix + suffix\n                                                              : prefix + \"/\" + suffix;\n}\n\nbool File::Delete(const char *pathname) {\n#if !defined(_WIN32) || defined(__MINGW32__)\n  const int status = unlink(pathname);\n#else\n  const int status = _unlink(pathname);\n#endif\n  if (status != 0) {\n    tprintf(\"ERROR: Unable to delete file '%s$: %s\\n\", pathname, strerror(errno));\n    return false;\n  }\n  return true;\n}\n\n#ifdef _WIN32\nbool File::DeleteMatchingFiles(const char *pattern) {\n  WIN32_FIND_DATA data;\n  BOOL result = TRUE;\n  HANDLE handle = FindFirstFile(pattern, &data);\n  bool all_deleted = true;\n  if (handle != INVALID_HANDLE_VALUE) {\n    for (; result; result = FindNextFile(handle, &data)) {\n      all_deleted &= File::Delete(data.cFileName);\n    }\n    FindClose(handle);\n  }\n  return all_deleted;\n}\n#else\nbool File::DeleteMatchingFiles(const char *pattern) {\n  glob_t pglob;\n  char **paths;\n  bool all_deleted = true;\n  if (glob(pattern, 0, nullptr, &pglob) == 0) {\n    for (paths = pglob.gl_pathv; *paths != nullptr; paths++) {\n      all_deleted &= File::Delete(*paths);\n    }\n    globfree(&pglob);\n  }\n  return 
all_deleted;\n}\n#endif\n\n///////////////////////////////////////////////////////////////////////////////\n// InputBuffer::\n///////////////////////////////////////////////////////////////////////////////\nInputBuffer::InputBuffer(FILE *stream) : stream_(stream) {}\n\nInputBuffer::InputBuffer(FILE *stream, size_t) : stream_(stream) {}\n\nInputBuffer::~InputBuffer() {\n  if (stream_ != nullptr) {\n    fclose(stream_);\n  }\n}\n\nbool InputBuffer::Read(std::string *out) {\n  char buf[BUFSIZ + 1];\n  int l;\n  while ((l = fread(buf, 1, BUFSIZ, stream_)) > 0) {\n    if (ferror(stream_)) {\n      clearerr(stream_);\n      return false;\n    }\n    buf[l] = 0;\n    out->append(buf);\n  }\n  return true;\n}\n\nbool InputBuffer::CloseFile() {\n  int ret = fclose(stream_);\n  stream_ = nullptr;\n  return ret == 0;\n}\n\n///////////////////////////////////////////////////////////////////////////////\n// OutputBuffer::\n///////////////////////////////////////////////////////////////////////////////\n\nOutputBuffer::OutputBuffer(FILE *stream) : stream_(stream) {}\n\nOutputBuffer::OutputBuffer(FILE *stream, size_t) : stream_(stream) {}\n\nOutputBuffer::~OutputBuffer() {\n  if (stream_ != nullptr) {\n    fclose(stream_);\n  }\n}\n\nvoid OutputBuffer::WriteString(const std::string &str) {\n  fputs(str.c_str(), stream_);\n}\n\nbool OutputBuffer::CloseFile() {\n  int ret = fclose(stream_);\n  stream_ = nullptr;\n  return ret == 0;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/fileio.h",
    "content": "/**********************************************************************\n * File:        fileio.h\n * Description: File I/O utilities.\n * Author:      Samuel Charron\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n * use this file except in compliance with the License.  You may obtain a copy\n * of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required\n * by applicable law or agreed to in writing, software distributed under the\n * License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS\n * OF ANY KIND, either express or implied.  See the License for the specific\n * language governing permissions and limitations under the License.\n *\n **********************************************************************/\n#ifndef TESSERACT_TRAINING_FILEIO_H_\n#define TESSERACT_TRAINING_FILEIO_H_\n\n#include \"export.h\"\n#include \"helpers.h\"  // for split\n#include \"serialis.h\" // for LoadDataFromFile\n\n#include <tesseract/export.h>\n\n#include <cstddef>\n#include <cstdio>\n#include <string>\n\nnamespace tesseract {\n\n// Reads a file as a vector of string.\ninline bool LoadFileLinesToStrings(const char *filename, std::vector<std::string> *lines) {\n  std::vector<char> data;\n  if (!LoadDataFromFile(filename, &data)) {\n    return false;\n  }\n  // TODO: optimize.\n  std::string lines_str(&data[0], data.size());\n  *lines = split(lines_str, '\\n');\n  return true;\n}\n\n// A class to manipulate FILE*s.\nclass TESS_UNICHARSET_TRAINING_API File {\npublic:\n  // Try to open the file 'filename' in mode 'mode'.\n  // Stop the program if it cannot open it.\n  static FILE *OpenOrDie(const std::string &filename, const std::string &mode);\n  static FILE *Open(const std::string &filename, const std::string &mode);\n\n  // Try to open the file 'filename' and to write 'str' in it.\n  // Stop the program if it fails.\n  static void 
WriteStringToFileOrDie(const std::string &str, const std::string &filename);\n\n  // Return true if the file 'filename' is readable.\n  static bool Readable(const std::string &filename);\n\n  static bool ReadFileToString(const std::string &filename, std::string *out);\n\n  // Helper methods\n\n  // Concatenate file paths removing any extra intervening '/' symbols.\n  static std::string JoinPath(const std::string &prefix, const std::string &suffix);\n  // Delete a filename or all filenames matching a glob pattern.\n  static bool Delete(const char *pathname);\n  static bool DeleteMatchingFiles(const char *pattern);\n};\n\n// A class to manipulate Files for reading.\nclass TESS_UNICHARSET_TRAINING_API InputBuffer {\npublic:\n  explicit InputBuffer(FILE *stream);\n  // 'size' is ignored.\n  InputBuffer(FILE *stream, size_t size);\n\n  ~InputBuffer();\n\n  // Read data until end-of-file.\n  // The data is stored in '*out'.\n  // Return false if an error occurs, true otherwise.\n  bool Read(std::string *out);\n\n  // Close the FILE* used by InputBuffer.\n  // Return false if an error occurs, true otherwise.\n  bool CloseFile();\n\nprivate:\n  FILE *stream_;\n};\n\n// A class to manipulate Files for writing.\nclass TESS_UNICHARSET_TRAINING_API OutputBuffer {\npublic:\n  explicit OutputBuffer(FILE *stream);\n  // 'size' is ignored.\n  OutputBuffer(FILE *stream, size_t size);\n\n  ~OutputBuffer();\n\n  // Write string 'str' to the open FILE*.\n  void WriteString(const std::string &str);\n\n  // Close the FILE* used by InputBuffer.\n  // Return false if an error occurs, true otherwise.\n  bool CloseFile();\n\nprivate:\n  FILE *stream_;\n};\n\n} // namespace tesseract\n#endif // TESSERACT_TRAINING_FILEIO_H_\n"
  },
  {
    "path": "src/training/unicharset/icuerrorcode.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"icuerrorcode.h\"\n\nnamespace tesseract {\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nIcuErrorCode::~IcuErrorCode() {\n  if (isFailure()) {\n    handleFailure();\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/unicharset/icuerrorcode.h",
    "content": "/**********************************************************************\n * File:        icuerrorcode.h\n * Description: Wrapper class for UErrorCode, with conversion operators for\n *              direct use in ICU C and C++ APIs.\n * Author:      Fredrik Roubert\n * Created:     Thu July 4 2013\n *\n * Features:\n * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,\n *   removing one common source of errors.\n * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking\n *   UErrorCode& (reference), via conversion operators.\n * - Automatic checking for success when it goes out of scope. On failure,\n *   the destructor will log an error message and exit.\n *\n * Most of ICU will handle errors gracefully and provide sensible fallbacks.\n * Using IcuErrorCode, it is therefore possible to write very compact code\n * that does sensible things on failure and provides logging for debugging.\n *\n * Example:\n * IcuErrorCode icuerrorcode;\n * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_\n#define TESSERACT_CCUTIL_ICUERRORCODE_H_\n\n#include <cstdlib> // for exit\n#include \"tprintf.h\"\n#include \"unicode/errorcode.h\" // From libicu\n\nnamespace tesseract {\n\nclass IcuErrorCode : public icu::ErrorCode 
{\npublic:\n  IcuErrorCode() = default;\n  ~IcuErrorCode() override;\n\nprotected:\n  void handleFailure() const override {\n    tprintf(\"ICU ERROR: %s\\n\", errorName());\n    exit(errorCode);\n  }\n\nprivate:\n  // Disallow implicit copying of object.\n  IcuErrorCode(const IcuErrorCode &) = delete;\n  void operator=(const IcuErrorCode &) = delete;\n};\n\n} // namespace tesseract\n#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_\n"
  },
  {
    "path": "src/training/unicharset/lang_model_helpers.cpp",
    "content": "// Copyright 2017 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n// Purpose: Collection of convenience functions to simplify creation of the\n//          unicharset, recoder, and dawgs for an LSTM model.\n\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"lang_model_helpers.h\"\n\n#include \"dawg.h\"\n#include \"fileio.h\"\n#include \"tessdatamanager.h\"\n#include \"trie.h\"\n#include \"unicharcompress.h\"\n\n#include <cstdlib>\n\n#include <sys/stat.h>\n#include <sys/types.h>\n\n#if defined(_WIN32)\n#  include <direct.h>\n#endif\n\nnamespace tesseract {\n\n// Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data\n// to the file, using writer if not null, otherwise, a default writer.\n// Default writer will overwrite any existing file, but a supplied writer\n// can do its own thing. If lang is empty, returns true but does nothing.\n// NOTE that suffix should contain any required . 
for the filename.\nbool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix,\n               const std::vector<char> &data, FileWriter writer) {\n  if (lang.empty()) {\n    return true;\n  }\n  std::string dirname = output_dir + \"/\" + lang;\n  // Attempt to make the directory, but ignore errors, as it may not be a\n  // standard filesystem, and the writer will complain if not successful.\n#if defined(_WIN32)\n  _mkdir(dirname.c_str());\n#else\n  mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);\n#endif\n  std::string filename = dirname + \"/\" + lang + suffix;\n  if (writer == nullptr) {\n    return SaveDataToFile(data, filename.c_str());\n  } else {\n    return (*writer)(data, filename.c_str());\n  }\n}\n\n// Helper reads a file with optional reader and returns a string.\n// On failure emits a warning message and returns an empty string.\nstd::string ReadFile(const std::string &filename, FileReader reader) {\n  if (filename.empty()) {\n    return std::string();\n  }\n  std::vector<char> data;\n  bool read_result;\n  if (reader == nullptr) {\n    read_result = LoadDataFromFile(filename.c_str(), &data);\n  } else {\n    read_result = (*reader)(filename.c_str(), &data);\n  }\n  if (read_result) {\n    return std::string(&data[0], data.size());\n  }\n  tprintf(\"Failed to read data from: %s\\n\", filename.c_str());\n  return std::string();\n}\n\n// Helper writes the unicharset to file and to the traineddata.\nbool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir,\n                     const std::string &lang, FileWriter writer, TessdataManager *traineddata) {\n  std::vector<char> unicharset_data;\n  TFile fp;\n  fp.OpenWrite(&unicharset_data);\n  if (!unicharset.save_to_file(&fp)) {\n    return false;\n  }\n  traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0],\n                              unicharset_data.size());\n  return WriteFile(output_dir, lang, \".unicharset\", 
unicharset_data, writer);\n}\n\n// Helper creates the recoder and writes it to the traineddata, and a human-\n// readable form to file.\nbool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir,\n                  const std::string &lang, FileWriter writer, std::string *radical_table_data,\n                  TessdataManager *traineddata) {\n  UnicharCompress recoder;\n  // Where the unicharset is carefully setup already to contain a good\n  // compact encoding, use a pass-through recoder that does nothing.\n  // For scripts that have a large number of unicodes (Han, Hangul) we want\n  // to use the recoder to compress the symbol space by re-encoding each\n  // unicode as multiple codes from a smaller 'alphabet' that are related to the\n  // shapes in the character. Hangul Jamo is a perfect example of this.\n  // See the Hangul Syllables section, sub-section \"Equivalence\" in:\n  // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf\n  if (pass_through) {\n    recoder.SetupPassThrough(unicharset);\n  } else {\n    int null_char = unicharset.has_special_codes() ? 
UNICHAR_BROKEN : unicharset.size();\n    tprintf(\"Null char=%d\\n\", null_char);\n    if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) {\n      tprintf(\"Creation of encoded unicharset failed!!\\n\");\n      return false;\n    }\n  }\n  TFile fp;\n  std::vector<char> recoder_data;\n  fp.OpenWrite(&recoder_data);\n  if (!recoder.Serialize(&fp)) {\n    return false;\n  }\n  traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0], recoder_data.size());\n  std::string encoding = recoder.GetEncodingAsString(unicharset);\n  recoder_data.resize(encoding.length(), 0);\n  memcpy(&recoder_data[0], &encoding[0], encoding.length());\n  std::string suffix;\n  suffix += \".charset_size=\" + std::to_string(recoder.code_range());\n  suffix += \".txt\";\n  return WriteFile(output_dir, lang, suffix, recoder_data, writer);\n}\n\n// Helper builds a dawg from the given words, using the unicharset as coding,\n// and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.\nstatic bool WriteDawg(const std::vector<std::string> &words, const UNICHARSET &unicharset,\n                      Trie::RTLReversePolicy reverse_policy, TessdataType file_type,\n                      TessdataManager *traineddata) {\n  // The first 3 arguments are not used in this case.\n  Trie trie(DAWG_TYPE_WORD, \"\", SYSTEM_DAWG_PERM, unicharset.size(), 0);\n  trie.add_word_list(words, unicharset, reverse_policy);\n  tprintf(\"Reducing Trie to SquishedDawg\\n\");\n  std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());\n  if (dawg == nullptr || dawg->NumEdges() == 0) {\n    return false;\n  }\n  TFile fp;\n  std::vector<char> dawg_data;\n  fp.OpenWrite(&dawg_data);\n  if (!dawg->write_squished_dawg(&fp)) {\n    return false;\n  }\n  traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size());\n  return true;\n}\n\n// Builds and writes the dawgs, given a set of words, punctuation\n// patterns, number patterns, to the traineddata. 
Encoding uses the given\n// unicharset, and the punc dawgs is reversed if lang_is_rtl.\nstatic bool WriteDawgs(const std::vector<std::string> &words, const std::vector<std::string> &puncs,\n                       const std::vector<std::string> &numbers, bool lang_is_rtl,\n                       const UNICHARSET &unicharset, TessdataManager *traineddata) {\n  if (puncs.empty()) {\n    tprintf(\"Must have non-empty puncs list to use language models!!\\n\");\n    return false;\n  }\n  // For each of the dawg types, make the dawg, and write to traineddata.\n  // Dawgs are reversed as follows:\n  // Words: According to the word content.\n  // Puncs: According to lang_is_rtl.\n  // Numbers: Never.\n  // System dawg (main wordlist).\n  if (!words.empty() && !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL,\n                                   TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) {\n    return false;\n  }\n  // punc/punc-dawg.\n  Trie::RTLReversePolicy reverse_policy =\n      lang_is_rtl ? 
Trie::RRP_FORCE_REVERSE : Trie::RRP_DO_NO_REVERSE;\n  if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG, traineddata)) {\n    return false;\n  }\n  // numbers/number-dawg.\n  if (!numbers.empty() && !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE,\n                                     TESSDATA_LSTM_NUMBER_DAWG, traineddata)) {\n    return false;\n  }\n  return true;\n}\n\n// The main function for combine_lang_model.cpp.\n// Returns EXIT_SUCCESS or EXIT_FAILURE for error.\nint CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir,\n                     const std::string &version_str, const std::string &output_dir,\n                     const std::string &lang, bool pass_through_recoder,\n                     const std::vector<std::string> &words, const std::vector<std::string> &puncs,\n                     const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader,\n                     FileWriter writer) {\n  // Build the traineddata file.\n  TessdataManager traineddata;\n  if (!version_str.empty()) {\n    traineddata.SetVersionString(traineddata.VersionString() + \":\" + version_str);\n  }\n  // Unicharset and recoder.\n  if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {\n    tprintf(\"Error writing unicharset!!\\n\");\n    return EXIT_FAILURE;\n  } else {\n    tprintf(\"Config file is optional, continuing...\\n\");\n  }\n  // If there is a config file, read it and add to traineddata.\n  std::string config_filename = script_dir + \"/\" + lang + \"/\" + lang + \".config\";\n  std::string config_file = ReadFile(config_filename, reader);\n  if (config_file.length() > 0) {\n    traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0], config_file.length());\n  }\n  std::string radical_filename = script_dir + \"/radical-stroke.txt\";\n  std::string radical_data = ReadFile(radical_filename, reader);\n  if (radical_data.empty()) {\n    tprintf(\"Error reading 
radical code table %s\\n\", radical_filename.c_str());\n    return EXIT_FAILURE;\n  }\n  if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer, &radical_data,\n                    &traineddata)) {\n    tprintf(\"Error writing recoder!!\\n\");\n  }\n  if (!words.empty() || !puncs.empty() || !numbers.empty()) {\n    if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset, &traineddata)) {\n      tprintf(\"Error during conversion of wordlists to DAWGs!!\\n\");\n      return EXIT_FAILURE;\n    }\n  }\n\n  // Traineddata file.\n  std::vector<char> traineddata_data;\n  traineddata.Serialize(&traineddata_data);\n  if (!WriteFile(output_dir, lang, \".traineddata\", traineddata_data, writer)) {\n    tprintf(\"Error writing output traineddata file!!\\n\");\n    return EXIT_FAILURE;\n  }\n  tprintf(\"Created %s/%s/%s.traineddata\", output_dir.c_str(), lang.c_str(), lang.c_str());\n  return EXIT_SUCCESS;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/lang_model_helpers.h",
    "content": "// Copyright 2017 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n// Purpose: Collection of convenience functions to simplify creation of the\n//          unicharset, recoder, and dawgs for an LSTM model.\n\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#ifndef TESSERACT_TRAINING_LANG_MODEL_HELPERS_H_\n#define TESSERACT_TRAINING_LANG_MODEL_HELPERS_H_\n\n#include \"export.h\"\n\n#include \"serialis.h\"\n#include \"tessdatamanager.h\"\n#include \"unicharset.h\"\n\n#include <string>\n\nnamespace tesseract {\n\n// Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data\n// to the file, using writer if not null, otherwise, a default writer.\n// Default writer will overwrite any existing file, but a supplied writer\n// can do its own thing. If lang is empty, returns true but does nothing.\n// NOTE that suffix should contain any required . 
for the filename.\nTESS_UNICHARSET_TRAINING_API\nbool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix,\n               const std::vector<char> &data, FileWriter writer);\n// Helper reads a file with optional reader and returns a string.\n// On failure emits a warning message and returns an empty string.\nTESS_UNICHARSET_TRAINING_API\nstd::string ReadFile(const std::string &filename, FileReader reader = nullptr);\n\n// Helper writes the unicharset to file and to the traineddata.\nbool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir,\n                     const std::string &lang, FileWriter writer, TessdataManager *traineddata);\n// Helper creates the recoder from the unicharset and writes it to the\n// traineddata, with a human-readable form to file at:\n// <output_dir>/<lang>/<lang>.charset_size=<num> for some num being the size\n// of the re-encoded character set. The charset_size file is written using\n// writer if not null, or using a default file writer otherwise, overwriting\n// any existing content.\n// If pass_through is true, then the recoder will be a no-op, passing the\n// unicharset codes through unchanged. Otherwise, the recoder will \"compress\"\n// the unicharset by encoding Hangul in Jamos, decomposing multi-unicode\n// symbols into sequences of unicodes, and encoding Han using the data in the\n// radical_table_data, which must be the content of the file:\n// langdata/radical-stroke.txt.\nbool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir,\n                  const std::string &lang, FileWriter writer, std::string *radical_table_data,\n                  TessdataManager *traineddata);\n\n// The main function for combine_lang_model.cpp.\n// Returns EXIT_SUCCESS or EXIT_FAILURE for error.\n// unicharset: can be a hand-created file with incomplete fields. 
Its basic\n//             and script properties will be set before it is used.\n// script_dir: should point to the langdata (github repo) directory.\n// version_str: arbitrary version label.\n// Output files will be written to <output_dir>/<lang>/<lang>.*\n// If pass_through_recoder is true, the unicharset will be used unchanged as\n// labels in the classifier, otherwise, the unicharset will be \"compressed\" to\n// make the recognition task simpler and faster.\n// The words/puncs/numbers lists may be all empty. If any are non-empty then\n// puncs must be non-empty.\n// lang_is_rtl indicates that the language is generally written from right\n// to left (eg Arabic/Hebrew).\nTESS_UNICHARSET_TRAINING_API\nint CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir,\n                     const std::string &version_str, const std::string &output_dir,\n                     const std::string &lang, bool pass_through_recoder,\n                     const std::vector<std::string> &words, const std::vector<std::string> &puncs,\n                     const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader,\n                     FileWriter writer);\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_LANG_MODEL_HELPERS_H_\n"
  },
  {
    "path": "src/training/unicharset/lstmtester.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmtester.cpp\n// Description: Top-level line evaluation class for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"lstmtester.h\"\n#include <iomanip>  // for std::setprecision\n#include <thread>   // for std::thread\n#include \"fileio.h\" // for LoadFileLinesToStrings\n\nnamespace tesseract {\n\nLSTMTester::LSTMTester(int64_t max_memory) : test_data_(max_memory) {}\n\n// Loads a set of lstmf files that were created using the lstm.train config to\n// tesseract into memory ready for testing. Returns false if nothing was\n// loaded. The arg is a filename of a file that lists the filenames.\nbool LSTMTester::LoadAllEvalData(const char *filenames_file) {\n  std::vector<std::string> filenames;\n  if (!LoadFileLinesToStrings(filenames_file, &filenames)) {\n    tprintf(\"Failed to load list of eval filenames from %s\\n\", filenames_file);\n    return false;\n  }\n  return LoadAllEvalData(filenames);\n}\n\n// Loads a set of lstmf files that were created using the lstm.train config to\n// tesseract into memory ready for testing. 
Returns false if nothing was\n// loaded.\nbool LSTMTester::LoadAllEvalData(const std::vector<std::string> &filenames) {\n  test_data_.Clear();\n  bool result = test_data_.LoadDocuments(filenames, CS_SEQUENTIAL, nullptr);\n  total_pages_ = test_data_.TotalPages();\n  return result;\n}\n\n// Runs an evaluation asynchronously on the stored data and returns a string\n// describing the results of the previous test.\nstd::string LSTMTester::RunEvalAsync(int iteration, const double *training_errors,\n                                     const TessdataManager &model_mgr, int training_stage) {\n  std::string result;\n  if (total_pages_ == 0) {\n    result += \"No test data at iteration \" + std::to_string(iteration);\n    return result;\n  }\n  if (!LockIfNotRunning()) {\n    result += \"Previous test incomplete, skipping test at iteration \" + std::to_string(iteration);\n    return result;\n  }\n  // Save the args.\n  std::string prev_result = test_result_;\n  test_result_ = \"\";\n  if (training_errors != nullptr) {\n    test_iteration_ = iteration;\n    test_training_errors_ = training_errors;\n    test_model_mgr_ = model_mgr;\n    test_training_stage_ = training_stage;\n    std::thread t(&LSTMTester::ThreadFunc, this);\n    t.detach();\n  } else {\n    UnlockRunning();\n  }\n  return prev_result;\n}\n\n// Runs an evaluation synchronously on the stored data and returns a string\n// describing the results.\nstd::string LSTMTester::RunEvalSync(int iteration, const double *training_errors,\n                                    const TessdataManager &model_mgr, int training_stage,\n                                    int verbosity) {\n  LSTMTrainer trainer;\n  trainer.InitCharSet(model_mgr);\n  TFile fp;\n  if (!model_mgr.GetComponent(TESSDATA_LSTM, &fp) || !trainer.DeSerialize(&model_mgr, &fp)) {\n    return \"Deserialize failed\";\n  }\n  int eval_iteration = 0;\n  double char_error = 0.0;\n  double word_error = 0.0;\n  int error_count = 0;\n  while (error_count < 
total_pages_) {\n    const ImageData *trainingdata = test_data_.GetPageBySerial(eval_iteration);\n    trainer.SetIteration(++eval_iteration);\n    NetworkIO fwd_outputs, targets;\n    Trainability result = trainer.PrepareForBackward(trainingdata, &fwd_outputs, &targets);\n    if (result != UNENCODABLE) {\n      char_error += trainer.NewSingleError(tesseract::ET_CHAR_ERROR);\n      word_error += trainer.NewSingleError(tesseract::ET_WORD_RECERR);\n      ++error_count;\n      if (verbosity > 1 || (verbosity > 0 && result != PERFECT)) {\n        tprintf(\"Truth:%s\\n\", trainingdata->transcription().c_str());\n        std::vector<int> ocr_labels;\n        std::vector<int> xcoords;\n        trainer.LabelsFromOutputs(fwd_outputs, &ocr_labels, &xcoords);\n        std::string ocr_text = trainer.DecodeLabels(ocr_labels);\n        tprintf(\"OCR  :%s\\n\", ocr_text.c_str());\n        if (verbosity > 2 || (verbosity > 1 && result != PERFECT)) {\n          tprintf(\"Line BCER=%f, BWER=%f\\n\\n\",\n                  trainer.NewSingleError(tesseract::ET_CHAR_ERROR),\n                  trainer.NewSingleError(tesseract::ET_WORD_RECERR));\n        }\n      }\n    }\n  }\n  char_error *= 100.0 / total_pages_;\n  word_error *= 100.0 / total_pages_;\n  std::stringstream result;\n  result.imbue(std::locale::classic());\n  result << std::fixed << std::setprecision(3);\n  if (iteration != 0 || training_stage != 0) {\n    result << \"At iteration \" << iteration\n           << \", stage \" << training_stage << \", \";\n  }\n  result << \"BCER eval=\" << char_error << \", BWER eval=\" << word_error;\n  return result.str();\n}\n\n// Helper thread function for RunEvalAsync.\n// LockIfNotRunning must have returned true before calling ThreadFunc, and\n// it will call UnlockRunning to release the lock after RunEvalSync completes.\nvoid LSTMTester::ThreadFunc() {\n  test_result_ =\n      RunEvalSync(test_iteration_, test_training_errors_, test_model_mgr_, test_training_stage_,\n                  
/*verbosity*/ 0);\n  UnlockRunning();\n}\n\n// Returns true if there is currently nothing running, and takes the lock\n// if there is nothing running.\nbool LSTMTester::LockIfNotRunning() {\n  std::lock_guard<std::mutex> lock(running_mutex_);\n  if (async_running_) {\n    return false;\n  }\n  async_running_ = true;\n  return true;\n}\n\n// Releases the running lock.\nvoid LSTMTester::UnlockRunning() {\n  std::lock_guard<std::mutex> lock(running_mutex_);\n  async_running_ = false;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/lstmtester.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmtester.h\n// Description: Top-level line evaluation class for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2016, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TRAINING_LSTMTESTER_H_\n#define TESSERACT_TRAINING_LSTMTESTER_H_\n\n#include \"export.h\"\n\n#include \"lstmtrainer.h\"\n\n#include <mutex>\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\nclass TESS_UNICHARSET_TRAINING_API LSTMTester {\npublic:\n  LSTMTester(int64_t max_memory);\n\n  // Loads a set of lstmf files that were created using the lstm.train config to\n  // tesseract into memory ready for testing. Returns false if nothing was\n  // loaded. The arg is a filename of a file that lists the filenames, with one\n  // name per line. Conveniently, tesstrain.py generates such a file, along\n  // with the files themselves.\n  bool LoadAllEvalData(const char *filenames_file);\n  // Loads a set of lstmf files that were created using the lstm.train config to\n  // tesseract into memory ready for testing. Returns false if nothing was\n  // loaded.\n  bool LoadAllEvalData(const std::vector<std::string> &filenames);\n\n  // Runs an evaluation asynchronously on the stored eval data and returns a\n  // string describing the results of the previous test. 
Args match TestCallback\n  // declared in lstmtrainer.h:\n  // iteration: Current learning iteration number.\n  // training_errors: If not null, is an array of size ET_COUNT, indexed by\n  //   the ErrorTypes enum and indicates the current errors measured by the\n  //   trainer, and this is a serious request to run an evaluation. If null,\n  //   then the caller is just polling for the results of the previous eval.\n  // model_data: is the model to evaluate, which should be a serialized\n  //   LSTMTrainer.\n  // training_stage: an arbitrary number on the progress of training.\n  std::string RunEvalAsync(int iteration, const double *training_errors,\n                           const TessdataManager &model_mgr, int training_stage);\n  // Runs an evaluation synchronously on the stored eval data and returns a\n  // string describing the results. Args as RunEvalAsync, except verbosity,\n  // which outputs errors, if 1, or all results if 2.\n  std::string RunEvalSync(int iteration, const double *training_errors, const TessdataManager &model_mgr,\n                          int training_stage, int verbosity);\n\nprivate:\n  // Helper thread function for RunEvalAsync.\n  // LockIfNotRunning must have returned true before calling ThreadFunc, and\n  // it will call UnlockRunning to release the lock after RunEvalSync completes.\n  void ThreadFunc();\n  // Returns true if there is currently nothing running, and takes the lock\n  // if there is nothing running.\n  bool LockIfNotRunning();\n  // Releases the running lock.\n  void UnlockRunning();\n\n  // The data to test with.\n  DocumentCache test_data_;\n  int total_pages_ = 0;\n  // Flag that indicates an asynchronous test is currently running.\n  // Protected by running_mutex_.\n  bool async_running_ = false;\n  std::mutex running_mutex_;\n  // Stored copies of the args for use while running asynchronously.\n  int test_iteration_ = 0;\n  const double *test_training_errors_ = nullptr;\n  TessdataManager test_model_mgr_;\n  
int test_training_stage_ = 0;\n  std::string test_result_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_LSTMTESTER_H_\n"
  },
  {
    "path": "src/training/unicharset/lstmtrainer.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmtrainer.cpp\n// Description: Top-level line trainer class for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#define _USE_MATH_DEFINES // needed to get definition of M_SQRT1_2\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include <cmath>\n#include <iomanip>             // for std::setprecision\n#include <locale>              // for std::locale::classic\n#include <string>\n#include \"lstmtrainer.h\"\n\n#include <allheaders.h>\n#include \"boxread.h\"\n#include \"ctc.h\"\n#include \"imagedata.h\"\n#include \"input.h\"\n#include \"networkbuilder.h\"\n#include \"ratngs.h\"\n#include \"recodebeam.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Min actual error rate increase to constitute divergence.\nconst double kMinDivergenceRate = 50.0;\n// Min iterations since last best before acting on a stall.\nconst int kMinStallIterations = 10000;\n// Fraction of current char error rate that sub_trainer_ has to be ahead\n// before we declare the sub_trainer_ a success and switch to it.\nconst double kSubTrainerMarginFraction = 3.0 / 128;\n// Factor to reduce learning rate on divergence.\nconst double 
kLearningRateDecay = M_SQRT1_2;\n// LR adjustment iterations.\nconst int kNumAdjustmentIterations = 100;\n// How often to add data to the error_graph_.\nconst int kErrorGraphInterval = 1000;\n// Number of training images to train between calls to MaintainCheckpoints.\nconst int kNumPagesPerBatch = 100;\n// Min percent error rate to consider start-up phase over.\nconst int kMinStartedErrorRate = 75;\n// Error rate at which to transition to stage 1.\nconst double kStageTransitionThreshold = 10.0;\n// Confidence beyond which the truth is more likely wrong than the recognizer.\nconst double kHighConfidence = 0.9375; // 15/16.\n// Fraction of weight sign-changing total to constitute a definite improvement.\nconst double kImprovementFraction = 15.0 / 16.0;\n// Fraction of last written best to make it worth writing another.\nconst double kBestCheckpointFraction = 31.0 / 32.0;\n#ifndef GRAPHICS_DISABLED\n// Scale factor for display of target activations of CTC.\nconst int kTargetXScale = 5;\nconst int kTargetYScale = 100;\n#endif // !GRAPHICS_DISABLED\n\nLSTMTrainer::LSTMTrainer()\n    : randomly_rotate_(false), training_data_(0), sub_trainer_(nullptr) {\n  EmptyConstructor();\n  debug_interval_ = 0;\n}\n\nLSTMTrainer::LSTMTrainer(const std::string &model_base, const std::string &checkpoint_name,\n                         int debug_interval, int64_t max_memory)\n    : randomly_rotate_(false),\n      training_data_(max_memory),\n      sub_trainer_(nullptr) {\n  EmptyConstructor();\n  debug_interval_ = debug_interval;\n  model_base_ = model_base;\n  checkpoint_name_ = checkpoint_name;\n}\n\nLSTMTrainer::~LSTMTrainer() {\n#ifndef GRAPHICS_DISABLED\n  delete align_win_;\n  delete target_win_;\n  delete ctc_win_;\n  delete recon_win_;\n#endif\n}\n\n// Tries to deserialize a trainer from the given file and silently returns\n// false in case of failure.\nbool LSTMTrainer::TryLoadingCheckpoint(const char *filename,\n                                       const char 
*old_traineddata) {\n  std::vector<char> data;\n  if (!LoadDataFromFile(filename, &data)) {\n    return false;\n  }\n  tprintf(\"Loaded file %s, unpacking...\\n\", filename);\n  if (!ReadTrainingDump(data, *this)) {\n    return false;\n  }\n  if (IsIntMode()) {\n    tprintf(\"Error, %s is an integer (fast) model, cannot continue training\\n\",\n            filename);\n    return false;\n  }\n  if (((old_traineddata == nullptr || *old_traineddata == '\\0') &&\n       network_->NumOutputs() == recoder_.code_range()) ||\n      filename == old_traineddata) {\n    return true; // Normal checkpoint load complete.\n  }\n  tprintf(\"Code range changed from %d to %d!\\n\", network_->NumOutputs(),\n          recoder_.code_range());\n  if (old_traineddata == nullptr || *old_traineddata == '\\0') {\n    tprintf(\"Must supply the old traineddata for code conversion!\\n\");\n    return false;\n  }\n  TessdataManager old_mgr;\n  ASSERT_HOST(old_mgr.Init(old_traineddata));\n  TFile fp;\n  if (!old_mgr.GetComponent(TESSDATA_LSTM_UNICHARSET, &fp)) {\n    return false;\n  }\n  UNICHARSET old_chset;\n  if (!old_chset.load_from_file(&fp, false)) {\n    return false;\n  }\n  if (!old_mgr.GetComponent(TESSDATA_LSTM_RECODER, &fp)) {\n    return false;\n  }\n  UnicharCompress old_recoder;\n  if (!old_recoder.DeSerialize(&fp)) {\n    return false;\n  }\n  std::vector<int> code_map = MapRecoder(old_chset, old_recoder);\n  // Set the null_char_ to the new value.\n  int old_null_char = null_char_;\n  SetNullChar();\n  // Map the softmax(s) in the network.\n  network_->RemapOutputs(old_recoder.code_range(), code_map);\n  tprintf(\"Previous null char=%d mapped to %d\\n\", old_null_char, null_char_);\n  return true;\n}\n\n// Initializes the trainer with a network_spec in the network description\n// net_flags control network behavior according to the NetworkFlags enum.\n// There isn't really much difference between them - only where the effects\n// are implemented.\n// For other args see 
NetworkBuilder::InitNetwork.\n// Note: Be sure to call InitCharSet before InitNetwork!\nbool LSTMTrainer::InitNetwork(const char *network_spec, int append_index,\n                              int net_flags, float weight_range,\n                              float learning_rate, float momentum,\n                              float adam_beta) {\n  mgr_.SetVersionString(mgr_.VersionString() + \":\" + network_spec);\n  adam_beta_ = adam_beta;\n  learning_rate_ = learning_rate;\n  momentum_ = momentum;\n  SetNullChar();\n  if (!NetworkBuilder::InitNetwork(recoder_.code_range(), network_spec,\n                                   append_index, net_flags, weight_range,\n                                   &randomizer_, &network_)) {\n    return false;\n  }\n  network_str_ += network_spec;\n  tprintf(\"Built network:%s from request %s\\n\", network_->spec().c_str(),\n          network_spec);\n  tprintf(\n      \"Training parameters:\\n  Debug interval = %d,\"\n      \" weights = %g, learning rate = %g, momentum=%g\\n\",\n      debug_interval_, weight_range, learning_rate_, momentum_);\n  tprintf(\"null char=%d\\n\", null_char_);\n  return true;\n}\n\n// Resets all the iteration counters for fine tuning or traininng a head,\n// where we want the error reporting to reset.\nvoid LSTMTrainer::InitIterations() {\n  sample_iteration_ = 0;\n  training_iteration_ = 0;\n  learning_iteration_ = 0;\n  prev_sample_iteration_ = 0;\n  best_error_rate_ = 100.0;\n  best_iteration_ = 0;\n  worst_error_rate_ = 0.0;\n  worst_iteration_ = 0;\n  stall_iteration_ = kMinStallIterations;\n  best_error_history_.clear();\n  best_error_iterations_.clear();\n  improvement_steps_ = kMinStallIterations;\n  perfect_delay_ = 0;\n  last_perfect_training_iteration_ = 0;\n  for (int i = 0; i < ET_COUNT; ++i) {\n    best_error_rates_[i] = 100.0;\n    worst_error_rates_[i] = 0.0;\n    error_buffers_[i].clear();\n    error_buffers_[i].resize(kRollingBufferSize_);\n    error_rates_[i] = 100.0;\n  }\n  
error_rate_of_last_saved_best_ = kMinStartedErrorRate;\n}\n\n// If the training sample is usable, grid searches for the optimal\n// dict_ratio/cert_offset, and returns the results in a string of space-\n// separated triplets of ratio,offset=worderr.\nTrainability LSTMTrainer::GridSearchDictParams(\n    const ImageData *trainingdata, int iteration, double min_dict_ratio,\n    double dict_ratio_step, double max_dict_ratio, double min_cert_offset,\n    double cert_offset_step, double max_cert_offset, std::string &results) {\n  sample_iteration_ = iteration;\n  NetworkIO fwd_outputs, targets;\n  Trainability result =\n      PrepareForBackward(trainingdata, &fwd_outputs, &targets);\n  if (result == UNENCODABLE || result == HI_PRECISION_ERR || dict_ == nullptr) {\n    return result;\n  }\n\n  // Encode/decode the truth to get the normalization.\n  std::vector<int> truth_labels, ocr_labels, xcoords;\n  ASSERT_HOST(EncodeString(trainingdata->transcription(), &truth_labels));\n  // NO-dict error.\n  RecodeBeamSearch base_search(recoder_, null_char_, SimpleTextOutput(),\n                               nullptr);\n  base_search.Decode(fwd_outputs, 1.0, 0.0, RecodeBeamSearch::kMinCertainty,\n                     nullptr);\n  base_search.ExtractBestPathAsLabels(&ocr_labels, &xcoords);\n  std::string truth_text = DecodeLabels(truth_labels);\n  std::string ocr_text = DecodeLabels(ocr_labels);\n  double baseline_error = ComputeWordError(&truth_text, &ocr_text);\n  results += \"0,0=\" + std::to_string(baseline_error);\n\n  RecodeBeamSearch search(recoder_, null_char_, SimpleTextOutput(), dict_);\n  for (double r = min_dict_ratio; r < max_dict_ratio; r += dict_ratio_step) {\n    for (double c = min_cert_offset; c < max_cert_offset;\n         c += cert_offset_step) {\n      search.Decode(fwd_outputs, r, c, RecodeBeamSearch::kMinCertainty,\n                    nullptr);\n      search.ExtractBestPathAsLabels(&ocr_labels, &xcoords);\n      truth_text = DecodeLabels(truth_labels);\n      
ocr_text = DecodeLabels(ocr_labels);\n      // This is destructive on both strings.\n      double word_error = ComputeWordError(&truth_text, &ocr_text);\n      if ((r == min_dict_ratio && c == min_cert_offset) ||\n          !std::isfinite(word_error)) {\n        std::string t = DecodeLabels(truth_labels);\n        std::string o = DecodeLabels(ocr_labels);\n        tprintf(\"r=%g, c=%g, truth=%s, ocr=%s, wderr=%g, truth[0]=%d\\n\", r, c,\n                t.c_str(), o.c_str(), word_error, truth_labels[0]);\n      }\n      results += \" \" + std::to_string(r);\n      results += \",\" + std::to_string(c);\n      results += \"=\" + std::to_string(word_error);\n    }\n  }\n  return result;\n}\n\n// Provides output on the distribution of weight values.\nvoid LSTMTrainer::DebugNetwork() {\n  network_->DebugWeights();\n}\n\n// Loads a set of lstmf files that were created using the lstm.train config to\n// tesseract into memory ready for training. Returns false if nothing was\n// loaded.\nbool LSTMTrainer::LoadAllTrainingData(const std::vector<std::string> &filenames,\n                                      CachingStrategy cache_strategy,\n                                      bool randomly_rotate) {\n  randomly_rotate_ = randomly_rotate;\n  training_data_.Clear();\n  return training_data_.LoadDocuments(filenames, cache_strategy,\n                                      LoadDataFromFile);\n}\n\n// Keeps track of best and locally worst char error_rate and launches tests\n// using tester, when a new min or max is reached.\n// Writes checkpoints at appropriate times and builds and returns a log message\n// to indicate progress. 
Returns false if nothing interesting happened.\nbool LSTMTrainer::MaintainCheckpoints(const TestCallback &tester,\n                                      std::stringstream &log_msg) {\n  PrepareLogMsg(log_msg);\n  double error_rate = CharError();\n  int iteration = learning_iteration();\n  if (iteration >= stall_iteration_ &&\n      error_rate > best_error_rate_ * (1.0 + kSubTrainerMarginFraction) &&\n      best_error_rate_ < kMinStartedErrorRate && !best_trainer_.empty()) {\n    // It hasn't got any better in a long while, and is a margin worse than the\n    // best, so go back to the best model and try a different learning rate.\n    StartSubtrainer(log_msg);\n  }\n  SubTrainerResult sub_trainer_result = STR_NONE;\n  if (sub_trainer_ != nullptr) {\n    sub_trainer_result = UpdateSubtrainer(log_msg);\n    if (sub_trainer_result == STR_REPLACED) {\n      // Reset the inputs, as we have overwritten *this.\n      error_rate = CharError();\n      iteration = learning_iteration();\n      PrepareLogMsg(log_msg);\n    }\n  }\n  bool result = true; // Something interesting happened.\n  std::vector<char> rec_model_data;\n  if (error_rate < best_error_rate_) {\n    SaveRecognitionDump(&rec_model_data);\n    log_msg << \" New best BCER = \" << error_rate;\n    log_msg << UpdateErrorGraph(iteration, error_rate, rec_model_data, tester);\n    // If sub_trainer_ is not nullptr, either *this beat it to a new best, or it\n    // just overwrote *this. 
In either case, we have finished with it.\n    sub_trainer_.reset();\n    stall_iteration_ = learning_iteration() + kMinStallIterations;\n    if (TransitionTrainingStage(kStageTransitionThreshold)) {\n      log_msg << \" Transitioned to stage \" << CurrentTrainingStage();\n    }\n    SaveTrainingDump(NO_BEST_TRAINER, *this, &best_trainer_);\n    if (error_rate < error_rate_of_last_saved_best_ * kBestCheckpointFraction) {\n      std::string best_model_name = DumpFilename();\n      if (!SaveDataToFile(best_trainer_, best_model_name.c_str())) {\n        log_msg << \" failed to write best model:\";\n      } else {\n        log_msg << \" wrote best model:\";\n        error_rate_of_last_saved_best_ = best_error_rate_;\n      }\n      log_msg << best_model_name;\n    }\n  } else if (error_rate > worst_error_rate_) {\n    SaveRecognitionDump(&rec_model_data);\n    log_msg << \" New worst BCER = \" << error_rate;\n    log_msg << UpdateErrorGraph(iteration, error_rate, rec_model_data, tester);\n    if (worst_error_rate_ > best_error_rate_ + kMinDivergenceRate &&\n        best_error_rate_ < kMinStartedErrorRate && !best_trainer_.empty()) {\n      // Error rate has ballooned. Go back to the best model.\n      log_msg << \"\\nDivergence! 
\";\n      // Copy best_trainer_ before reading it, as it will get overwritten.\n      std::vector<char> revert_data(best_trainer_);\n      if (ReadTrainingDump(revert_data, *this)) {\n        LogIterations(\"Reverted to\", log_msg);\n        ReduceLearningRates(this, log_msg);\n      } else {\n        LogIterations(\"Failed to Revert at\", log_msg);\n      }\n      // If it fails again, we will wait twice as long before reverting again.\n      stall_iteration_ = iteration + 2 * (iteration - learning_iteration());\n      // Re-save the best trainer with the new learning rates and stall\n      // iteration.\n      SaveTrainingDump(NO_BEST_TRAINER, *this, &best_trainer_);\n    }\n  } else {\n    // Something interesting happened only if the sub_trainer_ was trained.\n    result = sub_trainer_result != STR_NONE;\n  }\n  if (checkpoint_name_.length() > 0) {\n    // Write a current checkpoint.\n    std::vector<char> checkpoint;\n    if (!SaveTrainingDump(FULL, *this, &checkpoint) ||\n        !SaveDataToFile(checkpoint, checkpoint_name_.c_str())) {\n      log_msg << \" failed to write checkpoint.\";\n    } else {\n      log_msg << \" wrote checkpoint.\";\n    }\n  }\n  return result;\n}\n\n// Builds a string containing a progress message with current error rates.\nvoid LSTMTrainer::PrepareLogMsg(std::stringstream &log_msg) const {\n  LogIterations(\"At\", log_msg);\n  log_msg << std::fixed << std::setprecision(3)\n          << \", mean rms=\" << error_rates_[ET_RMS]\n          << \"%, delta=\" << error_rates_[ET_DELTA]\n          << \"%, BCER train=\" << error_rates_[ET_CHAR_ERROR]\n          << \"%, BWER train=\" << error_rates_[ET_WORD_RECERR]\n          << \"%, skip ratio=\" << error_rates_[ET_SKIP_RATIO] << \"%,\";\n}\n\n// Appends <intro_str> iteration learning_iteration()/training_iteration()/\n// sample_iteration() to the log_msg.\nvoid LSTMTrainer::LogIterations(const char *intro_str,\n                                std::stringstream &log_msg) const {\n  log_msg 
<< intro_str\n          << \" iteration \" << learning_iteration()\n          << \"/\" << training_iteration()\n          << \"/\" << sample_iteration();\n}\n\n// Returns true and increments the training_stage_ if the error rate has just\n// passed through the given threshold for the first time.\nbool LSTMTrainer::TransitionTrainingStage(float error_threshold) {\n  if (best_error_rate_ < error_threshold &&\n      training_stage_ + 1 < num_training_stages_) {\n    ++training_stage_;\n    return true;\n  }\n  return false;\n}\n\n// Writes to the given file. Returns false in case of error.\nbool LSTMTrainer::Serialize(SerializeAmount serialize_amount,\n                            const TessdataManager *mgr, TFile *fp) const {\n  if (!LSTMRecognizer::Serialize(mgr, fp)) {\n    return false;\n  }\n  if (!fp->Serialize(&learning_iteration_)) {\n    return false;\n  }\n  if (!fp->Serialize(&prev_sample_iteration_)) {\n    return false;\n  }\n  if (!fp->Serialize(&perfect_delay_)) {\n    return false;\n  }\n  if (!fp->Serialize(&last_perfect_training_iteration_)) {\n    return false;\n  }\n  for (const auto &error_buffer : error_buffers_) {\n    if (!fp->Serialize(error_buffer)) {\n      return false;\n    }\n  }\n  if (!fp->Serialize(&error_rates_[0], countof(error_rates_))) {\n    return false;\n  }\n  if (!fp->Serialize(&training_stage_)) {\n    return false;\n  }\n  uint8_t amount = serialize_amount;\n  if (!fp->Serialize(&amount)) {\n    return false;\n  }\n  if (serialize_amount == LIGHT) {\n    return true; // We are done.\n  }\n  if (!fp->Serialize(&best_error_rate_)) {\n    return false;\n  }\n  if (!fp->Serialize(&best_error_rates_[0], countof(best_error_rates_))) {\n    return false;\n  }\n  if (!fp->Serialize(&best_iteration_)) {\n    return false;\n  }\n  if (!fp->Serialize(&worst_error_rate_)) {\n    return false;\n  }\n  if (!fp->Serialize(&worst_error_rates_[0], countof(worst_error_rates_))) {\n    return false;\n  }\n  if 
(!fp->Serialize(&worst_iteration_)) {\n    return false;\n  }\n  if (!fp->Serialize(&stall_iteration_)) {\n    return false;\n  }\n  if (!fp->Serialize(best_model_data_)) {\n    return false;\n  }\n  if (!fp->Serialize(worst_model_data_)) {\n    return false;\n  }\n  if (serialize_amount != NO_BEST_TRAINER && !fp->Serialize(best_trainer_)) {\n    return false;\n  }\n  std::vector<char> sub_data;\n  if (sub_trainer_ != nullptr &&\n      !SaveTrainingDump(LIGHT, *sub_trainer_, &sub_data)) {\n    return false;\n  }\n  if (!fp->Serialize(sub_data)) {\n    return false;\n  }\n  if (!fp->Serialize(best_error_history_)) {\n    return false;\n  }\n  if (!fp->Serialize(best_error_iterations_)) {\n    return false;\n  }\n  return fp->Serialize(&improvement_steps_);\n}\n\n// Reads from the given file. Returns false in case of error.\n// NOTE: It is assumed that the trainer is never read cross-endian.\nbool LSTMTrainer::DeSerialize(const TessdataManager *mgr, TFile *fp) {\n  if (!LSTMRecognizer::DeSerialize(mgr, fp)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&learning_iteration_)) {\n    // Special case. 
If we successfully decoded the recognizer, but fail here\n    // then it means we were just given a recognizer, so issue a warning and\n    // allow it.\n    tprintf(\"Warning: LSTMTrainer deserialized an LSTMRecognizer!\\n\");\n    learning_iteration_ = 0;\n    network_->SetEnableTraining(TS_ENABLED);\n    return true;\n  }\n  if (!fp->DeSerialize(&prev_sample_iteration_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&perfect_delay_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&last_perfect_training_iteration_)) {\n    return false;\n  }\n  for (auto &error_buffer : error_buffers_) {\n    if (!fp->DeSerialize(error_buffer)) {\n      return false;\n    }\n  }\n  if (!fp->DeSerialize(&error_rates_[0], countof(error_rates_))) {\n    return false;\n  }\n  if (!fp->DeSerialize(&training_stage_)) {\n    return false;\n  }\n  uint8_t amount;\n  if (!fp->DeSerialize(&amount)) {\n    return false;\n  }\n  if (amount == LIGHT) {\n    return true; // Don't read the rest.\n  }\n  if (!fp->DeSerialize(&best_error_rate_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&best_error_rates_[0], countof(best_error_rates_))) {\n    return false;\n  }\n  if (!fp->DeSerialize(&best_iteration_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&worst_error_rate_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&worst_error_rates_[0], countof(worst_error_rates_))) {\n    return false;\n  }\n  if (!fp->DeSerialize(&worst_iteration_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(&stall_iteration_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(best_model_data_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(worst_model_data_)) {\n    return false;\n  }\n  if (amount != NO_BEST_TRAINER && !fp->DeSerialize(best_trainer_)) {\n    return false;\n  }\n  std::vector<char> sub_data;\n  if (!fp->DeSerialize(sub_data)) {\n    return false;\n  }\n  if (sub_data.empty()) {\n    sub_trainer_ = nullptr;\n  } else {\n    sub_trainer_ = std::make_unique<LSTMTrainer>();\n   
 if (!ReadTrainingDump(sub_data, *sub_trainer_)) {\n      return false;\n    }\n  }\n  if (!fp->DeSerialize(best_error_history_)) {\n    return false;\n  }\n  if (!fp->DeSerialize(best_error_iterations_)) {\n    return false;\n  }\n  return fp->DeSerialize(&improvement_steps_);\n}\n\n// De-serializes the saved best_trainer_ into sub_trainer_, and adjusts the\n// learning rates (by scaling reduction, or layer specific, according to\n// NF_LAYER_SPECIFIC_LR).\nvoid LSTMTrainer::StartSubtrainer(std::stringstream &log_msg) {\n  sub_trainer_ = std::make_unique<LSTMTrainer>();\n  if (!ReadTrainingDump(best_trainer_, *sub_trainer_)) {\n    log_msg << \" Failed to revert to previous best for trial!\";\n    sub_trainer_.reset();\n  } else {\n    log_msg << \" Trial sub_trainer_ from iteration \"\n            << sub_trainer_->training_iteration();\n    // Reduce learning rate so it doesn't diverge this time.\n    sub_trainer_->ReduceLearningRates(this, log_msg);\n    // If it fails again, we will wait twice as long before reverting again.\n    int stall_offset =\n        learning_iteration() - sub_trainer_->learning_iteration();\n    stall_iteration_ = learning_iteration() + 2 * stall_offset;\n    sub_trainer_->stall_iteration_ = stall_iteration_;\n    // Re-save the best trainer with the new learning rates and stall iteration.\n    SaveTrainingDump(NO_BEST_TRAINER, *sub_trainer_, &best_trainer_);\n  }\n}\n\n// While the sub_trainer_ is behind the current training iteration and its\n// training error is at least kSubTrainerMarginFraction better than the\n// current training error, trains the sub_trainer_, and returns STR_UPDATED if\n// it did anything. If it catches up, and has a better error rate than the\n// current best, as well as a margin over the current error rate, then the\n// trainer in *this is replaced with sub_trainer_, and STR_REPLACED is\n// returned. 
STR_NONE is returned if the subtrainer wasn't good enough to\n// receive any training iterations.\nSubTrainerResult LSTMTrainer::UpdateSubtrainer(std::stringstream &log_msg) {\n  double training_error = CharError();\n  double sub_error = sub_trainer_->CharError();\n  double sub_margin = (training_error - sub_error) / sub_error;\n  if (sub_margin >= kSubTrainerMarginFraction) {\n    log_msg << \" sub_trainer=\" << sub_error\n            << \" margin=\" << 100.0 * sub_margin << \"\\n\";\n    // Catch up to current iteration.\n    int end_iteration = training_iteration();\n    while (sub_trainer_->training_iteration() < end_iteration &&\n           sub_margin >= kSubTrainerMarginFraction) {\n      int target_iteration =\n          sub_trainer_->training_iteration() + kNumPagesPerBatch;\n      while (sub_trainer_->training_iteration() < target_iteration) {\n        sub_trainer_->TrainOnLine(this, false);\n      }\n      std::stringstream batch_log(\"Sub:\");\n      batch_log.imbue(std::locale::classic());\n      sub_trainer_->PrepareLogMsg(batch_log);\n      batch_log << \"\\n\";\n      tprintf(\"UpdateSubtrainer:%s\", batch_log.str().c_str());\n      log_msg << batch_log.str();\n      sub_error = sub_trainer_->CharError();\n      sub_margin = (training_error - sub_error) / sub_error;\n    }\n    if (sub_error < best_error_rate_ &&\n        sub_margin >= kSubTrainerMarginFraction) {\n      // The sub_trainer_ has won the race to a new best. 
Switch to it.\n      std::vector<char> updated_trainer;\n      SaveTrainingDump(LIGHT, *sub_trainer_, &updated_trainer);\n      ReadTrainingDump(updated_trainer, *this);\n      log_msg << \" Sub trainer wins at iteration \"\n              << training_iteration() << \"\\n\";\n      return STR_REPLACED;\n    }\n    return STR_UPDATED;\n  }\n  return STR_NONE;\n}\n\n// Reduces network learning rates, either for everything, or for layers\n// independently, according to NF_LAYER_SPECIFIC_LR.\nvoid LSTMTrainer::ReduceLearningRates(LSTMTrainer *samples_trainer,\n                                      std::stringstream &log_msg) {\n  if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) {\n    int num_reduced = ReduceLayerLearningRates(\n        kLearningRateDecay, kNumAdjustmentIterations, samples_trainer);\n    log_msg << \"\\nReduced learning rate on layers: \" << num_reduced;\n  } else {\n    ScaleLearningRate(kLearningRateDecay);\n    log_msg << \"\\nReduced learning rate to :\" << learning_rate_;\n  }\n  log_msg << \"\\n\";\n}\n\n// Considers reducing the learning rate independently for each layer down by\n// factor(<1), or leaving it the same, by double-training the given number of\n// samples and minimizing the amount of changing of sign of weight updates.\n// Even if it looks like all weights should remain the same, an adjustment\n// will be made to guarantee a different result when reverting to an old best.\n// Returns the number of layer learning rates that were reduced.\nint LSTMTrainer::ReduceLayerLearningRates(TFloat factor, int num_samples,\n                                          LSTMTrainer *samples_trainer) {\n  enum WhichWay {\n    LR_DOWN, // Learning rate will go down by factor.\n    LR_SAME, // Learning rate will stay the same.\n    LR_COUNT // Size of arrays.\n  };\n  std::vector<std::string> layers = EnumerateLayers();\n  int num_layers = layers.size();\n  std::vector<int> num_weights(num_layers);\n  std::vector<TFloat> bad_sums[LR_COUNT];\n  
std::vector<TFloat> ok_sums[LR_COUNT];\n  for (int i = 0; i < LR_COUNT; ++i) {\n    bad_sums[i].resize(num_layers, 0.0);\n    ok_sums[i].resize(num_layers, 0.0);\n  }\n  auto momentum_factor = 1 / (1 - momentum_);\n  std::vector<char> orig_trainer;\n  samples_trainer->SaveTrainingDump(LIGHT, *this, &orig_trainer);\n  for (int i = 0; i < num_layers; ++i) {\n    Network *layer = GetLayer(layers[i]);\n    num_weights[i] = layer->IsTraining() ? layer->num_weights() : 0;\n  }\n  int iteration = sample_iteration();\n  for (int s = 0; s < num_samples; ++s) {\n    // Which way will we modify the learning rate?\n    for (int ww = 0; ww < LR_COUNT; ++ww) {\n      // Transfer momentum to learning rate and adjust by the ww factor.\n      auto ww_factor = momentum_factor;\n      if (ww == LR_DOWN) {\n        ww_factor *= factor;\n      }\n      // Make a copy of *this, so we can mess about without damaging anything.\n      LSTMTrainer copy_trainer;\n      samples_trainer->ReadTrainingDump(orig_trainer, copy_trainer);\n      // Clear the updates, doing nothing else.\n      copy_trainer.network_->Update(0.0, 0.0, 0.0, 0);\n      // Adjust the learning rate in each layer.\n      for (int i = 0; i < num_layers; ++i) {\n        if (num_weights[i] == 0) {\n          continue;\n        }\n        copy_trainer.ScaleLayerLearningRate(layers[i], ww_factor);\n      }\n      copy_trainer.SetIteration(iteration);\n      // Train on the sample, but keep the update in updates_ instead of\n      // applying to the weights.\n      const ImageData *trainingdata =\n          copy_trainer.TrainOnLine(samples_trainer, true);\n      if (trainingdata == nullptr) {\n        continue;\n      }\n      // We'll now use this trainer again for each layer.\n      std::vector<char> updated_trainer;\n      samples_trainer->SaveTrainingDump(LIGHT, copy_trainer, &updated_trainer);\n      for (int i = 0; i < num_layers; ++i) {\n        if (num_weights[i] == 0) {\n          continue;\n        }\n        
LSTMTrainer layer_trainer;\n        samples_trainer->ReadTrainingDump(updated_trainer, layer_trainer);\n        Network *layer = layer_trainer.GetLayer(layers[i]);\n        // Update the weights in just the layer, using Adam if enabled.\n        layer->Update(0.0, momentum_, adam_beta_,\n                      layer_trainer.training_iteration_ + 1);\n        // Zero the updates matrix again.\n        layer->Update(0.0, 0.0, 0.0, 0);\n        // Train again on the same sample, again holding back the updates.\n        layer_trainer.TrainOnLine(trainingdata, true);\n        // Count the sign changes in the updates in layer vs in copy_trainer.\n        float before_bad = bad_sums[ww][i];\n        float before_ok = ok_sums[ww][i];\n        layer->CountAlternators(*copy_trainer.GetLayer(layers[i]),\n                                &ok_sums[ww][i], &bad_sums[ww][i]);\n        float bad_frac =\n            bad_sums[ww][i] + ok_sums[ww][i] - before_bad - before_ok;\n        if (bad_frac > 0.0f) {\n          bad_frac = (bad_sums[ww][i] - before_bad) / bad_frac;\n        }\n      }\n    }\n    ++iteration;\n  }\n  int num_lowered = 0;\n  for (int i = 0; i < num_layers; ++i) {\n    if (num_weights[i] == 0) {\n      continue;\n    }\n    Network *layer = GetLayer(layers[i]);\n    float lr = GetLayerLearningRate(layers[i]);\n    TFloat total_down = bad_sums[LR_DOWN][i] + ok_sums[LR_DOWN][i];\n    TFloat total_same = bad_sums[LR_SAME][i] + ok_sums[LR_SAME][i];\n    TFloat frac_down = bad_sums[LR_DOWN][i] / total_down;\n    TFloat frac_same = bad_sums[LR_SAME][i] / total_same;\n    tprintf(\"Layer %d=%s: lr %g->%g%%, lr %g->%g%%\", i, layer->name().c_str(),\n            lr * factor, 100.0 * frac_down, lr, 100.0 * frac_same);\n    if (frac_down < frac_same * kImprovementFraction) {\n      tprintf(\" REDUCED\\n\");\n      ScaleLayerLearningRate(layers[i], factor);\n      ++num_lowered;\n    } else {\n      tprintf(\" SAME\\n\");\n    }\n  }\n  if (num_lowered == 0) {\n    // Just 
lower everything to make sure.\n    for (int i = 0; i < num_layers; ++i) {\n      if (num_weights[i] > 0) {\n        ScaleLayerLearningRate(layers[i], factor);\n        ++num_lowered;\n      }\n    }\n  }\n  return num_lowered;\n}\n\n// Converts the string to integer class labels, with appropriate null_char_s\n// in between if not in SimpleTextOutput mode. Returns false on failure.\n/* static */\nbool LSTMTrainer::EncodeString(const std::string &str,\n                               const UNICHARSET &unicharset,\n                               const UnicharCompress *recoder, bool simple_text,\n                               int null_char, std::vector<int> *labels) {\n  if (str.c_str() == nullptr || str.length() <= 0) {\n    tprintf(\"Empty truth string!\\n\");\n    return false;\n  }\n  unsigned err_index;\n  std::vector<int> internal_labels;\n  labels->clear();\n  if (!simple_text) {\n    labels->push_back(null_char);\n  }\n  std::string cleaned = unicharset.CleanupString(str.c_str());\n  if (unicharset.encode_string(cleaned.c_str(), true, &internal_labels, nullptr,\n                               &err_index)) {\n    bool success = true;\n    for (auto internal_label : internal_labels) {\n      if (recoder != nullptr) {\n        // Re-encode labels via recoder.\n        RecodedCharID code;\n        int len = recoder->EncodeUnichar(internal_label, &code);\n        if (len > 0) {\n          for (int j = 0; j < len; ++j) {\n            labels->push_back(code(j));\n            if (!simple_text) {\n              labels->push_back(null_char);\n            }\n          }\n        } else {\n          success = false;\n          err_index = 0;\n          break;\n        }\n      } else {\n        labels->push_back(internal_label);\n        if (!simple_text) {\n          labels->push_back(null_char);\n        }\n      }\n    }\n    if (success) {\n      return true;\n    }\n  }\n  tprintf(\"Encoding of string failed! 
Failure bytes:\");\n  while (err_index < cleaned.size()) {\n    tprintf(\" %x\", cleaned[err_index++] & 0xff);\n  }\n  tprintf(\"\\n\");\n  return false;\n}\n\n// Performs forward-backward on the given trainingdata.\n// Returns a Trainability enum to indicate the suitability of the sample.\nTrainability LSTMTrainer::TrainOnLine(const ImageData *trainingdata,\n                                      bool batch) {\n  NetworkIO fwd_outputs, targets;\n  Trainability trainable =\n      PrepareForBackward(trainingdata, &fwd_outputs, &targets);\n  ++sample_iteration_;\n  if (trainable == UNENCODABLE || trainable == NOT_BOXED) {\n    return trainable; // Sample was unusable.\n  }\n  bool debug =\n      debug_interval_ > 0 && training_iteration() % debug_interval_ == 0;\n  // Run backprop on the output.\n  NetworkIO bp_deltas;\n  if (network_->IsTraining() &&\n      (trainable != PERFECT ||\n       training_iteration() >\n           last_perfect_training_iteration_ + perfect_delay_)) {\n    network_->Backward(debug, targets, &scratch_space_, &bp_deltas);\n    network_->Update(learning_rate_, batch ? 
-1.0f : momentum_, adam_beta_,\n                     training_iteration_ + 1);\n  }\n#ifndef GRAPHICS_DISABLED\n  if (debug_interval_ == 1 && debug_win_ != nullptr) {\n    debug_win_->AwaitEvent(SVET_CLICK);\n  }\n#endif // !GRAPHICS_DISABLED\n  // Roll the memory of past means.\n  RollErrorBuffers();\n  return trainable;\n}\n\n// Prepares the ground truth, runs forward, and prepares the targets.\n// Returns a Trainability enum to indicate the suitability of the sample.\nTrainability LSTMTrainer::PrepareForBackward(const ImageData *trainingdata,\n                                             NetworkIO *fwd_outputs,\n                                             NetworkIO *targets) {\n  if (trainingdata == nullptr) {\n    tprintf(\"Null trainingdata.\\n\");\n    return UNENCODABLE;\n  }\n  // Ensure repeatability of random elements even across checkpoints.\n  bool debug =\n      debug_interval_ > 0 && training_iteration() % debug_interval_ == 0;\n  std::vector<int> truth_labels;\n  if (!EncodeString(trainingdata->transcription(), &truth_labels)) {\n    tprintf(\"Can't encode transcription: '%s' in language '%s'\\n\",\n            trainingdata->transcription().c_str(),\n            trainingdata->language().c_str());\n    return UNENCODABLE;\n  }\n  bool upside_down = false;\n  if (randomly_rotate_) {\n    // This ensures consistent training results.\n    SetRandomSeed();\n    upside_down = randomizer_.SignedRand(1.0) > 0.0;\n    if (upside_down) {\n      // Modify the truth labels to match the rotation:\n      // Apart from space and null, increment the label. 
This changes the\n      // script-id to the same script-id but upside-down.\n      // The labels need to be reversed in order, as the first is now the last.\n      for (auto truth_label : truth_labels) {\n        if (truth_label != UNICHAR_SPACE && truth_label != null_char_) {\n          ++truth_label;\n        }\n      }\n      std::reverse(truth_labels.begin(), truth_labels.end());\n    }\n  }\n  unsigned w = 0;\n  while (w < truth_labels.size() &&\n         (truth_labels[w] == UNICHAR_SPACE || truth_labels[w] == null_char_)) {\n    ++w;\n  }\n  if (w == truth_labels.size()) {\n    tprintf(\"Blank transcription: %s\\n\", trainingdata->transcription().c_str());\n    return UNENCODABLE;\n  }\n  float image_scale;\n  NetworkIO inputs;\n  bool invert = trainingdata->boxes().empty();\n  if (!RecognizeLine(*trainingdata, invert ? 0.5f : 0.0f, debug, invert, upside_down,\n                     &image_scale, &inputs, fwd_outputs)) {\n    tprintf(\"Image %s not trainable\\n\", trainingdata->imagefilename().c_str());\n    return UNENCODABLE;\n  }\n  targets->Resize(*fwd_outputs, network_->NumOutputs());\n  LossType loss_type = OutputLossType();\n  if (loss_type == LT_SOFTMAX) {\n    if (!ComputeTextTargets(*fwd_outputs, truth_labels, targets)) {\n      tprintf(\"Compute simple targets failed for %s!\\n\",\n              trainingdata->imagefilename().c_str());\n      return UNENCODABLE;\n    }\n  } else if (loss_type == LT_CTC) {\n    if (!ComputeCTCTargets(truth_labels, fwd_outputs, targets)) {\n      tprintf(\"Compute CTC targets failed for %s!\\n\",\n              trainingdata->imagefilename().c_str());\n      return UNENCODABLE;\n    }\n  } else {\n    tprintf(\"Logistic outputs not implemented yet!\\n\");\n    return UNENCODABLE;\n  }\n  std::vector<int> ocr_labels;\n  std::vector<int> xcoords;\n  LabelsFromOutputs(*fwd_outputs, &ocr_labels, &xcoords);\n  // CTC does not produce correct target labels to begin with.\n  if (loss_type != LT_CTC) {\n    
LabelsFromOutputs(*targets, &truth_labels, &xcoords);\n  }\n  if (!DebugLSTMTraining(inputs, *trainingdata, *fwd_outputs, truth_labels,\n                         *targets)) {\n    tprintf(\"Input width was %d\\n\", inputs.Width());\n    return UNENCODABLE;\n  }\n  std::string ocr_text = DecodeLabels(ocr_labels);\n  std::string truth_text = DecodeLabels(truth_labels);\n  targets->SubtractAllFromFloat(*fwd_outputs);\n  if (debug_interval_ != 0) {\n    if (truth_text != ocr_text) {\n      tprintf(\"Iteration %d: BEST OCR TEXT : %s\\n\", training_iteration(),\n              ocr_text.c_str());\n    }\n  }\n  double char_error = ComputeCharError(truth_labels, ocr_labels);\n  double word_error = ComputeWordError(&truth_text, &ocr_text);\n  double delta_error = ComputeErrorRates(*targets, char_error, word_error);\n  if (debug_interval_ != 0) {\n    tprintf(\"File %s line %d %s:\\n\", trainingdata->imagefilename().c_str(),\n            trainingdata->page_number(), delta_error == 0.0 ? \"(Perfect)\" : \"\");\n  }\n  if (delta_error == 0.0) {\n    return PERFECT;\n  }\n  if (targets->AnySuspiciousTruth(kHighConfidence)) {\n    return HI_PRECISION_ERR;\n  }\n  return TRAINABLE;\n}\n\n// Writes the trainer to memory, so that the current training state can be\n// restored.  *this must always be the master trainer that retains the only\n// copy of the training data and language model. 
trainer is the model that is\n// actually serialized.\nbool LSTMTrainer::SaveTrainingDump(SerializeAmount serialize_amount,\n                                   const LSTMTrainer &trainer,\n                                   std::vector<char> *data) const {\n  TFile fp;\n  fp.OpenWrite(data);\n  return trainer.Serialize(serialize_amount, &mgr_, &fp);\n}\n\n// Restores the model to *this.\nbool LSTMTrainer::ReadLocalTrainingDump(const TessdataManager *mgr,\n                                        const char *data, int size) {\n  if (size == 0) {\n    tprintf(\"Warning: data size is 0 in LSTMTrainer::ReadLocalTrainingDump\\n\");\n    return false;\n  }\n  TFile fp;\n  fp.Open(data, size);\n  return DeSerialize(mgr, &fp);\n}\n\n// Writes the full recognition traineddata to the given filename.\nbool LSTMTrainer::SaveTraineddata(const char *filename) {\n  std::vector<char> recognizer_data;\n  SaveRecognitionDump(&recognizer_data);\n  mgr_.OverwriteEntry(TESSDATA_LSTM, &recognizer_data[0],\n                      recognizer_data.size());\n  return mgr_.SaveFile(filename, SaveDataToFile);\n}\n\n// Writes the recognizer to memory, so that it can be used for testing later.\nvoid LSTMTrainer::SaveRecognitionDump(std::vector<char> *data) const {\n  TFile fp;\n  fp.OpenWrite(data);\n  network_->SetEnableTraining(TS_TEMP_DISABLE);\n  ASSERT_HOST(LSTMRecognizer::Serialize(&mgr_, &fp));\n  network_->SetEnableTraining(TS_RE_ENABLE);\n}\n\n// Returns a suitable filename for a training dump, based on the model_base_,\n// best_error_rate_, best_iteration_ and training_iteration_.\nstd::string LSTMTrainer::DumpFilename() const {\n  std::stringstream filename;\n  filename.imbue(std::locale::classic());\n  filename << model_base_ << std::fixed << std::setprecision(3)\n           << \"_\" << best_error_rate_\n           << \"_\" << best_iteration_\n           << \"_\" << training_iteration_\n           << \".checkpoint\";\n  return filename.str();\n}\n\n// Fills the whole error buffer of 
the given type with the given value.\nvoid LSTMTrainer::FillErrorBuffer(double new_error, ErrorTypes type) {\n  for (int i = 0; i < kRollingBufferSize_; ++i) {\n    error_buffers_[type][i] = new_error;\n  }\n  error_rates_[type] = 100.0 * new_error;\n}\n\n// Helper generates a map from each current recoder_ code (ie softmax index)\n// to the corresponding old_recoder code, or -1 if there isn't one.\nstd::vector<int> LSTMTrainer::MapRecoder(\n    const UNICHARSET &old_chset, const UnicharCompress &old_recoder) const {\n  int num_new_codes = recoder_.code_range();\n  int num_new_unichars = GetUnicharset().size();\n  std::vector<int> code_map(num_new_codes, -1);\n  for (int c = 0; c < num_new_codes; ++c) {\n    int old_code = -1;\n    // Find all new unichar_ids that recode to something that includes c.\n    // The <= is to include the null char, which may be beyond the unicharset.\n    for (int uid = 0; uid <= num_new_unichars; ++uid) {\n      RecodedCharID codes;\n      int length = recoder_.EncodeUnichar(uid, &codes);\n      int code_index = 0;\n      while (code_index < length && codes(code_index) != c) {\n        ++code_index;\n      }\n      if (code_index == length) {\n        continue;\n      }\n      // The old unicharset must have the same unichar.\n      int old_uid =\n          uid < num_new_unichars\n              ? 
old_chset.unichar_to_id(GetUnicharset().id_to_unichar(uid))\n              : old_chset.size() - 1;\n      if (old_uid == INVALID_UNICHAR_ID) {\n        continue;\n      }\n      // The encoding of old_uid at the same code_index is the old code.\n      RecodedCharID old_codes;\n      if (code_index < old_recoder.EncodeUnichar(old_uid, &old_codes)) {\n        old_code = old_codes(code_index);\n        break;\n      }\n    }\n    code_map[c] = old_code;\n  }\n  return code_map;\n}\n\n// Private version of InitCharSet above finishes the job after initializing\n// the mgr_ data member.\nvoid LSTMTrainer::InitCharSet() {\n  EmptyConstructor();\n  training_flags_ = TF_COMPRESS_UNICHARSET;\n  // Initialize the unicharset and recoder.\n  if (!LoadCharsets(&mgr_)) {\n    ASSERT_HOST(\n        \"Must provide a traineddata containing lstm_unicharset and\"\n        \" lstm_recoder!\\n\" != nullptr);\n  }\n  SetNullChar();\n}\n\n// Helper computes and sets the null_char_.\nvoid LSTMTrainer::SetNullChar() {\n  null_char_ = GetUnicharset().has_special_codes() ? 
UNICHAR_BROKEN\n                                                   : GetUnicharset().size();\n  RecodedCharID code;\n  recoder_.EncodeUnichar(null_char_, &code);\n  null_char_ = code(0);\n}\n\n// Factored sub-constructor sets up reasonable default values.\nvoid LSTMTrainer::EmptyConstructor() {\n#ifndef GRAPHICS_DISABLED\n  align_win_ = nullptr;\n  target_win_ = nullptr;\n  ctc_win_ = nullptr;\n  recon_win_ = nullptr;\n#endif\n  checkpoint_iteration_ = 0;\n  training_stage_ = 0;\n  num_training_stages_ = 2;\n  InitIterations();\n}\n\n// Outputs the string and periodically displays the given network inputs\n// as an image in the given window, and the corresponding labels at the\n// corresponding x_starts.\n// Returns false if the truth string is empty.\nbool LSTMTrainer::DebugLSTMTraining(const NetworkIO &inputs,\n                                    const ImageData &trainingdata,\n                                    const NetworkIO &fwd_outputs,\n                                    const std::vector<int> &truth_labels,\n                                    const NetworkIO &outputs) {\n  const std::string &truth_text = DecodeLabels(truth_labels);\n  if (truth_text.c_str() == nullptr || truth_text.length() <= 0) {\n    tprintf(\"Empty truth string at decode time!\\n\");\n    return false;\n  }\n  if (debug_interval_ != 0) {\n    // Get class labels, xcoords and string.\n    std::vector<int> labels;\n    std::vector<int> xcoords;\n    LabelsFromOutputs(outputs, &labels, &xcoords);\n    std::string text = DecodeLabels(labels);\n    tprintf(\"Iteration %d: GROUND  TRUTH : %s\\n\", training_iteration(),\n            truth_text.c_str());\n    if (truth_text != text) {\n      tprintf(\"Iteration %d: ALIGNED TRUTH : %s\\n\", training_iteration(),\n              text.c_str());\n    }\n    if (debug_interval_ > 0 && training_iteration() % debug_interval_ == 0) {\n      tprintf(\"TRAINING activation path for truth string %s\\n\",\n              truth_text.c_str());\n      
DebugActivationPath(outputs, labels, xcoords);\n#ifndef GRAPHICS_DISABLED\n      DisplayForward(inputs, labels, xcoords, \"LSTMTraining\", &align_win_);\n      if (OutputLossType() == LT_CTC) {\n        DisplayTargets(fwd_outputs, \"CTC Outputs\", &ctc_win_);\n        DisplayTargets(outputs, \"CTC Targets\", &target_win_);\n      }\n#endif\n    }\n  }\n  return true;\n}\n\n#ifndef GRAPHICS_DISABLED\n\n// Displays the network targets as line a line graph.\nvoid LSTMTrainer::DisplayTargets(const NetworkIO &targets,\n                                 const char *window_name, ScrollView **window) {\n  int width = targets.Width();\n  int num_features = targets.NumFeatures();\n  Network::ClearWindow(true, window_name, width * kTargetXScale, kTargetYScale,\n                       window);\n  for (int c = 0; c < num_features; ++c) {\n    int color = c % (ScrollView::GREEN_YELLOW - 1) + 2;\n    (*window)->Pen(static_cast<ScrollView::Color>(color));\n    int start_t = -1;\n    for (int t = 0; t < width; ++t) {\n      double target = targets.f(t)[c];\n      target *= kTargetYScale;\n      if (target >= 1) {\n        if (start_t < 0) {\n          (*window)->SetCursor(t - 1, 0);\n          start_t = t;\n        }\n        (*window)->DrawTo(t, target);\n      } else if (start_t >= 0) {\n        (*window)->DrawTo(t, 0);\n        (*window)->DrawTo(start_t - 1, 0);\n        start_t = -1;\n      }\n    }\n    if (start_t >= 0) {\n      (*window)->DrawTo(width, 0);\n      (*window)->DrawTo(start_t - 1, 0);\n    }\n  }\n  (*window)->Update();\n}\n\n#endif // !GRAPHICS_DISABLED\n\n// Builds a no-compromises target where the first positions should be the\n// truth labels and the rest is padded with the null_char_.\nbool LSTMTrainer::ComputeTextTargets(const NetworkIO &outputs,\n                                     const std::vector<int> &truth_labels,\n                                     NetworkIO *targets) {\n  if (truth_labels.size() > targets->Width()) {\n    tprintf(\"Error: 
transcription %s too long to fit into target of width %d\\n\",\n            DecodeLabels(truth_labels).c_str(), targets->Width());\n    return false;\n  }\n  int i = 0;\n  for (auto truth_label : truth_labels) {\n    targets->SetActivations(i, truth_label, 1.0);\n    ++i;\n  }\n  for (i = truth_labels.size(); i < targets->Width(); ++i) {\n    targets->SetActivations(i, null_char_, 1.0);\n  }\n  return true;\n}\n\n// Builds a target using standard CTC. truth_labels should be pre-padded with\n// nulls wherever desired. They don't have to be between all labels.\n// outputs is input-output, as it gets clipped to minimum probability.\nbool LSTMTrainer::ComputeCTCTargets(const std::vector<int> &truth_labels,\n                                    NetworkIO *outputs, NetworkIO *targets) {\n  // Bottom-clip outputs to a minimum probability.\n  CTC::NormalizeProbs(outputs);\n  return CTC::ComputeCTCTargets(truth_labels, null_char_,\n                                outputs->float_array(), targets);\n}\n\n// Computes network errors, and stores the results in the rolling buffers,\n// along with the supplied text_error.\n// Returns the delta error of the current sample (not running average.)\ndouble LSTMTrainer::ComputeErrorRates(const NetworkIO &deltas,\n                                      double char_error, double word_error) {\n  UpdateErrorBuffer(ComputeRMSError(deltas), ET_RMS);\n  // Delta error is the fraction of timesteps with >0.5 error in the top choice\n  // score. 
If zero, then the top choice characters are guaranteed correct,\n  // even when there is residue in the RMS error.\n  double delta_error = ComputeWinnerError(deltas);\n  UpdateErrorBuffer(delta_error, ET_DELTA);\n  UpdateErrorBuffer(word_error, ET_WORD_RECERR);\n  UpdateErrorBuffer(char_error, ET_CHAR_ERROR);\n  // Skip ratio measures the difference between sample_iteration_ and\n  // training_iteration_, which reflects the number of unusable samples,\n  // usually due to unencodable truth text, or the text not fitting in the\n  // space for the output.\n  double skip_count = sample_iteration_ - prev_sample_iteration_;\n  UpdateErrorBuffer(skip_count, ET_SKIP_RATIO);\n  return delta_error;\n}\n\n// Computes the network activation RMS error rate.\ndouble LSTMTrainer::ComputeRMSError(const NetworkIO &deltas) {\n  double total_error = 0.0;\n  int width = deltas.Width();\n  int num_classes = deltas.NumFeatures();\n  for (int t = 0; t < width; ++t) {\n    const float *class_errs = deltas.f(t);\n    for (int c = 0; c < num_classes; ++c) {\n      double error = class_errs[c];\n      total_error += error * error;\n    }\n  }\n  return sqrt(total_error / (width * num_classes));\n}\n\n// Computes network activation winner error rate. (Number of values that are\n// in error by >= 0.5 divided by number of time-steps.) More closely related\n// to final character error than RMS, but still directly calculable from\n// just the deltas. 
Because of the binary nature of the targets, zero winner\n// error is a sufficient but not necessary condition for zero char error.\ndouble LSTMTrainer::ComputeWinnerError(const NetworkIO &deltas) {\n  int num_errors = 0;\n  int width = deltas.Width();\n  int num_classes = deltas.NumFeatures();\n  for (int t = 0; t < width; ++t) {\n    const float *class_errs = deltas.f(t);\n    for (int c = 0; c < num_classes; ++c) {\n      float abs_delta = std::fabs(class_errs[c]);\n      // TODO(rays) Filtering cases where the delta is very large to cut out\n      // GT errors doesn't work. Find a better way or get better truth.\n      if (0.5 <= abs_delta) {\n        ++num_errors;\n      }\n    }\n  }\n  return static_cast<double>(num_errors) / width;\n}\n\n// Computes a very simple bag of chars char error rate.\ndouble LSTMTrainer::ComputeCharError(const std::vector<int> &truth_str,\n                                     const std::vector<int> &ocr_str) {\n  std::vector<int> label_counts(NumOutputs());\n  unsigned truth_size = 0;\n  for (auto ch : truth_str) {\n    if (ch != null_char_) {\n      ++label_counts[ch];\n      ++truth_size;\n    }\n  }\n  for (auto ch : ocr_str) {\n    if (ch != null_char_) {\n      --label_counts[ch];\n    }\n  }\n  unsigned char_errors = 0;\n  for (auto label_count : label_counts) {\n    char_errors += abs(label_count);\n  }\n  // Limit BCER to interval [0,1] and avoid division by zero.\n  if (truth_size <= char_errors) {\n    return (char_errors == 0) ? 
0.0 : 1.0;\n  }\n  return static_cast<double>(char_errors) / truth_size;\n}\n\n// Computes word recall error rate using a very simple bag of words algorithm.\n// NOTE that this is destructive on both input strings.\ndouble LSTMTrainer::ComputeWordError(std::string *truth_str,\n                                     std::string *ocr_str) {\n  using StrMap = std::unordered_map<std::string, int, std::hash<std::string>>;\n  std::vector<std::string> truth_words = split(*truth_str, ' ');\n  if (truth_words.empty()) {\n    return 0.0;\n  }\n  std::vector<std::string> ocr_words = split(*ocr_str, ' ');\n  StrMap word_counts;\n  for (const auto &truth_word : truth_words) {\n    std::string truth_word_string(truth_word.c_str());\n    auto it = word_counts.find(truth_word_string);\n    if (it == word_counts.end()) {\n      word_counts.insert(std::make_pair(truth_word_string, 1));\n    } else {\n      ++it->second;\n    }\n  }\n  for (const auto &ocr_word : ocr_words) {\n    std::string ocr_word_string(ocr_word.c_str());\n    auto it = word_counts.find(ocr_word_string);\n    if (it == word_counts.end()) {\n      word_counts.insert(std::make_pair(ocr_word_string, -1));\n    } else {\n      --it->second;\n    }\n  }\n  int word_recall_errs = 0;\n  for (const auto &word_count : word_counts) {\n    if (word_count.second > 0) {\n      word_recall_errs += word_count.second;\n    }\n  }\n  return static_cast<double>(word_recall_errs) / truth_words.size();\n}\n\n// Updates the error buffer and corresponding mean of the given type with\n// the new_error.\nvoid LSTMTrainer::UpdateErrorBuffer(double new_error, ErrorTypes type) {\n  int index = training_iteration_ % kRollingBufferSize_;\n  error_buffers_[type][index] = new_error;\n  // Compute the mean error.\n  int mean_count =\n      std::min<int>(training_iteration_ + 1, error_buffers_[type].size());\n  double buffer_sum = 0.0;\n  for (int i = 0; i < mean_count; ++i) {\n    buffer_sum += error_buffers_[type][i];\n  }\n  double mean = 
buffer_sum / mean_count;\n  // Trim precision to 1/1000 of 1%.\n  error_rates_[type] = IntCastRounded(100000.0 * mean) / 1000.0;\n}\n\n// Rolls error buffers and reports the current means.\nvoid LSTMTrainer::RollErrorBuffers() {\n  prev_sample_iteration_ = sample_iteration_;\n  if (NewSingleError(ET_DELTA) > 0.0) {\n    ++learning_iteration_;\n  } else {\n    last_perfect_training_iteration_ = training_iteration_;\n  }\n  ++training_iteration_;\n  if (debug_interval_ != 0) {\n    tprintf(\"Mean rms=%g%%, delta=%g%%, train=%g%%(%g%%), skip ratio=%g%%\\n\",\n            error_rates_[ET_RMS], error_rates_[ET_DELTA],\n            error_rates_[ET_CHAR_ERROR], error_rates_[ET_WORD_RECERR],\n            error_rates_[ET_SKIP_RATIO]);\n  }\n}\n\n// Given that error_rate is either a new min or max, updates the best/worst\n// error rates, and record of progress.\n// Tester is an externally supplied callback function that tests on some\n// data set with a given model and records the error rates in a graph.\nstd::string LSTMTrainer::UpdateErrorGraph(int iteration, double error_rate,\n                                          const std::vector<char> &model_data,\n                                          const TestCallback &tester) {\n  if (error_rate > best_error_rate_ &&\n      iteration < best_iteration_ + kErrorGraphInterval) {\n    // Too soon to record a new point.\n    if (tester != nullptr && !worst_model_data_.empty()) {\n      mgr_.OverwriteEntry(TESSDATA_LSTM, &worst_model_data_[0],\n                          worst_model_data_.size());\n      return tester(worst_iteration_, nullptr, mgr_, CurrentTrainingStage());\n    } else {\n      return \"\";\n    }\n  }\n  std::string result;\n  // NOTE: there are 2 asymmetries here:\n  // 1. We are computing the global minimum, but the local maximum in between.\n  // 2. 
If the tester returns an empty string, indicating that it is busy,\n  //    call it repeatedly on new local maxima to test the previous min, but\n  //    not the other way around, as there is little point testing the maxima\n  //    between very frequent minima.\n  if (error_rate < best_error_rate_) {\n    // This is a new (global) minimum.\n    if (tester != nullptr && !worst_model_data_.empty()) {\n      mgr_.OverwriteEntry(TESSDATA_LSTM, &worst_model_data_[0],\n                          worst_model_data_.size());\n      result = tester(worst_iteration_, worst_error_rates_, mgr_,\n                      CurrentTrainingStage());\n      worst_model_data_.clear();\n      best_model_data_ = model_data;\n    }\n    best_error_rate_ = error_rate;\n    memcpy(best_error_rates_, error_rates_, sizeof(error_rates_));\n    best_iteration_ = iteration;\n    best_error_history_.push_back(error_rate);\n    best_error_iterations_.push_back(iteration);\n    // Compute 2% decay time.\n    double two_percent_more = error_rate + 2.0;\n    int i;\n    for (i = best_error_history_.size() - 1;\n         i >= 0 && best_error_history_[i] < two_percent_more; --i) {\n    }\n    int old_iteration = i >= 0 ? best_error_iterations_[i] : 0;\n    improvement_steps_ = iteration - old_iteration;\n    tprintf(\"2 Percent improvement time=%d, best error was %g @ %d\\n\",\n            improvement_steps_, i >= 0 ? 
best_error_history_[i] : 100.0,\n            old_iteration);\n  } else if (error_rate > best_error_rate_) {\n    // This is a new (local) maximum.\n    if (tester != nullptr) {\n      if (!best_model_data_.empty()) {\n        mgr_.OverwriteEntry(TESSDATA_LSTM, &best_model_data_[0],\n                            best_model_data_.size());\n        result = tester(best_iteration_, best_error_rates_, mgr_,\n                        CurrentTrainingStage());\n      } else if (!worst_model_data_.empty()) {\n        // Allow for multiple data points with \"worst\" error rate.\n        mgr_.OverwriteEntry(TESSDATA_LSTM, &worst_model_data_[0],\n                            worst_model_data_.size());\n        result = tester(worst_iteration_, worst_error_rates_, mgr_,\n                        CurrentTrainingStage());\n      }\n      if (result.length() > 0) {\n        best_model_data_.clear();\n      }\n      worst_model_data_ = model_data;\n    }\n  }\n  worst_error_rate_ = error_rate;\n  memcpy(worst_error_rates_, error_rates_, sizeof(error_rates_));\n  worst_iteration_ = iteration;\n  return result;\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "src/training/unicharset/lstmtrainer.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lstmtrainer.h\n// Description: Top-level line trainer class for LSTM-based networks.\n// Author:      Ray Smith\n//\n// (C) Copyright 2013, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_LSTM_LSTMTRAINER_H_\n#define TESSERACT_LSTM_LSTMTRAINER_H_\n\n#include \"export.h\"\n\n#include \"imagedata.h\" // for DocumentCache\n#include \"lstmrecognizer.h\"\n#include \"rect.h\"\n\n#include <functional> // for std::function\n#include <sstream>    // for std::stringstream\n\nnamespace tesseract {\n\nclass LSTM;\nclass LSTMTester;\nclass LSTMTrainer;\nclass Parallel;\nclass Reversed;\nclass Softmax;\nclass Series;\n\n// Enum for the types of errors that are counted.\nenum ErrorTypes {\n  ET_RMS,         // RMS activation error.\n  ET_DELTA,       // Number of big errors in deltas.\n  ET_WORD_RECERR, // Output text string word recall error.\n  ET_CHAR_ERROR,  // Output text string total char error.\n  ET_SKIP_RATIO,  // Fraction of samples skipped.\n  ET_COUNT        // For array sizing.\n};\n\n// Enum for the trainability_ flags.\nenum Trainability {\n  TRAINABLE,        // Non-zero delta error.\n  PERFECT,          // Zero delta error.\n  UNENCODABLE,      // Not trainable due to coding/alignment trouble.\n  HI_PRECISION_ERR, // Hi confidence disagreement.\n  
NOT_BOXED,        // Early in training and has no character boxes.\n};\n\n// Enum to define the amount of data to get serialized.\nenum SerializeAmount {\n  LIGHT,           // Minimal data for remote training.\n  NO_BEST_TRAINER, // Save an empty vector in place of best_trainer_.\n  FULL,            // All data including best_trainer_.\n};\n\n// Enum to indicate how the sub_trainer_ training went.\nenum SubTrainerResult {\n  STR_NONE,    // Did nothing as not good enough.\n  STR_UPDATED, // Subtrainer was updated, but didn't replace *this.\n  STR_REPLACED // Subtrainer replaced *this.\n};\n\nclass LSTMTrainer;\n// Function to compute and record error rates on some external test set(s).\n// Args are: iteration, mean errors, model, training stage.\n// Returns a string containing logging information about the tests.\nusing TestCallback = std::function<std::string(int, const double *,\n                                               const TessdataManager &, int)>;\n\n// Trainer class for LSTM networks. Most of the effort is in creating the\n// ideal target outputs from the transcription. A box file is used if it is\n// available, otherwise estimates of the char widths from the unicharset are\n// used to guide a DP search for the best fit to the transcription.\nclass TESS_UNICHARSET_TRAINING_API LSTMTrainer : public LSTMRecognizer {\npublic:\n  LSTMTrainer();\n  LSTMTrainer(const std::string &model_base,\n              const std::string &checkpoint_name,\n              int debug_interval, int64_t max_memory);\n  virtual ~LSTMTrainer();\n\n  // Tries to deserialize a trainer from the given file and silently returns\n  // false in case of failure. 
If old_traineddata is not null, then it is\n  // assumed that the character set is to be re-mapped from old_traineddata to\n  // the new, with consequent change in weight matrices etc.\n  bool TryLoadingCheckpoint(const char *filename, const char *old_traineddata);\n\n  // Initializes the character set encode/decode mechanism directly from a\n  // previously setup traineddata containing dawgs, UNICHARSET and\n  // UnicharCompress. Note: Call before InitNetwork!\n  bool InitCharSet(const std::string &traineddata_path) {\n    bool success = mgr_.Init(traineddata_path.c_str());\n    if (success) {\n      InitCharSet();\n    }\n    return success;\n  }\n  void InitCharSet(const TessdataManager &mgr) {\n    mgr_ = mgr;\n    InitCharSet();\n  }\n\n  // Initializes the trainer with a network_spec in the network description\n  // net_flags control network behavior according to the NetworkFlags enum.\n  // There isn't really much difference between them - only where the effects\n  // are implemented.\n  // For other args see NetworkBuilder::InitNetwork.\n  // Note: Be sure to call InitCharSet before InitNetwork!\n  bool InitNetwork(const char *network_spec, int append_index, int net_flags,\n                   float weight_range, float learning_rate, float momentum,\n                   float adam_beta);\n  // Resets all the iteration counters for fine tuning or training a head,\n  // where we want the error reporting to reset.\n  void InitIterations();\n\n  // Accessors.\n  double ActivationError() const {\n    return error_rates_[ET_DELTA];\n  }\n  double CharError() const {\n    return error_rates_[ET_CHAR_ERROR];\n  }\n  const double *error_rates() const {\n    return error_rates_;\n  }\n  double best_error_rate() const {\n    return best_error_rate_;\n  }\n  int best_iteration() const {\n    return best_iteration_;\n  }\n  int learning_iteration() const {\n    return learning_iteration_;\n  }\n  int32_t improvement_steps() const {\n    return improvement_steps_;\n  }\n  
void set_perfect_delay(int delay) {\n    perfect_delay_ = delay;\n  }\n  const std::vector<char> &best_trainer() const {\n    return best_trainer_;\n  }\n  // Returns the error that was just calculated by PrepareForBackward.\n  double NewSingleError(ErrorTypes type) const {\n    return error_buffers_[type][training_iteration() % kRollingBufferSize_];\n  }\n  // Returns the error that was just calculated by TrainOnLine. Since\n  // TrainOnLine rolls the error buffers, this is one further back than\n  // NewSingleError.\n  double LastSingleError(ErrorTypes type) const {\n    return error_buffers_[type]\n                         [(training_iteration() + kRollingBufferSize_ - 1) %\n                          kRollingBufferSize_];\n  }\n  const DocumentCache &training_data() const {\n    return training_data_;\n  }\n  DocumentCache *mutable_training_data() {\n    return &training_data_;\n  }\n\n  // If the training sample is usable, grid searches for the optimal\n  // dict_ratio/cert_offset, and returns the results in a string of space-\n  // separated triplets of ratio,offset=worderr.\n  Trainability GridSearchDictParams(\n      const ImageData *trainingdata, int iteration, double min_dict_ratio,\n      double dict_ratio_step, double max_dict_ratio, double min_cert_offset,\n      double cert_offset_step, double max_cert_offset, std::string &results);\n\n  // Provides output on the distribution of weight values.\n  void DebugNetwork();\n\n  // Loads a set of lstmf files that were created using the lstm.train config to\n  // tesseract into memory ready for training. Returns false if nothing was\n  // loaded.\n  bool LoadAllTrainingData(const std::vector<std::string> &filenames,\n                           CachingStrategy cache_strategy,\n                           bool randomly_rotate);\n\n  // Keeps track of best and locally worst error rate, using internally computed\n  // values. 
See MaintainCheckpointsSpecific for more detail.\n  bool MaintainCheckpoints(const TestCallback &tester, std::stringstream &log_msg);\n  // Keeps track of best and locally worst error_rate (whatever it is) and\n  // launches tests using rec_model, when a new min or max is reached.\n  // Writes checkpoints using train_model at appropriate times and builds and\n  // returns a log message to indicate progress. Returns false if nothing\n  // interesting happened.\n  bool MaintainCheckpointsSpecific(int iteration,\n                                   const std::vector<char> *train_model,\n                                   const std::vector<char> *rec_model,\n                                   TestCallback tester, std::stringstream &log_msg);\n  // Builds a progress message with current error rates.\n  void PrepareLogMsg(std::stringstream &log_msg) const;\n  // Appends <intro_str> iteration learning_iteration()/training_iteration()/\n  // sample_iteration() to the log_msg.\n  void LogIterations(const char *intro_str, std::stringstream &log_msg) const;\n\n  // TODO(rays) Add curriculum learning.\n  // Returns true and increments the training_stage_ if the error rate has just\n  // passed through the given threshold for the first time.\n  bool TransitionTrainingStage(float error_threshold);\n  // Returns the current training stage.\n  int CurrentTrainingStage() const {\n    return training_stage_;\n  }\n\n  // Writes to the given file. Returns false in case of error.\n  bool Serialize(SerializeAmount serialize_amount, const TessdataManager *mgr,\n                 TFile *fp) const;\n  // Reads from the given file. 
Returns false in case of error.\n  bool DeSerialize(const TessdataManager *mgr, TFile *fp);\n\n  // De-serializes the saved best_trainer_ into sub_trainer_, and adjusts the\n  // learning rates (by scaling reduction, or layer specific, according to\n  // NF_LAYER_SPECIFIC_LR).\n  void StartSubtrainer(std::stringstream &log_msg);\n  // While the sub_trainer_ is behind the current training iteration and its\n  // training error is at least kSubTrainerMarginFraction better than the\n  // current training error, trains the sub_trainer_, and returns STR_UPDATED if\n  // it did anything. If it catches up, and has a better error rate than the\n  // current best, as well as a margin over the current error rate, then the\n  // trainer in *this is replaced with sub_trainer_, and STR_REPLACED is\n  // returned. STR_NONE is returned if the subtrainer wasn't good enough to\n  // receive any training iterations.\n  SubTrainerResult UpdateSubtrainer(std::stringstream &log_msg);\n  // Reduces network learning rates, either for everything, or for layers\n  // independently, according to NF_LAYER_SPECIFIC_LR.\n  void ReduceLearningRates(LSTMTrainer *samples_trainer, std::stringstream &log_msg);\n  // Considers reducing the learning rate independently for each layer down by\n  // factor(<1), or leaving it the same, by double-training the given number of\n  // samples and minimizing the amount of changing of sign of weight updates.\n  // Even if it looks like all weights should remain the same, an adjustment\n  // will be made to guarantee a different result when reverting to an old best.\n  // Returns the number of layer learning rates that were reduced.\n  int ReduceLayerLearningRates(TFloat factor, int num_samples,\n                               LSTMTrainer *samples_trainer);\n\n  // Converts the string to integer class labels, with appropriate null_char_s\n  // in between if not in SimpleTextOutput mode. 
Returns false on failure.\n  bool EncodeString(const std::string &str, std::vector<int> *labels) const {\n    return EncodeString(str, GetUnicharset(),\n                        IsRecoding() ? &recoder_ : nullptr, SimpleTextOutput(),\n                        null_char_, labels);\n  }\n  // Static version operates on supplied unicharset, encoder, simple_text.\n  static bool EncodeString(const std::string &str, const UNICHARSET &unicharset,\n                           const UnicharCompress *recoder, bool simple_text,\n                           int null_char, std::vector<int> *labels);\n\n  // Performs forward-backward on the given trainingdata.\n  // Returns the sample that was used or nullptr if the next sample was deemed\n  // unusable. samples_trainer could be this or an alternative trainer that\n  // holds the training samples.\n  const ImageData *TrainOnLine(LSTMTrainer *samples_trainer, bool batch) {\n    int sample_index = sample_iteration();\n    const ImageData *image =\n        samples_trainer->training_data_.GetPageBySerial(sample_index);\n    if (image != nullptr) {\n      Trainability trainable = TrainOnLine(image, batch);\n      if (trainable == UNENCODABLE || trainable == NOT_BOXED) {\n        return nullptr; // Sample was unusable.\n      }\n    } else {\n      ++sample_iteration_;\n    }\n    return image;\n  }\n  Trainability TrainOnLine(const ImageData *trainingdata, bool batch);\n\n  // Prepares the ground truth, runs forward, and prepares the targets.\n  // Returns a Trainability enum to indicate the suitability of the sample.\n  Trainability PrepareForBackward(const ImageData *trainingdata,\n                                  NetworkIO *fwd_outputs, NetworkIO *targets);\n\n  // Writes the trainer to memory, so that the current training state can be\n  // restored.  *this must always be the master trainer that retains the only\n  // copy of the training data and language model. 
trainer is the model that is\n  // actually serialized.\n  bool SaveTrainingDump(SerializeAmount serialize_amount,\n                        const LSTMTrainer &trainer,\n                        std::vector<char> *data) const;\n\n  // Reads previously saved trainer from memory. *this must always be the\n  // master trainer that retains the only copy of the training data and\n  // language model. trainer is the model that is restored.\n  bool ReadTrainingDump(const std::vector<char> &data,\n                        LSTMTrainer &trainer) const {\n    if (data.empty()) {\n      return false;\n    }\n    return ReadSizedTrainingDump(&data[0], data.size(), trainer);\n  }\n  bool ReadSizedTrainingDump(const char *data, int size,\n                             LSTMTrainer &trainer) const {\n    return trainer.ReadLocalTrainingDump(&mgr_, data, size);\n  }\n  // Restores the model to *this.\n  bool ReadLocalTrainingDump(const TessdataManager *mgr, const char *data,\n                             int size);\n\n  // Sets up the data for MaintainCheckpoints from a light ReadTrainingDump.\n  void SetupCheckpointInfo();\n\n  // Writes the full recognition traineddata to the given filename.\n  bool SaveTraineddata(const char *filename);\n\n  // Writes the recognizer to memory, so that it can be used for testing later.\n  void SaveRecognitionDump(std::vector<char> *data) const;\n\n  // Returns a suitable filename for a training dump, based on the model_base_,\n  // the iteration and the error rates.\n  std::string DumpFilename() const;\n\n  // Fills the whole error buffer of the given type with the given value.\n  void FillErrorBuffer(double new_error, ErrorTypes type);\n  // Helper generates a map from each current recoder_ code (ie softmax index)\n  // to the corresponding old_recoder code, or -1 if there isn't one.\n  std::vector<int> MapRecoder(const UNICHARSET &old_chset,\n                              const UnicharCompress &old_recoder) const;\n\nprotected:\n  // Private version 
of InitCharSet above finishes the job after initializing\n  // the mgr_ data member.\n  void InitCharSet();\n  // Helper computes and sets the null_char_.\n  void SetNullChar();\n\n  // Factored sub-constructor sets up reasonable default values.\n  void EmptyConstructor();\n\n  // Outputs the string and periodically displays the given network inputs\n  // as an image in the given window, and the corresponding labels at the\n  // corresponding x_starts.\n  // Returns false if the truth string is empty.\n  bool DebugLSTMTraining(const NetworkIO &inputs, const ImageData &trainingdata,\n                         const NetworkIO &fwd_outputs,\n                         const std::vector<int> &truth_labels,\n                         const NetworkIO &outputs);\n  // Displays the network targets as line a line graph.\n  void DisplayTargets(const NetworkIO &targets, const char *window_name,\n                      ScrollView **window);\n\n  // Builds a no-compromises target where the first positions should be the\n  // truth labels and the rest is padded with the null_char_.\n  bool ComputeTextTargets(const NetworkIO &outputs,\n                          const std::vector<int> &truth_labels,\n                          NetworkIO *targets);\n\n  // Builds a target using standard CTC. truth_labels should be pre-padded with\n  // nulls wherever desired. 
They don't have to be between all labels.\n  // outputs is input-output, as it gets clipped to minimum probability.\n  bool ComputeCTCTargets(const std::vector<int> &truth_labels,\n                         NetworkIO *outputs, NetworkIO *targets);\n\n  // Computes network errors, and stores the results in the rolling buffers,\n  // along with the supplied text_error.\n  // Returns the delta error of the current sample (not running average.)\n  double ComputeErrorRates(const NetworkIO &deltas, double char_error,\n                           double word_error);\n\n  // Computes the network activation RMS error rate.\n  double ComputeRMSError(const NetworkIO &deltas);\n\n  // Computes network activation winner error rate. (Number of values that are\n  // in error by >= 0.5 divided by number of time-steps.) More closely related\n  // to final character error than RMS, but still directly calculable from\n  // just the deltas. Because of the binary nature of the targets, zero winner\n  // error is a sufficient but not necessary condition for zero char error.\n  double ComputeWinnerError(const NetworkIO &deltas);\n\n  // Computes a very simple bag of chars char error rate.\n  double ComputeCharError(const std::vector<int> &truth_str,\n                          const std::vector<int> &ocr_str);\n  // Computes a very simple bag of words word recall error rate.\n  // NOTE that this is destructive on both input strings.\n  double ComputeWordError(std::string *truth_str, std::string *ocr_str);\n\n  // Updates the error buffer and corresponding mean of the given type with\n  // the new_error.\n  void UpdateErrorBuffer(double new_error, ErrorTypes type);\n\n  // Rolls error buffers and reports the current means.\n  void RollErrorBuffers();\n\n  // Given that error_rate is either a new min or max, updates the best/worst\n  // error rates, and record of progress.\n  std::string UpdateErrorGraph(int iteration, double error_rate,\n                               const std::vector<char> 
&model_data,\n                               const TestCallback &tester);\n\nprotected:\n#ifndef GRAPHICS_DISABLED\n  // Alignment display window.\n  ScrollView *align_win_;\n  // CTC target display window.\n  ScrollView *target_win_;\n  // CTC output display window.\n  ScrollView *ctc_win_;\n  // Reconstructed image window.\n  ScrollView *recon_win_;\n#endif\n  // How often to display a debug image.\n  int debug_interval_;\n  // Iteration at which the last checkpoint was dumped.\n  int checkpoint_iteration_;\n  // Basename of files to save best models to.\n  std::string model_base_;\n  // Checkpoint filename.\n  std::string checkpoint_name_;\n  // Training data.\n  bool randomly_rotate_;\n  DocumentCache training_data_;\n  // Name to use when saving best_trainer_.\n  std::string best_model_name_;\n  // Number of available training stages.\n  int num_training_stages_;\n\n  // ===Serialized data to ensure that a restart produces the same results.===\n  // These members are only serialized when serialize_amount != LIGHT.\n  // Best error rate so far.\n  double best_error_rate_;\n  // Snapshot of all error rates at best_iteration_.\n  double best_error_rates_[ET_COUNT];\n  // Iteration of best_error_rate_.\n  int best_iteration_;\n  // Worst error rate since best_error_rate_.\n  double worst_error_rate_;\n  // Snapshot of all error rates at worst_iteration_.\n  double worst_error_rates_[ET_COUNT];\n  // Iteration of worst_error_rate_.\n  int worst_iteration_;\n  // Iteration at which the process will be thought stalled.\n  int stall_iteration_;\n  // Saved recognition models for computing test error for graph points.\n  std::vector<char> best_model_data_;\n  std::vector<char> worst_model_data_;\n  // Saved trainer for reverting back to last known best.\n  std::vector<char> best_trainer_;\n  // A subsidiary trainer running with a different learning rate until either\n  // *this or sub_trainer_ hits a new best.\n  std::unique_ptr<LSTMTrainer> sub_trainer_;\n  // Error 
rate at which last best model was dumped.\n  float error_rate_of_last_saved_best_;\n  // Current stage of training.\n  int training_stage_;\n  // History of best error rate against iteration. Used for computing the\n  // number of steps to each 2% improvement.\n  std::vector<double> best_error_history_;\n  std::vector<int32_t> best_error_iterations_;\n  // Number of iterations since the best_error_rate_ was 2% more than it is now.\n  int32_t improvement_steps_;\n  // Number of iterations that yielded a non-zero delta error and thus provided\n  // significant learning. learning_iteration_ <= training_iteration_.\n  // learning_iteration_ is used to measure rate of learning progress.\n  int learning_iteration_;\n  // Saved value of sample_iteration_ before looking for the next sample.\n  int prev_sample_iteration_;\n  // How often to include a PERFECT training sample in backprop.\n  // A PERFECT training sample is used if the current\n  // training_iteration_ > last_perfect_training_iteration_ + perfect_delay_,\n  // so with perfect_delay_ == 0, all samples are used, and with\n  // perfect_delay_ == 4, at most 1 in 5 samples will be perfect.\n  int perfect_delay_;\n  // Value of training_iteration_ at which the last PERFECT training sample\n  // was used in back prop.\n  int last_perfect_training_iteration_;\n  // Rolling buffers storing recent training errors are indexed by\n  // training_iteration % kRollingBufferSize_.\n  static const int kRollingBufferSize_ = 1000;\n  std::vector<double> error_buffers_[ET_COUNT];\n  // Rounded mean percent trailing training errors in the buffers.\n  double error_rates_[ET_COUNT]; // RMS training error.\n  // Traineddata file with optional dawgs + UNICHARSET and recoder.\n  TessdataManager mgr_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_LSTM_LSTMTRAINER_H_\n"
  },
  {
    "path": "src/training/unicharset/normstrngs.cpp",
    "content": "/**********************************************************************\n * File:        normstrngs.cpp\n * Description: Utilities to normalize and manipulate UTF-32 and\n *              UTF-8 strings.\n * Author:      Ranjith Unnikrishnan\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#include \"normstrngs.h\"\n\n#include <string>\n#include <unordered_map>\n#include <vector>\n\n#include <tesseract/unichar.h>\n#include \"errcode.h\"\n#include \"icuerrorcode.h\"\n#include \"unicode/normalizer2.h\" // From libicu\n#include \"unicode/translit.h\"    // From libicu\n#include \"unicode/uchar.h\"       // From libicu\n#include \"unicode/unorm2.h\"      // From libicu\n#include \"unicode/uscript.h\"     // From libicu\n\nnamespace tesseract {\n\nstatic bool is_hyphen_punc(const char32 ch) {\n  static const char32 kHyphenPuncUnicodes[] = {\n      '-',\n      0x2010, // hyphen\n      0x2011, // non-breaking hyphen\n      0x2012, // figure dash\n      0x2013, // en dash\n      0x2014, // em dash\n      0x2015, // horizontal bar\n      // how about 0x2043 hyphen bullet?\n      // how about 0x2500 box drawings light horizontal?\n      0x207b, // superscript minus\n      0x208b, // subscript minus\n      0x2212, // minus sign\n      0xfe58, // small em dash\n      0xfe63, // small hyphen-minus\n      0xff0d, // fullwidth hyphen-minus\n      
0x2e17  // double oblique hyphen (Fraktur)\n  };\n  for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) {\n    if (kHyphenPuncUnicode == ch) {\n      return true;\n    }\n  }\n  return false;\n}\n\nstatic bool is_single_quote(const char32 ch) {\n  static const char32 kSingleQuoteUnicodes[] = {\n      '\\'', '`',\n      0x2018, // left single quotation mark (English, others)\n      0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)\n              // We may have to introduce a comma set with 0x201a\n      0x201A, // single low-9 quotation mark (German)\n      0x201B, // single high-reversed-9 quotation mark (PropList.txt)\n      0x2032, // prime\n      0x300C, // left corner bracket (East Asian languages)\n      0xFF07  // fullwidth apostrophe\n  };\n  for (int kSingleQuoteUnicode : kSingleQuoteUnicodes) {\n    if (kSingleQuoteUnicode == ch) {\n      return true;\n    }\n  }\n  return false;\n}\n\nstatic bool is_double_quote(const char32 ch) {\n  static const char32 kDoubleQuoteUnicodes[] = {\n      '\"',\n      0x201C, // left double quotation mark (English, others)\n      0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)\n      0x201F, // double high-reversed-9 quotation mark (PropList.txt)\n      0x2033, // double prime\n      0x201E, // double low-9 quotation mark (German)\n      0x301D, // reversed double prime quotation mark (East Asian langs,\n              // horiz.)\n      0x301E, // close double prime (East Asian languages written horizontally)\n      0xFF02  // fullwidth quotation mark\n  };\n  for (int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) {\n    if (kDoubleQuoteUnicode == ch) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Helper runs a standard unicode normalization, optional OCR normalization,\n// and leaves the result as char32 for subsequent processing.\nstatic void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize, const char *str8,\n                                 
std::vector<char32> *normed32) {\n  // Convert to ICU string for unicode normalization.\n  icu::UnicodeString uch_str(str8, \"UTF-8\");\n  IcuErrorCode error_code;\n  // Convert the enum to the new weird icu representation.\n  const char *norm_type =\n      u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC ? \"nfkc\" : \"nfc\";\n  UNormalization2Mode compose = u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC\n                                    ? UNORM2_COMPOSE\n                                    : UNORM2_DECOMPOSE;\n  // Pointer to singleton does not require deletion.\n  const icu::Normalizer2 *normalizer =\n      icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);\n  error_code.assertSuccess();\n  error_code.reset();\n  icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);\n  error_code.assertSuccess();\n  // Convert to char32 for output. OCR normalization if required.\n  normed32->reserve(norm_str.length()); // An approximation.\n  for (int offset = 0; offset < norm_str.length(); offset = norm_str.moveIndex32(offset, 1)) {\n    char32 ch = norm_str.char32At(offset);\n    // Skip all ZWS, RTL and LTR marks.\n    if (Validator::IsZeroWidthMark(ch)) {\n      continue;\n    }\n    if (ocr_normalize == OCRNorm::kNormalize) {\n      ch = OCRNormalize(ch);\n    }\n    normed32->push_back(ch);\n  }\n}\n\n// Helper removes joiners from strings that contain no letters.\nstatic void StripJoiners(std::vector<char32> *str32) {\n  for (char32 ch : *str32) {\n    if (u_isalpha(ch)) {\n      return;\n    }\n  }\n  int len = 0;\n  for (char32 ch : *str32) {\n    if (ch != Validator::kZeroWidthJoiner && ch != Validator::kZeroWidthNonJoiner) {\n      (*str32)[len++] = ch;\n    }\n  }\n  str32->resize(len);\n}\n\n// Normalizes a UTF8 string according to the given modes. Returns true on\n// success. 
If false is returned, some failure or invalidity was present, and\n// the result string is produced on a \"best effort\" basis.\nbool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,\n                         GraphemeNorm grapheme_normalize, const char *str8,\n                         std::string *normalized) {\n  std::vector<char32> normed32;\n  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);\n  if (grapheme_normalize == GraphemeNorm::kNormalize) {\n    StripJoiners(&normed32);\n    std::vector<std::vector<char32>> graphemes;\n    bool success = Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, false,\n                                                      normed32, &graphemes);\n    if (graphemes.empty() || graphemes[0].empty()) {\n      success = false;\n    } else if (normalized != nullptr) {\n      *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);\n    }\n    return success;\n  }\n  if (normalized != nullptr) {\n    *normalized = UNICHAR::UTF32ToUTF8(normed32);\n  }\n  return true;\n}\n\n// Normalizes a UTF8 string according to the given modes and splits into\n// graphemes according to g_mode. Returns true on success. 
If false is returned,\n// some failure or invalidity was present, and the result string is produced on\n// a \"best effort\" basis.\nbool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,\n                                  GraphemeNormMode g_mode, bool report_errors, const char *str8,\n                                  std::vector<std::string> *graphemes) {\n  std::vector<char32> normed32;\n  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);\n  StripJoiners(&normed32);\n  std::vector<std::vector<char32>> graphemes32;\n  bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors, normed32, &graphemes32);\n  if (g_mode != GraphemeNormMode::kSingleString && success) {\n    // If we modified the string to clean it up, the segmentation may not be\n    // correct, so check for changes and do it again.\n    std::vector<char32> cleaned32;\n    for (const auto &g : graphemes32) {\n      cleaned32.insert(cleaned32.end(), g.begin(), g.end());\n    }\n    if (cleaned32 != normed32) {\n      graphemes32.clear();\n      success = Validator::ValidateCleanAndSegment(g_mode, report_errors, cleaned32, &graphemes32);\n    }\n  }\n  graphemes->clear();\n  graphemes->reserve(graphemes32.size());\n  for (const auto &grapheme : graphemes32) {\n    graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme));\n  }\n  return success;\n}\n\n// Apply just the OCR-specific normalizations and return the normalized char.\nchar32 OCRNormalize(char32 ch) {\n  if (is_hyphen_punc(ch)) {\n    return '-';\n  } else if (is_single_quote(ch)) {\n    return '\\'';\n  } else if (is_double_quote(ch)) {\n    return '\"';\n  }\n  return ch;\n}\n\nbool IsOCREquivalent(char32 ch1, char32 ch2) {\n  return OCRNormalize(ch1) == OCRNormalize(ch2);\n}\n\nbool IsValidCodepoint(const char32 ch) {\n  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]\n  return (static_cast<uint32_t>(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF);\n}\n\nbool IsWhitespace(const char32 ch) {\n 
 ASSERT_HOST_MSG(IsValidCodepoint(ch), \"Invalid Unicode codepoint: 0x%x\\n\", ch);\n  return u_isUWhiteSpace(static_cast<UChar32>(ch));\n}\n\nbool IsUTF8Whitespace(const char *text) {\n  return SpanUTF8Whitespace(text) == strlen(text);\n}\n\nunsigned int SpanUTF8Whitespace(const char *text) {\n  int n_white = 0;\n  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));\n       it != UNICHAR::end(text, strlen(text)); ++it) {\n    if (!IsWhitespace(*it)) {\n      break;\n    }\n    n_white += it.utf8_len();\n  }\n  return n_white;\n}\n\nunsigned int SpanUTF8NotWhitespace(const char *text) {\n  int n_notwhite = 0;\n  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));\n       it != UNICHAR::end(text, strlen(text)); ++it) {\n    if (IsWhitespace(*it)) {\n      break;\n    }\n    n_notwhite += it.utf8_len();\n  }\n  return n_notwhite;\n}\n\nbool IsInterchangeValid(const char32 ch) {\n  return IsValidCodepoint(ch) && !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.\n         !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&\n         !(ch >= 0x2FFFE && ch <= 0x2FFFF) && !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&\n         !(ch >= 0x4FFFE && ch <= 0x4FFFF) && !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&\n         !(ch >= 0x6FFFE && ch <= 0x6FFFF) && !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&\n         !(ch >= 0x8FFFE && ch <= 0x8FFFF) && !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&\n         !(ch >= 0xAFFFE && ch <= 0xAFFFF) && !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&\n         !(ch >= 0xCFFFE && ch <= 0xCFFFF) && !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&\n         !(ch >= 0xEFFFE && ch <= 0xEFFFF) && !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&\n         !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&\n         (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\\n' || ch == '\\f' || ch == '\\t' ||\n          ch == '\\r');\n}\n\nbool IsInterchangeValid7BitAscii(const char32 ch) {\n  return IsValidCodepoint(ch) && ch <= 128 &&\n         
(!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\\n' || ch == '\\f' || ch == '\\t' ||\n          ch == '\\r');\n}\n\nchar32 FullwidthToHalfwidth(const char32 ch) {\n  // Return unchanged if not in the fullwidth-halfwidth Unicode block.\n  if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {\n    if (ch != 0x3000) {\n      return ch;\n    }\n  }\n  // Special case for fullwidth left and right \"white parentheses\".\n  if (ch == 0xFF5F) {\n    return 0x2985;\n  }\n  if (ch == 0xFF60) {\n    return 0x2986;\n  }\n  // Construct a full-to-half width transliterator.\n  IcuErrorCode error_code;\n  icu::UnicodeString uch_str(static_cast<UChar32>(ch));\n  const icu::Transliterator *fulltohalf =\n      icu::Transliterator::createInstance(\"Fullwidth-Halfwidth\", UTRANS_FORWARD, error_code);\n  error_code.assertSuccess();\n  error_code.reset();\n\n  fulltohalf->transliterate(uch_str);\n  delete fulltohalf;\n  ASSERT_HOST(uch_str.length() != 0);\n  return uch_str[0];\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/normstrngs.h",
    "content": "/**********************************************************************\n * File:        normstrngs.h\n * Description: Utilities to normalize and manipulate UTF-32 and\n *              UTF-8 strings.\n * Author:      Ranjith Unnikrishnan\n * Created:     Thu July 4 2013\n *\n * (C) Copyright 2013, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_\n#define TESSERACT_CCUTIL_NORMSTRNGS_H_\n\n#include \"export.h\"\n\n#include \"validator.h\"\n\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\n// The standard unicode normalizations.\nenum class UnicodeNormMode {\n  kNFD,\n  kNFC,\n  kNFKD,\n  kNFKC,\n};\n\n// To normalize away differences in punctuation that are ambiguous, like\n// curly quotes and different widths of dash.\nenum class OCRNorm {\n  kNone,\n  kNormalize,\n};\n\n// To validate and normalize away some subtle differences that can occur in\n// Indic scripts, eg ensuring that an explicit virama is always followed by\n// a zero-width non-joiner.\nenum class GraphemeNorm {\n  kNone,\n  kNormalize,\n};\n\n// Normalizes a UTF8 string according to the given modes. Returns true on\n// success. 
If false is returned, some failure or invalidity was present, and\n// the result string is produced on a \"best effort\" basis.\nTESS_UNICHARSET_TRAINING_API\nbool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,\n                         GraphemeNorm grapheme_normalize, const char *str8,\n                         std::string *normalized);\n\n// Normalizes a UTF8 string according to the given modes and splits into\n// graphemes according to g_mode. Returns true on success. If false is returned,\n// some failure or invalidity was present, and the result string is produced on\n// a \"best effort\" basis.\nTESS_UNICHARSET_TRAINING_API\nbool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,\n                                  GraphemeNormMode g_mode, bool report_errors, const char *str8,\n                                  std::vector<std::string> *graphemes);\n\n// Applies just the OCR-specific normalizations and return the normalized char.\nchar32 OCRNormalize(char32 ch);\n\n// Returns true if the OCRNormalized ch1 and ch2 are the same.\nbool IsOCREquivalent(char32 ch1, char32 ch2);\n\n// Returns true if the value lies in the range of valid unicodes.\nbool IsValidCodepoint(const char32 ch);\n\n// Returns true a code point has the White_Space Unicode property.\nTESS_UNICHARSET_TRAINING_API\nbool IsWhitespace(const char32 ch);\n\n// Returns true if every char in the given (null-terminated) string has the\n// White_Space Unicode property.\nTESS_UNICHARSET_TRAINING_API\nbool IsUTF8Whitespace(const char *text);\n\n// Returns the length of bytes of the prefix of 'text' that have the White_Space\n// unicode property.\nTESS_UNICHARSET_TRAINING_API\nunsigned int SpanUTF8Whitespace(const char *text);\n\n// Returns the length of bytes of the prefix of 'text' that DO NOT have the\n// White_Space unicode property.\nTESS_UNICHARSET_TRAINING_API\nunsigned int SpanUTF8NotWhitespace(const char *text);\n\n// Returns true if the char is 
interchange valid i.e. no C0 or C1 control codes\n// (other than CR LF HT FF) and no non-characters.\nTESS_UNICHARSET_TRAINING_API\nbool IsInterchangeValid(const char32 ch);\n\n// Same as above but restricted to 7-bit ASCII.\nTESS_UNICHARSET_TRAINING_API\nbool IsInterchangeValid7BitAscii(const char32 ch);\n\n// Converts a full-width character to half-width.\nTESS_UNICHARSET_TRAINING_API\nchar32 FullwidthToHalfwidth(const char32 ch);\n\n} // namespace tesseract\n\n#endif // TESSERACT_CCUTIL_NORMSTRNGS_H_\n"
  },
  {
    "path": "src/training/unicharset/unicharset_training_utils.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharset_training_utils.cpp\n// Description: Training utilities for UNICHARSET.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"unicharset_training_utils.h\"\n\n#include <cstdlib>\n#include <cstring>\n#include <string>\n#include <vector>\n\n#include <tesseract/unichar.h>\n#include \"fileio.h\"\n#include \"icuerrorcode.h\"\n#include \"normstrngs.h\"\n#include \"statistc.h\"\n#include \"tesserrstream.h\"   // for tesserr\n#include \"unicharset.h\"\n#include \"unicode/uchar.h\"   // from libicu\n#include \"unicode/uscript.h\" // from libicu\n\nnamespace tesseract {\n\n// Helper sets the character attribute properties and sets up the script table.\n// Does not set tops and bottoms.\nvoid SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset) {\n  for (size_t unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {\n    // Convert any custom ligatures.\n    const char *unichar_str = unicharset->id_to_unichar(unichar_id);\n    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {\n      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {\n        unichar_str = UNICHARSET::kCustomLigatures[i][0];\n        break;\n      }\n    }\n\n    // Convert the 
unichar to UTF32 representation\n    std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);\n\n    // Assume that if the property is true for any character in the string,\n    // then it holds for the whole \"character\".\n    bool unichar_isalpha = false;\n    bool unichar_islower = false;\n    bool unichar_isupper = false;\n    bool unichar_isdigit = false;\n    bool unichar_ispunct = false;\n\n    for (char32 u_ch : uni_vector) {\n      if (u_isalpha(u_ch)) {\n        unichar_isalpha = true;\n      }\n      if (u_islower(u_ch)) {\n        unichar_islower = true;\n      }\n      if (u_isupper(u_ch)) {\n        unichar_isupper = true;\n      }\n      if (u_isdigit(u_ch)) {\n        unichar_isdigit = true;\n      }\n      if (u_ispunct(u_ch)) {\n        unichar_ispunct = true;\n      }\n    }\n\n    unicharset->set_isalpha(unichar_id, unichar_isalpha);\n    unicharset->set_islower(unichar_id, unichar_islower);\n    unicharset->set_isupper(unichar_id, unichar_isupper);\n    unicharset->set_isdigit(unichar_id, unichar_isdigit);\n    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);\n\n    tesseract::IcuErrorCode err;\n    unicharset->set_script(unichar_id, uscript_getName(uscript_getScript(uni_vector[0], err)));\n\n    const int num_code_points = uni_vector.size();\n    // Obtain the lower/upper case if needed and record it in the properties.\n    unicharset->set_other_case(unichar_id, unichar_id);\n    if (unichar_islower || unichar_isupper) {\n      std::vector<char32> other_case(num_code_points, 0);\n      for (int i = 0; i < num_code_points; ++i) {\n        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.\n        // However since they deal with UChars (so need a conversion function\n        // from char32 or UTF8string) and require a meaningful locale string,\n        // for now u_tolower()/u_toupper() are used.\n        other_case[i] = unichar_islower ? 
u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]);\n      }\n      std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);\n      UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str());\n      if (other_case_id != INVALID_UNICHAR_ID) {\n        unicharset->set_other_case(unichar_id, other_case_id);\n      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {\n        tprintf(\"Other case %s of %s is not in unicharset\\n\", other_case_uch.c_str(), unichar_str);\n      }\n    }\n\n    // Set RTL property and obtain mirror unichar ID from ICU.\n    std::vector<char32> mirrors(num_code_points, 0);\n    for (int i = 0; i < num_code_points; ++i) {\n      mirrors[i] = u_charMirror(uni_vector[i]);\n      if (i == 0) { // set directionality to that of the 1st code point\n        unicharset->set_direction(\n            unichar_id, static_cast<UNICHARSET::Direction>(u_charDirection(uni_vector[i])));\n      }\n    }\n    std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);\n    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());\n    if (mirror_uch_id != INVALID_UNICHAR_ID) {\n      unicharset->set_mirror(unichar_id, mirror_uch_id);\n    } else if (report_errors) {\n      tprintf(\"Mirror %s of %s is not in unicharset\\n\", mirror_uch.c_str(), unichar_str);\n    }\n\n    // Record normalized version of this unichar.\n    std::string normed_str;\n    if (unichar_id != 0 &&\n        tesseract::NormalizeUTF8String(\n            decompose ? 
tesseract::UnicodeNormMode::kNFD : tesseract::UnicodeNormMode::kNFC,\n            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone, unichar_str,\n            &normed_str) &&\n        !normed_str.empty()) {\n      unicharset->set_normed(unichar_id, normed_str.c_str());\n    } else {\n      unicharset->set_normed(unichar_id, unichar_str);\n    }\n    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());\n  }\n  unicharset->post_load_setup();\n}\n\n// Helper sets the properties from universal script unicharsets, if found.\nvoid SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset) {\n  for (int s = 0; s < unicharset->get_script_table_size(); ++s) {\n    // Load the unicharset for the script if available.\n    std::string filename =\n        script_dir + \"/\" + unicharset->get_script_from_script_id(s) + \".unicharset\";\n    UNICHARSET script_set;\n    if (script_set.load_from_file(filename.c_str())) {\n      unicharset->SetPropertiesFromOther(script_set);\n    } else if (s != unicharset->common_sid() && s != unicharset->null_sid()) {\n      tprintf(\"Failed to load script unicharset from:%s\\n\", filename.c_str());\n    }\n  }\n  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) {\n    if (unicharset->PropertiesIncomplete(c)) {\n      tprintf(\"Warning: properties incomplete for index %d = %s\\n\", c,\n              unicharset->id_to_unichar(c));\n    }\n  }\n}\n\n// Helper gets the combined x-heights string.\nstd::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset) {\n  std::string xheights_str;\n  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {\n    // Load the xheights for the script if available.\n    std::string filename = script_dir + \"/\" + unicharset.get_script_from_script_id(s) + \".xheights\";\n    std::string script_heights;\n    if (File::ReadFileToString(filename, &script_heights)) {\n      xheights_str += script_heights;\n    }\n  
}\n  return xheights_str;\n}\n\n// Helper to set the properties for an input unicharset file, writes to the\n// output file. If an appropriate script unicharset can be found in the\n// script_dir directory, then the tops and bottoms are expanded using the\n// script unicharset.\n// If non-empty, xheight data for the fonts are written to the xheights_file.\nvoid SetPropertiesForInputFile(const std::string &script_dir,\n                               const std::string &input_unicharset_file,\n                               const std::string &output_unicharset_file,\n                               const std::string &output_xheights_file) {\n  UNICHARSET unicharset;\n\n  // Load the input unicharset\n  unicharset.load_from_file(input_unicharset_file.c_str());\n  tesserr << \"Loaded unicharset of size \" << unicharset.size()\n          << \" from file \" << input_unicharset_file << '\\n';\n\n  // Set unichar properties\n  tprintf(\"Setting unichar properties\\n\");\n  SetupBasicProperties(true, false, &unicharset);\n  tprintf(\"Setting script properties\\n\");\n  SetScriptProperties(script_dir, &unicharset);\n  if (!output_xheights_file.empty()) {\n    std::string xheights_str = GetXheightString(script_dir, unicharset);\n    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);\n  }\n\n  // Write the output unicharset\n  tprintf(\"Writing unicharset to file %s\\n\", output_unicharset_file.c_str());\n  unicharset.save_to_file(output_unicharset_file.c_str());\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/unicharset_training_utils.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharset_training_utils.h\n// Description: Training utilities for UNICHARSET.\n// Author:      Ray Smith\n//\n// (C) Copyright 2014, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_\n#define TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_\n\n#include \"export.h\"\n\n#include <tesseract/export.h>\n\n#include <string>\n\nnamespace tesseract {\n\nclass UNICHARSET;\n\n// Helper sets the character attribute properties and sets up the script table.\n// Does not set tops and bottoms.\nTESS_UNICHARSET_TRAINING_API\nvoid SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset);\n// Default behavior is to compose, until it is proven that decomposed benefits\n// at least one language.\ninline void SetupBasicProperties(bool report_errors, UNICHARSET *unicharset) {\n  SetupBasicProperties(report_errors, false, unicharset);\n}\n// Helper sets the properties from universal script unicharsets, if found.\nTESS_UNICHARSET_TRAINING_API\nvoid SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset);\n// Helper gets the combined x-heights string.\nstd::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset);\n\n// Helper to set the properties for an input unicharset 
file, writes to the\n// output file. If an appropriate script unicharset can be found in the\n// script_dir directory, then the tops and bottoms are expanded using the\n// script unicharset.\n// If non-empty, xheight data for the fonts are written to the xheights_file.\nTESS_UNICHARSET_TRAINING_API\nvoid SetPropertiesForInputFile(const std::string &script_dir,\n                               const std::string &input_unicharset_file,\n                               const std::string &output_unicharset_file,\n                               const std::string &output_xheights_file);\n\n} // namespace tesseract.\n\n#endif // TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_\n"
  },
  {
    "path": "src/training/unicharset/validate_grapheme.cpp",
    "content": "#include \"validate_grapheme.h\"\n#include \"tprintf.h\"\n#include \"unicode/uchar.h\" // From libicu\n\nnamespace tesseract {\n\nbool ValidateGrapheme::ConsumeGraphemeIfValid() {\n  const unsigned num_codes = codes_.size();\n  char32 prev_prev_ch = ' ';\n  char32 prev_ch = ' ';\n  CharClass prev_cc = CharClass::kWhitespace;\n  int num_codes_in_grapheme = 0;\n  while (codes_used_ < num_codes) {\n    CharClass cc = codes_[codes_used_].first;\n    char32 ch = codes_[codes_used_].second;\n    const bool is_combiner = cc == CharClass::kCombiner || cc == CharClass::kVirama;\n// TODO: Make this code work well with RTL text.\n// See\n// https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751\n#if 0\n    // Reject easily detected badly formed sequences.\n    if (prev_cc == CharClass::kWhitespace && is_combiner) {\n      if (report_errors_) tprintf(\"Word started with a combiner:0x%x\\n\", ch);\n     return false;\n    }\n#endif\n    if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {\n      if (report_errors_) {\n        tprintf(\"Two grapheme links in a row:0x%x 0x%x\\n\", prev_ch, ch);\n      }\n      return false;\n    }\n    if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&\n        IsBadlyFormed(prev_ch, ch)) {\n      return false;\n    }\n    bool prev_is_fwd_combiner = prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||\n                                (prev_ch == kZeroWidthNonJoiner &&\n                                 (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));\n    if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner) {\n      break;\n    }\n    CodeOnlyToOutput();\n    ++num_codes_in_grapheme;\n    prev_prev_ch = prev_ch;\n    prev_ch = ch;\n    prev_cc = cc;\n  }\n  if (num_codes_in_grapheme > 0) {\n    MultiCodePart(num_codes_in_grapheme);\n  }\n  return true;\n}\n\nValidator::CharClass ValidateGrapheme::UnicodeToCharClass(char32 ch) const 
{\n  if (IsVedicAccent(ch)) {\n    return CharClass::kVedicMark;\n  }\n  // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they\n  // always combine with the previous character.\n  if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) {\n    return CharClass::kVirama;\n  }\n  if (u_isUWhiteSpace(ch)) {\n    return CharClass::kWhitespace;\n  }\n  // Workaround for Javanese Aksara's Taling, do not label it as a combiner\n  if (ch == 0xa9ba) {\n    return CharClass::kConsonant;\n  }\n  int char_type = u_charType(ch);\n  if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||\n      char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||\n      ch == kZeroWidthJoiner) {\n    return CharClass::kCombiner;\n  }\n  return CharClass::kOther;\n}\n\n// Helper returns true if the sequence prev_ch,ch is invalid.\nbool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {\n  // Reject badly formed Indic vowels.\n  if (IsBadlyFormedIndicVowel(prev_ch, ch)) {\n    if (report_errors_) {\n      tprintf(\"Badly formed Indic vowel sequence:0x%x 0x%x\\n\", prev_ch, ch);\n    }\n    return true;\n  }\n  if (IsBadlyFormedThai(prev_ch, ch)) {\n    if (report_errors_) {\n      tprintf(\"Badly formed Thai:0x%x 0x%x\\n\", prev_ch, ch);\n    }\n    return true;\n  }\n  return false;\n}\n\n// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.\n// Some vowels in Indic scripts may be analytically decomposed into atomic pairs\n// of components that are themselves valid unicode symbols. (See Table 12-1 in\n// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf\n// for examples in Devanagari). The Unicode standard discourages specifying\n// vowels this way, but they are sometimes encountered in text, probably because\n// some editors still permit it. 
Renderers however dislike such pairs, and so\n// this function may be used to detect their occurrence for removal.\n// TODO(rays) This function only covers a subset of Indic languages and doesn't\n// include all rules. Add rules as appropriate to support other languages or\n// find a way to generalize these existing rules that makes use of the\n// regularity of the mapping from ISCII to Unicode.\n/* static */\nbool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {\n  return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) || (prev_ch == 0x909 && ch == 0x941) ||\n          (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||\n          (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||\n          (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||\n          // Illegal combinations of two dependent Devanagari vowels.\n          (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||\n          // Dependent Devanagari vowels following a virama.\n          (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||\n          // Bengali vowels (Table 9-5, pg 313)\n          (prev_ch == 0x985 && ch == 0x9BE) ||\n          // Telugu vowels (Table 9-19, pg 331)\n          (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||\n          // Kannada vowels (Table 9-20, pg 332)\n          (prev_ch == 0xC92 && ch == 0xCCC));\n}\n\n// Helper returns true if ch is a Thai consonant.\nstatic bool IsThaiConsonant(char32 ch) {\n  return 0xe01 <= ch && ch <= 0xe2e;\n}\n\n// Helper returns true if ch is a before-consonant vowel.\nstatic bool IsThaiBeforeConsonantVowel(char32 ch) {\n  return 0xe40 <= ch && ch <= 0xe44;\n}\n\n// Helper returns true if ch is a Thai tone mark.\nstatic bool IsThaiToneMark(char32 ch) {\n  return 0xe48 <= ch && ch <= 0xe4b;\n}\n\n// Helper returns true if ch is a Thai vowel that may be followed by a tone\n// mark.\nstatic bool IsThaiTonableVowel(char32 ch) {\n  return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;\n}\n\n// Helper returns 
true if the sequence prev_ch,ch is invalid Thai.\n// These rules come from a native Thai speaker, and are not covered by the\n// Thai section in the unicode book:\n// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf\n// Comments below added by Ray interpreting the code ranges.\n/* static */\nbool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {\n  // Tone marks must follow consonants or specific vowels.\n  if (IsThaiToneMark(ch) && !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {\n    return true;\n  }\n  // Tonable vowels must follow consonants.\n  if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {\n    return true;\n  }\n  // Thanthakhat must follow consonant or specific vowels.\n  if (ch == 0xe4c && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {\n    return true;\n  }\n  // Nikkhahit must follow a consonant ?or certain markers?.\n  // TODO(rays) confirm this, but there were so many in the ground truth of the\n  // validation set that it seems reasonable to assume it is valid.\n  if (ch == 0xe4d && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {\n    return true;\n  }\n  // The vowels e30, e32, e33 can be used more liberally.\n  if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&\n      !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&\n      !(prev_ch == 0xe32 && ch == 0xe30) && !(prev_ch == 0xe4d && ch == 0xe32)) {\n    return true;\n  }\n  // Some vowels come before consonants, and therefore cannot follow things\n  // that cannot end a syllable.\n  if (IsThaiBeforeConsonantVowel(ch) &&\n      (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 || prev_ch == 0xe37)) {\n    return true;\n  }\n  // Don't allow the standalone vowel U+0e24 to be followed by other vowels.\n  if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {\n    return true;\n  }\n  return false;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/validate_grapheme.h",
    "content": "#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_\n#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_\n\n#include \"validator.h\"\n\nnamespace tesseract {\n\n// Subclass of Validator that validates and segments generic unicode into\n// grapheme clusters, including Latin with diacritics.\nclass ValidateGrapheme : public Validator {\npublic:\n  ValidateGrapheme(ViramaScript script, bool report_errors) : Validator(script, report_errors) {}\n  ~ValidateGrapheme() override = default;\n\nprotected:\n  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to\n  // parts_ and output_. Returns true if a valid Grapheme was consumed,\n  // otherwise does not increment codes_used_.\n  bool ConsumeGraphemeIfValid() override;\n  // Returns the CharClass corresponding to the given Unicode ch.\n  CharClass UnicodeToCharClass(char32 ch) const override;\n\nprivate:\n  // Helper returns true if the sequence prev_ch,ch is invalid.\n  bool IsBadlyFormed(char32 prev_ch, char32 ch);\n  // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.\n  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);\n  // Helper returns true if the sequence prev_ch,ch is invalid Thai.\n  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_\n"
  },
  {
    "path": "src/training/unicharset/validate_indic.cpp",
    "content": "#include \"validate_indic.h\"\n#include \"errcode.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Returns whether codes matches the pattern for an Indic Grapheme.\n// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf\n// has a BNF for valid syllables (Graphemes) which is modified slightly\n// for Unicode.  Notably U+200C and U+200D are used before/after the\n// virama/virama to express explicit or soft viramas.\n// Also the unicode v.9 Malayalam entry states that CZHC can be used in several\n// Indic languages to request traditional ligatures, and CzHC is Malayalam-\n// specific for requesting open conjuncts.\n//\n//  + vowel Grapheme:  V[D](v)*\n//  + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*\nbool ValidateIndic::ConsumeGraphemeIfValid() {\n  switch (codes_[codes_used_].first) {\n    case CharClass::kConsonant:\n      return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();\n    case CharClass::kVowel:\n    case CharClass::kVedicMark:\n      return ConsumeVowelIfValid();\n    case CharClass::kZeroWidthJoiner:\n    case CharClass::kZeroWidthNonJoiner:\n      // Apart from within an aksara, joiners are silently dropped.\n      if (report_errors_) {\n        tprintf(\"Dropping isolated joiner: 0x%x\\n\", codes_[codes_used_].second);\n      }\n      ++codes_used_;\n      return true;\n    case CharClass::kOther:\n      UseMultiCode(1);\n      return true;\n    default:\n      if (report_errors_) {\n        tprintf(\"Invalid start of grapheme sequence:%c=0x%x\\n\",\n                static_cast<int>(codes_[codes_used_].first),\n                codes_[codes_used_].second);\n      }\n      return false;\n  }\n}\n\nValidator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {\n  if (IsVedicAccent(ch)) {\n    return CharClass::kVedicMark;\n  }\n  if (ch == kZeroWidthNonJoiner) {\n    return CharClass::kZeroWidthNonJoiner;\n  }\n  if (ch == kZeroWidthJoiner) {\n    return 
CharClass::kZeroWidthJoiner;\n  }\n  // Offset from the start of the relevant unicode code block aka code page.\n  int base = static_cast<char32>(script_);\n  int off = ch - base;\n  // Anything in another code block is other.\n  if (off < 0 || off >= kIndicCodePageSize) {\n    return CharClass::kOther;\n  }\n  // Exception for Tamil. The aytham character is considered a letter.\n  if (script_ == ViramaScript::kTamil && off == 0x03) {\n    return CharClass::kVowel;\n  }\n  if (off < 0x4) {\n    return CharClass::kVowelModifier;\n  }\n  if (script_ == ViramaScript::kSinhala) {\n    // Sinhala is an exception.\n    if (off <= 0x19) {\n      return CharClass::kVowel;\n    }\n    if (off <= 0x49) {\n      return CharClass::kConsonant;\n    }\n    if (off == 0x4a) {\n      return CharClass::kVirama;\n    }\n    if (off <= 0x5f) {\n      return CharClass::kMatra;\n    }\n  } else {\n    if (off <= 0x14 || off == 0x50) {\n      return CharClass::kVowel;\n    }\n    if (off <= 0x3b || (0x58 <= off && off <= 0x5f)) {\n      return CharClass::kConsonant;\n    }\n    // Sinhala doesn't have Nukta or Avagraha.\n    if (off == 0x3c) {\n      return CharClass::kNukta;\n    }\n    if (off == 0x3d) {\n      return CharClass::kVowel; // avagraha\n    }\n    if (off <= 0x4c || (0x51 <= off && off <= 0x54)) {\n      return CharClass::kMatra;\n    }\n    if (0x55 <= off && off <= 0x57) {\n      return CharClass::kMatraPiece;\n    }\n    if (off == 0x4d) {\n      return CharClass::kVirama;\n    }\n  }\n  if (off == 0x60 || off == 0x61) {\n    return CharClass::kVowel;\n  }\n  if (off == 0x62 || off == 0x63) {\n    return CharClass::kMatra;\n  }\n  // Danda and digits up to 6f are OK as other.\n  // 70-7f are script-specific.\n  // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.\n  if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72)) {\n    return CharClass::kOther;\n  }\n  // 0BF3-0BFA are other Tamil symbols.\n  if (script_ == ViramaScript::kTamil && 
(0x73 <= off && off <= 0x7A)) {\n    return CharClass::kOther;\n  }\n  if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71)) {\n    return CharClass::kConsonant;\n  }\n  if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73)) {\n    return CharClass::kConsonant;\n  }\n  if (script_ == ViramaScript::kSinhala && off == 0x70) {\n    return CharClass::kConsonant;\n  }\n  if (script_ == ViramaScript::kDevanagari && off == 0x70) {\n    return CharClass::kOther;\n  }\n  if (0x70 <= off && off <= 0x73) {\n    return CharClass::kVowelModifier;\n  }\n  // Non Indic, Digits, Measures, danda, etc.\n  return CharClass::kOther;\n}\n\n// Helper consumes/copies a virama and any associated post-virama joiners.\n// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or\n// no joiner at all) must be followed by a consonant.\n// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non\n// consonant, space, or character from a different script. We clean up the\n// representation to make it consistent by adding a ZWNJ if missing from a\n// non-linking virama. 
Returns false with an invalid sequence.\nbool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {\n  const unsigned num_codes = codes_.size();\n  if (joiner.first == CharClass::kOther) {\n    CodeOnlyToOutput();\n    if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) {\n      // Post-matra viramas must be explicit, so no joiners allowed here.\n      if (post_matra) {\n        if (report_errors_) {\n          tprintf(\"ZWJ after a post-matra virama!!\\n\");\n        }\n        return false;\n      }\n      if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kRayana &&\n          (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||\n           codes_[codes_used_ + 1].second == kYayana ||\n           codes_[codes_used_ + 1].second == kRayana)) {\n        // This combination will be picked up later.\n        ASSERT_HOST(!CodeOnlyToOutput());\n      } else {\n        // Half-form with optional Nukta.\n        unsigned len = output_.size() + 1 - output_used_;\n        if (UseMultiCode(len)) {\n          return true;\n        }\n      }\n      if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) {\n        if (output_used_ == output_.size() || output_[output_used_] != kRayana) {\n          if (report_errors_) {\n            tprintf(\"Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\\n\", static_cast<int>(script_));\n          }\n          return false;\n        }\n        // Special Sinhala case of Stand-alone Repaya. 
['RA' H Z z]\n        if (UseMultiCode(4)) {\n          return true;\n        }\n      }\n    } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant ||\n               post_matra) {\n      if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) {\n        // It is valid to have an unterminated virama at the end of a word, but\n        // for consistency, we will always add ZWNJ if not present.\n        output_.push_back(kZeroWidthNonJoiner);\n      } else {\n        CodeOnlyToOutput();\n      }\n      // Explicit virama [H z]\n      MultiCodePart(2);\n    }\n  } else {\n    // Pre-virama joiner [{Z|z} H] requests specific conjunct.\n    if (UseMultiCode(2)) {\n      if (report_errors_) {\n        tprintf(\"Invalid pre-virama joiner with no 2nd consonant!!\\n\");\n      }\n      return false;\n    }\n    if (codes_[codes_used_].second == kZeroWidthJoiner ||\n        codes_[codes_used_].second == kZeroWidthNonJoiner) {\n      if (report_errors_) {\n        tprintf(\"JHJ!!: 0x%x 0x%x 0x%x\\n\", joiner.second, output_.back(),\n                codes_[codes_used_].second);\n      }\n      return false;\n    }\n  }\n  // It is good so far as it goes.\n  return true;\n}\n\n// Helper consumes/copies a series of consonants separated by viramas while\n// valid, but not any vowel or other modifiers.\nbool ValidateIndic::ConsumeConsonantHeadIfValid() {\n  const unsigned num_codes = codes_.size();\n  // Consonant aksara\n  do {\n    CodeOnlyToOutput();\n    // Special Sinhala case of [H Z Yayana/Rayana].\n    int index = output_.size() - 3;\n    if (output_used_ + 3 <= output_.size() &&\n        (output_.back() == kYayana || output_.back() == kRayana) && IsVirama(output_[index]) &&\n        output_[index + 1] == kZeroWidthJoiner) {\n      MultiCodePart(3);\n    }\n    bool have_nukta = false;\n    if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) {\n      have_nukta = true;\n      
CodeOnlyToOutput();\n    }\n    // Test for subscript conjunct.\n    index = output_.size() - 2 - have_nukta;\n    if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&\n        IsVirama(output_[index])) {\n      // Output previous virama, consonant + optional nukta.\n      MultiCodePart(2 + have_nukta);\n    }\n    IndicPair joiner(CharClass::kOther, 0);\n    if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner ||\n                                    (codes_[codes_used_].second == kZeroWidthNonJoiner &&\n                                     script_ == ViramaScript::kMalayalam))) {\n      joiner = codes_[codes_used_];\n      if (++codes_used_ == num_codes) {\n        if (report_errors_) {\n          tprintf(\"Skipping ending joiner: 0x%x 0x%x\\n\", output_.back(), joiner.second);\n        }\n        return true;\n      }\n      if (codes_[codes_used_].first == CharClass::kVirama) {\n        output_.push_back(joiner.second);\n      } else {\n        if (report_errors_) {\n          tprintf(\"Skipping unnecessary joiner: 0x%x 0x%x 0x%x\\n\", output_.back(), joiner.second,\n                  codes_[codes_used_].second);\n        }\n        joiner = std::make_pair(CharClass::kOther, 0);\n      }\n    }\n    if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) {\n      if (!ConsumeViramaIfValid(joiner, false)) {\n        return false;\n      }\n    } else {\n      break; // No virama, so the run of consonants is over.\n    }\n  } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant);\n  if (output_used_ < output_.size()) {\n    MultiCodePart(1);\n  }\n  return true;\n}\n\n// Helper consumes/copies a tail part of a consonant, comprising optional\n// matra/piece, vowel modifier, vedic mark, terminating virama.\nbool ValidateIndic::ConsumeConsonantTailIfValid() {\n  if (codes_used_ == codes_.size()) {\n    return true;\n  }\n  // No virama: Finish the grapheme.\n  
// Are multiple matras allowed?\n  if (codes_[codes_used_].first == CharClass::kMatra) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n    if (codes_[codes_used_].first == CharClass::kMatraPiece) {\n      if (UseMultiCode(1)) {\n        return true;\n      }\n    }\n  }\n  while (codes_[codes_used_].first == CharClass::kVowelModifier) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n    // Only Malayalam allows only repeated 0xd02.\n    if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) {\n      break;\n    }\n  }\n  while (codes_[codes_used_].first == CharClass::kVedicMark) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  if (codes_[codes_used_].first == CharClass::kVirama) {\n    if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {\n      return false;\n    }\n  }\n  // What we have consumed so far is a valid consonant cluster.\n  if (output_used_ < output_.size()) {\n    MultiCodePart(1);\n  }\n\n  return true;\n}\n\n// Helper consumes/copies a vowel and optional modifiers.\nbool ValidateIndic::ConsumeVowelIfValid() {\n  if (UseMultiCode(1)) {\n    return true;\n  }\n  while (codes_[codes_used_].first == CharClass::kVowelModifier) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n    // Only Malayalam allows repeated modifiers?\n    if (script_ != ViramaScript::kMalayalam) {\n      break;\n    }\n  }\n  while (codes_[codes_used_].first == CharClass::kVedicMark) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  // What we have consumed so far is a valid vowel cluster.\n  return true;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/validate_indic.h",
    "content": "#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_\n#define TESSERACT_TRAINING_VALIDATE_INDIC_H_\n\n#include \"validator.h\"\n\nnamespace tesseract {\n\n// Subclass of Validator that validates and segments Indic scripts in the\n// unicode range 0x900-0xdff (Devanagari-Sinhala).\nclass ValidateIndic : public Validator {\npublic:\n  ValidateIndic(ViramaScript script, bool report_errors) : Validator(script, report_errors) {}\n  ~ValidateIndic() override = default;\n\nprotected:\n  // Returns whether codes matches the pattern for an Indic Grapheme.\n  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to\n  // parts_ and output_. Returns true if a valid Grapheme was consumed,\n  // otherwise does not increment codes_used_.\n  bool ConsumeGraphemeIfValid() override;\n  // Returns the CharClass corresponding to the given Unicode ch.\n  Validator::CharClass UnicodeToCharClass(char32 ch) const override;\n\nprivate:\n  // Helper consumes/copies a virama and any associated post-virama joiners.\n  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);\n  // Helper consumes/copies a series of consonants separated by viramas while\n  // valid, but not any vowel or other modifiers.\n  bool ConsumeConsonantHeadIfValid();\n  // Helper consumes/copies a tail part of a consonant, comprising optional\n  // matra/piece, vowel modifier, vedic mark, terminating virama.\n  bool ConsumeConsonantTailIfValid();\n  // Helper consumes/copies a vowel and optional modifiers.\n  bool ConsumeVowelIfValid();\n\n  // Some special unicodes used only for Indic processing.\n  static const char32 kYayana = 0xdba; // Sinhala Ya\n  static const char32 kRayana = 0xdbb; // Sinhala Ra\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_\n"
  },
  {
    "path": "src/training/unicharset/validate_javanese.cpp",
    "content": "/**********************************************************************\n * File:        validate_javanese.cpp\n * Description: Text validator for Javanese Script - aksara jawa.\n * Author:      Shree Devi Kumar\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#include \"validate_javanese.h\"\n#include \"errcode.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Returns whether codes matches the pattern for a Javanese Grapheme.\n// Taken from unicode standard:\n// http://www.unicode.org/charts/PDF/UA980.pdf\n// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf\n// The Consonant class here includes independent vowels.\n// The order of components in an orthographic syllable as expressed in BNF is:\n// {C F} C {{R}Y} {V{A}} {Z}\n// Translated to the codes used by the CharClass enum:\n// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]\n// Also see https://r12a.github.io/scripts/javanese/ for detailed notes.\n// Validation rules copied from validate_indic.cpp and modified for Javanese.\n// Indic - for reference\n//  + vowel Grapheme:  V[D](v)*\n//  + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*\n\nbool ValidateJavanese::ConsumeGraphemeIfValid() {\n  switch (codes_[codes_used_].first) {\n    case CharClass::kConsonant:\n      return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();\n    case CharClass::kVowel:\n    case 
CharClass::kVedicMark:\n      return ConsumeVowelIfValid();\n    case CharClass::kZeroWidthJoiner:\n    case CharClass::kZeroWidthNonJoiner:\n      // Apart from within an aksara, joiners are silently dropped.\n      if (report_errors_) {\n        tprintf(\"Dropping isolated joiner: 0x%x\\n\", codes_[codes_used_].second);\n      }\n      ++codes_used_;\n      return true;\n    case CharClass::kOther:\n      UseMultiCode(1);\n      return true;\n    default:\n      if (report_errors_) {\n        tprintf(\"Invalid start of grapheme sequence:%c=0x%x\\n\",\n                static_cast<int>(codes_[codes_used_].first),\n                codes_[codes_used_].second);\n      }\n      return false;\n  }\n}\n\n// Helper consumes/copies a virama and any associated post-virama joiners.\n// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or\n// no joiner at all) must be followed by a consonant.\n// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non\n// consonant, space, or character from a different script. We clean up the\n// representation to make it consistent by adding a ZWNJ if missing from a\n// non-linking virama. 
Returns false with an invalid sequence.\nbool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {\n  const unsigned num_codes = codes_.size();\n  if (joiner.first == CharClass::kOther) {\n    CodeOnlyToOutput();\n    if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) {\n      // Post-matra viramas must be explicit, so no joiners allowed here.\n      if (post_matra) {\n        if (report_errors_) {\n          tprintf(\"ZWJ after a post-matra virama!!\\n\");\n        }\n        return false;\n      }\n      if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kCakra &&\n          (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||\n           codes_[codes_used_ + 1].second == kPengkal ||\n           codes_[codes_used_ + 1].second == kCakra)) {\n        // This combination will be picked up later.\n        ASSERT_HOST(!CodeOnlyToOutput());\n      } else {\n        // Half-form with optional Nukta.\n        unsigned len = output_.size() + 1 - output_used_;\n        if (UseMultiCode(len)) {\n          return true;\n        }\n      }\n      if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) {\n        if (output_used_ == output_.size() || output_[output_used_] != kCakra) {\n          if (report_errors_) {\n            tprintf(\"Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\\n\", static_cast<int>(script_));\n          }\n          return false;\n        }\n        // Special Sinhala case of Stand-alone Repaya. 
['RA' H Z z]\n        if (UseMultiCode(4)) {\n          return true;\n        }\n      }\n    } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant ||\n               post_matra) {\n      if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) {\n        // It is valid to have an unterminated virama at the end of a word, but\n        // for consistency, we will always add ZWNJ if not present.\n        output_.push_back(kZeroWidthNonJoiner);\n      } else {\n        CodeOnlyToOutput();\n      }\n      // Explicit virama [H z]\n      MultiCodePart(2);\n    }\n  } else {\n    // Pre-virama joiner [{Z|z} H] requests specific conjunct.\n    if (UseMultiCode(2)) {\n      if (report_errors_) {\n        tprintf(\"Invalid pre-virama joiner with no 2nd consonant!!\\n\");\n      }\n      return false;\n    }\n    if (codes_[codes_used_].second == kZeroWidthJoiner ||\n        codes_[codes_used_].second == kZeroWidthNonJoiner) {\n      if (report_errors_) {\n        tprintf(\"JHJ!!: 0x%x 0x%x 0x%x\\n\", joiner.second, output_.back(),\n                codes_[codes_used_].second);\n      }\n      return false;\n    }\n  }\n  // It is good so far as it goes.\n  return true;\n}\n\n// Helper consumes/copies a series of consonants separated by viramas while\n// valid, but not any vowel or other modifiers.\nbool ValidateJavanese::ConsumeConsonantHeadIfValid() {\n  const unsigned num_codes = codes_.size();\n  // Consonant aksara\n  do {\n    CodeOnlyToOutput();\n    // Special Sinhala case of [H Z Yayana/Rayana].\n    int index = output_.size() - 3;\n    if (output_used_ + 3 <= output_.size() &&\n        (output_.back() == kPengkal || output_.back() == kCakra) && IsVirama(output_[index]) &&\n        output_[index + 1] == kZeroWidthJoiner) {\n      MultiCodePart(3);\n    }\n    bool have_nukta = false;\n    if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) {\n      have_nukta = true;\n      CodeOnlyToOutput();\n    
}\n    // Test for subscript conjunct.\n    index = output_.size() - 2 - have_nukta;\n    if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&\n        IsVirama(output_[index])) {\n      // Output previous virama, consonant + optional nukta.\n      MultiCodePart(2 + have_nukta);\n    }\n    IndicPair joiner(CharClass::kOther, 0);\n    if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner ||\n                                    (codes_[codes_used_].second == kZeroWidthNonJoiner &&\n                                     script_ == ViramaScript::kMalayalam))) {\n      joiner = codes_[codes_used_];\n      if (++codes_used_ == num_codes) {\n        if (report_errors_) {\n          tprintf(\"Skipping ending joiner: 0x%x 0x%x\\n\", output_.back(), joiner.second);\n        }\n        return true;\n      }\n      if (codes_[codes_used_].first == CharClass::kVirama) {\n        output_.push_back(joiner.second);\n      } else {\n        if (report_errors_) {\n          tprintf(\"Skipping unnecessary joiner: 0x%x 0x%x 0x%x\\n\", output_.back(), joiner.second,\n                  codes_[codes_used_].second);\n        }\n        joiner = std::make_pair(CharClass::kOther, 0);\n      }\n    }\n    if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) {\n      if (!ConsumeViramaIfValid(joiner, false)) {\n        return false;\n      }\n    } else {\n      break; // No virama, so the run of consonants is over.\n    }\n  } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant);\n  if (output_used_ < output_.size()) {\n    MultiCodePart(1);\n  }\n  return true;\n}\n\n// Helper consumes/copies a tail part of a consonant, comprising optional\n// matra/piece, vowel modifier, vedic mark, terminating virama.\nbool ValidateJavanese::ConsumeConsonantTailIfValid() {\n  if (codes_used_ == codes_.size()) {\n    return true;\n  }\n  // No virama: Finish the grapheme.\n  // Are multiple matras 
allowed?\n  if (codes_[codes_used_].first == CharClass::kMatra) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n    if (codes_[codes_used_].first == CharClass::kMatraPiece) {\n      if (UseMultiCode(1)) {\n        return true;\n      }\n    }\n  }\n  // Tarung also used for long versions of u and o vowels and vocalic r\n  // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ\n  while (codes_[codes_used_].first == CharClass::kMatraPiece) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  while (codes_[codes_used_].first == CharClass::kVowelModifier) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n    // Only Malayalam allows only repeated 0xd02.\n    if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) {\n      break;\n    }\n  }\n  while (codes_[codes_used_].first == CharClass::kVedicMark) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  if (codes_[codes_used_].first == CharClass::kVirama) {\n    if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {\n      return false;\n    }\n  }\n  // What we have consumed so far is a valid consonant cluster.\n  if (output_used_ < output_.size()) {\n    MultiCodePart(1);\n  }\n\n  return true;\n}\n\n// Helper consumes/copies a vowel and optional modifiers.\nbool ValidateJavanese::ConsumeVowelIfValid() {\n  if (UseMultiCode(1)) {\n    return true;\n  }\n  while (codes_[codes_used_].first == CharClass::kVowelModifier) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n    // Only Malayalam allows repeated modifiers?\n    if (script_ != ViramaScript::kMalayalam) {\n      break;\n    }\n  }\n  while (codes_[codes_used_].first == CharClass::kVedicMark) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  // What we have consumed so far is a valid vowel cluster.\n  return true;\n}\n\nValidator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {\n  if (ch == kZeroWidthNonJoiner) {\n    return CharClass::kZeroWidthNonJoiner;\n  }\n  if 
(ch == kZeroWidthJoiner) {\n    return CharClass::kZeroWidthJoiner;\n  }\n  // Offset from the start of the relevant unicode code block aka code page.\n  int off = ch - static_cast<char32>(script_);\n  // Anything in another code block is other.\n  if (off < 0 || off >= kIndicCodePageSize) {\n    return CharClass::kOther;\n  }\n  if (off < 0x4) {\n    return CharClass::kVowelModifier;\n  }\n  if (off <= 0x32) {\n    return CharClass::kConsonant; // includes independent vowels\n  }\n  if (off == 0x33) {\n    return CharClass::kNukta; // A9B3 CECAK TELU\n  }\n  if (off == 0x34) {\n    return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels\n  }\n  if (off <= 0x39) {\n    return CharClass::kMatra;\n  }\n  if (off <= 0x3a) {\n    return CharClass::kConsonant; // A9BA TALING - pre base vowel\n  }\n  if (off <= 0x3d) {\n    return CharClass::kMatra;\n  }\n  if (off <= 0x3f) {\n    return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants\n  }\n  if (off == 0x40) {\n    return CharClass::kVirama; // A9C0 PANGKON\n  }\n  return CharClass::kOther;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/validate_javanese.h",
    "content": "/**********************************************************************\n * File:        validate_javanese.h\n * Description: Text validator for Javanese Script - aksara jawa.\n * Author:      Shree Devi Kumar\n * Created:     August 03, 2018\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_\n#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_\n\n#include \"validator.h\"\n\nnamespace tesseract {\n\n// Subclass of Validator that validates and segments Javanese scripts\n\nclass ValidateJavanese : public Validator {\npublic:\n  ValidateJavanese(ViramaScript script, bool report_errors) : Validator(script, report_errors) {}\n  ~ValidateJavanese() override = default;\n\nprotected:\n  // Returns whether codes matches the pattern for an Javanese Grapheme.\n  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to\n  // parts_ and output_. 
Returns true if a valid Grapheme was consumed,\n  // otherwise does not increment codes_used_.\n  bool ConsumeGraphemeIfValid() override;\n  // Returns the CharClass corresponding to the given Unicode ch.\n  Validator::CharClass UnicodeToCharClass(char32 ch) const override;\n\nprivate:\n  // Helper consumes/copies a virama and any associated post-virama joiners.\n  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);\n  // Helper consumes/copies a series of consonants separated by viramas while\n  // valid, but not any vowel or other modifiers.\n  bool ConsumeConsonantHeadIfValid();\n  // Helper consumes/copies a tail part of a consonant, comprising optional\n  // matra/piece, vowel modifier, vedic mark, terminating virama.\n  bool ConsumeConsonantTailIfValid();\n  // Helper consumes/copies a vowel and optional modifiers.\n  bool ConsumeVowelIfValid();\n\n  // Some special unicodes used only for Javanese processing.\n  static const char32 kPengkal = 0xa9be; // Javanese Ya\n  static const char32 kCakra = 0xa9bf;   // Javanese Ra\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_\n"
  },
  {
    "path": "src/training/unicharset/validate_khmer.cpp",
    "content": "#include \"validate_khmer.h\"\n#include \"errcode.h\"\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Returns whether codes matches the pattern for a Khmer Grapheme.\n// Taken from unicode standard:\n// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.\n// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation\n// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.\n// Translated to the codes used by the CharClass enum:\n// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}\n// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.\n// Also the Consonant class here includes independent vowels, as they are\n// treated the same anyway.\n// In the split grapheme mode, the only characters that get grouped are the\n// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in\n// the BNF syntax, so who knows what they do.\nbool ValidateKhmer::ConsumeGraphemeIfValid() {\n  const unsigned num_codes = codes_.size();\n  if (codes_used_ == num_codes) {\n    return false;\n  }\n  if (codes_[codes_used_].first == CharClass::kOther) {\n    UseMultiCode(1);\n    return true;\n  }\n  if (codes_[codes_used_].first != CharClass::kConsonant) {\n    if (report_errors_) {\n      tprintf(\"Invalid start of Khmer syllable:0x%x\\n\", codes_[codes_used_].second);\n    }\n    return false;\n  }\n  if (UseMultiCode(1)) {\n    return true;\n  }\n  if (codes_[codes_used_].first == CharClass::kRobat ||\n      codes_[codes_used_].first == CharClass::kNukta) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  while (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&\n         codes_[codes_used_ + 1].first == CharClass::kConsonant) {\n    ASSERT_HOST(!CodeOnlyToOutput());\n    if (UseMultiCode(2)) {\n      return true;\n    }\n    if (codes_[codes_used_].first == CharClass::kRobat) {\n      if (UseMultiCode(1)) {\n        return true;\n      }\n    }\n  }\n  
unsigned num_matra_parts = 0;\n  if (codes_[codes_used_].second == kZeroWidthJoiner ||\n      codes_[codes_used_].second == kZeroWidthNonJoiner) {\n    if (CodeOnlyToOutput()) {\n      if (report_errors_) {\n        tprintf(\"Unterminated joiner: 0x%x\\n\", output_.back());\n      }\n      return false;\n    }\n    ++num_matra_parts;\n  }\n  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its\n  // own or as an addition to other matras.\n  if (codes_[codes_used_].first == CharClass::kMatra ||\n      codes_[codes_used_].first == CharClass::kMatraPiece) {\n    ++num_matra_parts;\n    if (UseMultiCode(num_matra_parts)) {\n      return true;\n    }\n  } else if (num_matra_parts) {\n    if (report_errors_) {\n      tprintf(\"Joiner with non-dependent vowel after it!:0x%x 0x%x\\n\", output_.back(),\n              codes_[codes_used_].second);\n    }\n    return false;\n  }\n  if (codes_[codes_used_].first == CharClass::kMatraPiece &&\n      codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  if (codes_[codes_used_].first == CharClass::kVowelModifier) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  if (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&\n      codes_[codes_used_ + 1].first == CharClass::kConsonant) {\n    ASSERT_HOST(!CodeOnlyToOutput());\n    if (UseMultiCode(2)) {\n      return true;\n    }\n  }\n  return true;\n}\n\nValidator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {\n  if (IsVedicAccent(ch)) {\n    return CharClass::kVedicMark;\n  }\n  if (ch == kZeroWidthNonJoiner) {\n    return CharClass::kZeroWidthNonJoiner;\n  }\n  if (ch == kZeroWidthJoiner) {\n    return CharClass::kZeroWidthJoiner;\n  }\n  // Offset from the start of the relevant unicode code block aka code page.\n  int off = ch - static_cast<char32>(script_);\n  // Anything in another code block is other.\n  if (off < 0 || off >= 
kIndicCodePageSize) {\n    return CharClass::kOther;\n  }\n  if (off <= 0x33) {\n    return CharClass::kConsonant;\n  }\n  if (off <= 0x45) {\n    return CharClass::kMatra;\n  }\n  if (off == 0x46) {\n    return CharClass::kMatraPiece;\n  }\n  if (off == 0x4c) {\n    return CharClass::kRobat;\n  }\n  if (off == 0x49 || off == 0x4a) {\n    return CharClass::kNukta;\n  }\n  if (off <= 0x51) {\n    return CharClass::kVowelModifier;\n  }\n  if (off == 0x52) {\n    return CharClass::kVirama;\n  }\n  return CharClass::kOther;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/validate_khmer.h",
    "content": "#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_\n#define TESSERACT_TRAINING_VALIDATE_KHMER_H_\n\n#include \"validator.h\"\n\nnamespace tesseract {\n\n// Subclass of Validator that validates and segments Khmer.\nclass ValidateKhmer : public Validator {\npublic:\n  ValidateKhmer(ViramaScript script, bool report_errors) : Validator(script, report_errors) {}\n  ~ValidateKhmer() override = default;\n\nprotected:\n  // Returns whether codes matches the pattern for an Khmer Grapheme.\n  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to\n  // parts_ and output_. Returns true if a valid Grapheme was consumed,\n  // otherwise does not increment codes_used_.\n  bool ConsumeGraphemeIfValid() override;\n  // Returns the CharClass corresponding to the given Unicode ch.\n  CharClass UnicodeToCharClass(char32 ch) const override;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_\n"
  },
  {
    "path": "src/training/unicharset/validate_myanmar.cpp",
    "content": "#include \"validate_myanmar.h\"\n#include \"errcode.h\"\n#include \"icuerrorcode.h\"\n#include \"tprintf.h\"\n#include \"unicode/uchar.h\"   // From libicu\n#include \"unicode/uscript.h\" // From libicu\n\nnamespace tesseract {\n\n// Returns whether codes matches the pattern for a Myanmar Grapheme.\n// Taken directly from the unicode table 16-3.\n// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf\nbool ValidateMyanmar::ConsumeGraphemeIfValid() {\n  const unsigned num_codes = codes_.size();\n  if (codes_used_ == num_codes) {\n    return true;\n  }\n  // Other.\n  if (IsMyanmarOther(codes_[codes_used_].second)) {\n    UseMultiCode(1);\n    return true;\n  }\n  // Kinzi.\n  if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&\n      codes_[codes_used_ + 1].second == kMyanmarAsat &&\n      codes_[codes_used_ + 2].second == kMyanmarVirama) {\n    ASSERT_HOST(!CodeOnlyToOutput());\n    ASSERT_HOST(!CodeOnlyToOutput());\n    if (UseMultiCode(3)) {\n      return true;\n    }\n  }\n  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be\n  // optional, except the base, this is the only place where invalid input can\n  // be detected and false returned.\n  if (IsMyanmarLetter(codes_[codes_used_].second)) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  } else {\n    if (report_errors_) {\n      tprintf(\"Invalid start of Myanmar syllable:0x%x\\n\", codes_[codes_used_].second);\n    }\n    return false; // One of these is required.\n  }\n  if (ConsumeSubscriptIfPresent()) {\n    return true;\n  }\n  ConsumeOptionalSignsIfPresent();\n  // What we have consumed so far is a valid syllable.\n  return true;\n}\n\n// TODO(rays) Doesn't use intermediate coding like the other scripts, as there\n// is little correspondence between the content of table 16-3 and the char\n// classes of the Indic languages. 
(Experts may disagree and improve!)\n// In unicode table 16-3 there is basically a long list of optional characters,\n// which can be coded quite easily.\n// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!\n// The table also allows sequences that still result in dotted circles!!\n// So with a lot of guesswork the rest have been added in a reasonable place.\nValidator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {\n  if (IsMyanmarLetter(ch)) {\n    return CharClass::kConsonant;\n  }\n  return CharClass::kOther;\n}\n\n// Helper consumes/copies a virama and any subscript consonant.\n// Returns true if the end of input is reached.\nbool ValidateMyanmar::ConsumeSubscriptIfPresent() {\n  // Subscript consonant. It appears there can be only one.\n  const unsigned num_codes = codes_.size();\n  if (codes_used_ + 1 < num_codes && codes_[codes_used_].second == kMyanmarVirama) {\n    if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {\n      ASSERT_HOST(!CodeOnlyToOutput());\n      if (UseMultiCode(2)) {\n        return true;\n      }\n    }\n  }\n  return false;\n}\n\n// Helper consumes/copies a series of optional signs.\n// Returns true if the end of input is reached.\nbool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {\n  // The following characters are allowed, all optional, and in sequence.\n  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.\n  const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c, 0x103d, 0x103e,\n                                      0x105e, 0x105f, 0x1060, 0x1081, 0x1031});\n  for (char32 ch : kMedials) {\n    if (codes_[codes_used_].second == ch) {\n      if (UseMultiCode(1)) {\n        return true;\n      }\n      if (ch == kMyanmarMedialYa && codes_[codes_used_].second == kMyanmarAsat) {\n        if (UseMultiCode(1)) {\n          return true;\n        }\n      }\n    }\n  }\n  // Vowel sign i, ii, ai.\n  char32 ch = codes_[codes_used_].second;\n  if (ch == 
0x102d || ch == 0x102e || ch == 0x1032) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  // Vowel sign u, uu, and extensions.\n  ch = codes_[codes_used_].second;\n  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) || ch == 0x1062 ||\n      ch == 0x1067 || ch == 0x1068 || (0x1071 <= ch && ch <= 0x1074) ||\n      (0x1083 <= ch && ch <= 0x1086) || ch == 0x109c || ch == 0x109d) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  // Tall aa, aa with optional asat.\n  if (codes_[codes_used_].second == 0x102b || codes_[codes_used_].second == 0x102c) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n    if (codes_[codes_used_].second == kMyanmarAsat) {\n      if (UseMultiCode(1)) {\n        return true;\n      }\n    }\n  }\n  // The following characters are allowed, all optional, and in sequence.\n  // Anusvar, Dot below, Visarga\n  const std::vector<char32> kSigns({0x1036, 0x1037, 0x1038});\n  for (char32 ch : kSigns) {\n    if (codes_[codes_used_].second == ch) {\n      if (UseMultiCode(1)) {\n        return true;\n      }\n    }\n  }\n  // Tone mark extensions.\n  ch = codes_[codes_used_].second;\n  if (ch == 0x102c || ch == 0x1038 || ch == kMyanmarAsat || (0x1062 <= ch && ch <= 0x1064) ||\n      (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || ch == 0x108f ||\n      ch == 0x109a || ch == 0x109b || (0xaa7b <= ch && ch <= 0xaa7d)) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  // Sgaw tones 0x1062, 0x1063 must be followed by asat.\n  // W Pwo tones 0x1069, 0x106a, and 0x106b may be followed by dot below or visarga (nasal).\n  ch = codes_[codes_used_].second;\n  if (ch == 0x103a || ch == 0x1037 || ch == 0x1038) {\n    if (UseMultiCode(1)) {\n      return true;\n    }\n  }\n  return false;\n}\n\n// Returns true if the unicode is a Myanmar \"letter\" including consonants\n// and independent vowels. 
Although table 16-3 distinguishes between some\n// base consonants and vowels, the extensions make no such distinction, so we\n// put them all into a single bucket.\n// Update MYANMAR LETTER based on following:\n// https://unicode.org/charts/PDF/U1000.pdf - Myanmar\n// http://unicode.org/charts/PDF/UAA60.pdf - Myanmar Extended-A\n// http://unicode.org/charts/PDF/UA9E0.pdf - Myanmar Extended-B\n/* static */\nbool ValidateMyanmar::IsMyanmarLetter(char32 ch) {\n  return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f || (0x104c <= ch && ch <= 0x1055) ||\n         (0x105a <= ch && ch <= 0x105d) || ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||\n         (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1081) || ch == 0x108e ||\n         (0xa9e0 <= ch && ch <= 0xa9e4) || (0xa9e7 <= ch && ch <= 0xa9ef) ||\n         (0xa9fa <= ch && ch <= 0xa9fe) || (0xaa60 <= ch && ch <= 0xaa6f) ||\n         (0xaa71 <= ch && ch <= 0xaa73) || ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;\n}\n\n// Returns true if ch is a Myanmar digit or other symbol that does not take\n// part in being a syllable eg. punctuation marks.\n// MYANMAR DIGIT, MYANMAR SYMBOL, MYANMAR LOGOGRAM\n// REDUPLICATION MARKS\n/* static */\nbool ValidateMyanmar::IsMyanmarOther(char32 ch) {\n  IcuErrorCode err;\n  UScriptCode script_code = uscript_getScript(ch, err);\n  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&\n      ch != Validator::kZeroWidthNonJoiner) {\n    return true;\n  }\n  return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||\n         (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||\n         (ch == 0xa9e6 || ch == 0xaa70) || (0xaa74 <= ch && ch <= 0xaa79);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/validate_myanmar.h",
    "content": "#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_\n#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_\n\n#include \"validator.h\"\n\nnamespace tesseract {\n\n// Subclass of Validator that validates and segments Myanmar.\nclass ValidateMyanmar : public Validator {\npublic:\n  ValidateMyanmar(ViramaScript script, bool report_errors) : Validator(script, report_errors) {}\n  ~ValidateMyanmar() override = default;\n\nprotected:\n  // Returns whether codes matches the pattern for a Myanmar Grapheme.\n  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to\n  // parts_ and output_. Returns true if a valid Grapheme was consumed,\n  // otherwise does not increment codes_used_.\n  bool ConsumeGraphemeIfValid() override;\n  // Returns the CharClass corresponding to the given Unicode ch.\n  Validator::CharClass UnicodeToCharClass(char32 ch) const override;\n\nprivate:\n  // Helper consumes/copies a virama and any subscript consonant.\n  // Returns true if the end of input is reached.\n  bool ConsumeSubscriptIfPresent();\n  // Helper consumes/copies a series of optional signs.\n  // Returns true if the end of input is reached.\n  bool ConsumeOptionalSignsIfPresent();\n  // Returns true if the unicode is a Myanmar \"letter\" including consonants\n  // and independent vowels. Although table 16-3 distinguishes between some\n  // base consonants and vowels, the extensions make no such distinction, so we\n  // put them all into a single bucket.\n  static bool IsMyanmarLetter(char32 ch);\n  // Returns true if ch is a Myanmar digit or other symbol that does not take\n  // part in being a syllable.\n  static bool IsMyanmarOther(char32 ch);\n\n  // Some special unicodes used only for Myanmar processing.\n  static const char32 kMyanmarAsat = 0x103a;\n  static const char32 kMyanmarMedialYa = 0x103b;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_\n"
  },
  {
    "path": "src/training/unicharset/validator.cpp",
    "content": "#include \"validator.h\"\n\n#include <algorithm>\n#include <iterator>\n#include <unordered_map>\n#include <vector>\n\n#include \"icuerrorcode.h\"\n#include \"unicode/uchar.h\"   // From libicu\n#include \"unicode/uscript.h\" // From libicu\n#include \"validate_grapheme.h\"\n#include \"validate_indic.h\"\n#include \"validate_javanese.h\"\n#include \"validate_khmer.h\"\n#include \"validate_myanmar.h\"\n\nnamespace tesseract {\n\n// Some specific but universally useful unicodes.\nconst char32 Validator::kZeroWidthSpace = 0x200B;\nconst char32 Validator::kZeroWidthNonJoiner = 0x200C;\nconst char32 Validator::kZeroWidthJoiner = 0x200D;\nconst char32 Validator::kLeftToRightMark = 0x200E;\nconst char32 Validator::kRightToLeftMark = 0x200F;\nconst char32 Validator::kInvalid = 0xfffd;\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nValidator::~Validator() = default;\n\n// Validates and cleans the src vector of unicodes to the *dest, according to\n// g_mode. In the case of kSingleString, a single vector containing the whole\n// result is added to *dest. With kCombined, multiple vectors are added to\n// *dest with one grapheme in each. 
With kGlyphSplit, multiple vectors are\n// added to *dest with a smaller unit representing a glyph in each.\n// In case of validation error, returns false and as much as possible of the\n// input, without discarding invalid text.\n/* static */\nbool Validator::ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors,\n                                        const std::vector<char32> &src,\n                                        std::vector<std::vector<char32>> *dest) {\n  ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);\n  std::vector<std::vector<char32>> graphemes;\n  ViramaScript script = MostFrequentViramaScript(src);\n  bool success = true;\n  if (script == ViramaScript::kNonVirama) {\n    // The grapheme segmenter's maximum segmentation is the grapheme unit, so\n    // up the mode by 1 to get the desired effect.\n    if (g_mode == GraphemeNormMode::kCombined) {\n      g_mode = GraphemeNormMode::kGlyphSplit;\n    } else if (g_mode == GraphemeNormMode::kGlyphSplit) {\n      g_mode = GraphemeNormMode::kIndividualUnicodes;\n    }\n    // Just do grapheme segmentation.\n    success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);\n  } else {\n    success =\n        g_validator.ValidateCleanAndSegmentInternal(GraphemeNormMode::kGlyphSplit, src, &graphemes);\n    std::unique_ptr<Validator> validator(ScriptValidator(script, report_errors));\n    for (const auto &grapheme : graphemes) {\n      if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {\n        success = false;\n      }\n    }\n  }\n  return success;\n}\n\n// Factory method that understands how to map script to the right subclass.\nstd::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script, bool report_errors) {\n  switch (script) {\n#define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors)\n    CASE(kNonVirama, ValidateGrapheme);\n    CASE(kJavanese, ValidateJavanese);\n    CASE(kMyanmar, 
ValidateMyanmar);\n    CASE(kKhmer, ValidateKhmer);\n#undef CASE\n    default:\n      return std::make_unique<ValidateIndic>(script, report_errors);\n  }\n}\n\n// Internal version of the public static ValidateCleanAndSegment.\n// Validates and cleans the src vector of unicodes to the *dest, according to\n// its type and the given g_mode.\n// In case of validation error, returns false and returns as much as possible\n// of the input, without discarding invalid text.\nbool Validator::ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,\n                                                const std::vector<char32> &src,\n                                                std::vector<std::vector<char32>> *dest) {\n  Clear();\n  ComputeClassCodes(src);\n  bool success = true;\n  for (codes_used_ = 0; codes_used_ < codes_.size();) {\n    if (!ConsumeGraphemeIfValid()) {\n      success = false;\n      ++codes_used_;\n    }\n  }\n  MoveResultsToDest(g_mode, dest);\n  return success;\n}\n\n// Moves the results from parts_ or output_ to dest according to g_mode.\nvoid Validator::MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest) {\n  if (g_mode == GraphemeNormMode::kIndividualUnicodes) {\n    // Append each element of the combined output_ that we made as a new vector\n    // in dest.\n    dest->reserve(dest->size() + output_.size());\n    for (char32 ch : output_) {\n      dest->push_back({ch});\n    }\n  } else if (g_mode == GraphemeNormMode::kGlyphSplit) {\n    // Append all the parts_ that we made onto dest.\n    std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));\n  } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {\n    // Append the combined output_ that we made onto dest as one new vector.\n    dest->push_back(std::vector<char32>());\n    output_.swap(dest->back());\n  } else { // kNone.\n    // Append the combined output_ that we made onto the last existing element\n    // of dest.\n    
dest->back().insert(dest->back().end(), output_.begin(), output_.end());\n  }\n}\n\nstatic bool CmpPairSecond(const std::pair<int, int> &p1, const std::pair<int, int> &p2) {\n  return p1.second < p2.second;\n}\n\n// Computes and returns the ViramaScript corresponding to the most frequent\n// virama-using script in the input, or kNonVirama if none are present.\n/* static */\nViramaScript Validator::MostFrequentViramaScript(const std::vector<char32> &utf32) {\n  std::unordered_map<int, int> histogram;\n  for (char32 ch : utf32) {\n    // Determine the codepage base. For the Indic scripts, Khmer and Javanese,\n    // it is sufficient to divide by kIndicCodePageSize but Myanmar is all over\n    // the unicode code space, so use its script id.\n    int base = ch / kIndicCodePageSize;\n    IcuErrorCode err;\n    UScriptCode script_code = uscript_getScript(ch, err);\n    if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) ||\n        script_code == USCRIPT_MYANMAR) {\n      if (script_code == USCRIPT_MYANMAR) {\n        base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;\n      }\n      ++histogram[base];\n    }\n  }\n  if (!histogram.empty()) {\n    int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first;\n    auto codebase = static_cast<char32>(base * kIndicCodePageSize);\n    // Check for validity.\n    if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||\n        codebase == static_cast<char32>(ViramaScript::kJavanese) ||\n        codebase == static_cast<char32>(ViramaScript::kKhmer) ||\n        (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&\n         codebase <= static_cast<char32>(ViramaScript::kSinhala))) {\n      return static_cast<ViramaScript>(codebase);\n    }\n  }\n  return ViramaScript::kNonVirama;\n}\n\n// Returns true if the given UTF-32 unicode is a \"virama\" character.\n/* static */\nbool Validator::IsVirama(char32 unicode) {\n  return 
(kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&\n          (unicode & 0x7f) == 0x4d) ||\n         unicode == kSinhalaVirama || unicode == kJavaneseVirama || unicode == kMyanmarVirama ||\n         unicode == kKhmerVirama;\n}\n\n// Returns true if the given UTF-32 unicode is a vedic accent.\n/* static */\nbool Validator::IsVedicAccent(char32 unicode) {\n  return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) ||\n         (0x951 <= unicode && unicode <= 0x954);\n}\n\n// Returns true if the script is one that uses subscripts for conjuncts.\nbool Validator::IsSubscriptScript() const {\n  return script_ == ViramaScript::kTelugu || script_ == ViramaScript::kKannada ||\n         script_ == ViramaScript::kJavanese || script_ == ViramaScript::kMyanmar ||\n         script_ == ViramaScript::kKhmer;\n}\n\nvoid Validator::ComputeClassCodes(const std::vector<char32> &text) {\n  codes_.reserve(text.size());\n  for (char32 c : text) {\n    codes_.emplace_back(UnicodeToCharClass(c), c);\n  }\n}\n\n// Resets to the initial state.\nvoid Validator::Clear() {\n  codes_.clear();\n  parts_.clear();\n  output_.clear();\n  codes_used_ = 0;\n  output_used_ = 0;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/training/unicharset/validator.h",
    "content": "/**********************************************************************\n * File:        validator.h\n * Description: Base class for various text validators. Intended mainly for\n *              scripts that use a virama character.\n * Author:      Ray Smith\n *\n * (C) Copyright 2017, Google Inc.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n * http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n **********************************************************************/\n\n#ifndef TESSERACT_TRAINING_VALIDATOR_H_\n#define TESSERACT_TRAINING_VALIDATOR_H_\n\n#include \"export.h\"\n\n#include <tesseract/unichar.h>\n\n#include <memory>\n#include <vector>\n\nnamespace tesseract {\n\n// Different kinds of grapheme normalization - not just for Indic!\n// A grapheme is a syllable unit in Indic and can be several unicodes.\n// In other scripts, a grapheme is a base character and accent/diacritic\n// combination, as not all accented characters have a single composed form.\nenum class GraphemeNormMode {\n  // Validation result is a single string, even if input is multi-word.\n  kSingleString,\n  // Standard unicode graphemes are validated and output as grapheme units.\n  kCombined,\n  // Graphemes are validated and sub-divided. For virama-using scripts, units\n  // that correspond to repeatable glyphs are generated. 
(Mostly single unicodes\n  // but viramas and joiners are paired with the most sensible neighbor.)\n  // For non-virama scripts, this means that base/accent pairs are separated,\n  // ie the output is individual unicodes.\n  kGlyphSplit,\n  // The output is always single unicodes, regardless of the script.\n  kIndividualUnicodes,\n};\n\n// An enum representing the scripts that use a virama character. It is\n// guaranteed that the value of any element, (except kNonVirama) can be cast\n// to a unicode (char32) value that represents the start of the unicode range\n// of the corresponding script.\nenum class ViramaScript : char32 {\n  kNonVirama = 0,\n  kDevanagari = 0x900,\n  kBengali = 0x980,\n  kGurmukhi = 0xa00,\n  kGujarati = 0xa80,\n  kOriya = 0xb00,\n  kTamil = 0xb80,\n  kTelugu = 0xc00,\n  kKannada = 0xc80,\n  kMalayalam = 0xd00,\n  kSinhala = 0xd80,\n  kMyanmar = 0x1000,\n  kKhmer = 0x1780,\n  kJavanese = 0xa980,\n};\n\n// Base class offers a validation API and protected methods to allow subclasses\n// to easily build the validated/segmented output.\nclass TESS_UNICHARSET_TRAINING_API Validator {\npublic:\n  // Validates and cleans the src vector of unicodes to the *dest, according to\n  // g_mode. In the case of kSingleString, a single vector containing the whole\n  // result is added to *dest. With kCombined, multiple vectors are added to\n  // *dest with one grapheme in each. 
With kGlyphSplit, multiple vectors are\n  // added to *dest with a smaller unit representing a glyph in each.\n  // In case of validation error, returns false and as much as possible of the\n  // input, without discarding invalid text.\n  static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors,\n                                      const std::vector<char32> &src,\n                                      std::vector<std::vector<char32>> *dest);\n\n  // Returns true if the unicode ch is a non-printing zero-width mark of no\n  // significance to OCR training or evaluation.\n  static bool IsZeroWidthMark(char32 ch) {\n    return ch == kZeroWidthSpace || ch == kLeftToRightMark || ch == kRightToLeftMark ||\n           ch == kInvalid;\n  }\n  virtual ~Validator();\n\n  // Some specific but universally useful unicodes.\n  static const char32 kZeroWidthSpace;\n  static const char32 kZeroWidthNonJoiner;\n  static const char32 kZeroWidthJoiner;\n  static const char32 kLeftToRightMark;\n  static const char32 kRightToLeftMark;\n  static const char32 kInvalid;\n\nprotected:\n  // These are more or less the character class identifiers in the ISCII\n  // standard, section 8.  
They have been augmented with the Unicode meta\n  // characters Zero Width Joiner and Zero Width Non Joiner, and the\n  // Unicode Vedic Marks.\n  // The best sources of information on Unicode and Indic scripts are:\n  //   http://varamozhi.sourceforge.net/iscii91.pdf\n  //   http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf\n  //   http://unicode.org/faq/indic.html\n  //   http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx\n  enum class CharClass {\n    // NOTE: The values of the enum members are meaningless and arbitrary, ie\n    // they are not used for sorting, or any other risky application.\n    // The reason they are what they are is they are a single character\n    // abbreviation that can be used in a regexp/BNF definition of a grammar,\n    // IN A COMMENT, and still not relied upon in the code.\n    kConsonant = 'C',\n    kVowel = 'V',\n    kVirama = 'H',             // (aka Halant)\n    kMatra = 'M',              // (aka Dependent Vowel)\n    kMatraPiece = 'P',         // unicode provides pieces of Matras.\n    kVowelModifier = 'D',      // (candrabindu, anusvara, visarga, other marks)\n    kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C\n    kZeroWidthJoiner = 'Z',    // Unicode Zero Width Joiner U+200D\n    kVedicMark = 'v',          // Modifiers can come modify any indic syllable.\n    kNukta = 'N',              // Occurs only immediately after consonants.\n    kRobat = 'R',              // Khmer only.\n    kOther = 'O',              // (digits, measures, non-Indic, etc)\n    // Additional classes used only by ValidateGrapheme.\n    kWhitespace = ' ',\n    kCombiner = 'c', // Combiners other than virama.\n  };\n  using IndicPair = std::pair<CharClass, char32>;\n\n  Validator(ViramaScript script, bool report_errors)\n      : script_(script), codes_used_(0), output_used_(0), report_errors_(report_errors) {}\n\n  // Factory method that understands how to map script to the right subclass.\n  static 
std::unique_ptr<Validator> ScriptValidator(ViramaScript script, bool report_errors);\n\n  // Internal version of the public static ValidateCleanAndSegment.\n  // Validates and cleans the src vector of unicodes to the *dest, according to\n  // its type and the given g_mode.\n  // In case of validation error, returns false and returns as much as possible\n  // of the input, without discarding invalid text.\n  bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector<char32> &src,\n                                       std::vector<std::vector<char32>> *dest);\n  // Moves the results from parts_ or output_ to dest according to g_mode.\n  void MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest);\n\n  // Computes and returns the ViramaScript corresponding to the most frequent\n  // virama-using script in the input, or kNonVirama if none are present.\n  static ViramaScript MostFrequentViramaScript(const std::vector<char32> &utf32);\n  // Returns true if the given UTF-32 unicode is a \"virama\" character.\n  static bool IsVirama(char32 unicode);\n  // Returns true if the given UTF-32 unicode is a vedic accent.\n  static bool IsVedicAccent(char32 unicode);\n  // Returns true if the script is one that uses subscripts for conjuncts.\n  bool IsSubscriptScript() const;\n\n  // Helper function appends the next element of codes_ only to output_,\n  // without touching parts_\n  // Returns true at the end of codes_.\n  bool CodeOnlyToOutput() {\n    output_.push_back(codes_[codes_used_].second);\n    return ++codes_used_ == codes_.size();\n  }\n\n  // Helper function adds a length-element vector to parts_ from the last length\n  // elements of output_. 
If there are more than length unused elements in\n  // output_, adds unicodes as single-element vectors to parts_ to catch\n  // output_used_ up to output->size() - length before adding the length-element\n  // vector.\n  void MultiCodePart(unsigned length) {\n    while (output_used_ + length < output_.size()) {\n      parts_.emplace_back(std::initializer_list<char32>{output_[output_used_++]});\n    }\n    parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});\n    while (++output_used_ < output_.size()) {\n      parts_.back().push_back(output_[output_used_]);\n    }\n  }\n\n  // Helper function appends the next element of codes_ to output_, and then\n  // calls MultiCodePart to add the appropriate components to parts_.\n  // Returns true at the end of codes_.\n  bool UseMultiCode(unsigned length) {\n    output_.push_back(codes_[codes_used_].second);\n    MultiCodePart(length);\n    return ++codes_used_ == codes_.size();\n  }\n\n  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to\n  // parts_ and output_. Returns true if a valid Grapheme was consumed,\n  // otherwise does not increment codes_used_.\n  virtual bool ConsumeGraphemeIfValid() = 0;\n  // Sets codes_ to the class codes for the given unicode text.\n  void ComputeClassCodes(const std::vector<char32> &text);\n  // Returns the CharClass corresponding to the given Unicode ch.\n  virtual CharClass UnicodeToCharClass(char32 ch) const = 0;\n  // Resets to the initial state.\n  void Clear();\n\n  // Number of unicodes in each Indic codepage.\n  static const int kIndicCodePageSize = 128;\n  // Lowest unicode value of any Indic script. (Devanagari).\n  static const char32 kMinIndicUnicode = 0x900;\n  // Highest unicode value of any consistent (ISCII-based) Indic script.\n  static const char32 kMaxSinhalaUnicode = 0xdff;\n  // Highest unicode value of any virama-using script. 
(Khmer).\n  static const char32 kMaxViramaScriptUnicode = 0x17ff;\n  // Some special unicodes.\n  static const char32 kSinhalaVirama = 0xdca;\n  static const char32 kMyanmarVirama = 0x1039;\n  static const char32 kKhmerVirama = 0x17d2;\n  // Javanese Script - aksarajawa\n  static const char32 kJavaneseVirama = 0xa9c0;\n  static const char32 kMaxJavaneseUnicode = 0xa9df;\n\n  // Script we are operating on.\n  ViramaScript script_;\n  // Input unicodes with assigned CharClass is the data to be validated.\n  std::vector<IndicPair> codes_;\n  // Glyph-like components of the input.\n  std::vector<std::vector<char32>> parts_;\n  // Copied validated unicodes from codes_ that are OK to output.\n  std::vector<char32> output_;\n  // The number of elements of codes_ that have been processed so far.\n  unsigned codes_used_;\n  // The number of elements of output_ that have already been added to parts_.\n  unsigned output_used_;\n  // Log error messages for reasons why text is invalid.\n  bool report_errors_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_TRAINING_VALIDATOR_H_\n"
  },
  {
    "path": "src/training/unicharset_extractor.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        unicharset_extractor.cpp\n// Description: Unicode character/ligature set extractor.\n// Author:      Thomas Kielbus\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Given a list of box files or text files on the command line, this program\n// normalizes the text according to command-line options and generates\n// a unicharset.\n\n#include <cstdlib>\n#include <filesystem>\n#include \"boxread.h\"\n#include \"commandlineflags.h\"\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"lang_model_helpers.h\"\n#include \"normstrngs.h\"\n#include \"unicharset.h\"\n#include \"unicharset_training_utils.h\"\n\nusing namespace tesseract;\n\nstatic STRING_PARAM_FLAG(output_unicharset, \"unicharset\", \"Output file path\");\nstatic INT_PARAM_FLAG(norm_mode, 1,\n                      \"Normalization mode: 1=Combine graphemes, \"\n                      \"2=Split graphemes, 3=Pure unicode\");\n\nnamespace tesseract {\n\n// Helper normalizes and segments the given strings according to norm_mode, and\n// adds the segmented parts to unicharset.\nstatic void AddStringsToUnicharset(const std::vector<std::string> &strings, int norm_mode,\n                                   UNICHARSET *unicharset) {\n  for (const auto &string : strings) {\n    
std::vector<std::string> normalized;\n    if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                     static_cast<GraphemeNormMode>(norm_mode),\n                                     /*report_errors*/ true, string.c_str(), &normalized)) {\n      for (const std::string &normed : normalized) {\n        // normed is a UTF-8 encoded string\n        if (normed.empty() || IsUTF8Whitespace(normed.c_str())) {\n          continue;\n        }\n        unicharset->unichar_insert(normed.c_str());\n      }\n    } else {\n      tprintf(\"Normalization failed for string '%s'\\n\", string.c_str());\n    }\n  }\n}\n\nstatic int Main(int argc, char **argv) {\n  UNICHARSET unicharset;\n  // Load input files\n  for (int arg = 1; arg < argc; ++arg) {\n    std::filesystem::path filePath = argv[arg];\n    std::string file_data = tesseract::ReadFile(argv[arg]);\n    if (file_data.empty()) {\n      continue;\n    }\n    std::vector<std::string> texts;\n    if (filePath.extension() == \".box\") {\n      tprintf(\"Extracting unicharset from box file %s\\n\", argv[arg]);\n      bool res = ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],\n                   /*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,\n                   /*box_texts*/ nullptr, /*pages*/ nullptr);\n      if (!res) {\n        tprintf(\"Cannot read box data from '%s'\\n\", argv[arg]);\n        return EXIT_FAILURE;\n      }\n    } else {\n      tprintf(\"Extracting unicharset from plain text file %s\\n\", argv[arg]);\n      texts.clear();\n      texts = split(file_data, '\\n');\n    }\n    AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);\n  }\n  SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false, &unicharset);\n  // Write unicharset file.\n  if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {\n    tprintf(\"Wrote unicharset file %s\\n\", FLAGS_output_unicharset.c_str());\n  } else {\n    tprintf(\"Cannot save unicharset file 
%s\\n\", FLAGS_output_unicharset.c_str());\n    return EXIT_FAILURE;\n  }\n  return EXIT_SUCCESS;\n}\n\n} // namespace tesseract\n\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n  if (argc > 1) {\n    tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);\n  }\n  if (argc < 2) {\n    tprintf(\n        \"Usage: %s [--output_unicharset filename] [--norm_mode mode]\"\n        \" box_or_text_file [...]\\n\",\n        argv[0]);\n    tprintf(\"Where mode means:\\n\");\n    tprintf(\" 1=combine graphemes (use for Latin and other simple scripts)\\n\");\n    tprintf(\" 2=split graphemes (use for Indic/Khmer/Myanmar)\\n\");\n    tprintf(\" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\\n\");\n    tprintf(\"Reads box or plain text files to extract the unicharset.\\n\");\n    return EXIT_FAILURE;\n  }\n  return tesseract::Main(argc, argv);\n}\n"
  },
  {
    "path": "src/training/wordlist2dawg.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        wordlist2dawg.cpp\n// Description: Program to generate a DAWG from a word list file\n// Author:      Thomas Kielbus\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n// Given a file that contains a list of words (one word per line) this program\n// generates the corresponding squished DAWG file.\n\n#include \"classify.h\"\n#include \"commontraining.h\" // CheckSharedLibraryVersion\n#include \"dawg.h\"\n#include \"dict.h\"\n#include \"helpers.h\"\n#include \"serialis.h\"\n#include \"trie.h\"\n#include \"unicharset.h\"\n\nusing namespace tesseract;\n\nint main(int argc, char **argv) {\n  tesseract::CheckSharedLibraryVersion();\n\n  if (argc > 1 && (!strcmp(argv[1], \"-v\") || !strcmp(argv[1], \"--version\"))) {\n    printf(\"%s\\n\", tesseract::TessBaseAPI::Version());\n    return EXIT_SUCCESS;\n  } else if (!(argc == 4 || (argc == 5 && strcmp(argv[1], \"-t\") == 0) ||\n               (argc == 6 && strcmp(argv[1], \"-r\") == 0))) {\n    printf(\n        \"Usage: %s -v | --version |\\n\"\n        \"       %s [-t | -r [reverse policy] ] word_list_file\"\n        \" dawg_file unicharset_file\\n\",\n        argv[0], argv[0]);\n    return EXIT_FAILURE;\n  }\n  tesseract::Classify classify;\n  int argv_index = 0;\n  if (argc == 5) {\n    ++argv_index;\n  
}\n  tesseract::Trie::RTLReversePolicy reverse_policy = tesseract::Trie::RRP_DO_NO_REVERSE;\n  if (argc == 6) {\n    ++argv_index;\n    int tmp_int;\n    sscanf(argv[++argv_index], \"%d\", &tmp_int);\n    reverse_policy = static_cast<tesseract::Trie::RTLReversePolicy>(tmp_int);\n    tprintf(\"Set reverse_policy to %s\\n\", tesseract::Trie::get_reverse_policy_name(reverse_policy));\n  }\n  const char *wordlist_filename = argv[++argv_index];\n  const char *dawg_filename = argv[++argv_index];\n  const char *unicharset_file = argv[++argv_index];\n  tprintf(\"Loading unicharset from '%s'\\n\", unicharset_file);\n  if (!classify.getDict().getUnicharset().load_from_file(unicharset_file)) {\n    tprintf(\"Failed to load unicharset from '%s'\\n\", unicharset_file);\n    return EXIT_FAILURE;\n  }\n  const UNICHARSET &unicharset = classify.getDict().getUnicharset();\n  if (argc == 4 || argc == 6) {\n    tesseract::Trie trie(\n        // the first 3 arguments are not used in this case\n        tesseract::DAWG_TYPE_WORD, \"\", SYSTEM_DAWG_PERM, unicharset.size(),\n        classify.getDict().dawg_debug_level);\n    tprintf(\"Reading word list from '%s'\\n\", wordlist_filename);\n    if (!trie.read_and_add_word_list(wordlist_filename, unicharset, reverse_policy)) {\n      tprintf(\"Failed to add word list from '%s'\\n\", wordlist_filename);\n      return EXIT_FAILURE;\n    }\n    tprintf(\"Reducing Trie to SquishedDawg\\n\");\n    std::unique_ptr<tesseract::SquishedDawg> dawg(trie.trie_to_dawg());\n    if (dawg && dawg->NumEdges() > 0) {\n      tprintf(\"Writing squished DAWG to '%s'\\n\", dawg_filename);\n      dawg->write_squished_dawg(dawg_filename);\n    } else {\n      tprintf(\"Dawg is empty, skip producing the output file\\n\");\n    }\n  } else if (argc == 5) {\n    tprintf(\"Loading dawg DAWG from '%s'\\n\", dawg_filename);\n    tesseract::SquishedDawg words(dawg_filename,\n                                  // these 3 arguments are not used in this case\n                 
                 tesseract::DAWG_TYPE_WORD, \"\", SYSTEM_DAWG_PERM,\n                                  classify.getDict().dawg_debug_level);\n    tprintf(\"Checking word list from '%s'\\n\", wordlist_filename);\n    words.check_for_words(wordlist_filename, unicharset, true);\n  } else { // should never get here\n    tprintf(\"Invalid command-line options\\n\");\n    return EXIT_FAILURE;\n  }\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "src/viewer/scrollview.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        scrollview.cpp\n// Description: ScrollView\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"scrollview.h\"\n\n#include \"svutil.h\" // for SVNetwork\n\n#include <allheaders.h>\n\n#include <algorithm>\n#include <climits>\n#include <cstdarg>\n#include <cstring>\n#include <map>\n#include <memory> // for std::unique_ptr\n#include <mutex> // for std::mutex\n#include <string>\n#include <thread> // for std::thread\n#include <utility>\n#include <vector>\n\nnamespace tesseract {\n\nconst int kSvPort = 8461;\nconst int kMaxIntPairSize = 45; // Holds %d,%d, for up to 64 bit.\n\nstruct SVPolyLineBuffer {\n  bool empty; // Independent indicator to allow SendMsg to call SendPolygon.\n  std::vector<int> xcoords;\n  std::vector<int> ycoords;\n};\n\n// A map between the window IDs and their corresponding pointers.\nstatic std::map<int, ScrollView *> svmap;\nstatic std::mutex *svmap_mu;\n// A map of all semaphores waiting for a specific event on a specific window.\nstatic std::map<std::pair<ScrollView *, SVEventType>,\n                std::pair<SVSemaphore *, 
std::unique_ptr<SVEvent>>> waiting_for_events;\nstatic std::mutex *waiting_for_events_mu;\n\nstd::unique_ptr<SVEvent> SVEvent::copy() const {\n  auto any = std::unique_ptr<SVEvent>(new SVEvent);\n  any->command_id = command_id;\n  any->counter = counter;\n  any->parameter = new char[strlen(parameter) + 1];\n  strcpy(any->parameter, parameter);\n  any->type = type;\n  any->x = x;\n  any->y = y;\n  any->x_size = x_size;\n  any->y_size = y_size;\n  any->window = window;\n  return any;\n}\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of weak vtables in every compilation unit.\nSVEventHandler::~SVEventHandler() = default;\n\n#ifndef GRAPHICS_DISABLED\n/// This is the main loop which handles the ScrollView-logic from the server\n/// to the client. It basically loops through messages, parses them to events\n/// and distributes it to the waiting handlers.\n/// It is run from a different thread and synchronizes via SVSync.\nvoid ScrollView::MessageReceiver() {\n  int counter_event_id = 0; // ongoing counter\n  char *message = nullptr;\n  // Wait until a new message appears in the input stream_.\n  do {\n    message = ScrollView::GetStream()->Receive();\n  } while (message == nullptr);\n\n  // This is the main loop which iterates until the server is dead (strlen =\n  // -1). 
It basically parses for 3 different messagetypes and then distributes\n  // the events accordingly.\n  while (true) {\n    // The new event we create.\n    std::unique_ptr<SVEvent> cur(new SVEvent);\n    // The ID of the corresponding window.\n    int window_id;\n\n    int ev_type;\n\n    int n;\n    // Fill the new SVEvent properly.\n    sscanf(message, \"%d,%d,%d,%d,%d,%d,%d,%n\", &window_id, &ev_type, &cur->x, &cur->y, &cur->x_size,\n           &cur->y_size, &cur->command_id, &n);\n    char *p = (message + n);\n\n    svmap_mu->lock();\n    cur->window = svmap[window_id];\n\n    if (cur->window != nullptr) {\n      auto length = strlen(p);\n      cur->parameter = new char[length + 1];\n      strcpy(cur->parameter, p);\n      if (length > 0) { // remove the last \\n\n        cur->parameter[length - 1] = '\\0';\n      }\n      cur->type = static_cast<SVEventType>(ev_type);\n      // Correct selection coordinates so x,y is the min pt and size is +ve.\n      if (cur->x_size > 0) {\n        cur->x -= cur->x_size;\n      } else {\n        cur->x_size = -cur->x_size;\n      }\n      if (cur->y_size > 0) {\n        cur->y -= cur->y_size;\n      } else {\n        cur->y_size = -cur->y_size;\n      }\n      // Returned y will be the bottom-left if y is reversed.\n      if (cur->window->y_axis_is_reversed_) {\n        cur->y = cur->window->TranslateYCoordinate(cur->y + cur->y_size);\n      }\n      cur->counter = counter_event_id;\n      // Increase by 2 since we will also create an SVET_ANY event from cur,\n      // which will have a counter_id of cur + 1 (and thus gets processed\n      // after cur).\n      counter_event_id += 2;\n\n      // In case of an SVET_EXIT event, quit the whole application.\n      if (ev_type == SVET_EXIT) {\n        SendRawMessage(\"svmain:exit()\");\n        break;\n      }\n\n      // Place two copies of it in the table for the window.\n      cur->window->SetEvent(cur.get());\n\n      // Check if any of the threads currently waiting want it.\n 
     std::pair<ScrollView *, SVEventType> awaiting_list(cur->window, cur->type);\n      std::pair<ScrollView *, SVEventType> awaiting_list_any(cur->window, SVET_ANY);\n      std::pair<ScrollView *, SVEventType> awaiting_list_any_window((ScrollView *)nullptr,\n                                                                    SVET_ANY);\n      waiting_for_events_mu->lock();\n      if (waiting_for_events.count(awaiting_list) > 0) {\n        waiting_for_events[awaiting_list].second = std::move(cur);\n        waiting_for_events[awaiting_list].first->Signal();\n      } else if (waiting_for_events.count(awaiting_list_any) > 0) {\n        waiting_for_events[awaiting_list_any].second = std::move(cur);\n        waiting_for_events[awaiting_list_any].first->Signal();\n      } else if (waiting_for_events.count(awaiting_list_any_window) > 0) {\n        waiting_for_events[awaiting_list_any_window].second = std::move(cur);\n        waiting_for_events[awaiting_list_any_window].first->Signal();\n      }\n      waiting_for_events_mu->unlock();\n      // Signal the corresponding semaphore twice (for both copies).\n      ScrollView *sv = svmap[window_id];\n      if (sv != nullptr) {\n        sv->Signal();\n        sv->Signal();\n      }\n    }\n    svmap_mu->unlock();\n\n    // Wait until a new message appears in the input stream_.\n    do {\n      message = ScrollView::GetStream()->Receive();\n    } while (message == nullptr);\n  }\n}\n\n// Table to implement the color index values in the old system.\nstatic const uint8_t table_colors[ScrollView::GREEN_YELLOW + 1][4] = {\n    {0, 0, 0, 0},         // NONE (transparent)\n    {0, 0, 0, 255},       // BLACK.\n    {255, 255, 255, 255}, // WHITE.\n    {255, 0, 0, 255},     // RED.\n    {255, 255, 0, 255},   // YELLOW.\n    {0, 255, 0, 255},     // GREEN.\n    {0, 255, 255, 255},   // CYAN.\n    {0, 0, 255, 255},     // BLUE.\n    {255, 0, 255, 255},   // MAGENTA.\n    {0, 128, 255, 255},   // AQUAMARINE.\n    {0, 0, 64, 255},      // 
DARK_SLATE_BLUE.\n    {128, 128, 255, 255}, // LIGHT_BLUE.\n    {64, 64, 255, 255},   // MEDIUM_BLUE.\n    {0, 0, 32, 255},      // MIDNIGHT_BLUE.\n    {0, 0, 128, 255},     // NAVY_BLUE.\n    {192, 192, 255, 255}, // SKY_BLUE.\n    {64, 64, 128, 255},   // SLATE_BLUE.\n    {32, 32, 64, 255},    // STEEL_BLUE.\n    {255, 128, 128, 255}, // CORAL.\n    {128, 64, 0, 255},    // BROWN.\n    {128, 128, 0, 255},   // SANDY_BROWN.\n    {192, 192, 0, 255},   // GOLD.\n    {192, 192, 128, 255}, // GOLDENROD.\n    {0, 64, 0, 255},      // DARK_GREEN.\n    {32, 64, 0, 255},     // DARK_OLIVE_GREEN.\n    {64, 128, 0, 255},    // FOREST_GREEN.\n    {128, 255, 0, 255},   // LIME_GREEN.\n    {192, 255, 192, 255}, // PALE_GREEN.\n    {192, 255, 0, 255},   // YELLOW_GREEN.\n    {192, 192, 192, 255}, // LIGHT_GREY.\n    {64, 64, 128, 255},   // DARK_SLATE_GREY.\n    {64, 64, 64, 255},    // DIM_GREY.\n    {128, 128, 128, 255}, // GREY.\n    {64, 192, 0, 255},    // KHAKI.\n    {255, 0, 192, 255},   // MAROON.\n    {255, 128, 0, 255},   // ORANGE.\n    {255, 128, 64, 255},  // ORCHID.\n    {255, 192, 192, 255}, // PINK.\n    {128, 0, 128, 255},   // PLUM.\n    {255, 0, 64, 255},    // INDIAN_RED.\n    {255, 64, 0, 255},    // ORANGE_RED.\n    {255, 0, 192, 255},   // VIOLET_RED.\n    {255, 192, 128, 255}, // SALMON.\n    {128, 128, 0, 255},   // TAN.\n    {0, 255, 255, 255},   // TURQUOISE.\n    {0, 128, 128, 255},   // DARK_TURQUOISE.\n    {192, 0, 255, 255},   // VIOLET.\n    {128, 128, 0, 255},   // WHEAT.\n    {128, 255, 0, 255}    // GREEN_YELLOW\n};\n\n/*******************************************************************************\n * Scrollview implementation.\n *******************************************************************************/\n\nSVNetwork *ScrollView::stream_ = nullptr;\nint ScrollView::nr_created_windows_ = 0;\nint ScrollView::image_index_ = 0;\n\n/// Calls Initialize with all arguments given.\nScrollView::ScrollView(const char *name, int x_pos, int y_pos, 
int x_size, int y_size,\n                       int x_canvas_size, int y_canvas_size, bool y_axis_reversed,\n                       const char *server_name) {\n  Initialize(name, x_pos, y_pos, x_size, y_size, x_canvas_size, y_canvas_size, y_axis_reversed,\n             server_name);\n}\n\n/// Calls Initialize with default argument for server_name_.\nScrollView::ScrollView(const char *name, int x_pos, int y_pos, int x_size, int y_size,\n                       int x_canvas_size, int y_canvas_size, bool y_axis_reversed) {\n  Initialize(name, x_pos, y_pos, x_size, y_size, x_canvas_size, y_canvas_size, y_axis_reversed,\n             \"localhost\");\n}\n\n/// Calls Initialize with default argument for server_name_ & y_axis_reversed.\nScrollView::ScrollView(const char *name, int x_pos, int y_pos, int x_size, int y_size,\n                       int x_canvas_size, int y_canvas_size) {\n  Initialize(name, x_pos, y_pos, x_size, y_size, x_canvas_size, y_canvas_size, false, \"localhost\");\n}\n\n/// Sets up a ScrollView window, depending on the constructor variables.\nvoid ScrollView::Initialize(const char *name, int x_pos, int y_pos, int x_size, int y_size,\n                            int x_canvas_size, int y_canvas_size, bool y_axis_reversed,\n                            const char *server_name) {\n  // If this is the first ScrollView Window which gets created, there is no\n  // network connection yet and we have to set it up in a different thread.\n  if (stream_ == nullptr) {\n    nr_created_windows_ = 0;\n    stream_ = new SVNetwork(server_name, kSvPort);\n    waiting_for_events_mu = new std::mutex();\n    svmap_mu = new std::mutex();\n    SendRawMessage(\"svmain = luajava.bindClass('com.google.scrollview.ScrollView')\\n\");\n    std::thread t(&ScrollView::MessageReceiver);\n    t.detach();\n  }\n\n  // Set up the variables on the clientside.\n  nr_created_windows_++;\n  event_handler_ = nullptr;\n  event_handler_ended_ = false;\n  y_axis_is_reversed_ = y_axis_reversed;\n  
y_size_ = y_canvas_size;\n  window_name_ = name;\n  window_id_ = nr_created_windows_;\n  // Set up polygon buffering.\n  points_ = new SVPolyLineBuffer;\n  points_->empty = true;\n\n  svmap_mu->lock();\n  svmap[window_id_] = this;\n  svmap_mu->unlock();\n\n  for (auto &i : event_table_) {\n    i = nullptr;\n  }\n\n  semaphore_ = new SVSemaphore();\n\n  // Set up an actual Window on the client side.\n  char message[kMaxMsgSize];\n  snprintf(message, sizeof(message),\n           \"w%d = luajava.newInstance('com.google.scrollview.ui\"\n           \".SVWindow','%s',%u,%u,%u,%u,%u,%u,%u)\\n\",\n           window_id_, window_name_, window_id_, x_pos, y_pos, x_size, y_size, x_canvas_size,\n           y_canvas_size);\n  SendRawMessage(message);\n\n  std::thread t(&ScrollView::StartEventHandler, this);\n  t.detach();\n}\n\n/// Sits and waits for events on this window.\nvoid ScrollView::StartEventHandler() {\n  for (;;) {\n    stream_->Flush();\n    semaphore_->Wait();\n    int serial = -1;\n    int k = -1;\n    mutex_.lock();\n    // Check every table entry if it is valid and not already processed.\n\n    for (int i = 0; i < SVET_COUNT; i++) {\n      if (event_table_[i] != nullptr && (serial < 0 || event_table_[i]->counter < serial)) {\n        serial = event_table_[i]->counter;\n        k = i;\n      }\n    }\n    // If we didn't find anything we had an old alarm and just sleep again.\n    if (k != -1) {\n      auto new_event = std::move(event_table_[k]);\n      mutex_.unlock();\n      if (event_handler_ != nullptr) {\n        event_handler_->Notify(new_event.get());\n      }\n      if (new_event->type == SVET_DESTROY) {\n        // Signal the destructor that it is safe to terminate.\n        event_handler_ended_ = true;\n        return;\n      }\n    } else {\n      mutex_.unlock();\n    }\n    // The thread should run as long as its associated window is alive.\n  }\n}\n#endif // !GRAPHICS_DISABLED\n\nScrollView::~ScrollView() {\n#ifndef GRAPHICS_DISABLED\n  
svmap_mu->lock();\n  if (svmap[window_id_] != nullptr) {\n    svmap_mu->unlock();\n    // So the event handling thread can quit.\n    SendMsg(\"destroy()\");\n\n    AwaitEvent(SVET_DESTROY);\n    svmap_mu->lock();\n    svmap[window_id_] = nullptr;\n    svmap_mu->unlock();\n    // The event handler thread for this window *must* receive the\n    // destroy event and set its pointer to this to nullptr before we allow\n    // the destructor to exit.\n    while (!event_handler_ended_) {\n      Update();\n    }\n  } else {\n    svmap_mu->unlock();\n  }\n  delete semaphore_;\n  delete points_;\n#endif // !GRAPHICS_DISABLED\n}\n\n#ifndef GRAPHICS_DISABLED\n/// Send a message to the server, attaching the window id.\nvoid ScrollView::SendMsg(const char *format, ...) {\n  if (!points_->empty) {\n    SendPolygon();\n  }\n  va_list args;\n  char message[kMaxMsgSize - 4];\n\n  va_start(args, format); // variable list\n  vsnprintf(message, sizeof(message), format, args);\n  va_end(args);\n\n  char form[kMaxMsgSize];\n  snprintf(form, sizeof(form), \"w%d:%s\\n\", window_id_, message);\n\n  stream_->Send(form);\n}\n\n/// Send a message to the server without a\n/// window id. 
Used for global events like exit().\nvoid ScrollView::SendRawMessage(const char *msg) {\n  stream_->Send(msg);\n}\n\n/// Add an Event Listener to this ScrollView Window\nvoid ScrollView::AddEventHandler(SVEventHandler *listener) {\n  event_handler_ = listener;\n}\n\nvoid ScrollView::Signal() {\n  semaphore_->Signal();\n}\n\nvoid ScrollView::SetEvent(const SVEvent *svevent) {\n  // Copy event\n  auto any = svevent->copy();\n  auto specific = svevent->copy();\n  any->counter = specific->counter + 1;\n\n  // Place both events into the queue.\n  std::lock_guard<std::mutex> guard(mutex_);\n\n  event_table_[specific->type] = std::move(specific);\n  event_table_[SVET_ANY] = std::move(any);\n}\n\n/// Block until an event of the given type is received.\n/// Note: The calling function is responsible for deleting the returned\n/// SVEvent afterwards!\nstd::unique_ptr<SVEvent> ScrollView::AwaitEvent(SVEventType type) {\n  // Initialize the waiting semaphore.\n  auto *sem = new SVSemaphore();\n  std::pair<ScrollView *, SVEventType> ea(this, type);\n  waiting_for_events_mu->lock();\n  waiting_for_events[ea] = {sem, nullptr};\n  waiting_for_events_mu->unlock();\n  // Wait on it, but first flush.\n  stream_->Flush();\n  sem->Wait();\n  // Process the event we got woken up for (its in waiting_for_events pair).\n  waiting_for_events_mu->lock();\n  auto ret = std::move(waiting_for_events[ea].second);\n  waiting_for_events.erase(ea);\n  delete sem;\n  waiting_for_events_mu->unlock();\n  return ret;\n}\n\n// Send the current buffered polygon (if any) and clear it.\nvoid ScrollView::SendPolygon() {\n  if (!points_->empty) {\n    points_->empty = true; // Allows us to use SendMsg.\n    int length = points_->xcoords.size();\n    // length == 1 corresponds to 2 SetCursors in a row and only the\n    // last setCursor has any effect.\n    if (length == 2) {\n      // An isolated line!\n      SendMsg(\"drawLine(%d,%d,%d,%d)\", points_->xcoords[0], points_->ycoords[0],\n              
points_->xcoords[1], points_->ycoords[1]);\n    } else if (length > 2) {\n      // A polyline.\n      SendMsg(\"createPolyline(%d)\", length);\n      char coordpair[kMaxIntPairSize];\n      std::string decimal_coords;\n      for (int i = 0; i < length; ++i) {\n        snprintf(coordpair, kMaxIntPairSize, \"%d,%d,\", points_->xcoords[i], points_->ycoords[i]);\n        decimal_coords += coordpair;\n      }\n      decimal_coords += '\\n';\n      SendRawMessage(decimal_coords.c_str());\n      SendMsg(\"drawPolyline()\");\n    }\n    points_->xcoords.clear();\n    points_->ycoords.clear();\n  }\n}\n\n/*******************************************************************************\n * LUA \"API\" functions.\n *******************************************************************************/\n\n// Sets the position from which to draw to (x,y).\nvoid ScrollView::SetCursor(int x, int y) {\n  SendPolygon();\n  DrawTo(x, y);\n}\n\n// Draws from the current position to (x,y) and sets the new position to it.\nvoid ScrollView::DrawTo(int x, int y) {\n  points_->xcoords.push_back(x);\n  points_->ycoords.push_back(TranslateYCoordinate(y));\n  points_->empty = false;\n}\n\n// Draw a line using the current pen color.\nvoid ScrollView::Line(int x1, int y1, int x2, int y2) {\n  if (!points_->xcoords.empty() && x1 == points_->xcoords.back() &&\n      TranslateYCoordinate(y1) == points_->ycoords.back()) {\n    // We are already at x1, y1, so just draw to x2, y2.\n    DrawTo(x2, y2);\n  } else if (!points_->xcoords.empty() && x2 == points_->xcoords.back() &&\n             TranslateYCoordinate(y2) == points_->ycoords.back()) {\n    // We are already at x2, y2, so just draw to x1, y1.\n    DrawTo(x1, y1);\n  } else {\n    // This is a new line.\n    SetCursor(x1, y1);\n    DrawTo(x2, y2);\n  }\n}\n\n// Set the visibility of the window.\nvoid ScrollView::SetVisible(bool visible) {\n  if (visible) {\n    SendMsg(\"setVisible(true)\");\n  } else {\n    SendMsg(\"setVisible(false)\");\n  
}\n}\n\n// Set the alwaysOnTop flag.\nvoid ScrollView::AlwaysOnTop(bool b) {\n  if (b) {\n    SendMsg(\"setAlwaysOnTop(true)\");\n  } else {\n    SendMsg(\"setAlwaysOnTop(false)\");\n  }\n}\n\n// Adds a message entry to the message box.\nvoid ScrollView::AddMessage(const char *message) {\n  char form[kMaxMsgSize];\n  snprintf(form, sizeof(form), \"w%d:%s\", window_id_, message);\n\n  char *esc = AddEscapeChars(form);\n  SendMsg(\"addMessage(\\\"%s\\\")\", esc);\n  delete[] esc;\n}\n\nvoid ScrollView::AddMessageF(const char *format, ...) {\n  va_list args;\n  char message[kMaxMsgSize - 4];\n\n  va_start(args, format); // variable list\n  vsnprintf(message, sizeof(message), format, args);\n  va_end(args);\n\n  AddMessage(message);\n}\n\n// Set a messagebox.\nvoid ScrollView::AddMessageBox() {\n  SendMsg(\"addMessageBox()\");\n}\n\n// Exit the client completely (and notify the server of it).\nvoid ScrollView::Exit() {\n  SendRawMessage(\"svmain:exit()\");\n  exit(0);\n}\n\n// Clear the canvas.\nvoid ScrollView::Clear() {\n  SendMsg(\"clear()\");\n}\n\n// Set the stroke width.\nvoid ScrollView::Stroke(float width) {\n  SendMsg(\"setStrokeWidth(%f)\", width);\n}\n\n// Draw a rectangle using the current pen color.\n// The rectangle is filled with the current brush color.\nvoid ScrollView::Rectangle(int x1, int y1, int x2, int y2) {\n  if (x1 == x2 && y1 == y2) {\n    return; // Scrollviewer locks up.\n  }\n  SendMsg(\"drawRectangle(%d,%d,%d,%d)\", x1, TranslateYCoordinate(y1), x2, TranslateYCoordinate(y2));\n}\n\n// Draw an ellipse using the current pen color.\n// The ellipse is filled with the current brush color.\nvoid ScrollView::Ellipse(int x1, int y1, int width, int height) {\n  SendMsg(\"drawEllipse(%d,%d,%u,%u)\", x1, TranslateYCoordinate(y1), width, height);\n}\n\n// Set the pen color to the given RGB values.\nvoid ScrollView::Pen(int red, int green, int blue) {\n  SendMsg(\"pen(%d,%d,%d)\", red, green, blue);\n}\n\n// Set the pen color to the given RGB 
values.\nvoid ScrollView::Pen(int red, int green, int blue, int alpha) {\n  SendMsg(\"pen(%d,%d,%d,%d)\", red, green, blue, alpha);\n}\n\n// Set the brush color to the given RGB values.\nvoid ScrollView::Brush(int red, int green, int blue) {\n  SendMsg(\"brush(%d,%d,%d)\", red, green, blue);\n}\n\n// Set the brush color to the given RGB values.\nvoid ScrollView::Brush(int red, int green, int blue, int alpha) {\n  SendMsg(\"brush(%d,%d,%d,%d)\", red, green, blue, alpha);\n}\n\n// Set the attributes for future Text(..) calls.\nvoid ScrollView::TextAttributes(const char *font, int pixel_size, bool bold, bool italic,\n                                bool underlined) {\n  const char *b;\n  const char *i;\n  const char *u;\n\n  if (bold) {\n    b = \"true\";\n  } else {\n    b = \"false\";\n  }\n  if (italic) {\n    i = \"true\";\n  } else {\n    i = \"false\";\n  }\n  if (underlined) {\n    u = \"true\";\n  } else {\n    u = \"false\";\n  }\n  SendMsg(\"textAttributes('%s',%u,%s,%s,%s)\", font, pixel_size, b, i, u);\n}\n\n// Draw text at the given coordinates.\nvoid ScrollView::Text(int x, int y, const char *mystring) {\n  SendMsg(\"drawText(%d,%d,'%s')\", x, TranslateYCoordinate(y), mystring);\n}\n\n// Open and draw an image given a name at (x,y).\nvoid ScrollView::Draw(const char *image, int x_pos, int y_pos) {\n  SendMsg(\"openImage('%s')\", image);\n  SendMsg(\"drawImage('%s',%d,%d)\", image, x_pos, TranslateYCoordinate(y_pos));\n}\n\n// Add new checkboxmenuentry to menubar.\nvoid ScrollView::MenuItem(const char *parent, const char *name, int cmdEvent, bool flag) {\n  if (parent == nullptr) {\n    parent = \"\";\n  }\n  if (flag) {\n    SendMsg(\"addMenuBarItem('%s','%s',%d,true)\", parent, name, cmdEvent);\n  } else {\n    SendMsg(\"addMenuBarItem('%s','%s',%d,false)\", parent, name, cmdEvent);\n  }\n}\n\n// Add new menuentry to menubar.\nvoid ScrollView::MenuItem(const char *parent, const char *name, int cmdEvent) {\n  if (parent == nullptr) {\n    parent = 
\"\";\n  }\n  SendMsg(\"addMenuBarItem('%s','%s',%d)\", parent, name, cmdEvent);\n}\n\n// Add new submenu to menubar.\nvoid ScrollView::MenuItem(const char *parent, const char *name) {\n  if (parent == nullptr) {\n    parent = \"\";\n  }\n  SendMsg(\"addMenuBarItem('%s','%s')\", parent, name);\n}\n\n// Add new submenu to popupmenu.\nvoid ScrollView::PopupItem(const char *parent, const char *name) {\n  if (parent == nullptr) {\n    parent = \"\";\n  }\n  SendMsg(\"addPopupMenuItem('%s','%s')\", parent, name);\n}\n\n// Add new submenuentry to popupmenu.\nvoid ScrollView::PopupItem(const char *parent, const char *name, int cmdEvent, const char *value,\n                           const char *desc) {\n  if (parent == nullptr) {\n    parent = \"\";\n  }\n  char *esc = AddEscapeChars(value);\n  char *esc2 = AddEscapeChars(desc);\n  SendMsg(\"addPopupMenuItem('%s','%s',%d,'%s','%s')\", parent, name, cmdEvent, esc, esc2);\n  delete[] esc;\n  delete[] esc2;\n}\n\n// Send an update message for a single window.\nvoid ScrollView::UpdateWindow() {\n  SendMsg(\"update()\");\n}\n\n// Note: this is an update to all windows\nvoid ScrollView::Update() {\n  std::lock_guard<std::mutex> guard(*svmap_mu);\n  for (auto &iter : svmap) {\n    if (iter.second != nullptr) {\n      iter.second->UpdateWindow();\n    }\n  }\n}\n\n// Set the pen color, using an enum value (e.g. ScrollView::ORANGE)\nvoid ScrollView::Pen(Color color) {\n  Pen(table_colors[color][0], table_colors[color][1], table_colors[color][2],\n      table_colors[color][3]);\n}\n\n// Set the brush color, using an enum value (e.g. 
ScrollView::ORANGE)\nvoid ScrollView::Brush(Color color) {\n  Brush(table_colors[color][0], table_colors[color][1], table_colors[color][2],\n        table_colors[color][3]);\n}\n\n// Shows a modal Input Dialog which can return any kind of String\nchar *ScrollView::ShowInputDialog(const char *msg) {\n  SendMsg(\"showInputDialog(\\\"%s\\\")\", msg);\n  // wait till an input event (all others are thrown away)\n  auto ev = AwaitEvent(SVET_INPUT);\n  char *p = new char[strlen(ev->parameter) + 1];\n  strcpy(p, ev->parameter);\n  return p;\n}\n\n// Shows a modal Yes/No Dialog which will return 'y' or 'n'\nint ScrollView::ShowYesNoDialog(const char *msg) {\n  SendMsg(\"showYesNoDialog(\\\"%s\\\")\", msg);\n  // Wait till an input event (all others are thrown away)\n  auto ev = AwaitEvent(SVET_INPUT);\n  int a = ev->parameter[0];\n  return a;\n}\n\n// Zoom the window to the rectangle given upper left corner and\n// lower right corner.\nvoid ScrollView::ZoomToRectangle(int x1, int y1, int x2, int y2) {\n  y1 = TranslateYCoordinate(y1);\n  y2 = TranslateYCoordinate(y2);\n  SendMsg(\"zoomRectangle(%d,%d,%d,%d)\", std::min(x1, x2), std::min(y1, y2), std::max(x1, x2),\n          std::max(y1, y2));\n}\n\n// Send an image of type Pix.\nvoid ScrollView::Draw(Image image, int x_pos, int y_pos) {\n  l_uint8 *data;\n  size_t size;\n  pixWriteMem(&data, &size, image, IFF_PNG);\n  int base64_len = (size + 2) / 3 * 4;\n  y_pos = TranslateYCoordinate(y_pos);\n  SendMsg(\"readImage(%d,%d,%d)\", x_pos, y_pos, base64_len);\n  // Base64 encode the data.\n  const char kBase64Table[64] = {\n      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',\n      'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',\n      'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',\n      'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/',\n  };\n  char *base64 = new char[base64_len + 1];\n  
memset(base64, '=', base64_len);\n  base64[base64_len] = '\\0';\n  int remainder = 0;\n  int bits_left = 0;\n  int code_len = 0;\n  for (size_t i = 0; i < size; ++i) {\n    int code = (data[i] >> (bits_left + 2)) | remainder;\n    base64[code_len++] = kBase64Table[code & 63];\n    bits_left += 2;\n    remainder = data[i] << (6 - bits_left);\n    if (bits_left == 6) {\n      base64[code_len++] = kBase64Table[remainder & 63];\n      bits_left = 0;\n      remainder = 0;\n    }\n  }\n  if (bits_left > 0) {\n    base64[code_len++] = kBase64Table[remainder & 63];\n  }\n  SendRawMessage(base64);\n  delete[] base64;\n  lept_free(data);\n}\n\n// Escapes the ' character with a \\, so it can be processed by LUA.\n// Note: The caller will have to make sure it deletes the newly allocated item.\nchar *ScrollView::AddEscapeChars(const char *input) {\n  const char *nextptr = strchr(input, '\\'');\n  const char *lastptr = input;\n  char *message = new char[kMaxMsgSize];\n  int pos = 0;\n  while (nextptr != nullptr) {\n    strncpy(message + pos, lastptr, nextptr - lastptr);\n    pos += nextptr - lastptr;\n    message[pos] = '\\\\';\n    pos += 1;\n    lastptr = nextptr;\n    nextptr = strchr(nextptr + 1, '\\'');\n  }\n  strcpy(message + pos, lastptr);\n  return message;\n}\n\n// Inverse the Y axis if the coordinates are actually inversed.\nint ScrollView::TranslateYCoordinate(int y) {\n  if (!y_axis_is_reversed_) {\n    return y;\n  } else {\n    return y_size_ - y;\n  }\n}\n\nchar ScrollView::Wait() {\n  // Wait till an input or click event (all others are thrown away)\n  char ret = '\\0';\n  SVEventType ev_type = SVET_ANY;\n  do {\n    std::unique_ptr<SVEvent> ev(AwaitEvent(SVET_ANY));\n    ev_type = ev->type;\n    if (ev_type == SVET_INPUT) {\n      ret = ev->parameter[0];\n    }\n  } while (ev_type != SVET_INPUT && ev_type != SVET_CLICK);\n  return ret;\n}\n\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/viewer/scrollview.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        scrollview.h\n// Description: ScrollView\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n// ScrollView is designed as an UI which can be run remotely. This is the\n// client code for it, the server part is written in java. The client consists\n// mainly of 2 parts:\n// The \"core\" ScrollView which sets up the remote connection,\n// takes care of event handling etc.\n// The other part of ScrollView consists of predefined API calls through LUA,\n// which can basically be used to get a zoomable canvas in which it is possible\n// to draw lines, text etc.\n// Technically, thanks to LUA, its even possible to bypass the here defined LUA\n// API calls at all and generate a java user interface from scratch (or\n// basically generate any kind of java program, possibly even dangerous ones).\n\n#ifndef TESSERACT_VIEWER_SCROLLVIEW_H_\n#define TESSERACT_VIEWER_SCROLLVIEW_H_\n\n#include \"image.h\"\n\n#include <tesseract/export.h>\n\n#include <cstdio>\n#include <memory>\n#include <mutex>\n\nnamespace tesseract {\n\n#if !defined(__GNUC__) && !defined(__attribute__)\n# define __attribute__(attr) // compiler without support for __attribute__\n#endif\n\nclass ScrollView;\nclass SVNetwork;\nclass SVSemaphore;\nstruct 
SVPolyLineBuffer;\n\nenum SVEventType {\n  SVET_DESTROY,   // Window has been destroyed by user.\n  SVET_EXIT,      // User has destroyed the last window by clicking on the 'X'.\n  SVET_CLICK,     // Left button pressed.\n  SVET_SELECTION, // Left button selection.\n  SVET_INPUT,     // There is some input (single key or a whole string).\n  SVET_MOUSE,     // The mouse has moved with a button pressed.\n  SVET_MOTION,    // The mouse has moved with no button pressed.\n  SVET_HOVER,     // The mouse has stayed still for a second.\n  SVET_POPUP,     // A command selected through a popup menu.\n  SVET_MENU,      // A command selected through the menubar.\n  SVET_ANY,       // Any of the above.\n\n  SVET_COUNT // Array sizing.\n};\n\nstruct SVEvent {\n  ~SVEvent() {\n    delete[] parameter;\n  }\n  std::unique_ptr<SVEvent> copy() const;\n  SVEventType type = SVET_DESTROY; // What kind of event.\n  ScrollView *window = nullptr;    // Window event relates to.\n  char *parameter = nullptr;       // Any string that might have been passed as argument.\n  int x = 0;                       // Coords of click or selection.\n  int y = 0;\n  int x_size = 0; // Size of selection.\n  int y_size = 0;\n  int command_id = 0; // The ID of the possibly associated event (e.g. MENU)\n  int counter = 0;    // Used to detect which kind of event to process next.\n\n  SVEvent() = default;\n  SVEvent(const SVEvent &);\n  SVEvent &operator=(const SVEvent &);\n};\n\n// The SVEventHandler class is used for Event handling: If you register your\n// class as SVEventHandler to a ScrollView Window, the SVEventHandler will be\n// called whenever an appropriate event occurs.\nclass TESS_API SVEventHandler {\npublic:\n  virtual ~SVEventHandler();\n\n  // Gets called by the SV Window. 
Does nothing on default, overwrite this\n  // to implement the desired behaviour\n  virtual void Notify(const SVEvent *sve) {\n    (void)sve;\n  }\n};\n\n// The ScrollView class provides the external API to the scrollviewer process.\n// The scrollviewer process manages windows and displays images, graphics and\n// text while allowing the user to zoom and scroll the windows arbitrarily.\n// Each ScrollView class instance represents one window, and stuff is drawn in\n// the window through method calls on the class. The constructor is used to\n// create the class instance (and the window).\nclass TESS_API ScrollView {\npublic:\n  // Color enum for pens and brushes.\n  enum Color {\n    NONE,\n    BLACK,\n    WHITE,\n    RED,\n    YELLOW,\n    GREEN,\n    CYAN,\n    BLUE,\n    MAGENTA,\n    AQUAMARINE,\n    DARK_SLATE_BLUE,\n    LIGHT_BLUE,\n    MEDIUM_BLUE,\n    MIDNIGHT_BLUE,\n    NAVY_BLUE,\n    SKY_BLUE,\n    SLATE_BLUE,\n    STEEL_BLUE,\n    CORAL,\n    BROWN,\n    SANDY_BROWN,\n    GOLD,\n    GOLDENROD,\n    DARK_GREEN,\n    DARK_OLIVE_GREEN,\n    FOREST_GREEN,\n    LIME_GREEN,\n    PALE_GREEN,\n    YELLOW_GREEN,\n    LIGHT_GREY,\n    DARK_SLATE_GREY,\n    DIM_GREY,\n    GREY,\n    KHAKI,\n    MAROON,\n    ORANGE,\n    ORCHID,\n    PINK,\n    PLUM,\n    INDIAN_RED,\n    ORANGE_RED,\n    VIOLET_RED,\n    SALMON,\n    TAN,\n    TURQUOISE,\n    DARK_TURQUOISE,\n    VIOLET,\n    WHEAT,\n    GREEN_YELLOW // Make sure this one is last.\n  };\n\n  ~ScrollView();\n\n#ifndef GRAPHICS_DISABLED\n\n  // Create a window. 
The pixel size of the window may be 0,0, in which case\n  // a default size is selected based on the size of your canvas.\n  // The canvas may not be 0,0 in size!\n  ScrollView(const char *name, int x_pos, int y_pos, int x_size, int y_size, int x_canvas_size,\n             int y_canvas_size);\n  // With a flag whether the x axis is reversed.\n  ScrollView(const char *name, int x_pos, int y_pos, int x_size, int y_size, int x_canvas_size,\n             int y_canvas_size, bool y_axis_reversed);\n  // Connect to a server other than localhost.\n  ScrollView(const char *name, int x_pos, int y_pos, int x_size, int y_size, int x_canvas_size,\n             int y_canvas_size, bool y_axis_reversed, const char *server_name);\n  /*******************************************************************************\n   * Event handling\n   * To register as listener, the class has to derive from the SVEventHandler\n   * class, which consists of a notifyMe(SVEvent*) function that should be\n   * overwritten to process the event the way you want.\n   *******************************************************************************/\n\n  // Add an Event Listener to this ScrollView Window.\n  void AddEventHandler(SVEventHandler *listener);\n\n  // Block until an event of the given type is received.\n  std::unique_ptr<SVEvent> AwaitEvent(SVEventType type);\n\n  /*******************************************************************************\n   * Getters and Setters\n   *******************************************************************************/\n\n  // Returns the title of the window.\n  const char *GetName() {\n    return window_name_;\n  }\n\n  // Returns the unique ID of the window.\n  int GetId() {\n    return window_id_;\n  }\n\n  /*******************************************************************************\n   * API functions for LUA calls\n   * the implementations for these can be found in svapi.cc\n   * (keep in mind that the window is actually created through the ScrollView\n 
  * constructor, so this is not listed here)\n   *******************************************************************************/\n\n  // Draw an image on (x,y).\n  void Draw(Image image, int x_pos, int y_pos);\n\n  // Flush buffers and update display.\n  static void Update();\n\n  // Exit the program.\n  static void Exit();\n\n  // Update the contents of a specific window.\n  void UpdateWindow();\n\n  // Erase all content from the window, but do not destroy it.\n  void Clear();\n\n  // Set pen color with an enum.\n  void Pen(Color color);\n\n  // Set pen color to RGB (0-255).\n  void Pen(int red, int green, int blue);\n\n  // Set pen color to RGBA (0-255).\n  void Pen(int red, int green, int blue, int alpha);\n\n  // Set brush color with an enum.\n  void Brush(Color color);\n\n  // Set brush color to RGB (0-255).\n  void Brush(int red, int green, int blue);\n\n  // Set brush color to RGBA (0-255).\n  void Brush(int red, int green, int blue, int alpha);\n\n  // Set attributes for future text, like font name (e.g.\n  // \"Times New Roman\"), font size etc..\n  // Note: The underlined flag is currently not supported\n  void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined);\n\n  // Draw line from (x1,y1) to (x2,y2) with the current pencolor.\n  void Line(int x1, int y1, int x2, int y2);\n\n  // Set the stroke width of the pen.\n  void Stroke(float width);\n\n  // Draw a rectangle given upper left corner and lower right corner.\n  // The current pencolor is used as outline, the brushcolor to fill the shape.\n  void Rectangle(int x1, int y1, int x2, int y2);\n\n  // Draw an ellipse centered on (x,y).\n  // The current pencolor is used as outline, the brushcolor to fill the shape.\n  void Ellipse(int x, int y, int width, int height);\n\n  // Draw text with the current pencolor\n  void Text(int x, int y, const char *mystring);\n\n  // Draw an image from a local filename. This should be faster than\n  // createImage. 
WARNING: This only works on a local machine. This also only\n  // works image types supported by java (like bmp,jpeg,gif,png) since the image\n  // is opened by the server.\n  void Draw(const char *image, int x_pos, int y_pos);\n\n  // Set the current position to draw from (x,y). In conjunction with...\n  void SetCursor(int x, int y);\n\n  // ...this function, which draws a line from the current to (x,y) and then\n  // sets the new position to the new (x,y), this can be used to easily draw\n  // polygons using vertices\n  void DrawTo(int x, int y);\n\n  // Set the SVWindow visible/invisible.\n  void SetVisible(bool visible);\n\n  // Set the SVWindow always on top or not always on top.\n  void AlwaysOnTop(bool b);\n\n  // Shows a modal dialog with \"msg\" as question and returns 'y' or 'n'.\n  int ShowYesNoDialog(const char *msg);\n\n  // Shows a modal dialog with \"msg\" as question and returns a char* string.\n  // Constraint: As return, only words (e.g. no whitespaces etc.) are allowed.\n  char *ShowInputDialog(const char *msg);\n\n  // Adds a messagebox to the SVWindow. This way, it can show the messages...\n  void AddMessageBox();\n\n  // ...which can be added by this command.\n  // This is intended as an \"debug\" output window.\n  void AddMessage(const char *message);\n  void AddMessageF(const char *format, ...) __attribute__((format(printf, 2, 3)));\n\n  // Zoom the window to the rectangle given upper left corner and\n  // lower right corner.\n  void ZoomToRectangle(int x1, int y1, int x2, int y2);\n\n  // Custom messages (manipulating java code directly) can be send through this.\n  // Send a message to the server and attach the Id of the corresponding window.\n  // Note: This should only be called if you are know what you are doing, since\n  // you are fiddling with the Java objects on the server directly. 
Calling\n  // this just for fun will likely break your application!\n  // It is public so you can actually take use of the LUA functionalities, but\n  // be careful!\n  void SendMsg(const char* msg, ...) __attribute__((format(printf, 2, 3)));\n\n  // Custom messages (manipulating java code directly) can be send through this.\n  // Send a message to the server without adding the\n  // window id. Used for global events like Exit().\n  // Note: This should only be called if you are know what you are doing, since\n  // you are fiddling with the Java objects on the server directly. Calling\n  // this just for fun will likely break your application!\n  // It is public so you can actually take use of the LUA functionalities, but\n  // be careful!\n  static void SendRawMessage(const char *msg);\n\n  /*******************************************************************************\n   * Add new menu entries to parent. If parent is \"\", the entry gets added to\n   *the main menubar (toplevel).\n   *******************************************************************************/\n  // This adds a new submenu to the menubar.\n  void MenuItem(const char *parent, const char *name);\n\n  // This adds a new (normal) menu entry with an associated eventID, which\n  // should be unique among menubar eventIDs.\n  void MenuItem(const char *parent, const char *name, int cmdEvent);\n\n  // This adds a new checkbox entry, which might initially be flagged.\n  void MenuItem(const char *parent, const char *name, int cmdEvent, bool flagged);\n\n  // This adds a new popup submenu to the popup menu. 
If parent is \"\", the entry\n  // gets added at \"toplevel\" popupmenu.\n  void PopupItem(const char *parent, const char *name);\n\n  // This adds a new popup entry with the associated eventID, which should be\n  // unique among popup eventIDs.\n  // If value and desc are given, on a click the server will ask you to modify\n  // the value and return the new value.\n  void PopupItem(const char *parent, const char *name, int cmdEvent, const char *value,\n                 const char *desc);\n\n  // Returns the correct Y coordinate for a window, depending on whether it\n  // might have to be flipped (by ySize).\n  int TranslateYCoordinate(int y);\n\n  char Wait();\n\nprivate:\n  // Transfers a binary Image.\n  void TransferBinaryImage(Image image);\n  // Transfers a gray scale Image.\n  void TransferGrayImage(Image image);\n  // Transfers a 32-Bit Image.\n  void Transfer32bppImage(Image image);\n\n  // Sets up ScrollView, depending on the variables from the constructor.\n  void Initialize(const char *name, int x_pos, int y_pos, int x_size, int y_size, int x_canvas_size,\n                  int y_canvas_size, bool y_axis_reversed, const char *server_name);\n\n  // Send the current buffered polygon (if any) and clear it.\n  void SendPolygon();\n\n  // Start the message receiving thread.\n  static void MessageReceiver();\n\n  // Place an event into the event_table (synchronized).\n  void SetEvent(const SVEvent *svevent);\n\n  // Wake up the semaphore.\n  void Signal();\n\n  // Returns the unique, shared network stream.\n  static SVNetwork *GetStream() {\n    return stream_;\n  }\n\n  // Starts a new event handler.\n  // Called asynchronously whenever a new window is created.\n  void StartEventHandler();\n\n  // Escapes the ' character with a \\, so it can be processed by LUA.\n  char *AddEscapeChars(const char *input);\n\n  // The event handler for this window.\n  SVEventHandler *event_handler_;\n  // The name of the window.\n  const char *window_name_;\n  // The id of 
the window.\n  int window_id_;\n  // The points of the currently under-construction polyline.\n  SVPolyLineBuffer *points_;\n  // Whether the axis is reversed.\n  bool y_axis_is_reversed_;\n  // Set to true only after the event handler has terminated.\n  bool event_handler_ended_;\n  // If the y axis is reversed, flip all y values by ySize.\n  int y_size_;\n  // # of created windows (used to assign an id to each ScrollView* for svmap).\n  static int nr_created_windows_;\n  // Serial number of sent images to ensure that the viewer knows they\n  // are distinct.\n  static int image_index_;\n\n  // The stream through which the c++ client is connected to the server.\n  static SVNetwork *stream_;\n\n  // Table of all the currently queued events.\n  std::unique_ptr<SVEvent> event_table_[SVET_COUNT];\n\n  // Mutex to access the event_table_ in a synchronized fashion.\n  std::mutex mutex_;\n\n  // Semaphore to the thread belonging to this window.\n  SVSemaphore *semaphore_;\n#endif // !GRAPHICS_DISABLED\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_VIEWER_SCROLLVIEW_H_\n"
  },
  {
    "path": "src/viewer/svmnode.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        svmnode.cpp\n// Description: ScrollView Menu Node\n// Author:      Joern Wanke\n// Created:     Thu Nov 29 2007\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n// A SVMenuNode is an entity which contains the mapping from a menu entry on\n// the server side to the corresponding associated commands on the client.\n// It is designed to be a tree structure with a root node, which can then be\n// used to generate the appropriate messages to the server to display the\n// menu structure there.\n// A SVMenuNode can both be used in the context of popup menus as well as\n// menu bars.\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#ifndef GRAPHICS_DISABLED\n\n#include \"svmnode.h\"\n\n#include <cstring>\n#include <iostream>\n\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n// Create the empty root menu node, with just a caption. 
All other nodes should\n// be added to this or one of the submenus.\nSVMenuNode::SVMenuNode() {\n  cmd_event_ = -1;\n  child_ = nullptr;\n  next_ = nullptr;\n  parent_ = nullptr;\n  toggle_value_ = false;\n  is_check_box_entry_ = false;\n}\n\nSVMenuNode::~SVMenuNode() = default;\n\n// Create a new sub menu node with just a caption.  This is used to create\n// nodes which act as parent nodes to other nodes (e.g. submenus).\nSVMenuNode *SVMenuNode::AddChild(const char *txt) {\n  auto *s = new SVMenuNode(-1, txt, false, false);\n  this->AddChild(s);\n  return s;\n}\n\n// Create a \"normal\" menu node which is associated with a command event.\nvoid SVMenuNode::AddChild(const char *txt, int command_event) {\n  this->AddChild(new SVMenuNode(command_event, txt, false, false));\n}\n\n// Create a menu node with an associated value (which might be changed\n// through the gui).\nvoid SVMenuNode::AddChild(const char *txt, int command_event, const char *val) {\n  this->AddChild(new SVMenuNode(command_event, txt, false, false, val));\n}\n\n// Create a menu node with an associated value and description.\nvoid SVMenuNode::AddChild(const char *txt, int command_event, const char *val, const char *desc) {\n  this->AddChild(new SVMenuNode(command_event, txt, false, false, val, desc));\n}\n\n// Create a flag menu node.\nvoid SVMenuNode::AddChild(const char *txt, int command_event, int tv) {\n  this->AddChild(new SVMenuNode(command_event, txt, tv, true));\n}\n\n// Convenience function called from the different constructors to initialize\n// the different values of the menu node.\nSVMenuNode::SVMenuNode(int command_event, const char *txt, int tv, bool check_box_entry,\n                       const char *val, const char *desc)\n    : text_(txt), value_(val), description_(desc) {\n  cmd_event_ = command_event;\n\n  child_ = nullptr;\n  next_ = nullptr;\n  parent_ = nullptr;\n  toggle_value_ = tv != 0;\n  is_check_box_entry_ = check_box_entry;\n}\n\n// Add a child node to this menu 
node.\nvoid SVMenuNode::AddChild(SVMenuNode *svmn) {\n  svmn->parent_ = this;\n  // No children yet.\n  if (child_ == nullptr) {\n    child_ = svmn;\n  } else {\n    SVMenuNode *cur = child_;\n    while (cur->next_ != nullptr) {\n      cur = cur->next_;\n    }\n    cur->next_ = svmn;\n  }\n}\n\n// Build a menu structure for the server and send the necessary messages.\n// Should be called on the root node. If menu_bar is true, a menu_bar menu\n// is built (e.g. on top of the window), if it is false a popup menu is\n// built which gets shown by right clicking on the window.\n// Deletes itself afterwards.\nvoid SVMenuNode::BuildMenu(ScrollView *sv, bool menu_bar) {\n  if ((parent_ != nullptr) && (menu_bar)) {\n    if (is_check_box_entry_) {\n      sv->MenuItem(parent_->text_.c_str(), text_.c_str(), cmd_event_, toggle_value_);\n    } else {\n      sv->MenuItem(parent_->text_.c_str(), text_.c_str(), cmd_event_);\n    }\n  } else if ((parent_ != nullptr) && (!menu_bar)) {\n    if (description_.length() > 0) {\n      sv->PopupItem(parent_->text_.c_str(), text_.c_str(), cmd_event_, value_.c_str(),\n                    description_.c_str());\n    } else {\n      sv->PopupItem(parent_->text_.c_str(), text_.c_str());\n    }\n  }\n  if (child_ != nullptr) {\n    child_->BuildMenu(sv, menu_bar);\n    delete child_;\n  }\n  if (next_ != nullptr) {\n    next_->BuildMenu(sv, menu_bar);\n    delete next_;\n  }\n}\n\n} // namespace tesseract\n\n#endif // !GRAPHICS_DISABLED\n"
  },
  {
    "path": "src/viewer/svmnode.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        svmnode.h\n// Description: ScrollView Menu Node\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n// A SVMenuNode is an entity which contains the mapping from a menu entry on\n// the server side to the corresponding associated commands on the client.\n// It is designed to be a tree structure with a root node, which can then be\n// used to generate the appropriate messages to the server to display the\n// menu structure there.\n// A SVMenuNode can both be used in the context of popup menus as well as\n// menu bars.\n\n#ifndef TESSERACT_VIEWER_SVMNODE_H_\n#define TESSERACT_VIEWER_SVMNODE_H_\n\n#ifndef GRAPHICS_DISABLED\n\n#include <tesseract/export.h>\n\n#include <string>\n\nnamespace tesseract {\n\nclass ScrollView;\n\nclass TESS_API SVMenuNode {\npublic:\n  // Creating the (empty) root menu node.\n  SVMenuNode();\n\n  // Destructor for every node.\n  ~SVMenuNode();\n\n  // Create a new sub menu node with just a caption.  This is used to create\n  // nodes which act as parent nodes to other nodes (e.g. 
submenus).\n  SVMenuNode *AddChild(const char *txt);\n\n  // Create a \"normal\" menu node which is associated with a command event.\n  void AddChild(const char *txt, int command_event);\n\n  // Create a flag menu node.\n  void AddChild(const char *txt, int command_event, int tv);\n\n  // Create a menu node with an associated value (which might be changed\n  // through the gui).\n  void AddChild(const char *txt, int command_event, const char *val);\n\n  // Create a menu node with an associated value and description.\n  void AddChild(const char *txt, int command_event, const char *val, const char *desc);\n\n  // Build a menu structure for the server and send the necessary messages.\n  // Should be called on the root node. If menu_bar is true, a menu_bar menu\n  // is built (e.g. on top of the window), if it is false a popup menu is\n  // built which gets shown by right clicking on the window.\n  void BuildMenu(ScrollView *sv, bool menu_bar = true);\n\nprivate:\n  // Constructor holding the actual node data.\n  SVMenuNode(int command_event, const char *txt, int tv, bool check_box_entry, const char *val = \"\",\n             const char *desc = \"\");\n\n  // Adds a new menu node to the current node.\n  void AddChild(SVMenuNode *svmn);\n\n  // The parent node of this node.\n  SVMenuNode *parent_;\n  // The first child of this node.\n  SVMenuNode *child_;\n  // The next \"sibling\" of this node (e.g. same parent).\n  SVMenuNode *next_;\n  // Whether this menu node actually is a flag.\n  bool is_check_box_entry_;\n  // The value of the flag (if this menu node is a flag).\n  bool toggle_value_;\n\n  // The command event associated with a specific menu node. Should be unique.\n  int cmd_event_;\n  // The caption associated with a specific menu node.\n  std::string text_;\n  // The value of the menu node. (optional)\n  std::string value_;\n  // A description of the value. 
(optional)\n  std::string description_;\n};\n\n} // namespace tesseract\n\n#endif // !GRAPHICS_DISABLED\n\n#endif // TESSERACT_VIEWER_SVMNODE_H_\n"
  },
  {
    "path": "src/viewer/svutil.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        svutil.cpp\n// Description: ScrollView Utilities\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n// SVUtil contains the SVSync and SVNetwork classes, which are used for\n// thread/process creation & synchronization and network connection.\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"svutil.h\"\n\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n#include <iostream>\n#include <memory>\n#include <string>\n#include <thread> // for std::this_thread\n#include <vector>\n\n#ifdef _WIN32\n#  pragma comment(lib, \"Ws2_32.lib\")\n#  include <winsock2.h> // for fd_set, send, ..\n#  include <ws2tcpip.h> // for addrinfo\n#else\n#  include <arpa/inet.h>\n#  include <netdb.h>\n#  include <netinet/in.h>\n#  include <semaphore.h>\n#  include <sys/select.h>\n#  include <sys/socket.h>\n#  include <csignal>\n#  ifdef __linux__\n#    include <sys/prctl.h>\n#  endif\n#  include <unistd.h>\n#endif\n\n#if defined(_WIN32) && !defined(__GNUC__)\n#  define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)\n#endif /* _WIN32 && !__GNUC__ */\n\n#ifndef GRAPHICS_DISABLED\n\nnamespace tesseract {\n\n// Starts a new 
process.\nvoid SVSync::StartProcess(const char *executable, const char *args) {\n  std::string proc;\n  proc.append(executable);\n  proc.append(\" \");\n  proc.append(args);\n  std::cout << \"Starting \" << proc << std::endl;\n#  ifdef _WIN32\n  STARTUPINFO start_info;\n  PROCESS_INFORMATION proc_info;\n  GetStartupInfo(&start_info);\n  if (!CreateProcess(nullptr, const_cast<char *>(proc.c_str()), nullptr,\n                     nullptr, FALSE, CREATE_NO_WINDOW | DETACHED_PROCESS,\n                     nullptr, nullptr, &start_info, &proc_info))\n    return;\n#  else\n  int pid = fork();\n  if (pid != 0) { // The father process returns\n  } else {\n#    ifdef __linux__\n    // Make sure the java process terminates on exit, since its\n    // broken socket detection seems to be useless.\n    prctl(PR_SET_PDEATHSIG, 2, 0, 0, 0);\n#    endif\n    std::string mutable_args(args);\n    int argc = 1;\n    for (auto ch : mutable_args) {\n      if (ch == ' ') {\n        ++argc;\n      }\n    }\n    std::unique_ptr<char *[]> argv(new char *[argc + 2]);\n    std::string argv0(executable);\n    argv[0] = &argv0[0];\n    argv[1] = &mutable_args[0];\n    argc = 2;\n    bool inquote = false;\n    for (int i = 0; mutable_args[i]; ++i) {\n      if (!inquote && mutable_args[i] == ' ') {\n        mutable_args[i] = '\\0';\n        argv[argc++] = &mutable_args[i + 1];\n      } else if (mutable_args[i] == '\"') {\n        inquote = !inquote;\n        mutable_args[i] = ' ';\n      }\n    }\n    argv[argc] = nullptr;\n    execvp(executable, argv.get());\n  }\n#  endif\n}\n\nSVSemaphore::SVSemaphore() {\n#  ifdef _WIN32\n  semaphore_ = CreateSemaphore(0, 0, 10, 0);\n#  elif defined(__APPLE__)\n  auto name = std::to_string(random());\n  sem_unlink(name.c_str());\n  semaphore_ = sem_open(name.c_str(), O_CREAT, S_IWUSR, 0);\n  if (semaphore_ == SEM_FAILED) {\n    perror(\"sem_open\");\n  }\n#  else\n  sem_init(&semaphore_, 0, 0);\n#  endif\n}\n\nSVSemaphore::~SVSemaphore() {\n#  ifdef _WIN32\n  
CloseHandle(semaphore_);\n#  elif defined(__APPLE__)\n  sem_close(semaphore_);\n#  else\n  sem_close(&semaphore_);\n#  endif\n}\n\nvoid SVSemaphore::Signal() {\n#  ifdef _WIN32\n  ReleaseSemaphore(semaphore_, 1, nullptr);\n#  elif defined(__APPLE__)\n  sem_post(semaphore_);\n#  else\n  sem_post(&semaphore_);\n#  endif\n}\n\nvoid SVSemaphore::Wait() {\n#  ifdef _WIN32\n  WaitForSingleObject(semaphore_, INFINITE);\n#  elif defined(__APPLE__)\n  sem_wait(semaphore_);\n#  else\n  sem_wait(&semaphore_);\n#  endif\n}\n\n// Place a message in the message buffer (and flush it).\nvoid SVNetwork::Send(const char *msg) {\n  std::lock_guard<std::mutex> guard(mutex_send_);\n  msg_buffer_out_.append(msg);\n}\n\n// Send the whole buffer.\nvoid SVNetwork::Flush() {\n  std::lock_guard<std::mutex> guard(mutex_send_);\n  while (!msg_buffer_out_.empty()) {\n    int i = send(stream_, msg_buffer_out_.c_str(), msg_buffer_out_.length(), 0);\n    msg_buffer_out_.erase(0, i);\n  }\n}\n\n// Receive a message from the server.\n// This will always return one line of char* (denoted by \\n).\nchar *SVNetwork::Receive() {\n  char *result = nullptr;\n  if (buffer_ptr_ != nullptr) {\n    result = strtok_r(nullptr, \"\\n\", &buffer_ptr_);\n  }\n\n  // This means there is something left in the buffer and we return it.\n  if (result != nullptr) {\n    return result;\n    // Otherwise, we read from the stream_.\n  } else {\n    buffer_ptr_ = nullptr;\n\n    // The timeout length is not really important since we are looping anyway\n    // until a new message is delivered.\n    struct timeval tv;\n    tv.tv_sec = 10;\n    tv.tv_usec = 0;\n\n    // Set the flags to return when the stream_ is ready to be read.\n    fd_set readfds;\n    FD_ZERO(&readfds);\n    FD_SET(stream_, &readfds);\n\n    int i = select(stream_ + 1, &readfds, nullptr, nullptr, &tv);\n\n    // The stream_ died.\n    if (i == 0) {\n      return nullptr;\n    }\n\n    // Read the message buffer.\n    i = recv(stream_, msg_buffer_in_, 
kMaxMsgSize, 0);\n\n    // Server quit (0) or error (-1).\n    if (i <= 0) {\n      return nullptr;\n    }\n    msg_buffer_in_[i] = '\\0';\n    // Setup a new string tokenizer.\n    return strtok_r(msg_buffer_in_, \"\\n\", &buffer_ptr_);\n  }\n}\n\n// Close the connection to the server.\nvoid SVNetwork::Close() {\n#  ifdef _WIN32\n  closesocket(stream_);\n#  else\n  close(stream_);\n#  endif\n  // Mark stream_ as invalid.\n  stream_ = -1;\n}\n\n// The program to invoke to start ScrollView\nstatic const char *ScrollViewProg() {\n#  ifdef _WIN32\n  const char *prog = \"java -Xms512m -Xmx1024m\";\n#  else\n  const char *prog = \"sh\";\n#  endif\n  return prog;\n}\n\n// The arguments to the program to invoke to start ScrollView\nstatic std::string ScrollViewCommand(const std::string &scrollview_path) {\n  // Quote our paths on Windows to deal with spaces\n#  ifdef _WIN32\n  const char cmd_template[] =\n      \"-Djava.library.path=\\\"%s\\\" -jar \\\"%s/ScrollView.jar\\\"\";\n#  else\n  const char cmd_template[] =\n      \"-c \\\"trap 'kill %%1' 0 1 2 ; java \"\n      \"-Xms1024m -Xmx2048m -jar %s/ScrollView.jar\"\n      \" & wait\\\"\";\n#  endif\n  size_t cmdlen = sizeof(cmd_template) + 2 * scrollview_path.size() + 1;\n  std::vector<char> cmd(cmdlen);\n  const char *sv_path = scrollview_path.c_str();\n#  ifdef _WIN32\n  snprintf(&cmd[0], cmdlen, cmd_template, sv_path, sv_path);\n#  else\n  snprintf(&cmd[0], cmdlen, cmd_template, sv_path);\n#  endif\n  std::string command(&cmd[0]);\n  return command;\n}\n\n// Set up a connection to a ScrollView on hostname:port.\nSVNetwork::SVNetwork(const char *hostname, int port) {\n  msg_buffer_in_ = new char[kMaxMsgSize + 1];\n  msg_buffer_in_[0] = '\\0';\n\n  buffer_ptr_ = nullptr;\n\n  auto port_string = std::to_string(port);\n#  ifdef _WIN32\n  // Initialize Winsock\n  WSADATA wsaData;\n  int iResult = WSAStartup(MAKEWORD(2, 2), &wsaData);\n  if (iResult != 0) {\n    std::cerr << \"WSAStartup failed: \" << iResult << 
std::endl;\n  }\n#  endif // _WIN32\n\n  struct addrinfo *addr_info = nullptr;\n  struct addrinfo hints = {};\n  hints.ai_family = AF_INET;\n  hints.ai_socktype = SOCK_STREAM;\n  if (getaddrinfo(hostname, port_string.c_str(), &hints, &addr_info) != 0) {\n    std::cerr << \"Error resolving name for ScrollView host \"\n              << std::string(hostname) << \":\" << port << std::endl;\n#  ifdef _WIN32\n    WSACleanup();\n#  endif // _WIN32\n  }\n\n  if (addr_info == nullptr) {\n    // Mark stream_ as invalid.\n    stream_ = -1;\n  } else {\n    stream_ = socket(addr_info->ai_family, addr_info->ai_socktype,\n                     addr_info->ai_protocol);\n  }\n\n  if (stream_ < 0) {\n    std::cerr << \"Failed to open socket\" << std::endl;\n  } else if (connect(stream_, addr_info->ai_addr, addr_info->ai_addrlen) < 0) {\n    // If server is not there, we will start a new server as local child\n    // process.\n    const char *scrollview_path = getenv(\"SCROLLVIEW_PATH\");\n    if (scrollview_path == nullptr) {\n#  ifdef SCROLLVIEW_PATH\n#    define _STR(a) #    a\n#    define _XSTR(a) _STR(a)\n      scrollview_path = _XSTR(SCROLLVIEW_PATH);\n#    undef _XSTR\n#    undef _STR\n#  else\n      scrollview_path = \".\";\n#  endif\n    }\n    const char *prog = ScrollViewProg();\n    std::string command = ScrollViewCommand(scrollview_path);\n    SVSync::StartProcess(prog, command.c_str());\n\n    // Wait for server to show up.\n    // Note: There is no exception handling in case the server never turns up.\n\n    Close();\n    for (;;) {\n      stream_ = socket(addr_info->ai_family, addr_info->ai_socktype,\n                       addr_info->ai_protocol);\n      if (stream_ >= 0) {\n        if (connect(stream_, addr_info->ai_addr, addr_info->ai_addrlen) == 0) {\n          break;\n        }\n\n        Close();\n\n        std::cout << \"ScrollView: Waiting for server...\\n\";\n        std::this_thread::sleep_for(std::chrono::seconds(1));\n      }\n    }\n  }\n#  ifdef _WIN32\n 
 // WSACleanup();  // This cause ScrollView windows is not displayed\n#  endif // _WIN32\n  freeaddrinfo(addr_info);\n}\n\nSVNetwork::~SVNetwork() {\n  Close();\n  delete[] msg_buffer_in_;\n}\n\n} // namespace tesseract\n\n#endif // !GRAPHICS_DISABLED\n"
  },
  {
    "path": "src/viewer/svutil.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        svutil.h\n// Description: ScrollView Utilities\n// Author:      Joern Wanke\n//\n// (C) Copyright 2007, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n//\n// SVUtil contains the SVSync, SVSemaphore and SVNetwork\n// classes, which are used for thread/process creation & synchronization\n// and network connection.\n\n#ifndef TESSERACT_VIEWER_SVUTIL_H_\n#define TESSERACT_VIEWER_SVUTIL_H_\n\n#ifdef _WIN32\n#  include \"host.h\" // also includes windows.h\n#else\n#  include <semaphore.h>\n#endif\n\n#include <mutex>\n#include <string>\n\nnamespace tesseract {\n\n// Maximum message size for ScrollView network communication\nconst int kMaxMsgSize = 4096;\n\n/// The SVSync class provides functionality for Thread & Process Creation\nclass SVSync {\npublic:\n  /// Starts a new process.\n  static void StartProcess(const char *executable, const char *args);\n};\n\n/// A semaphore class which encapsulates the main signaling\n/// and wait abilities of semaphores for windows and unix.\nclass SVSemaphore {\npublic:\n  /// Sets up a semaphore.\n  SVSemaphore();\n  /// Cleans up the mutex\n  ~SVSemaphore();\n  /// Signal a semaphore.\n  void Signal();\n  /// Wait on a semaphore.\n  void Wait();\n\nprivate:\n#ifdef _WIN32\n  HANDLE semaphore_;\n#elif defined(__APPLE__)\n  sem_t 
*semaphore_;\n#else\n  sem_t semaphore_;\n#endif\n};\n\n/// The SVNetwork class takes care of the remote connection for ScrollView\n/// This means setting up and maintaining a remote connection, sending and\n/// receiving messages and closing the connection.\n/// It is designed to work on both Linux and Windows.\nclass SVNetwork {\npublic:\n  /// Set up a connection to hostname on port.\n  SVNetwork(const char *hostname, int port);\n\n  /// Destructor.\n  ~SVNetwork();\n\n  /// Put a message in the messagebuffer to the server and try to send it.\n  void Send(const char *msg);\n\n  /// Receive a message from the server.\n  /// This will always return one line of char* (denoted by \\\\n).\n  char *Receive();\n\n  /// Close the connection to the server.\n  void Close();\n\n  /// Flush the buffer.\n  void Flush();\n\nprivate:\n  /// The mutex for access to Send() and Flush().\n  std::mutex mutex_send_;\n  /// The actual stream_ to the server.\n  int stream_;\n  /// Stores the last received message-chunk from the server.\n  char *msg_buffer_in_;\n\n  /// Stores the messages which are supposed to go out.\n  std::string msg_buffer_out_;\n\n  /// Where we are at in our msg_buffer_in_\n  char *buffer_ptr_; // strtok_r, strtok_s\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_VIEWER_SVUTIL_H_\n"
  },
  {
    "path": "src/wordrec/associate.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        associate.cpp\n// Description: Functions for scoring segmentation paths according to\n//              their character widths, gap widths and seam cuts.\n// Author:      Daria Antonova\n// Created:     Mon Mar 8 11:26:43 PDT 2010\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <cmath>\n#include <cstdio>\n\n#include \"associate.h\"\n#include \"normalis.h\"\n#include \"pageres.h\"\n\nnamespace tesseract {\n\nconst float AssociateUtils::kMaxFixedPitchCharAspectRatio = 2.0f;\nconst float AssociateUtils::kMinGap = 0.03f;\n\nvoid AssociateUtils::ComputeStats(int col, int row, const AssociateStats *parent_stats,\n                                  int parent_path_length, bool fixed_pitch, float max_char_wh_ratio,\n                                  WERD_RES *word_res, bool debug, AssociateStats *stats) {\n  stats->Clear();\n\n  ASSERT_HOST(word_res != nullptr);\n  if (word_res->blob_widths.empty()) {\n    return;\n  }\n  if (debug) {\n    tprintf(\"AssociateUtils::ComputeStats() for col=%d, row=%d%s\\n\", col, row,\n            fixed_pitch ? 
\" (fixed pitch)\" : \"\");\n  }\n  float normalizing_height = kBlnXHeight;\n  ROW *blob_row = word_res->blob_row;\n  // TODO(rays/daria) Can unicharset.script_has_xheight be useful here?\n  if (fixed_pitch && blob_row != nullptr) {\n    // For fixed pitch language like CJK, we use the full text height\n    // as the normalizing factor so we are not dependent on xheight\n    // calculation.\n    if (blob_row->body_size() > 0.0f) {\n      normalizing_height = word_res->denorm.y_scale() * blob_row->body_size();\n    } else {\n      normalizing_height =\n          word_res->denorm.y_scale() * (blob_row->x_height() + blob_row->ascenders());\n    }\n    if (debug) {\n      tprintf(\"normalizing height = %g (scale %g xheight %g ascenders %g)\\n\", normalizing_height,\n              word_res->denorm.y_scale(), blob_row->x_height(), blob_row->ascenders());\n    }\n  }\n  float wh_ratio = word_res->GetBlobsWidth(col, row) / normalizing_height;\n  if (wh_ratio > max_char_wh_ratio) {\n    stats->bad_shape = true;\n  }\n  // Compute the gap sum for this shape. If there are only negative or only\n  // positive gaps, record their sum in stats->gap_sum. However, if there is\n  // a mixture, record only the sum of the positive gaps.\n  // TODO(antonova): explain fragment.\n  int negative_gap_sum = 0;\n  for (int c = col; c < row; ++c) {\n    int gap = word_res->GetBlobsGap(c);\n    (gap > 0) ? stats->gap_sum += gap : negative_gap_sum += gap;\n  }\n  if (stats->gap_sum == 0) {\n    stats->gap_sum = negative_gap_sum;\n  }\n  if (debug) {\n    tprintf(\"wh_ratio=%g (max_char_wh_ratio=%g) gap_sum=%d %s\\n\", wh_ratio, max_char_wh_ratio,\n            stats->gap_sum, stats->bad_shape ? 
\"bad_shape\" : \"\");\n  }\n  // Compute shape_cost (for fixed pitch mode).\n  if (fixed_pitch) {\n    bool end_row = (row == (word_res->ratings->dimension() - 1));\n\n    // Ensure that the blob has gaps on the left and the right sides\n    // (except for beginning and ending punctuation) and that there is\n    // no cutting through ink at the blob boundaries.\n    if (col > 0) {\n      float left_gap = word_res->GetBlobsGap(col - 1) / normalizing_height;\n      SEAM *left_seam = word_res->seam_array[col - 1];\n      if ((!end_row && left_gap < kMinGap) || left_seam->priority() > 0.0f) {\n        stats->bad_shape = true;\n      }\n      if (debug) {\n        tprintf(\"left_gap %g, left_seam %g %s\\n\", left_gap, left_seam->priority(),\n                stats->bad_shape ? \"bad_shape\" : \"\");\n      }\n    }\n    float right_gap = 0.0f;\n    if (!end_row) {\n      right_gap = word_res->GetBlobsGap(row) / normalizing_height;\n      SEAM *right_seam = word_res->seam_array[row];\n      if (right_gap < kMinGap || right_seam->priority() > 0.0f) {\n        stats->bad_shape = true;\n        if (right_gap < kMinGap) {\n          stats->bad_fixed_pitch_right_gap = true;\n        }\n      }\n      if (debug) {\n        tprintf(\"right_gap %g right_seam %g %s\\n\", right_gap, right_seam->priority(),\n                stats->bad_shape ? 
\"bad_shape\" : \"\");\n      }\n    }\n\n    // Impose additional segmentation penalties if blob widths or gaps\n    // distribution don't fit a fixed-pitch model.\n    // Since we only know the widths and gaps of the path explored so far,\n    // the means and variances are computed for the path so far (not\n    // considering characters to the right of the last character on the path).\n    stats->full_wh_ratio = wh_ratio + right_gap;\n    if (parent_stats != nullptr) {\n      stats->full_wh_ratio_total = (parent_stats->full_wh_ratio_total + stats->full_wh_ratio);\n      float mean = stats->full_wh_ratio_total / static_cast<float>(parent_path_length + 1);\n      stats->full_wh_ratio_var =\n          parent_stats->full_wh_ratio_var + pow(mean - stats->full_wh_ratio, 2);\n    } else {\n      stats->full_wh_ratio_total = stats->full_wh_ratio;\n    }\n    if (debug) {\n      tprintf(\"full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\\n\",\n              stats->full_wh_ratio, stats->full_wh_ratio_total, stats->full_wh_ratio_var);\n    }\n\n    stats->shape_cost = FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio);\n\n    // For some reason Tesseract prefers to treat the whole CJ words\n    // as one blob when the initial segmentation is particularly bad.\n    // This hack is to avoid favoring such states.\n    if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) {\n      stats->shape_cost += 10;\n    }\n    stats->shape_cost += stats->full_wh_ratio_var;\n    if (debug) {\n      tprintf(\"shape_cost %g\\n\", stats->shape_cost);\n    }\n  }\n}\n\nfloat AssociateUtils::FixedPitchWidthCost(float norm_width, float right_gap, bool end_pos,\n                                          float max_char_wh_ratio) {\n  float cost = 0.0f;\n  if (norm_width > max_char_wh_ratio) {\n    cost += norm_width;\n  }\n  if (norm_width > kMaxFixedPitchCharAspectRatio) {\n    cost += norm_width * norm_width; // extra penalty for merging CJK chars\n  }\n  // 
Penalize skinny blobs, except for punctuation in the last position.\n  if (norm_width + right_gap < 0.5f && !end_pos) {\n    cost += 1.0f - (norm_width + right_gap);\n  }\n  return cost;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/associate.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        associate.h\n// Description: Structs, classes, typedefs useful for the segmentation\n//              search. Functions for scoring segmentation paths according\n//              to their character widths, gap widths and seam cuts.\n// Author:      Daria Antonova\n// Created:     Mon Mar 8 11:26:43 PDT 2010\n//\n// (C) Copyright 2010, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef ASSOCIATE_H\n#define ASSOCIATE_H\n\n#include \"blobs.h\"\n#include \"elst.h\"\n#include \"ratngs.h\"\n#include \"seam.h\"\n#include \"split.h\"\n\nnamespace tesseract {\n\nclass WERD_RES;\n\n// Statistics about character widths, gaps and seams.\nstruct AssociateStats {\n  AssociateStats() {\n    Clear();\n  }\n\n  void Clear() {\n    shape_cost = 0.0f;\n    bad_shape = false;\n    full_wh_ratio = 0.0f;\n    full_wh_ratio_total = 0.0f;\n    full_wh_ratio_var = 0.0f;\n    bad_fixed_pitch_right_gap = false;\n    bad_fixed_pitch_wh_ratio = false;\n    gap_sum = 0;\n  }\n\n  void Print() {\n    tprintf(\"AssociateStats: s(%g %d)\\n\", shape_cost, bad_shape);\n  }\n\n  float shape_cost;               // cost of blob shape\n  bool bad_shape;                 // true if the shape of the blob is unacceptable\n  float full_wh_ratio;            // width-to-height ratio + gap on the right\n  float 
full_wh_ratio_total;      // sum of width-to-height ratios\n                                  // on the path terminating at this blob\n  float full_wh_ratio_var;        // variance of full_wh_ratios on the path\n  bool bad_fixed_pitch_right_gap; // true if there is no gap before\n                                  // the blob on the right\n  bool bad_fixed_pitch_wh_ratio;  // true if the blobs has width-to-height\n                                  // ratio > kMaxFixedPitchCharAspectRatio\n  int gap_sum;                    // sum of gaps within the blob\n};\n\n// Utility functions for scoring segmentation paths according to their\n// character widths, gap widths, seam characteristics.\nclass AssociateUtils {\npublic:\n  static const float kMaxFixedPitchCharAspectRatio;\n  static const float kMinGap;\n\n  // Returns outline length of the given blob is computed as:\n  // rating_cert_scale * rating / certainty\n  // Since from Wordrec::SegSearch() in segsearch.cpp\n  // rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale\n  // And from Classify::ConvertMatchesToChoices() in adaptmatch.cpp\n  // Rating = Certainty = next.rating\n  // Rating *= rating_scale * Results->BlobLength\n  // Certainty *= -(getDict().certainty_scale)\n  static inline float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b) {\n    return rating_cert_scale * b.rating() / b.certainty();\n  }\n  static inline float ComputeRating(float rating_cert_scale, float cert, int width) {\n    return static_cast<float>(width) * cert / rating_cert_scale;\n  }\n\n  // Computes character widths, gaps and seams stats given the\n  // AssociateStats of the path so far, col, row of the blob that\n  // is being added to the path, and WERD_RES containing information\n  // about character widths, gaps and seams.\n  // Fills associate_cost with the combined shape, gap and seam cost\n  // of adding a unichar from (col, row) to the path (note that since\n  // this function could be used to 
compute the prioritization for\n  // pain points, (col, row) entry might not be classified yet; thus\n  // information in the (col, row) entry of the ratings matrix is not used).\n  //\n  // Note: the function assumes that word_res, stats and\n  // associate_cost pointers are not nullptr.\n  static void ComputeStats(int col, int row, const AssociateStats *parent_stats,\n                           int parent_path_length, bool fixed_pitch, float max_char_wh_ratio,\n                           WERD_RES *word_res, bool debug, AssociateStats *stats);\n\n  // Returns the width cost for fixed-pitch text.\n  static float FixedPitchWidthCost(float norm_width, float right_gap, bool end_pos,\n                                   float max_char_wh_ratio);\n\n  // Returns the gap cost for fixed-pitch text (penalizes vertically\n  // overlapping components).\n  static inline float FixedPitchGapCost(float norm_gap, bool end_pos) {\n    return (norm_gap < 0.05 && !end_pos) ? 5.0f : 0.0f;\n  }\n};\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/wordrec/chop.cpp",
    "content": "/******************************************************************************\n *\n * File:        chop.cpp  (Formerly chop.c)\n * Author:      Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n\n#define _USE_MATH_DEFINES // for M_PI\n#include \"chop.h\"\n#include <cmath> // for M_PI\n#include \"outlines.h\"\n#include \"plotedges.h\"\n#include \"wordrec.h\"\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\nnamespace tesseract {\n\n// Show if the line is going in the positive or negative X direction.\nstatic int direction(const EDGEPT *point) {\n  //* direction to return\n  int dir = 0;\n  //* prev point\n  const EDGEPT *prev = point->prev;\n  //* next point\n  const EDGEPT *next = point->next;\n\n  if (((prev->pos.x <= point->pos.x) && (point->pos.x < next->pos.x)) ||\n      ((prev->pos.x < point->pos.x) && (point->pos.x <= next->pos.x))) {\n    dir = 1;\n  }\n  if (((prev->pos.x >= point->pos.x) && (point->pos.x > next->pos.x)) ||\n      ((prev->pos.x > point->pos.x) && (point->pos.x >= next->pos.x))) {\n    
dir = -1;\n  }\n\n  return dir;\n}\n\n/**\n * @name point_priority\n *\n * Assign a priority to and edge point that might be used as part of a\n * split. The argument should be of type EDGEPT.\n */\nPRIORITY Wordrec::point_priority(EDGEPT *point) {\n  return static_cast<PRIORITY>(angle_change(point->prev, point, point->next));\n}\n\n/**\n * @name add_point_to_list\n *\n * Add an edge point to a POINT_GROUP containing a list of other points.\n */\nvoid Wordrec::add_point_to_list(PointHeap *point_heap, EDGEPT *point) {\n  if (point_heap->size() < MAX_NUM_POINTS - 2) {\n    PointPair pair(point_priority(point), point);\n    point_heap->Push(&pair);\n  }\n\n#ifndef GRAPHICS_DISABLED\n  if (chop_debug > 2) {\n    mark_outline(point);\n  }\n#endif\n}\n\n// Returns true if the edgept supplied as input is an inside angle.  This\n// is determined by the angular change of the vectors from point to point.\nbool Wordrec::is_inside_angle(EDGEPT *pt) {\n  return angle_change(pt->prev, pt, pt->next) < chop_inside_angle;\n}\n\n/**\n * @name angle_change\n *\n * Return the change in angle (degrees) of the line segments between\n * points one and two, and two and three.\n */\nint Wordrec::angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) {\n  VECTOR vector1;\n  VECTOR vector2;\n\n  int angle;\n\n  /* Compute angle */\n  vector1.x = point2->pos.x - point1->pos.x;\n  vector1.y = point2->pos.y - point1->pos.y;\n  vector2.x = point3->pos.x - point2->pos.x;\n  vector2.y = point3->pos.y - point2->pos.y;\n  /* Use cross product */\n  float length = std::sqrt(static_cast<float>(vector1.length2()) * vector2.length2());\n  if (static_cast<int>(length) == 0) {\n    return (0);\n  }\n  auto f = vector1.cross(vector2) / length;\n  // Avoid FP exception in std::asin caused by illegal values of f\n  // (caused by rounding errors).\n  if (f <= -1.0f) {\n    angle = -90;\n  } else if (f >= 1.0f) {\n    angle = 90;\n  } else {\n    angle = static_cast<int>(floor(std::asin(f) / M_PI * 180.0 
+ 0.5));\n    // Use dot product.\n    if (vector1.dot(vector2) < 0) {\n      angle = 180 - angle;\n    }\n    // Adjust angle.\n    if (angle > 180) {\n      angle -= 360;\n    } else if (angle <= -180) {\n      angle += 360;\n    }\n  }\n  return angle;\n}\n\n/**\n * @name pick_close_point\n *\n * Choose the edge point that is closest to the critical point.  This\n * point may not be exactly vertical from the critical point.\n */\nEDGEPT *Wordrec::pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) {\n  EDGEPT *best_point = nullptr;\n  int this_distance;\n  bool found_better;\n\n  do {\n    found_better = false;\n\n    this_distance = edgept_dist(critical_point, vertical_point);\n    if (this_distance <= *best_dist) {\n      if (!(same_point(critical_point->pos, vertical_point->pos) ||\n            same_point(critical_point->pos, vertical_point->next->pos) ||\n            (best_point && same_point(best_point->pos, vertical_point->pos)) ||\n            is_exterior_point(critical_point, vertical_point))) {\n        *best_dist = this_distance;\n        best_point = vertical_point;\n        if (chop_vertical_creep) {\n          found_better = true;\n        }\n      }\n    }\n    vertical_point = vertical_point->next;\n  } while (found_better == true);\n\n  return (best_point);\n}\n\n/**\n * @name prioritize_points\n *\n * Find a list of edge points from the outer outline of this blob.  For\n * each of these points assign a priority.  
Sort these points using a\n * heap structure so that they can be visited in order.\n */\nvoid Wordrec::prioritize_points(TESSLINE *outline, PointHeap *points) {\n  EDGEPT *this_point;\n  EDGEPT *local_min = nullptr;\n  EDGEPT *local_max = nullptr;\n\n  this_point = outline->loop;\n  local_min = this_point;\n  local_max = this_point;\n  do {\n    if (this_point->vec.y < 0) {\n      /* Look for minima */\n      if (local_max != nullptr) {\n        new_max_point(local_max, points);\n      } else if (is_inside_angle(this_point)) {\n        add_point_to_list(points, this_point);\n      }\n      local_max = nullptr;\n      local_min = this_point->next;\n    } else if (this_point->vec.y > 0) {\n      /* Look for maxima */\n      if (local_min != nullptr) {\n        new_min_point(local_min, points);\n      } else if (is_inside_angle(this_point)) {\n        add_point_to_list(points, this_point);\n      }\n      local_min = nullptr;\n      local_max = this_point->next;\n    } else {\n      /* Flat area */\n      if (local_max != nullptr) {\n        if (local_max->prev->vec.y != 0) {\n          new_max_point(local_max, points);\n        }\n        local_max = this_point->next;\n        local_min = nullptr;\n      } else {\n        if (local_min->prev->vec.y != 0) {\n          new_min_point(local_min, points);\n        }\n        local_min = this_point->next;\n        local_max = nullptr;\n      }\n    }\n\n    /* Next point */\n    this_point = this_point->next;\n  } while (this_point != outline->loop);\n}\n\n/**\n * @name new_min_point\n *\n * Found a new minimum point try to decide whether to save it or not.\n * Return the new value for the local minimum.  
If a point is saved then\n * the local minimum is reset to nullptr.\n */\nvoid Wordrec::new_min_point(EDGEPT *local_min, PointHeap *points) {\n  int16_t dir;\n\n  dir = direction(local_min);\n\n  if (dir < 0) {\n    add_point_to_list(points, local_min);\n    return;\n  }\n\n  if (dir == 0 && point_priority(local_min) < 0) {\n    add_point_to_list(points, local_min);\n    return;\n  }\n}\n\n/**\n * @name new_max_point\n *\n * Found a new minimum point try to decide whether to save it or not.\n * Return the new value for the local minimum.  If a point is saved then\n * the local minimum is reset to nullptr.\n */\nvoid Wordrec::new_max_point(EDGEPT *local_max, PointHeap *points) {\n  int16_t dir;\n\n  dir = direction(local_max);\n\n  if (dir > 0) {\n    add_point_to_list(points, local_max);\n    return;\n  }\n\n  if (dir == 0 && point_priority(local_max) < 0) {\n    add_point_to_list(points, local_max);\n    return;\n  }\n}\n\n/**\n * @name vertical_projection_point\n *\n * For one point on the outline, find the corresponding point on the\n * other side of the outline that is a likely projection for a split\n * point.  This is done by iterating through the edge points until the\n * X value of the point being looked at is greater than the X value of\n * the split point.  Ensure that the point being returned is not right\n * next to the split point.  
Return the edge point in *best_point as\n * a result, and any points that were newly created are also saved on\n * the new_points list.\n */\nvoid Wordrec::vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point,\n                                        EDGEPT **best_point, EDGEPT_CLIST *new_points) {\n  EDGEPT *p;           /* Iterator */\n  EDGEPT *this_edgept; /* Iterator */\n  EDGEPT_C_IT new_point_it(new_points);\n  int x = split_point->pos.x;     /* X value of vertical */\n  int best_dist = LARGE_DISTANCE; /* Best point found */\n\n  if (*best_point != nullptr) {\n    best_dist = edgept_dist(split_point, *best_point);\n  }\n\n  p = target_point;\n  /* Look at each edge point */\n  do {\n    if (((p->pos.x <= x && x <= p->next->pos.x) || (p->next->pos.x <= x && x <= p->pos.x)) &&\n        !same_point(split_point->pos, p->pos) && !same_point(split_point->pos, p->next->pos) &&\n        !p->IsChopPt() && (*best_point == nullptr || !same_point((*best_point)->pos, p->pos))) {\n      if (near_point(split_point, p, p->next, &this_edgept)) {\n        new_point_it.add_before_then_move(this_edgept);\n      }\n\n      if (*best_point == nullptr) {\n        best_dist = edgept_dist(split_point, this_edgept);\n      }\n\n      this_edgept = pick_close_point(split_point, this_edgept, &best_dist);\n      if (this_edgept) {\n        *best_point = this_edgept;\n      }\n    }\n\n    p = p->next;\n  } while (p != target_point);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/chop.h",
    "content": "/******************************************************************************\n *\n * File:        chop.h\n * Author:      Mark Seaman, SW Productivity\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef CHOP_H\n#define CHOP_H\n\n#include \"genericheap.h\"\n#include \"kdpair.h\"\n#include \"seam.h\"\n\nnamespace tesseract {\n\n#define MAX_NUM_POINTS 50\n\n// The PointPair elements do NOT own the EDGEPTs.\nusing PointPair = KDPairInc<float, EDGEPT *>;\nusing PointHeap = GenericHeap<PointPair>;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/wordrec/chopper.cpp",
    "content": "/******************************************************************************\n *\n * File:         chopper.cpp  (Formerly chopper.c)\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"blamer.h\"         // for BlamerBundle, IRR_CORRECT\n#include \"blobs.h\"          // for TPOINT, TBLOB, EDGEPT, TESSLINE, divisible_blob\n#include \"dict.h\"           // for Dict\n#include \"lm_pain_points.h\" // for LMPainPoints\n#include \"lm_state.h\"       // for BestChoiceBundle\n#include \"matrix.h\"         // for MATRIX\n#include \"normalis.h\"       // for DENORM\n#include \"pageres.h\"        // for WERD_RES\n#include \"params.h\"         // for IntParam, BoolParam\n#include \"ratngs.h\"         // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST (ptr ...\n#include \"rect.h\"           // for TBOX\n#include \"render.h\"         // for display_blob\n#include \"seam.h\"           // for SEAM\n#include \"split.h\"          // for remove_edgept\n#include \"stopper.h\"        // for DANGERR\n#include \"tprintf.h\"        // for tprintf\n#include \"wordrec.h\"        // for Wordrec, SegSearchPending (ptr only)\n\nnamespace 
tesseract {\n\n// Even though the limit on the number of chunks may now be removed, keep\n// the same limit for repeatable behavior, and it may be a speed advantage.\nstatic const int kMaxNumChunks = 64;\n\n/*----------------------------------------------------------------------\n          F u n c t i o n s\n----------------------------------------------------------------------*/\n\n/**\n * @name check_blob\n *\n * @return true if blob has a non whole outline.\n */\nstatic int check_blob(TBLOB *blob) {\n  TESSLINE *outline;\n  EDGEPT *edgept;\n\n  for (outline = blob->outlines; outline != nullptr; outline = outline->next) {\n    edgept = outline->loop;\n    do {\n      if (edgept == nullptr) {\n        break;\n      }\n      edgept = edgept->next;\n    } while (edgept != outline->loop);\n    if (edgept == nullptr) {\n      return 1;\n    }\n  }\n  return 0;\n}\n\n/**\n * @name any_shared_split_points\n *\n * Return true if any of the splits share a point with this one.\n */\nstatic int any_shared_split_points(const std::vector<SEAM *> &seams, SEAM *seam) {\n  int length;\n  int index;\n\n  length = seams.size();\n  for (index = 0; index < length; index++) {\n    if (seam->SharesPosition(*seams[index])) {\n      return true;\n    }\n  }\n  return false;\n}\n\n/**\n * @name preserve_outline_tree\n *\n * Copy the list of outlines.\n */\nstatic void preserve_outline(EDGEPT *start) {\n  EDGEPT *srcpt;\n\n  if (start == nullptr) {\n    return;\n  }\n  srcpt = start;\n  do {\n    srcpt->runlength = 1;\n    srcpt = srcpt->next;\n  } while (srcpt != start);\n  srcpt->runlength = 2;\n}\n\nstatic void preserve_outline_tree(TESSLINE *srcline) {\n  TESSLINE *outline;\n\n  for (outline = srcline; outline != nullptr; outline = outline->next) {\n    preserve_outline(outline->loop);\n  }\n}\n\n/**\n * @name restore_outline_tree\n *\n * Copy the list of outlines.\n */\nstatic EDGEPT *restore_outline(EDGEPT *start) {\n  EDGEPT *srcpt;\n  EDGEPT *real_start;\n\n  if (start == nullptr) 
{\n    return nullptr;\n  }\n  srcpt = start;\n  do {\n    if (srcpt->runlength == 2) {\n      break;\n    }\n    srcpt = srcpt->next;\n  } while (srcpt != start);\n  real_start = srcpt;\n  do {\n    srcpt = srcpt->next;\n    if (srcpt->prev->runlength == 0) {\n      remove_edgept(srcpt->prev);\n    }\n  } while (srcpt != real_start);\n  return real_start;\n}\n\nstatic void restore_outline_tree(TESSLINE *srcline) {\n  TESSLINE *outline;\n\n  for (outline = srcline; outline != nullptr; outline = outline->next) {\n    outline->loop = restore_outline(outline->loop);\n    outline->start = outline->loop->pos;\n  }\n}\n\n/**********************************************************************\n * total_containment\n *\n * Check to see if one of these outlines is totally contained within\n * the bounding box of the other.\n **********************************************************************/\nstatic int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {\n  TBOX box1 = blob1->bounding_box();\n  TBOX box2 = blob2->bounding_box();\n  return box1.contains(box2) || box2.contains(box1);\n}\n\n// Helper runs all the checks on a seam to make sure it is valid.\n// Returns the seam if OK, otherwise deletes the seam and returns nullptr.\nstatic SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob,\n                       TBLOB *other_blob, const std::vector<SEAM *> &seams, SEAM *seam) {\n  if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||\n      total_containment(blob, other_blob) || check_blob(other_blob) ||\n      !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||\n      any_shared_split_points(seams, seam) ||\n      !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {\n    word->blobs.erase(word->blobs.begin() + blob_number + 1);\n    if (seam) {\n      seam->UndoSeam(blob, other_blob);\n      delete seam;\n      seam = nullptr;\n#ifndef GRAPHICS_DISABLED\n      if 
(debug_level) {\n        if (debug_level > 2) {\n          display_blob(blob, ScrollView::RED);\n        }\n        tprintf(\"\\n** seam being removed ** \\n\");\n      }\n#endif\n    } else {\n      delete other_blob;\n    }\n    return nullptr;\n  }\n  return seam;\n}\n\n/**\n * @name attempt_blob_chop\n *\n * Try to split the this blob after this one.  Check to make sure that\n * it was successful.\n */\nSEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,\n                                 const std::vector<SEAM *> &seams) {\n  if (repair_unchopped_blobs) {\n    preserve_outline_tree(blob->outlines);\n  }\n  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */\n  // Insert it into the word.\n  word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);\n\n  SEAM *seam = nullptr;\n  if (prioritize_division) {\n    TPOINT location;\n    if (divisible_blob(blob, italic_blob, &location)) {\n      seam = new SEAM(0.0f, location);\n    }\n  }\n  if (seam == nullptr) {\n    seam = pick_good_seam(blob);\n  }\n  if (chop_debug) {\n    if (seam != nullptr) {\n      seam->Print(\"Good seam picked=\");\n    } else {\n      tprintf(\"\\n** no seam picked *** \\n\");\n    }\n  }\n  if (seam) {\n    seam->ApplySeam(italic_blob, blob, other_blob);\n  }\n\n  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);\n  if (seam == nullptr) {\n    if (repair_unchopped_blobs) {\n      restore_outline_tree(blob->outlines);\n    }\n    if (allow_blob_division && !prioritize_division) {\n      // If the blob can simply be divided into outlines, then do that.\n      TPOINT location;\n      if (divisible_blob(blob, italic_blob, &location)) {\n        other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */\n        word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);\n        seam = new SEAM(0.0f, location);\n        seam->ApplySeam(italic_blob, blob, other_blob);\n        
seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);\n      }\n    }\n  }\n  if (seam != nullptr) {\n    // Make sure this seam doesn't get chopped again.\n    seam->Finalize();\n  }\n  return seam;\n}\n\nSEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,\n                                  const std::vector<SEAM *> &seams) {\n  return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams);\n}\n\nSEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob,\n                                     WERD_RES *word_res, unsigned *blob_number) {\n  TWERD *word = word_res->chopped_word;\n  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {\n    TBLOB *blob = word->blobs[*blob_number];\n    TPOINT topleft, botright;\n    topleft.x = blob->bounding_box().left();\n    topleft.y = blob->bounding_box().top();\n    botright.x = blob->bounding_box().right();\n    botright.y = blob->bounding_box().bottom();\n\n    TPOINT original_topleft, original_botright;\n    word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);\n    word_res->denorm.DenormTransform(nullptr, botright, &original_botright);\n\n    TBOX original_box =\n        TBOX(original_topleft.x, original_botright.y, original_botright.x, original_topleft.y);\n\n    bool almost_equal_box = false;\n    int num_overlap = 0;\n    for (auto &&boxe : boxes) {\n      if (original_box.overlap_fraction(boxe) > 0.125) {\n        num_overlap++;\n      }\n      if (original_box.almost_equal(boxe, 3)) {\n        almost_equal_box = true;\n      }\n    }\n\n    TPOINT location;\n    if (divisible_blob(blob, italic_blob, &location) || (!almost_equal_box && num_overlap > 1)) {\n      SEAM *seam = attempt_blob_chop(word, blob, *blob_number, italic_blob, word_res->seam_array);\n      if (seam != nullptr) {\n        return seam;\n      }\n    }\n  }\n\n  *blob_number = UINT_MAX;\n  return 
nullptr;\n}\n\n/**\n * @name improve_one_blob\n *\n * Finds the best place to chop, based on the worst blob, fixpt, or next to\n * a fragment, according to the input. Returns the SEAM corresponding to the\n * chop point, if any is found, and the index in the ratings_matrix of the\n * chopped blob. Note that blob_choices is just a copy of the pointers in the\n * leading diagonal of the ratings MATRIX.\n * Although the blob is chopped, the returned SEAM is yet to be inserted into\n * word->seam_array and the resulting blobs are unclassified, so this function\n * can be used by ApplyBox as well as during recognition.\n */\nSEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,\n                                bool split_next_to_fragment, bool italic_blob, WERD_RES *word,\n                                unsigned *blob_number) {\n  float rating_ceiling = FLT_MAX;\n  SEAM *seam = nullptr;\n  do {\n    auto blob = select_blob_to_split_from_fixpt(fixpt);\n    if (chop_debug) {\n      tprintf(\"blob_number from fixpt = %d\\n\", blob);\n    }\n    bool split_point_from_dict = (blob != -1);\n    if (split_point_from_dict) {\n      fixpt->clear();\n    } else {\n      blob = select_blob_to_split(blob_choices, rating_ceiling, split_next_to_fragment);\n    }\n    if (chop_debug) {\n      tprintf(\"blob_number = %d\\n\", blob);\n    }\n    *blob_number = blob;\n    if (blob == -1) {\n      return nullptr;\n    }\n\n    // TODO(rays) it may eventually help to allow italic_blob to be true,\n    seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob, word->seam_array);\n    if (seam != nullptr) {\n      break; // Success!\n    }\n    if (blob_choices[*blob_number] == nullptr) {\n      return nullptr;\n    }\n    if (!split_point_from_dict) {\n      // We chopped the worst rated blob, try something else next time.\n      rating_ceiling = blob_choices[*blob_number]->rating();\n    }\n  } while (true);\n  return 
seam;\n}\n\n/**\n * @name chop_one_blob\n *\n * Start with the current one-blob word and its classification.  Find\n * the worst blobs and try to divide it up to improve the ratings.\n * Used for testing chopper.\n */\nSEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,\n                             const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,\n                             unsigned *blob_number) {\n  if (prioritize_division) {\n    return chop_overlapping_blob(boxes, true, word_res, blob_number);\n  } else {\n    return improve_one_blob(blob_choices, nullptr, false, true, word_res, blob_number);\n  }\n}\n\n/**\n * @name chop_word_main\n *\n * Classify the blobs in this word and permute the results.  Find the\n * worst blob in the word and chop it up.  Continue this process until\n * a good answer has been found or all the blobs have been chopped up\n * enough.  The results are returned in the WERD_RES.\n */\nvoid Wordrec::chop_word_main(WERD_RES *word) {\n  int num_blobs = word->chopped_word->NumBlobs();\n  if (word->ratings == nullptr) {\n    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);\n  }\n  if (word->ratings->get(0, 0) == nullptr) {\n    // Run initial classification.\n    for (int b = 0; b < num_blobs; ++b) {\n      BLOB_CHOICE_LIST *choices = classify_piece(\n          word->seam_array, b, b, \"Initial:\", word->chopped_word, word->blamer_bundle);\n      word->ratings->put(b, b, choices);\n    }\n  } else {\n    // Blobs have been pre-classified. 
Set matrix cell for all blob choices\n    for (int col = 0; col < word->ratings->dimension(); ++col) {\n      for (int row = col;\n           row < word->ratings->dimension() && row < col + word->ratings->bandwidth(); ++row) {\n        BLOB_CHOICE_LIST *choices = word->ratings->get(col, row);\n        if (choices != nullptr) {\n          BLOB_CHOICE_IT bc_it(choices);\n          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {\n            bc_it.data()->set_matrix_cell(col, row);\n          }\n        }\n      }\n    }\n  }\n\n  // Run Segmentation Search.\n  BestChoiceBundle best_choice_bundle(word->ratings->dimension());\n  SegSearch(word, &best_choice_bundle, word->blamer_bundle);\n\n  if (word->best_choice == nullptr) {\n    // SegSearch found no valid paths, so just use the leading diagonal.\n    word->FakeWordFromRatings(TOP_CHOICE_PERM);\n  }\n  word->RebuildBestState();\n  // If we finished without a hyphen at the end of the word, let the next word\n  // be found in the dictionary.\n  if (word->word->flag(W_EOL) && !getDict().has_hyphen_end(*word->best_choice)) {\n    getDict().reset_hyphen_vars(true);\n  }\n\n  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {\n    CallFillLattice(*word->ratings, word->best_choices, *word->uch_set, word->blamer_bundle);\n  }\n  if (wordrec_debug_level > 0) {\n    tprintf(\"Final Ratings Matrix:\\n\");\n    word->ratings->print(getDict().getUnicharset());\n  }\n  word->FilterWordChoices(getDict().stopper_debug_level);\n}\n\n/**\n * @name improve_by_chopping\n *\n * Repeatedly chops the worst blob, classifying the new blobs fixing up all\n * the data, and incrementally runs the segmentation search until a good word\n * is found, or no more chops can be found.\n */\nvoid Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,\n                                  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,\n                                  
LMPainPoints *pain_points,\n                                  std::vector<SegSearchPending> *pending) {\n  unsigned blob_number;\n  do { // improvement loop.\n    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which\n    // one to chop.\n    std::vector<BLOB_CHOICE *> blob_choices;\n    int num_blobs = word->ratings->dimension();\n    for (int i = 0; i < num_blobs; ++i) {\n      BLOB_CHOICE_LIST *choices = word->ratings->get(i, i);\n      if (choices == nullptr || choices->empty()) {\n        blob_choices.push_back(nullptr);\n      } else {\n        BLOB_CHOICE_IT bc_it(choices);\n        blob_choices.push_back(bc_it.data());\n      }\n    }\n    SEAM *seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt, false, false, word,\n                                  &blob_number);\n    if (seam == nullptr) {\n      break;\n    }\n    // A chop has been made. We have to correct all the data structures to\n    // take into account the extra bottom-level blob.\n    // Put the seam into the seam_array and correct everything else on the\n    // word: ratings matrix (including matrix location in the BLOB_CHOICES),\n    // states in WERD_CHOICEs, and blob widths.\n    word->InsertSeam(blob_number, seam);\n    // Insert a new entry in the beam array.\n    best_choice_bundle->beam.insert(best_choice_bundle->beam.begin() + blob_number, new LanguageModelState);\n    // Fixpts are outdated, but will get recalculated.\n    best_choice_bundle->fixpt.clear();\n    // Remap existing pain points.\n    pain_points->RemapForSplit(blob_number);\n    // Insert a new pending at the chop point.\n    pending->insert(pending->begin() + blob_number, SegSearchPending());\n\n    // Classify the two newly created blobs using ProcessSegSearchPainPoint,\n    // as that updates the pending correctly and adds new pain points.\n    MATRIX_COORD pain_point(blob_number, blob_number);\n    ProcessSegSearchPainPoint(0.0f, pain_point, \"Chop1\", pending, word, pain_points, 
blamer_bundle);\n    pain_point.col = blob_number + 1;\n    pain_point.row = blob_number + 1;\n    ProcessSegSearchPainPoint(0.0f, pain_point, \"Chop2\", pending, word, pain_points, blamer_bundle);\n    if (language_model_->language_model_ngram_on) {\n      // N-gram evaluation depends on the number of blobs in a chunk, so we\n      // have to re-evaluate everything in the word.\n      ResetNGramSearch(word, best_choice_bundle, *pending);\n      blob_number = 0;\n    }\n    // Run language model incrementally. (Except with the n-gram model on.)\n    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word, pain_points,\n                         best_choice_bundle, blamer_bundle);\n  } while (!language_model_->AcceptableChoiceFound() && word->ratings->dimension() < kMaxNumChunks);\n\n  // If after running only the chopper best_choice is incorrect and no blame\n  // has been yet set, blame the classifier if best_choice is classifier's\n  // top choice and is a dictionary word (i.e. language model could not have\n  // helped). Otherwise blame the tradeoff between the classifier and\n  // the old language model (permuters).\n  if (word->blamer_bundle != nullptr &&\n      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&\n      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {\n    bool valid_permuter = word->best_choice != nullptr &&\n                          Dict::valid_word_permuter(word->best_choice->permuter(), false);\n    word->blamer_bundle->BlameClassifierOrLangModel(word, getDict().getUnicharset(), valid_permuter,\n                                                    wordrec_debug_blamer);\n  }\n}\n\n/**********************************************************************\n * select_blob_to_split\n *\n * These are the results of the last classification.  Find a likely\n * place to apply splits.  
If none, return -1.\n **********************************************************************/\nint Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,\n                                  float rating_ceiling, bool split_next_to_fragment) {\n  BLOB_CHOICE *blob_choice;\n  float worst = -FLT_MAX;\n  int worst_index = -1;\n  float worst_near_fragment = -FLT_MAX;\n  int worst_index_near_fragment = -1;\n  std::vector<const CHAR_FRAGMENT *> fragments;\n\n  if (chop_debug) {\n    if (rating_ceiling < FLT_MAX) {\n      tprintf(\"rating_ceiling = %8.4f\\n\", rating_ceiling);\n    } else {\n      tprintf(\"rating_ceiling = No Limit\\n\");\n    }\n  }\n\n  if (split_next_to_fragment && blob_choices.size() > 0) {\n    fragments.resize(blob_choices.size());\n    if (blob_choices[0] != nullptr) {\n      fragments[0] = getDict().getUnicharset().get_fragment(blob_choices[0]->unichar_id());\n    } else {\n      fragments[0] = nullptr;\n    }\n  }\n\n  for (unsigned x = 0; x < blob_choices.size(); ++x) {\n    if (blob_choices[x] == nullptr) {\n      return x;\n    } else {\n      blob_choice = blob_choices[x];\n      // Populate fragments for the following position.\n      if (split_next_to_fragment && x + 1 < blob_choices.size()) {\n        if (blob_choices[x + 1] != nullptr) {\n          fragments[x + 1] =\n              getDict().getUnicharset().get_fragment(blob_choices[x + 1]->unichar_id());\n        } else {\n          fragments[x + 1] = nullptr;\n        }\n      }\n      if (blob_choice->rating() < rating_ceiling &&\n          blob_choice->certainty() < tessedit_certainty_threshold) {\n        // Update worst and worst_index.\n        if (blob_choice->rating() > worst) {\n          worst_index = x;\n          worst = blob_choice->rating();\n        }\n        if (split_next_to_fragment) {\n          // Update worst_near_fragment and worst_index_near_fragment.\n          bool expand_following_fragment =\n              (x + 1 < blob_choices.size() && 
fragments[x + 1] != nullptr &&\n               !fragments[x + 1]->is_beginning());\n          bool expand_preceding_fragment =\n              (x > 0 && fragments[x - 1] != nullptr && !fragments[x - 1]->is_ending());\n          if ((expand_following_fragment || expand_preceding_fragment) &&\n              blob_choice->rating() > worst_near_fragment) {\n            worst_index_near_fragment = x;\n            worst_near_fragment = blob_choice->rating();\n            if (chop_debug) {\n              tprintf(\n                  \"worst_index_near_fragment=%d\"\n                  \" expand_following_fragment=%d\"\n                  \" expand_preceding_fragment=%d\\n\",\n                  worst_index_near_fragment, expand_following_fragment, expand_preceding_fragment);\n            }\n          }\n        }\n      }\n    }\n  }\n  // TODO(daria): maybe a threshold of badness for\n  // worst_near_fragment would be useful.\n  return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index;\n}\n\n/**********************************************************************\n * select_blob_to_split_from_fixpt\n *\n * Given the fix point from a dictionary search, if there is a single\n * dangerous blob that maps to multiple characters, return that blob\n * index as a place we need to split.  If none, return -1.\n **********************************************************************/\nint Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {\n  if (!fixpt) {\n    return -1;\n  }\n  for (auto &i : *fixpt) {\n    if (i.begin + 1 == i.end && i.dangerous && i.correct_is_ngram) {\n      return i.begin;\n    }\n  }\n  return -1;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/drawfx.cpp",
    "content": "/**********************************************************************\n * File:        drawfx.cpp\n * Description: Draw things to do with feature extraction.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"drawfx.h\"\n\n#include \"normalis.h\"\n#include \"werd.h\"\n\nnamespace tesseract {\n\n#ifndef GRAPHICS_DISABLED\n\n#  define FXDEMOWIN \"FXDemo\"\n#  define FXDEMOXPOS 250\n#  define FXDEMOYPOS 0\n#  define FXDEMOXSIZE 600\n#  define FXDEMOYSIZE 256\n#  define BLN_MAX 512 // max coord for bln\n#  define WERDWIDTH (BLN_MAX * 20)\n// title of window\n#  define DEBUG_WIN_NAME \"FXDebug\"\n\nScrollView *fx_win = nullptr;\n\n/**********************************************************************\n * create_fx_win\n *\n * Create the fx window used to show the fit.\n **********************************************************************/\n\nvoid create_fx_win() { // make features win\n  fx_win = new ScrollView(FXDEMOWIN, FXDEMOXPOS, FXDEMOYPOS, FXDEMOXSIZE, FXDEMOYSIZE,\n                          WERDWIDTH * 2, BLN_MAX * 2, true);\n}\n\n/**********************************************************************\n * clear_fx_win\n *\n * Clear the fx window and draw on the base/mean lines.\n 
**********************************************************************/\n\nvoid clear_fx_win() { // make features win\n  fx_win->Clear();\n  fx_win->Pen(64, 64, 64);\n  fx_win->Line(-WERDWIDTH, kBlnBaselineOffset, WERDWIDTH, kBlnBaselineOffset);\n  fx_win->Line(-WERDWIDTH, kBlnXHeight + kBlnBaselineOffset, WERDWIDTH,\n               kBlnXHeight + kBlnBaselineOffset);\n}\n\n#endif // !GRAPHICS_DISABLED\n\n/**********************************************************************\n * create_fxdebug_win\n *\n * Create the fx window used to show the fit.\n **********************************************************************/\n\nvoid create_fxdebug_win() { // make gradients win\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/drawfx.h",
    "content": "/**********************************************************************\n * File:        drawfx.h\n * Description: Draw things to do with feature extraction.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#ifndef DRAWFX_H\n#define DRAWFX_H\n\n#include \"params.h\"\n#include \"scrollview.h\"\n\nnamespace tesseract {\n\n#ifndef GRAPHICS_DISABLED\nextern ScrollView *fx_win;\n#endif // !GRAPHICS_DISABLED\nvoid create_fx_win();      // make features win\nvoid clear_fx_win();       // make features win\nvoid create_fxdebug_win(); // make gradients win\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/wordrec/findseam.cpp",
    "content": "/******************************************************************************\n *\n * File:         findseam.cpp  (Formerly findseam.c)\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n#include \"findseam.h\"\n#include \"outlines.h\"\n#include \"plotedges.h\"\n#include \"seam.h\"\n#include \"wordrec.h\"\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n/**********************************************************************\n * partial_split_priority\n *\n * Assign a priority to this split based on the features that it has.\n * Grade it according to the different rating schemes and return the\n * value of its goodness.\n **********************************************************************/\n\n#define partial_split_priority(split) (grade_split_length(split) + grade_sharpness(split))\n\n/*----------------------------------------------------------------------\n              T y p e s\n----------------------------------------------------------------------*/\n#define SPLIT_CLOSENESS 20 /* 
Difference in x value */\n                           /* How many to keep */\n#define MAX_NUM_SEAMS 150\n/* How many to keep */\n#define NO_FULL_PRIORITY (-1) // Special marker for pri.\n                            /* Evaluate right away */\n#define BAD_PRIORITY 9999.0\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\nnamespace tesseract {\n\n/**********************************************************************\n * add_seam_to_queue\n *\n * Adds the given new_seam to the seams priority queue, unless it is full\n * and the new seam is worse than the worst.\n **********************************************************************/\nvoid Wordrec::add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams) {\n  if (new_seam == nullptr) {\n    return;\n  }\n  if (chop_debug) {\n    tprintf(\"Pushing new seam with priority %g :\", new_priority);\n    new_seam->Print(\"seam: \");\n  }\n  if (seams->size() >= MAX_NUM_SEAMS) {\n    SeamPair old_pair(0, nullptr);\n    if (seams->PopWorst(&old_pair) && old_pair.key() <= new_priority) {\n      if (chop_debug) {\n        tprintf(\"Old seam staying with priority %g\\n\", old_pair.key());\n      }\n      delete new_seam;\n      seams->Push(&old_pair);\n      return;\n    } else if (chop_debug) {\n      tprintf(\"New seam with priority %g beats old worst seam with %g\\n\", new_priority,\n              old_pair.key());\n    }\n  }\n  SeamPair new_pair(new_priority, new_seam);\n  seams->Push(&new_pair);\n}\n\n/**********************************************************************\n * choose_best_seam\n *\n * Choose the best seam that can be created by assembling this a\n * collection of splits.  A queue of all the possible seams is\n * maintained.  Each new split received is placed in that queue with\n * its partial priority value.  
These values in the seam queue are\n * evaluated and combined until a good enough seam is found.  If no\n * further good seams are being found then this function returns to the\n * caller, who will send more splits.  If this function is called with\n * a split of nullptr, then no further splits can be supplied by the\n * caller.\n **********************************************************************/\nvoid Wordrec::choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority,\n                               SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) {\n  SEAM *seam;\n  float my_priority;\n  /* Add seam of split */\n  my_priority = priority;\n  if (split != nullptr) {\n    TPOINT split_point = split->point1->pos;\n    split_point += split->point2->pos;\n    split_point /= 2;\n    seam = new SEAM(my_priority, split_point, *split);\n    if (chop_debug > 1) {\n      seam->Print(\"Partial priority    \");\n    }\n    add_seam_to_queue(my_priority, seam, seam_queue);\n\n    if (my_priority > chop_good_split) {\n      return;\n    }\n  }\n\n  TBOX bbox = blob->bounding_box();\n  /* Queue loop */\n  while (!seam_queue->empty()) {\n    SeamPair seam_pair;\n    seam_queue->Pop(&seam_pair);\n    seam = seam_pair.extract_data();\n    /* Set full priority */\n    my_priority =\n        seam->FullPriority(bbox.left(), bbox.right(), chop_overlap_knob, chop_centered_maxwidth,\n                           chop_center_knob, chop_width_change_knob);\n    if (chop_debug) {\n      char str[80];\n      snprintf(str, sizeof(str), \"Full my_priority %0.0f,  \", my_priority);\n      seam->Print(str);\n    }\n\n    if ((*seam_result == nullptr || (*seam_result)->priority() > my_priority) &&\n        my_priority < chop_ok_split) {\n      /* No crossing */\n      if (seam->IsHealthy(*blob, chop_min_outline_points, chop_min_outline_area)) {\n        delete *seam_result;\n        *seam_result = new SEAM(*seam);\n        (*seam_result)->set_priority(my_priority);\n    
  } else {\n        delete seam;\n        seam = nullptr;\n        my_priority = BAD_PRIORITY;\n      }\n    }\n\n    if (my_priority < chop_good_split) {\n      delete seam;\n      return; /* Made good answer */\n    }\n\n    if (seam) {\n      /* Combine with others */\n      if (seam_pile->size() < chop_seam_pile_size) {\n        combine_seam(*seam_pile, seam, seam_queue);\n        SeamDecPair pair(seam_pair.key(), seam);\n        seam_pile->Push(&pair);\n      } else if (chop_new_seam_pile && seam_pile->size() == chop_seam_pile_size &&\n                 seam_pile->PeekTop().key() > seam_pair.key()) {\n        combine_seam(*seam_pile, seam, seam_queue);\n        SeamDecPair pair;\n        seam_pile->Pop(&pair); // pop the worst.\n        // Replace the seam in pair (deleting the old one) with\n        // the new seam and score, then push back into the heap.\n        pair.set_key(seam_pair.key());\n        pair.set_data(seam);\n        seam_pile->Push(&pair);\n      } else {\n        delete seam;\n      }\n    }\n\n    my_priority = seam_queue->empty() ? NO_FULL_PRIORITY : seam_queue->PeekTop().key();\n    if ((my_priority > chop_ok_split) || (my_priority > chop_good_split && split)) {\n      return;\n    }\n  }\n}\n\n/**********************************************************************\n * combine_seam\n *\n * Find other seams to combine with this one.  The new seams that result\n * from this union should be added to the seam queue.  
The return value\n * tells whether or not any additional seams were added to the queue.\n **********************************************************************/\nvoid Wordrec::combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue) {\n  for (int x = 0; x < seam_pile.size(); ++x) {\n    const SEAM *this_one = seam_pile.get(x).data();\n    if (seam->CombineableWith(*this_one, SPLIT_CLOSENESS, chop_ok_split)) {\n      SEAM *new_one = new SEAM(*seam);\n      new_one->CombineWith(*this_one);\n      if (chop_debug > 1) {\n        new_one->Print(\"Combo priority       \");\n      }\n      add_seam_to_queue(new_one->priority(), new_one, seam_queue);\n    }\n  }\n}\n\n/**********************************************************************\n * pick_good_seam\n *\n * Find and return a good seam that will split this blob into two pieces.\n * Work from the outlines provided.\n **********************************************************************/\nSEAM *Wordrec::pick_good_seam(TBLOB *blob) {\n  SeamPile seam_pile(chop_seam_pile_size);\n  EDGEPT *points[MAX_NUM_POINTS];\n  EDGEPT_CLIST new_points;\n  SEAM *seam = nullptr;\n  TESSLINE *outline;\n  int16_t num_points = 0;\n\n#ifndef GRAPHICS_DISABLED\n  if (chop_debug > 2) {\n    wordrec_display_splits.set_value(true);\n  }\n\n  draw_blob_edges(blob);\n#endif\n\n  PointHeap point_heap(MAX_NUM_POINTS);\n  for (outline = blob->outlines; outline; outline = outline->next) {\n    prioritize_points(outline, &point_heap);\n  }\n\n  while (!point_heap.empty() && num_points < MAX_NUM_POINTS) {\n    points[num_points++] = point_heap.PeekTop().data();\n    point_heap.Pop(nullptr);\n  }\n\n  /* Initialize queue */\n  SeamQueue seam_queue(MAX_NUM_SEAMS);\n\n  try_point_pairs(points, num_points, &seam_queue, &seam_pile, &seam, blob);\n  try_vertical_splits(points, num_points, &new_points, &seam_queue, &seam_pile, &seam, blob);\n\n  if (seam == nullptr) {\n    choose_best_seam(&seam_queue, nullptr, BAD_PRIORITY, &seam, 
blob, &seam_pile);\n  } else if (seam->priority() > chop_good_split) {\n    choose_best_seam(&seam_queue, nullptr, seam->priority(), &seam, blob, &seam_pile);\n  }\n\n  EDGEPT_C_IT it(&new_points);\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    EDGEPT *inserted_point = it.data();\n    if (seam == nullptr || !seam->UsesPoint(inserted_point)) {\n      for (outline = blob->outlines; outline; outline = outline->next) {\n        if (outline->loop == inserted_point) {\n          outline->loop = outline->loop->next;\n        }\n      }\n      remove_edgept(inserted_point);\n    }\n  }\n\n  if (seam) {\n    if (seam->priority() > chop_ok_split) {\n      delete seam;\n      seam = nullptr;\n    }\n#ifndef GRAPHICS_DISABLED\n    else if (wordrec_display_splits) {\n      seam->Mark(edge_window);\n      if (chop_debug > 2) {\n        edge_window->Update();\n        edge_window->Wait();\n      }\n    }\n#endif\n  }\n\n  if (chop_debug) {\n    wordrec_display_splits.set_value(false);\n  }\n\n  return (seam);\n}\n\n/**********************************************************************\n * try_point_pairs\n *\n * Try all the splits that are produced by pairing critical points\n * together.  See if any of them are suitable for use.  
Use a seam\n * queue and seam pile that have already been initialized and used.\n **********************************************************************/\nvoid Wordrec::try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,\n                              SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam,\n                              TBLOB *blob) {\n  int16_t x;\n  int16_t y;\n  PRIORITY priority;\n\n  for (x = 0; x < num_points; x++) {\n    for (y = x + 1; y < num_points; y++) {\n      if (points[y] &&\n          points[x]->WeightedDistance(*points[y], chop_x_y_weight) < chop_split_length &&\n          points[x] != points[y]->next && points[y] != points[x]->next &&\n          !is_exterior_point(points[x], points[y]) && !is_exterior_point(points[y], points[x])) {\n        SPLIT split(points[x], points[y]);\n        priority = partial_split_priority(&split);\n\n        choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);\n      }\n    }\n  }\n}\n\n/**********************************************************************\n * try_vertical_splits\n *\n * Try all the splits that are produced by vertical projection to see\n * if any of them are suitable for use.  
Use a seam queue and seam pile\n * that have already been initialized and used.\n * Return in new_points a collection of points that were inserted into\n * the blob while examining vertical splits and which may safely be\n * removed once a seam is chosen if they are not part of the seam.\n **********************************************************************/\nvoid Wordrec::try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,\n                                  EDGEPT_CLIST *new_points, SeamQueue *seam_queue,\n                                  SeamPile *seam_pile, SEAM **seam, TBLOB *blob) {\n  EDGEPT *vertical_point = nullptr;\n  int16_t x;\n  PRIORITY priority;\n  TESSLINE *outline;\n\n  for (x = 0; x < num_points; x++) {\n    vertical_point = nullptr;\n    for (outline = blob->outlines; outline; outline = outline->next) {\n      vertical_projection_point(points[x], outline->loop, &vertical_point, new_points);\n    }\n\n    if (vertical_point && points[x] != vertical_point->next && vertical_point != points[x]->next &&\n        points[x]->WeightedDistance(*vertical_point, chop_x_y_weight) < chop_split_length) {\n      SPLIT split(points[x], vertical_point);\n      priority = partial_split_priority(&split);\n      choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);\n    }\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/findseam.h",
    "content": "/******************************************************************************\n *\n * File:        findseam.h\n * Author:      Mark Seaman, SW Productivity\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef FINDSEAM_H\n#define FINDSEAM_H\n\n#include \"chop.h\"\n#include \"genericheap.h\"\n#include \"kdpair.h\"\n#include \"seam.h\"\n\nnamespace tesseract {\n\n// The SeamPair elements own their SEAMs and delete them upon destruction.\nusing SeamPair = KDPtrPairInc<float, SEAM>;\nusing SeamQueue = GenericHeap<SeamPair>;\n\nusing SeamDecPair = KDPtrPairDec<float, SEAM>;\nusing SeamPile = GenericHeap<SeamDecPair>;\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/wordrec/gradechop.cpp",
    "content": "/******************************************************************************\n *\n * File:         gradechop.cpp  (Formerly gradechop.c)\n * Description:\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n\n#include <algorithm>\n#include <cmath>\n#include \"chop.h\"\n#include \"wordrec.h\"\n\n/*----------------------------------------------------------------------\n              M a c r o s\n----------------------------------------------------------------------*/\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n\n/**********************************************************************\n * grade_split_length\n *\n * Return a grade for the length of this split.\n *   0    =  \"perfect\"\n *   100  =  \"no way jay\"\n **********************************************************************/\nPRIORITY Wordrec::grade_split_length(SPLIT *split) {\n  PRIORITY grade;\n  float split_length;\n\n  split_length = 
split->point1->WeightedDistance(*split->point2, chop_x_y_weight);\n\n  if (split_length <= 0) {\n    grade = 0;\n  } else {\n    grade = std::sqrt(split_length) * chop_split_dist_knob;\n  }\n\n  return (std::max(0.0f, grade));\n}\n\n/**********************************************************************\n * grade_sharpness\n *\n * Return a grade for the sharpness of this split.\n *   0    =  \"perfect\"\n *   100  =  \"no way jay\"\n **********************************************************************/\nPRIORITY Wordrec::grade_sharpness(SPLIT *split) {\n  PRIORITY grade;\n\n  grade = point_priority(split->point1) + point_priority(split->point2);\n\n  if (grade < -360.0) {\n    grade = 0;\n  } else {\n    grade += 360.0;\n  }\n\n  grade *= chop_sharpness_knob; /* Values 0 to -360 */\n\n  return (grade);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/language_model.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        language_model.cpp\n// Description: Functions that utilize the knowledge about the properties,\n//              structure and statistics of the language to help recognition.\n// Author:      Daria Antonova\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"language_model.h\"\n#include <tesseract/unichar.h>       // for UNICHAR_ID, INVALID_UNICHAR_ID\n#include <cassert>                   // for assert\n#include <cmath>                     // for log2, pow\n#include \"blamer.h\"                  // for BlamerBundle\n#include \"ccutil.h\"                  // for CCUtil\n#include \"dawg.h\"                    // for NO_EDGE, Dawg, Dawg::kPatternUn...\n#include \"errcode.h\"                 // for ASSERT_HOST\n#include \"lm_state.h\"                // for ViterbiStateEntry, ViterbiState...\n#include \"matrix.h\"                  // for MATRIX_COORD\n#include \"pageres.h\"                 // for WERD_RES\n#include \"params.h\"                  // for IntParam, BoolParam, DoubleParam\n#include \"params_training_featdef.h\" // for ParamsTrainingHypothesis, PTRAI...\n#include \"tprintf.h\"                 // for tprintf\n#include \"unicharset.h\"              // for UNICHARSET\n#include \"unicity_table.h\"           // for 
UnicityTable\n\ntemplate <typename T>\nclass UnicityTable;\n\nnamespace tesseract {\n\nclass LMPainPoints;\nstruct FontInfo;\n\n#if defined(ANDROID)\nstatic inline double log2(double n) {\n  return log(n) / log(2.0);\n}\n#endif // ANDROID\n\nconst float LanguageModel::kMaxAvgNgramCost = 25.0f;\n\nLanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict)\n    : INT_MEMBER(language_model_debug_level, 0, \"Language model debug level\",\n                 dict->getCCUtil()->params())\n    , BOOL_INIT_MEMBER(language_model_ngram_on, false,\n                       \"Turn on/off the use of character ngram model\", dict->getCCUtil()->params())\n    , INT_MEMBER(language_model_ngram_order, 8, \"Maximum order of the character ngram model\",\n                 dict->getCCUtil()->params())\n    , INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,\n                 \"Maximum number of prunable (those for which\"\n                 \" PrunablePath() is true) entries in each viterbi list\"\n                 \" recorded in BLOB_CHOICEs\",\n                 dict->getCCUtil()->params())\n    , INT_MEMBER(language_model_viterbi_list_max_size, 500,\n                 \"Maximum size of viterbi lists recorded in BLOB_CHOICEs\",\n                 dict->getCCUtil()->params())\n    , double_MEMBER(language_model_ngram_small_prob, 0.000001,\n                    \"To avoid overly small denominators use this as the \"\n                    \"floor of the probability returned by the ngram model.\",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_ngram_nonmatch_score, -40.0,\n                    \"Average classifier score of a non-matching unichar.\",\n                    dict->getCCUtil()->params())\n    , BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,\n                  \"Use only the first UTF8 step of the given string\"\n                  \" when computing log probabilities.\",\n                  
dict->getCCUtil()->params())\n    , double_MEMBER(language_model_ngram_scale_factor, 0.03,\n                    \"Strength of the character ngram model relative to the\"\n                    \" character classifier \",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_ngram_rating_factor, 16.0,\n                    \"Factor to bring log-probs into the same range as ratings\"\n                    \" when multiplied by outline length \",\n                    dict->getCCUtil()->params())\n    , BOOL_MEMBER(language_model_ngram_space_delimited_language, true,\n                  \"Words are delimited by space\", dict->getCCUtil()->params())\n    , INT_MEMBER(language_model_min_compound_length, 3, \"Minimum length of compound words\",\n                 dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,\n                    \"Penalty for words not in the frequent word dictionary\",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_non_dict_word, 0.15, \"Penalty for non-dictionary words\",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_punc, 0.2, \"Penalty for inconsistent punctuation\",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_case, 0.1, \"Penalty for inconsistent case\",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_script, 0.5, \"Penalty for inconsistent script\",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_chartype, 0.3, \"Penalty for inconsistent character type\",\n                    dict->getCCUtil()->params())\n    ,\n    // TODO(daria, rays): enable font consistency checking\n    // after improving font analysis.\n    double_MEMBER(language_model_penalty_font, 0.00, \"Penalty for inconsistent font\",\n                  
dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_spacing, 0.05, \"Penalty for inconsistent spacing\",\n                    dict->getCCUtil()->params())\n    , double_MEMBER(language_model_penalty_increment, 0.01, \"Penalty increment\",\n                    dict->getCCUtil()->params())\n    , INT_MEMBER(wordrec_display_segmentations, 0, \"Display Segmentations (ScrollView)\",\n                 dict->getCCUtil()->params())\n    , BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,\n                       \"Use sigmoidal score for certainty\", dict->getCCUtil()->params())\n    , dawg_args_(nullptr, new DawgPositionVector(), NO_PERM)\n    , fontinfo_table_(fontinfo_table)\n    , dict_(dict) {\n  ASSERT_HOST(dict_ != nullptr);\n}\n\nLanguageModel::~LanguageModel() {\n  delete dawg_args_.updated_dawgs;\n}\n\nvoid LanguageModel::InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch,\n                                float max_char_wh_ratio, float rating_cert_scale) {\n  fixed_pitch_ = fixed_pitch;\n  max_char_wh_ratio_ = max_char_wh_ratio;\n  rating_cert_scale_ = rating_cert_scale;\n  acceptable_choice_found_ = false;\n  correct_segmentation_explored_ = false;\n\n  // Initialize vectors with beginning DawgInfos.\n  very_beginning_active_dawgs_.clear();\n  dict_->init_active_dawgs(&very_beginning_active_dawgs_, false);\n  beginning_active_dawgs_.clear();\n  dict_->default_dawgs(&beginning_active_dawgs_, false);\n\n  // Fill prev_word_str_ with the last language_model_ngram_order\n  // unichars from prev_word.\n  if (language_model_ngram_on) {\n    if (prev_word != nullptr && !prev_word->unichar_string().empty()) {\n      prev_word_str_ = prev_word->unichar_string();\n      if (language_model_ngram_space_delimited_language) {\n        prev_word_str_ += ' ';\n      }\n    } else {\n      prev_word_str_ = \" \";\n    }\n    const char *str_ptr = prev_word_str_.c_str();\n    const char *str_end = str_ptr + prev_word_str_.length();\n    
int step;\n    prev_word_unichar_step_len_ = 0;\n    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {\n      str_ptr += step;\n      ++prev_word_unichar_step_len_;\n    }\n    ASSERT_HOST(str_ptr == str_end);\n  }\n}\n\n/**\n * Helper scans the collection of predecessors for competing siblings that\n * have the same letter with the opposite case, setting competing_vse.\n */\nstatic void ScanParentsForCaseMix(const UNICHARSET &unicharset, LanguageModelState *parent_node) {\n  if (parent_node == nullptr) {\n    return;\n  }\n  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);\n  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {\n    ViterbiStateEntry *vse = vit.data();\n    vse->competing_vse = nullptr;\n    UNICHAR_ID unichar_id = vse->curr_b->unichar_id();\n    if (unicharset.get_isupper(unichar_id) || unicharset.get_islower(unichar_id)) {\n      UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);\n      if (other_case == unichar_id) {\n        continue; // Not in unicharset.\n      }\n      // Find other case in same list. 
There could be multiple entries with\n      // the same unichar_id, but in theory, they should all point to the\n      // same BLOB_CHOICE, and that is what we will be using to decide\n      // which to keep.\n      ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);\n      for (vit2.mark_cycle_pt();\n           !vit2.cycled_list() && vit2.data()->curr_b->unichar_id() != other_case; vit2.forward()) {\n      }\n      if (!vit2.cycled_list()) {\n        vse->competing_vse = vit2.data();\n      }\n    }\n  }\n}\n\n/**\n * Helper returns true if the given choice has a better case variant before\n * it in the choice_list that is not distinguishable by size.\n */\nstatic bool HasBetterCaseVariant(const UNICHARSET &unicharset, const BLOB_CHOICE *choice,\n                                 BLOB_CHOICE_LIST *choices) {\n  UNICHAR_ID choice_id = choice->unichar_id();\n  UNICHAR_ID other_case = unicharset.get_other_case(choice_id);\n  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID) {\n    return false; // Not upper or lower or not in unicharset.\n  }\n  if (unicharset.SizesDistinct(choice_id, other_case)) {\n    return false; // Can be separated by size.\n  }\n  BLOB_CHOICE_IT bc_it(choices);\n  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {\n    BLOB_CHOICE *better_choice = bc_it.data();\n    if (better_choice->unichar_id() == other_case) {\n      return true; // Found an earlier instance of other_case.\n    } else if (better_choice == choice) {\n      return false; // Reached the original choice.\n    }\n  }\n  return false; // Should never happen, but just in case.\n}\n\n/**\n * UpdateState has the job of combining the ViterbiStateEntry lists on each\n * of the choices on parent_list with each of the blob choices in curr_list,\n * making a new ViterbiStateEntry for each sensible path.\n *\n * This could be a huge set of combinations, creating a lot of work only to\n * be truncated by some beam limit, but only certain kinds 
of paths will\n * continue at the next step:\n * - paths that are liked by the language model: either a DAWG or the n-gram\n *   model, where active.\n * - paths that represent some kind of top choice. The old permuter permuted\n *   the top raw classifier score, the top upper case word and the top lower-\n *   case word. UpdateState now concentrates its top-choice paths on top\n *   lower-case, top upper-case (or caseless alpha), and top digit sequence,\n *   with allowance for continuation of these paths through blobs where such\n *   a character does not appear in the choices list.\n *\n * GetNextParentVSE enforces some of these models to minimize the number of\n * calls to AddViterbiStateEntry, even prior to looking at the language model.\n * Thus an n-blob sequence of [l1I] will produce 3n calls to\n * AddViterbiStateEntry instead of 3^n.\n *\n * Of course it isn't quite that simple as Title Case is handled by allowing\n * lower case to continue an upper case initial, but it has to be detected\n * in the combiner so it knows which upper case letters are initial alphas.\n */\nbool LanguageModel::UpdateState(bool just_classified, int curr_col, int curr_row,\n                                BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node,\n                                LMPainPoints *pain_points, WERD_RES *word_res,\n                                BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) {\n  if (language_model_debug_level > 0) {\n    tprintf(\"\\nUpdateState: col=%d row=%d %s\", curr_col, curr_row,\n            just_classified ? \"just_classified\" : \"\");\n    if (language_model_debug_level > 5) {\n      tprintf(\"(parent=%p)\\n\", static_cast<void *>(parent_node));\n    } else {\n      tprintf(\"\\n\");\n    }\n  }\n  // Initialize helper variables.\n  bool word_end = (curr_row + 1 >= word_res->ratings->dimension());\n  bool new_changed = false;\n  float denom = (language_model_ngram_on) ? 
ComputeDenom(curr_list) : 1.0f;\n  const UNICHARSET &unicharset = dict_->getUnicharset();\n  BLOB_CHOICE *first_lower = nullptr;\n  BLOB_CHOICE *first_upper = nullptr;\n  BLOB_CHOICE *first_digit = nullptr;\n  bool has_alnum_mix = false;\n  if (parent_node != nullptr) {\n    int result = SetTopParentLowerUpperDigit(parent_node);\n    if (result < 0) {\n      if (language_model_debug_level > 0) {\n        tprintf(\"No parents found to process\\n\");\n      }\n      return false;\n    }\n    if (result > 0) {\n      has_alnum_mix = true;\n    }\n  }\n  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper, &first_digit)) {\n    has_alnum_mix = false;\n  };\n  ScanParentsForCaseMix(unicharset, parent_node);\n  if (language_model_debug_level > 3 && parent_node != nullptr) {\n    parent_node->Print(\"Parent viterbi list\");\n  }\n  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];\n\n  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.\n  ViterbiStateEntry_IT vit;\n  BLOB_CHOICE_IT c_it(curr_list);\n  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {\n    BLOB_CHOICE *choice = c_it.data();\n    // TODO(antonova): make sure commenting this out if ok for ngram\n    // model scoring (I think this was introduced to fix ngram model quirks).\n    // Skip nullptr unichars unless it is the only choice.\n    // if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;\n    UNICHAR_ID unichar_id = choice->unichar_id();\n    if (unicharset.get_fragment(unichar_id)) {\n      continue; // Skip fragments.\n    }\n    // Set top choice flags.\n    LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;\n    if (c_it.at_first() || !new_changed) {\n      blob_choice_flags |= kSmallestRatingFlag;\n    }\n    if (first_lower == choice) {\n      blob_choice_flags |= kLowerCaseFlag;\n    }\n    if (first_upper == choice) {\n      blob_choice_flags |= kUpperCaseFlag;\n    }\n    if (first_digit == 
choice) {\n      blob_choice_flags |= kDigitFlag;\n    }\n\n    if (parent_node == nullptr) {\n      // Process the beginning of a word.\n      // If there is a better case variant that is not distinguished by size,\n      // skip this blob choice, as we have no choice but to accept the result\n      // of the character classifier to distinguish between them, even if\n      // followed by an upper case.\n      // With words like iPoc, and other CamelBackWords, the lower-upper\n      // transition can only be achieved if the classifier has the correct case\n      // as the top choice, and leaving an initial I lower down the list\n      // increases the chances of choosing IPoc simply because it doesn't\n      // include such a transition. iPoc will beat iPOC and ipoc because\n      // the other words are baseline/x-height inconsistent.\n      if (HasBetterCaseVariant(unicharset, choice, curr_list)) {\n        continue;\n      }\n      // Upper counts as lower at the beginning of a word.\n      if (blob_choice_flags & kUpperCaseFlag) {\n        blob_choice_flags |= kLowerCaseFlag;\n      }\n      new_changed |= AddViterbiStateEntry(blob_choice_flags, denom, word_end, curr_col, curr_row,\n                                          choice, curr_state, nullptr, pain_points, word_res,\n                                          best_choice_bundle, blamer_bundle);\n    } else {\n      // Get viterbi entries from each parent ViterbiStateEntry.\n      vit.set_to_list(&parent_node->viterbi_state_entries);\n      int vit_counter = 0;\n      vit.mark_cycle_pt();\n      ViterbiStateEntry *parent_vse = nullptr;\n      LanguageModelFlagsType top_choice_flags;\n      while ((parent_vse =\n                  GetNextParentVSE(just_classified, has_alnum_mix, c_it.data(), blob_choice_flags,\n                                   unicharset, word_res, &vit, &top_choice_flags)) != nullptr) {\n        // Skip pruned entries and do not look at prunable entries if already\n        // examined 
language_model_viterbi_list_max_num_prunable of those.\n        if (PrunablePath(*parent_vse) &&\n            (++vit_counter > language_model_viterbi_list_max_num_prunable ||\n             (language_model_ngram_on && parent_vse->ngram_info->pruned))) {\n          continue;\n        }\n        // If the parent has no alnum choice, (ie choice is the first in a\n        // string of alnum), and there is a better case variant that is not\n        // distinguished by size, skip this blob choice/parent, as with the\n        // initial blob treatment above.\n        if (!parent_vse->HasAlnumChoice(unicharset) &&\n            HasBetterCaseVariant(unicharset, choice, curr_list)) {\n          continue;\n        }\n        // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()\n        // looks good according to the Dawgs or character ngram model.\n        new_changed |= AddViterbiStateEntry(top_choice_flags, denom, word_end, curr_col, curr_row,\n                                            c_it.data(), curr_state, parent_vse, pain_points,\n                                            word_res, best_choice_bundle, blamer_bundle);\n      }\n    }\n  }\n  return new_changed;\n}\n\n/**\n * Finds the first lower and upper case letter and first digit in curr_list.\n * For non-upper/lower languages, alpha counts as upper.\n * Uses the first character in the list in place of empty results.\n * Returns true if both alpha and digits are found.\n */\nbool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower,\n                                          BLOB_CHOICE **first_upper,\n                                          BLOB_CHOICE **first_digit) const {\n  BLOB_CHOICE_IT c_it(curr_list);\n  const UNICHARSET &unicharset = dict_->getUnicharset();\n  BLOB_CHOICE *first_unichar = nullptr;\n  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {\n    UNICHAR_ID unichar_id = c_it.data()->unichar_id();\n    if 
(unicharset.get_fragment(unichar_id)) {\n      continue; // skip fragments\n    }\n    if (first_unichar == nullptr) {\n      first_unichar = c_it.data();\n    }\n    if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {\n      *first_lower = c_it.data();\n    }\n    if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&\n        !unicharset.get_islower(unichar_id)) {\n      *first_upper = c_it.data();\n    }\n    if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {\n      *first_digit = c_it.data();\n    }\n  }\n  ASSERT_HOST(first_unichar != nullptr);\n  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) && *first_digit != nullptr;\n  if (*first_lower == nullptr) {\n    *first_lower = first_unichar;\n  }\n  if (*first_upper == nullptr) {\n    *first_upper = first_unichar;\n  }\n  if (*first_digit == nullptr) {\n    *first_digit = first_unichar;\n  }\n  return mixed;\n}\n\n/**\n * Forces there to be at least one entry in the overall set of the\n * viterbi_state_entries of each element of parent_node that has the\n * top_choice_flag set for lower, upper and digit using the same rules as\n * GetTopLowerUpperDigit, setting the flag on the first found suitable\n * candidate, whether or not the flag is set on some other parent.\n * Returns 1 if both alpha and digits are found among the parents, -1 if no\n * parents are found at all (a legitimate case), and 0 otherwise.\n */\nint LanguageModel::SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const {\n  if (parent_node == nullptr) {\n    return -1;\n  }\n  UNICHAR_ID top_id = INVALID_UNICHAR_ID;\n  ViterbiStateEntry *top_lower = nullptr;\n  ViterbiStateEntry *top_upper = nullptr;\n  ViterbiStateEntry *top_digit = nullptr;\n  ViterbiStateEntry *top_choice = nullptr;\n  float lower_rating = 0.0f;\n  float upper_rating = 0.0f;\n  float digit_rating = 0.0f;\n  float top_rating = 0.0f;\n  const UNICHARSET &unicharset = dict_->getUnicharset();\n  
ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);\n  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {\n    ViterbiStateEntry *vse = vit.data();\n    // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan\n    // back to the real character if needed.\n    ViterbiStateEntry *unichar_vse = vse;\n    UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();\n    float rating = unichar_vse->curr_b->rating();\n    while (unichar_id == INVALID_UNICHAR_ID && unichar_vse->parent_vse != nullptr) {\n      unichar_vse = unichar_vse->parent_vse;\n      unichar_id = unichar_vse->curr_b->unichar_id();\n      rating = unichar_vse->curr_b->rating();\n    }\n    if (unichar_id != INVALID_UNICHAR_ID) {\n      if (unicharset.get_islower(unichar_id)) {\n        if (top_lower == nullptr || lower_rating > rating) {\n          top_lower = vse;\n          lower_rating = rating;\n        }\n      } else if (unicharset.get_isalpha(unichar_id)) {\n        if (top_upper == nullptr || upper_rating > rating) {\n          top_upper = vse;\n          upper_rating = rating;\n        }\n      } else if (unicharset.get_isdigit(unichar_id)) {\n        if (top_digit == nullptr || digit_rating > rating) {\n          top_digit = vse;\n          digit_rating = rating;\n        }\n      }\n    }\n    if (top_choice == nullptr || top_rating > rating) {\n      top_choice = vse;\n      top_rating = rating;\n      top_id = unichar_id;\n    }\n  }\n  if (top_choice == nullptr) {\n    return -1;\n  }\n  bool mixed = (top_lower != nullptr || top_upper != nullptr) && top_digit != nullptr;\n  if (top_lower == nullptr) {\n    top_lower = top_choice;\n  }\n  top_lower->top_choice_flags |= kLowerCaseFlag;\n  if (top_upper == nullptr) {\n    top_upper = top_choice;\n  }\n  top_upper->top_choice_flags |= kUpperCaseFlag;\n  if (top_digit == nullptr) {\n    top_digit = top_choice;\n  }\n  top_digit->top_choice_flags |= kDigitFlag;\n  top_choice->top_choice_flags |= 
kSmallestRatingFlag;\n  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&\n      (top_choice->top_choice_flags & (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {\n    // If the compound marker top choice carries any of the top alnum flags,\n    // then give it all of them, allowing words like I-295 to be chosen.\n    top_choice->top_choice_flags |= kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;\n  }\n  return mixed ? 1 : 0;\n}\n\n/**\n * Finds the next ViterbiStateEntry with which the given unichar_id can\n * combine sensibly, taking into account any mixed alnum/mixed case\n * situation, and whether this combination has been inspected before.\n */\nViterbiStateEntry *LanguageModel::GetNextParentVSE(bool just_classified, bool mixed_alnum,\n                                                   const BLOB_CHOICE *bc,\n                                                   LanguageModelFlagsType blob_choice_flags,\n                                                   const UNICHARSET &unicharset, WERD_RES *word_res,\n                                                   ViterbiStateEntry_IT *vse_it,\n                                                   LanguageModelFlagsType *top_choice_flags) const {\n  for (; !vse_it->cycled_list(); vse_it->forward()) {\n    ViterbiStateEntry *parent_vse = vse_it->data();\n    // Only consider the parent if it has been updated or\n    // if the current ratings cell has just been classified.\n    if (!just_classified && !parent_vse->updated) {\n      continue;\n    }\n    if (language_model_debug_level > 2) {\n      parent_vse->Print(\"Considering\");\n    }\n    // If the parent is non-alnum, then upper counts as lower.\n    *top_choice_flags = blob_choice_flags;\n    if ((blob_choice_flags & kUpperCaseFlag) && !parent_vse->HasAlnumChoice(unicharset)) {\n      *top_choice_flags |= kLowerCaseFlag;\n    }\n    *top_choice_flags &= parent_vse->top_choice_flags;\n    UNICHAR_ID unichar_id = bc->unichar_id();\n    const BLOB_CHOICE 
*parent_b = parent_vse->curr_b;\n    UNICHAR_ID parent_id = parent_b->unichar_id();\n    // Digits do not bind to alphas if there is a mix in both parent and current\n    // or if the alpha is not the top choice.\n    if (unicharset.get_isdigit(unichar_id) && unicharset.get_isalpha(parent_id) &&\n        (mixed_alnum || *top_choice_flags == 0)) {\n      continue; // Digits don't bind to alphas.\n    }\n    // Likewise alphas do not bind to digits if there is a mix in both or if\n    // the digit is not the top choice.\n    if (unicharset.get_isalpha(unichar_id) && unicharset.get_isdigit(parent_id) &&\n        (mixed_alnum || *top_choice_flags == 0)) {\n      continue; // Alphas don't bind to digits.\n    }\n    // If there is a case mix of the same alpha in the parent list, then\n    // competing_vse is non-null and will be used to determine whether\n    // or not to bind the current blob choice.\n    if (parent_vse->competing_vse != nullptr) {\n      const BLOB_CHOICE *competing_b = parent_vse->competing_vse->curr_b;\n      UNICHAR_ID other_id = competing_b->unichar_id();\n      if (language_model_debug_level >= 5) {\n        tprintf(\"Parent %s has competition %s\\n\", unicharset.id_to_unichar(parent_id),\n                unicharset.id_to_unichar(other_id));\n      }\n      if (unicharset.SizesDistinct(parent_id, other_id)) {\n        // If other_id matches bc wrt position and size, and parent_id, doesn't,\n        // don't bind to the current parent.\n        if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,\n                                language_model_debug_level >= 5) &&\n            !bc->PosAndSizeAgree(*parent_b, word_res->x_height, language_model_debug_level >= 5)) {\n          continue; // Competing blobchoice has a better vertical match.\n        }\n      }\n    }\n    vse_it->forward();\n    return parent_vse; // This one is good!\n  }\n  return nullptr; // Ran out of possibilities.\n}\n\nbool 
LanguageModel::AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom,\n                                         bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b,\n                                         LanguageModelState *curr_state,\n                                         ViterbiStateEntry *parent_vse, LMPainPoints *pain_points,\n                                         WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,\n                                         BlamerBundle *blamer_bundle) {\n  ViterbiStateEntry_IT vit;\n  if (language_model_debug_level > 1) {\n    tprintf(\n        \"AddViterbiStateEntry for unichar %s rating=%.4f\"\n        \" certainty=%.4f top_choice_flags=0x%x\",\n        dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->rating(), b->certainty(),\n        top_choice_flags);\n    if (language_model_debug_level > 5) {\n      tprintf(\" parent_vse=%p\\n\", static_cast<void *>(parent_vse));\n    } else {\n      tprintf(\"\\n\");\n    }\n  }\n  ASSERT_HOST(curr_state != nullptr);\n  // Check whether the list is full.\n  if (curr_state->viterbi_state_entries_length >= language_model_viterbi_list_max_size) {\n    if (language_model_debug_level > 1) {\n      tprintf(\"AddViterbiStateEntry: viterbi list is full!\\n\");\n    }\n    return false;\n  }\n\n  // Invoke Dawg language model component.\n  LanguageModelDawgInfo *dawg_info = GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);\n\n  float outline_length = AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);\n  // Invoke Ngram language model component.\n  LanguageModelNgramInfo *ngram_info = nullptr;\n  if (language_model_ngram_on) {\n    ngram_info =\n        GenerateNgramInfo(dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),\n                          denom, curr_col, curr_row, outline_length, parent_vse);\n    ASSERT_HOST(ngram_info != nullptr);\n  }\n  bool liked_by_language_model =\n      dawg_info != 
nullptr || (ngram_info != nullptr && !ngram_info->pruned);\n  // Quick escape if not liked by the language model, can't be consistent\n  // xheight, and not top choice.\n  if (!liked_by_language_model && top_choice_flags == 0) {\n    if (language_model_debug_level > 1) {\n      tprintf(\"Language model components very early pruned this entry\\n\");\n    }\n    delete ngram_info;\n    delete dawg_info;\n    return false;\n  }\n\n  // Check consistency of the path and set the relevant consistency_info.\n  LMConsistencyInfo consistency_info(parent_vse != nullptr ? &parent_vse->consistency_info\n                                                           : nullptr);\n  // Start with just the x-height consistency, as it provides significant\n  // pruning opportunity.\n  consistency_info.ComputeXheightConsistency(\n      b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));\n  // Turn off xheight consistent flag if not consistent.\n  if (consistency_info.InconsistentXHeight()) {\n    top_choice_flags &= ~kXhtConsistentFlag;\n  }\n\n  // Quick escape if not liked by the language model, not consistent xheight,\n  // and not top choice.\n  if (!liked_by_language_model && top_choice_flags == 0) {\n    if (language_model_debug_level > 1) {\n      tprintf(\"Language model components early pruned this entry\\n\");\n    }\n    delete ngram_info;\n    delete dawg_info;\n    return false;\n  }\n\n  // Compute the rest of the consistency info.\n  FillConsistencyInfo(curr_col, word_end, b, parent_vse, word_res, &consistency_info);\n  if (dawg_info != nullptr && consistency_info.invalid_punc) {\n    consistency_info.invalid_punc = false; // do not penalize dict words\n  }\n\n  // Compute cost of associating the blobs that represent the current unichar.\n  AssociateStats associate_stats;\n  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_, parent_vse, word_res,\n                        &associate_stats);\n  if (parent_vse != nullptr) {\n    
associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;\n    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;\n  }\n\n  // Create the new ViterbiStateEntry compute the adjusted cost of the path.\n  auto *new_vse = new ViterbiStateEntry(parent_vse, b, 0.0, outline_length, consistency_info,\n                                        associate_stats, top_choice_flags, dawg_info, ngram_info,\n                                        (language_model_debug_level > 0)\n                                            ? dict_->getUnicharset().id_to_unichar(b->unichar_id())\n                                            : nullptr);\n  new_vse->cost = ComputeAdjustedPathCost(new_vse);\n  if (language_model_debug_level >= 3) {\n    tprintf(\"Adjusted cost = %g\\n\", new_vse->cost);\n  }\n\n  // Invoke Top Choice language model component to make the final adjustments\n  // to new_vse->top_choice_flags.\n  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {\n    GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);\n  }\n\n  // If language model components did not like this unichar - return.\n  bool keep = new_vse->top_choice_flags || liked_by_language_model;\n  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths\n      consistency_info.inconsistent_script) {      // with inconsistent script\n    keep = false;\n  }\n  if (!keep) {\n    if (language_model_debug_level > 1) {\n      tprintf(\"Language model components did not like this entry\\n\");\n    }\n    delete new_vse;\n    return false;\n  }\n\n  // Discard this entry if it represents a prunable path and\n  // language_model_viterbi_list_max_num_prunable such entries with a lower\n  // cost have already been recorded.\n  if (PrunablePath(*new_vse) &&\n      (curr_state->viterbi_state_entries_prunable_length >=\n       language_model_viterbi_list_max_num_prunable) &&\n      new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {\n    
if (language_model_debug_level > 1) {\n      tprintf(\"Discarded ViterbiEntry with high cost %g max cost %g\\n\", new_vse->cost,\n              curr_state->viterbi_state_entries_prunable_max_cost);\n    }\n    delete new_vse;\n    return false;\n  }\n\n  // Update best choice if needed.\n  if (word_end) {\n    UpdateBestChoice(new_vse, pain_points, word_res, best_choice_bundle, blamer_bundle);\n    // Discard the entry if UpdateBestChoice() found flaws in it.\n    if (new_vse->cost >= WERD_CHOICE::kBadRating && new_vse != best_choice_bundle->best_vse) {\n      if (language_model_debug_level > 1) {\n        tprintf(\"Discarded ViterbiEntry with high cost %g\\n\", new_vse->cost);\n      }\n      delete new_vse;\n      return false;\n    }\n  }\n\n  // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.\n  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare, false, new_vse);\n  curr_state->viterbi_state_entries_length++;\n  if (PrunablePath(*new_vse)) {\n    curr_state->viterbi_state_entries_prunable_length++;\n  }\n\n  // Update lms->viterbi_state_entries_prunable_max_cost and clear\n  // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.\n  if ((curr_state->viterbi_state_entries_prunable_length >=\n       language_model_viterbi_list_max_num_prunable) ||\n      new_vse->top_choice_flags) {\n    ASSERT_HOST(!curr_state->viterbi_state_entries.empty());\n    int prunable_counter = language_model_viterbi_list_max_num_prunable;\n    vit.set_to_list(&(curr_state->viterbi_state_entries));\n    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {\n      ViterbiStateEntry *curr_vse = vit.data();\n      // Clear the appropriate top choice flags of the entries in the\n      // list that have cost higher thank new_entry->cost\n      // (since they will not be top choices any more).\n      if (curr_vse->top_choice_flags && curr_vse != new_vse && curr_vse->cost > new_vse->cost) {\n        
curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);\n      }\n      if (prunable_counter > 0 && PrunablePath(*curr_vse)) {\n        --prunable_counter;\n      }\n      // Update curr_state->viterbi_state_entries_prunable_max_cost.\n      if (prunable_counter == 0) {\n        curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;\n        if (language_model_debug_level > 1) {\n          tprintf(\"Set viterbi_state_entries_prunable_max_cost to %g\\n\",\n                  curr_state->viterbi_state_entries_prunable_max_cost);\n        }\n        prunable_counter = -1; // stop counting\n      }\n    }\n  }\n\n  // Print the newly created ViterbiStateEntry.\n  if (language_model_debug_level > 2) {\n    new_vse->Print(\"New\");\n    if (language_model_debug_level > 5) {\n      curr_state->Print(\"Updated viterbi list\");\n    }\n  }\n\n  return true;\n}\n\nvoid LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,\n                                          const ViterbiStateEntry *parent_vse,\n                                          LanguageModelState *lms) {\n  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));\n  for (vit.mark_cycle_pt();\n       !vit.cycled_list() && new_vse->top_choice_flags && new_vse->cost >= vit.data()->cost;\n       vit.forward()) {\n    // Clear the appropriate flags if the list already contains\n    // a top choice entry with a lower cost.\n    new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);\n  }\n  if (language_model_debug_level > 2) {\n    tprintf(\"GenerateTopChoiceInfo: top_choice_flags=0x%x\\n\", new_vse->top_choice_flags);\n  }\n}\n\nLanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(bool word_end, int curr_col, int curr_row,\n                                                       const BLOB_CHOICE &b,\n                                                       const ViterbiStateEntry *parent_vse) {\n  // Initialize active_dawgs from parent_vse if it is not nullptr.\n  // Otherwise 
use very_beginning_active_dawgs_.\n  if (parent_vse == nullptr) {\n    dawg_args_.active_dawgs = &very_beginning_active_dawgs_;\n    dawg_args_.permuter = NO_PERM;\n  } else {\n    if (parent_vse->dawg_info == nullptr) {\n      return nullptr; // not a dict word path\n    }\n    dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;\n    dawg_args_.permuter = parent_vse->dawg_info->permuter;\n  }\n\n  // Deal with hyphenated words.\n  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(), b.unichar_id(), curr_col == 0)) {\n    if (language_model_debug_level > 0) {\n      tprintf(\"Hyphenated word found\\n\");\n    }\n    return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);\n  }\n\n  // Deal with compound words.\n  if (dict_->compound_marker(b.unichar_id()) &&\n      (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {\n    if (language_model_debug_level > 0) {\n      tprintf(\"Found compound marker\\n\");\n    }\n    // Do not allow compound operators at the beginning and end of the word.\n    // Do not allow more than one compound operator per word.\n    // Do not allow compounding of words with lengths shorter than\n    // language_model_min_compound_length\n    if (parent_vse == nullptr || word_end || dawg_args_.permuter == COMPOUND_PERM ||\n        parent_vse->length < language_model_min_compound_length) {\n      return nullptr;\n    }\n\n    // Check that the path terminated before the current character is a word.\n    bool has_word_ending = false;\n    for (unsigned i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {\n      const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];\n      const Dawg *pdawg = pos.dawg_index < 0 ? 
nullptr : dict_->GetDawg(pos.dawg_index);\n      if (pdawg == nullptr || pos.back_to_punc) {\n        continue;\n      };\n      if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&\n          pdawg->end_of_word(pos.dawg_ref)) {\n        has_word_ending = true;\n        break;\n      }\n    }\n    if (!has_word_ending) {\n      return nullptr;\n    }\n\n    if (language_model_debug_level > 0) {\n      tprintf(\"Compound word found\\n\");\n    }\n    return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);\n  } // done dealing with compound words\n\n  LanguageModelDawgInfo *dawg_info = nullptr;\n\n  // Call LetterIsOkay().\n  // Use the normalized IDs so that all shapes of ' can be allowed in words\n  // like don't.\n  const auto &normed_ids = dict_->getUnicharset().normed_ids(b.unichar_id());\n  DawgPositionVector tmp_active_dawgs;\n  for (unsigned i = 0; i < normed_ids.size(); ++i) {\n    if (language_model_debug_level > 2) {\n      tprintf(\"Test Letter OK for unichar %d, normed %d\\n\", b.unichar_id(), normed_ids[i]);\n    }\n    dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],\n                        word_end && i == normed_ids.size() - 1);\n    if (dawg_args_.permuter == NO_PERM) {\n      break;\n    } else if (i < normed_ids.size() - 1) {\n      tmp_active_dawgs = *dawg_args_.updated_dawgs;\n      dawg_args_.active_dawgs = &tmp_active_dawgs;\n    }\n    if (language_model_debug_level > 2) {\n      tprintf(\"Letter was OK for unichar %d, normed %d\\n\", b.unichar_id(), normed_ids[i]);\n    }\n  }\n  dawg_args_.active_dawgs = nullptr;\n  if (dawg_args_.permuter != NO_PERM) {\n    dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs, dawg_args_.permuter);\n  } else if (language_model_debug_level > 3) {\n    tprintf(\"Letter %s not OK!\\n\", dict_->getUnicharset().id_to_unichar(b.unichar_id()));\n  }\n\n  return dawg_info;\n}\n\nLanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(const char 
*unichar, float certainty,\n                                                         float denom, int curr_col, int curr_row,\n                                                         float outline_length,\n                                                         const ViterbiStateEntry *parent_vse) {\n  // Initialize parent context.\n  const char *pcontext_ptr = \"\";\n  int pcontext_unichar_step_len = 0;\n  if (parent_vse == nullptr) {\n    pcontext_ptr = prev_word_str_.c_str();\n    pcontext_unichar_step_len = prev_word_unichar_step_len_;\n  } else {\n    pcontext_ptr = parent_vse->ngram_info->context.c_str();\n    pcontext_unichar_step_len = parent_vse->ngram_info->context_unichar_step_len;\n  }\n  // Compute p(unichar | parent context).\n  int unichar_step_len = 0;\n  bool pruned = false;\n  float ngram_cost;\n  float ngram_and_classifier_cost = ComputeNgramCost(unichar, certainty, denom, pcontext_ptr,\n                                                     &unichar_step_len, &pruned, &ngram_cost);\n  // Normalize just the ngram_and_classifier_cost by outline_length.\n  // The ngram_cost is used by the params_model, so it needs to be left as-is,\n  // and the params model cost will be normalized by outline_length.\n  ngram_and_classifier_cost *= outline_length / language_model_ngram_rating_factor;\n  // Add the ngram_cost of the parent.\n  if (parent_vse != nullptr) {\n    ngram_and_classifier_cost += parent_vse->ngram_info->ngram_and_classifier_cost;\n    ngram_cost += parent_vse->ngram_info->ngram_cost;\n  }\n\n  // Shorten parent context string by unichar_step_len unichars.\n  int num_remove = (unichar_step_len + pcontext_unichar_step_len - language_model_ngram_order);\n  if (num_remove > 0) {\n    pcontext_unichar_step_len -= num_remove;\n  }\n  while (num_remove > 0 && *pcontext_ptr != '\\0') {\n    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);\n    --num_remove;\n  }\n\n  // Decide whether to prune this ngram path and update changed accordingly.\n  if 
(parent_vse != nullptr && parent_vse->ngram_info->pruned) {\n    pruned = true;\n  }\n\n  // Construct and return the new LanguageModelNgramInfo.\n  auto *ngram_info = new LanguageModelNgramInfo(pcontext_ptr, pcontext_unichar_step_len, pruned,\n                                                ngram_cost, ngram_and_classifier_cost);\n  ngram_info->context += unichar;\n  ngram_info->context_unichar_step_len += unichar_step_len;\n  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);\n  return ngram_info;\n}\n\nfloat LanguageModel::ComputeNgramCost(const char *unichar, float certainty, float denom,\n                                      const char *context, int *unichar_step_len,\n                                      bool *found_small_prob, float *ngram_cost) {\n  const char *context_ptr = context;\n  char *modified_context = nullptr;\n  char *modified_context_end = nullptr;\n  const char *unichar_ptr = unichar;\n  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);\n  float prob = 0.0f;\n  int step = 0;\n  while (unichar_ptr < unichar_end && (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {\n    if (language_model_debug_level > 1) {\n      tprintf(\"prob(%s | %s)=%g\\n\", unichar_ptr, context_ptr,\n              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));\n    }\n    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);\n    ++(*unichar_step_len);\n    if (language_model_ngram_use_only_first_uft8_step) {\n      break;\n    }\n    unichar_ptr += step;\n    // If there are multiple UTF8 characters present in unichar, context is\n    // updated to include the previously examined characters from str,\n    // unless use_only_first_uft8_step is true.\n    if (unichar_ptr < unichar_end) {\n      if (modified_context == nullptr) {\n        size_t context_len = strlen(context);\n        modified_context = new char[context_len + strlen(unichar_ptr) + step + 1];\n        memcpy(modified_context, context, 
context_len);\n        modified_context_end = modified_context + context_len;\n        context_ptr = modified_context;\n      }\n      strncpy(modified_context_end, unichar_ptr - step, step);\n      modified_context_end += step;\n      *modified_context_end = '\\0';\n    }\n  }\n  prob /= static_cast<float>(*unichar_step_len); // normalize\n  if (prob < language_model_ngram_small_prob) {\n    if (language_model_debug_level > 0) {\n      tprintf(\"Found small prob %g\\n\", prob);\n    }\n    *found_small_prob = true;\n    prob = language_model_ngram_small_prob;\n  }\n  *ngram_cost = -1 * std::log2(prob);\n  float ngram_and_classifier_cost = -1 * std::log2(CertaintyScore(certainty) / denom) +\n                                    *ngram_cost * language_model_ngram_scale_factor;\n  if (language_model_debug_level > 1) {\n    tprintf(\"-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\\n\", unichar, unichar, context_ptr,\n            CertaintyScore(certainty) / denom, prob, ngram_and_classifier_cost);\n  }\n  delete[] modified_context;\n  return ngram_and_classifier_cost;\n}\n\nfloat LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {\n  if (curr_list->empty()) {\n    return 1.0f;\n  }\n  float denom = 0.0f;\n  int len = 0;\n  BLOB_CHOICE_IT c_it(curr_list);\n  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {\n    ASSERT_HOST(c_it.data() != nullptr);\n    ++len;\n    denom += CertaintyScore(c_it.data()->certainty());\n  }\n  assert(len != 0);\n  // The ideal situation would be to have the classifier scores for\n  // classifying each position as each of the characters in the unicharset.\n  // Since we cannot do this because of speed, we add a very crude estimate\n  // of what these scores for the \"missing\" classifications would sum up to.\n  denom +=\n      (dict_->getUnicharset().size() - len) * CertaintyScore(language_model_ngram_nonmatch_score);\n\n  return denom;\n}\n\nvoid LanguageModel::FillConsistencyInfo(int curr_col, bool word_end, 
BLOB_CHOICE *b,\n                                        ViterbiStateEntry *parent_vse, WERD_RES *word_res,\n                                        LMConsistencyInfo *consistency_info) {\n  const UNICHARSET &unicharset = dict_->getUnicharset();\n  UNICHAR_ID unichar_id = b->unichar_id();\n  BLOB_CHOICE *parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;\n\n  // Check punctuation validity.\n  if (unicharset.get_ispunctuation(unichar_id)) {\n    consistency_info->num_punc++;\n  }\n  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {\n    if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&\n        (unicharset.get_isalpha(parent_b->unichar_id()) ||\n         unicharset.get_isdigit(parent_b->unichar_id()))) {\n      // reset punc_ref for compound words\n      consistency_info->punc_ref = NO_EDGE;\n    } else {\n      bool is_apos = dict_->is_apostrophe(unichar_id);\n      bool prev_is_numalpha =\n          (parent_b != nullptr && (unicharset.get_isalpha(parent_b->unichar_id()) ||\n                                   unicharset.get_isdigit(parent_b->unichar_id())));\n      UNICHAR_ID pattern_unichar_id =\n          (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id) ||\n           (is_apos && prev_is_numalpha))\n              ? Dawg::kPatternUnicharID\n              : unichar_id;\n      if (consistency_info->punc_ref == NO_EDGE || pattern_unichar_id != Dawg::kPatternUnicharID ||\n          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=\n              Dawg::kPatternUnicharID) {\n        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(), consistency_info->punc_ref);\n        consistency_info->punc_ref = (node != NO_EDGE) ? 
dict_->GetPuncDawg()->edge_char_of(\n                                                             node, pattern_unichar_id, word_end)\n                                                       : NO_EDGE;\n        if (consistency_info->punc_ref == NO_EDGE) {\n          consistency_info->invalid_punc = true;\n        }\n      }\n    }\n  }\n\n  // Update case related counters.\n  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {\n    // Reset counters if we are dealing with a compound word.\n    consistency_info->num_lower = 0;\n    consistency_info->num_non_first_upper = 0;\n  } else if (unicharset.get_islower(unichar_id)) {\n    consistency_info->num_lower++;\n  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {\n    if (unicharset.get_isupper(parent_b->unichar_id()) || consistency_info->num_lower > 0 ||\n        consistency_info->num_non_first_upper > 0) {\n      consistency_info->num_non_first_upper++;\n    }\n  }\n\n  // Initialize consistency_info->script_id (use script of unichar_id\n  // if it is not Common, use script id recorded by the parent otherwise).\n  // Set inconsistent_script to true if the script of the current unichar\n  // is not consistent with that of the parent.\n  consistency_info->script_id = unicharset.get_script(unichar_id);\n  // Hiragana and Katakana can mix with Han.\n  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {\n    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&\n         consistency_info->script_id == unicharset.hiragana_sid()) ||\n        (unicharset.katakana_sid() != unicharset.null_sid() &&\n         consistency_info->script_id == unicharset.katakana_sid())) {\n      consistency_info->script_id = dict_->getUnicharset().han_sid();\n    }\n  }\n\n  if (parent_vse != nullptr &&\n      (parent_vse->consistency_info.script_id != dict_->getUnicharset().common_sid())) {\n    int parent_script_id = parent_vse->consistency_info.script_id;\n    // 
If script_id is Common, use script id of the parent instead.\n    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {\n      consistency_info->script_id = parent_script_id;\n    }\n    if (consistency_info->script_id != parent_script_id) {\n      consistency_info->inconsistent_script = true;\n    }\n  }\n\n  // Update chartype related counters.\n  if (unicharset.get_isalpha(unichar_id)) {\n    consistency_info->num_alphas++;\n  } else if (unicharset.get_isdigit(unichar_id)) {\n    consistency_info->num_digits++;\n  } else if (!unicharset.get_ispunctuation(unichar_id)) {\n    consistency_info->num_other++;\n  }\n\n  // Check font and spacing consistency.\n  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {\n    int fontinfo_id = -1;\n    if (parent_b->fontinfo_id() == b->fontinfo_id() ||\n        parent_b->fontinfo_id2() == b->fontinfo_id()) {\n      fontinfo_id = b->fontinfo_id();\n    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||\n               parent_b->fontinfo_id2() == b->fontinfo_id2()) {\n      fontinfo_id = b->fontinfo_id2();\n    }\n    if (language_model_debug_level > 1) {\n      tprintf(\n          \"pfont %s pfont %s font %s font2 %s common %s(%d)\\n\",\n          (parent_b->fontinfo_id() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id()).name : \"\",\n          (parent_b->fontinfo_id2() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id2()).name\n                                          : \"\",\n          (b->fontinfo_id() >= 0) ? fontinfo_table_->at(b->fontinfo_id()).name : \"\",\n          (fontinfo_id >= 0) ? fontinfo_table_->at(fontinfo_id).name : \"\",\n          (fontinfo_id >= 0) ? 
fontinfo_table_->at(fontinfo_id).name : \"\", fontinfo_id);\n    }\n    if (!word_res->blob_widths.empty()) { // if we have widths/gaps info\n      bool expected_gap_found = false;\n      float expected_gap = 0.0f;\n      int temp_gap;\n      if (fontinfo_id >= 0) { // found a common font\n        ASSERT_HOST(fontinfo_id < fontinfo_table_->size());\n        if (fontinfo_table_->at(fontinfo_id)\n                .get_spacing(parent_b->unichar_id(), unichar_id, &temp_gap)) {\n          expected_gap = temp_gap;\n          expected_gap_found = true;\n        }\n      } else {\n        consistency_info->inconsistent_font = true;\n        // Get an average of the expected gaps in each font\n        int num_addends = 0;\n        int temp_fid;\n        for (int i = 0; i < 4; ++i) {\n          if (i == 0) {\n            temp_fid = parent_b->fontinfo_id();\n          } else if (i == 1) {\n            temp_fid = parent_b->fontinfo_id2();\n          } else if (i == 2) {\n            temp_fid = b->fontinfo_id();\n          } else {\n            temp_fid = b->fontinfo_id2();\n          }\n          ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());\n          if (temp_fid >= 0 && fontinfo_table_->at(temp_fid).get_spacing(parent_b->unichar_id(),\n                                                                         unichar_id, &temp_gap)) {\n            expected_gap += temp_gap;\n            num_addends++;\n          }\n        }\n        if (num_addends > 0) {\n          expected_gap /= static_cast<float>(num_addends);\n          expected_gap_found = true;\n        }\n      }\n      if (expected_gap_found) {\n        int actual_gap = word_res->GetBlobsGap(curr_col - 1);\n        if (actual_gap == 0) {\n          consistency_info->num_inconsistent_spaces++;\n        } else {\n          float gap_ratio = expected_gap / actual_gap;\n          // TODO(rays) The gaps seem to be way off most of the time, saved by\n          // the error here that the ratio was compared to 1/2, 
when it should\n          // have been 0.5f. Find the source of the gaps discrepancy and put\n          // the 0.5f here in place of 0.0f.\n          // Test on 2476595.sj, pages 0 to 6. (In French.)\n          if (gap_ratio < 0.0f || gap_ratio > 2.0f) {\n            consistency_info->num_inconsistent_spaces++;\n          }\n        }\n        if (language_model_debug_level > 1) {\n          tprintf(\"spacing for %s(%d) %s(%d) col %d: expected %g actual %d\\n\",\n                  unicharset.id_to_unichar(parent_b->unichar_id()), parent_b->unichar_id(),\n                  unicharset.id_to_unichar(unichar_id), unichar_id, curr_col, expected_gap,\n                  actual_gap);\n        }\n      }\n    }\n  }\n}\n\nfloat LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {\n  ASSERT_HOST(vse != nullptr);\n  if (params_model_.Initialized()) {\n    float features[PTRAIN_NUM_FEATURE_TYPES];\n    ExtractFeaturesFromPath(*vse, features);\n    float cost = params_model_.ComputeCost(features);\n    if (language_model_debug_level > 3) {\n      tprintf(\"ComputeAdjustedPathCost %g ParamsModel features:\\n\", cost);\n      if (language_model_debug_level >= 5) {\n        for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {\n          tprintf(\"%s=%g\\n\", kParamsTrainingFeatureTypeName[f], features[f]);\n        }\n      }\n    }\n    return cost * vse->outline_length;\n  } else {\n    float adjustment = 1.0f;\n    if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {\n      adjustment += language_model_penalty_non_freq_dict_word;\n    }\n    if (vse->dawg_info == nullptr) {\n      adjustment += language_model_penalty_non_dict_word;\n      if (vse->length > language_model_min_compound_length) {\n        adjustment +=\n            ((vse->length - language_model_min_compound_length) * language_model_penalty_increment);\n      }\n    }\n    if (vse->associate_stats.shape_cost > 0) {\n      adjustment += vse->associate_stats.shape_cost / 
static_cast<float>(vse->length);\n    }\n    if (language_model_ngram_on) {\n      ASSERT_HOST(vse->ngram_info != nullptr);\n      return vse->ngram_info->ngram_and_classifier_cost * adjustment;\n    } else {\n      adjustment += ComputeConsistencyAdjustment(vse->dawg_info, vse->consistency_info);\n      return vse->ratings_sum * adjustment;\n    }\n  }\n}\n\nvoid LanguageModel::UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points,\n                                     WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,\n                                     BlamerBundle *blamer_bundle) {\n  bool truth_path;\n  WERD_CHOICE *word =\n      ConstructWord(vse, word_res, &best_choice_bundle->fixpt, blamer_bundle, &truth_path);\n  ASSERT_HOST(word != nullptr);\n  if (dict_->stopper_debug_level >= 1) {\n    std::string word_str;\n    word->string_and_lengths(&word_str, nullptr);\n    vse->Print(word_str.c_str());\n  }\n  if (language_model_debug_level > 0) {\n    word->print(\"UpdateBestChoice() constructed word\");\n  }\n  // Record features from the current path if necessary.\n  ParamsTrainingHypothesis curr_hyp;\n  if (blamer_bundle != nullptr) {\n    if (vse->dawg_info != nullptr) {\n      vse->dawg_info->permuter = static_cast<PermuterType>(word->permuter());\n    }\n    ExtractFeaturesFromPath(*vse, curr_hyp.features);\n    word->string_and_lengths(&(curr_hyp.str), nullptr);\n    curr_hyp.cost = vse->cost; // record cost for error rate computations\n    if (language_model_debug_level > 0) {\n      tprintf(\"Raw features extracted from %s (cost=%g) [ \", curr_hyp.str.c_str(), curr_hyp.cost);\n      for (float feature : curr_hyp.features) {\n        tprintf(\"%g \", feature);\n      }\n      tprintf(\"]\\n\");\n    }\n    // Record the current hypothesis in params_training_bundle.\n    blamer_bundle->AddHypothesis(curr_hyp);\n    if (truth_path) {\n      blamer_bundle->UpdateBestRating(word->rating());\n    }\n  }\n  if (blamer_bundle != nullptr && 
blamer_bundle->GuidedSegsearchStillGoing()) {\n    // The word was constructed solely for blamer_bundle->AddHypothesis, so\n    // we no longer need it.\n    delete word;\n    return;\n  }\n  if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty()) {\n    word->SetScriptPositions(false, word_res->chopped_word, language_model_debug_level);\n  }\n  // Update and log new raw_choice if needed.\n  if (word_res->raw_choice == nullptr || word->rating() < word_res->raw_choice->rating()) {\n    if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0) {\n      tprintf(\"Updated raw choice\\n\");\n    }\n  }\n  // Set the modified rating for best choice to vse->cost and log best choice.\n  word->set_rating(vse->cost);\n  // Call LogNewChoice() for best choice from Dict::adjust_word() since it\n  // computes adjust_factor that is used by the adaption code (e.g. by\n  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).\n  // Note: the rating of the word is not adjusted.\n  dict_->adjust_word(word, vse->dawg_info == nullptr, vse->consistency_info.xht_decision, 0.0,\n                     false, language_model_debug_level > 0);\n  // Hand ownership of the word over to the word_res.\n  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,\n                                    dict_->stopper_debug_level >= 1, word)) {\n    // The word was so bad that it was deleted.\n    return;\n  }\n  if (word_res->best_choice == word) {\n    // Word was the new best.\n    if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&\n        AcceptablePath(*vse)) {\n      acceptable_choice_found_ = true;\n    }\n    // Update best_choice_bundle.\n    best_choice_bundle->updated = true;\n    best_choice_bundle->best_vse = vse;\n    if (language_model_debug_level > 0) {\n      tprintf(\"Updated best choice\\n\");\n      word->print_state(\"New state \");\n    }\n    // Update hyphen state if we are dealing with 
a dictionary word.\n    if (vse->dawg_info != nullptr) {\n      if (dict_->has_hyphen_end(*word)) {\n        dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));\n      } else {\n        dict_->reset_hyphen_vars(true);\n      }\n    }\n\n    if (blamer_bundle != nullptr) {\n      blamer_bundle->set_best_choice_is_dict_and_top_choice(vse->dawg_info != nullptr &&\n                                                            vse->top_choice_flags);\n    }\n  }\n#ifndef GRAPHICS_DISABLED\n  if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {\n    word->DisplaySegmentation(word_res->chopped_word);\n  }\n#endif\n}\n\nvoid LanguageModel::ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[]) {\n  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);\n  // Record dictionary match info.\n  int len = vse.length <= kMaxSmallWordUnichars ? 0 : vse.length <= kMaxMediumWordUnichars ? 1 : 2;\n  if (vse.dawg_info != nullptr) {\n    int permuter = vse.dawg_info->permuter;\n    if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {\n      if (vse.consistency_info.num_digits == vse.length) {\n        features[PTRAIN_DIGITS_SHORT + len] = 1.0f;\n      } else {\n        features[PTRAIN_NUM_SHORT + len] = 1.0f;\n      }\n    } else if (permuter == DOC_DAWG_PERM) {\n      features[PTRAIN_DOC_SHORT + len] = 1.0f;\n    } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||\n               permuter == COMPOUND_PERM) {\n      features[PTRAIN_DICT_SHORT + len] = 1.0f;\n    } else if (permuter == FREQ_DAWG_PERM) {\n      features[PTRAIN_FREQ_SHORT + len] = 1.0f;\n    }\n  }\n  // Record shape cost feature (normalized by path length).\n  features[PTRAIN_SHAPE_COST_PER_CHAR] =\n      vse.associate_stats.shape_cost / static_cast<float>(vse.length);\n  // Record ngram cost. 
(normalized by the path length).\n  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0f;\n  if (vse.ngram_info != nullptr) {\n    features[PTRAIN_NGRAM_COST_PER_CHAR] =\n        vse.ngram_info->ngram_cost / static_cast<float>(vse.length);\n  }\n  // Record consistency-related features.\n  // Disabled this feature for due to its poor performance.\n  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();\n  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();\n  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;\n  features[PTRAIN_NUM_BAD_CHAR_TYPE] =\n      vse.dawg_info == nullptr ? vse.consistency_info.NumInconsistentChartype() : 0.0f;\n  features[PTRAIN_NUM_BAD_SPACING] = vse.consistency_info.NumInconsistentSpaces();\n  // Disabled this feature for now due to its poor performance.\n  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;\n\n  // Classifier-related features.\n  if (vse.outline_length > 0.0f) {\n    features[PTRAIN_RATING_PER_CHAR] = vse.ratings_sum / vse.outline_length;\n  } else {\n    // Avoid FP division by 0.\n    features[PTRAIN_RATING_PER_CHAR] = 0.0f;\n  }\n}\n\nWERD_CHOICE *LanguageModel::ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res,\n                                          DANGERR *fixpt, BlamerBundle *blamer_bundle,\n                                          bool *truth_path) {\n  if (truth_path != nullptr) {\n    *truth_path =\n        (blamer_bundle != nullptr && vse->length == blamer_bundle->correct_segmentation_length());\n  }\n  BLOB_CHOICE *curr_b = vse->curr_b;\n  ViterbiStateEntry *curr_vse = vse;\n\n  int i;\n  bool compound = dict_->hyphenated(); // treat hyphenated words as compound\n\n  // Re-compute the variance of the width-to-height ratios (since we now\n  // can compute the mean over the whole word).\n  float full_wh_ratio_mean = 0.0f;\n  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {\n    vse->associate_stats.shape_cost 
-= vse->associate_stats.full_wh_ratio_var;\n    full_wh_ratio_mean =\n        (vse->associate_stats.full_wh_ratio_total / static_cast<float>(vse->length));\n    vse->associate_stats.full_wh_ratio_var = 0.0f;\n  }\n\n  // Construct a WERD_CHOICE by tracing parent pointers.\n  auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);\n  word->set_length(vse->length);\n  int total_blobs = 0;\n  for (i = (vse->length - 1); i >= 0; --i) {\n    if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&\n        !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {\n      *truth_path = false;\n    }\n    // The number of blobs used for this choice is row - col + 1.\n    int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;\n    total_blobs += num_blobs;\n    word->set_blob_choice(i, num_blobs, curr_b);\n    // Update the width-to-height ratio variance. Useful non-space delimited\n    // languages to ensure that the blobs are of uniform width.\n    // Skip leading and trailing punctuation when computing the variance.\n    if ((full_wh_ratio_mean != 0.0f &&\n         ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||\n          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {\n      vse->associate_stats.full_wh_ratio_var +=\n          pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);\n      if (language_model_debug_level > 2) {\n        tprintf(\"full_wh_ratio_var += (%g-%g)^2\\n\", full_wh_ratio_mean,\n                curr_vse->associate_stats.full_wh_ratio);\n      }\n    }\n\n    // Mark the word as compound if compound permuter was set for any of\n    // the unichars on the path (usually this will happen for unichars\n    // that are compounding operators, like \"-\" and \"/\").\n    if (!compound && curr_vse->dawg_info && curr_vse->dawg_info->permuter == COMPOUND_PERM) {\n      compound = true;\n    }\n\n    // Update curr_* pointers.\n    curr_vse = curr_vse->parent_vse;\n 
   if (curr_vse == nullptr) {\n      break;\n    }\n    curr_b = curr_vse->curr_b;\n  }\n  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.\n  ASSERT_HOST(total_blobs == word_res->ratings->dimension());\n  // Re-adjust shape cost to include the updated width-to-height variance.\n  if (full_wh_ratio_mean != 0.0f) {\n    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;\n  }\n\n  word->set_rating(vse->ratings_sum);\n  word->set_certainty(vse->min_certainty);\n  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),\n                      vse->consistency_info.BodyMaxXHeight());\n  if (vse->dawg_info != nullptr) {\n    word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);\n  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {\n    word->set_permuter(NGRAM_PERM);\n  } else if (vse->top_choice_flags) {\n    word->set_permuter(TOP_CHOICE_PERM);\n  } else {\n    word->set_permuter(NO_PERM);\n  }\n  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true, word_res->ratings));\n  return word;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/language_model.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        language_model.h\n// Description: Functions that utilize the knowledge about the properties,\n//              structure and statistics of the language to help segmentation\n//              search.\n// Author:      Daria Antonova\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_\n#define TESSERACT_WORDREC_LANGUAGE_MODEL_H_\n\n#include \"associate.h\"      // for AssociateStats (ptr only), AssociateUtils\n#include \"dawg.h\"           // for DawgPositionVector\n#include \"dict.h\"           // for DawgArgs, Dict\n#include \"lm_consistency.h\" // for LMConsistencyInfo\n#include \"lm_state.h\"       // for ViterbiStateEntry, LanguageModelFlagsType\n#include \"params.h\"         // for DoubleParam, double_VAR_H, IntParam, Boo...\n#include \"params_model.h\"   // for ParamsModel\n#include \"ratngs.h\"         // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST...\n#include \"stopper.h\"        // for DANGERR\n\n#include <cmath> // for exp\n\nnamespace tesseract {\n\nclass UNICHARSET;\nclass WERD_RES;\n\nstruct BlamerBundle;\n\ntemplate <typename T>\nclass UnicityTable;\n\nclass LMPainPoints;\nstruct FontInfo;\n\n// This class that contains the data structures and functions necessary\n// to represent and use the 
knowledge about the language.\nclass LanguageModel {\npublic:\n  // Masks for keeping track of top choices that should not be pruned out.\n  static const LanguageModelFlagsType kSmallestRatingFlag = 0x1;\n  static const LanguageModelFlagsType kLowerCaseFlag = 0x2;\n  static const LanguageModelFlagsType kUpperCaseFlag = 0x4;\n  static const LanguageModelFlagsType kDigitFlag = 0x8;\n  static const LanguageModelFlagsType kXhtConsistentFlag = 0x10;\n\n  // Denominator for normalizing per-letter ngram cost when deriving\n  // penalty adjustments.\n  static const float kMaxAvgNgramCost;\n\n  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);\n  ~LanguageModel();\n\n  // Fills the given floats array with features extracted from path represented\n  // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h\n  // for feature information.\n  // Note: the function assumes that features points to an array of size\n  // PTRAIN_NUM_FEATURE_TYPES.\n  static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[]);\n\n  // Updates data structures that are used for the duration of the segmentation\n  // search on the current word;\n  void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio,\n                   float rating_cert_scale);\n\n  // Updates language model state of the given BLOB_CHOICE_LIST (from\n  // the ratings matrix) and its parent. 
Updates pain_points if new\n  // problematic points are found in the segmentation graph.\n  //\n  // At most language_model_viterbi_list_max_size are kept in each\n  // LanguageModelState.viterbi_state_entries list.\n  // At most language_model_viterbi_list_max_num_prunable of those are prunable\n  // (non-dictionary) paths.\n  // The entries that represent dictionary word paths are kept at the front\n  // of the list.\n  // The list is ordered by cost that is computed collectively by several\n  // language model components (currently dawg and ngram components).\n  bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list,\n                   LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res,\n                   BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);\n\n  // Returns true if an acceptable best choice was discovered.\n  inline bool AcceptableChoiceFound() {\n    return acceptable_choice_found_;\n  }\n  inline void SetAcceptableChoiceFound(bool val) {\n    acceptable_choice_found_ = val;\n  }\n  // Returns the reference to ParamsModel.\n  inline ParamsModel &getParamsModel() {\n    return params_model_;\n  }\n\nprotected:\n  inline float CertaintyScore(float cert) {\n    if (language_model_use_sigmoidal_certainty) {\n      // cert is assumed to be between 0 and -dict_->certainty_scale.\n      // If you enable language_model_use_sigmoidal_certainty, you\n      // need to adjust language_model_ngram_nonmatch_score as well.\n      cert = -cert / dict_->certainty_scale;\n      return 1.0f / (1.0f + exp(10.0f * cert));\n    } else {\n      return (-1.0f / cert);\n    }\n  }\n\n  inline float ComputeAdjustment(int num_problems, float penalty) {\n    if (num_problems == 0) {\n      return 0.0f;\n    }\n    if (num_problems == 1) {\n      return penalty;\n    }\n    return (penalty + (language_model_penalty_increment * static_cast<float>(num_problems - 1)));\n  }\n\n  // Computes the 
adjustment to the ratings sum based on the given\n  // consistency_info. The paths with invalid punctuation, inconsistent\n  // case and character type are penalized proportionally to the number\n  // of inconsistencies on the path.\n  inline float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info,\n                                            const LMConsistencyInfo &consistency_info) {\n    if (dawg_info != nullptr) {\n      return ComputeAdjustment(consistency_info.NumInconsistentCase(),\n                               language_model_penalty_case) +\n             (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f);\n    }\n    return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), language_model_penalty_punc) +\n            ComputeAdjustment(consistency_info.NumInconsistentCase(), language_model_penalty_case) +\n            ComputeAdjustment(consistency_info.NumInconsistentChartype(),\n                              language_model_penalty_chartype) +\n            ComputeAdjustment(consistency_info.NumInconsistentSpaces(),\n                              language_model_penalty_spacing) +\n            (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f) +\n            (consistency_info.inconsistent_font ? 
language_model_penalty_font : 0.0f));\n  }\n\n  // Returns an adjusted ratings sum that includes inconsistency penalties,\n  // penalties for non-dictionary paths and paths with dips in ngram\n  // probability.\n  float ComputeAdjustedPathCost(ViterbiStateEntry *vse);\n\n  // Finds the first lower and upper case letter and first digit in curr_list.\n  // Uses the first character in the list in place of empty results.\n  // Returns true if both alpha and digits are found.\n  bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower,\n                             BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const;\n  // Forces there to be at least one entry in the overall set of the\n  // viterbi_state_entries of each element of parent_node that has the\n  // top_choice_flag set for lower, upper and digit using the same rules as\n  // GetTopLowerUpperDigit, setting the flag on the first found suitable\n  // candidate, whether or not the flag is set on some other parent.\n  // Returns 1 if both alpha and digits are found among the parents, -1 if no\n  // parents are found at all (a legitimate case), and 0 otherwise.\n  int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;\n\n  // Finds the next ViterbiStateEntry with which the given unichar_id can\n  // combine sensibly, taking into account any mixed alnum/mixed case\n  // situation, and whether this combination has been inspected before.\n  ViterbiStateEntry *GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc,\n                                      LanguageModelFlagsType blob_choice_flags,\n                                      const UNICHARSET &unicharset, WERD_RES *word_res,\n                                      ViterbiStateEntry_IT *vse_it,\n                                      LanguageModelFlagsType *top_choice_flags) const;\n  // Helper function that computes the cost of the path composed of the\n  // path in the given parent 
ViterbiStateEntry and the given BLOB_CHOICE.\n  // If the new path looks good enough, adds a new ViterbiStateEntry to the\n  // list of viterbi entries in the given BLOB_CHOICE and returns true.\n  bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end,\n                            int curr_col, int curr_row, BLOB_CHOICE *b,\n                            LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,\n                            LMPainPoints *pain_points, WERD_RES *word_res,\n                            BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);\n\n  // Determines whether a potential entry is a true top choice and\n  // updates changed accordingly.\n  //\n  // Note: The function assumes that b, top_choice_flags and changed\n  // are not nullptr.\n  void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse,\n                             LanguageModelState *lms);\n\n  // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and\n  // unichar from b.unichar_id(). 
Constructs and returns LanguageModelDawgInfo\n  // with updated active dawgs, constraints and permuter.\n  //\n  // Note: the caller is responsible for deleting the returned pointer.\n  LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int curr_col, int curr_row,\n                                          const BLOB_CHOICE &b,\n                                          const ViterbiStateEntry *parent_vse);\n\n  // Computes p(unichar | parent context) and records it in ngram_cost.\n  // If b.unichar_id() is an unlikely continuation of the parent context\n  // sets found_small_prob to true and returns nullptr.\n  // Otherwise creates a new LanguageModelNgramInfo entry containing the\n  // updated context (that includes b.unichar_id() at the end) and returns it.\n  //\n  // Note: the caller is responsible for deleting the returned pointer.\n  LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar, float certainty, float denom,\n                                            int curr_col, int curr_row, float outline_length,\n                                            const ViterbiStateEntry *parent_vse);\n\n  // Computes -(log(prob(classifier)) + log(prob(ngram model)))\n  // for the given unichar in the given context. 
If there are multiple\n  // unichars at one position - takes the average of their probabilities.\n  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,\n  // since probability_in_context() can only handle one at a time (while\n  // unicharset might contain ngrams and glyphs composed from multiple UTF8\n  // characters).\n  float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context,\n                         int *unichar_step_len, bool *found_small_prob, float *ngram_prob);\n\n  // Computes the normalization factors for the classifier confidences\n  // (used by ComputeNgramCost()).\n  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);\n\n  // Fills the given consistency_info based on parent_vse.consistency_info\n  // and on the consistency of the given unichar_id with parent_vse.\n  void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b,\n                           ViterbiStateEntry *parent_vse, WERD_RES *word_res,\n                           LMConsistencyInfo *consistency_info);\n\n  // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs\n  // on the path represented by the given BLOB_CHOICE and language model\n  // state entries (lmse, dse). The path is re-constructed by following\n  // the parent pointers in the lang model state entries. If the\n  // constructed WERD_CHOICE is better than the best/raw choice recorded\n  // in the best_choice_bundle, this function updates the corresponding\n  // fields and sets best_choice_bundle->updated to true.\n  void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res,\n                        BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);\n\n  // Constructs a WERD_CHOICE by tracing parent pointers starting with\n  // the given LanguageModelStateEntry. 
Returns the constructed word.\n  // Updates best_char_choices, certainties and state if they are not\n  // nullptr (best_char_choices and certainties are assumed to have the\n  // length equal to lmse->length).\n  // The caller is responsible for freeing memory associated with the\n  // returned WERD_CHOICE.\n  WERD_CHOICE *ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt,\n                             BlamerBundle *blamer_bundle, bool *truth_path);\n\n  // Wrapper around AssociateUtils::ComputeStats().\n  inline void ComputeAssociateStats(int col, int row, float max_char_wh_ratio,\n                                    ViterbiStateEntry *parent_vse, WERD_RES *word_res,\n                                    AssociateStats *associate_stats) {\n    AssociateUtils::ComputeStats(\n        col, row, (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,\n        (parent_vse != nullptr) ? parent_vse->length : 0, fixed_pitch_, max_char_wh_ratio, word_res,\n        language_model_debug_level > 2, associate_stats);\n  }\n\n  // Returns true if the path with such top_choice_flags and dawg_info\n  // could be pruned out (i.e. is neither a system/user/frequent dictionary\n  // nor a top choice path).\n  // In non-space delimited languages all paths can be \"somewhat\" dictionary\n  // words. 
In such languages we cannot do dictionary-driven path pruning,\n  // so paths with non-empty dawg_info are considered prunable.\n  inline bool PrunablePath(const ViterbiStateEntry &vse) {\n    if (vse.top_choice_flags) {\n      return false;\n    }\n    if (vse.dawg_info != nullptr &&\n        (vse.dawg_info->permuter == SYSTEM_DAWG_PERM || vse.dawg_info->permuter == USER_DAWG_PERM ||\n         vse.dawg_info->permuter == FREQ_DAWG_PERM)) {\n      return false;\n    }\n    return true;\n  }\n\n  // Returns true if the given ViterbiStateEntry represents an acceptable path.\n  inline bool AcceptablePath(const ViterbiStateEntry &vse) {\n    return (vse.dawg_info != nullptr || vse.Consistent() ||\n            (vse.ngram_info != nullptr && !vse.ngram_info->pruned));\n  }\n\npublic:\n  // Parameters.\n  INT_VAR_H(language_model_debug_level);\n  BOOL_VAR_H(language_model_ngram_on);\n  INT_VAR_H(language_model_ngram_order);\n  INT_VAR_H(language_model_viterbi_list_max_num_prunable);\n  INT_VAR_H(language_model_viterbi_list_max_size);\n  double_VAR_H(language_model_ngram_small_prob);\n  double_VAR_H(language_model_ngram_nonmatch_score);\n  BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step);\n  double_VAR_H(language_model_ngram_scale_factor);\n  double_VAR_H(language_model_ngram_rating_factor);\n  BOOL_VAR_H(language_model_ngram_space_delimited_language);\n  INT_VAR_H(language_model_min_compound_length);\n  // Penalties used for adjusting path costs and final word rating.\n  double_VAR_H(language_model_penalty_non_freq_dict_word);\n  double_VAR_H(language_model_penalty_non_dict_word);\n  double_VAR_H(language_model_penalty_punc);\n  double_VAR_H(language_model_penalty_case);\n  double_VAR_H(language_model_penalty_script);\n  double_VAR_H(language_model_penalty_chartype);\n  double_VAR_H(language_model_penalty_font);\n  double_VAR_H(language_model_penalty_spacing);\n  double_VAR_H(language_model_penalty_increment);\n  INT_VAR_H(wordrec_display_segmentations);\n  
BOOL_VAR_H(language_model_use_sigmoidal_certainty);\n\nprotected:\n  // Member Variables.\n\n  // Temporary DawgArgs struct that is re-used across different words to\n  // avoid dynamic memory re-allocation (should be cleared before each use).\n  DawgArgs dawg_args_;\n  // Scaling for recovering blob outline length from rating and certainty.\n  float rating_cert_scale_ = 0.0f;\n\n  // The following variables are set at construction time.\n\n  // Pointer to fontinfo table (not owned by LanguageModel).\n  const UnicityTable<FontInfo> *fontinfo_table_ = nullptr;\n\n  // Pointer to Dict class, that is used for querying the dictionaries\n  // (the pointer is not owned by LanguageModel).\n  Dict *dict_ = nullptr;\n\n  // TODO(daria): the following variables should become LanguageModel params\n  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.\n  //\n  // Set to true if we are dealing with fixed pitch text\n  // (set to assume_fixed_pitch_char_segment).\n  bool fixed_pitch_ = false;\n  // Max char width-to-height ratio allowed\n  // (set to segsearch_max_char_wh_ratio).\n  float max_char_wh_ratio_ = 0.0f;\n\n  // The following variables are initialized with InitForWord().\n\n  // String representation of the classification of the previous word\n  // (since this is only used by the character ngram model component,\n  // only the last language_model_ngram_order of the word are stored).\n  std::string prev_word_str_;\n  int prev_word_unichar_step_len_ = 0;\n  // Active dawg vector.\n  DawgPositionVector very_beginning_active_dawgs_; // includes continuation\n  DawgPositionVector beginning_active_dawgs_;\n  // Set to true if acceptable choice was discovered.\n  // Note: it would be nice to use this to terminate the search once an\n  // acceptable choice is found. 
However we do not do that and once an\n  // acceptable choice is found we finish looking for alternative choices\n  // in the current segmentation graph and then exit the search (no more\n  // classifications are done after an acceptable choice is found).\n  // This is needed in order to let the search find the words very close to\n  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these\n  // choices. This way the stopper will know that the best choice is not\n  // ambiguous (i.e. there are best choices in the best choice list that have\n  // ratings close to the very best one) and will be less likely to mis-adapt.\n  bool acceptable_choice_found_ = false;\n  // Set to true if a choice representing correct segmentation was explored.\n  bool correct_segmentation_explored_ = false;\n\n  // Params models containing weights for computing ViterbiStateEntry costs.\n  ParamsModel params_model_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_\n"
  },
  {
    "path": "src/wordrec/lm_consistency.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lm_consistency.cpp\n// Description: Struct for recording consistency of the paths  representing\n//              OCR hypotheses.\n// Author:      Rika Antonova\n// Created:     Mon Jun 20 11:26:43 PST 2012\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n////////////////////////////////////////////////////////////////////////\n\n#include \"lm_consistency.h\"\n\n#include \"associate.h\"\n#include \"dict.h\"\n#include \"ratngs.h\"\n\nnamespace tesseract {\n\nvoid LMConsistencyInfo::ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc) {\n  if (xht_decision == XH_INCONSISTENT) {\n    return; // It isn't going to get any better.\n  }\n\n  // Compute xheight consistency.\n  bool parent_null = xht_sp < 0;\n  int parent_sp = xht_sp;\n  // Debug strings.\n  if (b->yshift() > LMConsistencyInfo::kShiftThresh) {\n    xht_sp = LMConsistencyInfo::kSUP;\n  } else if (b->yshift() < -LMConsistencyInfo::kShiftThresh) {\n    xht_sp = LMConsistencyInfo::kSUB;\n  } else {\n    xht_sp = LMConsistencyInfo::kNORM;\n  }\n  xht_count[xht_sp]++;\n  if (is_punc) {\n    xht_count_punc[xht_sp]++;\n  }\n  if (!parent_null) {\n    xpos_entropy += abs(parent_sp - xht_sp);\n  }\n  // TODO(eger): Figure out a better way to account for small caps.\n  // For the first character not y-shifted, we only care if it is too small.\n  // Too large is 
common in drop caps and small caps.\n  // int16_t small_xht = b->min_xheight();\n  //  if (parent_vse == nullptr && sp == LanguageModelConsistencyInfo::kNORM) {\n  //  small_xht = 0;\n  // }\n  IntersectRange(b->min_xheight(), b->max_xheight(), &(xht_lo[xht_sp]), &(xht_hi[xht_sp]));\n\n  // Compute xheight inconsistency kinds.\n  if (parent_null) {\n    if (xht_count[kNORM] == 1) {\n      xht_decision = XH_GOOD;\n    } else {\n      xht_decision = XH_SUBNORMAL;\n    }\n    return;\n  }\n\n  // When we intersect the ranges of xheights in pixels for all characters in\n  // each position (subscript, normal, superscript),\n  // How much range must be left?  0? [exactly one pixel height for xheight] 1?\n  // TODO(eger): Extend this code to take a prior for the rest of the line.\n  const int kMinIntersectedXHeightRange = 0;\n  for (int i = 0; i < kNumPos; i++) {\n    if (xht_lo[i] > xht_hi[i] - kMinIntersectedXHeightRange) {\n      xht_decision = XH_INCONSISTENT;\n      return;\n    }\n  }\n\n  // Reject as improbable anything where there's much punctuation in subscript\n  // or superscript regions.\n  if (xht_count_punc[kSUB] > xht_count[kSUB] * 0.4 ||\n      xht_count_punc[kSUP] > xht_count[kSUP] * 0.4) {\n    xht_decision = XH_INCONSISTENT;\n    return;\n  }\n\n  // Now check that the subscript and superscript aren't too small relative to\n  // the mainline.\n  auto mainline_xht = static_cast<double>(xht_lo[kNORM]);\n  double kMinSizeRatio = 0.4;\n  if (mainline_xht > 0.0 && (static_cast<double>(xht_hi[kSUB]) / mainline_xht < kMinSizeRatio ||\n                             static_cast<double>(xht_hi[kSUP]) / mainline_xht < kMinSizeRatio)) {\n    xht_decision = XH_INCONSISTENT;\n    return;\n  }\n  // TODO(eger): Check into inconsistency of super/subscript y offsets.\n  if (xpos_entropy > kMaxEntropy) {\n    xht_decision = XH_INCONSISTENT;\n    return;\n  }\n  if (xht_count[kSUB] == 0 && xht_count[kSUP] == 0) {\n    xht_decision = XH_GOOD;\n    return;\n  }\n  
xht_decision = XH_SUBNORMAL;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/lm_consistency.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lm_consistency.h\n// Description: Struct for recording consistency of the paths  representing\n//              OCR hypotheses.\n// Author:      Rika Antonova\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n////////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_WORDREC_LM_CONSISTENCY_H_\n#define TESSERACT_WORDREC_LM_CONSISTENCY_H_\n\n#include <cstdint> // for INT16_MAX\n#include \"dawg.h\"  // for EDGE_REF, NO_EDGE\n#include \"dict.h\"  // for XH_GOOD, XH_INCONSISTENT, XHeightConsi...\n\nclass BLOB_CHOICE;\n\nnamespace tesseract {\n\nstatic const char *const XHeightConsistencyEnumName[] = {\n    \"XH_GOOD\",\n    \"XH_SUBNORMAL\",\n    \"XH_INCONSISTENT\",\n};\n\n// Struct for keeping track of the consistency of the path.\nstruct LMConsistencyInfo {\n  enum ChartypeEnum { CT_NONE, CT_ALPHA, CT_DIGIT, CT_OTHER };\n\n  // How much do characters have to be shifted away from normal parameters\n  // before we say they're not normal?\n  static const int kShiftThresh = 1;\n\n  // How much shifting from subscript to superscript and back\n  // before we declare shenanigans?\n  static const int kMaxEntropy = 1;\n\n  // Script positions - order important for entropy calculation.\n  static const int kSUB = 0, kNORM = 1, kSUP = 2;\n  static const int kNumPos = 3;\n\n  explicit 
LMConsistencyInfo(const LMConsistencyInfo *parent_info) {\n    if (parent_info == nullptr) {\n      // Initialize from scratch.\n      num_alphas = 0;\n      num_digits = 0;\n      num_punc = 0;\n      num_other = 0;\n      chartype = CT_NONE;\n      punc_ref = NO_EDGE;\n      invalid_punc = false;\n      num_non_first_upper = 0;\n      num_lower = 0;\n      script_id = 0;\n      inconsistent_script = false;\n      num_inconsistent_spaces = 0;\n      inconsistent_font = false;\n      // Initialize XHeight stats.\n      for (int i = 0; i < kNumPos; i++) {\n        xht_count[i] = 0;\n        xht_count_punc[i] = 0;\n        xht_lo[i] = 0;\n        xht_hi[i] = 256; // kBlnCellHeight\n      }\n      xht_sp = -1; // This invalid value indicates that there was no parent.\n      xpos_entropy = 0;\n      xht_decision = XH_GOOD;\n    } else {\n      // Copy parent info\n      *this = *parent_info;\n    }\n  }\n  inline int NumInconsistentPunc() const {\n    return invalid_punc ? num_punc : 0;\n  }\n  inline int NumInconsistentCase() const {\n    return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper;\n  }\n  inline int NumInconsistentChartype() const {\n    return (NumInconsistentPunc() + num_other +\n            ((num_alphas > num_digits) ? 
num_digits : num_alphas));\n  }\n  inline bool Consistent() const {\n    return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&\n            NumInconsistentChartype() == 0 && !inconsistent_script && !inconsistent_font &&\n            !InconsistentXHeight());\n  }\n  inline int NumInconsistentSpaces() const {\n    return num_inconsistent_spaces;\n  }\n  inline int InconsistentXHeight() const {\n    return xht_decision == XH_INCONSISTENT;\n  }\n  void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc);\n  float BodyMinXHeight() const {\n    if (InconsistentXHeight()) {\n      return 0.0f;\n    }\n    return xht_lo[kNORM];\n  }\n  float BodyMaxXHeight() const {\n    if (InconsistentXHeight()) {\n      return static_cast<float>(INT16_MAX);\n    }\n    return xht_hi[kNORM];\n  }\n\n  EDGE_REF punc_ref;\n  int num_alphas;\n  int num_digits;\n  int num_punc;\n  int num_other;\n  ChartypeEnum chartype;\n  XHeightConsistencyEnum xht_decision;\n  int num_non_first_upper;\n  int num_lower;\n  int script_id;\n  int num_inconsistent_spaces;\n  // Metrics clumped by position.\n  float xht_lo[kNumPos];\n  float xht_hi[kNumPos];\n  int16_t xht_count[kNumPos];\n  int16_t xht_count_punc[kNumPos];\n  int16_t xht_sp;\n  int16_t xpos_entropy;\n  bool invalid_punc;\n  bool inconsistent_script;\n  bool inconsistent_font;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_WORDREC_LM_CONSISTENCY_H_\n"
  },
  {
    "path": "src/wordrec/lm_pain_points.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lm_pain_points.cpp\n// Description: Functions that utilize the knowledge about the properties\n//              of the paths explored by the segmentation search in order\n//              to generate \"pain points\" - the locations in the ratings matrix\n//              which should be classified next.\n// Author:      Rika Antonova\n// Created:     Mon Jun 20 11:26:43 PST 2012\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"lm_pain_points.h\"\n\n#include \"associate.h\"\n#include \"dict.h\"\n#include \"genericheap.h\"\n#include \"lm_state.h\"\n#include \"matrix.h\"\n#include \"pageres.h\"\n\n#include <algorithm>\n\nnamespace tesseract {\n\nconst float LMPainPoints::kDefaultPainPointPriorityAdjustment = 2.0f;\nconst float LMPainPoints::kLooseMaxCharWhRatio = 2.5f;\n\nLMPainPointsType LMPainPoints::Deque(MATRIX_COORD *pp, float *priority) {\n  for (int h = 0; h < LM_PPTYPE_NUM; ++h) {\n    if (pain_points_heaps_[h].empty()) {\n      continue;\n    }\n    *priority = pain_points_heaps_[h].PeekTop().key();\n    *pp = pain_points_heaps_[h].PeekTop().data();\n    pain_points_heaps_[h].Pop(nullptr);\n    return static_cast<LMPainPointsType>(h);\n  }\n  return LM_PPTYPE_NUM;\n}\n\nvoid LMPainPoints::GenerateInitial(WERD_RES *word_res) {\n  MATRIX 
*ratings = word_res->ratings;\n  AssociateStats associate_stats;\n  for (int col = 0; col < ratings->dimension(); ++col) {\n    int row_end = std::min(ratings->dimension(), col + ratings->bandwidth() + 1);\n    for (int row = col + 1; row < row_end; ++row) {\n      MATRIX_COORD coord(col, row);\n      if (coord.Valid(*ratings) && ratings->get(col, row) != NOT_CLASSIFIED) {\n        continue;\n      }\n      // Add an initial pain point if needed.\n      if (ratings->Classified(col, row - 1, dict_->WildcardID()) ||\n          (col + 1 < ratings->dimension() &&\n           ratings->Classified(col + 1, row, dict_->WildcardID()))) {\n        GeneratePainPoint(col, row, LM_PPTYPE_SHAPE, 0.0, true, max_char_wh_ratio_, word_res);\n      }\n    }\n  }\n}\n\nvoid LMPainPoints::GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse,\n                                    WERD_RES *word_res) {\n  ViterbiStateEntry *curr_vse = vse;\n  BLOB_CHOICE *curr_b = vse->curr_b;\n  // The following pain point generation and priority calculation approaches\n  // prioritize exploring paths with low average rating of the known part of\n  // the path, while not relying on the ratings of the pieces to be combined.\n  //\n  // A pain point to combine the neighbors is generated for each pair of\n  // neighboring blobs on the path (the path is represented by vse argument\n  // given to GenerateFromPath()). The priority of each pain point is set to\n  // the average rating (per outline length) of the path, not including the\n  // ratings of the blobs to be combined.\n  // The ratings of the blobs to be combined are not used to calculate the\n  // priority, since it is not possible to determine from their magnitude\n  // whether it will be beneficial to combine the blobs. The reason is that\n  // chopped junk blobs (/ | - ') can have very good (low) ratings, however\n  // combining them will be beneficial. 
Blobs with high ratings might be\n  // over-joined pieces of characters, but also could be blobs from an unseen\n  // font or chopped pieces of complex characters.\n  while (curr_vse->parent_vse != nullptr) {\n    ViterbiStateEntry *parent_vse = curr_vse->parent_vse;\n    const MATRIX_COORD &curr_cell = curr_b->matrix_cell();\n    const MATRIX_COORD &parent_cell = parent_vse->curr_b->matrix_cell();\n    MATRIX_COORD pain_coord(parent_cell.col, curr_cell.row);\n    if (!pain_coord.Valid(*word_res->ratings) ||\n        !word_res->ratings->Classified(parent_cell.col, curr_cell.row, dict_->WildcardID())) {\n      // rat_subtr contains ratings sum of the two adjacent blobs to be merged.\n      // rat_subtr will be subtracted from the ratings sum of the path, since\n      // the blobs will be joined into a new blob, whose rating is yet unknown.\n      float rat_subtr = curr_b->rating() + parent_vse->curr_b->rating();\n      // ol_subtr contains the outline length of the blobs that will be joined.\n      float ol_subtr =\n          AssociateUtils::ComputeOutlineLength(rating_cert_scale, *curr_b) +\n          AssociateUtils::ComputeOutlineLength(rating_cert_scale, *(parent_vse->curr_b));\n      // ol_dif is the outline of the path without the two blobs to be joined.\n      float ol_dif = vse->outline_length - ol_subtr;\n      // priority is set to the average rating of the path per unit of outline,\n      // not counting the ratings of the pieces to be joined.\n      float priority = ol_dif > 0 ? 
(vse->ratings_sum - rat_subtr) / ol_dif : 0.0;\n      GeneratePainPoint(pain_coord.col, pain_coord.row, LM_PPTYPE_PATH, priority, true,\n                        max_char_wh_ratio_, word_res);\n    } else if (debug_level_ > 3) {\n      tprintf(\"NO pain point (Classified) for col=%d row=%d type=%s\\n\", pain_coord.col,\n              pain_coord.row, LMPainPointsTypeName[LM_PPTYPE_PATH]);\n      BLOB_CHOICE_IT b_it(word_res->ratings->get(pain_coord.col, pain_coord.row));\n      for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n        BLOB_CHOICE *choice = b_it.data();\n        choice->print_full();\n      }\n    }\n\n    curr_vse = parent_vse;\n    curr_b = curr_vse->curr_b;\n  }\n}\n\nvoid LMPainPoints::GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse,\n                                      WERD_RES *word_res) {\n  // Begins and ends in DANGERR vector now record the blob indices as used\n  // by the ratings matrix.\n  for (auto &&danger : fixpt) {\n    // Only use dangerous ambiguities.\n    if (danger.dangerous) {\n      GeneratePainPoint(danger.begin, danger.end - 1, LM_PPTYPE_AMBIG, vse->cost, true,\n                        kLooseMaxCharWhRatio, word_res);\n    }\n  }\n}\n\nbool LMPainPoints::GeneratePainPoint(int col, int row, LMPainPointsType pp_type,\n                                     float special_priority, bool ok_to_extend,\n                                     float max_char_wh_ratio, WERD_RES *word_res) {\n  MATRIX_COORD coord(col, row);\n  if (coord.Valid(*word_res->ratings) &&\n      word_res->ratings->Classified(col, row, dict_->WildcardID())) {\n    return false;\n  }\n  if (debug_level_ > 3) {\n    tprintf(\"Generating pain point for col=%d row=%d type=%s\\n\", col, row,\n            LMPainPointsTypeName[pp_type]);\n  }\n  // Compute associate stats.\n  AssociateStats associate_stats;\n  AssociateUtils::ComputeStats(col, row, nullptr, 0, fixed_pitch_, max_char_wh_ratio, word_res,\n                               
debug_level_, &associate_stats);\n  // For fixed-pitch fonts/languages: if the current combined blob overlaps\n  // the next blob on the right and it is ok to extend the blob, try extending\n  // the blob until there is no overlap with the next blob on the right or\n  // until the width-to-height ratio becomes too large.\n  if (ok_to_extend) {\n    while (associate_stats.bad_fixed_pitch_right_gap && row + 1 < word_res->ratings->dimension() &&\n           !associate_stats.bad_fixed_pitch_wh_ratio) {\n      AssociateUtils::ComputeStats(col, ++row, nullptr, 0, fixed_pitch_, max_char_wh_ratio,\n                                   word_res, debug_level_, &associate_stats);\n    }\n  }\n  if (associate_stats.bad_shape) {\n    if (debug_level_ > 3) {\n      tprintf(\"Discarded pain point with a bad shape\\n\");\n    }\n    return false;\n  }\n\n  // Insert the new pain point into pain_points_heap_.\n  if (pain_points_heaps_[pp_type].size() < max_heap_size_) {\n    // Compute pain point priority.\n    float priority;\n    if (pp_type == LM_PPTYPE_PATH) {\n      priority = special_priority;\n    } else {\n      priority = associate_stats.gap_sum;\n    }\n    MatrixCoordPair pain_point(priority, MATRIX_COORD(col, row));\n    pain_points_heaps_[pp_type].Push(&pain_point);\n    if (debug_level_) {\n      tprintf(\"Added pain point with priority %g\\n\", priority);\n    }\n    return true;\n  } else {\n    if (debug_level_) {\n      tprintf(\"Pain points heap is full\\n\");\n    }\n    return false;\n  }\n}\n\n/**\n * Adjusts the pain point coordinates to cope with expansion of the ratings\n * matrix due to a split of the blob with the given index.\n */\nvoid LMPainPoints::RemapForSplit(int index) {\n  for (auto &pain_points_heap : pain_points_heaps_) {\n    std::vector<MatrixCoordPair> &heap = pain_points_heap.heap();\n    for (auto &&entry : heap) {\n      entry.data().MapForSplit(index);\n    }\n  }\n}\n\n} //  namespace tesseract\n"
  },
  {
    "path": "src/wordrec/lm_pain_points.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lm_pain_points.h\n// Description: Functions that utilize the knowledge about the properties\n//              of the paths explored by the segmentation search in order\n//              to generate \"pain points\" - the locations in the ratings\n//              matrix which should be classified next.\n// Author:      Rika Antonova\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_WORDREC_PAIN_POINTS_H_\n#define TESSERACT_WORDREC_PAIN_POINTS_H_\n\n#include \"genericheap.h\" // for GenericHeap\n#include \"matrix.h\"      // for MATRIX_COORD (ptr only), MatrixCoordPair\n#include \"stopper.h\"     // for DANGERR\n\nnamespace tesseract {\n\nclass Dict;\nstruct ViterbiStateEntry;\nclass WERD_RES;\n\n// Heap of pain points used for determining where to chop/join.\nusing PainPointHeap = GenericHeap<MatrixCoordPair>;\n\n// Types of pain points (ordered in the decreasing level of importance).\nenum LMPainPointsType {\n  LM_PPTYPE_BLAMER,\n  LM_PPTYPE_AMBIG,\n  LM_PPTYPE_PATH,\n  LM_PPTYPE_SHAPE,\n\n  LM_PPTYPE_NUM\n};\n\nstatic const char *const LMPainPointsTypeName[] = {\n    \"LM_PPTYPE_BLAMER\",\n    \"LM_PPTYPE_AMBIGS\",\n    \"LM_PPTYPE_PATH\",\n    \"LM_PPTYPE_SHAPE\",\n};\n\nclass LMPainPoints {\npublic:\n  static const float 
kDefaultPainPointPriorityAdjustment;\n  // If there is a significant drop in character ngram probability or a\n  // dangerous ambiguity make the thresholds on what blob combinations\n  // can be classified looser.\n  static const float kLooseMaxCharWhRatio;\n  // Returns a description of the type of a pain point.\n  static const char *PainPointDescription(LMPainPointsType type) {\n    return LMPainPointsTypeName[type];\n  }\n\n  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)\n      : max_heap_size_(max)\n      , max_char_wh_ratio_(rat)\n      , fixed_pitch_(fp)\n      , dict_(d)\n      , debug_level_(deb) {}\n  ~LMPainPoints() = default;\n\n  // Returns true if the heap of pain points of pp_type is not empty().\n  inline bool HasPainPoints(LMPainPointsType pp_type) const {\n    return !pain_points_heaps_[pp_type].empty();\n  }\n\n  // Dequeues the next pain point from the pain points queue and copies\n  // its contents and priority to *pp and *priority.\n  // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.\n  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);\n\n  // Clears pain points heap.\n  void Clear() {\n    for (auto &pain_points_heap : pain_points_heaps_) {\n      pain_points_heap.clear();\n    }\n  }\n\n  // For each cell, generate a \"pain point\" if the cell is not classified\n  // and has a left or right neighbor that was classified.\n  void GenerateInitial(WERD_RES *word_res);\n\n  // Generate pain points from the given path.\n  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res);\n\n  // Generate pain points from dangerous ambiguities in best choice.\n  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res);\n\n  // Adds a pain point to classify chunks_record->ratings(col, row).\n  // Returns true if a new pain point was added to an appropriate heap.\n  // Pain point priority is set to special_priority for pain points of\n  // 
LM_PPTYPE_AMBIG or LM_PPTYPE_PATH, for other pain points\n  // AssociateStats::gap_sum is used.\n  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority,\n                         bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res);\n\n  // Adjusts the pain point coordinates to cope with expansion of the ratings\n  // matrix due to a split of the blob with the given index.\n  void RemapForSplit(int index);\n\nprivate:\n  // Priority queues containing pain points generated by the language model\n  // The priority is set by the language model components, adjustments like\n  // seam cost and width priority are factored into the priority.\n  PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];\n  // Maximum number of points to keep in the heap.\n  int max_heap_size_;\n  // Maximum character width/height ratio.\n  float max_char_wh_ratio_;\n  // Set to true if fixed pitch should be assumed.\n  bool fixed_pitch_;\n  // Cached pointer to dictionary.\n  const Dict *dict_;\n  // Debug level for print statements.\n  int debug_level_;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_WORDREC_PAIN_POINTS_H_\n"
  },
  {
    "path": "src/wordrec/lm_state.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lm_state.cpp\n// Description: Structures and functionality for capturing the state of\n//              segmentation search guided by the language model.\n// Author:      Rika Antonova\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"lm_state.h\"\n\nnamespace tesseract {\n\nvoid ViterbiStateEntry::Print(const char *msg) const {\n  tprintf(\"%s ViterbiStateEntry\", msg);\n  if (updated) {\n    tprintf(\"(NEW)\");\n  }\n  if (this->debug_str != nullptr) {\n    tprintf(\" str=%s\", this->debug_str->c_str());\n  }\n  tprintf(\" with ratings_sum=%.4f length=%d cost=%.6f\", this->ratings_sum, this->length,\n          this->cost);\n  if (this->top_choice_flags) {\n    tprintf(\" top_choice_flags=0x%x\", this->top_choice_flags);\n  }\n  if (!this->Consistent()) {\n    tprintf(\" inconsistent=(punc %d case %d chartype %d script %d font %d)\",\n            this->consistency_info.NumInconsistentPunc(),\n            this->consistency_info.NumInconsistentCase(),\n            this->consistency_info.NumInconsistentChartype(),\n            this->consistency_info.inconsistent_script, this->consistency_info.inconsistent_font);\n  }\n  if (this->dawg_info) {\n    tprintf(\" permuter=%d\", this->dawg_info->permuter);\n  }\n  if (this->ngram_info) {\n    
tprintf(\" ngram_cl_cost=%g context=%s ngram pruned=%d\",\n            this->ngram_info->ngram_and_classifier_cost, this->ngram_info->context.c_str(),\n            this->ngram_info->pruned);\n  }\n  if (this->associate_stats.shape_cost > 0.0f) {\n    tprintf(\" shape_cost=%g\", this->associate_stats.shape_cost);\n  }\n  tprintf(\" %s\", XHeightConsistencyEnumName[this->consistency_info.xht_decision]);\n\n  tprintf(\"\\n\");\n}\n\n/// Clears the viterbi search state back to its initial conditions.\nvoid LanguageModelState::Clear() {\n  viterbi_state_entries.clear();\n  viterbi_state_entries_prunable_length = 0;\n  viterbi_state_entries_prunable_max_cost = FLT_MAX;\n  viterbi_state_entries_length = 0;\n}\n\nvoid LanguageModelState::Print(const char *msg) {\n  tprintf(\"%s VSEs (max_cost=%g prn_len=%d tot_len=%d):\\n\", msg,\n          viterbi_state_entries_prunable_max_cost, viterbi_state_entries_prunable_length,\n          viterbi_state_entries_length);\n  ViterbiStateEntry_IT vit(&viterbi_state_entries);\n  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {\n    vit.data()->Print(\"\");\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/lm_state.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        lm_state.h\n// Description: Structures and functionality for capturing the state of\n//              segmentation search guided by the language model.\n// Author:      Rika Antonova\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_\n#define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_\n\n#include <tesseract/unichar.h> // for UNICHAR_ID\n#include \"associate.h\"         // for AssociateStats\n#include \"dawg.h\"              // for DawgPositionVector\n#include \"elst.h\"              // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK\n#include \"lm_consistency.h\"    // for LMConsistencyInfo\n#include \"ratngs.h\"            // for BLOB_CHOICE, PermuterType\n#include \"stopper.h\"           // for DANGERR\n#include \"unicharset.h\"        // for UNICHARSET\n\nnamespace tesseract {\n\n/// Used for expressing various language model flags.\nusing LanguageModelFlagsType = unsigned char;\n\n/// The following structs are used for storing the state of the language model\n/// in the segmentation search graph. 
In this graph the nodes are BLOB_CHOICEs\n/// and the links are the relationships between the underlying blobs (see\n/// segsearch.h for a more detailed description).\n///\n/// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has\n/// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi\n/// search leading up to and including this BLOB_CHOICE.\n///\n/// Each ViterbiStateEntry contains information from various components of the\n/// language model: dawgs in which the path is found, character ngram model\n/// probability of the path, script/chartype/font consistency info, state for\n/// language-specific heuristics (e.g. hyphenated and compound words,\n/// lower/upper case preferences, etc).\n///\n/// Each ViterbiStateEntry also contains the parent pointer, so that the path\n/// that it represents (WERD_CHOICE) can be constructed by following these\n/// parent pointers.\n\n/// Struct for storing additional information used by Dawg language model\n/// component. 
It stores the set of active dawgs in which the sequence of\n/// letters on a path can be found.\nstruct LanguageModelDawgInfo {\n  LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt)\n      : active_dawgs(*a), permuter(pt) {}\n  DawgPositionVector active_dawgs;\n  PermuterType permuter;\n};\n\n/// Struct for storing additional information used by Ngram language model\n/// component.\nstruct LanguageModelNgramInfo {\n  LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)\n      : context(c)\n      , context_unichar_step_len(l)\n      , pruned(p)\n      , ngram_cost(nc)\n      , ngram_and_classifier_cost(ncc) {}\n  std::string context; ///< context string\n  /// Length of the context measured by advancing using UNICHAR::utf8_step()\n  /// (should be at most the order of the character ngram model used).\n  int context_unichar_step_len;\n  /// The paths with pruned set are pruned out from the perspective of the\n  /// character ngram model. They are explored further because they represent\n  /// a dictionary match or a top choice. 
Thus ngram_info is still computed\n  /// for them in order to calculate the combined cost.\n  bool pruned;\n  /// -ln(P_ngram_model(path))\n  float ngram_cost;\n  /// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]\n  float ngram_and_classifier_cost;\n};\n\n/// Struct for storing the information about a path in the segmentation graph\n/// explored by Viterbi search.\nstruct ViterbiStateEntry : public ELIST<ViterbiStateEntry>::LINK {\n  ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol,\n                    const LMConsistencyInfo &ci, const AssociateStats &as,\n                    LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n,\n                    const char *debug_uch)\n      : curr_b(b)\n      , parent_vse(pe)\n      , competing_vse(nullptr)\n      , dawg_info(d)\n      , ngram_info(n)\n      , cost(c)\n      , ratings_sum(b->rating())\n      , min_certainty(b->certainty())\n      , adapted(b->IsAdapted())\n      , length(1)\n      , outline_length(ol)\n      , consistency_info(ci)\n      , associate_stats(as)\n      , top_choice_flags(tcf)\n      , updated(true) {\n    debug_str = (debug_uch == nullptr) ? 
nullptr : new std::string();\n    if (pe != nullptr) {\n      ratings_sum += pe->ratings_sum;\n      if (pe->min_certainty < min_certainty) {\n        min_certainty = pe->min_certainty;\n      }\n      adapted += pe->adapted;\n      length += pe->length;\n      outline_length += pe->outline_length;\n      if (debug_uch != nullptr) {\n        *debug_str += *(pe->debug_str);\n      }\n    }\n    if (debug_str != nullptr && debug_uch != nullptr) {\n      *debug_str += debug_uch;\n    }\n  }\n  ~ViterbiStateEntry() {\n    delete dawg_info;\n    delete ngram_info;\n    delete debug_str;\n  }\n  /// Comparator function for sorting ViterbiStateEntry_LISTs in\n  /// non-increasing order of costs.\n  static int Compare(const ViterbiStateEntry *ve1, const ViterbiStateEntry *ve2) {\n    return (ve1->cost < ve2->cost) ? -1 : 1;\n  }\n  inline bool Consistent() const {\n    if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) {\n      return true;\n    }\n    return consistency_info.Consistent();\n  }\n  /// Returns true if this VSE has an alphanumeric character as its classifier\n  /// result.\n  bool HasAlnumChoice(const UNICHARSET &unicharset) {\n    if (curr_b == nullptr) {\n      return false;\n    }\n    UNICHAR_ID unichar_id = curr_b->unichar_id();\n    if (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id)) {\n      return true;\n    }\n    return false;\n  }\n  void Print(const char *msg) const;\n\n  /// Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).\n  BLOB_CHOICE *curr_b;\n  ViterbiStateEntry *parent_vse;\n  /// Pointer to a case-competing ViterbiStateEntry in the same list that\n  /// represents a path ending in the same letter of the opposite case.\n  ViterbiStateEntry *competing_vse;\n\n  /// Extra information maintained by Dawg language model component\n  /// (owned by ViterbiStateEntry).\n  LanguageModelDawgInfo *dawg_info;\n\n  /// Extra information maintained by Ngram language model 
component\n  /// (owned by ViterbiStateEntry).\n  LanguageModelNgramInfo *ngram_info;\n\n  /// UTF8 string representing the path corresponding to this vse.\n  /// Populated only in when language_model_debug_level > 0.\n  std::string *debug_str;\n\n  /// The cost is an adjusted ratings sum, that is adjusted by all the language\n  /// model components that use Viterbi search.\n  float cost;\n\n  /// Various information about the characters on the path represented\n  /// by this ViterbiStateEntry.\n  float ratings_sum;                  ///< sum of ratings of character on the path\n  float min_certainty;                ///< minimum certainty on the path\n  int adapted;                        ///< number of BLOB_CHOICES from adapted templates\n  int length;                         ///< number of characters on the path\n  float outline_length;               ///< length of the outline so far\n  LMConsistencyInfo consistency_info; ///< path consistency info\n  AssociateStats associate_stats;     ///< character widths/gaps/seams\n\n  /// Flags for marking the entry as a top choice path with\n  /// the smallest rating or lower/upper case letters).\n  LanguageModelFlagsType top_choice_flags;\n\n  bool updated; ///< set to true if the entry has just been created/updated\n};\n\nELISTIZEH(ViterbiStateEntry)\n\n/// Struct to store information maintained by various language model components.\nstruct LanguageModelState {\n  LanguageModelState()\n      : viterbi_state_entries_prunable_length(0)\n      , viterbi_state_entries_prunable_max_cost(FLT_MAX)\n      , viterbi_state_entries_length(0) {}\n  ~LanguageModelState() = default;\n\n  /// Clears the viterbi search state back to its initial conditions.\n  void Clear();\n\n  void Print(const char *msg);\n\n  /// Storage for the Viterbi state.\n  ViterbiStateEntry_LIST viterbi_state_entries;\n  /// Number and max cost of prunable paths in viterbi_state_entries.\n  int viterbi_state_entries_prunable_length;\n  float 
viterbi_state_entries_prunable_max_cost;\n  /// Total number of entries in viterbi_state_entries.\n  int viterbi_state_entries_length;\n};\n\n/// Bundle together all the things pertaining to the best choice/state.\nstruct BestChoiceBundle {\n  explicit BestChoiceBundle(int matrix_dimension) : updated(false), best_vse(nullptr) {\n    beam.reserve(matrix_dimension);\n    for (int i = 0; i < matrix_dimension; ++i) {\n      beam.push_back(new LanguageModelState);\n    }\n  }\n  ~BestChoiceBundle() {\n    for (auto &state : beam) {\n      delete state;\n    }\n  }\n\n  /// Flag to indicate whether anything was changed.\n  bool updated;\n  /// Places to try to fix the word suggested by ambiguity checking.\n  DANGERR fixpt;\n  /// The beam. One LanguageModelState containing a list of ViterbiStateEntry\n  /// per row in the ratings matrix containing all VSEs whose BLOB_CHOICE is\n  /// somewhere in the corresponding row.\n  std::vector<LanguageModelState *> beam;\n  /// Best ViterbiStateEntry and BLOB_CHOICE.\n  ViterbiStateEntry *best_vse;\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_\n"
  },
  {
    "path": "src/wordrec/outlines.cpp",
    "content": "/******************************************************************************\n *\n * File:         outlines.cpp  (Formerly outlines.c)\n * Description:  Combinatorial Splitter\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n              I n c l u d e s\n----------------------------------------------------------------------*/\n#include \"outlines.h\"\n#include \"wordrec.h\"\n\nnamespace tesseract {\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n/**********************************************************************\n * near_point\n *\n * Find the point on a line segment that is closest to a point not on\n * the line segment.  Return that point in near_pt.  
Returns whether\n * near_pt was newly created.\n **********************************************************************/\nbool Wordrec::near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) {\n  TPOINT p;\n\n  float slope;\n  float intercept;\n\n  float x0 = line_pt_0->pos.x;\n  float x1 = line_pt_1->pos.x;\n  float y0 = line_pt_0->pos.y;\n  float y1 = line_pt_1->pos.y;\n\n  if (x0 == x1) {\n    /* Handle vertical line */\n    p.x = static_cast<int16_t>(x0);\n    p.y = point->pos.y;\n  } else {\n    /* Slope and intercept */\n    slope = (y0 - y1) / (x0 - x1);\n    intercept = y1 - x1 * slope;\n\n    /* Find perpendicular */\n    p.x = static_cast<int16_t>((point->pos.x + (point->pos.y - intercept) * slope) /\n                               (slope * slope + 1));\n    p.y = static_cast<int16_t>(slope * p.x + intercept);\n  }\n\n  if (is_on_line(p, line_pt_0->pos, line_pt_1->pos) && (!same_point(p, line_pt_0->pos)) &&\n      (!same_point(p, line_pt_1->pos))) {\n    /* Intersection on line */\n    *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);\n    return true;\n  } else { /* Intersection not on line */\n    *near_pt = closest(point, line_pt_0, line_pt_1);\n    return false;\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/outlines.h",
    "content": "/******************************************************************************\n *\n * File:         outlines.h\n * Description:  Combinatorial Splitter\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef OUTLINES_H\n#define OUTLINES_H\n\n#include <cmath>     // for abs\n#include \"blobs.h\"   // for TPOINT\n#include \"params.h\"  // for IntParam\n#include \"wordrec.h\" // for Wordrec\n\n/*----------------------------------------------------------------------\n              C o n s t a n t s\n----------------------------------------------------------------------*/\n#define LARGE_DISTANCE 100000 /* Used for closest dist */\n#define MIN_BLOB_SIZE 10      /* Big units */\n#define MAX_ASPECT_RATIO 2.5  /* Widest character */\n\n/*----------------------------------------------------------------------\n              M a c r o s\n----------------------------------------------------------------------*/\n/**********************************************************************\n * same_point\n *\n * Return true if the point values are the same. 
The parameters must\n * be of type POINT.\n **********************************************************************/\n#define same_point(p1, p2) \\\n  ((abs(p1.x - p2.x) < chop_same_distance) && (abs(p1.y - p2.y) < chop_same_distance))\n\n/**********************************************************************\n * dist_square\n *\n * Return the square of the distance between these two points.  The\n * parameters must be of type POINT.\n **********************************************************************/\n\n#define dist_square(p1, p2) ((p2.x - p1.x) * (p2.x - p1.x) + (p2.y - p1.y) * (p2.y - p1.y))\n\n/**********************************************************************\n * closest\n *\n * The expression provides the EDGEPT that is closest to the point in\n * question.  All three parameters must be of type EDGEPT.\n **********************************************************************/\n\n#define closest(test_p, p1, p2)                                                                   \\\n  (p1 ? (p2 ? ((dist_square(test_p->pos, p1->pos) < dist_square(test_p->pos, p2->pos)) ? 
p1 : p2) \\\n            : p1)                                                                                 \\\n      : p2)\n\n/**********************************************************************\n * edgept_dist\n *\n * Return the distance (squared) between the two edge points.\n **********************************************************************/\n\n#define edgept_dist(p1, p2) (dist_square((p1)->pos, (p2)->pos))\n\n/**********************************************************************\n * is_exterior_point\n *\n * Return true if the point supplied is an exterior projection from the\n * outline.\n **********************************************************************/\n\n#define is_exterior_point(edge, point)                                                   \\\n  (same_point(edge->prev->pos, point->pos) || same_point(edge->next->pos, point->pos) || \\\n   (angle_change(edge->prev, edge, edge->next) - angle_change(edge->prev, edge, point) > 20))\n\n/**********************************************************************\n * is_equal\n *\n * Return true if the POINTs are equal.\n **********************************************************************/\n\n#define is_equal(p1, p2) (((p1).x == (p2).x) && ((p1).y == (p2).y))\n\n/**********************************************************************\n * is_on_line\n *\n * Return true if the point is on the line segment between the two end\n * points.  The two end points are included as part of the  line.  
The\n * parameters must be of type POINT.\n **********************************************************************/\n\n#define is_on_line(p, p0, p1) \\\n  (within_range((p).x, (p0).x, (p1).x) && within_range((p).y, (p0).y, (p1).y))\n\n/**********************************************************************\n * within_range\n *\n * Return true if the first number is in between the second two numbers.\n * Return false otherwise.\n **********************************************************************/\n\n#define within_range(x, x0, x1) (((x0 <= x) && (x <= x1)) || ((x1 <= x) && (x <= x0)))\n\n#endif\n"
  },
  {
    "path": "src/wordrec/params_model.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        params_model.cpp\n// Description: Trained language model parameters.\n// Author:      David Eger\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"params_model.h\"\n\n#include <cctype>\n#include <cmath>\n#include <cstdio>\n\n#include \"bitvector.h\"\n#include \"helpers.h\"   // for ClipToRange\n#include \"serialis.h\"  // for TFile\n#include \"tprintf.h\"\n\nnamespace tesseract {\n\n// Scale factor to apply to params model scores.\nstatic const float kScoreScaleFactor = 100.0f;\n// Minimum cost result to return.\nstatic const float kMinFinalCost = 0.001f;\n// Maximum cost result to return.\nstatic const float kMaxFinalCost = 100.0f;\n\nvoid ParamsModel::Print() {\n  for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) {\n    tprintf(\"ParamsModel for pass %d lang %s\\n\", p, lang_.c_str());\n    for (unsigned i = 0; i < weights_vec_[p].size(); ++i) {\n      tprintf(\"%s = %g\\n\", kParamsTrainingFeatureTypeName[i], weights_vec_[p][i]);\n    }\n  }\n}\n\nvoid ParamsModel::Copy(const ParamsModel &other_model) {\n  for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) {\n    weights_vec_[p] = other_model.weights_for_pass(static_cast<PassEnum>(p));\n  }\n}\n\n// Given a (modifiable) line, parse out a key / value pair.\n// Return true on success.\nbool 
ParamsModel::ParseLine(char *line, char **key, float *val) {\n  if (line[0] == '#') {\n    return false;\n  }\n  int end_of_key = 0;\n  while (line[end_of_key] && !(isascii(line[end_of_key]) && isspace(line[end_of_key]))) {\n    end_of_key++;\n  }\n  if (!line[end_of_key]) {\n    tprintf(\"ParamsModel::Incomplete line %s\\n\", line);\n    return false;\n  }\n  line[end_of_key++] = 0;\n  *key = line;\n  if (sscanf(line + end_of_key, \" %f\", val) != 1) {\n    return false;\n  }\n  return true;\n}\n\n// Applies params model weights to the given features.\n// Assumes that features is an array of size PTRAIN_NUM_FEATURE_TYPES.\n// The cost is set to a number that can be multiplied by the outline length,\n// as with the old ratings scheme. This enables words of different length\n// and combinations of words to be compared meaningfully.\nfloat ParamsModel::ComputeCost(const float features[]) const {\n  float unnorm_score = 0.0;\n  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {\n    unnorm_score += weights_vec_[pass_][f] * features[f];\n  }\n  return ClipToRange(-unnorm_score / kScoreScaleFactor, kMinFinalCost, kMaxFinalCost);\n}\n\nbool ParamsModel::Equivalent(const ParamsModel &that) const {\n  float epsilon = 0.0001f;\n  for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) {\n    if (weights_vec_[p].size() != that.weights_vec_[p].size()) {\n      return false;\n    }\n    for (unsigned i = 0; i < weights_vec_[p].size(); i++) {\n      if (weights_vec_[p][i] != that.weights_vec_[p][i] &&\n          std::fabs(weights_vec_[p][i] - that.weights_vec_[p][i]) > epsilon) {\n        return false;\n      }\n    }\n  }\n  return true;\n}\n\nbool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {\n  const int kMaxLineSize = 100;\n  char line[kMaxLineSize];\n  BitVector present;\n  present.Init(PTRAIN_NUM_FEATURE_TYPES);\n  lang_ = lang;\n  // Load weights for passes with adaption on.\n  std::vector<float> &weights = weights_vec_[pass_];\n  weights.clear();\n  
weights.resize(PTRAIN_NUM_FEATURE_TYPES, 0.0f);\n\n  while (fp->FGets(line, kMaxLineSize) != nullptr) {\n    char *key = nullptr;\n    float value;\n    if (!ParseLine(line, &key, &value)) {\n      continue;\n    }\n    int idx = ParamsTrainingFeatureByName(key);\n    if (idx < 0) {\n      tprintf(\"ParamsModel::Unknown parameter %s\\n\", key);\n      continue;\n    }\n    if (!present[idx]) {\n      present.SetValue(idx, true);\n    }\n    weights[idx] = value;\n  }\n  bool complete = (present.NumSetBits() == PTRAIN_NUM_FEATURE_TYPES);\n  if (!complete) {\n    for (int i = 0; i < PTRAIN_NUM_FEATURE_TYPES; i++) {\n      if (!present[i]) {\n        tprintf(\"Missing field %s.\\n\", kParamsTrainingFeatureTypeName[i]);\n      }\n    }\n    lang_ = \"\";\n    weights.clear();\n  }\n  return complete;\n}\n\nbool ParamsModel::SaveToFile(const char *full_path) const {\n  const std::vector<float> &weights = weights_vec_[pass_];\n  if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) {\n    tprintf(\"Refusing to save ParamsModel that has not been initialized.\\n\");\n    return false;\n  }\n  FILE *fp = fopen(full_path, \"wb\");\n  if (!fp) {\n    tprintf(\"Could not open %s for writing.\\n\", full_path);\n    return false;\n  }\n  bool all_good = true;\n  for (unsigned i = 0; i < weights.size(); i++) {\n    if (fprintf(fp, \"%s %f\\n\", kParamsTrainingFeatureTypeName[i], weights[i]) < 0) {\n      all_good = false;\n    }\n  }\n  fclose(fp);\n  return all_good;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/params_model.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        params_model.h\n// Description: Trained feature serialization for language parameter training.\n// Author:      David Eger\n//\n// (C) Copyright 2011, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_WORDREC_PARAMS_MODEL_H_\n#define TESSERACT_WORDREC_PARAMS_MODEL_H_\n\n#include <tesseract/export.h>        // for TESS_API\n#include \"params_training_featdef.h\" // for PTRAIN_NUM_FEATURE_TYPES\n\nnamespace tesseract {\n\nclass TFile;\n\n// Represents the learned weights for a given language.\nclass TESS_API ParamsModel {\npublic:\n  // Enum for expressing OCR pass.\n  enum PassEnum {\n    PTRAIN_PASS1,\n    PTRAIN_PASS2,\n\n    PTRAIN_NUM_PASSES\n  };\n\n  ParamsModel() : pass_(PTRAIN_PASS1) {}\n  ParamsModel(const char *lang, const std::vector<float> &weights)\n      : lang_(lang), pass_(PTRAIN_PASS1) {\n    weights_vec_[pass_] = weights;\n  }\n  inline bool Initialized() {\n    return weights_vec_[pass_].size() == PTRAIN_NUM_FEATURE_TYPES;\n  }\n  // Prints out feature weights.\n  void Print();\n  // Clears weights for all passes.\n  void Clear() {\n    for (auto &p : weights_vec_) {\n      p.clear();\n    }\n  }\n  // Copies the weights of the given params model.\n  void Copy(const ParamsModel &other_model);\n  // Applies params model weights to the 
given features.\n  // Assumes that features is an array of size PTRAIN_NUM_FEATURE_TYPES.\n  float ComputeCost(const float features[]) const;\n  bool Equivalent(const ParamsModel &that) const;\n\n  // Returns true on success.\n  bool SaveToFile(const char *full_path) const;\n\n  // Returns true on success.\n  bool LoadFromFp(const char *lang, TFile *fp);\n\n  const std::vector<float> &weights() const {\n    return weights_vec_[pass_];\n  }\n  const std::vector<float> &weights_for_pass(PassEnum pass) const {\n    return weights_vec_[pass];\n  }\n  void SetPass(PassEnum pass) {\n    pass_ = pass;\n  }\n\nprivate:\n  bool ParseLine(char *line, char **key, float *val);\n\n  std::string lang_;\n  // Set to the current pass type and used to determine which set of weights\n  // should be used for ComputeCost() and other functions.\n  PassEnum pass_;\n  // Several sets of weights for various OCR passes (e.g. pass1 with adaption,\n  // pass2 without adaption, etc).\n  std::vector<float> weights_vec_[PTRAIN_NUM_PASSES];\n};\n\n} // namespace tesseract\n\n#endif // TESSERACT_WORDREC_PARAMS_MODEL_H_\n"
  },
  {
    "path": "src/wordrec/pieces.cpp",
    "content": "/******************************************************************************\n *\n * File:         pieces.cpp\n * Description:\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1987, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n          I n c l u d e s\n----------------------------------------------------------------------*/\n\n#include \"blobs.h\"\n#include \"helpers.h\"\n#include \"matrix.h\"\n#include \"ratngs.h\"\n#include \"seam.h\"\n#include \"wordrec.h\"\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\nusing tesseract::ScoredFont;\n\n/*----------------------------------------------------------------------\n          F u n c t i o n s\n----------------------------------------------------------------------*/\n\n/**********************************************************************\n * classify_piece\n *\n * Create a larger piece from a collection of smaller ones.  Classify\n * it and return the results.  
Take the large piece apart to leave\n * the collection of small pieces un modified.\n **********************************************************************/\nnamespace tesseract {\nBLOB_CHOICE_LIST *Wordrec::classify_piece(const std::vector<SEAM *> &seams, int16_t start,\n                                          int16_t end, const char *description, TWERD *word,\n                                          BlamerBundle *blamer_bundle) {\n  if (end > start) {\n    SEAM::JoinPieces(seams, word->blobs, start, end);\n  }\n  BLOB_CHOICE_LIST *choices =\n      classify_blob(word->blobs[start], description, ScrollView::WHITE, blamer_bundle);\n  // Set the matrix_cell_ entries in all the BLOB_CHOICES.\n  BLOB_CHOICE_IT bc_it(choices);\n  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {\n    bc_it.data()->set_matrix_cell(start, end);\n  }\n\n  if (end > start) {\n    SEAM::BreakPieces(seams, word->blobs, start, end);\n  }\n\n  return (choices);\n}\n\ntemplate <class BLOB_CHOICE>\nint SortByUnicharID(const void *void1, const void *void2) {\n  const BLOB_CHOICE *p1 = *static_cast<const BLOB_CHOICE *const *>(void1);\n  const BLOB_CHOICE *p2 = *static_cast<const BLOB_CHOICE *const *>(void2);\n\n  return p1->unichar_id() - p2->unichar_id();\n}\n\ntemplate <class BLOB_CHOICE>\nint SortByRating(const void *void1, const void *void2) {\n  const BLOB_CHOICE *p1 = *static_cast<const BLOB_CHOICE *const *>(void1);\n  const BLOB_CHOICE *p2 = *static_cast<const BLOB_CHOICE *const *>(void2);\n\n  if (p1->rating() < p2->rating()) {\n    return 1;\n  }\n  return -1;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/plotedges.cpp",
    "content": "/******************************************************************************\n *\n * File:         plotedges.cpp  (Formerly plotedges.c)\n * Description:  Graphics routines for \"Edges\" and \"Outlines\" windows\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"plotedges.h\"\n\n#include \"render.h\"\n#include \"split.h\"\n\n#ifndef GRAPHICS_DISABLED\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------\n              V a r i a b l e s\n----------------------------------------------------------------------*/\nScrollView *edge_window = nullptr;\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n/**********************************************************************\n * display_edgepts\n *\n * Macro to display edge points in a window.\n **********************************************************************/\nvoid display_edgepts(LIST outlines) {\n  /* Set up window */\n  if (edge_window == nullptr) {\n    edge_window = new 
ScrollView(\"Edges\", 750, 150, 400, 128, 800, 256, true);\n  } else {\n    edge_window->Clear();\n  }\n  /* Render the outlines */\n  auto window = edge_window;\n  /* Reclaim old memory */\n  iterate(outlines) {\n    render_edgepts(window, reinterpret_cast<EDGEPT *>(outlines->first_node()), ScrollView::WHITE);\n  }\n}\n\n/**********************************************************************\n * draw_blob_edges\n *\n * Display the edges of this blob in the edges window.\n **********************************************************************/\nvoid draw_blob_edges(TBLOB *blob) {\n  if (wordrec_display_splits) {\n    LIST edge_list = NIL_LIST;\n    for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {\n      edge_list = push(edge_list, ol->loop);\n    }\n    display_edgepts(edge_list);\n    destroy(edge_list);\n  }\n}\n\n/**********************************************************************\n * mark_outline\n *\n * Make a mark on the edges window at a particular location.\n **********************************************************************/\nvoid mark_outline(EDGEPT *edgept) { /* Start of point list */\n  auto window = edge_window;\n  float x = edgept->pos.x;\n  float y = edgept->pos.y;\n\n  window->Pen(ScrollView::RED);\n  window->SetCursor(x, y);\n\n  x -= 4;\n  y -= 12;\n  window->DrawTo(x, y);\n\n  x -= 2;\n  y += 4;\n  window->DrawTo(x, y);\n\n  x -= 4;\n  y += 2;\n  window->DrawTo(x, y);\n\n  x += 10;\n  y += 6;\n  window->DrawTo(x, y);\n\n  window->Update();\n}\n\n} // namespace tesseract\n\n#endif // !GRAPHICS_DISABLED\n"
  },
  {
    "path": "src/wordrec/plotedges.h",
    "content": "/******************************************************************************\n *\n * File:         plotedges.h\n * Description:  Convert the various data type into line lists\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n#ifndef PLOTEDGES_H\n#define PLOTEDGES_H\n\n#include \"oldlist.h\" // for LIST\n\nnamespace tesseract {\n\nclass ScrollView;\n\nstruct EDGEPT;\nstruct TBLOB;\n\n/*----------------------------------------------------------------------\n              V a r i a b l e s\n----------------------------------------------------------------------*/\nextern ScrollView *edge_window; /* Window for edges */\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n---------------------------------------------------------------------*/\nvoid display_edgepts(LIST outlines);\n\nvoid draw_blob_edges(TBLOB *blob);\n\nvoid mark_outline(EDGEPT *edgept);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/wordrec/render.cpp",
    "content": "/******************************************************************************\n *\n * File:         render.cpp  (Formerly render.c)\n * Description:  Convert the various data type into line lists\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n#include \"render.h\"\n\n#include \"blobs.h\"\n\n#include <cmath>\n\nnamespace tesseract {\n\n/*----------------------------------------------------------------------\n              V a r i a b l e s\n----------------------------------------------------------------------*/\nScrollView *blob_window = nullptr;\n\nScrollView::Color color_list[] = {ScrollView::RED,  ScrollView::CYAN,  ScrollView::YELLOW,\n                                  ScrollView::BLUE, ScrollView::GREEN, ScrollView::WHITE};\n\nBOOL_VAR(wordrec_display_all_blobs, 0, \"Display Blobs\");\n\nBOOL_VAR(wordrec_blob_pause, 0, \"Blob pause\");\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\n#ifndef 
GRAPHICS_DISABLED\n/**********************************************************************\n * display_blob\n *\n * Macro to display blob in a window.\n **********************************************************************/\nvoid display_blob(TBLOB *blob, ScrollView::Color color) {\n  /* Size of drawable */\n  if (blob_window == nullptr) {\n    blob_window = new ScrollView(\"Blobs\", 520, 10, 500, 256, 2000, 256, true);\n  } else {\n    blob_window->Clear();\n  }\n\n  render_blob(blob_window, blob, color);\n}\n\n/**********************************************************************\n * render_blob\n *\n * Create a list of line segments that represent the expanded outline\n * that was supplied as input.\n **********************************************************************/\nvoid render_blob(ScrollView *window, TBLOB *blob, ScrollView::Color color) {\n  /* No outline */\n  if (!blob) {\n    return;\n  }\n\n  render_outline(window, blob->outlines, color);\n}\n\n/**********************************************************************\n * render_edgepts\n *\n * Create a list of line segments that represent the expanded outline\n * that was supplied as input.\n **********************************************************************/\nvoid render_edgepts(ScrollView *window, EDGEPT *edgept, ScrollView::Color color) {\n  if (!edgept) {\n    return;\n  }\n\n  float x = edgept->pos.x;\n  float y = edgept->pos.y;\n  EDGEPT *this_edge = edgept;\n\n  window->Pen(color);\n  window->SetCursor(x, y);\n  do {\n    this_edge = this_edge->next;\n    x = this_edge->pos.x;\n    y = this_edge->pos.y;\n    window->DrawTo(x, y);\n  } while (edgept != this_edge);\n}\n\n/**********************************************************************\n * render_outline\n *\n * Create a list of line segments that represent the expanded outline\n * that was supplied as input.\n **********************************************************************/\nvoid render_outline(ScrollView *window, TESSLINE 
*outline, ScrollView::Color color) {\n  /* No outline */\n  if (!outline) {\n    return;\n  }\n  /* Draw Compact outline */\n  if (outline->loop) {\n    render_edgepts(window, outline->loop, color);\n  }\n  /* Add on next outlines */\n  render_outline(window, outline->next, color);\n}\n\n#endif // !GRAPHICS_DISABLED\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/render.h",
    "content": "/******************************************************************************\n *\n * File:         render.h\n * Description:  Convert the various data type into line lists\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1989, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n#ifndef RENDER_H\n#define RENDER_H\n\n#include \"params.h\"     // for BOOL_VAR_H, BoolParam\n#include \"scrollview.h\" // ScrollView\n\nnamespace tesseract {\n\nstruct EDGEPT;\nstruct TBLOB;\nstruct TESSLINE;\n\n/*----------------------------------------------------------------------\n              V a r i a b l e s\n----------------------------------------------------------------------*/\nextern ScrollView *blob_window;        // Window for blobs\nextern ScrollView::Color color_list[]; // Colors for outlines\n\nextern BOOL_VAR_H(wordrec_display_all_blobs);\n\nextern BOOL_VAR_H(wordrec_blob_pause);\n\n#define NUM_COLORS 6\n\n/*----------------------------------------------------------------------\n              F u n c t i o n s\n----------------------------------------------------------------------*/\nvoid display_blob(TBLOB *blob, ScrollView::Color color);\n\nvoid render_blob(ScrollView *window, TBLOB *blob, ScrollView::Color color);\n\nvoid render_edgepts(ScrollView *window, EDGEPT *edgept, ScrollView::Color color);\n\nvoid 
render_outline(ScrollView *window, TESSLINE *outline, ScrollView::Color color);\n\n} // namespace tesseract\n\n#endif\n"
  },
  {
    "path": "src/wordrec/segsearch.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        segsearch.cpp\n// Description: Segmentation search functions.\n// Author:      Daria Antonova\n//\n// (C) Copyright 2009, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include <cstdint>          // for INT32_MAX\n#include \"blamer.h\"         // for BlamerBundle\n#include \"errcode.h\"        // for ASSERT_HOST\n#include \"lm_pain_points.h\" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi...\n#include \"lm_state.h\"       // for BestChoiceBundle, ViterbiStateEntry\n#include \"matrix.h\"         // for MATRIX_COORD, MATRIX\n#include \"pageres.h\"        // for WERD_RES\n#include \"params.h\"         // for BoolParam, IntParam, DoubleParam\n#include \"ratngs.h\"         // for BLOB_CHOICE_LIST, BLOB_CHOICE_IT\n#include \"tprintf.h\"        // for tprintf\n#include \"wordrec.h\"        // for Wordrec, SegSearchPending (ptr only)\n\nnamespace tesseract {\n\nvoid Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,\n                        BlamerBundle *blamer_bundle) {\n  LMPainPoints pain_points(segsearch_max_pain_points, segsearch_max_char_wh_ratio,\n                           assume_fixed_pitch_char_segment, &getDict(), segsearch_debug_level);\n  // Compute scaling factor that will help us recover blob outline length\n  // from classifier rating 
and certainty for the blob.\n  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;\n  std::vector<SegSearchPending> pending;\n  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle, blamer_bundle);\n\n  if (!SegSearchDone(0)) { // find a better choice\n    if (chop_enable && word_res->chopped_word != nullptr) {\n      improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle, blamer_bundle,\n                          &pain_points, &pending);\n    }\n    if (chop_debug) {\n      SEAM::PrintSeams(\"Final seam list:\", word_res->seam_array);\n    }\n\n    if (blamer_bundle != nullptr && !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {\n      blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);\n    }\n  }\n  // Keep trying to find a better path by fixing the \"pain points\".\n\n  MATRIX_COORD pain_point;\n  float pain_point_priority;\n  int num_futile_classifications = 0;\n  std::string blamer_debug;\n  while (wordrec_enable_assoc &&\n         (!SegSearchDone(num_futile_classifications) ||\n          (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()))) {\n    // Get the next valid \"pain point\".\n    bool found_nothing = true;\n    LMPainPointsType pp_type;\n    while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) != LM_PPTYPE_NUM) {\n      if (!pain_point.Valid(*word_res->ratings)) {\n        word_res->ratings->IncreaseBandSize(pain_point.row - pain_point.col + 1);\n      }\n      if (pain_point.Valid(*word_res->ratings) &&\n          !word_res->ratings->Classified(pain_point.col, pain_point.row, getDict().WildcardID())) {\n        found_nothing = false;\n        break;\n      }\n    }\n    if (found_nothing) {\n      if (segsearch_debug_level > 0) {\n        tprintf(\"Pain points queue is empty\\n\");\n      }\n      break;\n    }\n    ProcessSegSearchPainPoint(pain_point_priority, pain_point,\n                              
LMPainPoints::PainPointDescription(pp_type), &pending, word_res,\n                              &pain_points, blamer_bundle);\n\n    UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending, word_res, &pain_points,\n                         best_choice_bundle, blamer_bundle);\n    if (!best_choice_bundle->updated) {\n      ++num_futile_classifications;\n    }\n\n    if (segsearch_debug_level > 0) {\n      tprintf(\"num_futile_classifications %d\\n\", num_futile_classifications);\n    }\n\n    best_choice_bundle->updated = false; // reset updated\n\n    // See if it's time to terminate SegSearch or time for starting a guided\n    // search for the true path to find the blame for the incorrect best_choice.\n    if (SegSearchDone(num_futile_classifications) && blamer_bundle != nullptr &&\n        blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {\n      InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle, blamer_debug);\n    }\n  } // end while loop exploring alternative paths\n  if (blamer_bundle != nullptr) {\n    blamer_bundle->FinishSegSearch(word_res->best_choice, wordrec_debug_blamer, blamer_debug);\n  }\n\n  if (segsearch_debug_level > 0) {\n    tprintf(\"Done with SegSearch (AcceptableChoiceFound: %d)\\n\",\n            language_model_->AcceptableChoiceFound());\n  }\n}\n\n// Setup and run just the initial segsearch on an established matrix,\n// without doing any additional chopping or joining.\n// (Internal factored version that can be used as part of the main SegSearch.)\nvoid Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,\n                               std::vector<SegSearchPending> *pending,\n                               BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) {\n  if (segsearch_debug_level > 0) {\n    tprintf(\"Starting SegSearch on ratings matrix%s:\\n\",\n            wordrec_enable_assoc ? 
\" (with assoc)\" : \"\");\n    word_res->ratings->print(getDict().getUnicharset());\n  }\n\n  pain_points->GenerateInitial(word_res);\n\n  // Compute scaling factor that will help us recover blob outline length\n  // from classifier rating and certainty for the blob.\n  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;\n\n  language_model_->InitForWord(prev_word_best_choice_, assume_fixed_pitch_char_segment,\n                               segsearch_max_char_wh_ratio, rating_cert_scale);\n\n  // Initialize blamer-related information: map character boxes recorded in\n  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the\n  // ratings matrix. We expect this step to succeed, since when running the\n  // chopper we checked that the correct chops are present.\n  if (blamer_bundle != nullptr) {\n    blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word, wordrec_debug_blamer);\n  }\n\n  // pending[col] tells whether there is update work to do to combine\n  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].\n  // As the language model state is updated, pending entries are modified to\n  // minimize duplication of work. 
It is important that during the update the\n  // children are considered in the non-decreasing order of their column, since\n  // this guarantees that all the parents would be up to date before an update\n  // of a child is done.\n  pending->clear();\n  pending->resize(word_res->ratings->dimension(), SegSearchPending());\n\n  // Search the ratings matrix for the initial best path.\n  (*pending)[0].SetColumnClassified();\n  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res, pain_points, best_choice_bundle,\n                       blamer_bundle);\n}\n\nvoid Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col,\n                                   std::vector<SegSearchPending> *pending, WERD_RES *word_res,\n                                   LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,\n                                   BlamerBundle *blamer_bundle) {\n  MATRIX *ratings = word_res->ratings;\n  ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == pending->size());\n  ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == best_choice_bundle->beam.size());\n  for (int col = starting_col; col < ratings->dimension(); ++col) {\n    if (!(*pending)[col].WorkToDo()) {\n      continue;\n    }\n    int first_row = col;\n    int last_row = std::min(ratings->dimension() - 1, col + ratings->bandwidth() - 1);\n    if ((*pending)[col].SingleRow() >= 0) {\n      first_row = last_row = (*pending)[col].SingleRow();\n    }\n    if (segsearch_debug_level > 0) {\n      tprintf(\"\\n\\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\\n\", col, first_row,\n              last_row, (*pending)[col].IsRowJustClassified(INT32_MAX));\n    }\n    // Iterate over the pending list for this column.\n    for (int row = first_row; row <= last_row; ++row) {\n      // Update language model state of this child+parent pair.\n      BLOB_CHOICE_LIST *current_node = ratings->get(col, row);\n      LanguageModelState *parent_node = col == 0 ? 
nullptr : best_choice_bundle->beam[col - 1];\n      if (current_node != nullptr &&\n          language_model_->UpdateState((*pending)[col].IsRowJustClassified(row), col, row,\n                                       current_node, parent_node, pain_points, word_res,\n                                       best_choice_bundle, blamer_bundle) &&\n          row + 1 < ratings->dimension()) {\n        // Since the language model state of this entry changed, process all\n        // the child column.\n        (*pending)[row + 1].RevisitWholeColumn();\n        if (segsearch_debug_level > 0) {\n          tprintf(\"Added child col=%d to pending\\n\", row + 1);\n        }\n      } // end if UpdateState.\n    }   // end for row.\n  }     // end for col.\n  if (best_choice_bundle->best_vse != nullptr) {\n    ASSERT_HOST(word_res->StatesAllValid());\n    if (best_choice_bundle->best_vse->updated) {\n      pain_points->GenerateFromPath(rating_cert_scale, best_choice_bundle->best_vse, word_res);\n      if (!best_choice_bundle->fixpt.empty()) {\n        pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt, best_choice_bundle->best_vse,\n                                        word_res);\n      }\n    }\n  }\n  // The segsearch is completed. 
Reset all updated flags on all VSEs and reset\n  // all pendings.\n  for (unsigned col = 0; col < pending->size(); ++col) {\n    (*pending)[col].Clear();\n    ViterbiStateEntry_IT vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);\n    for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {\n      vse_it.data()->updated = false;\n    }\n  }\n}\n\nvoid Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,\n                                        const char *pain_point_type,\n                                        std::vector<SegSearchPending> *pending,\n                                        WERD_RES *word_res, LMPainPoints *pain_points,\n                                        BlamerBundle *blamer_bundle) {\n  if (segsearch_debug_level > 0) {\n    tprintf(\"Classifying pain point %s priority=%.4f, col=%d, row=%d\\n\", pain_point_type,\n            pain_point_priority, pain_point.col, pain_point.row);\n  }\n  ASSERT_HOST(pain_points != nullptr);\n  MATRIX *ratings = word_res->ratings;\n  // Classify blob [pain_point.col pain_point.row]\n  if (!pain_point.Valid(*ratings)) {\n    ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);\n  }\n  ASSERT_HOST(pain_point.Valid(*ratings));\n  BLOB_CHOICE_LIST *classified =\n      classify_piece(word_res->seam_array, pain_point.col, pain_point.row, pain_point_type,\n                     word_res->chopped_word, blamer_bundle);\n  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);\n  if (lst == nullptr) {\n    ratings->put(pain_point.col, pain_point.row, classified);\n  } else {\n    // We cannot delete old BLOB_CHOICEs, since they might contain\n    // ViterbiStateEntries that are parents of other \"active\" entries.\n    // Thus if the matrix cell already contains classifications we add\n    // the new ones to the beginning of the list.\n    BLOB_CHOICE_IT it(lst);\n    it.add_list_before(classified);\n    delete classified; // 
safe to delete, since empty after add_list_before()\n    classified = nullptr;\n  }\n\n  if (segsearch_debug_level > 0) {\n    print_ratings_list(\"Updated ratings matrix with a new entry:\",\n                       ratings->get(pain_point.col, pain_point.row), getDict().getUnicharset());\n    ratings->print(getDict().getUnicharset());\n  }\n\n  // Insert initial \"pain points\" to join the newly classified blob\n  // with its left and right neighbors.\n  if (classified != nullptr && !classified->empty()) {\n    if (pain_point.col > 0) {\n      pain_points->GeneratePainPoint(pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0, true,\n                                     segsearch_max_char_wh_ratio, word_res);\n    }\n    if (pain_point.row + 1 < ratings->dimension()) {\n      pain_points->GeneratePainPoint(pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0, true,\n                                     segsearch_max_char_wh_ratio, word_res);\n    }\n  }\n  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);\n}\n\n// Resets enough of the results so that the Viterbi search is re-run.\n// Needed when the n-gram model is enabled, as the multi-length comparison\n// implementation will re-value existing paths to worse values.\nvoid Wordrec::ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,\n                               std::vector<SegSearchPending> &pending) {\n  // TODO(rays) More refactoring required here.\n  // Delete existing viterbi states.\n  for (auto &col : best_choice_bundle->beam) {\n    col->Clear();\n  }\n  // Reset best_choice_bundle.\n  word_res->ClearWordChoices();\n  best_choice_bundle->best_vse = nullptr;\n  // Clear out all existing pendings and add a new one for the first column.\n  pending[0].SetColumnClassified();\n  for (auto &data : pending) {\n    data.Clear();\n  }\n}\n\nvoid Wordrec::InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,\n                                     
BlamerBundle *blamer_bundle, std::string &blamer_debug) {\n  pain_points->Clear(); // Clear pain points heap.\n  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings, getDict().WildcardID(),\n                                  wordrec_debug_blamer, blamer_debug, pain_points,\n                                  segsearch_max_char_wh_ratio, word_res);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/tface.cpp",
    "content": "/**********************************************************************\n * File:        tface.cpp  (Formerly tface.c)\n * Description: C side of the Tess/tessedit C/C++ interface.\n * Author:      Ray Smith\n *\n * (C) Copyright 1992, Hewlett-Packard Ltd.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n **********************************************************************/\n\n#include <cmath>\n\n#include \"wordrec.h\"\n\n#ifndef DISABLED_LEGACY_ENGINE\n#  include \"chop.h\"\n#  include \"featdefs.h\"\n#  include \"pageres.h\"\n#  include \"params_model.h\"\n#endif\n\nnamespace tesseract {\n\n/**\n * @name program_editup\n *\n * Initialize all the things in the program that need to be initialized.\n * init_permute determines whether to initialize the permute functions\n * and Dawg models.\n */\nvoid Wordrec::program_editup(const std::string &textbase, TessdataManager *init_classifier,\n                             TessdataManager *init_dict) {\n  if (!textbase.empty()) {\n    imagefile = textbase;\n  }\n#ifndef DISABLED_LEGACY_ENGINE\n  InitFeatureDefs(&feature_defs_);\n  InitAdaptiveClassifier(init_classifier);\n  if (init_dict) {\n    getDict().SetupForLoad(Dict::GlobalDawgCache());\n    getDict().Load(lang, init_dict);\n    getDict().FinishLoad();\n  }\n  pass2_ok_split = chop_ok_split;\n#endif // ndef DISABLED_LEGACY_ENGINE\n}\n\n/**\n * @name end_recog\n *\n * Cleanup and exit the recog program.\n */\nint Wordrec::end_recog() {\n 
 program_editdown(0);\n\n  return (0);\n}\n\n/**\n * @name program_editdown\n *\n * This function holds any necessary post processing for the Wise Owl\n * program.\n */\nvoid Wordrec::program_editdown(int32_t elapsed_time) {\n#ifndef DISABLED_LEGACY_ENGINE\n  EndAdaptiveClassifier();\n#endif // ndef DISABLED_LEGACY_ENGINE\n  getDict().End();\n}\n\n/**\n * @name dict_word()\n *\n * Test the dictionaries, returning NO_PERM (0) if not found, or one\n * of the PermuterType values if found, according to the dictionary.\n */\nint Wordrec::dict_word(const WERD_CHOICE &word) {\n  return getDict().valid_word(word);\n}\n\n#ifndef DISABLED_LEGACY_ENGINE\n\n/**\n * @name set_pass1\n *\n * Get ready to do some pass 1 stuff.\n */\nvoid Wordrec::set_pass1() {\n  chop_ok_split.set_value(70.0);\n  language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS1);\n  SetupPass1();\n}\n\n/**\n * @name set_pass2\n *\n * Get ready to do some pass 2 stuff.\n */\nvoid Wordrec::set_pass2() {\n  chop_ok_split.set_value(pass2_ok_split);\n  language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS2);\n  SetupPass2();\n}\n\n/**\n * @name cc_recog\n *\n * Recognize a word.\n */\nvoid Wordrec::cc_recog(WERD_RES *word) {\n  getDict().reset_hyphen_vars(word->word->flag(W_EOL));\n  chop_word_main(word);\n  word->DebugWordChoices(getDict().stopper_debug_level >= 1, getDict().word_to_debug.c_str());\n  ASSERT_HOST(word->StatesAllValid());\n}\n\n/**\n * @name call_matcher\n *\n * Called from Tess with a blob in tess form.\n * The blob may need rotating to the correct orientation for classification.\n */\nBLOB_CHOICE_LIST *Wordrec::call_matcher(TBLOB *tessblob) {\n  // Rotate the blob for classification if necessary.\n  TBLOB *rotated_blob = tessblob->ClassifyNormalizeIfNeeded();\n  if (rotated_blob == nullptr) {\n    rotated_blob = tessblob;\n  }\n  auto *ratings = new BLOB_CHOICE_LIST(); // matcher result\n  AdaptiveClassifier(rotated_blob, ratings);\n  if (rotated_blob != tessblob) {\n  
  delete rotated_blob;\n  }\n  return ratings;\n}\n\n#endif // ndef DISABLED_LEGACY_ENGINE\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/wordclass.cpp",
    "content": "/******************************************************************************\n *\n * File:         wordclass.cpp  (Formerly wordclass.c)\n * Description:  Word classifier\n * Author:       Mark Seaman, OCR Technology\n *\n * (c) Copyright 1990, Hewlett-Packard Company.\n ** Licensed under the Apache License, Version 2.0 (the \"License\");\n ** you may not use this file except in compliance with the License.\n ** You may obtain a copy of the License at\n ** http://www.apache.org/licenses/LICENSE-2.0\n ** Unless required by applicable law or agreed to in writing, software\n ** distributed under the License is distributed on an \"AS IS\" BASIS,\n ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n ** See the License for the specific language governing permissions and\n ** limitations under the License.\n *\n *****************************************************************************/\n/*----------------------------------------------------------------------\n          I N C L U D E S\n----------------------------------------------------------------------*/\n\n#include \"blamer.h\"  // for blamer_bundle\n#include \"params.h\"  // for BoolParam\n#include \"render.h\"  // for display_blob, blob_window, wordrec_blob_pause\n#include \"wordrec.h\" // for Wordrec\n\nstruct TBLOB;\n\n// Include automatically generated configuration file if running autoconf.\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\"\n#endif\n\n/*----------------------------------------------------------------------\n          F u n c t i o n s\n----------------------------------------------------------------------*/\nnamespace tesseract {\n/**\n * @name classify_blob\n *\n * Classify the this blob if it is not already recorded in the match\n * table. Attempt to recognize this blob as a character. The recognition\n * rating for this blob will be stored as a part of the blob. 
This value\n * will also be returned to the caller.\n * @param blob Current blob\n * @param string The string to display in ScrollView\n * @param color The colour to use when displayed with ScrollView\n */\nBLOB_CHOICE_LIST *Wordrec::classify_blob(TBLOB *blob, const char *string, ScrollView::Color color,\n                                         BlamerBundle *blamer_bundle) {\n#ifndef GRAPHICS_DISABLED\n  if (wordrec_display_all_blobs) {\n    display_blob(blob, color);\n  }\n#endif\n  // TODO(rays) collapse with call_matcher and move all to wordrec.cpp.\n  BLOB_CHOICE_LIST *choices = call_matcher(blob);\n  // If a blob with the same bounding box as one of the truth character\n  // bounding boxes is not classified as the corresponding truth character\n  // blame character classifier for incorrect answer.\n  if (blamer_bundle != nullptr) {\n    blamer_bundle->BlameClassifier(getDict().getUnicharset(), blob->bounding_box(), *choices,\n                                   wordrec_debug_blamer);\n  }\n#ifndef GRAPHICS_DISABLED\n  if (classify_debug_level && string) {\n    print_ratings_list(string, choices, getDict().getUnicharset());\n  }\n\n  if (wordrec_blob_pause) {\n    blob_window->Wait();\n  }\n#endif\n\n  return choices;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "src/wordrec/wordrec.cpp",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        wordrec.cpp\n// Description: wordrec class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#include \"wordrec.h\"\n\n#include <memory>\n\n#ifdef DISABLED_LEGACY_ENGINE\n\n#  include \"params.h\"\n\nnamespace tesseract {\nWordrec::Wordrec()\n    : // control parameters\n\n    BOOL_MEMBER(wordrec_debug_blamer, false, \"Print blamer debug messages\", params())\n    ,\n\n    BOOL_MEMBER(wordrec_run_blamer, false, \"Try to set the blame for errors\", params()) {\n  prev_word_best_choice_ = nullptr;\n}\n\n} // namespace tesseract\n\n#else // DISABLED_LEGACY_ENGINE not defined\n\n#  include \"language_model.h\"\n#  include \"params.h\"\n\nnamespace tesseract {\nWordrec::Wordrec()\n    : // control parameters\n    BOOL_MEMBER(merge_fragments_in_matrix, true,\n                \"Merge the fragments in the ratings matrix and delete them\"\n                \" after merging\",\n                params())\n    , BOOL_MEMBER(wordrec_enable_assoc, true, \"Associator Enable\", params())\n    , BOOL_MEMBER(force_word_assoc, false,\n                  \"force associator to run regardless of what enable_assoc is.\"\n                  \" This is used for CJK where component grouping is necessary.\",\n                  
CCUtil::params())\n    , INT_MEMBER(repair_unchopped_blobs, 1, \"Fix blobs that aren't chopped\", params())\n    , double_MEMBER(tessedit_certainty_threshold, -2.25, \"Good blob limit\", params())\n    , INT_MEMBER(chop_debug, 0, \"Chop debug\", params())\n    , BOOL_MEMBER(chop_enable, 1, \"Chop enable\", params())\n    , BOOL_MEMBER(chop_vertical_creep, 0, \"Vertical creep\", params())\n    , INT_MEMBER(chop_split_length, 10000, \"Split Length\", params())\n    , INT_MEMBER(chop_same_distance, 2, \"Same distance\", params())\n    , INT_MEMBER(chop_min_outline_points, 6, \"Min Number of Points on Outline\", params())\n    , INT_MEMBER(chop_seam_pile_size, 150, \"Max number of seams in seam_pile\", params())\n    , BOOL_MEMBER(chop_new_seam_pile, 1, \"Use new seam_pile\", params())\n    , INT_MEMBER(chop_inside_angle, -50, \"Min Inside Angle Bend\", params())\n    , INT_MEMBER(chop_min_outline_area, 2000, \"Min Outline Area\", params())\n    , double_MEMBER(chop_split_dist_knob, 0.5, \"Split length adjustment\", params())\n    , double_MEMBER(chop_overlap_knob, 0.9, \"Split overlap adjustment\", params())\n    , double_MEMBER(chop_center_knob, 0.15, \"Split center adjustment\", params())\n    , INT_MEMBER(chop_centered_maxwidth, 90,\n                 \"Width of (smaller) chopped blobs \"\n                 \"above which we don't care that a chop is not near the center.\",\n                 params())\n    , double_MEMBER(chop_sharpness_knob, 0.06, \"Split sharpness adjustment\", params())\n    , double_MEMBER(chop_width_change_knob, 5.0, \"Width change adjustment\", params())\n    , double_MEMBER(chop_ok_split, 100.0, \"OK split limit\", params())\n    , double_MEMBER(chop_good_split, 50.0, \"Good split limit\", params())\n    , INT_MEMBER(chop_x_y_weight, 3, \"X / Y  length weight\", params())\n    , BOOL_MEMBER(assume_fixed_pitch_char_segment, false,\n                  \"include fixed-pitch heuristics in char segmentation\", params())\n    , 
INT_MEMBER(wordrec_debug_level, 0, \"Debug level for wordrec\", params())\n    , INT_MEMBER(wordrec_max_join_chunks, 4, \"Max number of broken pieces to associate\", params())\n    , BOOL_MEMBER(wordrec_skip_no_truth_words, false,\n                  \"Only run OCR for words that had truth recorded in BlamerBundle\", params())\n    , BOOL_MEMBER(wordrec_debug_blamer, false, \"Print blamer debug messages\", params())\n    , BOOL_MEMBER(wordrec_run_blamer, false, \"Try to set the blame for errors\", params())\n    , INT_MEMBER(segsearch_debug_level, 0, \"SegSearch debug level\", params())\n    , INT_MEMBER(segsearch_max_pain_points, 2000,\n                 \"Maximum number of pain points stored in the queue\", params())\n    , INT_MEMBER(segsearch_max_futile_classifications, 20,\n                 \"Maximum number of pain point classifications per chunk that\"\n                 \" did not result in finding a better word choice.\",\n                 params())\n    , double_MEMBER(segsearch_max_char_wh_ratio, 2.0, \"Maximum character width-to-height ratio\",\n                    params())\n    , BOOL_MEMBER(save_alt_choices, true,\n                  \"Save alternative paths found during chopping\"\n                  \" and segmentation search\",\n                  params())\n    , language_model_(std::make_unique<LanguageModel>(&get_fontinfo_table(), &(getDict())))\n    , pass2_ok_split(0.0f)\n    , prev_word_best_choice_(nullptr)\n    , fill_lattice_(nullptr) {\n}\n\n} // namespace tesseract\n\n#endif // DISABLED_LEGACY_ENGINE\n"
  },
  {
    "path": "src/wordrec/wordrec.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        wordrec.h\n// Description: wordrec class.\n// Author:      Samuel Charron\n//\n// (C) Copyright 2006, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_WORDREC_WORDREC_H_\n#define TESSERACT_WORDREC_WORDREC_H_\n\n#ifdef HAVE_CONFIG_H\n#  include \"config_auto.h\" // DISABLED_LEGACY_ENGINE\n#endif\n\n#ifdef DISABLED_LEGACY_ENGINE\n\n#  include <cstdint>    // for int16_t, int32_t\n#  include \"classify.h\" // for Classify\n#  include \"params.h\"   // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...\n#  include \"ratngs.h\"   // for WERD_CHOICE\n\nnamespace tesseract {\nclass TessdataManager;\n}\n\nnamespace tesseract {\n\n/* ccmain/tstruct.cpp */\n\nclass TESS_API Wordrec : public Classify {\npublic:\n  // config parameters\n\n  BOOL_VAR_H(wordrec_debug_blamer);\n  BOOL_VAR_H(wordrec_run_blamer);\n\n  // methods\n  Wordrec();\n  virtual ~Wordrec() = default;\n\n  // tface.cpp\n  void program_editup(const std::string &textbase, TessdataManager *init_classifier,\n                      TessdataManager *init_dict);\n  void program_editdown(int32_t elapsed_time);\n  int end_recog();\n  int dict_word(const WERD_CHOICE &word);\n\n  // Member variables\n  WERD_CHOICE *prev_word_best_choice_;\n};\n\n} // namespace tesseract\n\n#else // DISABLED_LEGACY_ENGINE not 
defined\n\n#  include <memory>\n#  include \"associate.h\"\n#  include \"chop.h\"     // for PointHeap, MAX_NUM_POINTS\n#  include \"classify.h\" // for Classify\n#  include \"dict.h\"\n#  include \"elst.h\"     // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK\n#  include \"findseam.h\" // for SeamQueue, SeamPile\n#  include \"language_model.h\"\n#  include \"matrix.h\"\n#  include \"oldlist.h\" // for LIST\n#  include \"params.h\"  // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...\n#  include \"points.h\"  // for ICOORD\n#  include \"ratngs.h\"  // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI...\n#  include \"seam.h\"    // for SEAM (ptr only), PRIORITY\n#  include \"stopper.h\" // for DANGERR\n\n#  include <cstdint> // for int16_t, int32_t\n\nnamespace tesseract {\n\nclass EDGEPT_CLIST;\nclass MATRIX;\nclass TBOX;\nclass UNICHARSET;\nclass WERD_RES;\n\nclass LMPainPoints;\nclass TessdataManager;\nstruct BestChoiceBundle;\n\nstruct BlamerBundle;\nstruct EDGEPT;\nstruct MATRIX_COORD;\nstruct SPLIT;\nstruct TBLOB;\nstruct TESSLINE;\nstruct TWERD;\n\n// A class for storing which nodes are to be processed by the segmentation\n// search. There is a single SegSearchPending for each column in the ratings\n// matrix, and it indicates whether the segsearch should combine all\n// BLOB_CHOICES in the column, or just the given row with the parents\n// corresponding to *this SegSearchPending, and whether only updated parent\n// ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.\nclass SegSearchPending {\npublic:\n  SegSearchPending()\n      : classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {}\n\n  // Marks the whole column as just classified. 
Used to start a search on\n  // a newly initialized ratings matrix.\n  void SetColumnClassified() {\n    column_classified_ = true;\n  }\n  // Marks the matrix entry at the given row as just classified.\n  // Used after classifying a new matrix cell.\n  // Additional to, not overriding a previous RevisitWholeColumn.\n  void SetBlobClassified(int row) {\n    classified_row_ = row;\n  }\n  // Marks the whole column as needing work, but not just classified.\n  // Used when the parent vse list is updated.\n  // Additional to, not overriding a previous SetBlobClassified.\n  void RevisitWholeColumn() {\n    revisit_whole_column_ = true;\n  }\n\n  // Clears *this to indicate no work to do.\n  void Clear() {\n    classified_row_ = -1;\n    revisit_whole_column_ = false;\n    column_classified_ = false;\n  }\n\n  // Returns true if there are updates to do in the column that *this\n  // represents.\n  bool WorkToDo() const {\n    return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;\n  }\n  // Returns true if the given row was just classified.\n  bool IsRowJustClassified(int row) const {\n    return row == classified_row_ || column_classified_;\n  }\n  // Returns the single row to process if there is only one, otherwise -1.\n  int SingleRow() const {\n    return revisit_whole_column_ || column_classified_ ? -1 : classified_row_;\n  }\n\nprivate:\n  // If non-negative, indicates the single row in the ratings matrix that has\n  // just been classified, and so should be combined with all the parents in the\n  // column that this SegSearchPending represents.\n  // Operates independently of revisit_whole_column.\n  int classified_row_;\n  // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will\n  // be processed, but classified_row can indicate a row that is newly\n  // classified. 
Overridden if column_classified is true.\n  bool revisit_whole_column_;\n  // If column_classified is true, parent vses are processed with all rows\n  // regardless of whether they are just updated, overriding\n  // revisit_whole_column and classified_row.\n  bool column_classified_;\n};\n\n/* ccmain/tstruct.cpp *********************************************************/\nclass FRAGMENT : public ELIST<FRAGMENT>::LINK {\npublic:\n  FRAGMENT() { // constructor\n  }\n  FRAGMENT(EDGEPT *head_pt,  // start\n           EDGEPT *tail_pt); // end\n\n  ICOORD head;    // coords of start\n  ICOORD tail;    // coords of end\n  EDGEPT *headpt; // start point\n  EDGEPT *tailpt; // end point\n};\nELISTIZEH(FRAGMENT)\n\nclass TESS_API Wordrec : public Classify {\npublic:\n  // config parameters *******************************************************\n  BOOL_VAR_H(merge_fragments_in_matrix);\n  BOOL_VAR_H(wordrec_enable_assoc);\n  BOOL_VAR_H(force_word_assoc);\n  INT_VAR_H(repair_unchopped_blobs);\n  double_VAR_H(tessedit_certainty_threshold);\n  INT_VAR_H(chop_debug);\n  BOOL_VAR_H(chop_enable);\n  BOOL_VAR_H(chop_vertical_creep);\n  INT_VAR_H(chop_split_length);\n  INT_VAR_H(chop_same_distance);\n  INT_VAR_H(chop_min_outline_points);\n  INT_VAR_H(chop_seam_pile_size);\n  BOOL_VAR_H(chop_new_seam_pile);\n  INT_VAR_H(chop_inside_angle);\n  INT_VAR_H(chop_min_outline_area);\n  double_VAR_H(chop_split_dist_knob);\n  double_VAR_H(chop_overlap_knob);\n  double_VAR_H(chop_center_knob);\n  INT_VAR_H(chop_centered_maxwidth);\n  double_VAR_H(chop_sharpness_knob);\n  double_VAR_H(chop_width_change_knob);\n  double_VAR_H(chop_ok_split);\n  double_VAR_H(chop_good_split);\n  INT_VAR_H(chop_x_y_weight);\n  BOOL_VAR_H(assume_fixed_pitch_char_segment);\n  INT_VAR_H(wordrec_debug_level);\n  INT_VAR_H(wordrec_max_join_chunks);\n  BOOL_VAR_H(wordrec_skip_no_truth_words);\n  BOOL_VAR_H(wordrec_debug_blamer);\n  BOOL_VAR_H(wordrec_run_blamer);\n  INT_VAR_H(segsearch_debug_level);\n  
INT_VAR_H(segsearch_max_pain_points);\n  INT_VAR_H(segsearch_max_futile_classifications);\n  double_VAR_H(segsearch_max_char_wh_ratio);\n  BOOL_VAR_H(save_alt_choices);\n\n  // methods from wordrec/*.cpp ***********************************************\n  Wordrec();\n  ~Wordrec() override = default;\n\n  // Fills word->alt_choices with alternative paths found during\n  // chopping/segmentation search that are kept in best_choices.\n  void SaveAltChoices(const LIST &best_choices, WERD_RES *word);\n\n  // Fills character choice lattice in the given BlamerBundle\n  // using the given ratings matrix and best choice list.\n  void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,\n                   const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);\n\n  // Calls fill_lattice_ member function\n  // (assumes that fill_lattice_ is not nullptr).\n  void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,\n                       const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {\n    (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);\n  }\n\n  // tface.cpp\n  void program_editup(const std::string &textbase, TessdataManager *init_classifier,\n                      TessdataManager *init_dict);\n  void cc_recog(WERD_RES *word);\n  void program_editdown(int32_t elapsed_time);\n  void set_pass1();\n  void set_pass2();\n  int end_recog();\n  BLOB_CHOICE_LIST *call_matcher(TBLOB *blob);\n  int dict_word(const WERD_CHOICE &word);\n  // wordclass.cpp\n  BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, ScrollView::Color color,\n                                  BlamerBundle *blamer_bundle);\n\n  // segsearch.cpp\n  // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.\n  // Each entry in the matrix represents the classification choice\n  // for a chunk, i.e. 
an entry in row 2, column 1 represents the list\n  // of ratings for the chunks 1 and 2 classified as a single blob.\n  // The entries on the diagonal of the matrix are classifier choice lists\n  // for a single chunk from the maximal segmentation.\n  //\n  // The ratings matrix given to SegSearch represents the segmentation\n  // graph / trellis for the current word. The nodes in the graph are the\n  // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings\n  // matrix. The children of each node (nodes connected by outgoing links)\n  // are the entries in the column that is equal to node's row+1. The parents\n  // (nodes connected by the incoming links) are the entries in the row that\n  // is equal to the node's column-1. Here is an example ratings matrix:\n  //\n  //    0    1    2   3   4\n  //  -------------------------\n  // 0| c,(                   |\n  // 1| d    l,1              |\n  // 2|           o           |\n  // 3|              c,(      |\n  // 4|              g,y  l,1 |\n  //  -------------------------\n  //\n  // In the example above node \"o\" has children (outgoing connection to nodes)\n  // \"c\",\"(\",\"y\",\"g\" and parents (incoming connections from nodes) \"l\",\"1\",\"d\".\n  //\n  // The objective of the search is to find the least cost path, where the cost\n  // is determined by the language model components and the properties of the\n  // cut between the blobs on the path. SegSearch starts by populating the\n  // matrix with all the entries that were classified by the chopper and\n  // finding the initial best path. Based on the classifier ratings, language\n  // model scores and the properties of each cut, a list of \"pain points\" is\n  // constructed - those are the points on the path where the choices do not\n  // look consistent with the neighboring choices, the cuts look particularly\n  // problematic, or the certainties of the blobs are low. 
The most troublesome\n  // \"pain point\" is picked from the list and the new entry in the ratings\n  // matrix corresponding to this \"pain point\" is filled in. Then the language\n  // model state is updated to reflect the new classification and the new\n  // \"pain points\" are added to the list and the next most troublesome\n  // \"pain point\" is determined. This continues until either the word choice\n  // composed from the best paths in the segmentation graph is \"good enough\"\n  // (e.g. above a certain certainty threshold, is an unambiguous dictionary\n  // word, etc) or there are no more \"pain points\" to explore.\n  //\n  // If associate_blobs is set to false no new classifications will be done\n  // to combine blobs. Segmentation search will run only one \"iteration\"\n  // on the classifications already recorded in chunks_record.ratings.\n  //\n  // Note: this function assumes that word_res, best_choice_bundle arguments\n  // are not nullptr.\n  void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,\n                 BlamerBundle *blamer_bundle);\n\n  // Setup and run just the initial segsearch on an established matrix,\n  // without doing any additional chopping or joining.\n  // (Internal factored version that can be used as part of the main SegSearch.)\n  void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,\n                        std::vector<SegSearchPending> *pending,\n                        BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);\n\n  // chop.cpp\n  PRIORITY point_priority(EDGEPT *point);\n  void add_point_to_list(PointHeap *point_heap, EDGEPT *point);\n  // Returns true if the edgept supplied as input is an inside angle.  
This\n  // is determined by the angular change of the vectors from point to point.\n  bool is_inside_angle(EDGEPT *pt);\n  int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);\n  EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist);\n  void prioritize_points(TESSLINE *outline, PointHeap *points);\n  void new_min_point(EDGEPT *local_min, PointHeap *points);\n  void new_max_point(EDGEPT *local_max, PointHeap *points);\n  void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point,\n                                 EDGEPT_CLIST *new_points);\n\n  // chopper.cpp\n  SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,\n                          const std::vector<SEAM *> &seams);\n  SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,\n                           const std::vector<SEAM *> &seams);\n  SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,\n                              unsigned *blob_number);\n  SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,\n                         bool split_next_to_fragment, bool italic_blob, WERD_RES *word,\n                         unsigned *blob_number);\n  SEAM *chop_one_blob(const std::vector<TBOX> &boxes,\n                      const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,\n                      unsigned *blob_number);\n  void chop_word_main(WERD_RES *word);\n  void improve_by_chopping(float rating_cert_scale, WERD_RES *word,\n                           BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,\n                           LMPainPoints *pain_points, std::vector<SegSearchPending> *pending);\n  int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,\n                           bool split_next_to_fragment);\n  int 
select_blob_to_split_from_fixpt(DANGERR *fixpt);\n\n  // findseam.cpp\n  void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams);\n  void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority,\n                        SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile);\n  void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue);\n  SEAM *pick_good_seam(TBLOB *blob);\n  void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue,\n                       SeamPile *seam_pile, SEAM **seam, TBLOB *blob);\n  void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,\n                           EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile,\n                           SEAM **seam, TBLOB *blob);\n\n  // gradechop.cpp\n  PRIORITY grade_split_length(SPLIT *split);\n  PRIORITY grade_sharpness(SPLIT *split);\n\n  // outlines.cpp\n  bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt);\n\n  // pieces.cpp\n  virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start,\n                                           int16_t end, const char *description, TWERD *word,\n                                           BlamerBundle *blamer_bundle);\n\n  // Member variables.\n\n  std::unique_ptr<LanguageModel> language_model_;\n  PRIORITY pass2_ok_split;\n  // Stores the best choice for the previous word in the paragraph.\n  // This variable is modified by PAGE_RES_IT when iterating over\n  // words to OCR on the page.\n  WERD_CHOICE *prev_word_best_choice_;\n\n  // Function used to fill char choice lattices.\n  void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,\n                                 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);\n\nprotected:\n  inline bool SegSearchDone(int num_futile_classifications) {\n    return 
(language_model_->AcceptableChoiceFound() ||\n            num_futile_classifications >= segsearch_max_futile_classifications);\n  }\n\n  // Updates the language model state recorded for the child entries specified\n  // in pending[starting_col]. Enqueues the children of the updated entries\n  // into pending and proceeds to update (and remove from pending) all the\n  // remaining entries in pending[col] (col >= starting_col). Upon termination\n  // of this function all the pending[col] lists will be empty.\n  //\n  // The arguments:\n  //\n  // starting_col: index of the column in chunks_record->ratings from\n  // which the update should be started\n  //\n  // pending: list of entries listing chunks_record->ratings entries\n  // that should be updated\n  //\n  // pain_points: priority heap listing the pain points generated by\n  // the language model\n  //\n  // temp_pain_points: temporary storage for tentative pain points generated\n  // by the language model after a single call to LanguageModel::UpdateState()\n  // (the argument is passed in rather than created before each\n  // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)\n  //\n  // best_choice_bundle: a collection of variables that should be updated\n  // if a new best choice is found\n  //\n  void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,\n                            std::vector<SegSearchPending> *pending, WERD_RES *word_res,\n                            LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,\n                            BlamerBundle *blamer_bundle);\n\n  // Process the given pain point: classify the corresponding blob, enqueue\n  // new pain points to join the newly classified blob with its neighbors.\n  void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,\n                                 const char *pain_point_type,\n                                 std::vector<SegSearchPending> *pending, 
WERD_RES *word_res,\n                                 LMPainPoints *pain_points, BlamerBundle *blamer_bundle);\n  // Resets enough of the results so that the Viterbi search is re-run.\n  // Needed when the n-gram model is enabled, as the multi-length comparison\n  // implementation will re-value existing paths to worse values.\n  void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,\n                        std::vector<SegSearchPending> &pending);\n\n  // Add pain points for classifying blobs on the correct segmentation path\n  // (so that we can evaluate correct segmentation path and discover the reason\n  // for incorrect result).\n  void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,\n                              BlamerBundle *blamer_bundle, std::string &blamer_debug);\n};\n\n} // namespace tesseract\n\n#endif // DISABLED_LEGACY_ENGINE\n\n#endif // TESSERACT_WORDREC_WORDREC_H_\n"
  },
  {
    "path": "sw.cpp",
    "content": "void build(Solution &s)\n{\n    auto &tess = s.addProject(\"google.tesseract\", \"main\");\n    tess += Git(\"https://github.com/tesseract-ocr/tesseract\", \"\", \"{v}\");\n\n    auto cppstd = cpp17;\n\n    auto &libtesseract = tess.addTarget<LibraryTarget>(\"libtesseract\");\n    {\n        libtesseract.setChecks(\"libtesseract\");\n\n        libtesseract.PackageDefinitions = true;\n\n        libtesseract += cppstd;\n\n        libtesseract += \"TESS_API\"_api;\n        libtesseract += \"include/.*\"_rr;\n        libtesseract += \"src/.+/.*\"_rr;\n        libtesseract -= \"src/training/.*\"_rr;\n\n        libtesseract.Public += \"include\"_idir;\n        libtesseract.Protected +=\n            \"src/ccmain\"_id,\n            \"src/api\"_id,\n            \"src/dict\"_id,\n            \"src/viewer\"_id,\n            \"src/wordrec\"_id,\n            \"src/ccstruct\"_id,\n            \"src/cutil\"_id,\n            \"src/textord\"_id,\n            \"src/ccutil\"_id,\n            \"src/lstm\"_id,\n            \"src/classify\"_id,\n            \"src/arch\"_id,\n            \"src/training\"_id;\n\n        if (libtesseract.getCompilerType() == CompilerType::MSVC ||\n            libtesseract.getCompilerType() == CompilerType::ClangCl)\n        {\n            libtesseract += \"__SSE4_1__\"_def;\n            libtesseract.CompileOptions.push_back(\"-arch:AVX2\");\n\n            // openmp\n            //if (libtesseract.getOptions()[\"openmp\"] == \"true\")\n            if (0)\n            {\n                if (libtesseract.getCompilerType() == CompilerType::MSVC)\n                    libtesseract.CompileOptions.push_back(\"-openmp\");\n                else\n                    libtesseract.CompileOptions.push_back(\"-fopenmp\");\n                libtesseract += \"_OPENMP=201107\"_def;\n                if (libtesseract.getBuildSettings().Native.ConfigurationType == ConfigurationType::Debug)\n                    libtesseract += \"vcompd.lib\"_slib;\n               
 else\n                    libtesseract += \"vcomp.lib\"_slib;\n            }\n        }\n\n        auto win_or_mingw =\n            libtesseract.getBuildSettings().TargetOS.Type == OSType::Windows ||\n            libtesseract.getBuildSettings().TargetOS.Type == OSType::Mingw\n            ;\n\n        // check fma flags\n        libtesseract -= \"src/arch/dotproductfma.cpp\";\n        // check arch (arm)\n        libtesseract -= \"src/arch/dotproductneon.cpp\";\n\n        if (libtesseract.getBuildSettings().TargetOS.Type != OSType::Windows &&\n            libtesseract.getBuildSettings().TargetOS.Arch != ArchType::aarch64)\n        {\n            libtesseract[\"src/arch/dotproductavx.cpp\"].args.push_back(\"-mavx\");\n            libtesseract[\"src/arch/dotproductavx512.cpp\"].args.push_back(\"-mavx512f\");\n            libtesseract[\"src/arch/dotproductsse.cpp\"].args.push_back(\"-msse4.1\");\n            libtesseract[\"src/arch/intsimdmatrixsse.cpp\"].args.push_back(\"-msse4.1\");\n            libtesseract[\"src/arch/intsimdmatrixavx2.cpp\"].args.push_back(\"-mavx2\");\n        }\n        if (!win_or_mingw)\n        {\n#if SW_MODULE_ABI_VERSION > 29\n            if (!libtesseract.getBuildSettings().TargetOS.Android)\n#endif\n                libtesseract += \"pthread\"_slib;\n        }\n        if (libtesseract.getBuildSettings().TargetOS.Arch == ArchType::aarch64)\n        {\n            libtesseract += \"src/arch/dotproductneon.cpp\";\n        }\n\n        libtesseract.Public += \"HAVE_CONFIG_H\"_d;\n        libtesseract.Public += \"_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS=1\"_d;\n        libtesseract.Public += \"HAVE_LIBARCHIVE\"_d;\n\n        libtesseract.Public += \"org.sw.demo.danbloomberg.leptonica\"_dep;\n        libtesseract.Public += \"org.sw.demo.libarchive.libarchive\"_dep;\n\n        if (win_or_mingw)\n        {\n            libtesseract.Public += \"ws2_32.lib\"_slib;\n            libtesseract.Protected += \"NOMINMAX\"_def;\n        }\n\n        if 
(libtesseract.getCompilerType() == CompilerType::MSVC)\n            libtesseract.Protected.CompileOptions.push_back(\"-utf-8\");\n\n        libtesseract.Variables[\"TESSERACT_MAJOR_VERSION\"] = libtesseract.Variables[\"PACKAGE_MAJOR_VERSION\"];\n        libtesseract.Variables[\"TESSERACT_MINOR_VERSION\"] = libtesseract.Variables[\"PACKAGE_MINOR_VERSION\"];\n        libtesseract.Variables[\"TESSERACT_MICRO_VERSION\"] = libtesseract.Variables[\"PACKAGE_PATCH_VERSION\"];\n        libtesseract.Variables[\"TESSERACT_VERSION_STR\"] = \"master\";\n        libtesseract.configureFile(\"include/tesseract/version.h.in\", \"tesseract/version.h\");\n    }\n\n    //\n    auto &tesseract = tess.addExecutable(\"tesseract\");\n    {\n        tesseract += cppstd;\n        tesseract += \"src/tesseract.cpp\";\n        tesseract += libtesseract;\n    }\n\n    auto &svpaint = tess.addExecutable(\"svpaint\");\n    {\n        svpaint += cppstd;\n        svpaint += \"src/svpaint.cpp\";\n        svpaint += libtesseract;\n    }\n\n    auto &training = tess.addDirectory(\"training\");\n\n    //\n    auto &common_training = training.addLibrary(\"common_training\");\n    {\n        common_training += \"TESS_COMMON_TRAINING_API\"_api;\n        common_training += cppstd;\n        common_training += \"src/training/common/.*\"_rr;\n        common_training.Public += \"src/training/common\"_idir;\n        common_training.Public += libtesseract;\n    }\n\n    //\n    auto &unicharset_training = training.addLibrary(\"unicharset_training\");\n    {\n        unicharset_training += \"TESS_UNICHARSET_TRAINING_API\"_api;\n        unicharset_training += cppstd;\n        unicharset_training += \"src/training/unicharset/.*\"_rr;\n        unicharset_training.Public += \"src/training/unicharset\"_idir;\n        unicharset_training.Public += common_training;\n        unicharset_training.Public += \"org.sw.demo.unicode.icu.i18n\"_dep;\n\n        auto win_or_mingw =\n          
unicharset_training.getBuildSettings().TargetOS.Type == OSType::Windows ||\n          unicharset_training.getBuildSettings().TargetOS.Type == OSType::Mingw\n          ;\n        if (!win_or_mingw)\n          unicharset_training += \"pthread\"_slib;\n    }\n\n    //\n#define ADD_EXE(n, ...)                     \\\n    auto &n = training.addExecutable(#n);   \\\n    n += cppstd;                            \\\n    n += \"src/training/\" #n \".*\"_rr;        \\\n    n.Public += __VA_ARGS__;                \\\n    n\n\n    ADD_EXE(ambiguous_words, common_training);\n    ADD_EXE(classifier_tester, common_training);\n    ADD_EXE(combine_lang_model, unicharset_training);\n    ADD_EXE(combine_tessdata, common_training);\n    ADD_EXE(cntraining, common_training);\n    ADD_EXE(dawg2wordlist, common_training);\n    ADD_EXE(mftraining, common_training) += \"src/training/mergenf.*\"_rr;\n    ADD_EXE(shapeclustering, common_training);\n    ADD_EXE(unicharset_extractor, unicharset_training);\n    ADD_EXE(wordlist2dawg, common_training);\n    ADD_EXE(lstmeval, unicharset_training);\n    ADD_EXE(lstmtraining, unicharset_training);\n    ADD_EXE(set_unicharset_properties, unicharset_training);\n    ADD_EXE(merge_unicharsets, common_training);\n\n    //\n    auto &pango_training = training.addLibrary(\"pango_training\");\n    {\n        pango_training += \"TESS_PANGO_TRAINING_API\"_api;\n        pango_training += cppstd;\n        pango_training += \"src/training/pango/.*\"_rr;\n        pango_training.Public += \"src/training/pango\"_idir;\n        pango_training.Public += unicharset_training;\n        pango_training.Public += \"org.sw.demo.gnome.pango.pangocairo\"_dep;\n    }\n\n    ADD_EXE(text2image, pango_training);\n    {\n        text2image += cppstd;\n        text2image +=\n            \"src/training/degradeimage.cpp\",\n            \"src/training/degradeimage.h\",\n            \"src/training/text2image.cpp\"\n            ;\n    }\n\n    if 
(!s.getExternalVariables()[\"with-tests\"])\n        return;\n\n    // tests\n    {\n        auto &test = tess.addDirectory(\"test\");\n        test.Scope = TargetScope::Test;\n\n        String skipped_tests_str;\n        if (s.getExternalVariables()[\"skip-tests\"])\n            skipped_tests_str = s.getExternalVariables()[\"skip-tests\"].getValue();\n        auto skipped_tests = split_string(skipped_tests_str, \",\");\n\n        auto add_test = [&test, &s, &cppstd, &libtesseract, &pango_training, &skipped_tests](const String &name) -> decltype(auto)\n        {\n            auto &t = test.addTarget<ExecutableTarget>(name);\n            t += cppstd;\n            t += FileRegex(\"unittest\", name + \"_test.*\", false);\n            t += \"unittest\"_idir;\n\n            t += \"SW_TESTING\"_def;\n\n            auto datadir = test.SourceDir / \"tessdata_unittest\";\n            if (s.getExternalVariables()[\"test-data-dir\"])\n                datadir = fs::current_path() / s.getExternalVariables()[\"test-data-dir\"].getValue();\n            t += Definition(\"TESSBIN_DIR=\\\"\" + \"\"s + \"\\\"\");\n\n            t += Definition(\"TESTING_DIR=\\\"\" + to_printable_string(normalize_path(test.SourceDir / \"test/testing\")) + \"\\\"\");\n            t += Definition(\"TESTDATA_DIR=\\\"\" + to_printable_string(normalize_path(test.SourceDir / \"test/testdata\")) + \"\\\"\");\n\n            t += Definition(\"LANGDATA_DIR=\\\"\" + to_printable_string(normalize_path(datadir / \"langdata_lstm\")) + \"\\\"\");\n            t += Definition(\"TESSDATA_DIR=\\\"\" + to_printable_string(normalize_path(datadir / \"tessdata\")) + \"\\\"\");\n            t += Definition(\"TESSDATA_BEST_DIR=\\\"\" + to_printable_string(normalize_path(datadir / \"tessdata_best\")) + \"\\\"\");\n\n            // we push all deps to all tests simplify things\n            t += pango_training;\n            t += \"org.sw.demo.google.googletest.gmock.main\"_dep;\n            t += 
\"org.sw.demo.google.googletest.gtest.main\"_dep;\n\n            if (t.getCompilerType() == CompilerType::MSVC)\n                t.CompileOptions.push_back(\"-utf-8\");\n\n            auto win_or_mingw =\n              t.getBuildSettings().TargetOS.Type == OSType::Windows ||\n              t.getBuildSettings().TargetOS.Type == OSType::Mingw\n              ;\n            if (!win_or_mingw)\n              t += \"pthread\"_slib;\n\n            auto tst = libtesseract.addTest(t, name);\n            for (auto &st : skipped_tests)\n            {\n                std::regex r(st);\n                if (std::regex_match(name, r))\n                {\n                    tst.skip(true);\n                    break;\n                }\n            }\n\n            return t;\n        };\n\n        Strings tests\n        {\n            \"apiexample\",\n            \"applybox\",\n            \"baseapi\",\n            \"baseapi_thread\",\n            \"bitvector\",\n            \"capiexample\",\n            \"capiexample_c\",\n            \"cleanapi\",\n            \"colpartition\",\n            \"commandlineflags\",\n            \"denorm\",\n            \"equationdetect\",\n            \"fileio\",\n            \"heap\",\n            \"imagedata\",\n            \"indexmapbidi\",\n            \"intfeaturemap\",\n            \"intsimdmatrix\",\n            \"lang_model\",\n            \"layout\",\n            \"ligature_table\",\n            \"linlsq\",\n            \"list\",\n            \"lstm_recode\",\n            \"lstm_squashed\",\n            \"lstm\",\n            \"lstmtrainer\",\n            \"loadlang\",\n            \"mastertrainer\",\n            \"matrix\",\n            \"networkio\",\n            \"normstrngs\",\n            \"nthitem\",\n            \"osd\",\n            \"pagesegmode\",\n            \"pango_font_info\",\n            \"paragraphs\",\n            \"params_model\",\n            \"progress\",\n            \"qrsequence\",\n            \"recodebeam\",\n    
        \"rect\",\n            \"resultiterator\",\n            \"scanutils\",\n            \"shapetable\",\n            \"stats\",\n            \"stringrenderer\",\n            \"stridemap\",\n            \"tablefind\",\n            \"tablerecog\",\n            \"tabvector\",\n            \"textlineprojection\",\n            \"tfile\",\n            \"unichar\",\n            \"unicharcompress\",\n            \"unicharset\",\n            \"validate_grapheme\",\n            \"validate_indic\",\n            \"validate_khmer\",\n            \"validate_myanmar\",\n            \"validator\",\n        };\n        for (auto t : tests)\n            add_test(t);\n        auto &dt = add_test(\"dawg\");\n        dt += Definition(\"wordlist2dawg_prog=\\\"\" + to_printable_string(normalize_path(wordlist2dawg.getOutputFile())) + \"\\\"\");\n        dt += Definition(\"dawg2wordlist_prog=\\\"\" + to_printable_string(normalize_path(dawg2wordlist.getOutputFile())) + \"\\\"\");\n\n        auto &tw = add_test(\"tatweel\");\n        tw += \"unittest/util/.*\"_rr;\n        tw += \"unittest/third_party/utf/.*\"_rr;\n    }\n}\n\nvoid check(Checker &c)\n{\n    auto &s = c.addSet(\"libtesseract\");\n    s.checkFunctionExists(\"getline\");\n    s.checkIncludeExists(\"dlfcn.h\");\n    s.checkIncludeExists(\"inttypes.h\");\n    s.checkIncludeExists(\"memory.h\");\n    s.checkIncludeExists(\"stdint.h\");\n    s.checkIncludeExists(\"stdlib.h\");\n    s.checkIncludeExists(\"string.h\");\n    s.checkIncludeExists(\"sys/stat.h\");\n    s.checkIncludeExists(\"sys/types.h\");\n    s.checkIncludeExists(\"tiffio.h\");\n    s.checkIncludeExists(\"unistd.h\");\n    s.checkTypeSize(\"long long int\");\n    s.checkTypeSize(\"size_t\");\n    s.checkTypeSize(\"void *\");\n    s.checkTypeSize(\"wchar_t\");\n    {\n        auto &c = s.checkSymbolExists(\"snprintf\");\n        c.Parameters.Includes.push_back(\"stdio.h\");\n    }\n}\n\n"
  },
  {
    "path": "tessdata/Makefile.am",
    "content": "datadir = @datadir@/tessdata\n\ndata_DATA = pdf.ttf\nEXTRA_DIST = $(data_DATA)\n\nSUBDIRS = configs tessconfigs\n\nlangdata =\n\nuninstall-local:\n\tcd $(DESTDIR)$(datadir); \\\n\trm -f $(langdata)\n"
  },
  {
    "path": "tessdata/configs/Makefile.am",
    "content": "datadir = @datadir@/tessdata/configs\ndata_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug\ndata_DATA += api_config kannada box.train.stderr quiet logfile digits get.images\ndata_DATA += lstmbox wordstrbox\n# Configurations for OCR output.\ndata_DATA += alto hocr page pdf tsv txt\ndata_DATA += linebox rebox strokewidth bigram\nEXTRA_DIST = $(data_DATA)\n"
  },
  {
    "path": "tessdata/configs/alto",
    "content": "tessedit_create_alto 1\n"
  },
  {
    "path": "tessdata/configs/ambigs.train",
    "content": "tessedit_ambigs_training\t1\nload_freq_dawg\t0\nload_punc_dawg\t0\nload_system_dawg\t0\nload_number_dawg\t0\nambigs_debug_level\t3\nload_fixed_length_dawgs\t0\n"
  },
  {
    "path": "tessdata/configs/api_config",
    "content": "tessedit_zero_rejection T\n"
  },
  {
    "path": "tessdata/configs/bazaar",
    "content": "load_system_dawg     F\nload_freq_dawg       F\nuser_words_suffix    user-words\nuser_patterns_suffix user-patterns\n"
  },
  {
    "path": "tessdata/configs/bigram",
    "content": "load_bigram_dawg\tTrue\ntessedit_enable_bigram_correction\tTrue\ntessedit_bigram_debug\t3\nsave_raw_choices\tTrue\nsave_alt_choices\tTrue\n"
  },
  {
    "path": "tessdata/configs/box.train",
    "content": "disable_character_fragments T\nfile_type                   .bl\ntextord_fast_pitch_test\tT\ntessedit_zero_rejection T\ntessedit_minimal_rejection F\ntessedit_write_rep_codes F\nedges_children_fix F\nedges_childarea 0.65\nedges_boxarea 0.9\ntessedit_resegment_from_boxes T\ntessedit_train_from_boxes T\ntextord_no_rejects T\n"
  },
  {
    "path": "tessdata/configs/box.train.stderr",
    "content": "file_type .bl\n#tessedit_use_nn F\ntextord_fast_pitch_test T\ntessedit_zero_rejection T\ntessedit_minimal_rejection F\ntessedit_write_rep_codes F\nedges_children_fix F\nedges_childarea 0.65\nedges_boxarea 0.9\ntessedit_resegment_from_boxes T\ntessedit_train_from_boxes T\n#textord_repeat_extraction F\ntextord_no_rejects T\n"
  },
  {
    "path": "tessdata/configs/digits",
    "content": "tessedit_char_whitelist 0123456789-.\n"
  },
  {
    "path": "tessdata/configs/get.images",
    "content": "tessedit_write_images T\n"
  },
  {
    "path": "tessdata/configs/hocr",
    "content": "tessedit_create_hocr 1\nhocr_font_info 0\n"
  },
  {
    "path": "tessdata/configs/inter",
    "content": "interactive_display_mode\t\t\t\tT\ntessedit_display_outwords\t\tT\n"
  },
  {
    "path": "tessdata/configs/kannada",
    "content": "textord_skewsmooth_offset 8\ntextord_skewsmooth_offset2 8\ntextord_merge_desc 0.5\ntextord_no_rejects 1\n"
  },
  {
    "path": "tessdata/configs/linebox",
    "content": "tessedit_resegment_from_line_boxes 1\ntessedit_make_boxes_from_boxes 1\n"
  },
  {
    "path": "tessdata/configs/logfile",
    "content": "debug_file tesseract.log\n"
  },
  {
    "path": "tessdata/configs/lstm.train",
    "content": "file_type                   .bl\ntextord_fast_pitch_test\tT\ntessedit_zero_rejection T\ntessedit_minimal_rejection F\ntessedit_write_rep_codes F\nedges_children_fix F\nedges_childarea 0.65\nedges_boxarea 0.9\ntessedit_train_line_recognizer T\ntextord_no_rejects T\ntessedit_init_config_only T\n"
  },
  {
    "path": "tessdata/configs/lstmbox",
    "content": "tessedit_create_lstmbox 1\n"
  },
  {
    "path": "tessdata/configs/lstmdebug",
    "content": "stopper_debug_level 1\nclassify_debug_level 1\nsegsearch_debug_level 1\nlanguage_model_debug_level 3\n"
  },
  {
    "path": "tessdata/configs/makebox",
    "content": "tessedit_create_boxfile 1\n"
  },
  {
    "path": "tessdata/configs/page",
    "content": "tessedit_create_page_xml 1\n# page_xml_polygon 1\n# page_xml_level 0\n"
  },
  {
    "path": "tessdata/configs/pdf",
    "content": "tessedit_create_pdf 1\n"
  },
  {
    "path": "tessdata/configs/quiet",
    "content": "debug_file /dev/null\n"
  },
  {
    "path": "tessdata/configs/rebox",
    "content": "tessedit_resegment_from_boxes 1\ntessedit_make_boxes_from_boxes 1\n"
  },
  {
    "path": "tessdata/configs/strokewidth",
    "content": "textord_show_blobs 0\ntextord_debug_tabfind 3\ntextord_tabfind_show_partitions 1\ntextord_tabfind_show_initial_partitions 1\ntextord_tabfind_show_columns 1\ntextord_tabfind_show_blocks 1\ntextord_tabfind_show_initialtabs 1\ntextord_tabfind_show_finaltabs 1\ntextord_tabfind_show_strokewidths 1\ntextord_tabfind_show_vlines 0\ntextord_tabfind_show_images 1\ntessedit_dump_pageseg_images 0\n"
  },
  {
    "path": "tessdata/configs/tsv",
    "content": "tessedit_create_tsv 1\n"
  },
  {
    "path": "tessdata/configs/txt",
    "content": "# This config file should be used with other config files which create renderers.\n# usage example: tesseract eurotext.tif eurotext txt hocr pdf\ntessedit_create_txt 1\n"
  },
  {
    "path": "tessdata/configs/unlv",
    "content": "tessedit_write_unlv 1\nunlv_tilde_crunching T\n"
  },
  {
    "path": "tessdata/configs/wordstrbox",
    "content": "tessedit_create_wordstrbox 1\n"
  },
  {
    "path": "tessdata/eng.user-patterns",
    "content": "1-\\d\\d\\d-GOOG-411\nwww.\\n\\\\\\*.com\n"
  },
  {
    "path": "tessdata/eng.user-words",
    "content": "the\nquick\nbrown\nfox\njumped\n"
  },
  {
    "path": "tessdata/tessconfigs/Makefile.am",
    "content": "datadir = @datadir@/tessdata/tessconfigs\ndata_DATA = batch batch.nochop nobatch matdemo segdemo msdemo\nEXTRA_DIST = batch batch.nochop nobatch matdemo segdemo msdemo\n"
  },
  {
    "path": "tessdata/tessconfigs/batch",
    "content": "# No content needed as all defaults are correct.\n"
  },
  {
    "path": "tessdata/tessconfigs/batch.nochop",
    "content": "chop_enable 0\nwordrec_enable_assoc 0\n"
  },
  {
    "path": "tessdata/tessconfigs/matdemo",
    "content": "#################################################\n# Adaptive Matcher Using PreAdapted Templates\n#################################################\n\nclassify_enable_adaptive_debugger   1\nmatcher_debug_flags         6\nmatcher_debug_level       1\n"
  },
  {
    "path": "tessdata/tessconfigs/msdemo",
    "content": "#################################################\n# Adaptive Matcher Using PreAdapted Templates\n#################################################\n\nclassify_enable_adaptive_debugger\t1\nmatcher_debug_flags         6\nmatcher_debug_level       1\n\nwordrec_display_splits          0\nwordrec_display_all_blobs       1\nwordrec_display_segmentations   2\nclassify_debug_level\t\t1\n"
  },
  {
    "path": "tessdata/tessconfigs/nobatch",
    "content": "\n"
  },
  {
    "path": "tessdata/tessconfigs/segdemo",
    "content": "#################################################\n# Adaptive Matcher Using PreAdapted Templates\n#################################################\n\nwordrec_display_splits          0\nwordrec_display_all_blobs       1\nwordrec_display_segmentations   2\nclassify_debug_level\t\t1\nstopper_debug_level 1\n"
  },
  {
    "path": "tesseract.pc.cmake",
    "content": "prefix=@CMAKE_INSTALL_PREFIX@\nexec_prefix=@CMAKE_INSTALL_PREFIX@\nlibdir=@CMAKE_INSTALL_FULL_LIBDIR@\nincludedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@\n\nName: @tesseract_NAME@\nDescription: An OCR Engine that was developed at HP Labs (1985-1995) and Google (2006-2018).\nURL: https://github.com/tesseract-ocr/tesseract\nVersion: @tesseract_VERSION@\nRequires.private: lept\nLibs: -L${libdir} -l@tesseract_OUTPUT_NAME@ @libarchive_LIBS@ @libcurl_LIBS@\nLibs.private:\nCflags: -I${includedir}\n"
  },
  {
    "path": "tesseract.pc.in",
    "content": "prefix=@prefix@\nexec_prefix=@exec_prefix@\nbindir=@bindir@\ndatarootdir = @datarootdir@\ndatadir=@datadir@\nlibdir=@libdir@\nincludedir=@includedir@\n\nName: @PACKAGE_NAME@\nDescription: An OCR Engine that was developed at HP Labs (1985-1995) and Google (2006-2018).\nURL: https://github.com/tesseract-ocr/tesseract\nVersion: @VERSION@\nRequires.private: lept\nLibs: -L${libdir} -ltesseract @libarchive_LIBS@ @libcurl_LIBS@\nLibs.private: -lpthread\nCflags: -I${includedir}\n"
  },
  {
    "path": "unittest/CMakeLists.txt",
    "content": "# find_package(GTest REQUIRED)\ninclude(GoogleTest) # Todo install GoogleTests?\n\n# Set common include directories\nset(COMMON_INCLUDE_DIRS\n    ${CMAKE_CURRENT_BINARY_DIR}/../src/training\n    ${CMAKE_CURRENT_SOURCE_DIR}/../src/ccutil\n    ${CMAKE_CURRENT_SOURCE_DIR}/../src/ccstruct\n    ${CMAKE_CURRENT_SOURCE_DIR}/../src/viewer\n    ${CMAKE_CURRENT_SOURCE_DIR}/../include\n    ${CMAKE_CURRENT_SOURCE_DIR}/../src/training/unicharset\n    ${CMAKE_CURRENT_SOURCE_DIR}/../src/training/common\n    ${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/googlemock/include)\n\nif (MSVC)\n    set(TESSBIN_DIR ${EXECUTABLE_OUTPUT_PATH}/$<CONFIG>)\nelse()\n    set(TESSBIN_DIR ${EXECUTABLE_OUTPUT_PATH})\nendif()\n\n# Set common compile definitions\nset(COMMON_COMPILE_DEFINITIONS\n    \"-DTESTING_DIR=\\\"${CMAKE_CURRENT_SOURCE_DIR}/../test/testing\\\"\"\n    \"-DTESSDATA_DIR=\\\"${CMAKE_CURRENT_SOURCE_DIR}/../tessdata\\\"\"\n    \"-DTESSBIN_DIR=\\\"${TESSBIN_DIR}\\\"\"\n    \"-DTESTDATA_DIR=\\\"${CMAKE_CURRENT_SOURCE_DIR}/../test/testdata\\\"\"\n    \"-DLANGDATA_DIR=\\\"${CMAKE_CURRENT_SOURCE_DIR}/../langdata_lstm\\\"\")\n\nfile(\n  GLOB TEST_SOURCES\n  RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}\n  \"*.cc\")\n\nset(COMMON_LINK_LIBS libtesseract GTest::gtest_main common_training\n                     unicharset_training)\n\nset(TRAINING_TESTS\n    commandlineflags_test.cc\n    dawg_test.cc\n    lstm_recode_test.cc\n    lstm_squashed_test.cc\n    lstm_test.cc\n    lstm_test.cc\n    normstrngs_test.cc\n    unichar_test.cc\n    unicharcompress_test.cc\n    unicharset_test.cc\n    validate_grapheme_test.cc\n    validate_indic_test.cc\n    validate_khmer_test.cc\n    validate_myanmar_test.cc\n    validator_test.cc)\n\nset(PANGO_TESTS ligature_table_test.cc pango_font_info_test.cc\n                pango_font_info_test.cc stringrenderer_test.cc)\n\nset(LEGACY_TESTS\n    applybox_test.cc\n    bitvector_test.cc\n    equationdetect_test.cc\n    indexmapbidi_test.cc\n    
intfeaturemap_test.cc\n    mastertrainer_test.cc\n    osd_test.cc\n    params_model_test.cc\n    shapetable_test.cc)\n\nif(BUILD_TRAINING_TOOLS AND PANGO_FOUND)\n  list(APPEND COMMON_INCLUDE_DIRS\n       ${CMAKE_CURRENT_SOURCE_DIR}/../src/training/pango ${PANGO_INCLUDE_DIRS})\n\nelse()\n  list(REMOVE_ITEM TEST_SOURCES ${PANGO_TESTS})\nendif()\n\nif(DISABLED_LEGACY_ENGINE)\n  list(REMOVE_ITEM TEST_SOURCES ${LEGACY_TESTS})\nendif()\n\nif(NOT BUILD_TRAINING_TOOLS)\n  list(REMOVE_ITEM TEST_SOURCES ${TRAINING_TESTS})\nendif()\n\nset(TATWEEL_TEST_EXTRA_SRC util/utf8/unilib.cc util/utf8/unicodetext.cc\n                           third_party/utf/rune.c)\n\nmessage(STATUS \"Enabled tests: ${TEST_SOURCES}\")\n\nforeach(test_source IN LISTS TEST_SOURCES)\n  get_filename_component(test_name ${test_source} NAME_WE)\n  if(${test_source} IN_LIST PANGO_TESTS)\n    list(APPEND COMMON_LINK_LIBS pango_training ${PANGO_LIBRARIES})\n  endif()\n  if(${test_name} MATCHES \"tatweel_test\")\n    list(APPEND test_source ${TATWEEL_TEST_EXTRA_SRC})\n    list(APPEND COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}\n         ${CMAKE_CURRENT_SOURCE_DIR}/util/utf8)\n  endif()\n  add_executable(${test_name} ${test_source})\n  if(${test_name} MATCHES \"progress_test\")\n    target_link_libraries(${test_name} PRIVATE GTest::gmock)\n  endif()\n  target_compile_definitions(${test_name} PRIVATE ${COMMON_COMPILE_DEFINITIONS})\n  target_include_directories(${test_name} PRIVATE ${COMMON_INCLUDE_DIRS})\n  target_link_libraries(${test_name} PRIVATE ${COMMON_LINK_LIBS})\n  add_test(NAME ${test_name} COMMAND ${test_name})\nendforeach()\n\n# Discover tests gtest_discover_tests(apiexample_test baseapi_test\n# baseapi_thread_test) add_test(baseapi_gtests baseapi_test.cc)\n"
  },
  {
    "path": "unittest/README.md",
    "content": "# Unit Testing for Tesseract\n\n## Requirements\n\n### Files and structure\n\n```\n\n├── langdata_lstm\n│   ├── common.punc\n│   ├── common.unicharambigs\n│   ├── desired_bigrams.txt\n│   ├── eng\n│   │   ├── desired_characters\n│   │   ├── eng.config\n│   │   ├── eng.numbers\n│   │   ├── eng.punc\n│   │   ├── eng.singles_text\n│   │   ├── eng.training_text\n│   │   ├── eng.unicharambigs\n│   │   ├── eng.wordlist\n│   │   └── okfonts.txt\n│   ├── extended\n│   │   └── extended.config\n│   ├── extendedhin\n│   │   └── extendedhin.config\n│   ├── font_properties\n│   ├── forbidden_characters_default\n│   ├── hin\n│   │   ├── hin.config\n│   │   ├── hin.numbers\n│   │   ├── hin.punc\n│   │   └── hin.wordlist\n│   ├── kan\n│   │   └── kan.config\n│   ├── kor\n│   │   └── kor.config\n│   ├── osd\n│   │   └── osd.unicharset\n│   └── radical-stroke.txt\n├── tessdata\n│   ├── ara.traineddata\n│   ├── chi_tra.traineddata\n│   ├── eng.traineddata\n│   ├── heb.traineddata\n│   ├── hin.traineddata\n│   ├── jpn.traineddata\n│   ├── kmr.traineddata\n│   ├── osd.traineddata\n│   └── vie.traineddata\n├── tessdata_best\n│   ├── eng.traineddata\n│   ├── fra.traineddata\n│   ├── kmr.traineddata\n│   └── osd.traineddata\n├── tessdata_fast\n│   ├── eng.traineddata\n│   ├── kmr.traineddata\n│   ├── osd.traineddata\n│   └── script\n│       └── Latin.traineddata\n└── tesseract\n    ...\n    ├── test\n    ├── unittest\n    │   └── third_party/googletest\n    └── VERSION\n```\n\n### Fonts\n\n* Microsoft fonts: arialbi.ttf, times.ttf, verdana.ttf - [installation guide](https://www.makeuseof.com/tag/how-to-install-microsoft-core-fonts-in-ubuntu-linux/)\n* [ae_Arab.ttf](https://www.wfonts.com/download/data/2014/12/03/ae-arab/ae-arab.zip)\n* dejavu-fonts: [DejaVuSans-ExtraLight.ttf](https://dejavu-fonts.github.io/Download.html)\n* [Lohit-Hindi.ttf](https://raw.githubusercontent.com/pratul/packageofpractices/master/assets/fonts/Lohit-Hindi.ttf)\n* 
[UnBatang.ttf](https://raw.githubusercontent.com/byrongibson/fonts/master/backup/truetype.original/unfonts-core/UnBatang.ttf)\n\n## Run tests\n\nTo run the tests, do the following in the tesseract folder\n\n```\nautoreconf -fiv\ngit submodule update --init\ngit clone https://github.com/egorpugin/tessdata tessdata_unittest --depth 1\ncp tessdata_unittest/fonts/* test/testing/\nmv tessdata_unittest/* ../\nexport TESSDATA_PREFIX=/prefix/to/path/to/tessdata\nmake check\n```\n"
  },
  {
    "path": "unittest/apiexample_test.cc",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        apiexample_test.cc\n// Description: Api Test for Tesseract using text fixtures and parameters.\n// Tests for Devanagari, Latin and Arabic scripts are disabled by default.\n// Disabled tests can be run when required by using the\n// --gtest_also_run_disabled_tests argument.\n//                 ./unittest/apiexample_test --gtest_also_run_disabled_tests\n//\n// Author:      ShreeDevi Kumar\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// expects clone of tessdata_fast repo in ../../tessdata_fast\n\n//#include \"log.h\"\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include <time.h>\n#include <fstream>\n#include <iostream>\n#include <locale>\n#include <memory> // std::unique_ptr\n#include <string>\n#include \"include_gunit.h\"\n#include \"image.h\"\n\nnamespace tesseract {\n\nclass QuickTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    start_time_ = time(nullptr);\n  }\n  void TearDown() override {\n#ifndef NDEBUG\n    // Debug builds can be very slow, so allow 4 min for OCR of a test image.\n    // apitest_example including disabled tests takes about 18 min on ARMv7.\n    const time_t MAX_SECONDS_FOR_TEST = 240;\n#else\n    // Release builds typically need less than 10 s for OCR of a test image,\n    // apitest_example including disabled tests takes 
about 90 s on ARMv7.\n    const time_t MAX_SECONDS_FOR_TEST = 55;\n#endif\n    const time_t end_time = time(nullptr);\n    EXPECT_TRUE(end_time - start_time_ <= MAX_SECONDS_FOR_TEST)\n        << \"The test took too long - \" << ::testing::PrintToString(end_time - start_time_);\n  }\n  time_t start_time_;\n};\n\nvoid OCRTester(const char *imgname, const char *groundtruth, const char *tessdatadir,\n               const char *lang) {\n  // log.info() << tessdatadir << \" for language: \" << lang << std::endl;\n  char *outText;\n  std::locale loc(\"C\"); // You can also use \"\" for the default system locale\n  std::ifstream file(groundtruth);\n  file.imbue(loc); // Use it for file input\n  std::string gtText((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());\n  auto api = std::make_unique<tesseract::TessBaseAPI>();\n  ASSERT_FALSE(api->Init(tessdatadir, lang)) << \"Could not initialize tesseract.\";\n  Image image = pixRead(imgname);\n  ASSERT_TRUE(image != nullptr) << \"Failed to read test image.\";\n  api->SetImage(image);\n  outText = api->GetUTF8Text();\n  EXPECT_EQ(gtText, outText) << \"Phototest.tif OCR does not match ground truth for \"\n                             << ::testing::PrintToString(lang);\n  api->End();\n  api->ClearPersistentCache();\n  delete[] outText;\n  image.destroy();\n}\n\nclass MatchGroundTruth : public QuickTest, public ::testing::WithParamInterface<const char *> {};\n\nTEST_P(MatchGroundTruth, FastPhototestOCR) {\n  OCRTester(TESTING_DIR \"/phototest.tif\", TESTING_DIR \"/phototest.txt\", TESSDATA_DIR \"_fast\",\n            GetParam());\n}\n\nTEST_P(MatchGroundTruth, BestPhototestOCR) {\n  OCRTester(TESTING_DIR \"/phototest.tif\", TESTING_DIR \"/phototest.txt\", TESSDATA_DIR \"_best\",\n            GetParam());\n}\n\nTEST_P(MatchGroundTruth, TessPhototestOCR) {\n  OCRTester(TESTING_DIR \"/phototest.tif\", TESTING_DIR \"/phototest.txt\", TESSDATA_DIR, GetParam());\n}\n\nINSTANTIATE_TEST_SUITE_P(Eng, 
MatchGroundTruth, ::testing::Values(\"eng\"));\nINSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth, ::testing::Values(\"script/Latin\"));\nINSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth, ::testing::Values(\"script/Devanagari\"));\nINSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth, ::testing::Values(\"script/Arabic\"));\n\nclass EuroText : public QuickTest {};\n\nTEST_F(EuroText, FastLatinOCR) {\n  OCRTester(TESTING_DIR \"/eurotext.tif\", TESTING_DIR \"/eurotext.txt\", TESSDATA_DIR \"_fast\",\n            \"script/Latin\");\n}\n\n// script/Latin for eurotext.tif does not match groundtruth\n// for tessdata & tessdata_best.\n// so do not test these here.\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/applybox_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include <tesseract/resultiterator.h>\n#include <string>\n#include \"boxread.h\"\n#include \"rect.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nconst char *kTruthTextWords = \"To simple burn running of goods lately.\\n\";\nconst char *kTruthTextLine = \"Tosimpleburnrunningofgoodslately.\\n\";\n\n// The fixture for testing Tesseract.\nclass ApplyBoxTest : public testing::Test {\nprotected:\n  std::string TestDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESTING_DIR, name);\n  }\n  std::string TessdataPath() {\n    return TESSDATA_DIR;\n  }\n\n  ApplyBoxTest() {\n    src_pix_ = nullptr;\n  }\n  ~ApplyBoxTest() override {\n    src_pix_.destroy();\n  }\n\n  bool SetImage(const char *filename) {\n    bool found = false;\n    src_pix_.destroy();\n    src_pix_ = pixRead(TestDataNameToPath(filename).c_str());\n    if (api_.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_TESSERACT_ONLY) != -1) {\n      api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);\n      api_.SetImage(src_pix_);\n      api_.SetVariable(\"tessedit_make_boxes_from_boxes\", \"1\");\n      api_.SetInputName(TestDataNameToPath(filename).c_str());\n      found = true;\n    }\n    return found;\n  }\n\n  // Runs ApplyBoxes (via setting the appropriate variables and Recognize)\n  // and checks that the output 
ocr text matches the truth_str, and that\n  // the boxes match the given box file well enough.\n  // If line_mode is true, ApplyBoxes is run in line segmentation mode,\n  // otherwise the input box file is assumed to have character-level boxes.\n  void VerifyBoxesAndText(const char *imagefile, const char *truth_str, const char *target_box_file,\n                          bool line_mode) {\n    if (!SetImage(imagefile)) {\n      // eng.traineddata not found or other problem during Init.\n      GTEST_SKIP();\n    }\n    if (line_mode) {\n      api_.SetVariable(\"tessedit_resegment_from_line_boxes\", \"1\");\n    } else {\n      api_.SetVariable(\"tessedit_resegment_from_boxes\", \"1\");\n    }\n    api_.Recognize(nullptr);\n    char *ocr_text = api_.GetUTF8Text();\n    EXPECT_STREQ(truth_str, ocr_text);\n    delete[] ocr_text;\n    // Test the boxes by reading the target box file in parallel with the\n    // bounding boxes in the ocr output.\n    std::string box_filename = TestDataNameToPath(target_box_file);\n    FILE *box_file = OpenBoxFile(box_filename.c_str());\n    ASSERT_TRUE(box_file != nullptr);\n    int height = pixGetHeight(src_pix_);\n    ResultIterator *it = api_.GetIterator();\n    do {\n      int left, top, right, bottom;\n      EXPECT_TRUE(it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom));\n      TBOX ocr_box(ICOORD(left, height - bottom), ICOORD(right, height - top));\n      int line_number = 0;\n      TBOX truth_box;\n      std::string box_text;\n      EXPECT_TRUE(ReadNextBox(0, &line_number, box_file, box_text, &truth_box));\n      // Testing for major overlap is a bit weak, but if they all\n      // major overlap successfully, then it has to be fairly close.\n      EXPECT_TRUE(ocr_box.major_overlap(truth_box));\n      // Also check that the symbol text matches the box text.\n      char *symbol_text = it->GetUTF8Text(tesseract::RIL_SYMBOL);\n      EXPECT_STREQ(box_text.c_str(), symbol_text);\n      delete[] symbol_text;\n    } 
while (it->Next(tesseract::RIL_SYMBOL));\n    delete it;\n  }\n\n  Image src_pix_;\n  std::string ocr_text_;\n  tesseract::TessBaseAPI api_;\n};\n\n// Tests character-level applyboxes on normal Times New Roman.\nTEST_F(ApplyBoxTest, TimesCharLevel) {\n  VerifyBoxesAndText(\"trainingtimes.tif\", kTruthTextWords, \"trainingtimes.box\", false);\n}\n\n// Tests character-level applyboxes on italic Times New Roman.\nTEST_F(ApplyBoxTest, ItalicCharLevel) {\n  VerifyBoxesAndText(\"trainingital.tif\", kTruthTextWords, \"trainingital.box\", false);\n}\n\n// Tests line-level applyboxes on normal Times New Roman.\nTEST_F(ApplyBoxTest, TimesLineLevel) {\n  VerifyBoxesAndText(\"trainingtimesline.tif\", kTruthTextLine, \"trainingtimes.box\", true);\n}\n\n// Tests line-level applyboxes on italic Times New Roman.\nTEST_F(ApplyBoxTest, ItalLineLevel) {\n  VerifyBoxesAndText(\"trainingitalline.tif\", kTruthTextLine, \"trainingital.box\", true);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/baseapi_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n\n#include \"cycletimer.h\" // for CycleTimer\n#include \"log.h\"        // for LOG\n#include \"ocrblock.h\"   // for class BLOCK\n#include \"pageres.h\"\n\n#include <tesseract/baseapi.h>\n\n#include <allheaders.h>\n#include \"gmock/gmock-matchers.h\"\n\n#include <memory>\n#include <regex>\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\nusing ::testing::ContainsRegex;\nusing ::testing::HasSubstr;\n\nstatic const char *langs[] = {\"eng\", \"vie\", \"hin\", \"ara\", nullptr};\nstatic const char *image_files[] = {\"HelloGoogle.tif\", \"viet.tif\", \"raaj.tif\", \"arabic.tif\",\n                                    nullptr};\nstatic const char *gt_text[] = {\"Hello Google\", \"\\x74\\x69\\xe1\\xba\\xbf\\x6e\\x67\",\n                                \"\\xe0\\xa4\\xb0\\xe0\\xa4\\xbe\\xe0\\xa4\\x9c\",\n                                \"\\xd8\\xa7\\xd9\\x84\\xd8\\xb9\\xd8\\xb1\\xd8\\xa8\\xd9\\x8a\", nullptr};\n\nclass FriendlyTessBaseAPI : public tesseract::TessBaseAPI {\n  FRIEND_TEST(TesseractTest, LSTMGeometryTest);\n};\n\nstd::string GetCleanedTextResult(tesseract::TessBaseAPI *tess, Image pix) {\n  tess->SetImage(pix);\n  char *result = tess->GetUTF8Text();\n  std::string ocr_result = result;\n  delete[] result;\n  trim(ocr_result);\n  return ocr_result;\n}\n\n// The fixture for testing Tesseract.\nclass TesseractTest : 
public testing::Test {\nprotected:\n  static std::string TestDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESTING_DIR, name);\n  }\n  static std::string TessdataPath() {\n    return TESSDATA_DIR;\n  }\n};\n\n// Test static TessBaseAPI (like it is used by tesserocr).\nTEST_F(TesseractTest, StaticTessBaseAPI) {\n  static tesseract::TessBaseAPI api;\n  api.End();\n}\n\n// Tests that Tesseract gets exactly the right answer on phototest.\nTEST_F(TesseractTest, BasicTesseractTest) {\n  tesseract::TessBaseAPI api;\n  std::string truth_text;\n  std::string ocr_text;\n  if (api.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_TESSERACT_ONLY) != -1) {\n    Image src_pix = pixRead(TestDataNameToPath(\"phototest.tif\").c_str());\n    CHECK(src_pix);\n    ocr_text = GetCleanedTextResult(&api, src_pix);\n    CHECK_OK(\n        file::GetContents(TestDataNameToPath(\"phototest.gold.txt\"), &truth_text, file::Defaults()));\n    trim(truth_text);\n    EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());\n    src_pix.destroy();\n  } else {\n    // eng.traineddata not found.\n    GTEST_SKIP();\n  }\n}\n\n// Test that api.GetComponentImages() will return a set of images for\n// paragraphs even if text recognition was not run.\nTEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) {\n  tesseract::TessBaseAPI api;\n  if (api.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_TESSERACT_ONLY) != -1) {\n    api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);\n    api.SetVariable(\"paragraph_debug_level\", \"3\");\n#if 0 // TODO: b622.png is missing\n    Pix* src_pix = pixRead(TestDataNameToPath(\"b622.png\").c_str());\n    CHECK(src_pix);\n    api.SetImage(src_pix);\n    Boxa* para_boxes =\n        api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr);\n    EXPECT_TRUE(para_boxes != nullptr);\n    Boxa* block_boxes =\n        api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr);\n    EXPECT_TRUE(block_boxes != nullptr);\n    // 
TODO(eger): Get paragraphs out of this page pre-text.\n    EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes));\n    boxaDestroy(&block_boxes);\n    boxaDestroy(&para_boxes);\n    src_pix.destroy();\n#endif\n  } else {\n    // eng.traineddata not found.\n    GTEST_SKIP();\n  }\n}\n\n// We should get hOCR output and not seg fault, even if the api caller doesn't\n// call SetInputName().\nTEST_F(TesseractTest, HOCRWorksWithoutSetInputName) {\n  tesseract::TessBaseAPI api;\n  if (api.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_TESSERACT_ONLY) == -1) {\n    // eng.traineddata not found.\n    GTEST_SKIP();\n  }\n  Image src_pix = pixRead(TestDataNameToPath(\"HelloGoogle.tif\").c_str());\n  CHECK(src_pix);\n  api.SetImage(src_pix);\n  char *result = api.GetHOCRText(0);\n  EXPECT_TRUE(result != nullptr);\n  EXPECT_THAT(result, HasSubstr(\"Hello\"));\n  EXPECT_THAT(result, HasSubstr(\"<div class='ocr_page'\"));\n  delete[] result;\n  src_pix.destroy();\n}\n\n// hOCR output should contain baseline info for upright textlines.\nTEST_F(TesseractTest, HOCRContainsBaseline) {\n  tesseract::TessBaseAPI api;\n  if (api.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_TESSERACT_ONLY) == -1) {\n    // eng.traineddata not found.\n    GTEST_SKIP();\n  }\n  Image src_pix = pixRead(TestDataNameToPath(\"HelloGoogle.tif\").c_str());\n  CHECK(src_pix);\n  api.SetInputName(\"HelloGoogle.tif\");\n  api.SetImage(src_pix);\n  char *result = api.GetHOCRText(0);\n  EXPECT_TRUE(result != nullptr);\n  EXPECT_THAT(result, HasSubstr(\"Hello\"));\n  EXPECT_TRUE(std::regex_search(\n      result, std::regex{\"<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+\"}));\n\n  delete[] result;\n  src_pix.destroy();\n}\n\n// Tests that Tesseract gets exactly the right answer on some page numbers.\nTEST_F(TesseractTest, AdaptToWordStrTest) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because TessBaseAPI::AdaptToWordStr is missing.\n  GTEST_SKIP();\n#else\n  static const char 
*kTrainingPages[] = {\"136.tif\", \"256.tif\", \"410.tif\", \"432.tif\", \"540.tif\",\n                                         \"692.tif\", \"779.tif\", \"793.tif\", \"808.tif\", \"815.tif\",\n                                         \"12.tif\",  \"12.tif\",  nullptr};\n  static const char *kTrainingText[] = {\"1 3 6\", \"2 5 6\", \"4 1 0\", \"4 3 2\", \"5 4 0\",\n                                        \"6 9 2\", \"7 7 9\", \"7 9 3\", \"8 0 8\", \"8 1 5\",\n                                        \"1 2\",   \"1 2\",   nullptr};\n  static const char *kTestPages[] = {\"324.tif\", \"433.tif\", \"12.tif\", nullptr};\n  static const char *kTestText[] = {\"324\", \"433\", \"12\", nullptr};\n  tesseract::TessBaseAPI api;\n  std::string truth_text;\n  std::string ocr_text;\n  if (api.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_TESSERACT_ONLY) == -1) {\n    // eng.traineddata not found.\n    GTEST_SKIP();\n  }\n  api.SetVariable(\"matcher_sufficient_examples_for_prototyping\", \"1\");\n  api.SetVariable(\"classify_class_pruner_threshold\", \"220\");\n  // Train on the training text.\n  for (int i = 0; kTrainingPages[i] != nullptr; ++i) {\n    std::string image_file = TestDataNameToPath(kTrainingPages[i]);\n    Image src_pix = pixRead(image_file.c_str());\n    CHECK(src_pix);\n    api.SetImage(src_pix);\n    EXPECT_TRUE(api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i]))\n        << \"Failed to adapt to text \\\"\" << kTrainingText[i] << \"\\\" on image \" << image_file;\n    src_pix.destroy();\n  }\n  // Test the test text.\n  api.SetVariable(\"tess_bn_matching\", \"1\");\n  api.SetPageSegMode(tesseract::PSM_SINGLE_WORD);\n  for (int i = 0; kTestPages[i] != nullptr; ++i) {\n    Image src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str());\n    CHECK(src_pix);\n    ocr_text = GetCleanedTextResult(&api, src_pix);\n    trim(truth_text);\n    EXPECT_STREQ(kTestText[i], ocr_text.c_str());\n    src_pix.destroy();\n  }\n#endif\n}\n\n// Tests that 
LSTM gets exactly the right answer on phototest.\nTEST_F(TesseractTest, BasicLSTMTest) {\n  tesseract::TessBaseAPI api;\n  std::string truth_text;\n  std::string ocr_text;\n  if (api.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_LSTM_ONLY) == -1) {\n    // eng.traineddata not found.\n    GTEST_SKIP();\n  }\n  Image src_pix = pixRead(TestDataNameToPath(\"phototest_2.tif\").c_str());\n  CHECK(src_pix);\n  ocr_text = GetCleanedTextResult(&api, src_pix);\n  CHECK_OK(\n      file::GetContents(TestDataNameToPath(\"phototest.gold.txt\"), &truth_text, file::Defaults()));\n  trim(truth_text);\n  EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());\n  src_pix.destroy();\n}\n\n// Test that LSTM's character bounding boxes are properly converted to\n// Tesseract structures. Note that we can't guarantee that LSTM's\n// character boxes fall completely within Tesseract's word box because\n// the baseline denormalization/normalization transforms may introduce\n// errors due to float/int conversions (e.g., see OUTLINE::move() in\n// ccstruct/poutline.h) Instead, we do a loose check.\nTEST_F(TesseractTest, LSTMGeometryTest) {\n  Image src_pix = pixRead(TestDataNameToPath(\"deslant.tif\").c_str());\n  FriendlyTessBaseAPI api;\n  if (api.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_LSTM_ONLY) == -1) {\n    // eng.traineddata not found.\n    GTEST_SKIP();\n  }\n  api.SetImage(src_pix);\n  ASSERT_EQ(api.Recognize(nullptr), 0);\n\n  const PAGE_RES *page_res = api.GetPageRes();\n  PAGE_RES_IT page_res_it(const_cast<PAGE_RES *>(page_res));\n  page_res_it.restart_page();\n  BLOCK *block = page_res_it.block()->block;\n  CHECK(block);\n\n  // extract word and character boxes for each word\n  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {\n    WERD_RES *word = page_res_it.word();\n    CHECK(word);\n    CHECK(word->best_choice);\n    CHECK_GT(word->best_choice->length(), 0);\n    CHECK(word->word);\n    CHECK(word->box_word);\n    // 
tesseract's word box\n    TBOX tess_blob_box;\n    tess_blob_box = word->word->bounding_box();\n    tess_blob_box.rotate(block->re_rotation());\n    // verify that each of LSTM's character boxes lies close to within\n    // tesseract's word box\n    for (int i = 0; i < word->box_word->length(); ++i) {\n      TBOX lstm_blob_box = word->box_word->BlobBox(i);\n      // LSTM character box should not spill out of tesseract word box\n      // by more than a few pixels in any direction\n      EXPECT_LT(tess_blob_box.left() - lstm_blob_box.left(), 5);\n      EXPECT_LT(lstm_blob_box.right() - tess_blob_box.right(), 5);\n      EXPECT_LT(tess_blob_box.bottom() - lstm_blob_box.bottom(), 5);\n      EXPECT_LT(lstm_blob_box.top() - tess_blob_box.top(), 5);\n    }\n  }\n  src_pix.destroy();\n}\n\nTEST_F(TesseractTest, InitConfigOnlyTest) {\n  // Languages for testing initialization.\n  const char *langs[] = {\"eng\", \"chi_tra\", \"jpn\", \"vie\"};\n  std::unique_ptr<tesseract::TessBaseAPI> api;\n  CycleTimer timer;\n  for (auto &lang : langs) {\n    api = std::make_unique<tesseract::TessBaseAPI>();\n    timer.Restart();\n    EXPECT_EQ(0, api->Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY));\n    timer.Stop();\n    LOG(INFO) << \"Lang \" << lang << \" took \" << timer.GetInMs() << \"ms in regular init\";\n  }\n  // Init variables to set for config-only initialization.\n  std::vector<std::string> vars_vec, vars_values;\n  vars_vec.emplace_back(\"tessedit_init_config_only\");\n  vars_values.emplace_back(\"1\");\n  LOG(INFO) << \"Switching to config only initialization:\";\n  for (auto &lang : langs) {\n    api = std::make_unique<tesseract::TessBaseAPI>();\n    timer.Restart();\n    EXPECT_EQ(0, api->Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY, nullptr, 0,\n                           &vars_vec, &vars_values, false));\n    timer.Stop();\n    LOG(INFO) << \"Lang \" << lang << \" took \" << timer.GetInMs() << \"ms in config-only init\";\n  
}\n}\n\n// Tests if two instances of Tesseract/LSTM can co-exist in the same thread.\n// NOTE: This is not an exhaustive test and current support for multiple\n// instances in Tesseract is fragile. This test is intended largely as a means\n// of detecting and guarding against the existing support being possibly broken\n// by future CLs. TessBaseAPI instances are initialized using the default\n// OEM_DEFAULT mode.\nTEST(TesseractInstanceTest, TestMultipleTessInstances) {\n  int num_langs = 0;\n  while (langs[num_langs] != nullptr) {\n    ++num_langs;\n  }\n\n  const std::string kTessdataPath = TESSDATA_DIR;\n\n  // Preload images and verify that OCR is correct on them individually.\n  std::vector<Image > pix(num_langs);\n  for (int i = 0; i < num_langs; ++i) {\n    std::string tracestring = \"Single instance test with lang = \";\n    tracestring += langs[i];\n    SCOPED_TRACE(tracestring);\n    std::string path = file::JoinPath(TESTING_DIR, image_files[i]);\n    pix[i] = pixRead(path.c_str());\n    QCHECK(pix[i] != nullptr) << \"Could not read \" << path;\n\n    tesseract::TessBaseAPI tess;\n    EXPECT_EQ(0, tess.Init(kTessdataPath.c_str(), langs[i]));\n    std::string ocr_result = GetCleanedTextResult(&tess, pix[i]);\n    EXPECT_STREQ(gt_text[i], ocr_result.c_str());\n  }\n\n  // Process the images in all pairwise combinations of associated languages.\n  std::string ocr_result[2];\n  for (int i = 0; i < num_langs; ++i) {\n    for (int j = i + 1; j < num_langs; ++j) {\n      tesseract::TessBaseAPI tess1, tess2;\n      tess1.Init(kTessdataPath.c_str(), langs[i]);\n      tess2.Init(kTessdataPath.c_str(), langs[j]);\n\n      ocr_result[0] = GetCleanedTextResult(&tess1, pix[i]);\n      ocr_result[1] = GetCleanedTextResult(&tess2, pix[j]);\n\n      EXPECT_FALSE(strcmp(gt_text[i], ocr_result[0].c_str()) ||\n                   strcmp(gt_text[j], ocr_result[1].c_str()))\n          << \"OCR failed on language pair \" << langs[i] << \"-\" << langs[j];\n    }\n  }\n\n  for 
(int i = 0; i < num_langs; ++i) {\n    pix[i].destroy();\n  }\n}\n\n// Tests whether Tesseract parameters are correctly set for the two instances.\nTEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {\n  std::string illegal_name = \"an_illegal_name\";\n  std::string langs[2] = {\"eng\", \"hin\"};\n  std::string int_param_name = \"tessedit_pageseg_mode\";\n  int int_param[2] = {1, 2};\n  std::string int_param_str[2] = {\"1\", \"2\"};\n  std::string bool_param_name = \"tessedit_ambigs_training\";\n  bool bool_param[2] = {false, true};\n  std::string bool_param_str[2] = {\"F\", \"T\"};\n  std::string str_param_name = \"tessedit_char_blacklist\";\n  std::string str_param[2] = {\"abc\", \"def\"};\n  std::string double_param_name = \"segment_penalty_dict_frequent_word\";\n  std::string double_param_str[2] = {\"0.01\", \"2\"};\n  double double_param[2] = {0.01, 2};\n\n  const std::string kTessdataPath = TESSDATA_DIR;\n\n  tesseract::TessBaseAPI tess1, tess2;\n  for (int i = 0; i < 2; ++i) {\n    tesseract::TessBaseAPI *api = (i == 0) ? &tess1 : &tess2;\n    api->Init(kTessdataPath.c_str(), langs[i].c_str());\n    api->SetVariable(illegal_name.c_str(), \"none\");\n    api->SetVariable(int_param_name.c_str(), int_param_str[i].c_str());\n    api->SetVariable(bool_param_name.c_str(), bool_param_str[i].c_str());\n    api->SetVariable(str_param_name.c_str(), str_param[i].c_str());\n    api->SetVariable(double_param_name.c_str(), double_param_str[i].c_str());\n  }\n  for (int i = 0; i < 2; ++i) {\n    tesseract::TessBaseAPI *api = (i == 0) ? 
&tess1 : &tess2;\n    EXPECT_FALSE(api->GetStringVariable(illegal_name.c_str()));\n    int intvar;\n    EXPECT_TRUE(api->GetIntVariable(int_param_name.c_str(), &intvar));\n    EXPECT_EQ(int_param[i], intvar);\n    bool boolvar;\n    EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar));\n    EXPECT_EQ(bool_param[i], boolvar);\n    EXPECT_STREQ(str_param[i].c_str(), api->GetStringVariable(str_param_name.c_str()));\n    double doublevar;\n    EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar));\n    EXPECT_EQ(double_param[i], doublevar);\n  }\n}\n\n// Test that PAGE XML output properly closes all Page tags for multi-page documents.\nTEST_F(TesseractTest, PAGEXMLMultiPageClosingTags) {\n  tesseract::TessBaseAPI api;\n  if (api.Init(TessdataPath().c_str(), \"eng\") == -1) {\n    GTEST_SKIP();\n  }\n  \n  // Simulate two pages by calling GetPAGEText twice\n  Image src_pix = pixRead(TestDataNameToPath(\"HelloGoogle.tif\").c_str());\n  CHECK(src_pix);\n  api.SetInputName(\"page1.tif\");\n  api.SetImage(src_pix);\n  \n  char *page1 = api.GetPAGEText(0);\n  ASSERT_TRUE(page1 != nullptr);\n  \n  // Each page should have exactly one opening and one closing Page tag\n  std::string page1_str(page1);\n  size_t open_count = 0;\n  size_t close_count = 0;\n  size_t pos = 0;\n  \n  // Count opening <Page tags\n  while ((pos = page1_str.find(\"<Page\", pos)) != std::string::npos) {\n    open_count++;\n    pos += 5;\n  }\n  \n  // Count closing </Page> tags\n  pos = 0;\n  while ((pos = page1_str.find(\"</Page>\", pos)) != std::string::npos) {\n    close_count++;\n    pos += 7;\n  }\n  \n  // Each individual page output should have matching Page tags\n  EXPECT_EQ(open_count, 1) << \"Each page should have exactly one opening <Page tag\";\n  EXPECT_EQ(close_count, 1) << \"Each page should have exactly one closing </Page> tag\";\n  EXPECT_EQ(open_count, close_count) << \"Opening and closing Page tags should match\";\n  \n  // Verify the closing tag is 
present and not part of PcGts\n  EXPECT_THAT(page1_str, HasSubstr(\"</Page>\"));\n  EXPECT_THAT(page1_str, ::testing::Not(HasSubstr(\"</PcGts>\"))) \n      << \"Individual page output should not contain document envelope\";\n  \n  delete[] page1;\n  \n  // Test a second page to ensure each page closes properly\n  api.SetInputName(\"page2.tif\");\n  api.SetImage(src_pix);\n  char *page2 = api.GetPAGEText(1);\n  ASSERT_TRUE(page2 != nullptr);\n  \n  std::string page2_str(page2);\n  open_count = 0;\n  close_count = 0;\n  pos = 0;\n  \n  while ((pos = page2_str.find(\"<Page\", pos)) != std::string::npos) {\n    open_count++;\n    pos += 5;\n  }\n  \n  pos = 0;\n  while ((pos = page2_str.find(\"</Page>\", pos)) != std::string::npos) {\n    close_count++;\n    pos += 7;\n  }\n  \n  EXPECT_EQ(open_count, 1) << \"Second page should have exactly one opening <Page tag\";\n  EXPECT_EQ(close_count, 1) << \"Second page should have exactly one closing </Page> tag\";\n  \n  delete[] page2;\n  src_pix.destroy();\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/baseapi_thread_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n// Unit test to run Tesseract instances in parallel threads and verify\n// the OCR result.\n\n// Note that success of running this test as-is does NOT verify\n// thread-safety. For that, you need to run this binary under TSAN using the\n// associated baseapi_thread_test_with_tsan.sh script.\n//\n// The tests are partitioned by instance to allow running Tesseract/Cube/both\n// and by stage to run initialization/recognition/both. See flag descriptions\n// for details.\n\n#include <functional>\n#include <memory>\n#include <string>\n#ifdef INCLUDE_TENSORFLOW\n#  include <tensorflow/core/lib/core/threadpool.h>\n#endif\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include \"commandlineflags.h\"\n#include \"include_gunit.h\"\n#include \"log.h\"\n#include \"image.h\"\n\n// Run with Tesseract instances.\nBOOL_PARAM_FLAG(test_tesseract, true, \"Test tesseract instances\");\n// Run with Cube instances.\n// Note that with TSAN, Cube typically takes much longer to test. 
Ignoring\n// std::string operations using the associated tess_tsan.ignore file when\n// testing Cube significantly reduces testing time.\nBOOL_PARAM_FLAG(test_cube, true, \"Test Cube instances\");\n\n// When used with TSAN, having more repetitions can help in finding hidden\n// thread-safety violations at the expense of increased testing time.\nINT_PARAM_FLAG(reps, 1, \"Num of parallel test repetitions to run.\");\n\nINT_PARAM_FLAG(max_concurrent_instances, 0,\n               \"Maximum number of instances to run in parallel at any given \"\n               \"instant. The number of concurrent instances cannot exceed \"\n               \"reps * number_of_langs_tested, which is also the default value.\");\n\nnamespace tesseract {\n\nstatic const char *kTessLangs[] = {\"eng\", \"vie\", nullptr};\nstatic const char *kTessImages[] = {\"HelloGoogle.tif\", \"viet.tif\", nullptr};\nstatic const char *kTessTruthText[] = {\"Hello Google\", \"\\x74\\x69\\xe1\\xba\\xbf\\x6e\\x67\", nullptr};\n\nstatic const char *kCubeLangs[] = {\"hin\", \"ara\", nullptr};\nstatic const char *kCubeImages[] = {\"raaj.tif\", \"arabic.tif\", nullptr};\nstatic const char *kCubeTruthText[] = {\"\\xe0\\xa4\\xb0\\xe0\\xa4\\xbe\\xe0\\xa4\\x9c\",\n                                       \"\\xd8\\xa7\\xd9\\x84\\xd8\\xb9\\xd8\\xb1\\xd8\\xa8\\xd9\\x8a\", nullptr};\n\nclass BaseapiThreadTest : public ::testing::Test {\nprotected:\n  static void SetUpTestCase() {\n    CHECK(FLAGS_test_tesseract || FLAGS_test_cube)\n        << \"Need to test at least one of Tesseract/Cube!\";\n    // Form a list of langs/gt_text/image_files we will work with.\n    std::vector<std::string> image_files;\n    if (FLAGS_test_tesseract) {\n      int i = 0;\n      while (kTessLangs[i] && kTessTruthText[i] && kTessImages[i]) {\n        langs_.emplace_back(kTessLangs[i]);\n        gt_text_.emplace_back(kTessTruthText[i]);\n        image_files.emplace_back(kTessImages[i]);\n        ++i;\n      }\n      LOG(INFO) << \"Testing Tesseract 
on \" << i << \" languages.\";\n    }\n    if (FLAGS_test_cube) {\n      int i = 0;\n      while (kCubeLangs[i] && kCubeTruthText[i] && kCubeImages[i]) {\n        langs_.emplace_back(kCubeLangs[i]);\n        gt_text_.emplace_back(kCubeTruthText[i]);\n        image_files.emplace_back(kCubeImages[i]);\n        ++i;\n      }\n      LOG(INFO) << \"Testing Cube on \" << i << \" languages.\";\n    }\n    num_langs_ = langs_.size();\n\n    // Pre-load the images into an array. We will be making multiple copies of\n    // an image here if FLAGS_reps > 1 and that is intentional. In this test, we\n    // wish to not make any assumptions about the thread-safety of Pix objects,\n    // and so entirely disallow concurrent access of a Pix instance.\n    const int n = num_langs_ * FLAGS_reps;\n    for (int i = 0; i < n; ++i) {\n      std::string path = TESTING_DIR \"/\" + image_files[i % num_langs_];\n      Image new_pix = pixRead(path.c_str());\n      QCHECK(new_pix != nullptr) << \"Could not read \" << path;\n      pix_.push_back(new_pix);\n    }\n\n#ifdef INCLUDE_TENSORFLOW\n    pool_size_ = (FLAGS_max_concurrent_instances < 1) ? 
num_langs_ * FLAGS_reps\n                                                      : FLAGS_max_concurrent_instances;\n#endif\n  }\n\n  static void TearDownTestCase() {\n    for (auto &pix : pix_) {\n      pix.destroy();\n    }\n  }\n\n#ifdef INCLUDE_TENSORFLOW\n  void ResetPool() {\n    pool_.reset(\n        new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), \"tessthread\", pool_size_));\n  }\n\n  void WaitForPoolWorkers() {\n    pool_.reset(nullptr);\n  }\n\n  std::unique_ptr<tensorflow::thread::ThreadPool> pool_;\n  static int pool_size_;\n#endif\n  static std::vector<Image > pix_;\n  static std::vector<std::string> langs_;\n  static std::vector<std::string> gt_text_;\n  static int num_langs_;\n};\n\n// static member variable declarations.\n#ifdef INCLUDE_TENSORFLOW\nint BaseapiThreadTest::pool_size_;\n#endif\nstd::vector<Image > BaseapiThreadTest::pix_;\nstd::vector<std::string> BaseapiThreadTest::langs_;\nstd::vector<std::string> BaseapiThreadTest::gt_text_;\nint BaseapiThreadTest::num_langs_;\n\nstatic void InitTessInstance(TessBaseAPI *tess, const std::string &lang) {\n  CHECK(tess != nullptr);\n  EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str()));\n}\n\nstatic void GetCleanedText(TessBaseAPI *tess, Image pix, std::string &ocr_text) {\n  tess->SetImage(pix);\n  char *result = tess->GetUTF8Text();\n  ocr_text = result;\n  delete[] result;\n  trim(ocr_text);\n}\n\n#ifdef INCLUDE_TENSORFLOW\nstatic void VerifyTextResult(TessBaseAPI *tess, Image pix, const std::string &lang,\n                             const std::string &expected_text) {\n  TessBaseAPI *tess_local = nullptr;\n  if (tess) {\n    tess_local = tess;\n  } else {\n    tess_local = new TessBaseAPI;\n    InitTessInstance(tess_local, lang);\n  }\n  std::string ocr_text;\n  GetCleanedText(tess_local, pix, ocr_text);\n  EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str());\n  if (tess_local != tess) {\n    delete tess_local;\n  }\n}\n#endif\n\n// Check that Tesseract/Cube produce the correct 
results in single-threaded\n// operation. If not, it is pointless to run the real multi-threaded tests.\nTEST_F(BaseapiThreadTest, TestBasicSanity) {\n  for (int i = 0; i < num_langs_; ++i) {\n    TessBaseAPI tess;\n    InitTessInstance(&tess, langs_[i]);\n    std::string ocr_text;\n    GetCleanedText(&tess, pix_[i], ocr_text);\n    CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) << \"Failed with lang = \" << langs_[i];\n  }\n}\n\n// Test concurrent instance initialization.\nTEST_F(BaseapiThreadTest, TestInit) {\n#ifdef INCLUDE_TENSORFLOW\n  const int n = num_langs_ * FLAGS_reps;\n  ResetPool();\n  std::vector<TessBaseAPI> tess(n);\n  for (int i = 0; i < n; ++i) {\n    pool_->Schedule(std::bind(InitTessInstance, &tess[i], langs_[i % num_langs_]));\n  }\n  WaitForPoolWorkers();\n#endif\n}\n\n// Test concurrent recognition.\nTEST_F(BaseapiThreadTest, TestRecognition) {\n#ifdef INCLUDE_TENSORFLOW\n  const int n = num_langs_ * FLAGS_reps;\n  std::vector<TessBaseAPI> tess(n);\n  // Initialize api instances in a single thread.\n  for (int i = 0; i < n; ++i) {\n    InitTessInstance(&tess[i], langs_[i % num_langs_]);\n  }\n\n  ResetPool();\n  for (int i = 0; i < n; ++i) {\n    pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], langs_[i % num_langs_],\n                              gt_text_[i % num_langs_]));\n  }\n  WaitForPoolWorkers();\n#endif\n}\n\nTEST_F(BaseapiThreadTest, TestAll) {\n#ifdef INCLUDE_TENSORFLOW\n  const int n = num_langs_ * FLAGS_reps;\n  ResetPool();\n  for (int i = 0; i < n; ++i) {\n    pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], langs_[i % num_langs_],\n                              gt_text_[i % num_langs_]));\n  }\n  WaitForPoolWorkers();\n#endif\n}\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/bitvector_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <cmath>\n#include <cstdio>\n#include <string>\n\n#include \"bitvector.h\"\n\n#include \"include_gunit.h\"\n\nconst int kPrimeLimit = 1000;\n\nnamespace tesseract {\n\nclass BitVectorTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\npublic:\n  std::string OutputNameToPath(const std::string &name) {\n    return file::JoinPath(FLAGS_test_tmpdir, name);\n  }\n  // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes.\n  void ComputePrimes(BitVector *map) {\n    map->Init(kPrimeLimit + 1);\n    TestAll(*map, false);\n    map->SetBit(2);\n    // Set all the odds to true.\n    for (int i = 3; i <= kPrimeLimit; i += 2) {\n      map->SetValue(i, true);\n    }\n    int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit));\n    for (int f = 3; f <= factor_limit; f += 2) {\n      if (map->At(f)) {\n        for (int m = 2; m * f <= kPrimeLimit; ++m) {\n          map->ResetBit(f * m);\n        }\n      }\n    }\n  }\n\n  void TestPrimes(const BitVector &map) {\n    // Now all primes in the vector are true, and all others false.\n    // According to Wikipedia, there are 168 primes under 1000, the last\n    // of which is 997.\n    int total_primes = 0;\n    for (int i = 0; i <= kPrimeLimit; ++i) {\n      if (map[i]) {\n        ++total_primes;\n      }\n    
}\n    EXPECT_EQ(168, total_primes);\n    EXPECT_TRUE(map[997]);\n    EXPECT_FALSE(map[998]);\n    EXPECT_FALSE(map[999]);\n  }\n  // Test that all bits in the vector have the given value.\n  void TestAll(const BitVector &map, bool value) {\n    for (int i = 0; i < map.size(); ++i) {\n      EXPECT_EQ(value, map[i]);\n    }\n  }\n\n  // Sets up a BitVector with bit patterns for byte values in\n  // [start_byte, end_byte) positioned every spacing bytes (for spacing >= 1)\n  // with spacing-1  zero bytes in between the pattern bytes.\n  void SetBitPattern(int start_byte, int end_byte, int spacing, BitVector *bv) {\n    bv->Init((end_byte - start_byte) * 8 * spacing);\n    for (int byte_value = start_byte; byte_value < end_byte; ++byte_value) {\n      for (int bit = 0; bit < 8; ++bit) {\n        if (byte_value & (1 << bit)) {\n          bv->SetBit((byte_value - start_byte) * 8 * spacing + bit);\n        }\n      }\n    }\n  }\n\n  // Expects that every return from NextSetBit is really set and that all others\n  // are really not set. 
Checks the return from NumSetBits also.\n  void ExpectCorrectBits(const BitVector &bv) {\n    int bit_index = -1;\n    int prev_bit_index = -1;\n    int num_bits_tested = 0;\n    while ((bit_index = bv.NextSetBit(bit_index)) >= 0) {\n      EXPECT_LT(bit_index, bv.size());\n      // All bits in between must be 0.\n      for (int i = prev_bit_index + 1; i < bit_index; ++i) {\n        EXPECT_EQ(0, bv[i]) << \"i = \" << i << \" prev = \" << prev_bit_index;\n      }\n      // This bit must be 1.\n      EXPECT_EQ(1, bv[bit_index]) << \"Bit index = \" << bit_index;\n      ++num_bits_tested;\n      prev_bit_index = bit_index;\n    }\n    // Check the bits between the last and the end.\n    for (int i = prev_bit_index + 1; i < bv.size(); ++i) {\n      EXPECT_EQ(0, bv[i]);\n    }\n    EXPECT_EQ(num_bits_tested, bv.NumSetBits());\n  }\n};\n\n// Tests the sieve of Eratosthenes as a way of testing set/reset and I/O.\nTEST_F(BitVectorTest, Primes) {\n  BitVector map;\n  ComputePrimes(&map);\n  TestPrimes(map);\n  // It still works if we use the copy constructor.\n  BitVector map2(map);\n  TestPrimes(map2);\n  // Or if we assign it.\n  BitVector map3;\n  map3 = map;\n  TestPrimes(map3);\n  // Test file i/o too.\n  std::string filename = OutputNameToPath(\"primesbitvector\");\n  FILE *fp = fopen(filename.c_str(), \"wb\");\n  ASSERT_TRUE(fp != nullptr);\n  EXPECT_TRUE(map.Serialize(fp));\n  fclose(fp);\n  fp = fopen(filename.c_str(), \"rb\");\n  ASSERT_TRUE(fp != nullptr);\n  BitVector read_map;\n  EXPECT_TRUE(read_map.DeSerialize(false, fp));\n  fclose(fp);\n  TestPrimes(read_map);\n}\n\n// Tests the many-to-one setup feature.\nTEST_F(BitVectorTest, SetAll) {\n  // Test the default constructor and set/resetall.\n  BitVector map(42);\n  TestAll(map, false);\n  map.SetAllTrue();\n  TestAll(map, true);\n  map.SetAllFalse();\n  TestAll(map, false);\n}\n\n// Tests the values in the tables offset_table_, next_table_, hamming_table_\n// by setting all possible byte patterns and verifying 
that the NextSetBit and\n// NumSetBits functions return the correct values.\nTEST_F(BitVectorTest, TestNextSetBit) {\n  BitVector bv;\n  for (int spacing = 1; spacing <= 5; ++spacing) {\n    SetBitPattern(0, 256, spacing, &bv);\n    ExpectCorrectBits(bv);\n  }\n}\n\n// Tests the values in hamming_table_ more thoroughly by setting single byte\n// patterns for each byte individually.\nTEST_F(BitVectorTest, TestNumSetBits) {\n  BitVector bv;\n  for (int byte = 0; byte < 256; ++byte) {\n    SetBitPattern(byte, byte + 1, 1, &bv);\n    ExpectCorrectBits(bv);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/capiexample_c_test.c",
    "content": "///////////////////////////////////////////////////////////////////////\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// Verifies that C is able to include capi header.\n#include <tesseract/capi.h>\n\n// Verifies that the libtesseract library has C API symbols.\nint main() {\n  printf(\"%s\\n\", TessVersion());\n  return 0;\n}\n"
  },
  {
    "path": "unittest/capiexample_test.cc",
    "content": "///////////////////////////////////////////////////////////////////////\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// Verifies that C++ is able to include capi header.\n#include <tesseract/capi.h>\n\n#include <gtest/gtest.h>\n\n// Verifies that the libtesseract library has C API symbols.\nTEST(C, VersionTest) {\n  TessVersion();\n}\n"
  },
  {
    "path": "unittest/cleanapi_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <tesseract/baseapi.h>\n\n// Dummy enum in the global namespace that checks for collision with awkward\n// names.\n// If this test fails to compile, clean up the includes in tesseract/baseapi.h!\n// They are not supposed to drag in definitions of any of the tesseract\n// types included in this enum!\nenum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD };\n\n#include \"gtest/gtest.h\"\n\nnamespace tesseract {\n\n// Verifies that the global namespace is clean.\nTEST(CleanNamespaceTess, DummyTest) {\n  tesseract::TessBaseAPI api;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/colpartition_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"colpartition.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass TestableColPartition : public ColPartition {\npublic:\n  void SetColumnRange(int first, int last) {\n    set_first_column(first);\n    set_last_column(last);\n  }\n};\n\nclass ColPartitionTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n  void TearDown() override {}\n};\n\nTEST_F(ColPartitionTest, IsInSameColumnAsReflexive) {\n  TestableColPartition a, b;\n  a.SetColumnRange(1, 2);\n  b.SetColumnRange(3, 3);\n\n  EXPECT_TRUE(a.IsInSameColumnAs(a));\n  EXPECT_TRUE(b.IsInSameColumnAs(b));\n}\n\nTEST_F(ColPartitionTest, IsInSameColumnAsBorders) {\n  TestableColPartition a, b, c, d;\n  a.SetColumnRange(0, 1);\n  b.SetColumnRange(1, 2);\n  c.SetColumnRange(2, 3);\n  d.SetColumnRange(4, 5);\n\n  EXPECT_TRUE(a.IsInSameColumnAs(b));\n  EXPECT_TRUE(b.IsInSameColumnAs(a));\n  EXPECT_FALSE(c.IsInSameColumnAs(d));\n  EXPECT_FALSE(d.IsInSameColumnAs(c));\n  EXPECT_FALSE(a.IsInSameColumnAs(d));\n}\n\nTEST_F(ColPartitionTest, IsInSameColumnAsSuperset) {\n  TestableColPartition a, b;\n  a.SetColumnRange(4, 7);\n  b.SetColumnRange(2, 8);\n\n  EXPECT_TRUE(a.IsInSameColumnAs(b));\n  EXPECT_TRUE(b.IsInSameColumnAs(a));\n}\n\nTEST_F(ColPartitionTest, IsInSameColumnAsPartialOverlap) {\n  TestableColPartition a, b;\n  
a.SetColumnRange(3, 8);\n  b.SetColumnRange(6, 10);\n\n  EXPECT_TRUE(a.IsInSameColumnAs(b));\n  EXPECT_TRUE(b.IsInSameColumnAs(a));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/commandlineflags_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"commandlineflags.h\"\n\n#include \"include_gunit.h\"\n\n// Flags used for testing parser.\nINT_PARAM_FLAG(foo_int, 0, \"Integer flag for testing\");\nINT_PARAM_FLAG(bar_int, 0, \"Integer flag for testing\");\nDOUBLE_PARAM_FLAG(foo_double, 0.1, \"Double flag for testing\");\nDOUBLE_PARAM_FLAG(bar_double, 0.2, \"Double flag for testing\");\nSTRING_PARAM_FLAG(foo_string, \"foo\", \"String flag for testing\");\nSTRING_PARAM_FLAG(bar_string, \"bar\", \"String flag for testing\");\nBOOL_PARAM_FLAG(foo_bool, false, \"Bool flag for testing\");\nBOOL_PARAM_FLAG(bar_bool, false, \"Bool flag for testing\");\n// A flag whose name is a single character, tested for backward\n// compatibility. 
This should be selected to not conflict with existing flags\n// in commontraining.cpp.\nSTRING_PARAM_FLAG(q, \"\", \"Single character name\");\n\nnamespace tesseract {\n\nclass CommandlineflagsTest : public ::testing::Test {\nprotected:\n  void TestParser(int argc, const char **const_argv) {\n    TestParser(\"\", argc, const_argv);\n  }\n  void TestParser(const char *usage, int argc, const char **const_argv) {\n    // Make a copy of the pointer since it can be altered by the function.\n    char **argv = const_cast<char **>(const_argv);\n    tesseract::ParseCommandLineFlags(usage, &argc, &argv, true);\n  }\n};\n\nTEST_F(CommandlineflagsTest, RemoveFlags) {\n  const char *const_argv[] = {\"Progname\", \"--foo_int\", \"3\", \"file1.h\", \"file2.h\"};\n  int argc = countof(const_argv);\n  char **argv = const_cast<char **>(const_argv);\n  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);\n\n  // argv should be rearranged to look like { \"Progname\", \"file1.h\", \"file2.h\" }\n  EXPECT_EQ(3, argc);\n  EXPECT_STREQ(\"Progname\", argv[0]);\n  EXPECT_STREQ(\"file1.h\", argv[1]);\n  EXPECT_STREQ(\"file2.h\", argv[2]);\n}\n\n#if 0 // TODO: this test needs an update (it currently fails).\nTEST_F(CommandlineflagsTest, PrintUsageAndExit) {\n  const char* argv[] = { \"Progname\", \"--help\" };\n  EXPECT_EXIT(TestParser(\"Progname [flags]\", countof(argv), argv),\n              ::testing::ExitedWithCode(0),\n              \"USAGE: Progname \\\\[flags\\\\]\");\n}\n#endif\n\nTEST_F(CommandlineflagsTest, ExitsWithErrorOnInvalidFlag) {\n  const char *argv[] = {\"\", \"--test_nonexistent_flag\"};\n  EXPECT_EXIT(TestParser(countof(argv), argv), ::testing::ExitedWithCode(1),\n              \"ERROR: Non-existent flag\");\n}\n\nTEST_F(CommandlineflagsTest, ParseIntegerFlags) {\n  const char *argv[] = {\"\", \"--foo_int=3\", \"--bar_int\", \"-4\"};\n  TestParser(countof(argv), argv);\n  EXPECT_EQ(3, FLAGS_foo_int);\n  EXPECT_EQ(-4, FLAGS_bar_int);\n\n  const char 
*arg_no_value[] = {\"\", \"--bar_int\"};\n  EXPECT_EXIT(TestParser(countof(arg_no_value), arg_no_value), ::testing::ExitedWithCode(1),\n              \"ERROR\");\n\n  const char *arg_invalid_value[] = {\"\", \"--bar_int\", \"--foo_int=3\"};\n  EXPECT_EXIT(TestParser(countof(arg_invalid_value), arg_invalid_value),\n              ::testing::ExitedWithCode(1), \"ERROR\");\n\n  const char *arg_bad_format[] = {\"\", \"--bar_int=\"};\n  EXPECT_EXIT(TestParser(countof(arg_bad_format), arg_bad_format), ::testing::ExitedWithCode(1),\n              \"ERROR\");\n}\n\nTEST_F(CommandlineflagsTest, ParseDoubleFlags) {\n  const char *argv[] = {\"\", \"--foo_double=3.14\", \"--bar_double\", \"1.2\"};\n  TestParser(countof(argv), argv);\n\n  EXPECT_EQ(3.14, FLAGS_foo_double);\n  EXPECT_EQ(1.2, FLAGS_bar_double);\n\n  const char *arg_no_value[] = {\"\", \"--bar_double\"};\n  EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), \"ERROR\");\n\n  const char *arg_bad_format[] = {\"\", \"--bar_double=\"};\n  EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1), \"ERROR\");\n}\n\nTEST_F(CommandlineflagsTest, ParseStringFlags) {\n  const char *argv[] = {\"\", \"--foo_string=abc\", \"--bar_string\", \"def\"};\n  TestParser(countof(argv), argv);\n\n  EXPECT_STREQ(\"abc\", FLAGS_foo_string.c_str());\n  EXPECT_STREQ(\"def\", FLAGS_bar_string.c_str());\n\n  const char *arg_no_value[] = {\"\", \"--bar_string\"};\n  EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), \"ERROR\");\n\n  FLAGS_bar_string.set_value(\"bar\");\n  const char *arg_empty_string[] = {\"\", \"--bar_string=\"};\n  TestParser(2, arg_empty_string);\n  EXPECT_STREQ(\"\", FLAGS_bar_string.c_str());\n}\n\nTEST_F(CommandlineflagsTest, ParseBoolFlags) {\n  const char *argv[] = {\"\", \"--foo_bool=true\", \"--bar_bool=1\"};\n  FLAGS_foo_bool.set_value(false);\n  FLAGS_bar_bool.set_value(false);\n  TestParser(countof(argv), argv);\n  // Verify changed value\n  
EXPECT_TRUE(FLAGS_foo_bool);\n  EXPECT_TRUE(FLAGS_bar_bool);\n\n  const char *inv_argv[] = {\"\", \"--foo_bool=false\", \"--bar_bool=0\"};\n  FLAGS_foo_bool.set_value(true);\n  FLAGS_bar_bool.set_value(true);\n  TestParser(3, inv_argv);\n  // Verify changed value\n  EXPECT_FALSE(FLAGS_foo_bool);\n  EXPECT_FALSE(FLAGS_bar_bool);\n\n  const char *arg_implied_true[] = {\"\", \"--bar_bool\"};\n  FLAGS_bar_bool.set_value(false);\n  TestParser(2, arg_implied_true);\n  EXPECT_TRUE(FLAGS_bar_bool);\n\n  const char *arg_missing_val[] = {\"\", \"--bar_bool=\"};\n  EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1), \"ERROR\");\n}\n\nTEST_F(CommandlineflagsTest, ParseOldFlags) {\n  EXPECT_STREQ(\"\", FLAGS_q.c_str());\n  const char *argv[] = {\"\", \"-q\", \"text\"};\n  TestParser(countof(argv), argv);\n  EXPECT_STREQ(\"text\", FLAGS_q.c_str());\n}\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/cycletimer.h",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n// Portability include to match the Google test environment.\n\n#ifndef TESSERACT_UNITTEST_CYCLETIMER_H\n#define TESSERACT_UNITTEST_CYCLETIMER_H\n\n#include <chrono> // for std::chrono\n\n// See https://github.com/google/or-tools/blob/master/ortools/base/timer.h\nclass CycleTimer {\nprivate:\n  static int64_t now() {\n    return std::chrono::duration_cast<std::chrono::milliseconds>(\n      std::chrono::steady_clock::now().time_since_epoch()).count();\n  }\n\npublic:\n  CycleTimer() {\n    Reset();\n  }\n\n  void Reset() {\n    running_ = false;\n    sum_ = 0;\n    start_ = 0;\n  }\n\n  // When Start() is called multiple times, only the most recent is used.\n  void Start() {\n    running_ = true;\n    start_ = now();\n  }\n\n  void Restart() {\n    sum_ = 0;\n    Start();\n  }\n\n  void Stop() {\n    if (running_) {\n      sum_ += now() - start_;\n      running_ = false;\n    }\n  }\n  int64_t GetInMs() const {\n    return running_ ? now() - start_ + sum_ : sum_;\n  }\n\nprivate:\n  bool running_;\n  int64_t start_;\n  int64_t sum_;\n};\n\n#endif // TESSERACT_UNITTEST_CYCLETIMER_H\n"
  },
  {
    "path": "unittest/dawg_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n\n#include \"ratngs.h\"\n#include \"trie.h\"\n#include \"unicharset.h\"\n\n#include <sys/stat.h>\n#include <cstdlib> // for system\n#include <fstream> // for ifstream\n#include <set>\n#include <string>\n#include <vector>\n\n#ifndef SW_TESTING\n#  define wordlist2dawg_prog \"wordlist2dawg\"\n#  define dawg2wordlist_prog \"dawg2wordlist\"\n#endif\n\nnamespace tesseract {\n\n// Test some basic functionality dealing with Dawgs (compressed dictionaries,\n// aka Directed Acyclic Word Graphs).\nclass DawgTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\n  void LoadWordlist(const std::string &filename, std::set<std::string> *words) const {\n    std::ifstream file(filename);\n    if (file.is_open()) {\n      std::string line;\n      while (getline(file, line)) {\n        // Remove trailing line terminators from line.\n        while (!line.empty() && (line.back() == '\\n' || line.back() == '\\r')) {\n          line.resize(line.size() - 1);\n        }\n        // Add line to set.\n        words->insert(line.c_str());\n      }\n      file.close();\n    }\n  }\n  std::string TessBinaryPath(const std::string &name) const {\n    return file::JoinPath(TESSBIN_DIR, name);\n  }\n  std::string OutputNameToPath(const std::string &name) const {\n    return 
file::JoinPath(FLAGS_test_tmpdir, name);\n  }\n  int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2,\n                 const std::string &arg3) const {\n    std::string cmdline = TessBinaryPath(program) + \" \" + arg1 + \" \" + arg2 + \" \" + arg3;\n    return system(cmdline.c_str());\n  }\n  // Test that we are able to convert a wordlist file (one \"word\" per line) to\n  // a dawg (a compressed format) and then extract the original wordlist back\n  // out using the tools \"wordlist2dawg\" and \"dawg2wordlist.\"\n  void TestDawgRoundTrip(const std::string &unicharset_filename,\n                         const std::string &wordlist_filename) const {\n    std::set<std::string> orig_words, roundtrip_words;\n    std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename);\n    std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename);\n    std::string output_dawg = OutputNameToPath(wordlist_filename + \".dawg\");\n    std::string output_wordlist = OutputNameToPath(wordlist_filename);\n    LoadWordlist(orig_wordlist, &orig_words);\n    EXPECT_EQ(RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);\n    EXPECT_EQ(RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), 0);\n    LoadWordlist(output_wordlist, &roundtrip_words);\n    EXPECT_EQ(orig_words, roundtrip_words);\n  }\n};\n\nTEST_F(DawgTest, TestDawgConversion) {\n  TestDawgRoundTrip(\"eng.unicharset\", \"eng.wordlist.clean.freq\");\n}\n\nTEST_F(DawgTest, TestMatching) {\n  UNICHARSET unicharset;\n  unicharset.load_from_file(file::JoinPath(TESTING_DIR, \"eng.unicharset\").c_str());\n  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, \"basic_dawg\", NGRAM_PERM, unicharset.size(), 0);\n  WERD_CHOICE space_apos(\" '\", unicharset);\n  trie.add_word_to_dawg(space_apos);\n\n  WERD_CHOICE space(\" \", unicharset);\n\n  // partial match ok - then good!\n  EXPECT_TRUE(trie.prefix_in_dawg(space, false));\n  // 
require complete match - not present.\n  EXPECT_FALSE(trie.word_in_dawg(space));\n  EXPECT_FALSE(trie.prefix_in_dawg(space, true));\n\n  // partial or complete match ok for full word:\n  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));\n  EXPECT_TRUE(trie.word_in_dawg(space_apos));\n  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/denorm_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"blobs.h\"\n#include \"normalis.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass DENORMTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\npublic:\n  void TearDown() override {}\n\n  void ExpectCorrectTransform(const DENORM &denorm, const TPOINT &src, const TPOINT &result,\n                              bool local) {\n    TPOINT normed;\n    if (local) {\n      denorm.LocalNormTransform(src, &normed);\n    } else {\n      denorm.NormTransform(nullptr, src, &normed);\n    }\n    EXPECT_EQ(result.x, normed.x);\n    EXPECT_EQ(result.y, normed.y);\n    // Now undo\n    TPOINT denormed;\n    if (local) {\n      denorm.LocalDenormTransform(normed, &denormed);\n    } else {\n      denorm.DenormTransform(nullptr, normed, &denormed);\n    }\n    EXPECT_EQ(src.x, denormed.x);\n    EXPECT_EQ(src.y, denormed.y);\n  }\n};\n\n// Tests a simple baseline-style normalization.\nTEST_F(DENORMTest, NoRotations) {\n  DENORM denorm;\n  denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f,\n                            static_cast<float>(kBlnBaselineOffset));\n  TPOINT pt1(1100, 2000);\n  TPOINT result1(200, kBlnBaselineOffset);\n  ExpectCorrectTransform(denorm, pt1, result1, true);\n  ExpectCorrectTransform(denorm, pt1, result1, false);\n  TPOINT 
pt2(900, 2100);\n  TPOINT result2(-200, 300 + kBlnBaselineOffset);\n  ExpectCorrectTransform(denorm, pt2, result2, true);\n  ExpectCorrectTransform(denorm, pt2, result2, false);\n}\n\n// Tests a simple baseline-style normalization with a rotation.\nTEST_F(DENORMTest, WithRotations) {\n  DENORM denorm;\n  FCOORD rotation90(0.0f, 1.0f);\n  denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f,\n                            static_cast<float>(kBlnBaselineOffset));\n\n  TPOINT pt1(1100, 2000);\n  TPOINT result1(0, 200 + kBlnBaselineOffset);\n  ExpectCorrectTransform(denorm, pt1, result1, true);\n  ExpectCorrectTransform(denorm, pt1, result1, false);\n  TPOINT pt2(900, 2100);\n  TPOINT result2(-300, kBlnBaselineOffset - 200);\n  ExpectCorrectTransform(denorm, pt2, result2, true);\n  ExpectCorrectTransform(denorm, pt2, result2, false);\n}\n\n// Tests a simple baseline-style normalization with a second rotation & scale.\nTEST_F(DENORMTest, Multiple) {\n  DENORM denorm;\n  denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f,\n                            static_cast<float>(kBlnBaselineOffset));\n\n  DENORM denorm2;\n  FCOORD rotation90(0.0f, 1.0f);\n  denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f, 0.25f, 0.0f,\n                             0.0f);\n  TPOINT pt1(1050, 2000);\n  TPOINT result1(100, kBlnBaselineOffset);\n  ExpectCorrectTransform(denorm, pt1, result1, true);\n  ExpectCorrectTransform(denorm, pt1, result1, false);\n  TPOINT result2(kBlnBaselineOffset / 4, -14);\n  ExpectCorrectTransform(denorm2, result1, result2, true);\n  ExpectCorrectTransform(denorm2, pt1, result2, false);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/doubleptr.h",
    "content": "// Copyright 2012 Google Inc. All Rights Reserved.\n// Author: rays@google.com (Ray Smith)\n///////////////////////////////////////////////////////////////////////\n// File:        doubleptr.h\n// Description: Double-ended pointer that keeps pointing correctly even\n//              when reallocated or copied.\n// Author:      Ray Smith\n//\n// (C) Copyright 2012, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_CCUTIL_DOUBLEPTR_H_\n#define TESSERACT_CCUTIL_DOUBLEPTR_H_\n\n#include \"errcode.h\"\n\nnamespace tesseract {\n\n// A smart pointer class that implements a double-ended pointer. Each end\n// points to the other end. The copy constructor and operator= have MOVE\n// semantics, meaning that the relationship with the other end moves to the\n// destination of the copy, leaving the source unattached.\n// For this reason both the copy constructor and the operator= take a non-const\n// reference argument, and the const reference versions cannot be used.\n// DoublePtr is useful to incorporate into structures that are part of a\n// collection such as STL containers, where reallocs can\n// relocate the members. 
DoublePtr is also useful in a GenericHeap, where it\n// can correctly maintain the pointer to an element of the heap despite it\n// getting moved around on the heap.\nclass DoublePtr {\npublic:\n  DoublePtr() : other_end_(nullptr) {}\n  // Copy constructor steals the partner off src and is therefore a non\n  // const reference arg.\n  // Copying a const DoublePtr generates a compiler error.\n  DoublePtr(const DoublePtr &src) {\n    other_end_ = src.other_end_;\n    if (other_end_ != nullptr) {\n      other_end_->other_end_ = this;\n      ((DoublePtr &)src).other_end_ = nullptr;\n    }\n  }\n  // Operator= steals the partner off src, and therefore needs src to be a non-\n  // const reference.\n  // Assigning from a const DoublePtr generates a compiler error.\n  void operator=(const DoublePtr &src) {\n    Disconnect();\n    other_end_ = src.other_end_;\n    if (other_end_ != nullptr) {\n      other_end_->other_end_ = this;\n      ((DoublePtr &)src).other_end_ = nullptr;\n    }\n  }\n\n  // Connects this and other, discarding any existing connections.\n  void Connect(DoublePtr *other) {\n    other->Disconnect();\n    Disconnect();\n    other->other_end_ = this;\n    other_end_ = other;\n  }\n  // Disconnects this and other, making OtherEnd() return nullptr for both.\n  void Disconnect() {\n    if (other_end_ != nullptr) {\n      other_end_->other_end_ = nullptr;\n      other_end_ = nullptr;\n    }\n  }\n  // Returns the pointer to the other end of the double pointer.\n  DoublePtr *OtherEnd() const {\n    return other_end_;\n  }\n\nprivate:\n  // Pointer to the other end of the link. It is always true that either\n  // other_end_ == nullptr or other_end_->other_end_ == this.\n  DoublePtr *other_end_;\n};\n\n} // namespace tesseract.\n\n#endif // THIRD_PARTY_TESSERACT_CCUTIL_DOUBLEPTR_H_\n"
  },
  {
    "path": "unittest/equationdetect_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n\n#include \"colpartitiongrid.h\"\n#include \"equationdetect.h\"\n#include \"tesseractclass.h\"\n\n#include <allheaders.h>\n\n#include <memory>\n#include <string>\n#include <utility>\n\n#define ENABLE_IdentifySpecialText_TEST 0\n#if ENABLE_IdentifySpecialText_TEST\n#  define EQU_TRAINEDDATA_NAME \"equ\"\n#else\n#  define EQU_TRAINEDDATA_NAME \"equINTENTIONALLY_MISSING_FILE\"\n#endif\n\nnamespace tesseract {\n\nclass TestableEquationDetect : public EquationDetect {\npublic:\n  TestableEquationDetect(const char *tessdata, Tesseract *lang_tesseract)\n      : EquationDetect(tessdata, EQU_TRAINEDDATA_NAME) {\n    SetLangTesseract(lang_tesseract);\n  }\n\n  // Insert a certain math and digit blobs into part.\n  void AddMathDigitBlobs(const int math_blobs, const int digit_blobs, const int total_blobs,\n                         ColPartition *part) {\n    CHECK(part != nullptr);\n    CHECK_LE(math_blobs + digit_blobs, total_blobs);\n    int count = 0;\n    for (int i = 0; i < math_blobs; i++, count++) {\n      auto *blob = new BLOBNBOX();\n      blob->set_special_text_type(BSTT_MATH);\n      part->AddBox(blob);\n    }\n    for (int i = 0; i < digit_blobs; i++, count++) {\n      auto *blob = new BLOBNBOX();\n      blob->set_special_text_type(BSTT_DIGIT);\n      part->AddBox(blob);\n    }\n    for (int i = count; i < total_blobs; i++) {\n      
auto *blob = new BLOBNBOX();\n      blob->set_special_text_type(BSTT_NONE);\n      part->AddBox(blob);\n    }\n  }\n\n  // Set up pix_binary for lang_tesseract_.\n  void SetPixBinary(Image pix) {\n    CHECK_EQ(1, pixGetDepth(pix));\n    *(lang_tesseract_->mutable_pix_binary()) = pix;\n  }\n\n  void RunIdentifySpecialText(BLOBNBOX *blob, const int height_th) {\n    IdentifySpecialText(blob, height_th);\n  }\n\n  BlobSpecialTextType RunEstimateTypeForUnichar(const char *val) {\n    const UNICHARSET &unicharset = lang_tesseract_->unicharset;\n    return EstimateTypeForUnichar(unicharset, unicharset.unichar_to_id(val));\n  }\n\n  EquationDetect::IndentType RunIsIndented(ColPartitionGrid *part_grid, ColPartition *part) {\n    this->part_grid_ = part_grid;\n    return IsIndented(part);\n  }\n\n  bool RunIsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) {\n    return IsNearSmallNeighbor(seed_box, part_box);\n  }\n\n  bool RunCheckSeedBlobsCount(ColPartition *part) {\n    return CheckSeedBlobsCount(part);\n  }\n\n  float RunComputeForegroundDensity(const TBOX &tbox) {\n    return ComputeForegroundDensity(tbox);\n  }\n\n  int RunCountAlignment(const std::vector<int> &sorted_vec, const int val) {\n    return CountAlignment(sorted_vec, val);\n  }\n\n  void RunSplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {\n    SplitCPHorLite(part, splitted_boxes);\n  }\n\n  void RunSplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {\n    SplitCPHor(part, parts_splitted);\n  }\n\n  void TestComputeCPsSuperBBox(const TBOX &box, ColPartitionGrid *part_grid) {\n    CHECK(part_grid != nullptr);\n    part_grid_ = part_grid;\n    ComputeCPsSuperBBox();\n    EXPECT_TRUE(*cps_super_bbox_ == box);\n  }\n};\n\nclass EquationFinderTest : public testing::Test {\nprotected:\n  std::unique_ptr<TestableEquationDetect> equation_det_;\n  std::unique_ptr<Tesseract> tesseract_;\n\n  // The directory for testdata;\n  std::string testdata_dir_;\n\n  
void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    tesseract_ = std::make_unique<Tesseract>();\n    tesseract_->init_tesseract(TESSDATA_DIR, \"eng\", OEM_TESSERACT_ONLY);\n    tesseract_->set_source_resolution(300);\n    equation_det_ = std::make_unique<TestableEquationDetect>(TESSDATA_DIR, tesseract_.get());\n    equation_det_->SetResolution(300);\n\n    testdata_dir_ = TESTDATA_DIR;\n  }\n\n  void TearDown() override {\n    tesseract_.reset(nullptr);\n    equation_det_.reset(nullptr);\n  }\n\n  // Add a BLOCK covering the whole page.\n  void AddPageBlock(Image pix, BLOCK_LIST *blocks) {\n    CHECK(pix != nullptr);\n    CHECK(blocks != nullptr);\n    BLOCK_IT block_it(blocks);\n    auto *block = new BLOCK(\"\", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix));\n    block_it.add_to_end(block);\n  }\n\n  // Create col partitions, add into part_grid, and put them into all_parts.\n  void CreateColParts(const int rows, const int cols, ColPartitionGrid *part_grid,\n                      std::vector<ColPartition *> *all_parts) {\n    const int kWidth = 10, kHeight = 10;\n    ClearParts(all_parts);\n    for (int y = 0; y < rows; ++y) {\n      for (int x = 0; x < cols; ++x) {\n        int left = x * kWidth * 2, bottom = y * kHeight * 2;\n        TBOX box(left, bottom, left + kWidth, bottom + kHeight);\n        ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n        part_grid->InsertBBox(true, true, part);\n        all_parts->push_back(part);\n      }\n    }\n  }\n\n  void ClearParts(std::vector<ColPartition *> *all_parts) {\n    for (auto &all_part : *all_parts) {\n      all_part->DeleteBoxes();\n      delete all_part;\n    }\n  }\n\n  // Create a BLOBNBOX object with bounding box tbox, and add it into part.\n  void AddBlobIntoPart(const TBOX &tbox, ColPartition *part) {\n    CHECK(part != nullptr);\n    auto *blob = new BLOBNBOX();\n    blob->set_bounding_box(tbox);\n    part->AddBox(blob);\n  
}\n};\n\nTEST_F(EquationFinderTest, IdentifySpecialText) {\n#if !ENABLE_IdentifySpecialText_TEST\n  GTEST_SKIP();\n#else // TODO: missing equ_gt1.tif\n  // Load Image.\n  std::string imagefile = file::JoinPath(testdata_dir_, \"equ_gt1.tif\");\n  Image pix_binary = pixRead(imagefile.c_str());\n  CHECK(pix_binary != nullptr && pixGetDepth(pix_binary) == 1);\n\n  // Get components.\n  BLOCK_LIST blocks;\n  TO_BLOCK_LIST to_blocks;\n  AddPageBlock(pix_binary, &blocks);\n  Textord *textord = tesseract_->mutable_textord();\n  textord->find_components(pix_binary, &blocks, &to_blocks);\n\n  // Identify special texts from to_blocks.\n  TO_BLOCK_IT to_block_it(&to_blocks);\n  std::map<int, int> stt_count;\n  for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); to_block_it.forward()) {\n    TO_BLOCK *to_block = to_block_it.data();\n    BLOBNBOX_IT blob_it(&(to_block->blobs));\n    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {\n      BLOBNBOX *blob = blob_it.data();\n      // blob->set_special_text_type(BSTT_NONE);\n      equation_det_->RunIdentifySpecialText(blob, 0);\n      tensorflow::gtl::InsertIfNotPresent(&stt_count, blob->special_text_type(), 0);\n      stt_count[blob->special_text_type()]++;\n    }\n  }\n\n  // Verify the number, but allow a range of +/- kCountRange before squealing.\n  const int kCountRange = 3;\n  EXPECT_GE(39 + kCountRange, stt_count[BSTT_NONE]);\n  EXPECT_LE(39 - kCountRange, stt_count[BSTT_NONE]);\n\n  // if you count all the subscripts etc, there are ~45 italic chars.\n  EXPECT_GE(45 + kCountRange, stt_count[BSTT_ITALIC]);\n  EXPECT_LE(45 - kCountRange, stt_count[BSTT_ITALIC]);\n  EXPECT_GE(41 + kCountRange, stt_count[BSTT_DIGIT]);\n  EXPECT_LE(41 - kCountRange, stt_count[BSTT_DIGIT]);\n  EXPECT_GE(50 + kCountRange, stt_count[BSTT_MATH]);\n  EXPECT_LE(50 - kCountRange, stt_count[BSTT_MATH]);\n  EXPECT_GE(10 + kCountRange, stt_count[BSTT_UNCLEAR]);\n  EXPECT_LE(10 - kCountRange, stt_count[BSTT_UNCLEAR]);\n\n  
// Release memory.\n  pix_binary.destroy();\n#endif\n}\n\nTEST_F(EquationFinderTest, EstimateTypeForUnichar) {\n  // Test abc characters.\n  EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar(\"a\"));\n  EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar(\"c\"));\n\n  // Test punctuation characters.\n  EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar(\"'\"));\n  EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar(\",\"));\n\n  // Test digits.\n  EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar(\"1\"));\n  EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar(\"4\"));\n  EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar(\"|\"));\n\n  // Test math symbols.\n  EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar(\"(\"));\n  EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar(\"+\"));\n}\n\nTEST_F(EquationFinderTest, IsIndented) {\n  ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000));\n\n  // Create five ColPartitions:\n  // part 1: ************\n  // part 2:   *********\n  // part 3: *******\n  // part 4:   *****\n  //\n  // part 5:   ********\n  TBOX box1(0, 950, 999, 999);\n  ColPartition *part1 = ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  part_grid.InsertBBox(true, true, part1);\n  TBOX box2(300, 920, 900, 940);\n  ColPartition *part2 = ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  part_grid.InsertBBox(true, true, part2);\n  TBOX box3(0, 900, 600, 910);\n  ColPartition *part3 = ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  part_grid.InsertBBox(true, true, part3);\n  TBOX box4(300, 890, 600, 899);\n  ColPartition *part4 = ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  part_grid.InsertBBox(true, true, part4);\n  TBOX box5(300, 500, 900, 510);\n  ColPartition *part5 = ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, 
BTFT_NONE);\n  part_grid.InsertBBox(true, true, part5);\n\n  // Test\n  // part1 should be no indent.\n  EXPECT_EQ(EquationDetect::NO_INDENT, equation_det_->RunIsIndented(&part_grid, part1));\n  // part2 should be left indent in terms of part1.\n  EXPECT_EQ(EquationDetect::LEFT_INDENT, equation_det_->RunIsIndented(&part_grid, part2));\n  // part3 should be right indent.\n  EXPECT_EQ(EquationDetect::RIGHT_INDENT, equation_det_->RunIsIndented(&part_grid, part3));\n  // part4 should be both indented.\n  EXPECT_EQ(EquationDetect::BOTH_INDENT, equation_det_->RunIsIndented(&part_grid, part4));\n  // part5 should be no indent because it is too far from part1.\n  EXPECT_EQ(EquationDetect::NO_INDENT, equation_det_->RunIsIndented(&part_grid, part5));\n\n  // Release memory.\n  part1->DeleteBoxes();\n  delete (part1);\n  part2->DeleteBoxes();\n  delete (part2);\n  part3->DeleteBoxes();\n  delete (part3);\n  part4->DeleteBoxes();\n  delete (part4);\n  part5->DeleteBoxes();\n  delete (part5);\n}\n\nTEST_F(EquationFinderTest, IsNearSmallNeighbor) {\n  // Create four tboxes:\n  //          part 1, part 2\n  //           *****   *****\n  // part 3:   *****\n  //\n  // part 4: *****************\n  TBOX box1(0, 950, 499, 999);\n  TBOX box2(500, 950, 999, 998);\n  TBOX box3(0, 900, 499, 949);\n  TBOX box4(0, 550, 499, 590);\n\n  // Test\n  // box2 should be box1's near neighbor but not vice versa.\n  EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box2));\n  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box1));\n  // box1 and box3 should be near neighbors of each other.\n  EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box3));\n  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box3));\n  // box2 and box3 should not be near neighbors of each other.\n  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box3));\n  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box2));\n\n  // box4 should not be the near neighbor of any one.\n  
EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box1, box4));\n  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box4));\n  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box4));\n}\n\nTEST_F(EquationFinderTest, CheckSeedBlobsCount) {\n  TBOX box(0, 950, 999, 999);\n  ColPartition *part1 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  ColPartition *part2 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  ColPartition *part3 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  ColPartition *part4 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n\n  // Part 1: 8 math, 0 digit, 20 total.\n  equation_det_->AddMathDigitBlobs(8, 0, 20, part1);\n  EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part1));\n\n  // Part 2: 1 math, 8 digit, 20 total.\n  equation_det_->AddMathDigitBlobs(1, 8, 20, part2);\n  EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part2));\n\n  // Part 3: 3 math, 8 digit, 20 total.\n  equation_det_->AddMathDigitBlobs(3, 8, 20, part3);\n  EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part3));\n\n  // Part 4: 0 math, 0 digit, 8 total.\n  equation_det_->AddMathDigitBlobs(0, 0, 8, part4);\n  EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part4));\n\n  // Release memory.\n  part1->DeleteBoxes();\n  delete (part1);\n  part2->DeleteBoxes();\n  delete (part2);\n  part3->DeleteBoxes();\n  delete (part3);\n  part4->DeleteBoxes();\n  delete (part4);\n}\n\nTEST_F(EquationFinderTest, ComputeForegroundDensity) {\n  // Create the pix with top half foreground, bottom half background.\n  int width = 1024, height = 768;\n  Image pix = pixCreate(width, height, 1);\n  pixRasterop(pix, 0, 0, width, height / 2, PIX_SET, nullptr, 0, 0);\n  TBOX box1(100, 0, 140, 140), box2(100, height / 2 - 20, 140, height / 2 + 20),\n      box3(100, height - 40, 140, height);\n  equation_det_->SetPixBinary(pix);\n\n  // Verify\n  EXPECT_NEAR(0.0, 
equation_det_->RunComputeForegroundDensity(box1), 0.0001f);\n  EXPECT_NEAR(0.5, equation_det_->RunComputeForegroundDensity(box2), 0.0001f);\n  EXPECT_NEAR(1.0, equation_det_->RunComputeForegroundDensity(box3), 0.0001f);\n}\n\nTEST_F(EquationFinderTest, CountAlignment) {\n  std::vector<int> vec;\n  vec.push_back(1);\n  vec.push_back(1);\n  vec.push_back(1);\n  vec.push_back(100);\n  vec.push_back(200);\n  vec.push_back(200);\n\n  // Test the right point.\n  EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 1));\n  EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 100));\n  EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 200));\n\n  // Test the near neighbors.\n  EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 3));\n  EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 99));\n  EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 202));\n\n  // Test the far neighbors.\n  EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 150));\n  EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 50));\n  EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 250));\n}\n\nTEST_F(EquationFinderTest, ComputeCPsSuperBBox) {\n  Image pix = pixCreate(1001, 1001, 1);\n  equation_det_->SetPixBinary(pix);\n  ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000));\n\n  TBOX box1(0, 0, 999, 99);\n  ColPartition *part1 = ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  TBOX box2(0, 100, 499, 199);\n  ColPartition *part2 = ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  TBOX box3(500, 100, 999, 199);\n  ColPartition *part3 = ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  TBOX box4(0, 200, 999, 299);\n  ColPartition *part4 = ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  TBOX box5(0, 900, 999, 999);\n  ColPartition *part5 = ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n\n  // Add part1->part3 into part_grid and test.\n  part_grid.InsertBBox(true, 
true, part1);\n  part_grid.InsertBBox(true, true, part2);\n  part_grid.InsertBBox(true, true, part3);\n  TBOX super_box(0, 0, 999, 199);\n  equation_det_->TestComputeCPsSuperBBox(super_box, &part_grid);\n\n  // Add part4 and test.\n  part_grid.InsertBBox(true, true, part4);\n  TBOX super_box2(0, 0, 999, 299);\n  equation_det_->TestComputeCPsSuperBBox(super_box2, &part_grid);\n\n  // Add part5 and test.\n  part_grid.InsertBBox(true, true, part5);\n  TBOX super_box3(0, 0, 999, 999);\n  equation_det_->TestComputeCPsSuperBBox(super_box3, &part_grid);\n\n  // Release memory.\n  part1->DeleteBoxes();\n  delete (part1);\n  part2->DeleteBoxes();\n  delete (part2);\n  part3->DeleteBoxes();\n  delete (part3);\n  part4->DeleteBoxes();\n  delete (part4);\n  part5->DeleteBoxes();\n  delete (part5);\n}\n\nTEST_F(EquationFinderTest, SplitCPHorLite) {\n  TBOX box(0, 0, 999, 99);\n  ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  part->DeleteBoxes();\n  part->set_median_width(10);\n  std::vector<TBOX> splitted_boxes;\n\n  // Test an empty part.\n  equation_det_->RunSplitCPHorLite(part, &splitted_boxes);\n  EXPECT_TRUE(splitted_boxes.empty());\n\n  // Test with one blob.\n  AddBlobIntoPart(TBOX(0, 0, 10, 50), part);\n  equation_det_->RunSplitCPHorLite(part, &splitted_boxes);\n  EXPECT_EQ(1, splitted_boxes.size());\n  EXPECT_TRUE(TBOX(0, 0, 10, 50) == splitted_boxes[0]);\n\n  // Add more blob and test.\n  AddBlobIntoPart(TBOX(11, 0, 20, 60), part);\n  AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point.\n  AddBlobIntoPart(TBOX(100, 0, 110, 15), part);\n  AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point.\n  AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point.\n  equation_det_->RunSplitCPHorLite(part, &splitted_boxes);\n  // Verify.\n  EXPECT_EQ(3, splitted_boxes.size());\n  EXPECT_TRUE(TBOX(0, 0, 30, 60) == splitted_boxes[0]);\n  EXPECT_TRUE(TBOX(100, 0, 140, 45) == splitted_boxes[1]);\n  EXPECT_TRUE(TBOX(500, 
0, 540, 35) == splitted_boxes[2]);\n\n  part->DeleteBoxes();\n  delete (part);\n}\n\nTEST_F(EquationFinderTest, SplitCPHor) {\n  TBOX box(0, 0, 999, 99);\n  ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n  part->DeleteBoxes();\n  part->set_median_width(10);\n  std::vector<ColPartition *> parts_splitted;\n\n  // Test an empty part.\n  equation_det_->RunSplitCPHor(part, &parts_splitted);\n  EXPECT_TRUE(parts_splitted.empty());\n  // Test with one blob.\n  AddBlobIntoPart(TBOX(0, 0, 10, 50), part);\n\n  equation_det_->RunSplitCPHor(part, &parts_splitted);\n  EXPECT_EQ(1, parts_splitted.size());\n  EXPECT_TRUE(TBOX(0, 0, 10, 50) == parts_splitted[0]->bounding_box());\n\n  // Add more blob and test.\n  AddBlobIntoPart(TBOX(11, 0, 20, 60), part);\n  AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point.\n  AddBlobIntoPart(TBOX(100, 0, 110, 15), part);\n  AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point.\n  AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point.\n  equation_det_->RunSplitCPHor(part, &parts_splitted);\n\n  // Verify.\n  EXPECT_EQ(3, parts_splitted.size());\n  EXPECT_TRUE(TBOX(0, 0, 30, 60) == parts_splitted[0]->bounding_box());\n  EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box());\n  EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box());\n\n  for (auto part_splitted : parts_splitted) {\n    delete part_splitted;\n  }\n  part->DeleteBoxes();\n  delete (part);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/fileio_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdio.h>\n#include <memory>\n\n#include \"fileio.h\"\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nTEST(FileTest, JoinPath) {\n  EXPECT_EQ(\"/abc/def\", File::JoinPath(\"/abc\", \"def\"));\n  EXPECT_EQ(\"/abc/def\", File::JoinPath(\"/abc/\", \"def\"));\n  EXPECT_EQ(\"def\", File::JoinPath(\"\", \"def\"));\n}\n\nTEST(OutputBufferTest, WriteString) {\n  const int kMaxBufSize = 128;\n  char buffer[kMaxBufSize];\n  for (char &i : buffer) {\n    i = '\\0';\n  }\n  FILE *fp = tmpfile();\n  CHECK(fp != nullptr);\n\n  auto output = std::make_unique<OutputBuffer>(fp);\n  output->WriteString(\"Hello \");\n  output->WriteString(\"world!\");\n\n  rewind(fp);\n  auto s = \"Hello world!\";\n  fread(buffer, strlen(s), 1, fp);\n  EXPECT_STREQ(s, buffer);\n}\n\nTEST(InputBufferTest, Read) {\n  const int kMaxBufSize = 128;\n  char buffer[kMaxBufSize];\n  auto s = \"Hello\\n world!\";\n  strncpy(buffer, s, kMaxBufSize);\n  EXPECT_STREQ(s, buffer);\n  FILE *fp = tmpfile();\n  CHECK(fp != nullptr);\n  fwrite(buffer, strlen(s), 1, fp);\n  rewind(fp);\n\n  std::string str;\n  auto input = std::make_unique<InputBuffer>(fp);\n  EXPECT_TRUE(input->Read(&str));\n  std::vector<std::string> lines = split(str, '\\n');\n  EXPECT_EQ(2, lines.size());\n  EXPECT_EQ(\"Hello\", lines[0]);\n  EXPECT_EQ(\" world!\", lines[1]);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/fuzzers/fuzzer-api.cpp",
    "content": "#include <allheaders.h>\n#include <tesseract/baseapi.h>\n\n#include <libgen.h> // for dirname\n#include <cstdio>   // for printf\n#include <cstdlib>  // for std::getenv, std::setenv\n#include <string>   // for std::string\n\n#ifndef TESSERACT_FUZZER_WIDTH\n#  define TESSERACT_FUZZER_WIDTH 100\n#endif\n\n#ifndef TESSERACT_FUZZER_HEIGHT\n#  define TESSERACT_FUZZER_HEIGHT 100\n#endif\n\nclass BitReader {\nprivate:\n  uint8_t const *data;\n  size_t size;\n  size_t shift;\n\npublic:\n  BitReader(const uint8_t *data, size_t size) : data(data), size(size), shift(0) {}\n\n  int Read(void) {\n    if (size == 0) {\n      return 0;\n    }\n\n    const int ret = ((*data) >> shift) & 1;\n\n    shift++;\n    if (shift >= 8) {\n      shift = 0;\n      data++;\n      size--;\n    }\n\n    return ret;\n  }\n};\n\nstatic tesseract::TessBaseAPI *api = nullptr;\n\nextern \"C\" int LLVMFuzzerInitialize(int * /*pArgc*/, char ***pArgv) {\n  if (std::getenv(\"TESSDATA_PREFIX\") == nullptr) {\n    std::string binary_path = *pArgv[0];\n    const std::string filepath = dirname(&binary_path[0]);\n\n    const std::string tessdata_path = filepath + \"/\" + \"tessdata\";\n    if (setenv(\"TESSDATA_PREFIX\", tessdata_path.c_str(), 1) != 0) {\n      printf(\"Setenv failed\\n\");\n      std::abort();\n    }\n  }\n\n  api = new tesseract::TessBaseAPI();\n  if (api->Init(nullptr, \"eng\") != 0) {\n    printf(\"Cannot initialize API\\n\");\n    abort();\n  }\n\n  /* Silence output */\n  api->SetVariable(\"debug_file\", \"/dev/null\");\n\n  return 0;\n}\n\nstatic PIX *createPix(BitReader &BR, const size_t width, const size_t height) {\n  Pix *pix = pixCreate(width, height, 1);\n\n  if (pix == nullptr) {\n    printf(\"pix creation failed\\n\");\n    abort();\n  }\n\n  for (size_t i = 0; i < width; i++) {\n    for (size_t j = 0; j < height; j++) {\n      pixSetPixel(pix, i, j, BR.Read());\n    }\n  }\n\n  return pix;\n}\n\nextern \"C\" int LLVMFuzzerTestOneInput(const uint8_t *data, 
size_t size) {\n  BitReader BR(data, size);\n\n  auto pix = createPix(BR, TESSERACT_FUZZER_WIDTH, TESSERACT_FUZZER_HEIGHT);\n\n  api->SetImage(pix);\n\n  char *outText = api->GetUTF8Text();\n\n  pixDestroy(&pix);\n  delete[] outText;\n\n  return 0;\n}\n"
  },
  {
    "path": "unittest/fuzzers/oss-fuzz-build.sh",
    "content": "#!/bin/bash -eu\n# Copyright 2019 Google Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#      http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n################################################################################\n\ncd \"$SRC\"/leptonica\n./autogen.sh\n./configure --disable-shared\nmake SUBDIRS=src install -j\"$(nproc)\"\nldconfig\n\ncd \"$SRC\"/tesseract\n./autogen.sh\nCXXFLAGS=\"$CXXFLAGS -D_GLIBCXX_DEBUG\" ./configure --disable-graphics --disable-shared\nmake -j\"$(nproc)\"\n\n# Get the models which are needed for the fuzzers.\n\nmkdir -p \"$OUT\"/tessdata\n(\ncd \"$OUT\"/tessdata\ntest -f eng.traineddata || \\\n  curl -sSL -O https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata\n)\n\n# OSS-Fuzz requires static linking for the project specific libraries,\n# so get the list of those libraries for Leptonica and TIFF.\n# Note that libm must be linked dynamically to avoid linker errors.\n\nLEPTONICA_CFLAGS=$(pkg-config --cflags lept)\nLEPTONICA_LIBS=$(pkg-config --static --libs lept)\nLIBTIFF_LIBS=$(pkg-config --static --libs libtiff-4 | sed 's/ -lm//')\n\n$CXX $CXXFLAGS \\\n    -I \"$SRC\"/tesseract/include \\\n     \"$SRC\"/tesseract/unittest/fuzzers/fuzzer-api.cpp -o \"$OUT\"/fuzzer-api \\\n     \"$SRC\"/tesseract/.libs/libtesseract.a \\\n     $LEPTONICA_CFLAGS \\\n     -Wl,-Bstatic $LEPTONICA_LIBS $LIBTIFF_LIBS -Wl,-Bdynamic \\\n     $LIB_FUZZING_ENGINE\n\n$CXX $CXXFLAGS \\\n    -DTESSERACT_FUZZER_WIDTH=512 \\\n    -DTESSERACT_FUZZER_HEIGHT=256 \\\n    -I 
\"$SRC\"/tesseract/include \\\n     \"$SRC\"/tesseract/unittest/fuzzers/fuzzer-api.cpp -o \"$OUT\"/fuzzer-api-512x256 \\\n     \"$SRC\"/tesseract/.libs/libtesseract.a \\\n     $LEPTONICA_CFLAGS \\\n     -Wl,-Bstatic $LEPTONICA_LIBS $LIBTIFF_LIBS -Wl,-Bdynamic \\\n     $LIB_FUZZING_ENGINE\n"
  },
  {
    "path": "unittest/heap_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n\n#include \"doubleptr.h\"\n#include \"genericheap.h\"\n#include \"kdpair.h\"\n\n#include <string>\n#include <utility>\n\nnamespace tesseract {\n\nint test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0};\n\n// The fixture for testing GenericHeap and DoublePtr.\nclass HeapTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\npublic:\n  ~HeapTest() override;\n  // Pushes the test data onto both the heap and the KDVector.\n  void PushTestData(GenericHeap<IntKDPair> *heap, KDVector *v) {\n    for (size_t i = 0; i < countof(test_data); ++i) {\n      IntKDPair pair(test_data[i], i);\n      heap->Push(&pair);\n      v->push_back(pair);\n    }\n  }\n  // Verifies that the data in the heap matches the vector (after sorting) by\n  // popping everything off the heap.\n  void VerifyHeapVectorMatch(GenericHeap<IntKDPair> *heap, KDVector *v) {\n    EXPECT_FALSE(heap->empty());\n    EXPECT_EQ(heap->size(), v->size());\n    // Sort the vector and check that the keys come out of the heap in the same\n    // order as v.\n    // Also check that the indices match, except for 9, which is duplicated.\n    std::sort(v->begin(), v->end());\n    // Check that we have increasing order.\n    EXPECT_LT((*v)[0].key(), v->back().key());\n    for (unsigned i = 0; i < v->size(); ++i) {\n      
EXPECT_EQ((*v)[i].key(), heap->PeekTop().key());\n      // Indices don't necessarily match for equal keys, so don't test them.\n      if (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) {\n        while (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) {\n          heap->Pop(nullptr);\n          ++i;\n          EXPECT_FALSE(heap->empty());\n          EXPECT_EQ((*v)[i].key(), heap->PeekTop().key());\n        }\n      } else {\n        // The indices must also match if the key is unique.\n        EXPECT_EQ((*v)[i].data(), heap->PeekTop().data());\n      }\n      EXPECT_FALSE(heap->empty());\n      EXPECT_TRUE(heap->Pop(nullptr));\n    }\n    EXPECT_TRUE(heap->empty());\n  }\n};\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of a weak vtable (fixes compiler warning).\nHeapTest::~HeapTest() = default;\n\n// Tests that a sort using a GenericHeap matches the result of a sort using\n// a KDVector.\nTEST_F(HeapTest, SortTest) {\n  GenericHeap<IntKDPair> heap;\n  EXPECT_TRUE(heap.empty());\n  KDVector v;\n  EXPECT_EQ(heap.size(), v.size());\n  // Push the test data onto both the heap and the KDVector.\n  PushTestData(&heap, &v);\n  VerifyHeapVectorMatch(&heap, &v);\n}\n\n// Tests that pushing some stuff, popping some stuff, and then pushing more\n// stuff results in output that matches the sort using a KDVector.\nTEST_F(HeapTest, MixedTest) {\n  GenericHeap<IntKDPair> heap;\n  KDVector v;\n  // Push the test data onto both the heap and the KDVector.\n  PushTestData(&heap, &v);\n  // Sort the vector and remove the first 5 values from both heap and v.\n  std::sort(v.begin(), v.end());\n  for (int i = 0; i < 5; ++i) {\n    heap.Pop(nullptr);\n    v.erase(v.begin());\n  }\n  // Push the test data onto both the heap and the KDVector.\n  PushTestData(&heap, &v);\n  // Heap and vector should still match!\n  VerifyHeapVectorMatch(&heap, &v);\n}\n\n// Tests that PopWorst still leaves the heap in a 
state such that it still\n// matches a sorted KDVector.\nTEST_F(HeapTest, PopWorstTest) {\n  GenericHeap<IntKDPair> heap;\n  KDVector v;\n  // Push the test data onto both the heap and the KDVector.\n  PushTestData(&heap, &v);\n  // Get the worst element off the heap.\n  IntKDPair pair;\n  heap.PopWorst(&pair);\n  EXPECT_EQ(pair.key(), 65536);\n  EXPECT_EQ(pair.data(), 6);\n  // Sort and remove the worst element from the vector.\n  std::sort(v.begin(), v.end());\n  v.resize(v.size() - 1);\n  // After that they should still match!\n  VerifyHeapVectorMatch(&heap, &v);\n}\n\n// Tests that Reshuffle works and the heap still matches a KDVector with the\n// same value changed. Doubles up as a test of DoublePtr.\nTEST_F(HeapTest, RevalueTest) {\n  // Here the data element of the pair is a DoublePtr, which links the entries\n  // in the vector and heap, and we test a MAX heap.\n  typedef KDPairDec<int, DoublePtr> PtrPair;\n  GenericHeap<PtrPair> heap;\n  std::vector<PtrPair> v;\n  // Push the test data onto both the heap and the vector.\n  for (int i : test_data) {\n    PtrPair h_pair;\n    h_pair.key() = i;\n    PtrPair v_pair;\n    v_pair.key() = i;\n    h_pair.data().Connect(&v_pair.data());\n    heap.Push(&h_pair);\n    v.push_back(v_pair);\n  }\n  // Test changes both ways. Index 0 is 8, so change it to -1.\n  v[0].key() = -1;\n  // v[0].data.OtherEnd() is a pointer to the data element in the appropriate\n  // heap entry, wherever it may be. We can change its value via that pointer.\n  // Without Reshuffle, that would be a terribly bad thing to do, as it violates\n  // the heap invariant, making the heap corrupt.\n  auto *pair_ptr = reinterpret_cast<PtrPair *>(v[0].data().OtherEnd());\n  pair_ptr->key() = v[0].key();\n  heap.Reshuffle(pair_ptr);\n  // Index 1 is 1. 
Change to 32767.\n  v[1].key() = 32767;\n  pair_ptr = reinterpret_cast<PtrPair *>(v[1].data().OtherEnd());\n  pair_ptr->key() = v[1].key();\n  heap.Reshuffle(pair_ptr);\n  // After the changes, popping the heap should still match the sorted order\n  // of the vector.\n  std::sort(v.begin(), v.end());\n  EXPECT_GT(v[0].key(), v.back().key());\n  for (auto &i : v) {\n    EXPECT_EQ(i.key(), heap.PeekTop().key());\n    EXPECT_FALSE(heap.empty());\n    heap.Pop(nullptr);\n  }\n  EXPECT_TRUE(heap.empty());\n}\n\n#if 0\n// Helper checks that the compiler rejects use of a copy constructor with\n// a const argument and the default copy constructor is properly hidden by\n// the non-const version.\nstatic void ConstRefTest(const DoublePtr& ptr1) {\n  DoublePtr ptr2(ptr1);  // Compiler error here.\n  EXPECT_EQ(&ptr2, ptr2.OtherEnd()->OtherEnd());\n  EXPECT_TRUE(ptr1.OtherEnd() == nullptr);\n}\n#endif\n\n// Tests that DoublePtr works as expected.\nTEST_F(HeapTest, DoublePtrTest) {\n  DoublePtr ptr1;\n  DoublePtr ptr2;\n  ptr1.Connect(&ptr2);\n  // Check that the correct copy constructor is used.\n  DoublePtr ptr3(ptr1);\n  EXPECT_EQ(&ptr3, ptr3.OtherEnd()->OtherEnd());\n  EXPECT_TRUE(ptr1.OtherEnd() == nullptr);\n  // Check that the correct operator= is used.\n  ptr1 = ptr3;\n  EXPECT_EQ(&ptr1, ptr1.OtherEnd()->OtherEnd());\n  EXPECT_TRUE(ptr3.OtherEnd() == nullptr);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/imagedata_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <string>\n#include <vector>\n\n#include \"imagedata.h\"\n#include \"include_gunit.h\"\n#include \"log.h\"\n\nnamespace tesseract {\n\n// Tests the caching mechanism of DocumentData/ImageData.\n\nclass ImagedataTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\n  ImagedataTest() = default;\n\n  // Creates a fake DocumentData, writes it to a file, and returns the filename.\n  std::string MakeFakeDoc(int num_pages, unsigned doc_id, std::vector<std::string> *page_texts) {\n    // The size of the fake images that we will use.\n    const int kImageSize = 1048576;\n    // Not using a real image here - just an array of zeros! 
We are just testing\n    // that the truth text matches.\n    std::vector<char> fake_image(kImageSize, 0);\n    DocumentData write_doc(\"My document\");\n    for (int p = 0; p < num_pages; ++p) {\n      // Make some fake text that is different for each page and save it.\n      char text[80];\n      snprintf(text, sizeof(text), \"Page %d of %d in doc %u\", p, num_pages, doc_id);\n      page_texts->push_back(text);\n      // Make an imagedata and put it in the document.\n      ImageData *imagedata = ImageData::Build(\"noname\", p, \"eng\", fake_image.data(),\n                                              fake_image.size(), (*page_texts)[p].c_str(), nullptr);\n      EXPECT_EQ(kImageSize, imagedata->MemoryUsed());\n      write_doc.AddPageToDocument(imagedata);\n    }\n    // Write it to a file.\n    std::string filename =\n        file::JoinPath(FLAGS_test_tmpdir, \"documentdata\");\n    filename += std::to_string(doc_id) + \".lstmf\";\n    EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr));\n    return filename;\n  }\n};\n\nTEST_F(ImagedataTest, CachesProperly) {\n  // This test verifies that Imagedata can be stored in a DocumentData and a\n  // collection of them is cached correctly given limited memory.\n  // Number of pages to put in the fake document.\n  const int kNumPages = 12;\n  // Allowances to read the document. 
Big enough for 1, 3, 0, all pages.\n  const int kMemoryAllowances[] = {2000000, 4000000, 1000000, 100000000, 0};\n  // Order in which to read the pages, with some sequential and some seeks.\n  const int kPageReadOrder[] = {0, 1, 2, 3, 8, 4, 5, 6, 7, 11, 10, 9, -1};\n\n  std::vector<std::string> page_texts;\n  std::string filename = MakeFakeDoc(kNumPages, 0, &page_texts);\n  // Now try getting it back with different memory allowances and check that\n  // the pages can still be read.\n  for (int m = 0; kMemoryAllowances[m] > 0; ++m) {\n    DocumentData read_doc(\"My document\");\n    EXPECT_TRUE(read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr));\n    LOG(ERROR) << \"Allowance = \" << kMemoryAllowances[m];\n    // Read the pages in a specific order.\n    for (int p = 0; kPageReadOrder[p] >= 0; ++p) {\n      int page = kPageReadOrder[p];\n      const ImageData *imagedata = read_doc.GetPage(page);\n      EXPECT_NE(nullptr, imagedata);\n      // EXPECT_NE(reinterpret_cast<ImageData*>(nullptr), imagedata);\n      // Check that this is the right page.\n      EXPECT_STREQ(page_texts[page].c_str(), imagedata->transcription().c_str());\n    }\n  }\n}\n\nTEST_F(ImagedataTest, CachesMultiDocs) {\n  // This test verifies that DocumentCache works to store multiple DocumentData\n  // and the two caching strategies read images in the right order.\n  // Number of pages in each document.\n  const std::vector<int> kNumPages = {6, 5, 7};\n  std::vector<std::vector<std::string>> page_texts;\n  std::vector<std::string> filenames;\n  for (size_t d = 0; d < kNumPages.size(); ++d) {\n    page_texts.emplace_back(std::vector<std::string>());\n    std::string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back());\n    filenames.push_back(filename);\n  }\n  // Now try getting them back with different cache strategies and check that\n  // the pages come out in the right order.\n  DocumentCache robin_cache(8000000);\n  robin_cache.LoadDocuments(filenames, 
tesseract::CS_ROUND_ROBIN, nullptr);\n  DocumentCache serial_cache(8000000);\n  serial_cache.LoadDocuments(filenames, tesseract::CS_SEQUENTIAL, nullptr);\n  for (int p = 0; p <= 21; ++p) {\n    LOG(INFO) << \"Page \" << p;\n    const ImageData *robin_data = robin_cache.GetPageBySerial(p);\n    const ImageData *serial_data = serial_cache.GetPageBySerial(p);\n    CHECK(robin_data != nullptr);\n    CHECK(serial_data != nullptr);\n    int robin_doc = p % kNumPages.size();\n    int robin_page = p / kNumPages.size() % kNumPages[robin_doc];\n    // Check that this is the right page.\n    EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(), robin_data->transcription().c_str());\n    int serial_doc = p / kNumPages[0] % kNumPages.size();\n    int serial_page = p % kNumPages[0] % kNumPages[serial_doc];\n    EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(), serial_data->transcription().c_str());\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/include_gunit.h",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n// Portability include to match the Google test environment.\n\n#ifndef TESSERACT_UNITTEST_INCLUDE_GUNIT_H_\n#define TESSERACT_UNITTEST_INCLUDE_GUNIT_H_\n\n#include \"errcode.h\" // for ASSERT_HOST\n#include \"fileio.h\"  // for tesseract::File\n#include \"gtest/gtest.h\"\n#include \"log.h\" // for LOG\n\nstatic const char *FLAGS_test_tmpdir = \"./tmp\";\n\nnamespace tesseract {\n\nstatic inline void trim(std::string &s) {\n  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {\n    return !std::isspace(ch);\n  }));\n  s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {\n    return !std::isspace(ch);\n  }).base(), s.end());\n}\n\n} // namespace tesseract\n\nclass file : public tesseract::File {\npublic:\n  static void MakeTmpdir() {\n#if defined(_WIN32)\n    _mkdir(FLAGS_test_tmpdir);\n#else\n    mkdir(FLAGS_test_tmpdir, S_IRWXU | S_IRWXG);\n#endif\n  }\n\n  // Create a file and write a string to it.\n  static bool WriteStringToFile(const std::string &contents, const std::string &filename) {\n    File::WriteStringToFileOrDie(contents, filename);\n    return true;\n  }\n\n  static bool GetContents(const std::string &filename, std::string *out, int) {\n    return File::ReadFileToString(filename, out);\n  }\n\n  static bool SetContents(const std::string &name, const std::string &contents,\n                          bool 
/*is_default*/) {\n    return WriteStringToFile(contents, name);\n  }\n\n  static int Defaults() {\n    return 0;\n  }\n\n  static std::string JoinPath(const std::string &s1, const std::string &s2) {\n    return tesseract::File::JoinPath(s1, s2);\n  }\n\n  static std::string JoinPath(const std::string &s1, const std::string &s2, const std::string &s3) {\n    return JoinPath(JoinPath(s1, s2), s3);\n  }\n};\n\n// /usr/include/tensorflow/core/platform/default/logging.h defines the CHECK* macros.\n#if !defined(CHECK)\n#  define CHECK(condition) \\\n    if (!(condition))      \\\n    LOG(FATAL) << \"Check failed: \" #condition \" \"\n#  define CHECK_EQ(test, value) CHECK((test) == (value))\n#  define CHECK_GE(test, value) CHECK((test) >= (value))\n#  define CHECK_GT(test, value) CHECK((test) > (value))\n#  define CHECK_LT(test, value) CHECK((test) < (value))\n#  define CHECK_LE(test, value) CHECK((test) <= (value))\n#  define CHECK_OK(test) CHECK(test)\n#endif\n\n#endif // TESSERACT_UNITTEST_INCLUDE_GUNIT_H_\n"
  },
  {
    "path": "unittest/indexmapbidi_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <cmath>\n#include <cstdio>\n#include <string>\n\n#include \"indexmapbidi.h\"\n\n#include \"include_gunit.h\"\n\nconst int kPrimeLimit = 1000;\n\nnamespace tesseract {\n\nclass IndexMapBiDiTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\npublic:\n  std::string OutputNameToPath(const std::string &name) {\n    return file::JoinPath(FLAGS_test_tmpdir, name);\n  }\n  // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes.\n  void ComputePrimes(IndexMapBiDi *map) {\n    map->Init(kPrimeLimit + 1, false);\n    map->SetMap(2, true);\n    // Set all the odds to true.\n    for (int i = 3; i <= kPrimeLimit; i += 2) {\n      map->SetMap(i, true);\n    }\n    int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit));\n    for (int f = 3; f <= factor_limit; f += 2) {\n      if (map->SparseToCompact(f) >= 0) {\n        for (int m = 2; m * f <= kPrimeLimit; ++m) {\n          map->SetMap(f * m, false);\n        }\n      }\n    }\n    map->Setup();\n  }\n\n  void TestPrimes(const IndexMap &map) {\n    // Now all primes are mapped in the sparse map to their index.\n    // According to Wikipedia, the 168th prime is 997, and it has compact\n    // index 167 because we are indexing from 0.\n    EXPECT_EQ(167, map.SparseToCompact(997));\n    EXPECT_EQ(997, 
map.CompactToSparse(167));\n    // 995, 996, 998, 999 are not prime.\n    EXPECT_EQ(-1, map.SparseToCompact(995));\n    EXPECT_EQ(-1, map.SparseToCompact(996));\n    EXPECT_EQ(-1, map.SparseToCompact(998));\n    EXPECT_EQ(-1, map.SparseToCompact(999));\n    // The 167th prime is 991.\n    EXPECT_EQ(991, map.CompactToSparse(166));\n    // There are 168 primes in 0..1000.\n    EXPECT_EQ(168, map.CompactSize());\n    EXPECT_EQ(kPrimeLimit + 1, map.SparseSize());\n  }\n};\n\n// Tests the sieve of Eratosthenes as a way of testing setup.\nTEST_F(IndexMapBiDiTest, Primes) {\n  IndexMapBiDi map;\n  ComputePrimes(&map);\n  TestPrimes(map);\n  // It still works if we assign it to another.\n  IndexMapBiDi map2;\n  map2.CopyFrom(map);\n  TestPrimes(map2);\n  // Or if we assign it to a base class.\n  IndexMap base_map;\n  base_map.CopyFrom(map);\n  TestPrimes(base_map);\n  // Test file i/o too.\n  std::string filename = OutputNameToPath(\"primesmap\");\n  FILE *fp = fopen(filename.c_str(), \"wb\");\n  CHECK(fp != nullptr);\n  EXPECT_TRUE(map.Serialize(fp));\n  fclose(fp);\n  fp = fopen(filename.c_str(), \"rb\");\n  CHECK(fp != nullptr);\n  IndexMapBiDi read_map;\n  EXPECT_TRUE(read_map.DeSerialize(false, fp));\n  fclose(fp);\n  TestPrimes(read_map);\n}\n\n// Tests the many-to-one setup feature.\nTEST_F(IndexMapBiDiTest, ManyToOne) {\n  // Test the example in the comment on CompleteMerges.\n  IndexMapBiDi map;\n  map.Init(13, false);\n  map.SetMap(2, true);\n  map.SetMap(4, true);\n  map.SetMap(7, true);\n  map.SetMap(9, true);\n  map.SetMap(11, true);\n  map.Setup();\n  map.Merge(map.SparseToCompact(2), map.SparseToCompact(9));\n  map.Merge(map.SparseToCompact(4), map.SparseToCompact(11));\n  map.CompleteMerges();\n  EXPECT_EQ(3, map.CompactSize());\n  EXPECT_EQ(13, map.SparseSize());\n  EXPECT_EQ(1, map.SparseToCompact(4));\n  EXPECT_EQ(4, map.CompactToSparse(1));\n  EXPECT_EQ(1, map.SparseToCompact(11));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/intfeaturemap_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"intfeaturemap.h\"\n#include \"intfeaturespace.h\"\n\n#include \"include_gunit.h\"\n\n// Random re-quantization to test that they don't have to be easy.\n// WARNING! Change these and change the expected_misses calculation below.\nconst int kXBuckets = 16;\nconst int kYBuckets = 24;\nconst int kThetaBuckets = 13;\n\nnamespace tesseract {\n\nclass IntFeatureMapTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\npublic:\n  // Expects that the given vector has contiguous integer values in the\n  // range [start, end).\n  void ExpectContiguous(const std::vector<int> &v, int start, int end) {\n    for (int i = start; i < end; ++i) {\n      EXPECT_EQ(i, v[i - start]);\n    }\n  }\n};\n\n// Tests the IntFeatureMap and implicitly the IntFeatureSpace underneath.\nTEST_F(IntFeatureMapTest, Exhaustive) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because IntFeatureSpace is missing.\n  GTEST_SKIP();\n#else\n  IntFeatureSpace space;\n  space.Init(kXBuckets, kYBuckets, kThetaBuckets);\n  IntFeatureMap map;\n  map.Init(space);\n  int total_size = kIntFeatureExtent * kIntFeatureExtent * kIntFeatureExtent;\n  auto features = std::make_unique<INT_FEATURE_STRUCT[]>(total_size);\n  // Fill the features with every value.\n  for (int y = 0; y < kIntFeatureExtent; ++y) {\n    for (int x = 0; x < 
kIntFeatureExtent; ++x) {\n      for (int theta = 0; theta < kIntFeatureExtent; ++theta) {\n        int f_index = (y * kIntFeatureExtent + x) * kIntFeatureExtent + theta;\n        features[f_index].X = x;\n        features[f_index].Y = y;\n        features[f_index].Theta = theta;\n      }\n    }\n  }\n  std::vector<int> index_features;\n  map.IndexAndSortFeatures(features.get(), total_size, &index_features);\n  EXPECT_EQ(total_size, index_features.size());\n  int total_buckets = kXBuckets * kYBuckets * kThetaBuckets;\n  std::vector<int> map_features;\n  int misses = map.MapIndexedFeatures(index_features, &map_features);\n  EXPECT_EQ(0, misses);\n  EXPECT_EQ(total_buckets, map_features.size());\n  ExpectContiguous(map_features, 0, total_buckets);\n  EXPECT_EQ(total_buckets, map.compact_size());\n  EXPECT_EQ(total_buckets, map.sparse_size());\n\n  // Every offset should be within dx, dy, dtheta of the start point.\n  int dx = kIntFeatureExtent / kXBuckets + 1;\n  int dy = kIntFeatureExtent / kYBuckets + 1;\n  int dtheta = kIntFeatureExtent / kThetaBuckets + 1;\n  int bad_offsets = 0;\n  for (int index = 0; index < total_buckets; ++index) {\n    for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps; ++dir) {\n      int offset_index = map.OffsetFeature(index, dir);\n      if (dir == 0) {\n        EXPECT_EQ(index, offset_index);\n      } else if (offset_index >= 0) {\n        INT_FEATURE_STRUCT f = map.InverseIndexFeature(index);\n        INT_FEATURE_STRUCT f2 = map.InverseIndexFeature(offset_index);\n        EXPECT_TRUE(f.X != f2.X || f.Y != f2.Y || f.Theta != f2.Theta);\n        EXPECT_LE(abs(f.X - f2.X), dx);\n        EXPECT_LE(abs(f.Y - f2.Y), dy);\n        int theta_delta = abs(f.Theta - f2.Theta);\n        if (theta_delta > kIntFeatureExtent / 2) {\n          theta_delta = kIntFeatureExtent - theta_delta;\n        }\n        EXPECT_LE(theta_delta, dtheta);\n      } else {\n        ++bad_offsets;\n        INT_FEATURE_STRUCT f = 
map.InverseIndexFeature(index);\n      }\n    }\n  }\n  EXPECT_LE(bad_offsets, (kXBuckets + kYBuckets) * kThetaBuckets);\n\n  // To test the mapping further, delete the 1st and last map feature, and\n  // test again.\n  map.DeleteMapFeature(0);\n  map.DeleteMapFeature(total_buckets - 1);\n  map.FinalizeMapping(nullptr);\n  map.IndexAndSortFeatures(features.get(), total_size, &index_features);\n  // Has no effect on index features.\n  EXPECT_EQ(total_size, index_features.size());\n  misses = map.MapIndexedFeatures(index_features, &map_features);\n  int expected_misses = (kIntFeatureExtent / kXBuckets) * (kIntFeatureExtent / kYBuckets) *\n                        (kIntFeatureExtent / kThetaBuckets + 1);\n  expected_misses += (kIntFeatureExtent / kXBuckets) * (kIntFeatureExtent / kYBuckets + 1) *\n                     (kIntFeatureExtent / kThetaBuckets);\n  EXPECT_EQ(expected_misses, misses);\n  EXPECT_EQ(total_buckets - 2, map_features.size());\n  ExpectContiguous(map_features, 0, total_buckets - 2);\n  EXPECT_EQ(total_buckets - 2, map.compact_size());\n  EXPECT_EQ(total_buckets, map.sparse_size());\n#endif\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/intsimdmatrix_test.cc",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        intsimdmatrix_test.cc\n// Author:      rays@google.com (Ray Smith)\n//\n// Copyright 2017 Google Inc. All Rights Reserved.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"intsimdmatrix.h\"\n#include <gtest/gtest.h>\n#include <gtest/internal/gtest-port.h>\n#include <memory>\n#include <vector>\n#include \"include_gunit.h\"\n#include \"matrix.h\"\n#include \"simddetect.h\"\n\nnamespace tesseract {\n\nclass IntSimdMatrixTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n  // Makes a random weights matrix of the given size.\n  GENERIC_2D_ARRAY<int8_t> InitRandom(int no, int ni) {\n    GENERIC_2D_ARRAY<int8_t> a(no, ni, 0);\n    for (int i = 0; i < no; ++i) {\n      for (int j = 0; j < ni; ++j) {\n        a(i, j) = static_cast<int8_t>(random_.SignedRand(INT8_MAX));\n      }\n    }\n    return a;\n  }\n  // Makes a random input vector of the given size, with rounding up.\n  std::vector<int8_t> RandomVector(int size, const IntSimdMatrix &matrix) {\n    int rounded_size = matrix.RoundInputs(size);\n    std::vector<int8_t> v(rounded_size, 0);\n    for (int i = 0; i < size; ++i) {\n      v[i] = static_cast<int8_t>(random_.SignedRand(INT8_MAX));\n    }\n    return v;\n  }\n  // Makes a random scales vector of the 
given size.\n  std::vector<TFloat> RandomScales(int size) {\n    std::vector<TFloat> v(size);\n    for (int i = 0; i < size; ++i) {\n      v[i] = (1.0 + random_.SignedRand(1.0)) / INT8_MAX;\n    }\n    return v;\n  }\n  // Tests a range of sizes and compares the results against the generic version.\n  void ExpectEqualResults(const IntSimdMatrix &matrix) {\n    TFloat total = 0.0;\n    for (int num_out = 1; num_out < 130; ++num_out) {\n      for (int num_in = 1; num_in < 130; ++num_in) {\n        GENERIC_2D_ARRAY<int8_t> w = InitRandom(num_out, num_in + 1);\n        std::vector<int8_t> u = RandomVector(num_in, matrix);\n        std::vector<TFloat> scales = RandomScales(num_out);\n        int ro = num_out;\n        if (IntSimdMatrix::intSimdMatrix) {\n          ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);\n        }\n        std::vector<TFloat> base_result(num_out);\n        IntSimdMatrix::MatrixDotVector(w, scales, u.data(), base_result.data());\n        std::vector<TFloat> test_result(ro);\n        std::vector<int8_t> shaped_wi;\n        int32_t rounded_num_out;\n        matrix.Init(w, shaped_wi, rounded_num_out);\n        scales.resize(rounded_num_out);\n        if (matrix.matrixDotVectorFunction) {\n          matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0], &scales[0], &u[0],\n                                         &test_result[0]);\n        } else {\n          IntSimdMatrix::MatrixDotVector(w, scales, u.data(), test_result.data());\n        }\n        for (int i = 0; i < num_out; ++i) {\n          EXPECT_FLOAT_EQ(base_result[i], test_result[i]) << \"i=\" << i;\n          total += base_result[i];\n        }\n      }\n    }\n    // Compare sum of all results with expected value.\n#ifdef FAST_FLOAT\n    EXPECT_FLOAT_EQ(total, -423236.53f);\n#else\n    EXPECT_FLOAT_EQ(total, -423243.392011);\n#endif\n  }\n\n  TRand random_;\n};\n\n// Test the C++ implementation without SIMD.\nTEST_F(IntSimdMatrixTest, C) {\n  static const IntSimdMatrix 
matrix = {nullptr, 1, 1, 1, 1};\n  ExpectEqualResults(matrix);\n}\n\n// Tests that the SSE implementation gets the same result as the vanilla.\nTEST_F(IntSimdMatrixTest, SSE) {\n#if defined(HAVE_SSE4_1)\n  if (!SIMDDetect::IsSSEAvailable()) {\n    GTEST_LOG_(INFO) << \"No SSE found! Not tested!\";\n    GTEST_SKIP();\n  }\n  ExpectEqualResults(IntSimdMatrix::intSimdMatrixSSE);\n#else\n  GTEST_LOG_(INFO) << \"SSE unsupported! Not tested!\";\n  GTEST_SKIP();\n#endif\n}\n\n// Tests that the AVX2 implementation gets the same result as the vanilla.\nTEST_F(IntSimdMatrixTest, AVX2) {\n#if defined(HAVE_AVX2)\n  if (!SIMDDetect::IsAVX2Available()) {\n    GTEST_LOG_(INFO) << \"No AVX2 found! Not tested!\";\n    GTEST_SKIP();\n  }\n  ExpectEqualResults(IntSimdMatrix::intSimdMatrixAVX2);\n#else\n  GTEST_LOG_(INFO) << \"AVX2 unsupported! Not tested!\";\n  GTEST_SKIP();\n#endif\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/lang_model_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <string> // for std::string\n\n#include \"gmock/gmock.h\" // for testing::ElementsAreArray\n\n#include \"include_gunit.h\"\n#include \"lang_model_helpers.h\"\n#include \"log.h\" // for LOG\n#include \"lstmtrainer.h\"\n#include \"unicharset_training_utils.h\"\n\nnamespace tesseract {\n\nstd::string TestDataNameToPath(const std::string &name) {\n  return file::JoinPath(TESTING_DIR, name);\n}\n\n// This is an integration test that verifies that CombineLangModel works to\n// the extent that an LSTMTrainer can be initialized with the result, and it\n// can encode strings. 
More importantly, the test verifies that adding an extra\n// character to the unicharset does not change the encoding of strings.\nTEST(LangModelTest, AddACharacter) {\n  constexpr char kTestString[] = \"Simple ASCII string to encode !@#$%&\";\n  constexpr char kTestStringRupees[] = \"ASCII string with Rupee symbol ₹\";\n  // Setup the arguments.\n  std::string script_dir = LANGDATA_DIR;\n  std::string eng_dir = file::JoinPath(script_dir, \"eng\");\n  std::string unicharset_path = TestDataNameToPath(\"eng_beam.unicharset\");\n  UNICHARSET unicharset;\n  EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));\n  std::string version_str = \"TestVersion\";\n  file::MakeTmpdir();\n  std::string output_dir = FLAGS_test_tmpdir;\n  LOG(INFO) << \"Output dir=\" << output_dir << \"\\n\";\n  std::string lang1 = \"eng\";\n  bool pass_through_recoder = false;\n  // If these reads fail, we get a warning message and an empty list of words.\n  std::vector<std::string> words = split(ReadFile(file::JoinPath(eng_dir, \"eng.wordlist\")), '\\n');\n  EXPECT_GT(words.size(), 0);\n  std::vector<std::string> puncs = split(ReadFile(file::JoinPath(eng_dir, \"eng.punc\")), '\\n');\n  EXPECT_GT(puncs.size(), 0);\n  std::vector<std::string> numbers = split(ReadFile(file::JoinPath(eng_dir, \"eng.numbers\")), '\\n');\n  EXPECT_GT(numbers.size(), 0);\n  bool lang_is_rtl = false;\n  // Generate the traineddata file.\n  EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,\n                                pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr,\n                                nullptr));\n  // Init a trainer with it, and encode kTestString.\n  std::string traineddata1 = file::JoinPath(output_dir, lang1, lang1) + \".traineddata\";\n  LSTMTrainer trainer1;\n  trainer1.InitCharSet(traineddata1);\n  std::vector<int> labels1;\n  EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));\n  std::string test1_decoded = 
trainer1.DecodeLabels(labels1);\n  std::string test1_str(&test1_decoded[0], test1_decoded.length());\n  LOG(INFO) << \"Labels1=\" << test1_str << \"\\n\";\n\n  // Add a new character to the unicharset and try again.\n  int size_before = unicharset.size();\n  unicharset.unichar_insert(\"₹\");\n  SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);\n  EXPECT_EQ(size_before + 1, unicharset.size());\n  // Generate the traineddata file.\n  std::string lang2 = \"extended\";\n  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,\n                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,\n                                           nullptr, nullptr));\n  // Init a trainer with it, and encode kTestString.\n  std::string traineddata2 = file::JoinPath(output_dir, lang2, lang2) + \".traineddata\";\n  LSTMTrainer trainer2;\n  trainer2.InitCharSet(traineddata2);\n  std::vector<int> labels2;\n  EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));\n  std::string test2_decoded = trainer2.DecodeLabels(labels2);\n  std::string test2_str(&test2_decoded[0], test2_decoded.length());\n  LOG(INFO) << \"Labels2=\" << test2_str << \"\\n\";\n  // encode kTestStringRupees.\n  std::vector<int> labels3;\n  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));\n  std::string test3_decoded = trainer2.DecodeLabels(labels3);\n  std::string test3_str(&test3_decoded[0], test3_decoded.length());\n  LOG(INFO) << \"labels3=\" << test3_str << \"\\n\";\n  // Copy labels1 to a std::vector, renumbering the null char to match trainer2.\n  // Since Tensor Flow's CTC implementation insists on having the null be the\n  // last label, and we want to be compatible, null has to be renumbered when\n  // we add a class.\n  int null1 = trainer1.null_char();\n  int null2 = trainer2.null_char();\n  EXPECT_EQ(null1 + 1, null2);\n  std::vector<int> labels1_v(labels1.size());\n  for 
(unsigned i = 0; i < labels1.size(); ++i) {\n    if (labels1[i] == null1) {\n      labels1_v[i] = null2;\n    } else {\n      labels1_v[i] = labels1[i];\n    }\n  }\n  EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));\n  // To make sure we we are not cheating somehow, we can now encode the Rupee\n  // symbol, which we could not do before.\n  EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));\n  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));\n}\n\n// Same as above test, for hin instead of eng\nTEST(LangModelTest, AddACharacterHindi) {\n  constexpr char kTestString[] = \"हिन्दी में एक लाइन लिखें\";\n  constexpr char kTestStringRupees[] = \"हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००\";\n  // Setup the arguments.\n  std::string script_dir = LANGDATA_DIR;\n  std::string hin_dir = file::JoinPath(script_dir, \"hin\");\n  std::string unicharset_path = TestDataNameToPath(\"hin_beam.unicharset\");\n  UNICHARSET unicharset;\n  EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));\n  std::string version_str = \"TestVersion\";\n  file::MakeTmpdir();\n  std::string output_dir = FLAGS_test_tmpdir;\n  LOG(INFO) << \"Output dir=\" << output_dir << \"\\n\";\n  std::string lang1 = \"hin\";\n  bool pass_through_recoder = false;\n  // If these reads fail, we get a warning message and an empty list of words.\n  std::vector<std::string> words = split(ReadFile(file::JoinPath(hin_dir, \"hin.wordlist\")), '\\n');\n  EXPECT_GT(words.size(), 0);\n  std::vector<std::string> puncs = split(ReadFile(file::JoinPath(hin_dir, \"hin.punc\")), '\\n');\n  EXPECT_GT(puncs.size(), 0);\n  std::vector<std::string> numbers = split(ReadFile(file::JoinPath(hin_dir, \"hin.numbers\")), '\\n');\n  EXPECT_GT(numbers.size(), 0);\n  bool lang_is_rtl = false;\n  // Generate the traineddata file.\n  EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,\n                                pass_through_recoder, 
words, puncs, numbers, lang_is_rtl, nullptr,\n                                nullptr));\n  // Init a trainer with it, and encode kTestString.\n  std::string traineddata1 = file::JoinPath(output_dir, lang1, lang1) + \".traineddata\";\n  LSTMTrainer trainer1;\n  trainer1.InitCharSet(traineddata1);\n  std::vector<int> labels1;\n  EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));\n  std::string test1_decoded = trainer1.DecodeLabels(labels1);\n  std::string test1_str(&test1_decoded[0], test1_decoded.length());\n  LOG(INFO) << \"Labels1=\" << test1_str << \"\\n\";\n\n  // Add a new character to the unicharset and try again.\n  int size_before = unicharset.size();\n  unicharset.unichar_insert(\"₹\");\n  SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);\n  EXPECT_EQ(size_before + 1, unicharset.size());\n  // Generate the traineddata file.\n  std::string lang2 = \"extendedhin\";\n  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,\n                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,\n                                           nullptr, nullptr));\n  // Init a trainer with it, and encode kTestString.\n  std::string traineddata2 = file::JoinPath(output_dir, lang2, lang2) + \".traineddata\";\n  LSTMTrainer trainer2;\n  trainer2.InitCharSet(traineddata2);\n  std::vector<int> labels2;\n  EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));\n  std::string test2_decoded = trainer2.DecodeLabels(labels2);\n  std::string test2_str(&test2_decoded[0], test2_decoded.length());\n  LOG(INFO) << \"Labels2=\" << test2_str << \"\\n\";\n  // encode kTestStringRupees.\n  std::vector<int> labels3;\n  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));\n  std::string test3_decoded = trainer2.DecodeLabels(labels3);\n  std::string test3_str(&test3_decoded[0], test3_decoded.length());\n  LOG(INFO) << \"labels3=\" << test3_str << \"\\n\";\n  
// Copy labels1 to a std::vector, renumbering the null char to match trainer2.\n  // Since Tensor Flow's CTC implementation insists on having the null be the\n  // last label, and we want to be compatible, null has to be renumbered when\n  // we add a class.\n  int null1 = trainer1.null_char();\n  int null2 = trainer2.null_char();\n  EXPECT_EQ(null1 + 1, null2);\n  std::vector<int> labels1_v(labels1.size());\n  for (unsigned i = 0; i < labels1.size(); ++i) {\n    if (labels1[i] == null1) {\n      labels1_v[i] = null2;\n    } else {\n      labels1_v[i] = labels1[i];\n    }\n  }\n  EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));\n  // To make sure we we are not cheating somehow, we can now encode the Rupee\n  // symbol, which we could not do before.\n  EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));\n  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/layout_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <string>\n#include <utility>\n\n#include \"include_gunit.h\"\n\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include <tesseract/resultiterator.h>\n#include \"coutln.h\"\n#include \"log.h\" // for LOG\n#include \"mutableiterator.h\"\n#include \"ocrblock.h\" // for class BLOCK\n#include \"pageres.h\"\n#include \"polyblk.h\"\n#include \"stepblob.h\"\n\nnamespace tesseract {\n\n/** String name for each block type. Keep in sync with PolyBlockType. 
*/\nstatic const char *kPolyBlockNames[] = {\n    \"Unknown\",\n    \"Flowing Text\",\n    \"Heading Text\",\n    \"Pullout Text\",\n    \"Equation\",\n    \"Inline Equation\",\n    \"Table\",\n    \"Vertical Text\",\n    \"Caption Text\",\n    \"Flowing Image\",\n    \"Heading Image\",\n    \"Pullout Image\",\n    \"Horizontal Line\",\n    \"Vertical Line\",\n    \"Noise\",\n    \"\" // End marker for testing that sizes match.\n};\n\nconst char *kStrings8087_054[] = {\"dat\", \"Dalmatian\", \"\", \"DAMAGED DURING\", \"margarine,\", nullptr};\nconst PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT, PT_PULLOUT_IMAGE,\n                                         PT_CAPTION_TEXT, PT_FLOWING_TEXT};\n\n// The fixture for testing Tesseract.\nclass LayoutTest : public testing::Test {\nprotected:\n  std::string TestDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESTING_DIR, \"/\" + name);\n  }\n  std::string TessdataPath() {\n    return file::JoinPath(TESSDATA_DIR, \"\");\n  }\n\n  LayoutTest() {\n    src_pix_ = nullptr;\n  }\n  ~LayoutTest() override {\n    src_pix_.destroy();\n  }\n\n  void SetImage(const char *filename, const char *lang) {\n    src_pix_.destroy();\n    src_pix_ = pixRead(TestDataNameToPath(filename).c_str());\n    api_.Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY);\n    api_.SetPageSegMode(tesseract::PSM_AUTO);\n    api_.SetImage(src_pix_);\n  }\n\n  // Tests reading order and block finding (very roughly) by iterating\n  // over the blocks, expecting that they contain the strings in order,\n  // allowing for other blocks in between.\n  // An empty string should match an image block, and a nullptr string\n  // indicates the end of the array.\n  void VerifyBlockTextOrder(const char *strings[], const PolyBlockType *blocks,\n                            ResultIterator *it) {\n    it->Begin();\n    int string_index = 0;\n    int block_index = 0;\n    do {\n      char *block_text = 
it->GetUTF8Text(tesseract::RIL_BLOCK);\n      if (block_text != nullptr && it->BlockType() == blocks[string_index] &&\n          strstr(block_text, strings[string_index]) != nullptr) {\n        LOG(INFO) << \"Found string \" << strings[string_index] << \" in block \" << block_index\n                  << \" of type \" << kPolyBlockNames[blocks[string_index]] << \"\\n\";\n        // Found this one.\n        ++string_index;\n      } else if (it->BlockType() == blocks[string_index] && block_text == nullptr &&\n                 strings[string_index][0] == '\\0') {\n        LOG(INFO) << \"Found block of type \" << kPolyBlockNames[blocks[string_index]] << \" at block \"\n                  << block_index << \"\\n\";\n        // Found this one.\n        ++string_index;\n      } else {\n        LOG(INFO) << \"No match found in block with text:\\n\" << block_text;\n      }\n      delete[] block_text;\n      ++block_index;\n      if (strings[string_index] == nullptr) {\n        break;\n      }\n    } while (it->Next(tesseract::RIL_BLOCK));\n    EXPECT_TRUE(strings[string_index] == nullptr);\n  }\n\n  // Tests that approximate order of the biggest text blocks is correct.\n  // Correctness is tested by the following simple rules:\n  // If a block overlaps its predecessor in x, then it must be below it.\n  // otherwise, if the block is not below its predecessor, then it must\n  // be to the left of it if right_to_left is true, or to the right otherwise.\n  void VerifyRoughBlockOrder(bool right_to_left, ResultIterator *it) {\n    int prev_left = 0;\n    int prev_right = 0;\n    int prev_bottom = 0;\n    it->Begin();\n    do {\n      int left, top, right, bottom;\n      if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) &&\n          PTIsTextType(it->BlockType()) && right - left > 800 && bottom - top > 200) {\n        if (prev_right > prev_left) {\n          if (std::min(right, prev_right) > std::max(left, prev_left)) {\n            EXPECT_GE(top, prev_bottom) 
<< \"Overlapping block should be below\";\n          } else if (top < prev_bottom) {\n            if (right_to_left) {\n              EXPECT_GE(prev_left, right) << \"Block should be to the left\";\n            } else {\n              EXPECT_GE(left, prev_right) << \"Block should be to the right\";\n            }\n          }\n        }\n        prev_left = left;\n        prev_right = right;\n        prev_bottom = bottom;\n      }\n    } while (it->Next(tesseract::RIL_BLOCK));\n  }\n\n  // Tests that every blob assigned to the biggest text blocks is contained\n  // fully within its block by testing that the block polygon winds around\n  // the center of the bounding boxes of the outlines in the blob.\n  void VerifyTotalContainment(int winding_target, MutableIterator *it) {\n    it->Begin();\n    do {\n      int left, top, right, bottom;\n      if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) &&\n          PTIsTextType(it->BlockType()) && right - left > 800 && bottom - top > 200) {\n        const PAGE_RES_IT *pr_it = it->PageResIt();\n        POLY_BLOCK *pb = pr_it->block()->block->pdblk.poly_block();\n        CHECK(pb != nullptr);\n        FCOORD skew = pr_it->block()->block->skew();\n        EXPECT_GT(skew.x(), 0.0f);\n        EXPECT_GT(skew.y(), 0.0f);\n        // Iterate the words in the block.\n        MutableIterator word_it = *it;\n        do {\n          const PAGE_RES_IT *w_it = word_it.PageResIt();\n          // Iterate the blobs in the word.\n          C_BLOB_IT b_it(w_it->word()->word->cblob_list());\n          for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {\n            C_BLOB *blob = b_it.data();\n            // Iterate the outlines in the blob.\n            C_OUTLINE_IT ol_it(blob->out_list());\n            for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {\n              C_OUTLINE *ol = ol_it.data();\n              TBOX box = ol->bounding_box();\n              ICOORD middle((box.left() + 
box.right()) / 2, (box.top() + box.bottom()) / 2);\n              EXPECT_EQ(winding_target, pb->winding_number(middle));\n            }\n          }\n        } while (word_it.Next(tesseract::RIL_WORD) &&\n                 !word_it.IsAtBeginningOf(tesseract::RIL_BLOCK));\n      }\n    } while (it->Next(tesseract::RIL_BLOCK));\n  }\n\n  Image src_pix_;\n  std::string ocr_text_;\n  tesseract::TessBaseAPI api_;\n};\n\n// Tests that array sizes match their intended size.\nTEST_F(LayoutTest, ArraySizeTest) {\n  int size = 0;\n  for (size = 0; kPolyBlockNames[size][0] != '\\0'; ++size) {\n    ;\n  }\n  EXPECT_EQ(size, PT_COUNT);\n}\n\n// Tests that Tesseract gets the important blocks and in the right order\n// on a UNLV page numbered 8087_054.3B.tif. (Dubrovnik)\nTEST_F(LayoutTest, UNLV8087_054) {\n  SetImage(\"8087_054.3B.tif\", \"eng\");\n  // Just run recognition.\n  EXPECT_EQ(api_.Recognize(nullptr), 0);\n  // Check iterator position.\n  tesseract::ResultIterator *it = api_.GetIterator();\n  VerifyBlockTextOrder(kStrings8087_054, kBlocks8087_054, it);\n  delete it;\n}\n\n// Tests that Tesseract gets the important blocks and in the right order\n// on GOOGLE:13510798882202548:74:84.sj-79.tif (Hebrew image)\n// TODO: replace hebrew.png by Google image referred above\nTEST_F(LayoutTest, HebrewOrderingAndSkew) {\n  SetImage(\"hebrew.png\", \"eng\");\n  // Just run recognition.\n  EXPECT_EQ(api_.Recognize(nullptr), 0);\n  tesseract::MutableIterator *it = api_.GetMutableIterator();\n  // In eng mode, block order should not be RTL.\n  VerifyRoughBlockOrder(false, it);\n  VerifyTotalContainment(1, it);\n  delete it;\n  // Now try again using Hebrew.\n  SetImage(\"hebrew.png\", \"heb\");\n  // Just run recognition.\n  EXPECT_EQ(api_.Recognize(nullptr), 0);\n  it = api_.GetMutableIterator();\n  // In heb mode, block order should be RTL.\n  VerifyRoughBlockOrder(true, it);\n  // And blobs should still be fully contained.\n  VerifyTotalContainment(-1, it);\n  delete it;\n}\n\n} // 
namespace tesseract\n"
  },
  {
    "path": "unittest/ligature_table_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"ligature_table.h\"\n#include \"commandlineflags.h\"\n#include \"fileio.h\"\n#include \"include_gunit.h\"\n#include \"pango_font_info.h\"\n\nnamespace tesseract {\n\n#if 0 // not with NFC normalization\nconst char kEngNonLigatureText[] = \"fidelity effigy ſteep\";\n// Same as above text, but with \"fi\" in the first word and \"ffi\" in the second\n// word replaced with their respective ligatures.\nconst char kEngLigatureText[] = \"ﬁdelity eﬃgy ﬅeep\";\n// Same as kEngLigatureText but with \"fi\" in both words replaced with their\n// ligature. 
The test Verdana font does not support the \"ffi\" or \"ſt\" ligature.\nconst char kRenderableEngLigatureText[] = \"ﬁdelity efﬁgy ſteep\";\n#endif\n\nstatic PangoFontMap *font_map;\n\nclass LigatureTableTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    lig_table_ = LigatureTable::Get();\n    if (!font_map) {\n      font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);\n    }\n    pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));\n  }\n\n  static void SetUpTestCase() {\n    static std::locale system_locale(\"\");\n    std::locale::global(system_locale);\n\n    FLAGS_fonts_dir = TESTING_DIR;\n    FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;\n    file::MakeTmpdir();\n    PangoFontInfo::SoftInitFontConfig(); // init early\n  }\n  LigatureTable *lig_table_;\n};\n\nTEST_F(LigatureTableTest, DoesFillLigatureTables) {\n  EXPECT_GT(lig_table_->norm_to_lig_table().size(), 0);\n  EXPECT_GT(lig_table_->lig_to_norm_table().size(), 0);\n}\n\n#if 0 // not with NFC normalization\nTEST_F(LigatureTableTest, DoesAddLigatures) {\n  EXPECT_STREQ(kEngLigatureText, lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str());\n}\n\nTEST_F(LigatureTableTest, DoesAddLigaturesWithSupportedFont) {\n  PangoFontInfo font;\n  EXPECT_TRUE(font.ParseFontDescriptionName(\"Verdana\"));\n  printf(\"1:%s\\n\", kRenderableEngLigatureText);\n  printf(\"2:%s\\n\", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());\n  EXPECT_STREQ(kRenderableEngLigatureText,\n               lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());\n}\n\nTEST_F(LigatureTableTest, DoesNotAddLigaturesWithUnsupportedFont) {\n  PangoFontInfo font;\n  EXPECT_TRUE(font.ParseFontDescriptionName(\"Lohit Hindi\"));\n  EXPECT_STREQ(kEngNonLigatureText, lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());\n}\n\nTEST_F(LigatureTableTest, DoesRemoveLigatures) {\n  EXPECT_STREQ(kEngNonLigatureText, 
lig_table_->RemoveLigatures(kEngLigatureText).c_str());\n}\n#endif\n\nTEST_F(LigatureTableTest, TestCustomLigatures) {\n  const char *kTestCases[] = {\n      \"act\",       \"a\\uE003\", \"publiſh\",    \"publi\\uE006\", \"ſince\",\n      \"\\uE007nce\", \"aſleep\",  \"a\\uE008eep\", \"neceſſary\",   \"nece\\uE009ary\",\n  };\n  for (size_t i = 0; i < countof(kTestCases); i += 2) {\n    EXPECT_STREQ(kTestCases[i + 1], lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());\n    EXPECT_STREQ(kTestCases[i], lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str());\n    EXPECT_STREQ(kTestCases[i], lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());\n  }\n}\n\n#if 0 // not with NFC normalization\nTEST_F(LigatureTableTest, TestRemovesCustomLigatures) {\n  const char *kTestCases[] = {\n      \"fiction\",\n      \"ﬁ\\uE003ion\",\n      \"ﬁction\",\n  };\n  for (size_t i = 0; i < countof(kTestCases); i += 3) {\n    EXPECT_STREQ(kTestCases[i + 1], lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());\n    EXPECT_STREQ(kTestCases[i + 2], lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());\n  }\n}\n#endif\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/linlsq_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"linlsq.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass LLSQTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\npublic:\n  void TearDown() override {}\n\n  void ExpectCorrectLine(const LLSQ &llsq, double m, double c, double rms, double pearson,\n                         double tolerance) {\n    EXPECT_NEAR(m, llsq.m(), tolerance);\n    EXPECT_NEAR(c, llsq.c(llsq.m()), tolerance);\n    EXPECT_NEAR(rms, llsq.rms(llsq.m(), llsq.c(llsq.m())), tolerance);\n    EXPECT_NEAR(pearson, llsq.pearson(), tolerance);\n  }\n  FCOORD PtsMean(const std::vector<FCOORD> &pts) {\n    FCOORD total(0, 0);\n    for (const auto &p : pts) {\n      total += p;\n    }\n    return (pts.size() > 0) ? 
total / pts.size() : total;\n  }\n  void VerifyRmsOrth(const std::vector<FCOORD> &pts, const FCOORD &orth) {\n    LLSQ llsq;\n    FCOORD xavg = PtsMean(pts);\n    FCOORD nvec = !orth;\n    nvec.normalise();\n    double expected_answer = 0;\n    for (const auto &p : pts) {\n      llsq.add(p.x(), p.y());\n      double dot = nvec % (p - xavg);\n      expected_answer += dot * dot;\n    }\n    expected_answer /= pts.size();\n    expected_answer = sqrt(expected_answer);\n    EXPECT_NEAR(expected_answer, llsq.rms_orth(orth), 0.0001);\n  }\n  void ExpectCorrectVector(const LLSQ &llsq, FCOORD correct_mean_pt, FCOORD correct_vector,\n                           float tolerance) {\n    FCOORD mean_pt = llsq.mean_point();\n    FCOORD vector = llsq.vector_fit();\n    EXPECT_NEAR(correct_mean_pt.x(), mean_pt.x(), tolerance);\n    EXPECT_NEAR(correct_mean_pt.y(), mean_pt.y(), tolerance);\n    EXPECT_NEAR(correct_vector.x(), vector.x(), tolerance);\n    EXPECT_NEAR(correct_vector.y(), vector.y(), tolerance);\n  }\n};\n\n// Tests a simple baseline-style normalization.\nTEST_F(LLSQTest, BasicLines) {\n  LLSQ llsq;\n  llsq.add(1.0, 1.0);\n  llsq.add(2.0, 2.0);\n  ExpectCorrectLine(llsq, 1.0, 0.0, 0.0, 1.0, 1e-6);\n  float half_root_2 = sqrt(2.0) / 2.0f;\n  ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f), FCOORD(half_root_2, half_root_2), 1e-6);\n  llsq.remove(2.0, 2.0);\n  llsq.add(1.0, 2.0);\n  llsq.add(10.0, 1.0);\n  llsq.add(-8.0, 1.0);\n  // The point at 1,2 pulls the result away from what would otherwise be a\n  // perfect fit to a horizontal line by 0.25 unit, with rms error of 0.433.\n  ExpectCorrectLine(llsq, 0.0, 1.25, 0.433, 0.0, 1e-2);\n  ExpectCorrectVector(llsq, FCOORD(1.0f, 1.25f), FCOORD(1.0f, 0.0f), 1e-3);\n  llsq.add(1.0, 2.0, 10.0);\n  // With a heavy weight, the point at 1,2 pulls the line nearer.\n  ExpectCorrectLine(llsq, 0.0, 1.786, 0.41, 0.0, 1e-2);\n  ExpectCorrectVector(llsq, FCOORD(1.0f, 1.786f), FCOORD(1.0f, 0.0f), 1e-3);\n}\n\n// Tests a simple baseline-style 
normalization with a rotation.\nTEST_F(LLSQTest, Vectors) {\n  LLSQ llsq;\n  llsq.add(1.0, 1.0);\n  llsq.add(1.0, -1.0);\n  ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-6);\n  llsq.add(0.9, -2.0);\n  llsq.add(1.1, -3.0);\n  llsq.add(0.9, 2.0);\n  llsq.add(1.10001, 3.0);\n  ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-3);\n}\n\n// Verify that rms_orth() actually calculates:\n//   sqrt( sum (!nvec * (x_i - x_avg))^2 / n)\nTEST_F(LLSQTest, RmsOrthWorksAsIntended) {\n  std::vector<FCOORD> pts;\n  pts.emplace_back(0.56f, 0.95f);\n  pts.emplace_back(0.09f, 0.09f);\n  pts.emplace_back(0.13f, 0.77f);\n  pts.emplace_back(0.16f, 0.83f);\n  pts.emplace_back(0.45f, 0.79f);\n  VerifyRmsOrth(pts, FCOORD(1.f, 0.f));\n  VerifyRmsOrth(pts, FCOORD(1.f, 1.f));\n  VerifyRmsOrth(pts, FCOORD(1.f, 2.f));\n  VerifyRmsOrth(pts, FCOORD(2.f, 1.f));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/list_test.cc",
    "content": "// (C) Copyright 2020, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n#include \"clst.h\"\n#include \"elst.h\"\n#include \"elst2.h\"\n\nnamespace tesseract {\n\nclass ListTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    static std::locale system_locale(\"\");\n    std::locale::global(system_locale);\n  }\n  const size_t ListSize = 5;\n};\n\nclass Clst {\npublic:\n  Clst(unsigned n) : value(n) {}\n  unsigned value;\n};\n\nclass Elst : public ELIST<Elst>::LINK {\npublic:\n  Elst(unsigned n) : value(n) {}\n  unsigned value;\n};\n\nclass Elst2 : public ELIST2<Elst2>::LINK {\npublic:\n  Elst2(unsigned n) : value(n) {}\n  unsigned value;\n};\n\nCLISTIZEH(Clst)\nELISTIZEH(Elst)\nELIST2IZEH(Elst2)\n\nTEST_F(ListTest, TestCLIST) {\n  Clst_CLIST list;\n  EXPECT_TRUE(list.empty());\n  EXPECT_EQ(list.length(), 0);\n  auto it = Clst_CLIST::ITERATOR(&list);\n  for (unsigned i = 0; i < ListSize; i++) {\n    auto *lst = new Clst(i);\n    it.add_to_end(lst);\n  }\n  EXPECT_TRUE(!list.empty());\n  EXPECT_EQ(list.length(), ListSize);\n  it.move_to_first();\n  unsigned n = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    EXPECT_TRUE(n == 0 || !it.at_first());\n    auto *lst = reinterpret_cast<Clst *>(it.data());\n    EXPECT_EQ(lst->value, n);\n    n++;\n    EXPECT_TRUE(n != ListSize || it.at_last());\n  }\n  it.forward();\n  n++;\n  for (it.mark_cycle_pt(); !it.cycled_list(); 
it.forward()) {\n    auto *lst = reinterpret_cast<Clst *>(it.extract());\n    EXPECT_EQ(lst->value, n % ListSize);\n    n++;\n    delete lst;\n  }\n  // TODO: add more tests for CLIST\n}\n\nTEST_F(ListTest, TestELIST) {\n  Elst_LIST list;\n  EXPECT_TRUE(list.empty());\n  EXPECT_EQ(list.length(), 0);\n  auto it = ELIST<Elst>::ITERATOR(&list);\n  for (unsigned i = 0; i < ListSize; i++) {\n    auto *elst = new Elst(i);\n    it.add_to_end(elst);\n  }\n  EXPECT_TRUE(!list.empty());\n  EXPECT_EQ(list.length(), ListSize);\n  it.move_to_first();\n  unsigned n = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    EXPECT_TRUE(n == 0 || !it.at_first());\n    auto *elst = reinterpret_cast<Elst *>(it.data());\n    EXPECT_EQ(elst->value, n);\n    n++;\n    EXPECT_TRUE(n != ListSize || it.at_last());\n  }\n  it.forward();\n  n++;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    auto *elst = reinterpret_cast<Elst *>(it.extract());\n    EXPECT_EQ(elst->value, n % ListSize);\n    n++;\n    delete elst;\n  }\n  // TODO: add more tests for ELIST\n}\n\nTEST_F(ListTest, TestELIST2) {\n  Elst2_LIST list;\n  EXPECT_TRUE(list.empty());\n  EXPECT_EQ(list.length(), 0);\n  auto it = ELIST2<Elst2>::ITERATOR(&list);\n  for (unsigned i = 0; i < ListSize; i++) {\n    auto *lst = new Elst2(i);\n    it.add_to_end(lst);\n  }\n  EXPECT_TRUE(!list.empty());\n  EXPECT_EQ(list.length(), ListSize);\n  it.move_to_first();\n  unsigned n = 0;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n    EXPECT_TRUE(n == 0 || !it.at_first());\n    auto *lst = reinterpret_cast<Elst2 *>(it.data());\n    EXPECT_EQ(lst->value, n);\n    n++;\n    EXPECT_TRUE(n != ListSize || it.at_last());\n  }\n  it.backward();\n  n--;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.backward()) {\n    auto *lst = reinterpret_cast<Elst2 *>(it.data());\n    EXPECT_EQ(lst->value, n);\n    n--;\n  }\n  it.forward();\n  n++;\n  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {\n 
   auto *lst = reinterpret_cast<Elst2 *>(it.extract());\n    EXPECT_EQ(lst->value, n % ListSize);\n    n++;\n    delete lst;\n  }\n  // TODO: add more tests for ELIST2\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "unittest/loadlang_test.cc",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        loadlang_test.cc\n// Description: Test loading of All languages and Scripts for Tesseract.\n// Tests for All languages and scripts are Disabled by default.\n// Force the disabled test to run if required by using the\n// --gtest_also_run_disabled_tests argument. Author:      Shree Devi Kumar\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include <tesseract/baseapi.h>\n#include <time.h>\n#include <memory> // std::unique_ptr\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass QuickTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    start_time_ = time(nullptr);\n  }\n  void TearDown() override {\n    const time_t end_time = time(nullptr);\n    EXPECT_TRUE(end_time - start_time_ <= 25)\n        << \"The test took too long - \" << ::testing::PrintToString(end_time - start_time_);\n  }\n  time_t start_time_;\n};\n\nvoid LangLoader(const char *lang, const char *tessdatadir) {\n  auto api = std::make_unique<tesseract::TessBaseAPI>();\n  ASSERT_FALSE(api->Init(tessdatadir, lang)) << \"Could not initialize tesseract for $lang.\";\n  api->End();\n}\n\n// For all languages\n\nclass LoadLanguage : public QuickTest, public ::testing::WithParamInterface<const char *> {};\n\nTEST_P(LoadLanguage, afr) {\n  LangLoader(\"afr\", GetParam());\n}\nTEST_P(LoadLanguage, 
amh) {\n  LangLoader(\"amh\", GetParam());\n}\nTEST_P(LoadLanguage, ara) {\n  LangLoader(\"ara\", GetParam());\n}\nTEST_P(LoadLanguage, asm) {\n  LangLoader(\"asm\", GetParam());\n}\nTEST_P(LoadLanguage, aze) {\n  LangLoader(\"aze\", GetParam());\n}\nTEST_P(LoadLanguage, aze_cyrl) {\n  LangLoader(\"aze_cyrl\", GetParam());\n}\nTEST_P(LoadLanguage, bel) {\n  LangLoader(\"bel\", GetParam());\n}\nTEST_P(LoadLanguage, ben) {\n  LangLoader(\"ben\", GetParam());\n}\nTEST_P(LoadLanguage, bod) {\n  LangLoader(\"bod\", GetParam());\n}\nTEST_P(LoadLanguage, bos) {\n  LangLoader(\"bos\", GetParam());\n}\nTEST_P(LoadLanguage, bre) {\n  LangLoader(\"bre\", GetParam());\n}\nTEST_P(LoadLanguage, bul) {\n  LangLoader(\"bul\", GetParam());\n}\nTEST_P(LoadLanguage, cat) {\n  LangLoader(\"cat\", GetParam());\n}\nTEST_P(LoadLanguage, ceb) {\n  LangLoader(\"ceb\", GetParam());\n}\nTEST_P(LoadLanguage, ces) {\n  LangLoader(\"ces\", GetParam());\n}\nTEST_P(LoadLanguage, chi_sim) {\n  LangLoader(\"chi_sim\", GetParam());\n}\nTEST_P(LoadLanguage, chi_sim_vert) {\n  LangLoader(\"chi_sim_vert\", GetParam());\n}\nTEST_P(LoadLanguage, chi_tra) {\n  LangLoader(\"chi_tra\", GetParam());\n}\nTEST_P(LoadLanguage, chi_tra_vert) {\n  LangLoader(\"chi_tra_vert\", GetParam());\n}\nTEST_P(LoadLanguage, chr) {\n  LangLoader(\"chr\", GetParam());\n}\nTEST_P(LoadLanguage, cos) {\n  LangLoader(\"cos\", GetParam());\n}\nTEST_P(LoadLanguage, cym) {\n  LangLoader(\"cym\", GetParam());\n}\nTEST_P(LoadLanguage, dan) {\n  LangLoader(\"dan\", GetParam());\n}\nTEST_P(LoadLanguage, deu) {\n  LangLoader(\"deu\", GetParam());\n}\nTEST_P(LoadLanguage, deu_latf) {\n  LangLoader(\"deu_latf\", GetParam());\n}\nTEST_P(LoadLanguage, div) {\n  LangLoader(\"div\", GetParam());\n}\nTEST_P(LoadLanguage, dzo) {\n  LangLoader(\"dzo\", GetParam());\n}\nTEST_P(LoadLanguage, ell) {\n  LangLoader(\"ell\", GetParam());\n}\nTEST_P(LoadLanguage, eng) {\n  LangLoader(\"eng\", GetParam());\n}\nTEST_P(LoadLanguage, enm) {\n  
LangLoader(\"enm\", GetParam());\n}\nTEST_P(LoadLanguage, epo) {\n  LangLoader(\"epo\", GetParam());\n}\nTEST_P(LoadLanguage, est) {\n  LangLoader(\"est\", GetParam());\n}\nTEST_P(LoadLanguage, eus) {\n  LangLoader(\"eus\", GetParam());\n}\nTEST_P(LoadLanguage, fao) {\n  LangLoader(\"fao\", GetParam());\n}\nTEST_P(LoadLanguage, fas) {\n  LangLoader(\"fas\", GetParam());\n}\nTEST_P(LoadLanguage, fil) {\n  LangLoader(\"fil\", GetParam());\n}\nTEST_P(LoadLanguage, fin) {\n  LangLoader(\"fin\", GetParam());\n}\nTEST_P(LoadLanguage, fra) {\n  LangLoader(\"fra\", GetParam());\n}\nTEST_P(LoadLanguage, frm) {\n  LangLoader(\"frm\", GetParam());\n}\nTEST_P(LoadLanguage, fry) {\n  LangLoader(\"fry\", GetParam());\n}\nTEST_P(LoadLanguage, gla) {\n  LangLoader(\"gla\", GetParam());\n}\nTEST_P(LoadLanguage, gle) {\n  LangLoader(\"gle\", GetParam());\n}\nTEST_P(LoadLanguage, glg) {\n  LangLoader(\"glg\", GetParam());\n}\nTEST_P(LoadLanguage, grc) {\n  LangLoader(\"grc\", GetParam());\n}\nTEST_P(LoadLanguage, guj) {\n  LangLoader(\"guj\", GetParam());\n}\nTEST_P(LoadLanguage, hat) {\n  LangLoader(\"hat\", GetParam());\n}\nTEST_P(LoadLanguage, heb) {\n  LangLoader(\"heb\", GetParam());\n}\nTEST_P(LoadLanguage, hin) {\n  LangLoader(\"hin\", GetParam());\n}\nTEST_P(LoadLanguage, hrv) {\n  LangLoader(\"hrv\", GetParam());\n}\nTEST_P(LoadLanguage, hun) {\n  LangLoader(\"hun\", GetParam());\n}\nTEST_P(LoadLanguage, hye) {\n  LangLoader(\"hye\", GetParam());\n}\nTEST_P(LoadLanguage, iku) {\n  LangLoader(\"iku\", GetParam());\n}\nTEST_P(LoadLanguage, ind) {\n  LangLoader(\"ind\", GetParam());\n}\nTEST_P(LoadLanguage, isl) {\n  LangLoader(\"isl\", GetParam());\n}\nTEST_P(LoadLanguage, ita) {\n  LangLoader(\"ita\", GetParam());\n}\nTEST_P(LoadLanguage, ita_old) {\n  LangLoader(\"ita_old\", GetParam());\n}\nTEST_P(LoadLanguage, jav) {\n  LangLoader(\"jav\", GetParam());\n}\nTEST_P(LoadLanguage, jpn) {\n  LangLoader(\"jpn\", GetParam());\n}\nTEST_P(LoadLanguage, jpn_vert) {\n  
LangLoader(\"jpn_vert\", GetParam());\n}\nTEST_P(LoadLanguage, kan) {\n  LangLoader(\"kan\", GetParam());\n}\nTEST_P(LoadLanguage, kat) {\n  LangLoader(\"kat\", GetParam());\n}\nTEST_P(LoadLanguage, kat_old) {\n  LangLoader(\"kat_old\", GetParam());\n}\nTEST_P(LoadLanguage, kaz) {\n  LangLoader(\"kaz\", GetParam());\n}\nTEST_P(LoadLanguage, khm) {\n  LangLoader(\"khm\", GetParam());\n}\nTEST_P(LoadLanguage, kir) {\n  LangLoader(\"kir\", GetParam());\n}\n//  TEST_P(LoadLanguage, kmr) {LangLoader(\"kmr\" , GetParam());}\nTEST_P(LoadLanguage, kor) {\n  LangLoader(\"kor\", GetParam());\n}\nTEST_P(LoadLanguage, kor_vert) {\n  LangLoader(\"kor_vert\", GetParam());\n}\nTEST_P(LoadLanguage, lao) {\n  LangLoader(\"lao\", GetParam());\n}\nTEST_P(LoadLanguage, lat) {\n  LangLoader(\"lat\", GetParam());\n}\nTEST_P(LoadLanguage, lav) {\n  LangLoader(\"lav\", GetParam());\n}\nTEST_P(LoadLanguage, lit) {\n  LangLoader(\"lit\", GetParam());\n}\nTEST_P(LoadLanguage, ltz) {\n  LangLoader(\"ltz\", GetParam());\n}\nTEST_P(LoadLanguage, mal) {\n  LangLoader(\"mal\", GetParam());\n}\nTEST_P(LoadLanguage, mar) {\n  LangLoader(\"mar\", GetParam());\n}\nTEST_P(LoadLanguage, mkd) {\n  LangLoader(\"mkd\", GetParam());\n}\nTEST_P(LoadLanguage, mlt) {\n  LangLoader(\"mlt\", GetParam());\n}\nTEST_P(LoadLanguage, mon) {\n  LangLoader(\"mon\", GetParam());\n}\nTEST_P(LoadLanguage, mri) {\n  LangLoader(\"mri\", GetParam());\n}\nTEST_P(LoadLanguage, msa) {\n  LangLoader(\"msa\", GetParam());\n}\nTEST_P(LoadLanguage, mya) {\n  LangLoader(\"mya\", GetParam());\n}\nTEST_P(LoadLanguage, nep) {\n  LangLoader(\"nep\", GetParam());\n}\nTEST_P(LoadLanguage, nld) {\n  LangLoader(\"nld\", GetParam());\n}\nTEST_P(LoadLanguage, nor) {\n  LangLoader(\"nor\", GetParam());\n}\nTEST_P(LoadLanguage, oci) {\n  LangLoader(\"oci\", GetParam());\n}\nTEST_P(LoadLanguage, ori) {\n  LangLoader(\"ori\", GetParam());\n}\nTEST_P(LoadLanguage, osd) {\n  LangLoader(\"osd\", GetParam());\n}\nTEST_P(LoadLanguage, pan) {\n  
LangLoader(\"pan\", GetParam());\n}\nTEST_P(LoadLanguage, pol) {\n  LangLoader(\"pol\", GetParam());\n}\nTEST_P(LoadLanguage, por) {\n  LangLoader(\"por\", GetParam());\n}\nTEST_P(LoadLanguage, pus) {\n  LangLoader(\"pus\", GetParam());\n}\nTEST_P(LoadLanguage, que) {\n  LangLoader(\"que\", GetParam());\n}\nTEST_P(LoadLanguage, ron) {\n  LangLoader(\"ron\", GetParam());\n}\nTEST_P(LoadLanguage, rus) {\n  LangLoader(\"rus\", GetParam());\n}\nTEST_P(LoadLanguage, san) {\n  LangLoader(\"san\", GetParam());\n}\nTEST_P(LoadLanguage, sin) {\n  LangLoader(\"sin\", GetParam());\n}\nTEST_P(LoadLanguage, slk) {\n  LangLoader(\"slk\", GetParam());\n}\nTEST_P(LoadLanguage, slv) {\n  LangLoader(\"slv\", GetParam());\n}\nTEST_P(LoadLanguage, snd) {\n  LangLoader(\"snd\", GetParam());\n}\nTEST_P(LoadLanguage, spa) {\n  LangLoader(\"spa\", GetParam());\n}\nTEST_P(LoadLanguage, spa_old) {\n  LangLoader(\"spa_old\", GetParam());\n}\nTEST_P(LoadLanguage, sqi) {\n  LangLoader(\"sqi\", GetParam());\n}\nTEST_P(LoadLanguage, srp) {\n  LangLoader(\"srp\", GetParam());\n}\nTEST_P(LoadLanguage, srp_latn) {\n  LangLoader(\"srp_latn\", GetParam());\n}\nTEST_P(LoadLanguage, sun) {\n  LangLoader(\"sun\", GetParam());\n}\nTEST_P(LoadLanguage, swa) {\n  LangLoader(\"swa\", GetParam());\n}\nTEST_P(LoadLanguage, swe) {\n  LangLoader(\"swe\", GetParam());\n}\nTEST_P(LoadLanguage, syr) {\n  LangLoader(\"syr\", GetParam());\n}\nTEST_P(LoadLanguage, tam) {\n  LangLoader(\"tam\", GetParam());\n}\nTEST_P(LoadLanguage, tat) {\n  LangLoader(\"tat\", GetParam());\n}\nTEST_P(LoadLanguage, tel) {\n  LangLoader(\"tel\", GetParam());\n}\nTEST_P(LoadLanguage, tgk) {\n  LangLoader(\"tgk\", GetParam());\n}\nTEST_P(LoadLanguage, tha) {\n  LangLoader(\"tha\", GetParam());\n}\nTEST_P(LoadLanguage, tir) {\n  LangLoader(\"tir\", GetParam());\n}\nTEST_P(LoadLanguage, ton) {\n  LangLoader(\"ton\", GetParam());\n}\nTEST_P(LoadLanguage, tur) {\n  LangLoader(\"tur\", GetParam());\n}\nTEST_P(LoadLanguage, uig) {\n  
LangLoader(\"uig\", GetParam());\n}\nTEST_P(LoadLanguage, ukr) {\n  LangLoader(\"ukr\", GetParam());\n}\nTEST_P(LoadLanguage, urd) {\n  LangLoader(\"urd\", GetParam());\n}\nTEST_P(LoadLanguage, uzb) {\n  LangLoader(\"uzb\", GetParam());\n}\nTEST_P(LoadLanguage, uzb_cyrl) {\n  LangLoader(\"uzb_cyrl\", GetParam());\n}\nTEST_P(LoadLanguage, vie) {\n  LangLoader(\"vie\", GetParam());\n}\nTEST_P(LoadLanguage, yid) {\n  LangLoader(\"yid\", GetParam());\n}\nTEST_P(LoadLanguage, yor) {\n  LangLoader(\"yor\", GetParam());\n}\n\nINSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadLanguage,\n                         ::testing::Values(TESSDATA_DIR \"_fast\"));\nINSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadLanguage,\n                         ::testing::Values(TESSDATA_DIR \"_best\"));\nINSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage, ::testing::Values(TESSDATA_DIR));\n\n// For all scripts\n\nclass LoadScript : public QuickTest, public ::testing::WithParamInterface<const char *> {};\n\nTEST_P(LoadScript, Arabic) {\n  LangLoader(\"script/Arabic\", GetParam());\n}\nTEST_P(LoadScript, Armenian) {\n  LangLoader(\"script/Armenian\", GetParam());\n}\nTEST_P(LoadScript, Bengali) {\n  LangLoader(\"script/Bengali\", GetParam());\n}\nTEST_P(LoadScript, Canadian_Aboriginal) {\n  LangLoader(\"script/Canadian_Aboriginal\", GetParam());\n}\nTEST_P(LoadScript, Cherokee) {\n  LangLoader(\"script/Cherokee\", GetParam());\n}\nTEST_P(LoadScript, Cyrillic) {\n  LangLoader(\"script/Cyrillic\", GetParam());\n}\nTEST_P(LoadScript, Devanagari) {\n  LangLoader(\"script/Devanagari\", GetParam());\n}\nTEST_P(LoadScript, Ethiopic) {\n  LangLoader(\"script/Ethiopic\", GetParam());\n}\nTEST_P(LoadScript, Fraktur) {\n  LangLoader(\"script/Fraktur\", GetParam());\n}\nTEST_P(LoadScript, Georgian) {\n  LangLoader(\"script/Georgian\", GetParam());\n}\nTEST_P(LoadScript, Greek) {\n  LangLoader(\"script/Greek\", GetParam());\n}\nTEST_P(LoadScript, Gujarati) {\n  LangLoader(\"script/Gujarati\", 
GetParam());\n}\nTEST_P(LoadScript, Gurmukhi) {\n  LangLoader(\"script/Gurmukhi\", GetParam());\n}\nTEST_P(LoadScript, HanS) {\n  LangLoader(\"script/HanS\", GetParam());\n}\nTEST_P(LoadScript, HanS_vert) {\n  LangLoader(\"script/HanS_vert\", GetParam());\n}\nTEST_P(LoadScript, HanT) {\n  LangLoader(\"script/HanT\", GetParam());\n}\nTEST_P(LoadScript, HanT_vert) {\n  LangLoader(\"script/HanT_vert\", GetParam());\n}\nTEST_P(LoadScript, Hangul) {\n  LangLoader(\"script/Hangul\", GetParam());\n}\nTEST_P(LoadScript, Hangul_vert) {\n  LangLoader(\"script/Hangul_vert\", GetParam());\n}\nTEST_P(LoadScript, Hebrew) {\n  LangLoader(\"script/Hebrew\", GetParam());\n}\nTEST_P(LoadScript, Japanese) {\n  LangLoader(\"script/Japanese\", GetParam());\n}\nTEST_P(LoadScript, Japanese_vert) {\n  LangLoader(\"script/Japanese_vert\", GetParam());\n}\nTEST_P(LoadScript, Kannada) {\n  LangLoader(\"script/Kannada\", GetParam());\n}\nTEST_P(LoadScript, Khmer) {\n  LangLoader(\"script/Khmer\", GetParam());\n}\nTEST_P(LoadScript, Lao) {\n  LangLoader(\"script/Lao\", GetParam());\n}\nTEST_P(LoadScript, Latin) {\n  LangLoader(\"script/Latin\", GetParam());\n}\nTEST_P(LoadScript, Malayalam) {\n  LangLoader(\"script/Malayalam\", GetParam());\n}\nTEST_P(LoadScript, Myanmar) {\n  LangLoader(\"script/Myanmar\", GetParam());\n}\nTEST_P(LoadScript, Oriya) {\n  LangLoader(\"script/Oriya\", GetParam());\n}\nTEST_P(LoadScript, Sinhala) {\n  LangLoader(\"script/Sinhala\", GetParam());\n}\nTEST_P(LoadScript, Syriac) {\n  LangLoader(\"script/Syriac\", GetParam());\n}\nTEST_P(LoadScript, Tamil) {\n  LangLoader(\"script/Tamil\", GetParam());\n}\nTEST_P(LoadScript, Telugu) {\n  LangLoader(\"script/Telugu\", GetParam());\n}\nTEST_P(LoadScript, Thaana) {\n  LangLoader(\"script/Thaana\", GetParam());\n}\nTEST_P(LoadScript, Thai) {\n  LangLoader(\"script/Thai\", GetParam());\n}\nTEST_P(LoadScript, Tibetan) {\n  LangLoader(\"script/Tibetan\", GetParam());\n}\nTEST_P(LoadScript, Vietnamese) {\n  
LangLoader(\"script/Vietnamese\", GetParam());\n}\n\nINSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadScript,\n                         ::testing::Values(TESSDATA_DIR \"_fast\"));\nINSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadScript,\n                         ::testing::Values(TESSDATA_DIR \"_best\"));\nINSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript, ::testing::Values(TESSDATA_DIR));\n\nclass LoadLang : public QuickTest {};\n\n// Test Load of English here, as the parameterized tests are disabled by\n// default.\nTEST_F(LoadLang, engFast) {\n  LangLoader(\"eng\", TESSDATA_DIR \"_fast\");\n}\nTEST_F(LoadLang, engBest) {\n  LangLoader(\"eng\", TESSDATA_DIR \"_best\");\n}\nTEST_F(LoadLang, engBestInt) {\n  LangLoader(\"eng\", TESSDATA_DIR);\n}\n\n// Use class LoadLang for languages which are NOT there in all three repos\nTEST_F(LoadLang, kmrFast) {\n  LangLoader(\"kmr\", TESSDATA_DIR \"_fast\");\n}\nTEST_F(LoadLang, kmrBest) {\n  LangLoader(\"kmr\", TESSDATA_DIR \"_best\");\n}\n//  TEST_F(LoadLang, kmrBestInt) {LangLoader(\"kmr\" , TESSDATA_DIR);}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/log.h",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        log.h\n// Description: Include for custom log message for unittest for tesseract.\n//              based on\n//              https://stackoverflow.com/questions/16491675/how-to-send-custom-message-in-google-c-testing-framework\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#ifndef TESSERACT_UNITTEST_LOG_H_\n#define TESSERACT_UNITTEST_LOG_H_\n\n// This is a minimal implementation of the TensorFlow logging API\n// which is sufficient for the Tesseract unit tests.\n\n// See tensorflow/core/platform/default/logging.h for the original code.\n\n#include <iostream>\n\nenum LogLevel { INFO, WARNING, ERROR, FATAL };\n\n// Avoid conflict with logging.h from TensorFlow.\n#undef LOG\n\nstatic inline std::ostream &LOG(enum LogLevel level) {\n  switch (level) {\n    case INFO:\n      std::cout << \"[INFO]  \";\n      break;\n    case WARNING:\n      std::cout << \"[WARN]  \";\n      break;\n    case ERROR:\n      std::cout << \"[ERROR] \";\n      break;\n    case FATAL:\n      std::cout << \"[FATAL] \";\n      break;\n  }\n  return std::cout;\n}\n\n// Avoid conflict with logging.h from TensorFlow.\n#undef QCHECK\n\n// https://github.com/google/ion/blob/master/ion/base/logging.h\nstatic inline std::ostream &QCHECK(bool condition) {\n  if (condition) {\n    static std::ostream 
null_stream(nullptr);\n    return null_stream;\n  }\n  return std::cout;\n}\n\n#endif // TESSERACT_UNITTEST_LOG_H_\n"
  },
  {
    "path": "unittest/lstm_recode_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"lstm_test.h\"\n\nnamespace tesseract {\n\n// Tests that training with unicharset recoding learns faster than without,\n// for Korean. This test is split in two, so it can be run sharded.\n\nTEST_F(LSTMTrainerTest, RecodeTestKorBase) {\n  // A basic single-layer, bi-di 1d LSTM on Korean.\n  SetupTrainer(\"[1,1,0,32 Lbx96 O1c1]\", \"kor-full\", \"kor/kor.unicharset\",\n               \"kor.Arial_Unicode_MS.exp0.lstmf\", false, true, 5e-4, false, \"kor\");\n  double kor_full_err = TrainIterations(kTrainerIterations * 2);\n  EXPECT_LT(kor_full_err, 88);\n  //  EXPECT_GT(kor_full_err, 85);\n  LOG(INFO) << \"********** Expected  < 88 ************\\n\";\n}\n\nTEST_F(LSTMTrainerTest, RecodeTestKor) {\n  // A basic single-layer, bi-di 1d LSTM on Korean.\n  SetupTrainer(\"[1,1,0,32 Lbx96 O1c1]\", \"kor-recode\", \"kor/kor.unicharset\",\n               \"kor.Arial_Unicode_MS.exp0.lstmf\", true, true, 5e-4, false, \"kor\");\n  double kor_recode_err = TrainIterations(kTrainerIterations);\n  EXPECT_LT(kor_recode_err, 60);\n  LOG(INFO) << \"********** Expected  < 60 ************\\n\";\n}\n\n// Tests that the given string encodes and decodes back to the same\n// with both recode on and off for Korean.\n\nTEST_F(LSTMTrainerTest, EncodeDecodeBothTestKor) {\n  TestEncodeDecodeBoth(\"kor\", \"한국어 위키백과에 오신 것을 환영합니다!\");\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "unittest/lstm_squashed_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"lstm_test.h\"\n\nnamespace tesseract {\n\n// Tests that a Squashed network learns correctly.\n// Almost as fast as the 2d-lstm.\nTEST_F(LSTMTrainerTest, TestSquashed) {\n  // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom, and\n  // a small convolution/maxpool below that.\n  // Match training conditions to those typically used with this spec:\n  // recoding on, adam on.\n  SetupTrainerEng(\"[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]\", \"SQU-2-layer-lstm\",\n                  /*recode*/ true, /*adam*/ true);\n  double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2);\n  EXPECT_LT(lstm_2d_err, 80);\n  LOG(INFO) << \"********** < 80 ************\\n\";\n  TestIntMode(kTrainerIterations);\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "unittest/lstm_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n// Generating the training data:\n// If the format of the lstmf (ImageData) file changes, the training data will\n// have to be regenerated as follows:\n//\n// Use --xsize 800 for text2image to be similar to original training data.\n//\n// tesstrain.py --fonts_dir /usr/share/fonts --lang eng \\\n// --linedata_only   --noextract_font_properties --langdata_dir ../langdata_lstm \\\n// --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \\\n// --fontlist \"Arial\" --maxpages 10\n//\n\n#include \"lstm_test.h\"\n\nnamespace tesseract {\n\n// Tests that some simple networks can learn Arial and meet accuracy targets.\nTEST_F(LSTMTrainerTest, BasicTest) {\n  // A Convolver sliding window classifier without LSTM.\n  SetupTrainer(\n      \"[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 \"\n      \"Ct1,1,64O1c1]\",\n      \"no-lstm\", \"eng/eng.unicharset\", \"eng.Arial.exp0.lstmf\", false, false, 2e-4, false, \"eng\");\n  double non_lstm_err = TrainIterations(kTrainerIterations * 4);\n  EXPECT_LT(non_lstm_err, 98);\n  LOG(INFO) << \"********** Expected  < 98 ************\\n\";\n\n  // A basic single-layer, single direction LSTM.\n  SetupTrainerEng(\"[1,1,0,32 Lfx100 O1c1]\", \"1D-lstm\", false, false);\n  double lstm_uni_err = TrainIterations(kTrainerIterations * 2);\n  EXPECT_LT(lstm_uni_err, 86);\n  LOG(INFO) << \"********** Expected  < 86 
************\\n\";\n  // Beats the convolver. (Although it does have a lot more weights, it still\n  // iterates faster.)\n  EXPECT_LT(lstm_uni_err, non_lstm_err);\n}\n\n// Color learns almost as fast as normalized grey/2D.\nTEST_F(LSTMTrainerTest, ColorTest) {\n  // A basic single-layer, single direction LSTM.\n  SetupTrainerEng(\"[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]\", \"2D-color-lstm\", true, true);\n  double lstm_uni_err = TrainIterations(kTrainerIterations);\n  EXPECT_LT(lstm_uni_err, 85);\n  //  EXPECT_GT(lstm_uni_err, 66);\n  LOG(INFO) << \"********** Expected  < 85 ************\\n\";\n}\n\nTEST_F(LSTMTrainerTest, BidiTest) {\n  // A basic single-layer, bi-di 1d LSTM.\n  SetupTrainerEng(\"[1,1,0,32 Lbx100 O1c1]\", \"bidi-lstm\", false, false);\n  double lstm_bi_err = TrainIterations(kTrainerIterations);\n  EXPECT_LT(lstm_bi_err, 75);\n  LOG(INFO) << \"********** Expected   < 75 ************\\n\";\n  // Int mode training is dead, so convert the trained network to int and check\n  // that its error rate is close to the float version.\n  TestIntMode(kTrainerIterations);\n}\n\n// Tests that a 2d-2-layer network learns correctly.\n// It takes a lot of iterations to get there.\nTEST_F(LSTMTrainerTest, Test2D) {\n  // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom.\n  SetupTrainerEng(\"[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]\", \"2-D-2-layer-lstm\", false,\n                  false);\n  double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2);\n  EXPECT_LT(lstm_2d_err, 98);\n  //  EXPECT_GT(lstm_2d_err, 90);\n  LOG(INFO) << \"********** Expected  < 98 ************\\n\";\n  // Int mode training is dead, so convert the trained network to int and check\n  // that its error rate is close to the float version.\n  TestIntMode(kTrainerIterations);\n}\n\n// Tests that a 2d-2-layer network with Adam does *a lot* better than\n// without it.\nTEST_F(LSTMTrainerTest, TestAdam) {\n  // A 2-layer LSTM with a 2-D feature-extracting 
LSTM on the bottom.\n  SetupTrainerEng(\"[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]\", \"2-D-2-layer-lstm\", false,\n                  true);\n  double lstm_2d_err = TrainIterations(kTrainerIterations);\n  EXPECT_LT(lstm_2d_err, 70);\n  LOG(INFO) << \"********** Expected   < 70 ************\\n\";\n  TestIntMode(kTrainerIterations);\n}\n\n// Trivial test of training speed on a fairly complex network.\nTEST_F(LSTMTrainerTest, SpeedTest) {\n  SetupTrainerEng(\n      \"[1,30,0,1 Ct5,5,16 Mp2,2 L2xy24 Ct1,1,48 Mp5,1 Ct1,1,32 S3,1 Lbx64 \"\n      \"O1c1]\",\n      \"2-D-2-layer-lstm\", false, true);\n  TrainIterations(kTrainerIterations);\n  LOG(INFO) << \"********** *** ************\\n\";\n}\n\n// Tests that two identical networks trained the same get the same results.\n// Also tests that the same happens with a serialize/deserialize in the middle.\nTEST_F(LSTMTrainerTest, DeterminismTest) {\n  SetupTrainerEng(\"[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]\", \"2-D-2-layer-lstm\", false,\n                  false);\n  double lstm_2d_err_a = TrainIterations(kTrainerIterations);\n  double act_error_a = trainer_->ActivationError();\n  double char_error_a = trainer_->CharError();\n  std::vector<char> trainer_a_data;\n  EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, *trainer_, &trainer_a_data));\n  SetupTrainerEng(\"[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]\", \"2-D-2-layer-lstm\", false,\n                  false);\n  double lstm_2d_err_b = TrainIterations(kTrainerIterations);\n  double act_error_b = trainer_->ActivationError();\n  double char_error_b = trainer_->CharError();\n  EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);\n  EXPECT_FLOAT_EQ(act_error_a, act_error_b);\n  EXPECT_FLOAT_EQ(char_error_a, char_error_b);\n  // Now train some more iterations.\n  lstm_2d_err_b = TrainIterations(kTrainerIterations / 3);\n  act_error_b = trainer_->ActivationError();\n  char_error_b = trainer_->CharError();\n  // Unpack into a new trainer and train that some 
more too.\n  SetupTrainerEng(\"[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]\", \"2-D-2-layer-lstm\", false,\n                  false);\n  EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, *trainer_));\n  lstm_2d_err_a = TrainIterations(kTrainerIterations / 3);\n  act_error_a = trainer_->ActivationError();\n  char_error_a = trainer_->CharError();\n  EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);\n  EXPECT_FLOAT_EQ(act_error_a, act_error_b);\n  EXPECT_FLOAT_EQ(char_error_a, char_error_b);\n  LOG(INFO) << \"********** *** ************\\n\";\n}\n\n// The baseline network against which to test the built-in softmax.\nTEST_F(LSTMTrainerTest, SoftmaxBaselineTest) {\n  // A basic single-layer, single direction LSTM.\n  SetupTrainerEng(\"[1,1,0,32 Lfx96 O1c1]\", \"1D-lstm\", false, true);\n  double lstm_uni_err = TrainIterations(kTrainerIterations * 2);\n  EXPECT_LT(lstm_uni_err, 60);\n  //  EXPECT_GT(lstm_uni_err, 48);\n  LOG(INFO) << \"********** Expected  < 60 ************\\n\";\n  // Check that it works in int mode too.\n  TestIntMode(kTrainerIterations);\n  // If we run TestIntMode again, it tests that int_mode networks can\n  // serialize and deserialize correctly.\n  double delta = TestIntMode(kTrainerIterations);\n  // The two tests (both of int mode this time) should be almost identical.\n  LOG(INFO) << \"Delta in Int mode error rates = \" << delta << \"\\n\";\n  EXPECT_LT(delta, 0.01);\n}\n\n// Tests that the built-in softmax does better than the external one,\n// which has an error rate slightly less than 55%, as tested by\n// SoftmaxBaselineTest.\nTEST_F(LSTMTrainerTest, SoftmaxTest) {\n  // LSTM with a built-in softmax can beat the external softmax.\n  SetupTrainerEng(\"[1,1,0,32 LS96]\", \"Lstm-+-softmax\", false, true);\n  double lstm_sm_err = TrainIterations(kTrainerIterations * 2);\n  EXPECT_LT(lstm_sm_err, 49.0);\n  LOG(INFO) << \"********** Expected  < 49 ************\\n\";\n  // Check that it works in int mode too.\n  
TestIntMode(kTrainerIterations);\n}\n\n// Tests that the built-in encoded softmax does better than the external one.\n// It takes a lot of iterations to get there.\nTEST_F(LSTMTrainerTest, EncodedSoftmaxTest) {\n  // LSTM with a built-in encoded softmax can beat the external softmax.\n  SetupTrainerEng(\"[1,1,0,32 LE96]\", \"Lstm-+-softmax\", false, true);\n  double lstm_sm_err = TrainIterations(kTrainerIterations * 2);\n  EXPECT_LT(lstm_sm_err, 62.0);\n  LOG(INFO) << \"********** Expected   < 62 ************\\n\";\n  // Check that it works in int mode too.\n  TestIntMode(kTrainerIterations);\n}\n\n// Tests that layer access methods work correctly.\nTEST_F(LSTMTrainerTest, TestLayerAccess) {\n  // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom.\n  SetupTrainerEng(\"[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]\", \"SQU-lstm\", false, false);\n  // Number of layers.\n  const size_t kNumLayers = 8;\n  // Expected layer names.\n  const char *kLayerIds[kNumLayers] = {\":0\", \":1:0\", \":1:1\", \":2\", \":3:0\", \":4:0\", \":4:1:0\", \":5\"};\n  const char *kLayerNames[kNumLayers] = {\"Input\",  \"Convolve\",  \"ConvNL\", \"Maxpool\",\n                                         \"Lfys32\", \"Lbx128LTR\", \"Lbx128\", \"Output\"};\n  // Expected number of weights.\n  const int kNumWeights[kNumLayers] = {0,\n                                       0,\n                                       16 * (25 + 1),\n                                       0,\n                                       32 * (4 * (32 + 16 + 1)),\n                                       128 * (4 * (128 + 32 + 1)),\n                                       128 * (4 * (128 + 32 + 1)),\n                                       112 * (2 * 128 + 1)};\n\n  auto layers = trainer_->EnumerateLayers();\n  EXPECT_EQ(kNumLayers, layers.size());\n  for (unsigned i = 0; i < kNumLayers && i < layers.size(); ++i) {\n    EXPECT_STREQ(kLayerIds[i], layers[i].c_str());\n    EXPECT_STREQ(kLayerNames[i], 
trainer_->GetLayer(layers[i])->name().c_str());\n    EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights());\n  }\n}\n\n} // namespace tesseract.\n"
  },
  {
    "path": "unittest/lstm_test.h",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_UNITTEST_LSTM_TEST_H_\n#define TESSERACT_UNITTEST_LSTM_TEST_H_\n\n#include <memory>\n#include <string>\n#include <utility>\n\n#include \"include_gunit.h\"\n\n#include \"helpers.h\"\n\n#include \"functions.h\"\n#include \"lang_model_helpers.h\"\n#include \"log.h\" // for LOG\n#include \"lstmtrainer.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\n#if DEBUG_DETAIL == 0\n// Number of iterations to run all the trainers.\nconst int kTrainerIterations = 600;\n// Number of iterations between accuracy checks.\nconst int kBatchIterations = 100;\n#else\n// Number of iterations to run all the trainers.\nconst int kTrainerIterations = 2;\n// Number of iterations between accuracy checks.\nconst int kBatchIterations = 1;\n#endif\n\n// The fixture for testing LSTMTrainer.\nclass LSTMTrainerTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\n  LSTMTrainerTest() = default;\n  std::string TestDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESTDATA_DIR, \"\" + name);\n  }\n  std::string TessDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESSDATA_DIR, \"\" + name);\n  }\n  std::string TestingNameToPath(const std::string &name) {\n    return file::JoinPath(TESTING_DIR, \"\" + name);\n  }\n\n  void SetupTrainerEng(const 
std::string &network_spec, const std::string &model_name, bool recode,\n                       bool adam) {\n    SetupTrainer(network_spec, model_name, \"eng/eng.unicharset\", \"eng.Arial.exp0.lstmf\", recode,\n                 adam, 5e-4, false, \"eng\");\n  }\n  void SetupTrainer(const std::string &network_spec, const std::string &model_name,\n                    const std::string &unicharset_file, const std::string &lstmf_file, bool recode,\n                    bool adam, float learning_rate, bool layer_specific, const std::string &kLang) {\n    //    constexpr char kLang[] = \"eng\";  // Exact value doesn't matter.\n    std::string unicharset_name = TestDataNameToPath(unicharset_file);\n    UNICHARSET unicharset;\n    ASSERT_TRUE(unicharset.load_from_file(unicharset_name.c_str(), false));\n    std::string script_dir = file::JoinPath(LANGDATA_DIR, \"\");\n    std::vector<std::string> words;\n    EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, \"\", FLAGS_test_tmpdir, kLang, !recode,\n                                  words, words, words, false, nullptr, nullptr));\n    std::string model_path = file::JoinPath(FLAGS_test_tmpdir, model_name);\n    std::string checkpoint_path = model_path + \"_checkpoint\";\n    trainer_ = std::make_unique<LSTMTrainer>(model_path.c_str(), checkpoint_path.c_str(), 0, 0);\n    trainer_->InitCharSet(\n        file::JoinPath(FLAGS_test_tmpdir, kLang, kLang) + \".traineddata\");\n    int net_mode = adam ? 
NF_ADAM : 0;\n    // Adam needs a higher learning rate, due to not multiplying the effective\n    // rate by 1/(1-momentum).\n    if (adam) {\n      learning_rate *= 20.0f;\n    }\n    if (layer_specific) {\n      net_mode |= NF_LAYER_SPECIFIC_LR;\n    }\n    EXPECT_TRUE(\n        trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1, learning_rate, 0.9, 0.999));\n    std::vector<std::string> filenames;\n    filenames.emplace_back(TestDataNameToPath(lstmf_file).c_str());\n    EXPECT_TRUE(trainer_->LoadAllTrainingData(filenames, CS_SEQUENTIAL, false));\n    LOG(INFO) << \"Setup network:\" << model_name << \"\\n\";\n  }\n  // Trains for a given number of iterations and returns the char error rate.\n  double TrainIterations(int max_iterations) {\n    int iteration = trainer_->training_iteration();\n    int iteration_limit = iteration + max_iterations;\n    double best_error = 100.0;\n    do {\n      std::stringstream log_str;\n      int target_iteration = iteration + kBatchIterations;\n      // Train a few.\n      double mean_error = 0.0;\n      while (iteration < target_iteration && iteration < iteration_limit) {\n        trainer_->TrainOnLine(trainer_.get(), false);\n        iteration = trainer_->training_iteration();\n        mean_error += trainer_->LastSingleError(ET_CHAR_ERROR);\n      }\n      trainer_->MaintainCheckpoints(nullptr, log_str);\n      iteration = trainer_->training_iteration();\n      mean_error *= 100.0 / kBatchIterations;\n      if (mean_error < best_error) {\n        best_error = mean_error;\n      }\n    } while (iteration < iteration_limit);\n    LOG(INFO) << \"Trainer error rate = \" << best_error << \"\\n\";\n    return best_error;\n  }\n  // Tests for a given number of iterations and returns the char error rate.\n  double TestIterations(int max_iterations) {\n    CHECK_GT(max_iterations, 0);\n    int iteration = trainer_->sample_iteration();\n    double mean_error = 0.0;\n    int error_count = 0;\n    while (error_count < 
max_iterations) {\n      const ImageData &trainingdata =\n          *trainer_->mutable_training_data()->GetPageBySerial(iteration);\n      NetworkIO fwd_outputs, targets;\n      if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) != UNENCODABLE) {\n        mean_error += trainer_->NewSingleError(ET_CHAR_ERROR);\n        ++error_count;\n      }\n      trainer_->SetIteration(++iteration);\n    }\n    mean_error *= 100.0 / max_iterations;\n    LOG(INFO) << \"Tester error rate = \" << mean_error << \"\\n\";\n    return mean_error;\n  }\n  // Tests that the current trainer_ can be converted to int mode and still gets\n  // within 1% of the error rate. Returns the increase in error from float to\n  // int.\n  double TestIntMode(int test_iterations) {\n    std::vector<char> trainer_data;\n    EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, *trainer_, &trainer_data));\n    // Get the error on the next few iterations in float mode.\n    double float_err = TestIterations(test_iterations);\n    // Restore the dump, convert to int and test error on that.\n    EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_data, *trainer_));\n    trainer_->ConvertToInt();\n    double int_err = TestIterations(test_iterations);\n    EXPECT_LT(int_err, float_err + 1.0);\n    return int_err - float_err;\n  }\n  // Sets up a trainer with the given language and given recode+ctc condition.\n  // It then verifies that the given str encodes and decodes back to the same\n  // string.\n  void TestEncodeDecode(const std::string &lang, const std::string &str, bool recode) {\n    std::string unicharset_name = lang + \"/\" + lang + \".unicharset\";\n    std::string lstmf_name = lang + \".Arial_Unicode_MS.exp0.lstmf\";\n    SetupTrainer(\"[1,1,0,32 Lbx100 O1c1]\", \"bidi-lstm\", unicharset_name, lstmf_name, recode, true,\n                 5e-4f, true, lang);\n    std::vector<int> labels;\n    EXPECT_TRUE(trainer_->EncodeString(str.c_str(), &labels));\n    std::string decoded = 
trainer_->DecodeLabels(labels);\n    std::string decoded_str(&decoded[0], decoded.length());\n    EXPECT_EQ(str, decoded_str);\n  }\n  // Calls TestEncodeDecode with both recode on and off.\n  void TestEncodeDecodeBoth(const std::string &lang, const std::string &str) {\n    TestEncodeDecode(lang, str, false);\n    TestEncodeDecode(lang, str, true);\n  }\n\n  std::unique_ptr<LSTMTrainer> trainer_;\n};\n\n} // namespace tesseract.\n\n#endif // TESSERACT_UNITTEST_LSTM_TEST_H_\n"
  },
  {
    "path": "unittest/lstmtrainer_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include \"lstm_test.h\"\n\nnamespace tesseract {\n\nTEST_F(LSTMTrainerTest, EncodesEng) {\n  TestEncodeDecodeBoth(\"eng\", \"The quick brown 'fox' jumps over: the lazy dog!\");\n}\n\nTEST_F(LSTMTrainerTest, EncodesKan) {\n  TestEncodeDecodeBoth(\"kan\", \"ಫ್ರಬ್ರವರಿ ತತ್ವಾಂಶಗಳೆಂದರೆ ಮತ್ತು ಜೊತೆಗೆ ಕ್ರಮವನ್ನು\");\n}\n\nTEST_F(LSTMTrainerTest, EncodesKor) {\n  TestEncodeDecodeBoth(\"kor\", \"이는 것으로 다시 넣을 수는 있지만 선택의 의미는\");\n}\n\nTEST_F(LSTMTrainerTest, MapCoder) {\n  LSTMTrainer fra_trainer;\n  fra_trainer.InitCharSet(TestDataNameToPath(\"fra/fra.traineddata\"));\n  LSTMTrainer deu_trainer;\n  deu_trainer.InitCharSet(TestDataNameToPath(\"deu/deu.traineddata\"));\n  // A string that uses characters common to French and German.\n  std::string kTestStr = \"The quick brown 'fox' jumps over: the lazy dog!\";\n  std::vector<int> deu_labels;\n  EXPECT_TRUE(deu_trainer.EncodeString(kTestStr.c_str(), &deu_labels));\n  // The french trainer cannot decode them correctly.\n  std::string badly_decoded = fra_trainer.DecodeLabels(deu_labels);\n  std::string bad_str(&badly_decoded[0], badly_decoded.length());\n  LOG(INFO) << \"bad_str fra=\" << bad_str << \"\\n\";\n  EXPECT_NE(kTestStr, bad_str);\n  // Encode the string as fra.\n  std::vector<int> fra_labels;\n  EXPECT_TRUE(fra_trainer.EncodeString(kTestStr.c_str(), 
&fra_labels));\n  // Use the mapper to compute what the labels are as deu.\n  std::vector<int> mapping =\n      fra_trainer.MapRecoder(deu_trainer.GetUnicharset(), deu_trainer.GetRecoder());\n  std::vector<int> mapped_fra_labels(fra_labels.size(), -1);\n  for (unsigned i = 0; i < fra_labels.size(); ++i) {\n    mapped_fra_labels[i] = mapping[fra_labels[i]];\n    EXPECT_NE(-1, mapped_fra_labels[i]) << \"i=\" << i << \", ch=\" << kTestStr[i];\n    EXPECT_EQ(mapped_fra_labels[i], deu_labels[i])\n        << \"i=\" << i << \", ch=\" << kTestStr[i] << \" has deu label=\" << deu_labels[i]\n        << \", but mapped to \" << mapped_fra_labels[i];\n  }\n  // The german trainer can now decode them correctly.\n  std::string decoded = deu_trainer.DecodeLabels(mapped_fra_labels);\n  std::string ok_str(&decoded[0], decoded.length());\n  LOG(INFO) << \"ok_str deu=\" << ok_str << \"\\n\";\n  EXPECT_EQ(kTestStr, ok_str);\n}\n\n// Tests that the actual fra model can be converted to the deu character set\n// and still read an eng image with 100% accuracy.\nTEST_F(LSTMTrainerTest, ConvertModel) {\n  // Setup a trainer with a deu charset.\n  LSTMTrainer deu_trainer;\n  deu_trainer.InitCharSet(TestDataNameToPath(\"deu/deu.traineddata\"));\n  // Load the fra traineddata, strip out the model, and save to a tmp file.\n  TessdataManager mgr;\n  std::string fra_data = file::JoinPath(TESSDATA_DIR \"_best\", \"fra.traineddata\");\n  CHECK(mgr.Init(fra_data.c_str()));\n  LOG(INFO) << \"Load \" << fra_data << \"\\n\";\n  file::MakeTmpdir();\n  std::string model_path = file::JoinPath(FLAGS_test_tmpdir, \"fra.lstm\");\n  CHECK(mgr.ExtractToFile(model_path.c_str()));\n  LOG(INFO) << \"Extract \" << model_path << \"\\n\";\n  // Load the fra model into the deu_trainer, and save the converted model.\n  CHECK(deu_trainer.TryLoadingCheckpoint(model_path.c_str(), fra_data.c_str()));\n  LOG(INFO) << \"Checkpoint load for \" << model_path << \" and \" << fra_data << \"\\n\";\n  std::string deu_data = 
file::JoinPath(FLAGS_test_tmpdir, \"deu.traineddata\");\n  CHECK(deu_trainer.SaveTraineddata(deu_data.c_str()));\n  LOG(INFO) << \"Save \" << deu_data << \"\\n\";\n  // Now run the saved model on phototest. (See BasicTesseractTest in\n  // baseapi_test.cc).\n  TessBaseAPI api;\n  api.Init(FLAGS_test_tmpdir, \"deu\", tesseract::OEM_LSTM_ONLY);\n  Image src_pix = pixRead(TestingNameToPath(\"phototest.tif\").c_str());\n  CHECK(src_pix);\n  api.SetImage(src_pix);\n  std::unique_ptr<char[]> result(api.GetUTF8Text());\n  std::string truth_text;\n  CHECK_OK(\n      file::GetContents(TestingNameToPath(\"phototest.gold.txt\"), &truth_text, file::Defaults()));\n\n  EXPECT_STREQ(truth_text.c_str(), result.get());\n  src_pix.destroy();\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/mastertrainer_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n// Although this is a trivial-looking test, it exercises a lot of code:\n// SampleIterator has to correctly iterate over the correct characters, or\n// it will fail.\n// The canonical and cloud features computed by TrainingSampleSet need to\n// be correct, along with the distance caches, organizing samples by font\n// and class, indexing of features, distance calculations.\n// IntFeatureDist has to work, or the canonical samples won't work.\n// Mastertrainer has ability to read tr files and set itself up tested.\n// Finally the serialize/deserialize test ensures that MasterTrainer,\n// TrainingSampleSet, TrainingSample can all serialize/deserialize correctly\n// enough to reproduce the same results.\n\n#include \"include_gunit.h\"\n\n#include \"commontraining.h\"\n#include \"errorcounter.h\"\n#include \"log.h\" // for LOG\n#include \"mastertrainer.h\"\n#include \"shapeclassifier.h\"\n#include \"shapetable.h\"\n#include \"trainingsample.h\"\n#include \"unicharset.h\"\n\n#include <string>\n#include <utility>\n#include <vector>\n\nusing namespace tesseract;\n\n// Specs of the MockClassifier.\nstatic const int kNumTopNErrs = 10;\nstatic const int kNumTop2Errs = kNumTopNErrs + 20;\nstatic const int kNumTop1Errs = kNumTop2Errs + 30;\nstatic const int kNumTopTopErrs = kNumTop1Errs + 25;\nstatic const int kNumNonReject = 1000;\nstatic const int kNumCorrect = kNumNonReject 
- kNumTop1Errs;\n// The total number of answers is given by the number of non-rejects plus\n// all the multiple answers.\nstatic const int kNumAnswers = kNumNonReject + 2 * (kNumTop2Errs - kNumTopNErrs) +\n                               (kNumTop1Errs - kNumTop2Errs) + (kNumTopTopErrs - kNumTop1Errs);\n\n#ifndef DISABLED_LEGACY_ENGINE\nstatic bool safe_strto32(const std::string &str, int *pResult) {\n  long n = strtol(str.c_str(), nullptr, 0);\n  *pResult = n;\n  return true;\n}\n#endif\n\n// Mock ShapeClassifier that cheats by looking at the correct answer, and\n// creates a specific pattern of errors that can be tested.\nclass MockClassifier : public ShapeClassifier {\npublic:\n  explicit MockClassifier(ShapeTable *shape_table)\n      : shape_table_(shape_table), num_done_(0), done_bad_font_(false) {\n    // Add a false font answer to the shape table. We pick a random unichar_id,\n    // add a new shape for it with a false font. Font must actually exist in\n    // the font table, but not match anything in the first 1000 samples.\n    false_unichar_id_ = 67;\n    false_shape_ = shape_table_->AddShape(false_unichar_id_, 25);\n  }\n  ~MockClassifier() override = default;\n\n  // Classifies the given [training] sample, writing to results.\n  // If debug is non-zero, then various degrees of classifier dependent debug\n  // information is provided.\n  // If keep_this (a shape index) is >= 0, then the results should always\n  // contain keep_this, and (if possible) anything of intermediate confidence.\n  // The return value is the number of classes saved in results.\n  int ClassifySample(const TrainingSample &sample, Image page_pix, int debug, UNICHAR_ID keep_this,\n                     std::vector<ShapeRating> *results) override {\n    results->clear();\n    // Everything except the first kNumNonReject is a reject.\n    if (++num_done_ > kNumNonReject) {\n      return 0;\n    }\n\n    int class_id = sample.class_id();\n    int font_id = sample.font_id();\n    int 
shape_id = shape_table_->FindShape(class_id, font_id);\n    // Get ids of some wrong answers.\n    int wrong_id1 = shape_id > 10 ? shape_id - 1 : shape_id + 1;\n    int wrong_id2 = shape_id > 10 ? shape_id - 2 : shape_id + 2;\n    if (num_done_ <= kNumTopNErrs) {\n      // The first kNumTopNErrs are top-n errors.\n      results->push_back(ShapeRating(wrong_id1, 1.0f));\n    } else if (num_done_ <= kNumTop2Errs) {\n      // The next kNumTop2Errs - kNumTopNErrs are top-2 errors.\n      results->push_back(ShapeRating(wrong_id1, 1.0f));\n      results->push_back(ShapeRating(wrong_id2, 0.875f));\n      results->push_back(ShapeRating(shape_id, 0.75f));\n    } else if (num_done_ <= kNumTop1Errs) {\n      // The next kNumTop1Errs - kNumTop2Errs are top-1 errors.\n      results->push_back(ShapeRating(wrong_id1, 1.0f));\n      results->push_back(ShapeRating(shape_id, 0.8f));\n    } else if (num_done_ <= kNumTopTopErrs) {\n      // The next kNumTopTopErrs - kNumTop1Errs are cases where the actual top\n      // is not correct, but do not count as a top-1 error because the rating\n      // is close enough to the top answer.\n      results->push_back(ShapeRating(wrong_id1, 1.0f));\n      results->push_back(ShapeRating(shape_id, 0.99f));\n    } else if (!done_bad_font_ && class_id == false_unichar_id_) {\n      // There is a single character with a bad font.\n      results->push_back(ShapeRating(false_shape_, 1.0f));\n      done_bad_font_ = true;\n    } else {\n      // Everything else is correct.\n      results->push_back(ShapeRating(shape_id, 1.0f));\n    }\n    return results->size();\n  }\n  // Provides access to the ShapeTable that this classifier works with.\n  const ShapeTable *GetShapeTable() const override {\n    return shape_table_;\n  }\n\nprivate:\n  // Borrowed pointer to the ShapeTable.\n  ShapeTable *shape_table_;\n  // Unichar_id of a random character that occurs after the first 60 samples.\n  int false_unichar_id_;\n  // Shape index of prepared false answer for 
false_unichar_id.\n  int false_shape_;\n  // The number of classifications we have processed.\n  int num_done_;\n  // True after the false font has been emitted.\n  bool done_bad_font_;\n};\n\nconst double kMin1lDistance = 0.25;\n\n// The fixture for testing Tesseract.\nclass MasterTrainerTest : public testing::Test {\n#ifndef DISABLED_LEGACY_ENGINE\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\n  std::string TestDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESTING_DIR, name);\n  }\n  std::string TmpNameToPath(const std::string &name) {\n    return file::JoinPath(FLAGS_test_tmpdir, name);\n  }\n\n  MasterTrainerTest() :\n    shape_table_(nullptr),\n    master_trainer_(nullptr) {\n  }\n  ~MasterTrainerTest() override {\n    delete shape_table_;\n  }\n\n  // Initializes the master_trainer_ and shape_table_.\n  // if load_from_tmp, then reloads a master trainer that was saved by a\n  // previous call in which it was false.\n  void LoadMasterTrainer() {\n    FLAGS_output_trainer = TmpNameToPath(\"tmp_trainer\").c_str();\n    FLAGS_F = file::JoinPath(LANGDATA_DIR, \"font_properties\").c_str();\n    FLAGS_X = TestDataNameToPath(\"eng.xheights\").c_str();\n    FLAGS_U = TestDataNameToPath(\"eng.unicharset\").c_str();\n    std::string tr_file_name(TestDataNameToPath(\"eng.Arial.exp0.tr\"));\n    const char *filelist[] = {tr_file_name.c_str(), nullptr};\n    std::string file_prefix;\n    delete shape_table_;\n    shape_table_ = nullptr;\n    master_trainer_ = LoadTrainingData(filelist, false, &shape_table_, file_prefix);\n    EXPECT_TRUE(master_trainer_ != nullptr);\n    EXPECT_TRUE(shape_table_ != nullptr);\n  }\n\n  // EXPECTs that the distance between I and l in Arial is 0 and that the\n  // distance to 1 is significantly not 0.\n  void VerifyIl1() {\n    // Find the font id for Arial.\n    int font_id = master_trainer_->GetFontInfoId(\"Arial\");\n    EXPECT_GE(font_id, 0);\n    
// Track down the characters we are interested in.\n    int unichar_I = master_trainer_->unicharset().unichar_to_id(\"I\");\n    EXPECT_GT(unichar_I, 0);\n    int unichar_l = master_trainer_->unicharset().unichar_to_id(\"l\");\n    EXPECT_GT(unichar_l, 0);\n    int unichar_1 = master_trainer_->unicharset().unichar_to_id(\"1\");\n    EXPECT_GT(unichar_1, 0);\n    // Now get the shape ids.\n    int shape_I = shape_table_->FindShape(unichar_I, font_id);\n    EXPECT_GE(shape_I, 0);\n    int shape_l = shape_table_->FindShape(unichar_l, font_id);\n    EXPECT_GE(shape_l, 0);\n    int shape_1 = shape_table_->FindShape(unichar_1, font_id);\n    EXPECT_GE(shape_1, 0);\n\n    float dist_I_l = master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l);\n    // No tolerance here. We expect that I and l should match exactly.\n    EXPECT_EQ(0.0f, dist_I_l);\n    float dist_l_I = master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I);\n    // BOTH ways.\n    EXPECT_EQ(0.0f, dist_l_I);\n\n    // l/1 on the other hand should be distinct.\n    float dist_l_1 = master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1);\n    EXPECT_GT(dist_l_1, kMin1lDistance);\n    float dist_1_l = master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l);\n    EXPECT_GT(dist_1_l, kMin1lDistance);\n\n    // So should I/1.\n    float dist_I_1 = master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1);\n    EXPECT_GT(dist_I_1, kMin1lDistance);\n    float dist_1_I = master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I);\n    EXPECT_GT(dist_1_I, kMin1lDistance);\n  }\n\n  // Objects declared here can be used by all tests in the test case for Foo.\n  ShapeTable *shape_table_;\n  std::unique_ptr<MasterTrainer> master_trainer_;\n#endif\n};\n\n// Tests that the MasterTrainer correctly loads its data and reaches the correct\n// conclusion over the distance between Arial I l and 1.\nTEST_F(MasterTrainerTest, Il1Test) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test 
because LoadTrainingData is missing.\n  GTEST_SKIP();\n#else\n  // Initialize the master_trainer_ and load the Arial tr file.\n  LoadMasterTrainer();\n  VerifyIl1();\n#endif\n}\n\n// Tests the ErrorCounter using a MockClassifier to check that it counts\n// error categories correctly.\nTEST_F(MasterTrainerTest, ErrorCounterTest) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because LoadTrainingData is missing.\n  GTEST_SKIP();\n#else\n  // Initialize the master_trainer_ from the saved tmp file.\n  LoadMasterTrainer();\n  // Add the space character to the shape_table_ if not already present to\n  // count junk.\n  if (shape_table_->FindShape(0, -1) < 0) {\n    shape_table_->AddShape(0, 0);\n  }\n  // Make a mock classifier.\n  auto shape_classifier = std::make_unique<MockClassifier>(shape_table_);\n  // Get the accuracy report.\n  std::string accuracy_report;\n  master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0, false,\n                                           shape_classifier.get(), &accuracy_report);\n  LOG(INFO) << accuracy_report.c_str();\n  std::string result_string = accuracy_report.c_str();\n  std::vector<std::string> results = split(result_string, '\\t');\n  EXPECT_EQ(tesseract::CT_SIZE + 1, results.size());\n  int result_values[tesseract::CT_SIZE];\n  for (int i = 0; i < tesseract::CT_SIZE; ++i) {\n    EXPECT_TRUE(safe_strto32(results[i + 1], &result_values[i]));\n  }\n  // These tests are more-or-less immune to additions to the number of\n  // categories or changes in the training data.\n  int num_samples = master_trainer_->GetSamples()->num_raw_samples();\n  EXPECT_EQ(kNumCorrect, result_values[tesseract::CT_UNICHAR_TOP_OK]);\n  EXPECT_EQ(1, result_values[tesseract::CT_FONT_ATTR_ERR]);\n  EXPECT_EQ(kNumTopTopErrs, result_values[tesseract::CT_UNICHAR_TOPTOP_ERR]);\n  EXPECT_EQ(kNumTop1Errs, result_values[tesseract::CT_UNICHAR_TOP1_ERR]);\n  EXPECT_EQ(kNumTop2Errs, result_values[tesseract::CT_UNICHAR_TOP2_ERR]);\n  
EXPECT_EQ(kNumTopNErrs, result_values[tesseract::CT_UNICHAR_TOPN_ERR]);\n  // Each of the TOPTOP errs also counts as a multi-unichar.\n  EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs, result_values[tesseract::CT_OK_MULTI_UNICHAR]);\n  EXPECT_EQ(num_samples - kNumNonReject, result_values[tesseract::CT_REJECT]);\n  EXPECT_EQ(kNumAnswers, result_values[tesseract::CT_NUM_RESULTS]);\n#endif\n}\n"
  },
  {
    "path": "unittest/matrix_test.cc",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        matrix_test.cc\n// Author:      rays@google.com (Ray Smith)\n//\n// Copyright 2016 Google Inc. All Rights Reserved.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n#include \"matrix.h\"\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass MatrixTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n  // Fills src_ with data so it can pretend to be a tensor thus:\n  //  dims_=[5, 4, 3, 2]\n  //  array_=[0, 1, 2, ....119]\n  //  tensor=[[[[0, 1][2, 3][4, 5]]\n  //           [[6, 7][8, 9][10, 11]]\n  //           [[12, 13][14, 15][16, 17]]\n  //           [[18, 19][20, 21][22, 23]]]\n  //          [[[24, 25]...\n  MatrixTest() {\n    src_.Resize(1, kInputSize_, 0);\n    for (int i = 0; i < kInputSize_; ++i) {\n      src_.put(0, i, i);\n    }\n    for (int i = 0; i < kNumDims_; ++i) {\n      dims_[i] = 5 - i;\n    }\n  }\n  // Number of dimensions in src_.\n  static const int kNumDims_ = 4;\n  // Number of elements in src_.\n  static const int kInputSize_ = 120;\n  // Size of each dimension in src_;\n  int dims_[kNumDims_];\n  // Input array filled with [0,kInputSize).\n  GENERIC_2D_ARRAY<int> src_;\n};\n\n// Tests that the RotatingTranspose function does the right thing for various\n// transformations.\n// 
dims=[5, 4, 3, 2]->[5, 2, 4, 3]\nTEST_F(MatrixTest, RotatingTranspose_3_1) {\n  GENERIC_2D_ARRAY<int> m;\n  src_.RotatingTranspose(dims_, kNumDims_, 3, 1, &m);\n  m.ResizeNoInit(kInputSize_ / 3, 3);\n  // Verify that the result is:\n  // output tensor=[[[[0, 2, 4][6, 8, 10][12, 14, 16][18, 20, 22]]\n  //                 [[1, 3, 5][7, 9, 11][13, 15, 17][19, 21, 23]]]\n  //                [[[24, 26, 28]...\n  EXPECT_EQ(0, m(0, 0));\n  EXPECT_EQ(2, m(0, 1));\n  EXPECT_EQ(4, m(0, 2));\n  EXPECT_EQ(6, m(1, 0));\n  EXPECT_EQ(1, m(4, 0));\n  EXPECT_EQ(24, m(8, 0));\n  EXPECT_EQ(26, m(8, 1));\n  EXPECT_EQ(25, m(12, 0));\n}\n\n// dims=[5, 4, 3, 2]->[3, 5, 4, 2]\nTEST_F(MatrixTest, RotatingTranspose_2_0) {\n  GENERIC_2D_ARRAY<int> m;\n  src_.RotatingTranspose(dims_, kNumDims_, 2, 0, &m);\n  m.ResizeNoInit(kInputSize_ / 2, 2);\n  // Verify that the result is:\n  // output tensor=[[[[0, 1][6, 7][12, 13][18, 19]]\n  //                 [[24, 25][30, 31][36, 37][42, 43]]\n  //                 [[48, 49][54, 55][60, 61][66, 67]]\n  //                 [[72, 73][78, 79][84, 85][90, 91]]\n  //                 [[96, 97][102, 103][108, 109][114, 115]]]\n  //                [[[2,3]...\n  EXPECT_EQ(0, m(0, 0));\n  EXPECT_EQ(1, m(0, 1));\n  EXPECT_EQ(6, m(1, 0));\n  EXPECT_EQ(7, m(1, 1));\n  EXPECT_EQ(24, m(4, 0));\n  EXPECT_EQ(25, m(4, 1));\n  EXPECT_EQ(30, m(5, 0));\n  EXPECT_EQ(2, m(20, 0));\n}\n\n// dims=[5, 4, 3, 2]->[5, 3, 2, 4]\nTEST_F(MatrixTest, RotatingTranspose_1_3) {\n  GENERIC_2D_ARRAY<int> m;\n  src_.RotatingTranspose(dims_, kNumDims_, 1, 3, &m);\n  m.ResizeNoInit(kInputSize_ / 4, 4);\n  // Verify that the result is:\n  // output tensor=[[[[0, 6, 12, 18][1, 7, 13, 19]]\n  //                 [[2, 8, 14, 20][3, 9, 15, 21]]\n  //                 [[4, 10, 16, 22][5, 11, 17, 23]]]\n  //                [[[24, 30, 36, 42]...\n  EXPECT_EQ(0, m(0, 0));\n  EXPECT_EQ(6, m(0, 1));\n  EXPECT_EQ(1, m(1, 0));\n  EXPECT_EQ(2, m(2, 0));\n  EXPECT_EQ(3, m(3, 0));\n  EXPECT_EQ(4, m(4, 0));\n  
EXPECT_EQ(5, m(5, 0));\n  EXPECT_EQ(24, m(6, 0));\n  EXPECT_EQ(30, m(6, 1));\n}\n\n// dims=[5, 4, 3, 2]->[4, 3, 5, 2]\nTEST_F(MatrixTest, RotatingTranspose_0_2) {\n  GENERIC_2D_ARRAY<int> m;\n  src_.RotatingTranspose(dims_, kNumDims_, 0, 2, &m);\n  m.ResizeNoInit(kInputSize_ / 2, 2);\n  // Verify that the result is:\n  // output tensor=[[[[0, 1][24, 25][48, 49][72, 73][96, 97]]\n  //                 [[2, 3][26, 27][50, 51][74, 75][98, 99]]\n  //                 [[4, 5][28, 29][52, 53][76, 77][100, 101]]]\n  //                [[[6, 7]...\n  EXPECT_EQ(0, m(0, 0));\n  EXPECT_EQ(1, m(0, 1));\n  EXPECT_EQ(24, m(1, 0));\n  EXPECT_EQ(25, m(1, 1));\n  EXPECT_EQ(96, m(4, 0));\n  EXPECT_EQ(97, m(4, 1));\n  EXPECT_EQ(2, m(5, 0));\n  EXPECT_EQ(6, m(15, 0));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/networkio_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"networkio.h\"\n#include \"include_gunit.h\"\n#include \"stridemap.h\"\n#ifdef INCLUDE_TENSORFLOW\n#  include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D\n#endif\n\nnamespace tesseract {\n\nclass NetworkioTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n#ifdef INCLUDE_TENSORFLOW\n  // Sets up an Array2d object of the given size, initialized to increasing\n  // values starting with start.\n  std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) {\n    std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize));\n    int value = start;\n    for (int y = 0; y < ysize; ++y) {\n      for (int x = 0; x < xsize; ++x) {\n        (*a)(y, x) = value++;\n      }\n    }\n    return a;\n  }\n  // Sets up a NetworkIO with a batch of 2 \"images\" of known values.\n  void SetupNetworkIO(NetworkIO *nio) {\n    std::vector<std::unique_ptr<xla::Array2D<int>>> arrays;\n    arrays.push_back(SetupArray(3, 4, 0));\n    arrays.push_back(SetupArray(4, 5, 12));\n    std::vector<std::pair<int, int>> h_w_sizes;\n    for (size_t i = 0; i < arrays.size(); ++i) {\n      h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width());\n    }\n    StrideMap stride_map;\n    stride_map.SetStride(h_w_sizes);\n    nio->ResizeToMap(true, stride_map, 2);\n  
  // Iterate over the map, setting nio's contents from the arrays.\n    StrideMap::Index index(stride_map);\n    do {\n      int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT), index.index(FD_WIDTH));\n      nio->SetPixel(index.t(), 0, 128 + value, 0.0f, 128.0f);\n      nio->SetPixel(index.t(), 1, 128 - value, 0.0f, 128.0f);\n    } while (index.Increment());\n  }\n#endif\n};\n\n// Tests that the initialization via SetPixel works and the resize correctly\n// fills with zero where image sizes don't match.\nTEST_F(NetworkioTest, InitWithZeroFill) {\n#ifdef INCLUDE_TENSORFLOW\n  NetworkIO nio;\n  nio.Resize2d(true, 32, 2);\n  int width = nio.Width();\n  for (int t = 0; t < width; ++t) {\n    nio.SetPixel(t, 0, 0, 0.0f, 128.0f);\n    nio.SetPixel(t, 1, 0, 0.0f, 128.0f);\n  }\n  // The initialization will wipe out all previously set values.\n  SetupNetworkIO(&nio);\n  nio.ZeroInvalidElements();\n  StrideMap::Index index(nio.stride_map());\n  int next_t = 0;\n  int pos = 0;\n  do {\n    int t = index.t();\n    // The indexed values just increase monotonically.\n    int value = nio.i(t)[0];\n    EXPECT_EQ(value, pos);\n    value = nio.i(t)[1];\n    EXPECT_EQ(value, -pos);\n    // When we skip t values, the data is always 0.\n    while (next_t < t) {\n      EXPECT_EQ(nio.i(next_t)[0], 0);\n      EXPECT_EQ(nio.i(next_t)[1], 0);\n      ++next_t;\n    }\n    ++pos;\n    ++next_t;\n  } while (index.Increment());\n  EXPECT_EQ(pos, 32);\n  EXPECT_EQ(next_t, 40);\n#else\n  LOG(INFO) << \"Skip test because of missing xla::Array2D\";\n  GTEST_SKIP();\n#endif\n}\n\n// Tests that CopyWithYReversal works.\nTEST_F(NetworkioTest, CopyWithYReversal) {\n#ifdef INCLUDE_TENSORFLOW\n  NetworkIO nio;\n  SetupNetworkIO(&nio);\n  NetworkIO copy;\n  copy.CopyWithYReversal(nio);\n  StrideMap::Index index(copy.stride_map());\n  int next_t = 0;\n  int pos = 0;\n  std::vector<int> expected_values = {8,  9,  10, 11, 4,  5,  6,  7,  0,  1,  2,\n                                      3,  
27, 28, 29, 30, 31, 22, 23, 24, 25, 26,\n                                      17, 18, 19, 20, 21, 12, 13, 14, 15, 16};\n  do {\n    int t = index.t();\n    // The indexed values match the expected values.\n    int value = copy.i(t)[0];\n    EXPECT_EQ(value, expected_values[pos]);\n    value = copy.i(t)[1];\n    EXPECT_EQ(value, -expected_values[pos]);\n    // When we skip t values, the data is always 0.\n    while (next_t < t) {\n      EXPECT_EQ(copy.i(next_t)[0], 0) << \"Failure t = \" << next_t;\n      EXPECT_EQ(copy.i(next_t)[1], 0) << \"Failure t = \" << next_t;\n      ++next_t;\n    }\n    ++pos;\n    ++next_t;\n  } while (index.Increment());\n  EXPECT_EQ(pos, 32);\n  EXPECT_EQ(next_t, 40);\n#else\n  LOG(INFO) << \"Skip test because of missing xla::Array2D\";\n  GTEST_SKIP();\n#endif\n}\n\n// Tests that CopyWithXReversal works.\nTEST_F(NetworkioTest, CopyWithXReversal) {\n#ifdef INCLUDE_TENSORFLOW\n  NetworkIO nio;\n  SetupNetworkIO(&nio);\n  NetworkIO copy;\n  copy.CopyWithXReversal(nio);\n  StrideMap::Index index(copy.stride_map());\n  int next_t = 0;\n  int pos = 0;\n  std::vector<int> expected_values = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,\n                                      8,  16, 15, 14, 13, 12, 21, 20, 19, 18, 17,\n                                      26, 25, 24, 23, 22, 31, 30, 29, 28, 27};\n  do {\n    int t = index.t();\n    // The indexed values match the expected values.\n    int value = copy.i(t)[0];\n    EXPECT_EQ(value, expected_values[pos]);\n    value = copy.i(t)[1];\n    EXPECT_EQ(value, -expected_values[pos]);\n    // When we skip t values, the data is always 0.\n    while (next_t < t) {\n      EXPECT_EQ(copy.i(next_t)[0], 0) << \"Failure t = \" << next_t;\n      EXPECT_EQ(copy.i(next_t)[1], 0) << \"Failure t = \" << next_t;\n      ++next_t;\n    }\n    ++pos;\n    ++next_t;\n  } while (index.Increment());\n  EXPECT_EQ(pos, 32);\n  EXPECT_EQ(next_t, 40);\n#else\n  LOG(INFO) << \"Skip test because of missing xla::Array2D\";\n  
GTEST_SKIP();\n#endif\n}\n\n// Tests that CopyWithXYTranspose works.\nTEST_F(NetworkioTest, CopyWithXYTranspose) {\n#ifdef INCLUDE_TENSORFLOW\n  NetworkIO nio;\n  SetupNetworkIO(&nio);\n  NetworkIO copy;\n  copy.CopyWithXYTranspose(nio);\n  StrideMap::Index index(copy.stride_map());\n  int next_t = 0;\n  int pos = 0;\n  std::vector<int> expected_values = {0,  4,  8,  1,  5,  9,  2,  6,  10, 3,  7,\n                                      11, 12, 17, 22, 27, 13, 18, 23, 28, 14, 19,\n                                      24, 29, 15, 20, 25, 30, 16, 21, 26, 31};\n  do {\n    int t = index.t();\n    // The indexed values match the expected values.\n    int value = copy.i(t)[0];\n    EXPECT_EQ(value, expected_values[pos]);\n    value = copy.i(t)[1];\n    EXPECT_EQ(value, -expected_values[pos]);\n    // When we skip t values, the data is always 0.\n    while (next_t < t) {\n      EXPECT_EQ(copy.i(next_t)[0], 0);\n      EXPECT_EQ(copy.i(next_t)[1], 0);\n      ++next_t;\n    }\n    ++pos;\n    ++next_t;\n  } while (index.Increment());\n  EXPECT_EQ(pos, 32);\n  EXPECT_EQ(next_t, 40);\n#else\n  LOG(INFO) << \"Skip test because of missing xla::Array2D\";\n  GTEST_SKIP();\n#endif\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/normstrngs_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"normstrngs.h\"\n#include <tesseract/unichar.h>\n#include \"include_gunit.h\"\n#include \"normstrngs_test.h\"\n#ifdef INCLUDE_TENSORFLOW\n#  include \"util/utf8/unilib.h\" // for UniLib\n#endif\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\n#if defined(MISSING_CODE)\nstatic std::string EncodeAsUTF8(const char32 ch32) {\n  UNICHAR uni_ch(ch32);\n  return std::string(uni_ch.utf8(), uni_ch.utf8_len());\n}\n#endif\n\nTEST(NormstrngsTest, BasicText) {\n  const char *kBasicText = \"AbCd Ef\";\n  std::string result;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,\n                                  GraphemeNorm::kNormalize, kBasicText, &result));\n  EXPECT_STREQ(kBasicText, result.c_str());\n}\n\nTEST(NormstrngsTest, LigatureText) {\n  const char *kTwoByteLigText = \"ĳ\"; // U+0133 (ĳ) -> ij\n  std::string result;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,\n                                  GraphemeNorm::kNormalize, kTwoByteLigText, &result));\n  EXPECT_STREQ(\"ij\", result.c_str());\n\n  const char *kThreeByteLigText = \"ﬁnds\"; // U+FB01 (ﬁ) -> fi\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,\n                                  GraphemeNorm::kNormalize, kThreeByteLigText, &result));\n  EXPECT_STREQ(\"finds\", 
result.c_str());\n}\n\nTEST(NormstrngsTest, OcrSpecificNormalization) {\n  const char *kSingleQuoteText = \"‘Hi\"; // U+2018 (‘) -> U+027 (')\n  std::string result;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,\n                                  GraphemeNorm::kNormalize, kSingleQuoteText, &result));\n  EXPECT_STREQ(\"'Hi\", result.c_str());\n\n  const char *kDoubleQuoteText = \"“Hi\"; // U+201C (“) -> U+022 (\")\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,\n                                  GraphemeNorm::kNormalize, kDoubleQuoteText, &result));\n  EXPECT_STREQ(\"\\\"Hi\", result.c_str());\n\n  const char *kEmDash = \"Hi—\"; // U+2014 (—) -> U+02D (-)\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,\n                                  GraphemeNorm::kNormalize, kEmDash, &result));\n  EXPECT_STREQ(\"Hi-\", result.c_str());\n  // Without the ocr normalization, these changes are not made.\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  kSingleQuoteText, &result));\n  EXPECT_STREQ(kSingleQuoteText, result.c_str());\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  kDoubleQuoteText, &result));\n  EXPECT_STREQ(kDoubleQuoteText, result.c_str());\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  kEmDash, &result));\n  EXPECT_STREQ(kEmDash, result.c_str());\n}\n\n// Sample text used in tests.\nconst char kEngText[] = \"the quick brown fox jumps over the lazy dog\";\nconst char kHinText[] = \"पिताने विवाह की | हो गई उद्विग्न वह सोचा\";\nconst char kKorText[] = \"이는 것으로\";\n// Hindi words containing illegal vowel sequences.\nconst char *kBadlyFormedHinWords[] = {\"उपयोक्ताो\", \"नहीें\", \"प्रंात\", \"कहीअे\", \"पत्रिाका\", 
\"छह्णाीस\"};\n// Thai illegal sequences.\nconst char *kBadlyFormedThaiWords[] = {\"ฤิ\", \"กา้ํ\", \"กิำ\", \"นำ้\", \"เเก\"};\n\nTEST(NormstrngsTest, DetectsCorrectText) {\n  std::string chars;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  kEngText, &chars));\n  EXPECT_STREQ(kEngText, chars.c_str());\n\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  kHinText, &chars))\n      << \"Incorrect text: '\" << kHinText << \"'\";\n  EXPECT_STREQ(kHinText, chars.c_str());\n\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  kKorText, &chars));\n  EXPECT_STREQ(kKorText, chars.c_str());\n}\n\nTEST(NormstrngsTest, DetectsIncorrectText) {\n  for (auto &kBadlyFormedHinWord : kBadlyFormedHinWords) {\n    EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,\n                                     GraphemeNorm::kNormalize, kBadlyFormedHinWord, nullptr))\n        << kBadlyFormedHinWord;\n  }\n  for (auto &kBadlyFormedThaiWord : kBadlyFormedThaiWords) {\n    EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,\n                                     GraphemeNorm::kNormalize, kBadlyFormedThaiWord, nullptr))\n        << kBadlyFormedThaiWord;\n  }\n}\n\nTEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {\n  std::string nonindic = \"Here's some latin text.\";\n  std::string dest;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  nonindic.c_str(), &dest))\n      << PrintString32WithUnicodes(nonindic);\n  EXPECT_EQ(dest, nonindic);\n}\n\nTEST(NormstrngsTest, NoLonelyJoiners) {\n  std::string str = \"x\\u200d\\u0d06\\u0d34\\u0d02\";\n  std::vector<std::string> glyphs;\n  // Returns true, but the joiner is 
gone.\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[0], std::string(\"x\"));\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0d06\"));\n  EXPECT_EQ(glyphs[2], std::string(\"\\u0d34\\u0d02\"));\n}\n\nTEST(NormstrngsTest, NoLonelyJoinersPlus) {\n  std::string str = \"\\u0d2a\\u200d+\\u0d2a\\u0d4b\";\n  std::vector<std::string> glyphs;\n  // Returns true, but the joiner is gone.\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[0], std::string(\"\\u0d2a\"));\n  EXPECT_EQ(glyphs[1], std::string(\"+\"));\n  EXPECT_EQ(glyphs[2], std::string(\"\\u0d2a\\u0d4b\"));\n}\n\nTEST(NormstrngsTest, NoLonelyJoinersNonAlpha) {\n  std::string str = \"\\u200d+\\u200c\\u200d\";\n  // Returns true, but the joiners are gone.\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, std::string(\"+\"));\n  str = \"\\u200d\\u200c\\u200d\";\n  // Without the plus, the string is invalid.\n  std::string result;\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &result))\n      << PrintString32WithUnicodes(result);\n}\n\nTEST(NormstrngsTest, JoinersStayInArabic) {\n  std::string str = \"\\u0628\\u200c\\u0628\\u200d\\u0628\";\n  // Returns true, string untouched.\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 5, 5, 2, str);\n}\n\nTEST(NormstrngsTest, DigitOK) {\n  std::string str = \"\\u0cea\"; // Digit 4.\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);\n}\n\nTEST(NormstrngsTest, DandaOK) {\n  std::string str 
= \"\\u0964\"; // Single danda.\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);\n  str = \"\\u0965\"; // Double danda.\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);\n}\n\nTEST(NormstrngsTest, AllScriptsRegtest) {\n  // Tests some valid text in a large number of scripts, some of which were\n  // found to be rejected by an earlier version.\n  const std::vector<std::pair<std::string, std::string>> kScriptText(\n      {{\"Arabic\",\n        \" فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن\"\n        \"توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة \"\n        \"مجموعه هیچ اثری در فنون هنر و ادب و ترجمه، تقدیم پیشگاه ارجمند \"\n        \"سازنده تاریخ نگاه میکرد و به اصطلاح انسان و فطرت انسانی را زیربنای\"},\n       {\"Armenian\",\n        \"անտիկ աշխարհի փիլիսոփաների կենսագրությունը, թե′ նրանց ուս-\"\n        \"պատրաստւում է դալ (բուլամա): Կովկասում կաթից նաև պատ-\"\n        \"Հոգաբարձութեան յղել այդ անձին յիմարութիւնը հաստա-\"\n        \"գծերը եւ միջագծերը կը համրուին վարէն վեր:\"},\n       {\"Bengali\",\n        \"এসে দাঁড়ায় দাও খানি উঁচিয়ে নিয়ে । ঝরনার স্বচ্ছ জলে প্রতিবিম্বিত \"\n        \"পাঠিয়ে, গোবিন্দ স্মরণ করে, নির্ভয়ে রওনা হয়েছিল। তাতে সে \"\n        \"সুলতার। মনে পড়ে বিয়ের সময় বাবা এদের বাড়ি থেকে ঘুরে \"\n        \"কিন্তু তারপর মাতৃহৃদয় কেমন করে আছে? কী\"},\n       {\"Cyrillic\",\n        \"достей, є ще нагороди й почесті, є хай і сумнівна, але слава, \"\n        \"вып., 96б). Параўн. найсвятший у 1 знач., насвятейший у 1 знач., \"\n        \"»Правді«, — гітлерівські окупанти винищували нижчі раси, після дру- \"\n        \"І знов майдан зачорнів од народу. Всередині чоло-\"},\n       {\"Devanagari\",\n        \"डा॰ नै हात्तीमाथि चढेर त्यो भएनेर आइपुगे। राजालाई देखी \"\n        \"बाबतीत लिहिणे ही  एक मोठीच जबाबदारी आहे. काकासाहेबांच्या कार्याचा \"\n        \"प्रबंध, आधोगिक प्रबंध तथा बैंकिंग  एवम वाणिज्य आदि विषयों में \"\n        \"चित्रकृती दिल्या. 
शंभराहून अधिक देश आज आपापले चित्रपट निर्माण करीत\"},\n       {\"Greek\",\n        \"Μέσα ένα τετράδιο είχα στριμώξει το πρώτο \"\n        \"νον αξίως τού ευαγγελίου τού χριστού πολιτεύεσθε, ίνα \"\n        \"οὐδεμία ὑπ' αὐτοῦ μνεία γίνεται τῶν οἰκείων χωρίων. \"\n        \"είτα την φάσιν αυτήν ην ούτος εποιήσατο κατά του Μίκω-\"},\n       {\"Gujarati\",\n        \"ઉપહારગૃહે ને નાટ્યસ્થળે આ એ જ તેલ કડકડતું \"\n        \"શકી. ભાવવધારો અટકાવી નથી શકી અને બેકારીને \"\n        \"ત્યાં વાંકુથી પાછે  આવ્યો, ચોરીનો માલ સોંપવા ! \"\n        \"કહી. એણે રેશમના કપડામાં વીંટી રાખેલ કુંવરીની છબી\"},\n       {\"Gurmukhi\",\n        \"ਯਾਦ ਰਹੇ ਕਿ ‘ਨਫਰਤ ’ ਦਾ ਵਿਸ਼ਾ ਕ੍ਰਾਤੀ ਨਹੀ ਹੈ ਅਤੇ ਕਵੀ ਦੀ ਇਹ \"\n        \"ਮਹਾਂ ਨੰਦਾ ਕੋਲ ਇਕ ਚੀਜ਼ ਸੀ ਉਹ ਸੀ ਸਚ, ਕੋਰਾ ਸਚ, ਬੇਧਤ੍ਰਕ ਕਹਿੳ \"\n        \"ਭੂਰਾ  ਸਾਨੂੰ  ਥੜਾ  ਚੰਗਾ  ਲਗਦਾ  ਸੀ ।  ਉਸ  ਦਾ  ਇਕ  ਪੈਰ  ਜਨਮ ਤੋ \"\n        \"ਨੂੰ ਇਹ ਅਧਿਕਾਰ ਦਿੱਤਾ ਕਿ ਉਹ ਸਿੱਖ ਵਿਰੋਧ ਦਾ ਸੰਗਠਨ ਕਰੇ ਅਤੇ 3 ਸਤੰਬਰ,\"},\n       {\"Hangul\",\n        \"로 들어갔다. 이대통령은 아이젠하워 대통령의 뒷모습을 보면서 \"\n        \"그것뿐인 줄 아요? 노름도 했다 캅니다. 빌어묵을 놈이 그러 \"\n        \"의 가장 과학적 태도이며, 우리 역사를 가장 정확하게 학습할 수 있는 \"\n        \"마르크스 레\"\n        \"각하는 그는 그들의 식사보장을 위해 때때로 집에\"},\n       {\"HanS\",\n        \"大凡世界上的先生可 分 三 种： 第一种只会教书， 只会拿一 \"\n        \"书像是探宝一样，在茶叶店里我买过西湖龙井﹑黄山毛峰﹑福建的铁观音﹑大红\"\n        \" \"\n        \"持 “左” 倾冒险主义的干部，便扣上 “富农 \"\n        \"笑说：“我听说了，王总工程师也跟我说过了，只是工作忙，谁\"},\n       {\"HanT\",\n        \"叁、 銀行資產管理的群組分析模式 \"\n        \"民國六十三年，申請就讀台灣大學歷史研究所，並從事著述，\"\n        \"質言之﹐在社會結構中﹐性質﹑特徵﹑地位相類似的一羣人﹐由於 \"\n        \"董橋，一九四二年生，福建晉江人，國立成功大學外\"},\n       {\"Hebrew\",\n        \" אֵ-לִי, אֵ-לִי, כֵּיַצד מְטַפְּסִים בְּקִירוֹת שֶׁל זְכוּכִי\"\n        \" הראשון חוצה אותי שוב. אני בסיבוב הרביעי, הוא בטח מתחיל את\"\n        \" ווערטער  געהאט,  אבער  דער  עיקר  איז  ניט  דאָס  וואָרט,  נאָר\"\n        \" על גחלת היהדות המקורית בעירך, נתת צביון ואופי מיוחד\"},\n       {\"Japanese\",\n        \"は異民族とみなされていた。楚の荘王（前613〜前 \"\n        \"を詳細に吟味する。実際の治療活動の領域は便宜上、(1)　障害者 \"\n        \"困難性は多角企業の場合原則として部門別に判断されている.). 
\"\n        \"☆ご希望の団体には見本をお送りします\"},\n       {\"Kannada\",\n        \"ಕೂಡ ಯುದ್ಧ ಮಾಡಿ ಜಯಪಡೆ. ನಂತರ ನಗರದೊಳಕ್ಕೆ ನಡೆ ಇದನ್ನು \"\n        \"ಅಸಹ್ಯದೃಶ್ಯ ಯಾರಿಗಾದರೂ ನಾಚಿಕೆತರುವಂತಹದಾಗಿದೆ. ಆರೋಗ್ಯ ದೃಷ್ಟಿ \"\n        \"ಯಾಗಲಿ, ಮೋಹನನಾಗಲಿ ಇಂಥ ಬಿಸಿಲಿನಲ್ಲಿ ಎಂದೂ ಬಹಳ ಹೊತ್ತು \"\n        \"\\\"ಇದೆ...ಖಂಡಿತಾ ಇದೆ\\\" ಅಂದ ಮನಸ್ಸಿನಲ್ಲಿಯೇ ವಂದಿಸುತ್ತಾ,\"},\n       {\"Khmer\",\n        \"សិតសក់និងផ្លាស់សម្លៀកបំពាក់ពេលយប់ចេញ។ \"\n        \"និយាយអំពីនគរនេះ ប្រាប់ដល់លោកទាំងមូលឲ្យដឹងច្បាស់លាស់អំពី \"\n        \"កន្លះកាថាសម្រាប់ទន្ទេញឲ្យងាយចាំ បោះពុម្ពនៅក្នុងទ្រង់ទ្រាយបច្ចុប្បន្ន \"\n        \"ឯកសារនេះបានផ្សព្វផ្សាយនៅក្នុងសន្និសីទ\"},\n       {\"Lao\",\n        \"ເອີຍ ! ຟັງສຽງຟ້າມັນຮ້ອງຮ່ວນ ມັນດັງໄກໆ ເອີຍ \"\n        \"ໄດລຽງດູລາວມາດວບຄວາມລາບາກຫລາຍ; \"\n        \"ບາງໄດ້ ເຈົ້າລອງສູ້ບໍ່ໄດ້ຈຶ່ງຫນີລົງມາວຽງຈັນ. \"\n        \"ລົບອອກຈາກ 3 ເຫລືອ 1, ຂ້ອຍຂຽນ 1 (1)\"},\n       {\"Latin\",\n        \"režisoru, palīdzēja to manu domīgo, kluso Dzejas metru ielikt \"\n        \"Ešte nedávno sa chcel mladý Novomeský „liečiť” \"\n        \"tiivisia kysymyksiä, mistä seuraa, että spekula-   |   don luonteesta \"\n        \"Grabiel Sanchez, yang bertani selama 120 tahun meninggal\"},\n       {\"Malayalam\",\n        \"അമൂർത്തചിത്രമായിരിക്കും.  ഛേ! ആ വീട്ടിലേക്ക്  അവളൊന്നിച്ച്  പോകേണ്ടതാ \"\n        \"മൃഗങ്ങൾക്ക് എന്തെക്കിലും പറ്റിയാൽ മാത്രം ഞാനതു \"\n        \"വെലക്ക് വേണമെങ്കിൽ തരാം. എന്തോ തരും?  പറ. \"\n        \"എല്ലാം കഴിഞ്ഞ് സീനിയറിന്റെ അടുത്തു ചെന്ന് കാൽതൊട്ട\"},\n       {\"Tamil\",\n        \"பொருத்தமாகப் பாடினாள் நம் ஔவைப் பாட்டி. காவிரி \"\n        \"உள்ளடக்கி  நிற்பது  விநோத  வார்த்தையின் அஃறிணை \"\n        \"சூரிய   கிரஹண   சமயத்தில்   குருக்ஷேத்திரம்   செல்வது \"\n        \"காலங்களில் வெளியே போகும்பொழுது, 'ஸார்', 'ஸார்',\"},\n       {\"Telugu\",\n        \"1892లో ఆమె 10వ సంవత్సరంలో గుంటూరు తాలూకా వేములాపాడు \"\n        \"ఫండ్స్ చట్టము'నందు చేయబడెను. తరువాత క్రీ. శ. \"\n        \"సంచారము చేయును.  మీరు ఇప్పుడే కాళకాలయమునకు \"\n        \"ఎంతటి  సరళమైన  భాషలో  వ్రాశాడో  విశదమవుతుంది.   
పైగా  ఆనాటి   భాష\"},\n       {\"Thai\",\n        \"อ้อ! กับนัง....แม่ยอดพระกลิ่น นั่นเอง ! หรับก็ย่อมจะรู้โดยชัดเจนว่า \"\n        \"ถ้าตราบใดยังมีเรือปืนอยู่ใกล้ ๆ แล้ว  ตราบนั้น \"\n        \"พระดำรินี้ ที่มีคตีทำกรวยหมากและธูปเทียน \"\n        \"อันยานมีเรือเปนต้นฃ้ามยาก ฯ เพราะว่าแม่น้ำนั่นมีน้ำใสยิ่ง แม้เพียง\"},\n       {\"Vietnamese\",\n        \"vợ đến tai mụ hung thần Xăng-tô- mê-a. Mụ vô cùng \"\n        \"chiếc xe con gấu chạy qua nhà. Nhưng thỉnh thoảng \"\n        \"hòa hoãn với người Pháp để cho họ được dựng một ngôi nhà thờ nhỏ bằng \"\n        \"Cặp câu đói súc tích mà sâu sắc, là lời chúc lời\"}});\n\n  for (const auto &p : kScriptText) {\n    std::string normalized;\n    EXPECT_TRUE(tesseract::NormalizeUTF8String(\n        tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,\n        tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized))\n        << \"Script=\" << p.first << \" text=\" << p.second;\n  }\n}\n\nTEST(NormstrngsTest, IsWhitespace) {\n  // U+0020 is whitespace\n  EXPECT_TRUE(IsWhitespace(' '));\n  EXPECT_TRUE(IsWhitespace('\\t'));\n  EXPECT_TRUE(IsWhitespace('\\r'));\n  EXPECT_TRUE(IsWhitespace('\\n'));\n  // U+2000 through U+200A\n  for (char32 ch = 0x2000; ch <= 0x200A; ++ch) {\n    char text[80];\n    snprintf(text, sizeof(text), \"Failed at U+%x\", ch);\n    SCOPED_TRACE(text);\n    EXPECT_TRUE(IsWhitespace(ch));\n  }\n  // U+3000 is whitespace\n  EXPECT_TRUE(IsWhitespace(0x3000));\n  // ZWNBSP is not considered a space.\n  EXPECT_FALSE(IsWhitespace(0xFEFF));\n}\n\nTEST(NormstrngsTest, SpanUTF8Whitespace) {\n  EXPECT_EQ(4, SpanUTF8Whitespace(\" \\t\\r\\n\"));\n  EXPECT_EQ(4, SpanUTF8Whitespace(\" \\t\\r\\nabc\"));\n  EXPECT_EQ(0, SpanUTF8Whitespace(\"abc \\t\\r\\nabc\"));\n  EXPECT_EQ(0, SpanUTF8Whitespace(\"\"));\n}\n\nTEST(NormstrngsTest, SpanUTF8NotWhitespace) {\n  const char kHinText[] = \"पिताने विवाह\";\n  const char kKorText[] = \"이는 것으로 다시 넣을\";\n  const char kMixedText[] = \"والفكر 123 والصراع 
abc\";\n\n  EXPECT_EQ(0, SpanUTF8NotWhitespace(\"\"));\n  EXPECT_EQ(0, SpanUTF8NotWhitespace(\" abc\"));\n  EXPECT_EQ(0, SpanUTF8NotWhitespace(\"\\rabc\"));\n  EXPECT_EQ(0, SpanUTF8NotWhitespace(\"\\tabc\"));\n  EXPECT_EQ(0, SpanUTF8NotWhitespace(\"\\nabc\"));\n  EXPECT_EQ(3, SpanUTF8NotWhitespace(\"abc def\"));\n  EXPECT_EQ(18, SpanUTF8NotWhitespace(kHinText));\n  EXPECT_EQ(6, SpanUTF8NotWhitespace(kKorText));\n  EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));\n}\n\n// Test that the method clones the util/utf8/unilib definition of\n// interchange validity.\nTEST(NormstrngsTest, IsInterchangeValid) {\n#ifdef INCLUDE_TENSORFLOW\n  const int32_t kMinUnicodeValue = 33;\n  const int32_t kMaxUnicodeValue = 0x10FFFF;\n  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {\n    char text[80];\n    snprintf(text, sizeof(text), \"Failed at U+%x\", ch);\n    SCOPED_TRACE(text);\n    EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch));\n  }\n#else\n  GTEST_SKIP();\n#endif\n}\n\n// Test that the method clones the util/utf8/unilib definition of\n// 7-bit ASCII interchange validity.\nTEST(NormstrngsTest, IsInterchangeValid7BitAscii) {\n#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)\n  const int32_t kMinUnicodeValue = 33;\n  const int32_t kMaxUnicodeValue = 0x10FFFF;\n  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {\n    char text[80];\n    snprintf(text, sizeof(text), \"Failed at U+%x\", ch);\n    SCOPED_TRACE(text);\n    std::string str = EncodeAsUTF8(ch);\n    EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str), IsInterchangeValid7BitAscii(ch));\n  }\n#else\n  // Skipped because of missing UniLib::IsInterchangeValid7BitAscii.\n  GTEST_SKIP();\n#endif\n}\n\n// Test that the method clones the util/utf8/unilib definition of\n// fullwidth-halfwidth .\nTEST(NormstrngsTest, FullwidthToHalfwidth) {\n  // U+FF21 -> U+0041 (Latin capital letter A)\n  EXPECT_EQ('A', FullwidthToHalfwidth(0xFF21));\n  // U+FF05 -> U+0025 
(percent sign)\n  EXPECT_EQ('%', FullwidthToHalfwidth(0xFF05));\n  // U+FFE6 -> U+20A9 (won sign)\n  EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));\n\n#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)\n  // Skipped because of missing UniLib::FullwidthToHalfwidth.\n  const int32_t kMinUnicodeValue = 33;\n  const int32_t kMaxUnicodeValue = 0x10FFFF;\n  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {\n    if (!IsValidCodepoint(ch))\n      continue;\n    char text[80];\n    snprintf(text, sizeof(text), \"Failed at U+%x\", ch);\n    SCOPED_TRACE(text);\n    std::string str = EncodeAsUTF8(ch);\n    const std::string expected_half_str =\n        UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true);\n    EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch)));\n  }\n#endif\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/normstrngs_test.h",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_\n#define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_\n\n#include <tesseract/unichar.h>\n#include <sstream> // for std::stringstream\n#include <string>\n#include <vector>\n\nnamespace tesseract {\n\ninline std::string CodepointList(const std::vector<char32> &str32) {\n  std::stringstream result;\n  int total_chars = str32.size();\n  result << std::hex;\n  for (int i = 0; i < total_chars; ++i) {\n    result << \"[\" << str32[i] << \"]\";\n  }\n  return result.str();\n}\n\ninline std::string PrintString32WithUnicodes(const std::string &str) {\n  std::vector<char32> str32 = UNICHAR::UTF8ToUTF32(str.c_str());\n  std::string s = \"\\\"\";\n  s += \"\\\" \" + CodepointList(str32);\n  return s;\n}\n\ninline std::string PrintStringVectorWithUnicodes(const std::vector<std::string> &glyphs) {\n  std::string result;\n  for (const auto &s : glyphs) {\n    result += \"Glyph:\";\n    result += PrintString32WithUnicodes(s) + \"\\n\";\n  }\n  return result;\n}\n\ninline void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode,\n                                      int unicode_count, int glyph_count, int grapheme_count,\n                                      const std::string &target_str) {\n  std::vector<std::string> glyphs;\n  std::string s;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(\n      u_mode, OCRNorm::kNone, 
GraphemeNormMode::kIndividualUnicodes, true, str.c_str(), &glyphs));\n  EXPECT_EQ(glyphs.size(), unicode_count) << PrintStringVectorWithUnicodes(glyphs);\n  for (auto &glyph : glyphs) {\n    s += glyph;\n  }\n  EXPECT_EQ(target_str, s);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,\n                                           true, str.c_str(), &glyphs));\n  EXPECT_EQ(glyphs.size(), glyph_count) << PrintStringVectorWithUnicodes(glyphs);\n  s.clear();\n  for (auto &glyph : glyphs) {\n    s += glyph;\n  }\n  EXPECT_EQ(target_str, s);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kCombined,\n                                           true, str.c_str(), &glyphs));\n  EXPECT_EQ(glyphs.size(), grapheme_count) << PrintStringVectorWithUnicodes(glyphs);\n  s.clear();\n  for (auto &glyph : glyphs) {\n    s += glyph;\n  }\n  EXPECT_EQ(target_str, s);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kSingleString,\n                                           true, str.c_str(), &glyphs));\n  EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs);\n  EXPECT_EQ(target_str, glyphs[0]);\n  std::string result;\n  EXPECT_TRUE(\n      NormalizeUTF8String(u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result));\n  EXPECT_EQ(target_str, result);\n}\n\n} // namespace tesseract\n\n#endif // TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_\n"
  },
  {
    "path": "unittest/nthitem_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"kdpair.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nint test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0, -32767, 6, 7};\n\n// The fixture for testing GenericHeap and DoublePtr.\nclass NthItemTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\npublic:\n  ~NthItemTest() override;\n  // Pushes the test data onto the KDVector.\n  void PushTestData(KDVector *v) {\n    for (size_t i = 0; i < countof(test_data); ++i) {\n      IntKDPair pair(test_data[i], i);\n      v->push_back(pair);\n    }\n  }\n};\n\n// Destructor.\n// It is defined here, so the compiler can create a single vtable\n// instead of a weak vtable (fixes compiler warning).\nNthItemTest::~NthItemTest() = default;\n\n// Tests basic results.\nTEST_F(NthItemTest, GeneralTest) {\n  KDVector v;\n  // Push the test data onto the KDVector.\n  PushTestData(&v);\n  // Get the min item.\n  size_t index = 0;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is -32767.\n  EXPECT_EQ(-32767, v[index].key());\n  // Get the max item.\n  index = v.size() - 1;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is 65536.\n  EXPECT_EQ(65536, v[index].key());\n}\n\n// Tests results on boring data with lots of duplication.\nTEST_F(NthItemTest, BoringTest) {\n  KDVector v;\n  // 
Push the test data onto the KDVector.\n  int test_data[] = {8, 8, 8, 8, 8, 7, 7, 7, 7};\n  for (size_t i = 0; i < countof(test_data); ++i) {\n    IntKDPair pair(test_data[i], i);\n    v.push_back(pair);\n  }\n  // The 3rd item is 7 but the 4th is 8..\n  size_t index = 3;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is 7.\n  EXPECT_EQ(7, v[index].key());\n  index = 4;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is 8.\n  EXPECT_EQ(8, v[index].key());\n  // Get the min item.\n  index = 0;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is 7.\n  EXPECT_EQ(7, v[index].key());\n  // Get the max item.\n  index = v.size() - 1;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is 8.\n  EXPECT_EQ(8, v[index].key());\n}\n\n// Tests that a unique median in an odd-size array is found correctly.\nTEST_F(NthItemTest, UniqueTest) {\n  KDVector v;\n  // Push the test data onto the KDVector.\n  PushTestData(&v);\n  // Get the median item.\n  size_t index = v.size() / 2;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is 6, it started out at index 11.\n  EXPECT_EQ(6, v[index].key());\n  EXPECT_EQ(11, v[index].data());\n}\n\n// Tests that an equal median is found correctly.\nTEST_F(NthItemTest, EqualTest) {\n  KDVector v;\n  // Push the test data onto the KDVector.\n  PushTestData(&v);\n  // Add an extra 8. This makes the median 7.\n  IntKDPair pair(8, 13);\n  v.push_back(pair);\n  // Get the median item.\n  size_t index = v.size() / 2;\n  std::nth_element(v.begin(), v.begin() + index, v.end());\n  // The result is 7, it started out at index 4 or 12.\n  EXPECT_EQ(7, v[index].key());\n  EXPECT_TRUE(v[index].data() == 4 || v[index].data() == 12);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/osd_test.cc",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        osd_test.cc\n// Description: OSD Tests for Tesseract.\n// Author:      ShreeDevi Kumar\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// based on https://gist.github.com/amitdo/7c7a522004dd79b398340c9595b377e1\n\n// expects clones of tessdata, tessdata_fast and tessdata_best repos\n\n//#include \"log.h\"\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include <iostream>\n#include <memory> // std::unique_ptr\n#include <string>\n#include \"include_gunit.h\"\n#include \"image.h\"\n\nnamespace tesseract {\n\nclass TestClass : public testing::Test {\nprotected:\n};\n\n#ifndef DISABLED_LEGACY_ENGINE\nstatic void OSDTester(int expected_deg, const char *imgname, const char *tessdatadir) {\n  // log.info() << tessdatadir << \" for image: \" << imgname << std::endl;\n  auto api = std::make_unique<tesseract::TessBaseAPI>();\n  ASSERT_FALSE(api->Init(tessdatadir, \"osd\")) << \"Could not initialize tesseract.\";\n  Image image = pixRead(imgname);\n  ASSERT_TRUE(image != nullptr) << \"Failed to read test image.\";\n  api->SetImage(image);\n  int orient_deg;\n  float orient_conf;\n  const char *script_name;\n  float script_conf;\n  bool detected =\n      api->DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf);\n  ASSERT_FALSE(!detected) << \"Failed to detect 
OSD.\";\n  printf(\n      \"************ Orientation in degrees: %d, Orientation confidence: %.2f\\n\"\n      \"             Script: %s, Script confidence: %.2f\\n\",\n      orient_deg, orient_conf, script_name, script_conf);\n  EXPECT_EQ(expected_deg, orient_deg);\n  api->End();\n  image.destroy();\n}\n#endif\n\nclass OSDTest : public TestClass,\n                public ::testing::WithParamInterface<std::tuple<int, const char *, const char *>> {\n};\n\nTEST_P(OSDTest, MatchOrientationDegrees) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because TessBaseAPI::DetectOrientationScript is missing.\n  GTEST_SKIP();\n#else\n  OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()), std::get<2>(GetParam()));\n#endif\n}\n\nINSTANTIATE_TEST_SUITE_P(TessdataEngEuroHebrew, OSDTest,\n                         ::testing::Combine(::testing::Values(0),\n                                            ::testing::Values(TESTING_DIR \"/phototest.tif\",\n                                                              TESTING_DIR \"/eurotext.tif\",\n                                                              TESTING_DIR \"/hebrew.png\"),\n                                            ::testing::Values(TESSDATA_DIR)));\n\nINSTANTIATE_TEST_SUITE_P(TessdataBestEngEuroHebrew, OSDTest,\n                         ::testing::Combine(::testing::Values(0),\n                                            ::testing::Values(TESTING_DIR \"/phototest.tif\",\n                                                              TESTING_DIR \"/eurotext.tif\",\n                                                              TESTING_DIR \"/hebrew.png\"),\n                                            ::testing::Values(TESSDATA_DIR \"_best\")));\n\nINSTANTIATE_TEST_SUITE_P(TessdataFastEngEuroHebrew, OSDTest,\n                         ::testing::Combine(::testing::Values(0),\n                                            ::testing::Values(TESTING_DIR \"/phototest.tif\",\n                                                     
         TESTING_DIR \"/eurotext.tif\",\n                                                              TESTING_DIR \"/hebrew.png\"),\n                                            ::testing::Values(TESSDATA_DIR \"_fast\")));\n\nINSTANTIATE_TEST_SUITE_P(TessdataFastRotated90, OSDTest,\n                         ::testing::Combine(::testing::Values(90),\n                                            ::testing::Values(TESTING_DIR\n                                                              \"/phototest-rotated-R.png\"),\n                                            ::testing::Values(TESSDATA_DIR \"_fast\")));\n\nINSTANTIATE_TEST_SUITE_P(TessdataFastRotated180, OSDTest,\n                         ::testing::Combine(::testing::Values(180),\n                                            ::testing::Values(TESTING_DIR\n                                                              \"/phototest-rotated-180.png\"),\n                                            ::testing::Values(TESSDATA_DIR \"_fast\")));\n\nINSTANTIATE_TEST_SUITE_P(TessdataFastRotated270, OSDTest,\n                         ::testing::Combine(::testing::Values(270),\n                                            ::testing::Values(TESTING_DIR\n                                                              \"/phototest-rotated-L.png\"),\n                                            ::testing::Values(TESSDATA_DIR \"_fast\")));\n\nINSTANTIATE_TEST_SUITE_P(TessdataFastDevaRotated270, OSDTest,\n                         ::testing::Combine(::testing::Values(270),\n                                            ::testing::Values(TESTING_DIR\n                                                              \"/devatest-rotated-270.png\"),\n                                            ::testing::Values(TESSDATA_DIR \"_fast\")));\n\nINSTANTIATE_TEST_SUITE_P(TessdataFastDeva, OSDTest,\n                         ::testing::Combine(::testing::Values(0),\n                                            ::testing::Values(TESTING_DIR 
\"/devatest.png\"),\n                                            ::testing::Values(TESSDATA_DIR \"_fast\")));\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/pagesegmode_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include <filesystem>\n#include <string>\n#include \"helpers.h\"\n#include \"include_gunit.h\"\n#include \"image.h\"\n#include \"log.h\"\n\nnamespace tesseract {\n\n// The fixture for testing Tesseract.\nclass PageSegModeTest : public testing::Test {\nprotected:\n  PageSegModeTest() = default;\n  ~PageSegModeTest() override {\n    src_pix_.destroy();\n  }\n\n  void SetUp() override {\n    static std::locale system_locale(\"\");\n    std::locale::global(system_locale);\n  }\n\n  void SetImage(const char *filename) {\n    src_pix_.destroy();\n    src_pix_ = pixRead(filename);\n    api_.Init(TESSDATA_DIR, \"eng\", tesseract::OEM_TESSERACT_ONLY);\n    api_.SetImage(src_pix_);\n  }\n\n  // Tests that the given rectangle produces exactly the given text in the\n  // given segmentation mode (after chopping off the last 2 newlines.)\n  void VerifyRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width,\n                      int height) {\n    api_.SetPageSegMode(mode);\n    api_.SetRectangle(left, top, width, height);\n    char *result = api_.GetUTF8Text();\n    chomp_string(result);\n    chomp_string(result);\n    EXPECT_STREQ(str, result);\n    delete[] result;\n  }\n\n  // Tests that the given rectangle does NOT produce the given text in the\n  // given segmentation mode.\n  void 
NotRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width,\n                   int height) {\n    api_.SetPageSegMode(mode);\n    api_.SetRectangle(left, top, width, height);\n    char *result = api_.GetUTF8Text();\n    EXPECT_STRNE(str, result);\n    delete[] result;\n  }\n\n  Image src_pix_ = nullptr;\n  std::string ocr_text_;\n  tesseract::TessBaseAPI api_;\n};\n\n// Tests the single-word segmentation mode, and that it performs correctly\n// and differently to line and block mode.\nTEST_F(PageSegModeTest, WordTest) {\n  std::string filename = file::JoinPath(TESTING_DIR, \"segmodeimg.tif\");\n  if (!std::filesystem::exists(filename)) {\n    LOG(INFO) << \"Skip test because of missing \" << filename << '\\n';\n    GTEST_SKIP();\n  } else {\n    SetImage(filename.c_str());\n    // Test various rectangles around the inverse page number.\n    VerifyRectText(tesseract::PSM_SINGLE_WORD, \"183\", 1419, 264, 69, 34);\n    VerifyRectText(tesseract::PSM_SINGLE_WORD, \"183\", 1411, 252, 78, 62);\n    VerifyRectText(tesseract::PSM_SINGLE_WORD, \"183\", 1396, 218, 114, 102);\n    // Test a random pair of words as a line\n    VerifyRectText(tesseract::PSM_SINGLE_LINE, \"What should\", 237, 393, 256, 36);\n  #ifdef DISABLED_LEGACY_ENGINE\n    // Skip check as LSTM mode adds a space.\n    LOG(INFO) << \"Skip `Whatshould` test in LSTM Mode\\n\";\n  #else\n    // Test a random pair of words as a word\n    VerifyRectText(tesseract::PSM_SINGLE_WORD, \"Whatshould\", 237, 393, 256, 36);\n  #endif\n    // Test single block mode.\n    VerifyRectText(tesseract::PSM_SINGLE_BLOCK, \"both the\\nfrom the\", 237, 450, 172, 94);\n    // But doesn't work in line or word mode.\n    NotRectText(tesseract::PSM_SINGLE_LINE, \"both the\\nfrom the\", 237, 450, 172, 94);\n    NotRectText(tesseract::PSM_SINGLE_WORD, \"both the\\nfrom the\", 237, 450, 172, 94);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/pango_font_info_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"pango_font_info.h\"\n#include <pango/pango.h>\n#include <cstdio>\n#include <string>\n#include \"commandlineflags.h\"\n#include \"fileio.h\"\n#include \"gmock/gmock-matchers.h\" // for EXPECT_THAT\n#include \"include_gunit.h\"\n#ifdef INCLUDE_TENSORFLOW\n#  include \"util/utf8/unicodetext.h\" // for UnicodeText\n#endif\n\nnamespace tesseract {\n\n// Fonts in testdata directory\nconst char *kExpectedFontNames[] = {\"Arab\",\n                                    \"Arial Bold Italic\",\n                                    \"DejaVu Sans Ultra-Light\",\n                                    \"Lohit Hindi\",\n#if PANGO_VERSION <= 12005\n                                    \"Times New Roman\",\n#else\n                                    \"Times New Roman,\", // Pango v1.36.2 requires a trailing ','\n#endif\n                                    \"UnBatang\",\n                                    \"Verdana\"};\n\n// Sample text used in tests.\nconst char kArabicText[] = \"والفكر والصراع 1234,\\nوالفكر والصراع\";\nconst char kEngText[] = \"the quick brown fox jumps over the lazy dog\";\nconst char kHinText[] = \"पिताने विवाह की | हो गई उद्विग्न वह सोचा\";\nconst char kKorText[] = \"이는 것으로\";\n// Hindi words containing illegal vowel sequences.\nconst char *kBadlyFormedHinWords[] = {\n#if PANGO_VERSION <= 12005\n    \"उपयोक्ताो\", \"नहीें\", \"कहीअे\", \"पत्रिाका\", 
\"छह्णाीस\",\n#endif\n    // Pango v1.36.2 will render the above words even though they are invalid.\n    \"प्रंात\", nullptr};\n\nstatic PangoFontMap *font_map;\n\nclass PangoFontInfoTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    if (!font_map) {\n      font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);\n    }\n    pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));\n  }\n\n  // Creates a fake fonts.conf file that points to the testdata fonts for\n  // fontconfig to initialize with.\n  static void SetUpTestCase() {\n    static std::locale system_locale(\"\");\n    std::locale::global(system_locale);\n\n    FLAGS_fonts_dir = TESTING_DIR;\n    FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;\n    file::MakeTmpdir();\n    PangoFontInfo::SoftInitFontConfig(); // init early\n  }\n\n  PangoFontInfo font_info_;\n};\n\nTEST_F(PangoFontInfoTest, TestNonDefaultConstructor) {\n  PangoFontInfo font(\"Arial Bold Italic 12\");\n  EXPECT_EQ(12, font.font_size());\n  EXPECT_EQ(\"Arial\", font.family_name());\n}\n\nTEST_F(PangoFontInfoTest, DoesParseFontDescriptionName) {\n  EXPECT_TRUE(font_info_.ParseFontDescriptionName(\"Arial Bold Italic 12\"));\n  EXPECT_EQ(12, font_info_.font_size());\n  EXPECT_EQ(\"Arial\", font_info_.family_name());\n\n  EXPECT_TRUE(font_info_.ParseFontDescriptionName(\"Verdana 10\"));\n  EXPECT_EQ(10, font_info_.font_size());\n  EXPECT_EQ(\"Verdana\", font_info_.family_name());\n\n  EXPECT_TRUE(font_info_.ParseFontDescriptionName(\"DejaVu Sans Ultra-Light\"));\n  EXPECT_EQ(\"DejaVu Sans\", font_info_.family_name());\n}\n\nTEST_F(PangoFontInfoTest, DoesParseMissingFonts) {\n  // Font family one of whose faces exists but this one doesn't.\n  EXPECT_TRUE(font_info_.ParseFontDescriptionName(\"Arial Italic 12\"));\n  EXPECT_EQ(12, font_info_.font_size());\n  EXPECT_EQ(\"Arial\", font_info_.family_name());\n\n  // Font family that doesn't exist in testdata. 
It will still parse the\n  // description name. But without the file, it will not be able to populate\n  // some font family details, like is_monospace().\n  EXPECT_TRUE(font_info_.ParseFontDescriptionName(\"Georgia 10\"));\n  EXPECT_EQ(10, font_info_.font_size());\n  EXPECT_EQ(\"Georgia\", font_info_.family_name());\n}\n\nTEST_F(PangoFontInfoTest, DoesGetSpacingProperties) {\n  EXPECT_TRUE(font_info_.ParseFontDescriptionName(\"Arial Italic 12\"));\n  int x_bearing, x_advance;\n  EXPECT_TRUE(font_info_.GetSpacingProperties(\"A\", &x_bearing, &x_advance));\n  EXPECT_GT(x_advance, 0);\n  EXPECT_TRUE(font_info_.GetSpacingProperties(\"a\", &x_bearing, &x_advance));\n  EXPECT_GT(x_advance, 0);\n}\n\nTEST_F(PangoFontInfoTest, CanRenderString) {\n  font_info_.ParseFontDescriptionName(\"Verdana 12\");\n  EXPECT_TRUE(font_info_.CanRenderString(kEngText, strlen(kEngText)));\n\n  font_info_.ParseFontDescriptionName(\"UnBatang 12\");\n  EXPECT_TRUE(font_info_.CanRenderString(kKorText, strlen(kKorText)));\n\n  font_info_.ParseFontDescriptionName(\"Lohit Hindi 12\");\n  EXPECT_TRUE(font_info_.CanRenderString(kHinText, strlen(kHinText)));\n}\n\nTEST_F(PangoFontInfoTest, CanRenderLigature) {\n  font_info_.ParseFontDescriptionName(\"Arab 12\");\n  const char kArabicLigature[] = \"لا\";\n  EXPECT_TRUE(font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));\n\n  printf(\"Next word\\n\");\n  EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));\n}\n\nTEST_F(PangoFontInfoTest, CannotRenderUncoveredString) {\n  font_info_.ParseFontDescriptionName(\"Verdana 12\");\n  EXPECT_FALSE(font_info_.CanRenderString(kKorText, strlen(kKorText)));\n}\n\nTEST_F(PangoFontInfoTest, CannotRenderInvalidString) {\n  font_info_.ParseFontDescriptionName(\"Lohit Hindi 12\");\n  for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) {\n    EXPECT_FALSE(\n        font_info_.CanRenderString(kBadlyFormedHinWords[i], strlen(kBadlyFormedHinWords[i])))\n        << \"Can render 
\" << kBadlyFormedHinWords[i];\n  }\n}\n\nTEST_F(PangoFontInfoTest, CanDropUncoveredChars) {\n  font_info_.ParseFontDescriptionName(\"Verdana 12\");\n  // Verdana cannot render the \"ff\" ligature\n  std::string word = \"oﬀice\";\n  EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));\n  EXPECT_EQ(\"oice\", word);\n\n  // Don't drop non-letter characters like word joiners.\n  const char *kJoiners[] = {\n      \"\\u2060\", // U+2060 (WJ)\n      \"\\u200C\", // U+200C (ZWNJ)\n      \"\\u200D\"  // U+200D (ZWJ)\n  };\n  for (auto &kJoiner : kJoiners) {\n    word = kJoiner;\n    EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));\n    EXPECT_STREQ(kJoiner, word.c_str());\n  }\n}\n\n// ------------------------ FontUtils ------------------------------------\n\nclass FontUtilsTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    file::MakeTmpdir();\n  }\n  // Creates a fake fonts.conf file that points to the testdata fonts for\n  // fontconfig to initialize with.\n  static void SetUpTestCase() {\n    FLAGS_fonts_dir = TESTING_DIR;\n    FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;\n    if (!font_map) {\n      font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);\n    }\n    pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));\n  }\n\n#ifdef INCLUDE_TENSORFLOW\n  void CountUnicodeChars(const char *utf8_text, std::unordered_map<char32, int64_t> *ch_map) {\n    ch_map->clear();\n    UnicodeText ut;\n    ut.PointToUTF8(utf8_text, strlen(utf8_text));\n    for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {\n#  if 0\n      if (UnicodeProps::IsWhitespace(*it)) continue;\n#  else\n      if (std::isspace(*it))\n        continue;\n#  endif\n      ++(*ch_map)[*it];\n    }\n  }\n#endif\n};\n\nTEST_F(FontUtilsTest, DoesFindAvailableFonts) {\n  EXPECT_TRUE(FontUtils::IsAvailableFont(\"Arial Bold Italic\"));\n  EXPECT_TRUE(FontUtils::IsAvailableFont(\"Verdana\"));\n  EXPECT_TRUE(FontUtils::IsAvailableFont(\"DejaVu 
Sans Ultra-Light\"));\n\n  // Test that we can support font name convention for Pango v1.30.2 even when\n  // we are running an older version.\n  EXPECT_TRUE(FontUtils::IsAvailableFont(\"Times New Roman,\"));\n}\n\nTEST_F(FontUtilsTest, DoesDetectMissingFonts) {\n  // Only bold italic face is available.\n  EXPECT_FALSE(FontUtils::IsAvailableFont(\"Arial\"));\n  // Don't have a ttf for the Courier family.\n  EXPECT_FALSE(FontUtils::IsAvailableFont(\"Courier\"));\n  // Pango \"synthesizes\" the italic font from the available Verdana Regular and\n  // includes it in its list, but it is not really loadable.\n  EXPECT_FALSE(FontUtils::IsAvailableFont(\"Verdana Italic\"));\n  // We have \"Dejavu Sans Ultra-Light\" but not its medium weight counterpart.\n  EXPECT_FALSE(FontUtils::IsAvailableFont(\"DejaVu Sans\"));\n}\n\nTEST_F(FontUtilsTest, DoesListAvailableFonts) {\n  const std::vector<std::string> &fonts = FontUtils::ListAvailableFonts();\n  EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));\n  for (auto &font : fonts) {\n    PangoFontInfo font_info;\n    EXPECT_TRUE(font_info.ParseFontDescriptionName(font));\n  }\n}\n\n#ifdef INCLUDE_TENSORFLOW\nTEST_F(FontUtilsTest, DoesFindBestFonts) {\n  std::string fonts_list;\n  std::unordered_map<char32, int64_t> ch_map;\n  CountUnicodeChars(kEngText, &ch_map);\n  EXPECT_EQ(26, ch_map.size()); // 26 letters\n  std::vector<std::pair<const char *, std::vector<bool> > > font_flags;\n  std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);\n  EXPECT_TRUE(best_list.size());\n  // All fonts except Lohit Hindi should render English text.\n  EXPECT_EQ(countof(kExpectedFontNames) - 1, font_flags.size());\n\n  CountUnicodeChars(kKorText, &ch_map);\n  best_list = FontUtils::BestFonts(ch_map, &font_flags);\n  EXPECT_TRUE(best_list.size());\n  // Only UnBatang font family is able to render korean.\n  EXPECT_EQ(1, font_flags.size());\n  EXPECT_STREQ(\"UnBatang\", 
font_flags[0].first);\n}\n#endif\n\nTEST_F(FontUtilsTest, DoesSelectFont) {\n  const char *kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr};\n  const char *kLangNames[] = {\"Arabic\", \"English\", \"Hindi\", \"Korean\", nullptr};\n  for (int i = 0; kLangText[i] != nullptr; ++i) {\n    SCOPED_TRACE(kLangNames[i]);\n    std::vector<std::string> graphemes;\n    std::string selected_font;\n    EXPECT_TRUE(\n        FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), &selected_font, &graphemes));\n    EXPECT_TRUE(selected_font.size());\n    EXPECT_TRUE(graphemes.size());\n  }\n}\n\nTEST_F(FontUtilsTest, DoesFailToSelectFont) {\n  const char kMixedScriptText[] = \"पिताने विवाह की | والفكر والصراع\";\n  std::vector<std::string> graphemes;\n  std::string selected_font;\n  EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), &selected_font,\n                                     &graphemes));\n}\n\n#if 0\n// Needs fix. FontUtils::GetAllRenderableCharacters was removed\n// because of deprecated pango_coverage_max.\nTEST_F(FontUtilsTest, GetAllRenderableCharacters) {\n  const int32_t kHindiChar = 0x0905;\n  const int32_t kArabicChar = 0x0623;\n  const int32_t kMongolianChar = 0x180E;  // Mongolian vowel separator\n  const int32_t kOghamChar = 0x1680;      // Ogham space mark\n  std::vector<bool> unicode_mask;\n  FontUtils::GetAllRenderableCharacters(&unicode_mask);\n  EXPECT_TRUE(unicode_mask['A']);\n  EXPECT_TRUE(unicode_mask['1']);\n  EXPECT_TRUE(unicode_mask[kHindiChar]);\n  EXPECT_TRUE(unicode_mask[kArabicChar]);\n  EXPECT_FALSE(unicode_mask[kMongolianChar]);  // no font for mongolian.\n#  if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham\n  EXPECT_FALSE(unicode_mask[kOghamChar]);      // no font for ogham.\n#  endif\n  unicode_mask.clear();\n\n  std::vector<std::string> selected_fonts;\n  selected_fonts.push_back(\"Lohit Hindi\");\n  FontUtils::GetAllRenderableCharacters(selected_fonts, 
&unicode_mask);\n  EXPECT_TRUE(unicode_mask['1']);\n  EXPECT_TRUE(unicode_mask[kHindiChar]);\n  EXPECT_FALSE(unicode_mask['A']);             // Lohit doesn't render English,\n  EXPECT_FALSE(unicode_mask[kArabicChar]);     // or Arabic,\n  EXPECT_FALSE(unicode_mask[kMongolianChar]);  // or Mongolian,\n  EXPECT_FALSE(unicode_mask[kOghamChar]);      // or Ogham.\n  unicode_mask.clear();\n\n  // Check that none of the included fonts cover the Mongolian or Ogham space\n  // characters.\n  for (size_t f = 0; f < countof(kExpectedFontNames); ++f) {\n    std::string tracestring = \"Testing \" + kExpectedFontNames[f];\n    SCOPED_TRACE(tracestring);\n    FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);\n#  if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham\n    EXPECT_FALSE(unicode_mask[kOghamChar]);\n#  endif\n    EXPECT_FALSE(unicode_mask[kMongolianChar]);\n    unicode_mask.clear();\n  }\n}\n#endif\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/paragraphs_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <string> // for std::string\n\n#include \"include_gunit.h\" // for TEST\n#include \"log.h\"           // for LOG\n\n// ccmain\n#include \"paragraphs.h\"\n#include \"paragraphs_internal.h\"\n// ccstruct\n#include \"ocrpara.h\"\n\nnamespace tesseract {\n\n// Functions for making monospace ASCII trial text for the paragraph detector.\nconst ParagraphJustification kLeft = JUSTIFICATION_LEFT;\nconst ParagraphJustification kCenter = JUSTIFICATION_CENTER;\nconst ParagraphJustification kRight = JUSTIFICATION_RIGHT;\nconst ParagraphJustification kUnknown = JUSTIFICATION_UNKNOWN;\n\nenum TextModelInputType {\n  PCONT = 0,  // Continuation line of a paragraph (default).\n  PSTART = 1, // First line of a paragraph.\n  PNONE = 2,  // Not a paragraph line.\n};\n\nstruct TextAndModel {\n  const char *ascii;\n  TextModelInputType model_type;\n\n  // fields corresponding to PARA (see ccstruct/ocrpara.h)\n  ParagraphModel model;\n  bool is_very_first_or_continuation;\n  bool is_list_item;\n};\n\n// Imagine that the given text is typewriter ASCII with each character ten\n// pixels wide and twenty pixels high and return an appropriate row_info.\nvoid AsciiToRowInfo(const char *text, int row_number, RowInfo *info) {\n  const int kCharWidth = 10;\n  const int kLineSpace = 30;\n  info->text = text;\n  info->has_leaders = strstr(text, \"...\") != nullptr || strstr(text, \". . 
.\") != nullptr;\n  info->has_drop_cap = false;\n  info->pix_ldistance = info->pix_rdistance = 0;\n  info->average_interword_space = kCharWidth;\n  info->pix_xheight = kCharWidth;\n  info->lword_text = info->rword_text = \"\";\n  info->ltr = true;\n\n  std::vector<std::string> words = split(text, ' ');\n  info->num_words = words.size();\n  if (info->num_words < 1) {\n    return;\n  }\n\n  info->lword_text = words[0].c_str();\n  info->rword_text = words[words.size() - 1].c_str();\n  unsigned lspace = 0;\n  while (lspace < info->text.size() && text[lspace] == ' ') {\n    lspace++;\n  }\n  unsigned rspace = 0;\n  while (rspace < info->text.size() && text[info->text.size() - rspace - 1] == ' ') {\n    rspace++;\n  }\n\n  int top = -kLineSpace * row_number;\n  int bottom = top - kLineSpace;\n  int row_right = kCharWidth * info->text.size();\n  int lword_width = kCharWidth * info->lword_text.size();\n  int rword_width = kCharWidth * info->rword_text.size();\n  info->pix_ldistance = lspace * kCharWidth;\n  info->pix_rdistance = rspace * kCharWidth;\n  info->lword_box = TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top);\n  info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom,\n                         row_right - info->pix_rdistance, top);\n  LeftWordAttributes(nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item,\n                     &info->lword_likely_starts_idea, &info->lword_likely_ends_idea);\n  RightWordAttributes(nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item,\n                      &info->rword_likely_starts_idea, &info->rword_likely_ends_idea);\n}\n\nvoid MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo> *output) {\n  output->clear();\n  RowInfo info;\n  for (int i = 0; i < n; i++) {\n    AsciiToRowInfo(row_infos[i].ascii, i, &info);\n    output->push_back(info);\n  }\n}\n\n// Given n rows of reference ground truth, evaluate whether the n rows\n// of 
PARA * pointers yield the same paragraph breakpoints.\nvoid EvaluateParagraphDetection(const TextAndModel *correct, int n,\n                                const std::vector<PARA *> &detector_output) {\n  int incorrect_breaks = 0;\n  int missed_breaks = 0;\n  int poorly_matched_models = 0;\n  int bad_crowns = 0;\n  int bad_list_items = 0;\n  ASSERT_EQ(detector_output.size(), n);\n  for (int i = 1; i < n; i++) {\n    bool has_break = correct[i].model_type != PCONT;\n    bool detected_break = (detector_output[i - 1] != detector_output[i]);\n    if (has_break && !detected_break) {\n      missed_breaks++;\n    }\n    if (detected_break && !has_break) {\n      incorrect_breaks++;\n    }\n    if (has_break) {\n      if (correct[i].model_type == PNONE) {\n        if (detector_output[i]->model != nullptr) {\n          poorly_matched_models++;\n        }\n      } else {\n        if (correct[i].model.justification() != kUnknown &&\n            (detector_output[i]->model == nullptr ||\n             !correct[i].model.Comparable(*detector_output[i]->model))) {\n          poorly_matched_models++;\n        }\n      }\n      if (correct[i].is_very_first_or_continuation ^\n          detector_output[i]->is_very_first_or_continuation) {\n        bad_crowns++;\n      }\n      if (correct[i].is_list_item ^ detector_output[i]->is_list_item) {\n        bad_list_items++;\n      }\n    }\n  }\n  EXPECT_EQ(incorrect_breaks, 0);\n  EXPECT_EQ(missed_breaks, 0);\n  EXPECT_EQ(poorly_matched_models, 0);\n  EXPECT_EQ(bad_list_items, 0);\n  EXPECT_EQ(bad_crowns, 0);\n  if (incorrect_breaks || missed_breaks || poorly_matched_models || bad_list_items || bad_crowns) {\n    std::vector<std::string> dbg_lines;\n    dbg_lines.emplace_back(\"# ==========================\");\n    dbg_lines.emplace_back(\"# Correct paragraph breaks:\");\n    dbg_lines.emplace_back(\"# ==========================\");\n    for (int i = 0; i < n; i++) {\n      if (correct[i].model_type != PCONT) {\n        std::string s = 
std::string(correct[i].ascii) + \"  #  \" +\n                        correct[i].model.ToString() +\n                        (correct[i].is_very_first_or_continuation ? \" crown\" : \"\") +\n                        (correct[i].is_list_item ? \" li\" : \"\");\n        dbg_lines.push_back(s);\n      } else {\n        dbg_lines.emplace_back(correct[i].ascii);\n      }\n    }\n    dbg_lines.emplace_back(\"\");\n    dbg_lines.emplace_back(\"# ==========================\");\n    dbg_lines.emplace_back(\"# Paragraph detector output:\");\n    dbg_lines.emplace_back(\"# ==========================\");\n    for (int i = 0; i < n; i++) {\n      std::string annotation;\n      if (i == 0 || (detector_output[i - 1] != detector_output[i])) {\n        if (detector_output[i] && detector_output[i]->model) {\n          annotation +=\n              \"  #  \" + detector_output[i]->model->ToString() +\n              (detector_output[i]->is_very_first_or_continuation ? \" crown\" : \"\") +\n              (detector_output[i]->is_list_item ? 
\" li\" : \"\");\n        } else {\n          annotation = \"  #  Unmodeled paragraph.\";\n        }\n      }\n      std::string s = correct[i].ascii + annotation;\n      dbg_lines.push_back(s);\n    }\n    std::string s;\n    for (auto &dbg_line : dbg_lines) {\n      s += dbg_line + \"\\n\";\n    }\n    LOG(INFO) << \"Discrepancy!\\n\" << s;\n  }\n}\n\nvoid TestParagraphDetection(const TextAndModel *correct, int num_rows) {\n  std::vector<RowInfo> row_infos;\n  std::vector<PARA *> row_owners;\n  PARA_LIST paragraphs;\n  std::vector<ParagraphModel *> models;\n\n  MakeAsciiRowInfos(correct, num_rows, &row_infos);\n  int debug_level(3);\n  tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, &paragraphs, &models);\n  EvaluateParagraphDetection(correct, num_rows, row_owners);\n  for (auto *model : models) {\n    delete model;\n  }\n}\n\nTEST(ParagraphsTest, ListItemsIdentified) {\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"iii\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"A.\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"B.\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"C.\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"1.\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"2.\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"3.\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"1\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"2\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"3\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"[[1]]\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"A-1.\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"A-2\"));\n  EXPECT_TRUE(tesseract::AsciiLikelyListItem(\"(A)(i)\"));\n\n  EXPECT_FALSE(tesseract::AsciiLikelyListItem(\"The\"));\n  EXPECT_FALSE(tesseract::AsciiLikelyListItem(\"first\"));\n  EXPECT_FALSE(tesseract::AsciiLikelyListItem(\"house\"));\n  EXPECT_FALSE(tesseract::AsciiLikelyListItem(\"Oregonian.\"));\n  
EXPECT_FALSE(tesseract::AsciiLikelyListItem(\"on.\"));\n}\n\ntypedef ParagraphModel PModel;\n\nconst TextAndModel kTwoSimpleParagraphs[] = {\n    {\"  Look here, I have a paragraph.\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"This paragraph starts at the top\", PCONT, PModel(), false, false},\n    {\"of the page and takes 3 lines.  \", PCONT, PModel(), false, false},\n    {\"  Here I have a second paragraph\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"which indicates that the first  \", PCONT, PModel(), false, false},\n    {\"paragraph is not a continuation \", PCONT, PModel(), false, false},\n    {\"from a previous page, as it is  \", PCONT, PModel(), false, false},\n    {\"indented just like this second  \", PCONT, PModel(), false, false},\n    {\"paragraph.                      \", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestSimpleParagraphDetection) {\n  TestParagraphDetection(kTwoSimpleParagraphs, countof(kTwoSimpleParagraphs));\n}\n\nconst TextAndModel kFewCluesWithCrown[] = {\n    {\"This paragraph starts at the top\", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},\n    {\"of the page and takes two lines.\", PCONT, PModel(), false, false},\n    {\"  Here I have a second paragraph\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"which indicates that the first  \", PCONT, PModel(), false, false},\n    {\"paragraph is a continuation from\", PCONT, PModel(), false, false},\n    {\"a previous page, as it is       \", PCONT, PModel(), false, false},\n    {\"indented just like this second  \", PCONT, PModel(), false, false},\n    {\"paragraph.                      
\", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestFewCluesWithCrown) {\n  TestParagraphDetection(kFewCluesWithCrown, countof(kFewCluesWithCrown));\n}\n\nconst TextAndModel kCrownedParagraph[] = {\n    {\"The first paragraph on a page is\", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},\n    {\"often not indented as the rest  \", PCONT, PModel(), false, false},\n    {\"of the paragraphs are.  Nonethe-\", PCONT, PModel(), false, false},\n    {\"less it should be counted as the\", PCONT, PModel(), false, false},\n    {\"same type of paragraph.         \", PCONT, PModel(), false, false},\n    {\"  The second and third para-    \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"graphs are both indented two    \", PCONT, PModel(), false, false},\n    {\"spaces.                         \", PCONT, PModel(), false, false},\n    {\"  The first paragraph has what  \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"fmt refers to as a 'crown.'     \", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestCrownParagraphDetection) {\n  TestParagraphDetection(kCrownedParagraph, countof(kCrownedParagraph));\n}\n\nconst TextAndModel kFlushLeftParagraphs[] = {\n    {\"It  is sometimes  the case  that\", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},\n    {\"flush  left   paragraphs  (those\", PCONT, PModel(), false, false},\n    {\"with  no  body  indent)  are not\", PCONT, PModel(), false, false},\n    {\"actually crowns.                \", PCONT, PModel(), false, false},\n    {\"Instead,  further paragraphs are\", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},\n    {\"also flush left aligned.  
Usual-\", PCONT, PModel(), false, false},\n    {\"ly,  these  paragraphs  are  set\", PCONT, PModel(), false, false},\n    {\"apart vertically  by some white-\", PCONT, PModel(), false, false},\n    {\"space,  but you can also  detect\", PCONT, PModel(), false, false},\n    {\"them by observing  the big empty\", PCONT, PModel(), false, false},\n    {\"space at the  ends  of the para-\", PCONT, PModel(), false, false},\n    {\"graphs.                         \", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsText, TestRealFlushLeftParagraphs) {\n  TestParagraphDetection(kFlushLeftParagraphs, countof(kFlushLeftParagraphs));\n}\n\nconst TextAndModel kSingleFullPageContinuation[] = {\n    {\"sometimes a page is one giant\", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},\n    {\"continuation.  It flows  from\", PCONT, PModel(), false, false},\n    {\"line to  line, using the full\", PCONT, PModel(), false, false},\n    {\"column  width  with  no clear\", PCONT, PModel(), false, false},\n    {\"paragraph  break,  because it\", PCONT, PModel(), false, false},\n    {\"actually doesn't have one. It\", PCONT, PModel(), false, false},\n    {\"is the  middle of one monster\", PCONT, PModel(), false, false},\n    {\"paragraph continued  from the\", PCONT, PModel(), false, false},\n    {\"previous page and  continuing\", PCONT, PModel(), false, false},\n    {\"onto the  next  page.  There-\", PCONT, PModel(), false, false},\n    {\"fore,  it  ends  up   getting\", PCONT, PModel(), false, false},\n    {\"marked  as a  crown  and then\", PCONT, PModel(), false, false},\n    {\"getting re-marked as any  ex-\", PCONT, PModel(), false, false},\n    {\"isting model.  
Not great, but\", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestSingleFullPageContinuation) {\n  const TextAndModel *correct = kSingleFullPageContinuation;\n  int num_rows = countof(kSingleFullPageContinuation);\n  std::vector<RowInfo> row_infos;\n  std::vector<PARA *> row_owners;\n  PARA_LIST paragraphs;\n  std::vector<ParagraphModel *> models;\n  models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));\n  MakeAsciiRowInfos(correct, num_rows, &row_infos);\n  tesseract::DetectParagraphs(3, &row_infos, &row_owners, &paragraphs, &models);\n  EvaluateParagraphDetection(correct, num_rows, row_owners);\n  for (auto *model : models) {\n    delete model;\n  }\n}\n\nconst TextAndModel kRightAligned[] = {\n    {\"Right-aligned paragraphs are\", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},\n    {\"   uncommon in Left-to-Right\", PCONT, PModel(), false, false},\n    {\"      languages, but they do\", PCONT, PModel(), false, false},\n    {\"                      exist.\", PCONT, PModel(), false, false},\n    {\"    Mostly, however, they're\", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},\n    {\" horribly tiny paragraphs in\", PCONT, PModel(), false, false},\n    {\"  tables on which we have no\", PCONT, PModel(), false, false},\n    {\"             chance anyways.\", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestRightAlignedParagraph) {\n  TestParagraphDetection(kRightAligned, countof(kRightAligned));\n}\n\nconst TextAndModel kTinyParagraphs[] = {\n    {\"  Occasionally, interspersed with\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"obvious paragraph text, you might\", PCONT, PModel(), false, false},\n    {\"find short exchanges of dialogue \", PCONT, PModel(), false, false},\n    {\"between characters.              \", PCONT, PModel(), false, false},\n    {\"  'Oh?'                          \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"  'Don't be confused!'           
\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"  'Not me!'                      \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"  One naive approach would be to \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"mark a new paragraph whenever one\", PCONT, PModel(), false, false},\n    {\"of the statistics (left, right or\", PCONT, PModel(), false, false},\n    {\"center)  changes  from  one text-\", PCONT, PModel(), false, false},\n    {\"line  to  the  next.    Such   an\", PCONT, PModel(), false, false},\n    {\"approach  would  misclassify  the\", PCONT, PModel(), false, false},\n    {\"tiny paragraphs above as a single\", PCONT, PModel(), false, false},\n    {\"paragraph.                       \", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestTinyParagraphs) {\n  TestParagraphDetection(kTinyParagraphs, countof(kTinyParagraphs));\n}\n\nconst TextAndModel kComplexPage1[] = {\n    {\"       Awesome                  \", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},\n    {\"   Centered Title               \", PCONT, PModel(), false, false},\n    {\" Paragraph Detection            \", PCONT, PModel(), false, false},\n    {\"      OCR TEAM                  \", PCONT, PModel(), false, false},\n    {\"  10 November 2010              \", PCONT, PModel(), false, false},\n    {\"                                \", PNONE, PModel(), false, false},\n    {\"  Look here, I have a paragraph.\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"This paragraph starts at the top\", PCONT, PModel(), false, false},\n    {\"of the page and takes 3 lines.  
\", PCONT, PModel(), false, false},\n    {\"  Here I have a second paragraph\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"which indicates that the first  \", PCONT, PModel(), false, false},\n    {\"paragraph is not a continuation \", PCONT, PModel(), false, false},\n    {\"from a previous page, as it is  \", PCONT, PModel(), false, false},\n    {\"indented just like this second  \", PCONT, PModel(), false, false},\n    {\"paragraph.                      \", PCONT, PModel(), false, false},\n    {\"   Here is a block quote. It    \", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},\n    {\"   looks like the prior text    \", PCONT, PModel(), false, false},\n    {\"   but it  is indented  more    \", PCONT, PModel(), false, false},\n    {\"   and is fully justified.      \", PCONT, PModel(), false, false},\n    {\"  So how does one deal with     \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"centered text, block quotes,    \", PCONT, PModel(), false, false},\n    {\"normal paragraphs, and lists    \", PCONT, PModel(), false, false},\n    {\"like what follows?              \", PCONT, PModel(), false, false},\n    {\"1. Make a plan.                 \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"2. Use a heuristic, for example,\", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"   looking for lines where the  \", PCONT, PModel(), false, false},\n    {\"   first word of the next line  \", PCONT, PModel(), false, false},\n    {\"   would fit on the previous    \", PCONT, PModel(), false, false},\n    {\"   line.                        \", PCONT, PModel(), false, false},\n    {\"8. Try to implement the plan in \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"   Python and try it out.       \", PCONT, PModel(), false, false},\n    {\"4. Determine how to fix the     \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"   mistakes.                    \", PCONT, PModel(), false, false},\n    {\"5. 
Repeat.                      \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"  For extra painful penalty work\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"you can try to identify source  \", PCONT, PModel(), false, false},\n    {\"code.  Ouch!                    \", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestComplexPage1) {\n  TestParagraphDetection(kComplexPage1, countof(kComplexPage1));\n}\n\n// The same as above, but wider.\nconst TextAndModel kComplexPage2[] = {\n    {\"       Awesome                     \", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},\n    {\"   Centered Title                  \", PCONT, PModel(), false, false},\n    {\" Paragraph Detection               \", PCONT, PModel(), false, false},\n    {\"      OCR TEAM                     \", PCONT, PModel(), false, false},\n    {\"  10 November 2010                 \", PCONT, PModel(), false, false},\n    {\"                                   \", PNONE, PModel(), false, false},\n    {\"  Look here, I have a paragraph.   \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"This paragraph starts at the top of\", PCONT, PModel(), false, false},\n    {\"the page and takes 3 lines.        \", PCONT, PModel(), false, false},\n    {\"  Here I have a second paragraph   \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"which indicates that the first     \", PCONT, PModel(), false, false},\n    {\"paragraph is not a continuation    \", PCONT, PModel(), false, false},\n    {\"from a previous page, as it is in- \", PCONT, PModel(), false, false},\n    {\"dented just like this second para- \", PCONT, PModel(), false, false},\n    {\"graph.                             \", PCONT, PModel(), false, false},\n    {\"   Here is a block quote. 
It       \", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},\n    {\"   looks like the prior text       \", PCONT, PModel(), false, false},\n    {\"   but it  is indented  more       \", PCONT, PModel(), false, false},\n    {\"   and is fully justified.         \", PCONT, PModel(), false, false},\n    {\"  So how does one deal with center-\", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"ed text, block quotes, normal para-\", PCONT, PModel(), false, false},\n    {\"graphs, and lists like what follow?\", PCONT, PModel(), false, false},\n    {\"1. Make a plan.                    \", PCONT, PModel(), false, false}, // BUG!!\n    {\"2. Use a heuristic, for example,   \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"   looking for lines where the     \", PCONT, PModel(), false, false},\n    {\"   first word of the next line     \", PCONT, PModel(), false, false},\n    {\"   would fit on the previous line. \", PCONT, PModel(), false, false},\n    {\"8. Try to implement the plan in    \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"   Python and try it out.          \", PCONT, PModel(), false, false},\n    {\"4. Determine how to fix the        \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"   mistakes.                       \", PCONT, PModel(), false, false},\n    {\"5. Repeat.                         \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},\n    {\"  For extra painful penalty work   \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"you can try to identify source     \", PCONT, PModel(), false, false},\n    {\"code.  Ouch!                       
\", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestComplexPage2) {\n  TestParagraphDetection(kComplexPage2, countof(kComplexPage2));\n}\n\nconst TextAndModel kSubtleCrown[] = {\n    {\"The first paragraph on a page is\", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},\n    {\"often not indented as the rest  \", PCONT, PModel(), false, false},\n    {\"of the paragraphs are.  Nonethe-\", PCONT, PModel(), false, false},\n    {\"less it should be counted as the\", PCONT, PModel(), false, false},\n    {\"same type of paragraph.         \", PCONT, PModel(), false, false},\n    {\"  Even a short second paragraph \", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},\n    {\"should suffice.                 \", PCONT, PModel(), false, false},\n    {\"             1235               \", PNONE, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, TestSubtleCrown) {\n  TestParagraphDetection(kSubtleCrown, countof(kSubtleCrown) - 1);\n}\n\nTEST(ParagraphsTest, TestStrayLineInBlock) {\n  TestParagraphDetection(kSubtleCrown, countof(kSubtleCrown));\n}\n\nconst TextAndModel kUnlvRep3AO[] = {\n    {\"    Defined contribution plans cover employees in Australia, New\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"Zealand, Spain, the United Kingdom and some U.S. subsidiaries.  \", PCONT, PModel(), false,\n     false},\n    {\"In addition, employees in the U.S. are eligible to participate in    \", PCONT, PModel(),\n     false, false},\n    {\"deﬁned contribution plans (Employee Savings Plans) by contribut-\", PCONT, PModel(), false,\n     false},\n    {\"ing a portion of their compensation. The Company matches com- \", PCONT, PModel(), false,\n     false},\n    {\"pensation, depending on Company proﬁt levels. Contributions    \", PCONT, PModel(), false,\n     false},\n    {\"charged to income for deﬁned contribution plans were $92 in    \", PCONT, PModel(), false,\n     false},\n    {\"1993, $98 in 1992 and $89 in 1991.                             \", PCONT, PModel(), false,\n     false},\n    {\"     In addition to providing pension beneﬁts, the Company pro- \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"vides certain health care and life insurance beneﬁts to retired     \", PCONT, PModel(), false,\n     false},\n    {\"employees. As discussed in Note A, the Company adopted FASB   \", PCONT, PModel(), false,\n     false},\n    {\"Statement No. 106 effective January 1, 1992. Previously, the     \", PCONT, PModel(), false,\n     false},\n    {\"Company recognized the cost of providing these beneﬁts as the     \", PCONT, PModel(), false,\n     false},\n    {\"beneﬁts were paid. These pretax costs amounted to $53 in 1991.   \", PCONT, PModel(), false,\n     false},\n    {\"The Company continues to fund most of the cost of these medical \", PCONT, PModel(), false,\n     false},\n    {\"and life insurance beneﬁts in the year incurred.                \", PCONT, PModel(), false,\n     false},\n    {\"     The U.S. plan covering the parent company is the largest plan.\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"It provides medical and life insurance beneﬁts including hospital,  \", PCONT, PModel(), false,\n     false},\n    {\"physicians’ services and major medical expense beneﬁts and life   \", PCONT, PModel(), false,\n     false},\n    {\"insurance beneﬁts. The plan provides beneﬁts supplemental to    \", PCONT, PModel(), false,\n     false},\n    {\"Medicare after retirees are eligible for these beneﬁts. The cost of  \", PCONT, PModel(),\n     false, false},\n    {\"these beneﬁts are shared by the Company and the retiree, with the  \", PCONT, PModel(), false,\n     false},\n    {\"Company portion increasing as the retiree has increased years of   \", PCONT, PModel(), false,\n     false},\n    {\"credited service. The Company has the ability to change these    \", PCONT, PModel(), false,\n     false},\n    {\"beneﬁts at any time.                                             \", PCONT, PModel(), false,\n     false},\n    {\"     Effective October 1993, the Company amended its health   \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"beneﬁts plan in the U.S. to cap the cost absorbed by the Company \", PCONT, PModel(), false,\n     false},\n    {\"at approximately twice the 1993 cost per person for employees who\", PCONT, PModel(), false,\n     false},\n    {\"retire after December 31, 1993. The effect of this amendment was \", PCONT, PModel(), false,\n     false},\n    {\"to reduce the December 31, 1993 accumulated postretirement   \", PCONT, PModel(), false,\n     false},\n    {\"beneﬁt obligation by $327. It also reduced the net periodic postre- \", PCONT, PModel(), false,\n     false},\n    {\"tirement cost by $21 for 1993 and is estimated to reduce this cost  \", PCONT, PModel(), false,\n     false},\n    {\"for 1994 by approximately $83.                                     \", PCONT, PModel(), false,\n     false},\n};\n\nTEST(ParagraphsTest, TestUnlvInsurance) {\n  TestParagraphDetection(kUnlvRep3AO, countof(kUnlvRep3AO));\n}\n\n// The basic outcome we want for something with a bunch of leader dots is that\n// we group each logical entry as a separate item.  Without knowledge of\n// leaders, we would most likely mark the text below as a simple right aligned\n// paragraph or two.\n// This example comes from Volume 9886293, Page 5\nconst TextAndModel kTableOfContents[] = {\n    {\"1 Hmong People ........... 1\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"   Hmong Origins . . . . . 1\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"    Language . . . . . . . 1\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"     Proverbs . . . . . .  2\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"        Discussion . . . . 2\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"     Riddles . . . . . . . 2\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"        Discussion . . . . 3\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"     Appearance . . . . .  3\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"   Hmong History . . . . . 4\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"    Hmong in SE Asia . . . 4\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"    Hmong in the West . . .5\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"    Hmong in the USA . . . 5\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n    {\"        Discussion . . . . 6\", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},\n};\n\nTEST(ParagraphsTest, TestSplitsOutLeaderLines) {\n  TestParagraphDetection(kTableOfContents, countof(kTableOfContents));\n}\n\nconst TextAndModel kTextWithSourceCode[] = {\n    {\"  A typical page of a programming book may contain\", PSTART, PModel(kLeft, 0, 20, 0, 0),\n     false, false},\n    {\"examples of source code to exemplify an algorithm \", PCONT, PModel(), false, false},\n    {\"being described in prose.  Such examples should be\", PCONT, PModel(), false, false},\n    {\"rendered as lineated text, meaning text with      \", PCONT, PModel(), false, false},\n    {\"explicit line breaks but without extra inter-line \", PCONT, PModel(), false, false},\n    {\"spacing.  Accidentally finding stray paragraphs in\", PCONT, PModel(), false, false},\n    {\"source code would lead to a bad reading experience\", PCONT, PModel(), false, false},\n    {\"when the text is re-flowed.                       \", PCONT, PModel(), false, false},\n    {\"  Let's show this by describing the function fact-\", PSTART, PModel(kLeft, 0, 20, 0, 0),\n     false, false},\n    {\"orial.  Factorial is a simple recursive function  \", PCONT, PModel(), false, false},\n    {\"which grows very quickly.  So quickly, in fact,   \", PCONT, PModel(), false, false},\n    {\"that the typical C implementation will only work  \", PCONT, PModel(), false, false},\n    {\"for values less than about 12:                    \", PCONT, PModel(), false, false},\n    {\"                                                  \", PNONE, PModel(), false, false},\n    {\"  # Naive implementation in C                     \", PCONT, PModel(), false, false},\n    {\"  int factorial(int n) {                          \", PCONT, PModel(), false, false},\n    {\"    if (n < 2)                                    \", PCONT, PModel(), false, false},\n    {\"      return 1;                                   \", PCONT, PModel(), false, false},\n    {\"    return  n * factorial(n - 1);                 \", PCONT, PModel(), false, false},\n    {\"  }                                               \", PCONT, PModel(), false, false},\n    {\"                                                  \", PCONT, PModel(), false, false},\n    {\"  The C programming language does not have built- \", PSTART, PModel(kLeft, 0, 20, 0, 0),\n     false, false},\n    {\"in support for detecting integer overflow, so this\", PCONT, PModel(), false, false},\n    {\"naive implementation simply returns random values \", PCONT, PModel(), false, false},\n    {\"if even a moderate sized n is provided.            \", PCONT, PModel(), false, false},\n};\n\nTEST(ParagraphsTest, NotDistractedBySourceCode) {\n  TestParagraphDetection(kTextWithSourceCode, countof(kTextWithSourceCode));\n}\n\nconst TextAndModel kOldManAndSea[] = {\n    {\"royal  palm  which  are called  guano  and  in it  there was a bed,  a\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"table, one chair, and a place on the dirt floor to cook with charcoal.\", PCONT, PModel(),\n     false, false},\n    {\"On  the  brown  walls  of  the ﬂattened,  overlapping  leaves  of  the\", PCONT, PModel(),\n     false, false},\n    {\"sturdy  fibered guano  there  was  a  picture in  color of  the Sacred\", PCONT, PModel(),\n     false, false},\n    {\"Heart  of  Jesus  and  another  of  the  Virgin  of Cobre.  These were\", PCONT, PModel(),\n     false, false},\n    {\"relics of  his wife.   Once there had been  a tinted photograph of his\", PCONT, PModel(),\n     false, false},\n    {\"wife on  the wall  but he  had taken  it  down because it made him too\", PCONT, PModel(),\n     false, false},\n    {\"lonely to see it and it was on the shelf in the corner under his clean\", PCONT, PModel(),\n     false, false},\n    {\"shirt.                                                                \", PCONT, PModel(),\n     false, false},\n    {\"     \\\"What  do  you  have  to  eat?\\\"     the  boy   asked.          \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"A pot of yellow rice with fish. Do you want some?\\\"            \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"No. I will eat at home. Do you want me to make the fire?\\\"   \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"No. I will make it later on. Or I may eat the rice cold.\\\"     \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"May I take the cast net?\\\"                                     \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"Of course.\\\"                                                   \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     There was  no  cast net  and  the boy  remembered  when  they had\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"sold it.   But they went through  this fiction every day. There was no\", PCONT, PModel(),\n     false, false},\n    {\"pot of yellow rice and fish and the boy knew this too.                 \"\n     \" \",\n     PCONT, PModel(), false, false},\n    {\"     \\\"Eighty-five  is a lucky number,\\\"  the  old  man  said.   \\\"How\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"would  you  like to see  me  bring one  in that dressed out over a \"\n     \"thou-\",\n     PCONT, PModel(), false, false},\n    {\"sand pounds?                                                           \"\n     \" \",\n     PCONT, PModel(), false, false},\n    {\"     \\\"I'll get the cast net and go for sardines.  Will you sit in the \"\n     \"sun\",\n     PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"in the doorway?\\\"                                                        \"\n     \" \",\n     PCONT, PModel(), false, false},\n    {\"     \\\"Yes.  I have yesterday's paper and I will read the baseball.\\\"   \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     The boy  did not  know  whether  yesterday's paper  was a fiction\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"too.  But the old man brought it out from under the bed.              \", PCONT, PModel(),\n     false, false},\n    {\"     \\\"Pedrico gave it to me at the bodega,\\\" he explained.             \"\n     \" \",\n     PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"I'll be back when I have the sardines.  I'll keep yours and mine\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"together  on ice  and  we  can  share  them  in the  morning.   When I\", PCONT, PModel(),\n     false, false},\n    {\"come back you can tell me about the baseball.\\\"                       \", PCONT, PModel(),\n     false, false},\n    {\"     \\\"The Yankees cannot lose.\\\"                                     \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"But I fear the Indians of Cleveland.\\\"                         \", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"     \\\"Have faith  in  the Yankees  my son.   Think of  the great  Di-\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"Maggio.\\\"                                                             \", PCONT, PModel(),\n     false, false},\n    {\"     \\\"I  fear both  the Tigers of Detroit  and the  Indians of Cleve-\", PSTART,\n     PModel(kLeft, 0, 50, 0, 0), false, false},\n    {\"land.\\\"                                                               \", PCONT, PModel(),\n     false, false}};\n\nTEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {\n  TestParagraphDetection(kOldManAndSea, countof(kOldManAndSea));\n}\n\nconst TextAndModel kNewZealandIndex[] = {\n    {\"Oats, 51                      \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"O'Brien, Gregory, 175         \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Occupational composition, 110,\", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"   138                        \", PCONT, PModel(), false, false},\n    {\"OECD rankings, 155, 172       \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Okiato (original capital), 47 \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Oil shock: 1974, xxx, 143; 1979,\", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"   145                        \", PCONT, PModel(), false, false},\n    {\"Old Age Pensions, xxii, 89-90 \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Old World evils, 77           \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Oliver, W. H., 39, 77, 89     \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Olssen, Erik, 45, 64, 84      \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Olympic Games, 1924, 111, 144 \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Once on Chunuk Bair, 149      \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Once Were Warriors, xxxiii, 170\", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"On—shore whaling, xvi         \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Opotiki, xix                  \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Orakau battle of, xviii, 57   \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"O’Regan, Tipene, 170, 198-99  \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Organic agriculture, 177      \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Orwell, George, 151           \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Otago, xvii, 45, 49-50, 70    \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Otago block, xvii             \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Otago Daily Times, 67         \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Otago Girls’ High School, xix, 61,\", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"   85                         \", PCONT, PModel(), false, false},\n    {\"Otago gold rushes, 61-63      \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Otago Peninsula, xx           \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Otago Provincial Council, 68  \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Otaki, 33                     \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},\n    {\"Owls Do Cry, 139              \", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}};\n\nTEST(ParagraphsTest, IndexPageTest) {\n  TestParagraphDetection(kNewZealandIndex, countof(kNewZealandIndex));\n}\n\n// TODO(eger): Add some right-to-left examples, and fix the algorithm as needed.\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/params_model_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <string> // std::string\n#include <vector>\n\n#include \"include_gunit.h\"\n#include \"params_model.h\"\n#include \"serialis.h\" // TFile\n#include \"tprintf.h\"  // tprintf\n\nnamespace tesseract {\n\n// Test some basic I/O of params model files (automated learning of language\n// model weights).\n#ifndef DISABLED_LEGACY_ENGINE\nstatic bool LoadFromFile(tesseract::ParamsModel &model, const char *lang, const char *full_path) {\n  tesseract::TFile fp;\n  if (!fp.Open(full_path, nullptr)) {\n    tprintf(\"Error opening file %s\\n\", full_path);\n    return false;\n  }\n  return model.LoadFromFp(lang, &fp);\n}\n#endif\n\nclass ParamsModelTest : public testing::Test {\n#ifndef DISABLED_LEGACY_ENGINE\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n  std::string TestDataNameToPath(const std::string &name) const {\n    return file::JoinPath(TESTDATA_DIR, name);\n  }\n  std::string OutputNameToPath(const std::string &name) const {\n    return file::JoinPath(FLAGS_test_tmpdir, name);\n  }\n  // Test that we are able to load a params model, save it, reload it,\n  // and verify that the re-serialized version is the same as the original.\n  void TestParamsModelRoundTrip(const std::string &params_model_filename) const {\n    tesseract::ParamsModel orig_model;\n    tesseract::ParamsModel duplicate_model;\n    file::MakeTmpdir();\n    std::string orig_file = TestDataNameToPath(params_model_filename);\n    std::string out_file = OutputNameToPath(params_model_filename);\n\n    EXPECT_TRUE(LoadFromFile(orig_model, \"eng\", orig_file.c_str()));\n    EXPECT_TRUE(orig_model.SaveToFile(out_file.c_str()));\n\n    EXPECT_TRUE(LoadFromFile(duplicate_model, \"eng\", out_file.c_str()));\n    EXPECT_TRUE(orig_model.Equivalent(duplicate_model));\n  }\n#endif\n};\n\nTEST_F(ParamsModelTest, TestEngParamsModelIO) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because ParamsModel::LoadFromFp is missing.\n  GTEST_SKIP();\n#else\n  TestParamsModelRoundTrip(\"eng.params_model\");\n#endif\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/progress_test.cc",
    "content": "///////////////////////////////////////////////////////////////////////\n// File:        progress_test.cc\n// Description: Progress reporting API Test for Tesseract.\n// Author:      Jaroslaw Kubik\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n///////////////////////////////////////////////////////////////////////\n\n// expects clone of tessdata_fast repo in ../../tessdata_fast\n\n#include \"include_gunit.h\"\n\n#include <tesseract/baseapi.h>\n#include <tesseract/ocrclass.h>\n#include \"image.h\"\n\n#include <allheaders.h>\n#include \"gmock/gmock.h\"\n\n#include <fstream>\n#include <iostream>\n#include <locale>\n#include <memory> // std::unique_ptr\n#include <string>\n\n#include <time.h>\n\nnamespace tesseract {\n\nclass QuickTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    start_time_ = time(nullptr);\n  }\n  void TearDown() override {\n    const time_t end_time = time(nullptr);\n    EXPECT_TRUE(end_time - start_time_ <= 25)\n        << \"The test took too long - \" << ::testing::PrintToString(end_time - start_time_);\n  }\n  time_t start_time_;\n};\n\nclass ClassicMockProgressSink {\npublic:\n  MOCK_METHOD1(classicProgress, bool(int));\n  MOCK_METHOD1(cancel, bool(int));\n\n  ETEXT_DESC monitor;\n\n  ClassicMockProgressSink() {\n    monitor.progress_callback = [](int progress, int, int, int, int) -> bool {\n      return instance->classicProgress(progress);\n    };\n    monitor.cancel = [](void *ths, int words) -> bool {\n      return ((ClassicMockProgressSink *)ths)->cancel(words);\n    };\n    monitor.cancel_this = this;\n    instance = this;\n  }\n\n  static ClassicMockProgressSink *instance;\n};\n\nClassicMockProgressSink *ClassicMockProgressSink::instance = nullptr;\n\nclass NewMockProgressSink : public ClassicMockProgressSink {\npublic:\n  MOCK_METHOD1(progress, bool(int));\n\n  NewMockProgressSink() {\n    monitor.progress_callback2 = [](ETEXT_DESC *ths, int, int, int, int) -> bool {\n      return ((NewMockProgressSink *)ths->cancel_this)->progress(ths->progress);\n    };\n  }\n};\n\nvoid ClassicProgressTester(const char *imgname, const char *tessdatadir, const char *lang) {\n  using ::testing::_;\n  using ::testing::AllOf;\n  using ::testing::AtLeast;\n  using ::testing::DoAll;\n  using ::testing::Gt;\n  using ::testing::Le;\n  using ::testing::Return;\n  using ::testing::SaveArg;\n\n  auto api = std::make_unique<tesseract::TessBaseAPI>();\n  ASSERT_FALSE(api->Init(tessdatadir, lang)) << \"Could not initialize tesseract.\";\n  Image image = pixRead(imgname);\n  ASSERT_TRUE(image != nullptr) << \"Failed to read test image.\";\n  api->SetImage(image);\n\n  ClassicMockProgressSink progressSink;\n\n  int currentProgress = -1;\n  EXPECT_CALL(progressSink, classicProgress(AllOf(Gt<int &>(currentProgress), Le(100))))\n      .Times(AtLeast(5))\n      .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false)));\n  EXPECT_CALL(progressSink, cancel(_)).Times(AtLeast(5)).WillRepeatedly(Return(false));\n\n  EXPECT_EQ(api->Recognize(&progressSink.monitor), false);\n  EXPECT_GE(currentProgress, 50) << \"The reported progress did not reach 50%\";\n\n  api->End();\n  image.destroy();\n}\n\nvoid NewProgressTester(const char *imgname, const char *tessdatadir, const char *lang) {\n  using ::testing::_;\n  using ::testing::AllOf;\n  using ::testing::AtLeast;\n  using ::testing::DoAll;\n  using ::testing::Gt;\n  using ::testing::Le;\n  using ::testing::Return;\n  using ::testing::SaveArg;\n\n  auto api = std::make_unique<tesseract::TessBaseAPI>();\n  ASSERT_FALSE(api->Init(tessdatadir, lang)) << \"Could not initialize tesseract.\";\n  Image image = pixRead(imgname);\n  ASSERT_TRUE(image != nullptr) << \"Failed to read test image.\";\n  api->SetImage(image);\n\n  NewMockProgressSink progressSink;\n\n  int currentProgress = -1;\n  EXPECT_CALL(progressSink, classicProgress(_)).Times(0);\n  EXPECT_CALL(progressSink, progress(AllOf(Gt<int &>(currentProgress), Le(100))))\n      .Times(AtLeast(5))\n      .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false)));\n  EXPECT_CALL(progressSink, cancel(_)).Times(AtLeast(5)).WillRepeatedly(Return(false));\n\n  EXPECT_EQ(api->Recognize(&progressSink.monitor), false);\n  EXPECT_GE(currentProgress, 50) << \"The reported progress did not reach 50%\";\n\n  api->End();\n  image.destroy();\n}\n\nTEST(QuickTest, ClassicProgressReporting) {\n  ClassicProgressTester(TESTING_DIR \"/phototest.tif\", TESSDATA_DIR \"_fast\", \"eng\");\n}\n\nTEST(QuickTest, NewProgressReporting) {\n  NewProgressTester(TESTING_DIR \"/phototest.tif\", TESSDATA_DIR \"_fast\", \"eng\");\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/qrsequence_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <algorithm>\n#include <vector>\n\n#include \"cycletimer.h\"\n#include \"include_gunit.h\"\n#include \"log.h\"\n#include \"qrsequence.h\"\n\nnamespace tesseract {\n\nclass TestableQRSequenceGenerator : public QRSequenceGenerator {\npublic:\n  explicit TestableQRSequenceGenerator(const int &N) : QRSequenceGenerator(N) {}\n  // Overriding scope for testing\n  using QRSequenceGenerator::GetBinaryReversedInteger;\n};\n\n// Verifies binary inversion for a small range.\nTEST(QRSequenceGenerator, GetBinaryReversedInteger) {\n  const int kRangeSize = 8;\n  TestableQRSequenceGenerator generator(kRangeSize);\n  int reversed_vals[kRangeSize] = {0, 4, 2, 6, 1, 5, 3, 7};\n  for (int i = 0; i < kRangeSize; ++i) {\n    EXPECT_EQ(reversed_vals[i], generator.GetBinaryReversedInteger(i));\n  }\n}\n\n// Trivial test fixture for a parameterized test.\nclass QRSequenceGeneratorTest : public ::testing::TestWithParam<int> {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n};\n\nTEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) {\n  const int kRangeSize = GetParam();\n  TestableQRSequenceGenerator generator(kRangeSize);\n  std::vector<int> vals(kRangeSize);\n  CycleTimer timer;\n  timer.Restart();\n  for (int i = 0; i < kRangeSize; ++i) {\n    vals[i] = generator.GetVal();\n  }\n  LOG(INFO) << kRangeSize << \"-length sequence took \" << timer.GetInMs() << \"ms\";\n  // Sort the numbers to verify that we've covered the range without repetition.\n  std::sort(vals.begin(), vals.end());\n  for (int i = 0; i < kRangeSize; ++i) {\n    EXPECT_EQ(i, vals[i]);\n    if (i != vals[i]) {\n      LOG(INFO) << \"Aborting remaining comparisons\";\n      break;\n    }\n  }\n}\n\n// Run a parameterized test using the following range sizes.\nINSTANTIATE_TEST_SUITE_P(RangeTest, QRSequenceGeneratorTest,\n                         ::testing::Values(2, 7, 8, 9, 16, 1e2, 1e4, 1e6));\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/recodebeam_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n#include \"log.h\" // for LOG\n\n#include \"matrix.h\"\n#include \"normstrngs.h\"\n#include \"pageres.h\"\n#include \"ratngs.h\"\n#include \"recodebeam.h\"\n#include \"unicharcompress.h\"\n#include \"unicharset_training_utils.h\"\n\n#include \"helpers.h\"\n\nnamespace tesseract {\n\n// Number of characters to test beam search with.\nconst int kNumChars = 100;\n// Amount of extra random data to pad with after.\nconst int kPadding = 64;\n// Dictionary test data.\n// The top choice is: \"Gef s wordsright.\".\n// The desired phrase is \"Gets words right.\".\n// There is a competing dictionary phrase: \"Get swords right.\".\n// ... due to the following errors from the network:\n// f stronger than t in \"Get\".\n// weak space between Gef and s and between s and words.\n// weak space between words and right.\nconst char *kGWRTops[] = {\"G\", \"e\", \"f\", \" \", \"s\", \" \", \"w\", \"o\", \"r\",    \"d\",\n                          \"s\", \"\",  \"r\", \"i\", \"g\", \"h\", \"t\", \".\", nullptr};\nconst float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, 0.89, 0.99, 0.99,\n                               0.99, 0.99, 0.95, 0.99, 0.90, 0.90, 0.90, 0.95, 0.75};\nconst char *kGWR2nds[] = {\"C\", \"c\", \"t\", \"\",  \"S\", \"\",  \"W\", \"O\", \"t\",    \"h\",\n                          \"S\", \" \", \"t\", \"I\", \"9\", \"b\", \"f\", \",\", nullptr};\nconst float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, 0.10, 0.01, 0.01,\n                               0.01, 0.01, 0.05, 0.01, 0.09, 0.09, 0.09, 0.05, 0.25};\n\nconst char *kZHTops[] = {\"实\", \"学\", \"储\", \"啬\", \"投\", \"学\", \"生\", nullptr};\nconst float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};\nconst char *kZH2nds[] = {\"学\", \"储\", \"投\", \"生\", \"学\", \"生\", \"实\", nullptr};\nconst float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};\n\nconst char *kViTops[] = {\"v\", \"ậ\", \"y\", \" \", \"t\", \"ộ\", \"i\", nullptr};\nconst float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};\nconst char *kVi2nds[] = {\"V\", \"a\", \"v\", \"\", \"l\", \"o\", \"\", nullptr};\nconst float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};\n\nclass RecodeBeamTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\n  RecodeBeamTest() : lstm_dict_(&ccutil_) {}\n  ~RecodeBeamTest() override {\n    lstm_dict_.End();\n  }\n\n  // Loads and compresses the given unicharset.\n  void LoadUnicharset(const std::string &unicharset_name) {\n    std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, \"radical-stroke.txt\");\n    std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);\n    std::string radical_data;\n    CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));\n    CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));\n    unichar_null_char_ =\n        ccutil_.unicharset.has_special_codes() ? UNICHAR_BROKEN : ccutil_.unicharset.size();\n    std::string radical_str(radical_data.c_str());\n    EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));\n    RecodedCharID code;\n    recoder_.EncodeUnichar(unichar_null_char_, &code);\n    encoded_null_char_ = code(0);\n    // Space should encode as itself.\n    recoder_.EncodeUnichar(UNICHAR_SPACE, &code);\n    EXPECT_EQ(UNICHAR_SPACE, code(0));\n    std::string output_name = file::JoinPath(FLAGS_test_tmpdir, \"testenc.txt\");\n    std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);\n    std::string encoding_str(&encoding[0], encoding.size());\n    CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));\n    LOG(INFO) << \"Wrote encoding to:\" << output_name << \"\\n\";\n  }\n  // Loads the dictionary.\n  void LoadDict(const std::string &lang) {\n    std::string traineddata_name = lang + \".traineddata\";\n    std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);\n    lstm_dict_.SetupForLoad(nullptr);\n    tesseract::TessdataManager mgr;\n    mgr.Init(traineddata_file.c_str());\n    lstm_dict_.LoadLSTM(lang.c_str(), &mgr);\n    lstm_dict_.FinishLoad();\n  }\n\n  // Expects the appropriate results from the compressed_  ccutil_.unicharset.\n  void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output,\n                     const std::vector<int> &transcription) {\n    // Get the utf8 string of the transcription.\n    std::string truth_utf8;\n    for (int i : transcription) {\n      truth_utf8 += ccutil_.unicharset.id_to_unichar(i);\n    }\n    PointerVector<WERD_RES> words;\n    ExpectCorrect(output, truth_utf8, nullptr, &words);\n  }\n  void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output, const std::string &truth_utf8,\n                     Dict *dict, PointerVector<WERD_RES> *words) {\n    RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);\n    beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);\n    // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:\n    // beam_search.DebugBeams(ccutil_.unicharset);\n    std::vector<int> labels, xcoords;\n    beam_search.ExtractBestPathAsLabels(&labels, &xcoords);\n    LOG(INFO) << \"Labels size = \" << labels.size() << \" coords \" << xcoords.size() << \"\\n\";\n    // Now decode using recoder_.\n    std::string decoded;\n    int end = 1;\n    for (unsigned start = 0; start < labels.size(); start = end) {\n      RecodedCharID code;\n      unsigned index = start;\n      int uni_id = INVALID_UNICHAR_ID;\n      do {\n        code.Set(code.length(), labels[index++]);\n        uni_id = recoder_.DecodeUnichar(code);\n      } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&\n               (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));\n      EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << \"index=\" << index << \"/\" << labels.size();\n      // To the extent of truth_utf8, we expect decoded to match, but if\n      // transcription is shorter, that is OK too, as we may just be testing\n      // that we get a valid sequence when padded with random data.\n      if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {\n        decoded += ccutil_.unicharset.id_to_unichar(uni_id);\n      }\n      end = index;\n    }\n    EXPECT_EQ(truth_utf8, decoded);\n\n    // Check that ExtractBestPathAsUnicharIds does the same thing.\n    std::vector<int> unichar_ids;\n    std::vector<float> certainties, ratings;\n    
beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,\n                                            &ratings, &xcoords);\n    std::string u_decoded;\n    float total_rating = 0.0f;\n    for (unsigned u = 0; u < unichar_ids.size(); ++u) {\n      // To the extent of truth_utf8, we expect decoded to match, but if\n      // transcription is shorter, that is OK too, as we may just be testing\n      // that we get a valid sequence when padded with random data.\n      if (u_decoded.size() < truth_utf8.size()) {\n        const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);\n        total_rating += ratings[u];\n        LOG(INFO) << u << \":u_id=\" << unichar_ids[u] << \"=\" << str << \", c=\"\n          << certainties[u] << \", r=\" << ratings[u] << \"r_sum=\"\n          << total_rating << \" @\" << xcoords[u] << \"\\n\";\n        if (str[0] == ' ') {\n          total_rating = 0.0f;\n        }\n        u_decoded += str;\n      }\n    }\n    EXPECT_EQ(truth_utf8, u_decoded);\n\n    // Check that ExtractBestPathAsWords does the same thing.\n    TBOX line_box(0, 0, 100, 10);\n    for (int i = 0; i < 2; ++i) {\n      beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);\n      std::string w_decoded;\n      for (int w = 0; w < words->size(); ++w) {\n        const WERD_RES *word = (*words)[w];\n        if (w_decoded.size() < truth_utf8.size()) {\n          if (!w_decoded.empty() && word->word->space()) {\n            w_decoded += \" \";\n          }\n          w_decoded += word->best_choice->unichar_string().c_str();\n        }\n        LOG(INFO) << \"Word:\" << w << \" = \" << word->best_choice->unichar_string()\n          << \", c=\" << word->best_choice->certainty() << \", r=\" << word->best_choice->rating()\n          << \", perm=\" << word->best_choice->permuter() << \"\\n\";\n      }\n      std::string w_trunc(w_decoded.data(), truth_utf8.size());\n      if (truth_utf8 != w_trunc) {\n   
     tesseract::NormalizeUTF8String(\n            tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize,\n            tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);\n        w_trunc.assign(w_decoded.data(), truth_utf8.size());\n      }\n      EXPECT_EQ(truth_utf8, w_trunc);\n    }\n  }\n  // Generates easy encoding of the given unichar_ids, and pads with at least\n  // padding of random data.\n  GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(const std::vector<int> &unichar_ids,\n                                                      int padding) {\n    int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;\n    int num_codes = recoder_.code_range();\n    GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);\n    // Fill with random data.\n    TRand random;\n    for (int t = 0; t < width; ++t) {\n      for (int i = 0; i < num_codes; ++i) {\n        outputs(t, i) = random.UnsignedRand(0.25);\n      }\n    }\n    int t = 0;\n    for (int unichar_id : unichar_ids) {\n      RecodedCharID code;\n      int len = recoder_.EncodeUnichar(unichar_id, &code);\n      EXPECT_NE(0, len);\n      for (int j = 0; j < len; ++j) {\n        // Make the desired answer a clear winner.\n        if (j > 0 && code(j) == code(j - 1)) {\n          // We will collapse adjacent equal codes so put a null in between.\n          outputs(t++, encoded_null_char_) = 1.0f;\n        }\n        outputs(t++, code(j)) = 1.0f;\n      }\n      // Put a 0 as a null char in between.\n      outputs(t++, encoded_null_char_) = 1.0f;\n    }\n    // Normalize the probs.\n    for (int t = 0; t < width; ++t) {\n      double sum = 0.0;\n      for (int i = 0; i < num_codes; ++i) {\n        sum += outputs(t, i);\n      }\n      for (int i = 0; i < num_codes; ++i) {\n        outputs(t, i) /= sum;\n      }\n    }\n\n    return outputs;\n  }\n  // Encodes a utf8 string (character) as unichar_id, then recodes, and sets\n  // the score for the appropriate sequence of 
codes, returning the ending t.\n  int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random,\n                 GENERIC_2D_ARRAY<float> *outputs) {\n    int t = start_t;\n    std::vector<int> unichar_ids;\n    EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));\n    if (unichar_ids.empty() || utf8_str[0] == '\\0') {\n      unichar_ids.clear();\n      unichar_ids.push_back(unichar_null_char_);\n    }\n    int num_ids = unichar_ids.size();\n    for (int u = 0; u < num_ids; ++u) {\n      RecodedCharID code;\n      int len = recoder_.EncodeUnichar(unichar_ids[u], &code);\n      EXPECT_NE(0, len);\n      for (int i = 0; i < len; ++i) {\n        // Apply the desired score.\n        (*outputs)(t++, code(i)) = score;\n        if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {\n          int dups = static_cast<int>(random->UnsignedRand(3.0));\n          for (int d = 0; d < dups; ++d) {\n            // Duplicate the desired score.\n            (*outputs)(t++, code(i)) = score;\n          }\n        }\n      }\n      if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {\n        int dups = static_cast<int>(random->UnsignedRand(3.0));\n        for (int d = 0; d < dups; ++d) {\n          // Add a random number of nulls as well.\n          (*outputs)(t++, encoded_null_char_) = score;\n        }\n      }\n    }\n    return t;\n  }\n  // Generates an encoding of the given 4 arrays as synthetic network scores.\n  // uses scores1 for chars1 and scores2 for chars2, and everything else gets\n  // the leftovers shared out equally. 
Note that empty string encodes as the\n  // null_char_.\n  GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],\n                                                   const char *chars2[], const float scores2[],\n                                                   TRand *random) {\n    int width = 0;\n    while (chars1[width] != nullptr) {\n      ++width;\n    }\n    int padding = width * RecodedCharID::kMaxCodeLen;\n    int num_codes = recoder_.code_range();\n    GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);\n    int t = 0;\n    for (int i = 0; i < width; ++i) {\n      // In case there is overlap in the codes between 1st and 2nd choice, it\n      // is better to encode the 2nd choice first.\n      int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);\n      int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);\n      // Advance t to the max end, setting everything else to the leftovers.\n      int max_t = std::max(end_t1, end_t2);\n      while (t < max_t) {\n        double total_score = 0.0;\n        for (int j = 0; j < num_codes; ++j) {\n          total_score += outputs(t, j);\n        }\n        double null_remainder = (1.0 - total_score) / 2.0;\n        double remainder = null_remainder / (num_codes - 2);\n        if (outputs(t, encoded_null_char_) < null_remainder) {\n          outputs(t, encoded_null_char_) += null_remainder;\n        } else {\n          remainder += remainder;\n        }\n        for (int j = 0; j < num_codes; ++j) {\n          if (outputs(t, j) == 0.0f) {\n            outputs(t, j) = remainder;\n          }\n        }\n        ++t;\n      }\n    }\n    // Fill the rest with null chars.\n    while (t < width + padding) {\n      outputs(t++, encoded_null_char_) = 1.0f;\n    }\n    return outputs;\n  }\n  UnicharCompress recoder_;\n  int unichar_null_char_ = 0;\n  int encoded_null_char_ = 0;\n  CCUtil ccutil_;\n  Dict 
lstm_dict_;\n};\n\nTEST_F(RecodeBeamTest, DoesChinese) {\n  LOG(INFO) << \"Testing chi_tra\"\n            << \"\\n\";\n  LoadUnicharset(\"chi_tra.unicharset\");\n  // Correctly reproduce the first kNumchars characters from easy output.\n  std::vector<int> transcription;\n  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {\n    transcription.push_back(i);\n  }\n  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);\n  ExpectCorrect(outputs, transcription);\n  LOG(INFO) << \"Testing chi_sim\"\n            << \"\\n\";\n  LoadUnicharset(\"chi_sim.unicharset\");\n  // Correctly reproduce the first kNumchars characters from easy output.\n  transcription.clear();\n  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {\n    transcription.push_back(i);\n  }\n  outputs = GenerateRandomPaddedOutputs(transcription, kPadding);\n  ExpectCorrect(outputs, transcription);\n}\n\nTEST_F(RecodeBeamTest, DoesJapanese) {\n  LOG(INFO) << \"Testing jpn\"\n            << \"\\n\";\n  LoadUnicharset(\"jpn.unicharset\");\n  // Correctly reproduce the first kNumchars characters from easy output.\n  std::vector<int> transcription;\n  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {\n    transcription.push_back(i);\n  }\n  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);\n  ExpectCorrect(outputs, transcription);\n}\n\nTEST_F(RecodeBeamTest, DoesKorean) {\n  LOG(INFO) << \"Testing kor\"\n            << \"\\n\";\n  LoadUnicharset(\"kor.unicharset\");\n  // Correctly reproduce the first kNumchars characters from easy output.\n  std::vector<int> transcription;\n  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {\n    transcription.push_back(i);\n  }\n  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);\n  ExpectCorrect(outputs, transcription);\n}\n\nTEST_F(RecodeBeamTest, DoesKannada) {\n  LOG(INFO) << \"Testing kan\"\n            << 
\"\\n\";\n  LoadUnicharset(\"kan.unicharset\");\n  // Correctly reproduce the first kNumchars characters from easy output.\n  std::vector<int> transcription;\n  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {\n    transcription.push_back(i);\n  }\n  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);\n  ExpectCorrect(outputs, transcription);\n}\n\nTEST_F(RecodeBeamTest, DoesMarathi) {\n  LOG(INFO) << \"Testing mar\"\n            << \"\\n\";\n  LoadUnicharset(\"mar.unicharset\");\n  // Correctly reproduce the first kNumchars characters from easy output.\n  std::vector<int> transcription;\n  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {\n    transcription.push_back(i);\n  }\n  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);\n  ExpectCorrect(outputs, transcription);\n}\n\nTEST_F(RecodeBeamTest, DoesEnglish) {\n  LOG(INFO) << \"Testing eng\"\n            << \"\\n\";\n  LoadUnicharset(\"eng.unicharset\");\n  // Correctly reproduce the first kNumchars characters from easy output.\n  std::vector<int> transcription;\n  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {\n    transcription.push_back(i);\n  }\n  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);\n  ExpectCorrect(outputs, transcription);\n}\n\nTEST_F(RecodeBeamTest, DISABLED_EngDictionary) {\n  LOG(INFO) << \"Testing eng dictionary\"\n            << \"\\n\";\n  LoadUnicharset(\"eng_beam.unicharset\");\n  GENERIC_2D_ARRAY<float> outputs =\n      GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);\n  std::string default_str;\n  for (int i = 0; kGWRTops[i] != nullptr; ++i) {\n    default_str += kGWRTops[i];\n  }\n  PointerVector<WERD_RES> words;\n  ExpectCorrect(outputs, default_str, nullptr, &words);\n  // Now try again with the dictionary.\n  LoadDict(\"eng_beam\");\n  ExpectCorrect(outputs, \"Gets words right.\", 
&lstm_dict_, &words);\n}\n\nTEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {\n  LOG(INFO) << \"Testing zh_hans dictionary\"\n            << \"\\n\";\n  LoadUnicharset(\"zh_hans.unicharset\");\n  GENERIC_2D_ARRAY<float> outputs =\n      GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);\n  PointerVector<WERD_RES> words;\n  ExpectCorrect(outputs, \"实学储啬投学生\", nullptr, &words);\n  // Each is an individual word, with permuter = top choice.\n  EXPECT_EQ(7, words.size());\n  for (int w = 0; w < words.size(); ++w) {\n    EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter());\n  }\n  // Now try again with the dictionary.\n  LoadDict(\"zh_hans\");\n  ExpectCorrect(outputs, \"实学储啬投学生\", &lstm_dict_, &words);\n  // Number of words expected.\n  const int kNumWords = 5;\n  // Content of the words.\n  const char *kWords[kNumWords] = {\"实学\", \"储\", \"啬\", \"投\", \"学生\"};\n  // Permuters of the words.\n  const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM,\n                                     TOP_CHOICE_PERM, SYSTEM_DAWG_PERM};\n  EXPECT_EQ(kNumWords, words.size());\n  for (int w = 0; w < kNumWords && w < words.size(); ++w) {\n    EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());\n    EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter());\n  }\n}\n\n// Tests that a recoder built with decomposed unicode allows true ctc\n// arbitrary duplicates and inserted nulls inside the multicode sequence.\nTEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {\n  LOG(INFO) << \"Testing duplicates in multi-code sequences\"\n            << \"\\n\";\n  LoadUnicharset(\"vie.d.unicharset\");\n  tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);\n  TRand random;\n  GENERIC_2D_ARRAY<float> outputs =\n      GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);\n  PointerVector<WERD_RES> words;\n  std::string truth_str;\n  
tesseract::NormalizeUTF8String(tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,\n                                 tesseract::GraphemeNorm::kNone, \"vậy tội\", &truth_str);\n  ExpectCorrect(outputs, truth_str, nullptr, &words);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/rect_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"rect.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass TBOXTest : public testing::Test {\npublic:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n  void TearDown() override {}\n};\n\nTEST_F(TBOXTest, OverlapInside) {\n  TBOX a(10, 10, 20, 20);\n  TBOX b(11, 11, 12, 12);\n\n  EXPECT_TRUE(a.overlap(b));\n  EXPECT_TRUE(b.overlap(a));\n  EXPECT_DOUBLE_EQ(0.01, a.overlap_fraction(b));\n  EXPECT_DOUBLE_EQ(1.0, b.overlap_fraction(a));\n}\n\nTEST_F(TBOXTest, OverlapBoolCorners) {\n  TBOX mid(10, 10, 30, 30);\n  TBOX bottom_left(5, 5, 15, 15);\n  TBOX top_left(5, 25, 15, 35);\n  // other corners covered by symmetry\n\n  EXPECT_TRUE(mid.overlap(bottom_left));\n  EXPECT_TRUE(bottom_left.overlap(mid));\n  EXPECT_TRUE(mid.overlap(top_left));\n  EXPECT_TRUE(top_left.overlap(mid));\n}\n\nTEST_F(TBOXTest, OverlapFractionCorners) {\n  TBOX mid(10, 10, 30, 30);\n  TBOX bottom_left(5, 5, 15, 15);\n  TBOX top_left(5, 25, 15, 35);\n  // other corners covered by symmetry\n\n  EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(bottom_left));\n  EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), bottom_left.overlap_fraction(mid));\n  EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(top_left));\n  EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), top_left.overlap_fraction(mid));\n}\n\nTEST_F(TBOXTest, 
OverlapBoolSides) {\n  TBOX mid(10, 10, 30, 30);\n  TBOX left(5, 15, 15, 25);\n  TBOX bottom(15, 5, 25, 15);\n  // other sides covered by symmetry\n\n  EXPECT_TRUE(mid.overlap(left));\n  EXPECT_TRUE(left.overlap(mid));\n  EXPECT_TRUE(mid.overlap(bottom));\n  EXPECT_TRUE(bottom.overlap(mid));\n}\n\nTEST_F(TBOXTest, OverlapFractionSides) {\n  TBOX mid(10, 10, 30, 30);\n  TBOX left(5, 15, 15, 25);\n  TBOX bottom(15, 5, 25, 15);\n  // other sides covered by symmetry\n\n  EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(left));\n  EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), left.overlap_fraction(mid));\n  EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(bottom));\n  EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), bottom.overlap_fraction(mid));\n}\n\nTEST_F(TBOXTest, OverlapBoolSpan) {\n  TBOX mid(10, 10, 30, 30);\n  TBOX vertical(15, 5, 25, 35);\n  TBOX horizontal(5, 15, 35, 25);\n  // other sides covered by symmetry in other test cases\n\n  EXPECT_TRUE(mid.overlap(vertical));\n  EXPECT_TRUE(vertical.overlap(mid));\n  EXPECT_TRUE(mid.overlap(horizontal));\n  EXPECT_TRUE(horizontal.overlap(mid));\n}\n\nTEST_F(TBOXTest, OverlapFractionSpan) {\n  TBOX mid(10, 10, 30, 30);\n  TBOX vertical(15, 5, 25, 35);\n  TBOX horizontal(5, 15, 35, 25);\n  // other sides covered by symmetry in other test cases\n\n  EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0), mid.overlap_fraction(vertical));\n  EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0), vertical.overlap_fraction(mid));\n  EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(horizontal));\n  EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0), horizontal.overlap_fraction(mid));\n}\n\n// TODO(nbeato): pretty much all cases\nTEST_F(TBOXTest, OverlapOutsideTests) {\n  TBOX mid(10, 10, 30, 30);\n  TBOX left(0, 15, 5, 25);\n\n  EXPECT_FALSE(mid.overlap(left));\n  EXPECT_FALSE(left.overlap(mid));\n  EXPECT_DOUBLE_EQ(0.0, mid.overlap_fraction(left));\n  EXPECT_DOUBLE_EQ(0.0, 
left.overlap_fraction(mid));\n}\n\nTEST_F(TBOXTest, OverlapXFraction) {\n  TBOX a(10, 10, 20, 20);\n  TBOX b(12, 100, 26, 200);\n  TBOX c(0, 0, 100, 100);\n  TBOX d(0, 0, 1, 1);\n\n  EXPECT_DOUBLE_EQ(8.0 / 10.0, a.x_overlap_fraction(b));\n  EXPECT_DOUBLE_EQ(8.0 / 14.0, b.x_overlap_fraction(a));\n  EXPECT_DOUBLE_EQ(1.0, a.x_overlap_fraction(c));\n  EXPECT_DOUBLE_EQ(10.0 / 100.0, c.x_overlap_fraction(a));\n  EXPECT_DOUBLE_EQ(0.0, a.x_overlap_fraction(d));\n  EXPECT_DOUBLE_EQ(0.0, d.x_overlap_fraction(a));\n}\n\nTEST_F(TBOXTest, OverlapYFraction) {\n  TBOX a(10, 10, 20, 20);\n  TBOX b(100, 12, 200, 26);\n  TBOX c(0, 0, 100, 100);\n  TBOX d(0, 0, 1, 1);\n\n  EXPECT_DOUBLE_EQ(8.0 / 10.0, a.y_overlap_fraction(b));\n  EXPECT_DOUBLE_EQ(8.0 / 14.0, b.y_overlap_fraction(a));\n  EXPECT_DOUBLE_EQ(1.0, a.y_overlap_fraction(c));\n  EXPECT_DOUBLE_EQ(10.0 / 100.0, c.y_overlap_fraction(a));\n  EXPECT_DOUBLE_EQ(0.0, a.y_overlap_fraction(d));\n  EXPECT_DOUBLE_EQ(0.0, d.y_overlap_fraction(a));\n}\n\nTEST_F(TBOXTest, OverlapXFractionZeroSize) {\n  TBOX zero(10, 10, 10, 10);\n  TBOX big(0, 0, 100, 100);\n  TBOX small(0, 0, 1, 1);\n\n  EXPECT_DOUBLE_EQ(1.0, zero.x_overlap_fraction(big));\n  EXPECT_DOUBLE_EQ(0.0, big.x_overlap_fraction(zero));\n  EXPECT_DOUBLE_EQ(0.0, zero.x_overlap_fraction(small));\n  EXPECT_DOUBLE_EQ(0.0, small.x_overlap_fraction(zero));\n}\n\nTEST_F(TBOXTest, OverlapYFractionZeroSize) {\n  TBOX zero(10, 10, 10, 10);\n  TBOX big(0, 0, 100, 100);\n  TBOX small(0, 0, 1, 1);\n\n  EXPECT_DOUBLE_EQ(1.0, zero.y_overlap_fraction(big));\n  EXPECT_DOUBLE_EQ(0.0, big.y_overlap_fraction(zero));\n  EXPECT_DOUBLE_EQ(0.0, zero.y_overlap_fraction(small));\n  EXPECT_DOUBLE_EQ(0.0, small.y_overlap_fraction(zero));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/resultiterator_test.cc",
    "content": "\n#include <allheaders.h>\n#include <tesseract/baseapi.h>\n#include <tesseract/resultiterator.h>\n#include <string>\n#include \"scrollview.h\"\n\n#include \"include_gunit.h\"\n#include \"log.h\" // for LOG\n\nnamespace tesseract {\n\n// DEFINE_string(tess_config, \"\", \"config file for tesseract\");\n// DEFINE_bool(visual_test, false, \"Runs a visual test using scrollview\");\n\n// The fixture for testing Tesseract.\nclass ResultIteratorTest : public testing::Test {\nprotected:\n  std::string TestDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESTING_DIR, name);\n  }\n  std::string TessdataPath() {\n    return file::JoinPath(TESSDATA_DIR, \"\");\n  }\n  std::string OutputNameToPath(const std::string &name) {\n    file::MakeTmpdir();\n    return file::JoinPath(FLAGS_test_tmpdir, name);\n  }\n\n  ResultIteratorTest() {\n    src_pix_ = nullptr;\n  }\n  ~ResultIteratorTest() override = default;\n\n  void SetImage(const char *filename) {\n    src_pix_ = pixRead(TestDataNameToPath(filename).c_str());\n    api_.Init(TessdataPath().c_str(), \"eng\", tesseract::OEM_TESSERACT_ONLY);\n    //    if (!FLAGS_tess_config.empty())\n    //      api_.ReadConfigFile(FLAGS_tess_config.c_str());\n    api_.SetPageSegMode(tesseract::PSM_AUTO);\n    api_.SetImage(src_pix_);\n    src_pix_.destroy();\n    src_pix_ = api_.GetInputImage();\n  }\n\n  // Rebuilds the image using the binary images at the given level, and\n  // EXPECTs that the number of pixels in the xor of the rebuilt image with\n  // the original is at most max_diff.\n  void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator *it) {\n    it->Begin();\n    int width = pixGetWidth(src_pix_);\n    int height = pixGetHeight(src_pix_);\n    int depth = pixGetDepth(src_pix_);\n    Image pix = pixCreate(width, height, depth);\n    EXPECT_TRUE(depth == 1 || depth == 8);\n    if (depth == 8) {\n      pixSetAll(pix);\n    }\n    do {\n      int left, top, right, bottom;\n      
PageIteratorLevel im_level = level;\n      // If the return is false, it is a non-text block so get the block image.\n      if (!it->BoundingBox(level, &left, &top, &right, &bottom)) {\n        im_level = tesseract::RIL_BLOCK;\n        EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom));\n      }\n      LOG(INFO) << \"BBox: [L:\" << left << \", T:\" << top << \", R:\" << right << \", B:\" << bottom\n                << \"]\"\n                << \"\\n\";\n      Image block_pix;\n      if (depth == 1) {\n        block_pix = it->GetBinaryImage(im_level);\n        pixRasterop(pix, left, top, right - left, bottom - top, PIX_SRC ^ PIX_DST, block_pix, 0, 0);\n      } else {\n        block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top);\n        pixRasterop(pix, left, top, pixGetWidth(block_pix), pixGetHeight(block_pix),\n                    PIX_SRC & PIX_DST, block_pix, 0, 0);\n      }\n      CHECK(block_pix != nullptr);\n      block_pix.destroy();\n    } while (it->Next(level));\n    //    if (base::GetFlag(FLAGS_v) >= 1)\n    //      pixWrite(OutputNameToPath(\"rebuilt.png\").c_str(), pix, IFF_PNG);\n    pixRasterop(pix, 0, 0, width, height, PIX_SRC ^ PIX_DST, src_pix_, 0, 0);\n    if (depth == 8) {\n      Image binary_pix = pixThresholdToBinary(pix, 128);\n      pix.destroy();\n      pixInvert(binary_pix, binary_pix);\n      pix = binary_pix;\n    }\n    //    if (base::GetFlag(FLAGS_v) >= 1)\n    //      pixWrite(OutputNameToPath(\"rebuiltxor.png\").c_str(), pix, IFF_PNG);\n    l_int32 pixcount;\n    pixCountPixels(pix, &pixcount, nullptr);\n    if (pixcount > max_diff) {\n      std::string outfile = OutputNameToPath(\"failedxor.png\");\n      LOG(INFO) << \"outfile = \" << outfile << \"\\n\";\n      pixWrite(outfile.c_str(), pix, IFF_PNG);\n    }\n    pix.destroy();\n    LOG(INFO) << \"At level \" << level << \": pix diff = \" << pixcount << \"\\n\";\n    EXPECT_LE(pixcount, max_diff);\n    //    if (base::GetFlag(FLAGS_v) > 1) 
CHECK_LE(pixcount, max_diff);\n  }\n\n  // Rebuilds the text from the iterator strings at the given level, and\n  // EXPECTs that the rebuild string exactly matches the truth string.\n  void VerifyIteratorText(const std::string &truth, PageIteratorLevel level, ResultIterator *it) {\n    LOG(INFO) << \"Text Test Level \" << level << \"\\n\";\n    it->Begin();\n    std::string result;\n    do {\n      char *text = it->GetUTF8Text(level);\n      result += text;\n      delete[] text;\n      if ((level == tesseract::RIL_WORD || level == tesseract::RIL_SYMBOL) &&\n          it->IsAtFinalElement(tesseract::RIL_WORD, level)) {\n        if (it->IsAtFinalElement(tesseract::RIL_TEXTLINE, level)) {\n          result += '\\n';\n        } else {\n          result += ' ';\n        }\n        if (it->IsAtFinalElement(tesseract::RIL_PARA, level) &&\n            !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) {\n          result += '\\n';\n        }\n      }\n    } while (it->Next(level));\n    EXPECT_STREQ(truth.c_str(), result.c_str()) << \"Rebuild failed at Text Level \" << level;\n  }\n\n  void VerifyRebuilds(int block_limit, int para_limit, int line_limit, int word_limit,\n                      int symbol_limit, PageIterator *it, PageIteratorLevel maxlevel=tesseract::RIL_SYMBOL) {\n    VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it);\n    VerifyRebuild(para_limit, tesseract::RIL_PARA, it);\n    VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it);\n    VerifyRebuild(word_limit, tesseract::RIL_WORD, it);\n    if (maxlevel == tesseract::RIL_SYMBOL) {\n      VerifyRebuild(symbol_limit, maxlevel, it);\n    }\n  }\n\n  void VerifyAllText(const std::string &truth, ResultIterator *it) {\n    VerifyIteratorText(truth, tesseract::RIL_BLOCK, it);\n    VerifyIteratorText(truth, tesseract::RIL_PARA, it);\n    VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it);\n    VerifyIteratorText(truth, tesseract::RIL_WORD, it);\n    VerifyIteratorText(truth, tesseract::RIL_SYMBOL, 
it);\n  }\n\n  // Verifies that ResultIterator::CalculateTextlineOrder() produces the right\n  // results given an array of word directions (word_dirs[num_words]), an\n  // expected output reading order\n  // (expected_reading_order[num_reading_order_entries]) and a given reading\n  // context (ltr or rtl).\n  void ExpectTextlineReadingOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs,\n                                  int num_words, int *expected_reading_order,\n                                  int num_reading_order_entries) const {\n    std::vector<StrongScriptDirection> gv_word_dirs;\n    for (int i = 0; i < num_words; i++) {\n      gv_word_dirs.push_back(word_dirs[i]);\n    }\n\n    std::vector<int> calculated_order;\n    ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &calculated_order);\n    // STL vector can be used with EXPECT_EQ, so convert...\n    std::vector<int> correct_order(expected_reading_order,\n                                   expected_reading_order + num_reading_order_entries);\n    EXPECT_EQ(correct_order, calculated_order);\n  }\n\n  // Verify that ResultIterator::CalculateTextlineOrder() produces sane output\n  // for a given array of word_dirs[num_words] in ltr or rtl context.\n  // Sane means that the output contains some permutation of the indices\n  // 0..[num_words - 1] interspersed optionally with negative (marker) values.\n  void VerifySaneTextlineOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs,\n                               int num_words) const {\n    std::vector<StrongScriptDirection> gv_word_dirs;\n    for (int i = 0; i < num_words; i++) {\n      gv_word_dirs.push_back(word_dirs[i]);\n    }\n\n    std::vector<int> output;\n    ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &output);\n    ASSERT_GE(output.size(), num_words);\n    std::vector<int> output_copy(output);\n    std::sort(output_copy.begin(), output_copy.end());\n    bool sane = true;\n    
unsigned j = 0;\n    while (j < output_copy.size() && output_copy[j] < 0) {\n      j++;\n    }\n    for (int i = 0; i < num_words; i++, j++) {\n      if (output_copy[j] != i) {\n        sane = false;\n        break;\n      }\n    }\n    if (j != output_copy.size()) {\n      sane = false;\n    }\n    if (!sane) {\n      std::vector<int> empty;\n      EXPECT_EQ(output, empty) << \" permutation of 0..\" << num_words - 1 << \" not found in \"\n                               << (in_ltr_context ? \"ltr\" : \"rtl\") << \" context.\";\n    }\n  }\n\n  // Objects declared here can be used by all tests in the test case for Foo.\n  Image src_pix_; // Borrowed from api_. Do not destroy.\n  std::string ocr_text_;\n  tesseract::TessBaseAPI api_;\n};\n\n// Tests layout analysis output (and scrollview) on the UNLV page numbered\n// 8087_054.3G.tif. (Dubrovnik), but only if --visual_test is true.\n//\n// TEST_F(ResultIteratorTest, VisualTest) {\n//  if (!FLAGS_visual_test) return;\n//  const char* kIms[] = {\"8087_054.3G.tif\", \"8071_093.3B.tif\", nullptr};\n//  for (int i = 0; kIms[i] != nullptr; ++i) {\n//    SetImage(kIms[i]);\n//    // Just run layout analysis.\n//    PageIterator* it = api_.AnalyseLayout();\n//    EXPECT_FALSE(it == nullptr);\n//    // Make a scrollview window for the display.\n//    int width = pixGetWidth(src_pix_);\n//    int height = pixGetHeight(src_pix_);\n//    ScrollView* win =\n//        new ScrollView(kIms[i], 100, 100, width / 2, height / 2, width, height);\n//    win->Image(src_pix_, 0, 0);\n//    it->Begin();\n//    ScrollView::Color color = ScrollView::RED;\n//    win->Brush(ScrollView::NONE);\n//    do {\n//      Pta* pts = it->BlockPolygon();\n//      if (pts != nullptr) {\n//        win->Pen(color);\n//        int num_pts = ptaGetCount(pts);\n//        l_float32 x, y;\n//        ptaGetPt(pts, num_pts - 1, &x, &y);\n//        win->SetCursor(static_cast<int>(x), static_cast<int>(y));\n//        for (int p = 0; p < num_pts; ++p) {\n//          
ptaGetPt(pts, p, &x, &y);\n//          win->DrawTo(static_cast<int>(x), static_cast<int>(y));\n//        }\n//      }\n//      ptaDestroy(&pts);\n//    } while (it->Next(tesseract::RIL_BLOCK));\n//    win->Update();\n//    delete win->AwaitEvent(SVET_DESTROY);\n//    delete win;\n//    delete it;\n//  }\n//}\n\n// Tests that Tesseract gets exactly the right answer on phototest.\nTEST_F(ResultIteratorTest, EasyTest) {\n  SetImage(\"phototest.tif\");\n  // Just run layout analysis.\n  PageIterator *p_it = api_.AnalyseLayout();\n  EXPECT_FALSE(p_it == nullptr);\n  // Check iterator position.\n  EXPECT_TRUE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));\n  // This should be a single block.\n  EXPECT_FALSE(p_it->Next(tesseract::RIL_BLOCK));\n  EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));\n\n  // The images should rebuild almost perfectly.\n  LOG(INFO) << \"Verifying image rebuilds 1 (pageiterator)\"\n            << \"\\n\";\n  VerifyRebuilds(10, 10, 0, 0, 0, p_it);\n  delete p_it;\n\n  char *result = api_.GetUTF8Text();\n  ocr_text_ = result;\n  delete[] result;\n  ResultIterator *r_it = api_.GetIterator();\n  // The images should rebuild almost perfectly.\n  LOG(INFO) << \"Verifying image rebuilds 2a (resultiterator)\"\n            << \"\\n\";\n  VerifyRebuilds(8, 8, 0, 0, 40, r_it, tesseract::RIL_WORD);\n  // Test the text.\n  LOG(INFO) << \"Verifying text rebuilds 1 (resultiterator)\"\n            << \"\\n\";\n  VerifyAllText(ocr_text_, r_it);\n\n  // The images should rebuild almost perfectly.\n  LOG(INFO) << \"Verifying image rebuilds 2b (resultiterator)\"\n            << \"\\n\";\n  VerifyRebuilds(8, 8, 0, 0, 40, r_it, tesseract::RIL_WORD);\n\n  r_it->Begin();\n  // Test baseline of the first line.\n  int x1, y1, x2, y2;\n  r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);\n  LOG(INFO) << \"Baseline (\"\n     << x1 << ',' << y1 << \")->(\" << x2 << ',' << y2 << \")\\n\";\n  // Make sure we have a decent vector.\n  EXPECT_GE(x2, x1 + 400);\n 
 // The point 200,116 should be very close to the baseline.\n  // (x3,y3) is the vector from (x1,y1) to (200,116)\n  int x3 = 200 - x1;\n  int y3 = 116 - y1;\n  x2 -= x1;\n  y2 -= y1;\n  // The cross product (x2,y1)x(x3,y3) should be small.\n  int product = x2 * y3 - x3 * y2;\n  EXPECT_LE(abs(product), x2);\n\n  // Test font attributes for each word.\n  do {\n    float confidence = r_it->Confidence(tesseract::RIL_WORD);\n#ifndef DISABLED_LEGACY_ENGINE\n    int pointsize, font_id;\n    bool bold, italic, underlined, monospace, serif, smallcaps;\n    const char *font = r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,\n                                                &smallcaps, &pointsize, &font_id);\n    EXPECT_GE(confidence, 80.0f);\n#endif\n    char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);\n\n#ifdef DISABLED_LEGACY_ENGINE\n    LOG(INFO) << \"Word \" << word_str << \", conf \" << confidence << \"\\n\";\n#else\n    LOG(INFO) << \"Word \" << word_str << \" in font \" << font\n      << \", id \" << font_id << \", size \" << pointsize\n      << \", conf \" << confidence << \"\\n\";\n#endif // def DISABLED_LEGACY_ENGINE\n    delete[] word_str;\n#ifndef DISABLED_LEGACY_ENGINE\n    EXPECT_FALSE(bold);\n    EXPECT_FALSE(italic);\n    EXPECT_FALSE(underlined);\n    EXPECT_FALSE(monospace);\n    EXPECT_FALSE(serif);\n    // The text is about 31 pixels high.  Above we say the source is 200 ppi,\n    // which translates to:\n    // 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts\n    EXPECT_GE(pointsize, 11.16 - 1.50);\n    EXPECT_LE(pointsize, 11.16 + 1.50);\n#endif // def DISABLED_LEGACY_ENGINE\n  } while (r_it->Next(tesseract::RIL_WORD));\n  delete r_it;\n}\n\n// Tests image rebuild on the UNLV page numbered 8087_054.3B.tif. 
(Dubrovnik)\nTEST_F(ResultIteratorTest, ComplexTest) {\n  SetImage(\"8087_054.3B.tif\");\n  // Just run layout analysis.\n  PageIterator *it = api_.AnalyseLayout();\n  EXPECT_FALSE(it == nullptr);\n  // The images should rebuild almost perfectly.\n  VerifyRebuilds(2073, 2073, 2080, 2081, 2090, it);\n  delete it;\n}\n\n// Tests image rebuild on the UNLV page numbered 8087_054.3G.tif. (Dubrovnik)\nTEST_F(ResultIteratorTest, GreyTest) {\n  SetImage(\"8087_054.3G.tif\");\n  // Just run layout analysis.\n  PageIterator *it = api_.AnalyseLayout();\n  EXPECT_FALSE(it == nullptr);\n  // The images should rebuild almost perfectly.\n  VerifyRebuilds(600, 600, 600, 600, 600, it);\n  delete it;\n}\n\n// Tests that Tesseract gets smallcaps and dropcaps.\nTEST_F(ResultIteratorTest, SmallCapDropCapTest) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test as LSTM mode does not recognize smallcaps & dropcaps attributes.\n  GTEST_SKIP();\n#else\n  SetImage(\"8071_093.3B.tif\");\n  char *result = api_.GetUTF8Text();\n  delete[] result;\n  ResultIterator *r_it = api_.GetIterator();\n  // Iterate over the words.\n  int found_dropcaps = 0;\n  int found_smallcaps = 0;\n  int false_positives = 0;\n  do {\n    bool bold, italic, underlined, monospace, serif, smallcaps;\n    int pointsize, font_id;\n    r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,\n                             &pointsize, &font_id);\n    char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);\n    if (word_str != nullptr) {\n      LOG(INFO) << \"Word \" << word_str\n        << \" is \" << (smallcaps ? 
\"SMALLCAPS\" : \"Normal\") << \"\\n\";\n      if (r_it->SymbolIsDropcap()) {\n        ++found_dropcaps;\n      }\n      if (strcmp(word_str, \"SHE\") == 0 || strcmp(word_str, \"MOPED\") == 0 ||\n          strcmp(word_str, \"RALPH\") == 0 || strcmp(word_str, \"KINNEY\") == 0 || // Not working yet.\n          strcmp(word_str, \"BENNETT\") == 0) {\n        EXPECT_TRUE(smallcaps) << word_str;\n        ++found_smallcaps;\n      } else {\n        if (smallcaps) {\n          ++false_positives;\n        }\n      }\n      // No symbol other than the first of any word should be dropcap.\n      ResultIterator s_it(*r_it);\n      while (s_it.Next(tesseract::RIL_SYMBOL) && !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) {\n        if (s_it.SymbolIsDropcap()) {\n          char *sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);\n          LOG(ERROR) << \"Symbol \" << sym_str << \" of word \" << word_str << \" is dropcap\";\n          delete[] sym_str;\n        }\n        EXPECT_FALSE(s_it.SymbolIsDropcap());\n      }\n      delete[] word_str;\n    }\n  } while (r_it->Next(tesseract::RIL_WORD));\n  delete r_it;\n  EXPECT_EQ(1, found_dropcaps);\n  EXPECT_GE(4, found_smallcaps);\n  EXPECT_LE(false_positives, 3);\n#endif // DISABLED_LEGACY_ENGINE\n}\n\n#if 0\n// TODO(rays) uncomment on the next change to layout analysis.\n// CL 22736106 breaks it, but it is fixed in the change when\n// the textline finders start to collapse.\n\n// Tests that Tesseract gets subscript and superscript.\n// TODO(rays) This test is a bit feeble, due to bad textline finding on this\n// image, so beef up the test a bit when we get less false positive subs.\nTEST_F(ResultIteratorTest, SubSuperTest) {\n  SetImage(\"0146_281.3B.tif\");\n  char* result = api_.GetUTF8Text();\n  delete [] result;\n  ResultIterator* r_it = api_.GetIterator();\n  // Iterate over the symbols.\n  // Accuracy isn't great, so just count up and expect a decent count of\n  // positives and negatives.\n  const char kAllowedSupers[] = 
\"O0123456789-\";\n  int found_subs = 0;\n  int found_supers = 0;\n  int found_normal = 0;\n  do {\n    if (r_it->SymbolIsSubscript()) {\n      ++found_subs;\n    } else if (r_it->SymbolIsSuperscript()) {\n      result = r_it->GetUTF8Text(tesseract::RIL_SYMBOL);\n      if (strchr(kAllowedSupers, result[0]) == nullptr) {\n        char* word = r_it->GetUTF8Text(tesseract::RIL_WORD);\n        LOG(ERROR) << \"Char \" << result << \" in word \" << word << \" is unexpected super!\";\n        delete [] word;\n        EXPECT_TRUE(strchr(kAllowedSupers, result[0]) != nullptr);\n      }\n      delete [] result;\n      ++found_supers;\n    } else {\n      ++found_normal;\n    }\n  } while (r_it->Next(tesseract::RIL_SYMBOL));\n  delete r_it;\n  LOG(INFO) << \"Subs = \" << found_subs << \", supers= \" << found_supers\n    << \", normal = \" << found_normal << \"\\n\";\n  EXPECT_GE(found_subs, 25);\n  EXPECT_GE(found_supers, 25);\n  EXPECT_GE(found_normal, 1350);\n}\n#endif\n\nstatic const StrongScriptDirection dL = DIR_LEFT_TO_RIGHT;\nstatic const StrongScriptDirection dR = DIR_RIGHT_TO_LEFT;\nstatic const StrongScriptDirection dN = DIR_NEUTRAL;\n\n// Test that a sequence of words that could be interpreted to start from\n// the left side left-to-right or from the right side right-to-left is\n// interpreted appropriately in different contexts.\nTEST_F(ResultIteratorTest, DualStartTextlineOrderTest) {\n  const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR};\n  int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart,\n                                     0, 1, 2, 3, ResultIterator::kMinorRunEnd};\n  int reading_order_ltr_context[] = {\n      0, 1, 2, 3, 4, ResultIterator::kMinorRunStart, 7, 6, 5, ResultIterator::kMinorRunEnd};\n\n  ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context,\n                             countof(reading_order_ltr_context));\n  ExpectTextlineReadingOrder(false, word_dirs, 
countof(word_dirs), reading_order_rtl_context,\n                             countof(reading_order_rtl_context));\n}\n\n// Tests that clearly left-direction text (with no right-to-left indications)\n// comes out strictly left to right no matter the context.\nTEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) {\n  const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dN, dL, dL};\n  // The order here is just left to right, nothing fancy.\n  int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7};\n  // In the strange event that this shows up in an RTL paragraph, nonetheless\n  // just presume the whole thing is an LTR line.\n  int reading_order_rtl_context[] = {ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7,\n                                     ResultIterator::kMinorRunEnd};\n\n  ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context,\n                             countof(reading_order_ltr_context));\n  ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,\n                             countof(reading_order_rtl_context));\n}\n\n// Test that right-direction text comes out strictly right-to-left in\n// a right-to-left context.\nTEST_F(ResultIteratorTest, RightwardTextlineOrderTest) {\n  const StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR};\n  // The order here is just right-to-left, nothing fancy.\n  int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0};\n  ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,\n                             countof(reading_order_rtl_context));\n}\n\nTEST_F(ResultIteratorTest, TextlineOrderSanityCheck) {\n  // Iterate through all 7-word sequences and make sure that the output\n  // contains each of the indices 0..6 exactly once.\n  const int kNumWords(7);\n  const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations\n  StrongScriptDirection word_dirs[kNumWords];\n  for 
(int i = 0; i < kNumCombos; i++) {\n    // generate the next combination.\n    int tmp = i;\n    for (auto &word_dir : word_dirs) {\n      word_dir = static_cast<StrongScriptDirection>(tmp % 4);\n      tmp = tmp / 4;\n    }\n    VerifySaneTextlineOrder(true, word_dirs, kNumWords);\n    VerifySaneTextlineOrder(false, word_dirs, kNumWords);\n  }\n}\n\n// TODO: Missing image\nTEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) {\n  SetImage(\"5318c4b679264.jpg\");\n  char *result = api_.GetUTF8Text();\n  delete[] result;\n  ResultIterator *r_it = api_.GetIterator();\n  // Iterate over the words.\n  do {\n    char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);\n    if (word_str != nullptr) {\n      LOG(INFO) << \"Word \" << word_str << \":\\n\";\n      ResultIterator s_it = *r_it;\n      do {\n        tesseract::ChoiceIterator c_it(s_it);\n        do {\n          const char *char_str = c_it.GetUTF8Text();\n          if (char_str == nullptr) {\n            LOG(INFO) << \"Null char choice\"\n                      << \"\\n\";\n          } else {\n            LOG(INFO) << \"Char choice \" << char_str << \"\\n\";\n          }\n          CHECK(char_str != nullptr);\n        } while (c_it.Next());\n      } while (!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&\n               s_it.Next(tesseract::RIL_SYMBOL));\n      delete[] word_str;\n    }\n  } while (r_it->Next(tesseract::RIL_WORD));\n  delete r_it;\n}\n\n// TODO: Missing image\nTEST_F(ResultIteratorTest, NonNullConfidencesTest) {\n  //  SetImage(\"line6.tiff\");\n  SetImage(\"trainingitalline.tif\");\n  api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);\n  // Force recognition so we can used the result iterator.\n  // We don't care about the return from GetUTF8Text.\n  char *result = api_.GetUTF8Text();\n  delete[] result;\n  ResultIterator *r_it = api_.GetIterator();\n  // Iterate over the words.\n  do {\n    char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);\n    if (word_str != 
nullptr) {\n      EXPECT_FALSE(r_it->Empty(tesseract::RIL_WORD));\n      EXPECT_FALSE(r_it->Empty(tesseract::RIL_SYMBOL));\n      ResultIterator s_it = *r_it;\n      do {\n        const char *char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);\n        CHECK(char_str != nullptr);\n        float confidence = s_it.Confidence(tesseract::RIL_SYMBOL);\n        LOG(INFO) << \"Char \" << char_str << \" has confidence \" << confidence << \"\\n\";\n        delete[] char_str;\n      } while (!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&\n               s_it.Next(tesseract::RIL_SYMBOL));\n      delete[] word_str;\n    } else {\n      LOG(INFO) << \"Empty word found\"\n                << \"\\n\";\n    }\n  } while (r_it->Next(tesseract::RIL_WORD));\n  delete r_it;\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/scanutils_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <iostream> // for cout\n\n#include \"include_gunit.h\"\n#include \"scanutils.h\"\n\nnamespace tesseract {\n\nclass ScanutilsTest : public ::testing::Test {\nprotected:\n  void SetUp() override {}\n};\n\nTEST_F(ScanutilsTest, DoesScanf) {\n  // This test verifies that tfscanf does Scanf the same as stdio fscanf.\n  // There are probably a gazillion more test cases that could be added, but\n  // these brought the tesseract and unittest test results in line.\n  std::string filename = file::JoinPath(TESTDATA_DIR, \"scanftest.txt\");\n  FILE *fp1 = fopen(filename.c_str(), \"r\");\n  if (fp1 == nullptr) {\n    std::cout << \"Failed to open file \" << filename << '\\n';\n    GTEST_SKIP();\n  }\n  FILE *fp2 = fopen(filename.c_str(), \"r\");\n  if (fp2 == nullptr) {\n    std::cout << \"Failed to open file \" << filename << '\\n';\n    fclose(fp1);\n    GTEST_SKIP();\n  }\n  // The file contains this:\n  // 42.5 17 0.001000 -0.001000\n  // 0 1 123 -123 0x100\n  // abcdefghijklmnopqrstuvwxyz\n  // abcdefghijklmnopqrstuvwxyz\n  // MF 25 6.25e-2 0.5e5 -1e+4\n  // 42 MF 25 6.25e-2 0.5\n  // 24\n  const int kNumFloats = 4;\n  float f1[kNumFloats], f2[kNumFloats];\n  int r1 = fscanf(fp1, \"%f %f %f %f\", &f1[0], &f1[1], &f1[2], &f1[3]);\n  int r2 = tfscanf(fp2, \"%f %f %f %f\", &f2[0], &f2[1], &f2[2], &f2[3]);\n  EXPECT_EQ(r1, kNumFloats);\n  EXPECT_EQ(r2, kNumFloats);\n  
if (r1 == r2) {\n    for (int i = 0; i < r1; ++i) {\n      EXPECT_FLOAT_EQ(f1[i], f2[i]);\n    }\n  }\n  const int kNumInts = 5;\n  int i1[kNumInts], i2[kNumInts];\n  r1 = fscanf(fp1, \"%d %d %d %d %i\", &i1[0], &i1[1], &i1[2], &i1[3], &i1[4]);\n  r2 = tfscanf(fp2, \"%d %d %d %d %i\", &i2[0], &i2[1], &i2[2], &i2[3], &i2[4]);\n  EXPECT_EQ(r1, kNumInts);\n  EXPECT_EQ(r2, kNumInts);\n  if (r1 == r2) {\n    for (int i = 0; i < kNumInts; ++i) {\n      EXPECT_EQ(i1[i], i2[i]);\n    }\n  }\n  const int kStrLen = 1024;\n  char s1[kStrLen];\n  char s2[kStrLen];\n  r1 = fscanf(fp1, \"%1023s\", s1);\n  r2 = tfscanf(fp2, \"%1023s\", s2);\n  EXPECT_EQ(r1, r2);\n  EXPECT_STREQ(s1, s2);\n  EXPECT_EQ(26, strlen(s2));\n  r1 = fscanf(fp1, \"%20s\", s1);\n  r2 = tfscanf(fp2, \"%20s\", s2);\n  EXPECT_EQ(r1, r2);\n  EXPECT_STREQ(s1, s2);\n  EXPECT_EQ(20, strlen(s2));\n  // Now read the rest of the alphabet.\n  r1 = fscanf(fp1, \"%1023s\", s1);\n  r2 = tfscanf(fp2, \"%1023s\", s2);\n  EXPECT_EQ(r1, r2);\n  EXPECT_STREQ(s1, s2);\n  EXPECT_EQ(6, strlen(s2));\n  r1 = fscanf(fp1, \"%1023s\", s1);\n  r2 = tfscanf(fp2, \"%1023s\", s2);\n  EXPECT_EQ(r1, r2);\n  EXPECT_STREQ(s1, s2);\n  EXPECT_EQ(2, strlen(s2));\n  r1 = fscanf(fp1, \"%f %f %f %f\", &f1[0], &f1[1], &f1[2], &f1[3]);\n  r2 = tfscanf(fp2, \"%f %f %f %f\", &f2[0], &f2[1], &f2[2], &f2[3]);\n  EXPECT_EQ(r1, r2);\n  for (int i = 0; i < kNumFloats; ++i) {\n    EXPECT_FLOAT_EQ(f1[i], f2[i]);\n  }\n  // Test the * for field suppression.\n  r1 = fscanf(fp1, \"%d %*s %*d %*f %*f\", &i1[0]);\n  r2 = tfscanf(fp2, \"%d %*s %*d %*f %*f\", &i2[0]);\n  EXPECT_EQ(r1, r2);\n  EXPECT_EQ(i1[0], i2[0]);\n  // We should still see the next value and no phantoms.\n  r1 = fscanf(fp1, \"%d %1023s\", &i1[0], s1);\n  r2 = tfscanf(fp2, \"%d %1023s\", &i2[0], s2);\n  EXPECT_EQ(r1, r2);\n  EXPECT_EQ(1, r2);\n  EXPECT_EQ(i1[0], i2[0]);\n  fclose(fp2);\n  fclose(fp1);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/shapetable_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <string>\n#include <utility>\n\n#include \"include_gunit.h\"\n\n#include \"serialis.h\"\n#include \"shapetable.h\"\n#include \"unicharset.h\"\n\nnamespace tesseract {\n\n#ifndef DISABLED_LEGACY_ENGINE\n\nstatic std::string TmpNameToPath(const std::string &name) {\n  return file::JoinPath(FLAGS_test_tmpdir, name);\n}\n\n// Sets up a simple shape with some unichars.\nstatic void Setup352(int font_id, Shape *shape) {\n  shape->AddToShape(3, font_id);\n  shape->AddToShape(5, font_id);\n  shape->AddToShape(2, font_id);\n}\n\n// Verifies some properties of the 352 shape.\nstatic void Expect352(int font_id, const Shape &shape) {\n  EXPECT_EQ(3, shape.size());\n  EXPECT_TRUE(shape.ContainsUnichar(2));\n  EXPECT_TRUE(shape.ContainsUnichar(3));\n  EXPECT_TRUE(shape.ContainsUnichar(5));\n  EXPECT_FALSE(shape.ContainsUnichar(1));\n  EXPECT_TRUE(shape.ContainsUnicharAndFont(2, font_id));\n  EXPECT_FALSE(shape.ContainsUnicharAndFont(2, font_id - 1));\n  EXPECT_FALSE(shape.ContainsUnicharAndFont(font_id, 2));\n  // It should be a subset of itself.\n  EXPECT_TRUE(shape.IsSubsetOf(shape));\n}\n\n#endif\n\n// The fixture for testing Shape.\nclass ShapeTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n};\n\n// Tests that a Shape works as expected for all the basic 
functions.\nTEST_F(ShapeTest, BasicTest) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because Shape is missing.\n  GTEST_SKIP();\n#else\n  Shape shape1;\n  EXPECT_EQ(0, shape1.size());\n  Setup352(101, &shape1);\n  Expect352(101, shape1);\n  // It should still work after file I/O.\n  std::string filename = TmpNameToPath(\"shapefile\");\n  FILE *fp = fopen(filename.c_str(), \"wb\");\n  ASSERT_TRUE(fp != nullptr);\n  EXPECT_TRUE(shape1.Serialize(fp));\n  fclose(fp);\n  TFile tfp;\n  EXPECT_TRUE(tfp.Open(filename.c_str(), nullptr));\n  Shape shape2;\n  EXPECT_TRUE(shape2.DeSerialize(&tfp));\n  Expect352(101, shape2);\n  // They should be subsets of each other.\n  EXPECT_TRUE(shape1.IsSubsetOf(shape2));\n  EXPECT_TRUE(shape2.IsSubsetOf(shape1));\n  // They should be equal unichars.\n  EXPECT_TRUE(shape1.IsEqualUnichars(&shape2));\n  // and still pass afterwards.\n  Expect352(101, shape1);\n  Expect352(101, shape2);\n#endif\n}\n\n// Tests AddShape separately, as it takes quite a bit of work.\nTEST_F(ShapeTest, AddShapeTest) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because Shape is missing.\n  GTEST_SKIP();\n#else\n  Shape shape1;\n  Setup352(101, &shape1);\n  Expect352(101, shape1);\n  // Now setup a different shape with different content.\n  Shape shape2;\n  shape2.AddToShape(3, 101); // Duplicates shape1.\n  shape2.AddToShape(5, 110); // Different font to shape1.\n  shape2.AddToShape(7, 101); // Different unichar to shape1.\n  // They should NOT be subsets of each other.\n  EXPECT_FALSE(shape1.IsSubsetOf(shape2));\n  EXPECT_FALSE(shape2.IsSubsetOf(shape1));\n  // Now add shape2 to shape1.\n  shape1.AddShape(shape2);\n  // Test subsets again.\n  EXPECT_FALSE(shape1.IsSubsetOf(shape2));\n  EXPECT_TRUE(shape2.IsSubsetOf(shape1));\n  EXPECT_EQ(4, shape1.size());\n  EXPECT_FALSE(shape1.ContainsUnichar(1));\n  EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 101));\n  EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 110));\n  
EXPECT_FALSE(shape1.ContainsUnicharAndFont(3, 110));\n  EXPECT_FALSE(shape1.ContainsUnicharAndFont(7, 110));\n  EXPECT_FALSE(shape1.IsEqualUnichars(&shape2));\n#endif\n}\n\n// The fixture for testing Shape.\nclass ShapeTableTest : public testing::Test {};\n\n// Tests that a Shape works as expected for all the basic functions.\nTEST_F(ShapeTableTest, FullTest) {\n#ifdef DISABLED_LEGACY_ENGINE\n  // Skip test because Shape is missing.\n  GTEST_SKIP();\n#else\n  Shape shape1;\n  Setup352(101, &shape1);\n  // Build a shape table with the same data, but in separate shapes.\n  UNICHARSET unicharset;\n  unicharset.unichar_insert(\" \");\n  for (int i = 1; i <= 10; ++i) {\n    char class_str[20];\n    snprintf(class_str, sizeof(class_str), \"class%d\", i);\n    unicharset.unichar_insert(class_str);\n  }\n  ShapeTable st(unicharset);\n  EXPECT_EQ(0, st.AddShape(3, 101));\n  EXPECT_EQ(1, st.AddShape(5, 101));\n  EXPECT_EQ(2, st.AddShape(2, 101));\n  EXPECT_EQ(3, st.NumShapes());\n  Expect352(101, shape1);\n  EXPECT_EQ(3, st.AddShape(shape1));\n  for (int i = 0; i < 3; ++i) {\n    EXPECT_FALSE(st.MutableShape(i)->IsEqualUnichars(&shape1));\n  }\n  EXPECT_TRUE(st.MutableShape(3)->IsEqualUnichars(&shape1));\n  EXPECT_TRUE(st.AnyMultipleUnichars());\n  st.DeleteShape(3);\n  EXPECT_FALSE(st.AnyMultipleUnichars());\n\n  // Now merge to make a single shape like shape1.\n  EXPECT_EQ(1, st.MasterUnicharCount(0));\n  st.MergeShapes(0, 1);\n  EXPECT_EQ(3, st.MergedUnicharCount(1, 2));\n  st.MergeShapes(1, 2);\n  for (int i = 0; i < 3; ++i) {\n    EXPECT_EQ(3, st.MasterUnicharCount(i));\n    // Master font count is the sum of all the font counts in the shape, not\n    // the actual number of different fonts in the shape.\n    EXPECT_EQ(3, st.MasterFontCount(i));\n  }\n  EXPECT_EQ(0, st.MasterDestinationIndex(1));\n  EXPECT_EQ(0, st.MasterDestinationIndex(2));\n  ShapeTable st2;\n  st2.AppendMasterShapes(st, nullptr);\n  EXPECT_EQ(1, st.NumMasterShapes());\n  EXPECT_EQ(1, 
st2.NumShapes());\n  EXPECT_TRUE(st2.MutableShape(0)->IsEqualUnichars(&shape1));\n  EXPECT_TRUE(st2.AnyMultipleUnichars());\n#endif\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/stats_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"kdpair.h\"\n#include \"statistc.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nconst int kTestData[] = {2, 0, 12, 1, 1, 2, 10, 1, 0, 0, 0, 2, 0, 4, 1, 1};\n\nclass STATSTest : public testing::Test {\npublic:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    stats_.set_range(0, 15);\n    for (size_t i = 0; i < countof(kTestData); ++i) {\n      stats_.add(i, kTestData[i]);\n    }\n  }\n\n  void TearDown() override {}\n\n  STATS stats_;\n};\n\n// Tests some basic numbers from the stats_.\nTEST_F(STATSTest, BasicStats) {\n  EXPECT_EQ(37, stats_.get_total());\n  EXPECT_EQ(2, stats_.mode());\n  EXPECT_EQ(12, stats_.pile_count(2));\n}\n\nTEST_F(STATSTest, InitStats) {\n  STATS stats;\n  EXPECT_EQ(0, stats.get_total());\n  EXPECT_EQ(0, stats.mode());\n  EXPECT_EQ(0, stats.pile_count(2));\n}\n\n// Tests the top_n_modes function.\nTEST_F(STATSTest, TopNModes) {\n  std::vector<tesseract::KDPairInc<float, int> > modes;\n  int num_modes = stats_.top_n_modes(3, modes);\n  EXPECT_EQ(3, num_modes);\n  // Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14.\n  EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key());\n  EXPECT_EQ(14, modes[0].data());\n  // Mode 1 is 2 10 1 = 13 total count with a mean of 5 12/13.\n  EXPECT_FLOAT_EQ(5.0f + 12.0f / 13, modes[1].key());\n  EXPECT_EQ(13, modes[1].data());\n  // Mode 2 is 4 1 1 = 6 
total count with a mean of 13.5.\n  EXPECT_FLOAT_EQ(13.5f, modes[2].key());\n  EXPECT_EQ(6, modes[2].data());\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/stridemap_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifdef INCLUDE_TENSORFLOW\n#  include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D\n#else\n#  include <array> // std::array\n#endif\n#include \"include_gunit.h\"\n#include \"stridemap.h\"\n\nnamespace tesseract {\n\n#if !defined(INCLUDE_TENSORFLOW) && 0\nnamespace xla {\n\ntemplate <typename T>\nclass Array2D : public std::vector<T> {\npublic:\n  Array2D() : std::vector<T>(std::vector<int64_t>{0, 0}) {}\n\n  Array2D(const int64_t n1, const int64_t n2) : std::vector<T>(std::vector<int64_t>{n1, n2}) {}\n\n  Array2D(const int64_t n1, const int64_t n2, const T value) : std::vector<T>({n1, n2}, value) {}\n};\n} // namespace xla\n#endif\n\nclass StridemapTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n#ifdef INCLUDE_TENSORFLOW\n  // Sets up an Array2d object of the given size, initialized to increasing\n  // values starting with start.\n  std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) {\n    std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize));\n    int value = start;\n    for (int y = 0; y < ysize; ++y) {\n      for (int x = 0; x < xsize; ++x) {\n#  ifdef INCLUDE_TENSORFLOW\n        (*a)(y, x) = value++;\n#  else\n        a[y][x] = value++;\n#  endif\n      }\n    }\n    return a;\n  }\n#endif\n};\n\nTEST_F(StridemapTest, Indexing) 
{\n  // This test verifies that with a batch of arrays of different sizes, the\n  // iteration index each of them in turn, without going out of bounds.\n#ifdef INCLUDE_TENSORFLOW\n  std::vector<std::unique_ptr<xla::Array2D<int>>> arrays;\n  arrays.push_back(SetupArray(3, 4, 0));\n  arrays.push_back(SetupArray(4, 5, 12));\n  arrays.push_back(SetupArray(4, 4, 32));\n  arrays.push_back(SetupArray(3, 5, 48));\n  std::vector<std::pair<int, int>> h_w_sizes;\n  for (size_t i = 0; i < arrays.size(); ++i) {\n    h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width());\n  }\n  StrideMap stride_map;\n  stride_map.SetStride(h_w_sizes);\n  StrideMap::Index index(stride_map);\n  int pos = 0;\n  do {\n    EXPECT_GE(index.t(), pos);\n    EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),\n              pos);\n    EXPECT_EQ(index.IsLast(FD_BATCH), index.index(FD_BATCH) == arrays.size() - 1);\n    EXPECT_EQ(index.IsLast(FD_HEIGHT),\n              index.index(FD_HEIGHT) == arrays[index.index(FD_BATCH)]->height() - 1);\n    EXPECT_EQ(index.IsLast(FD_WIDTH),\n              index.index(FD_WIDTH) == arrays[index.index(FD_BATCH)]->width() - 1);\n    EXPECT_TRUE(index.IsValid());\n    ++pos;\n  } while (index.Increment());\n  LOG(INFO) << \"pos=\" << pos;\n  index.InitToLast();\n  do {\n    --pos;\n    EXPECT_GE(index.t(), pos);\n    EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),\n              pos);\n    StrideMap::Index copy(index);\n    // Since a change in batch index changes the height and width, it isn't\n    // necessarily true that the position is still valid, even when changing\n    // to another valid batch index.\n    if (index.IsLast(FD_BATCH)) {\n      EXPECT_FALSE(copy.AddOffset(1, FD_BATCH));\n    }\n    copy = index;\n    EXPECT_EQ(index.IsLast(FD_HEIGHT), !copy.AddOffset(1, FD_HEIGHT));\n    copy = index;\n    EXPECT_EQ(index.IsLast(FD_WIDTH), !copy.AddOffset(1, 
FD_WIDTH));\n    copy = index;\n    if (index.index(FD_BATCH) == 0) {\n      EXPECT_FALSE(copy.AddOffset(-1, FD_BATCH));\n    }\n    copy = index;\n    EXPECT_EQ(index.index(FD_HEIGHT) == 0, !copy.AddOffset(-1, FD_HEIGHT));\n    copy = index;\n    EXPECT_EQ(index.index(FD_WIDTH) == 0, !copy.AddOffset(-1, FD_WIDTH));\n    copy = index;\n    EXPECT_FALSE(copy.AddOffset(10, FD_WIDTH));\n    copy = index;\n    EXPECT_FALSE(copy.AddOffset(-10, FD_HEIGHT));\n    EXPECT_TRUE(index.IsValid());\n  } while (index.Decrement());\n#else\n  LOG(INFO) << \"Skip test because of missing xla::Array2D\";\n  GTEST_SKIP();\n#endif\n}\n\nTEST_F(StridemapTest, Scaling) {\n  // This test verifies that with a batch of arrays of different sizes, the\n  // scaling/reduction functions work as expected.\n#ifdef INCLUDE_TENSORFLOW\n  std::vector<std::unique_ptr<xla::Array2D<int>>> arrays;\n  arrays.push_back(SetupArray(3, 4, 0));  // 0-11\n  arrays.push_back(SetupArray(4, 5, 12)); // 12-31\n  arrays.push_back(SetupArray(4, 4, 32)); // 32-47\n  arrays.push_back(SetupArray(3, 5, 48)); // 48-62\n  std::vector<std::pair<int, int>> h_w_sizes;\n  for (size_t i = 0; i < arrays.size(); ++i) {\n    h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width());\n  }\n  StrideMap stride_map;\n  stride_map.SetStride(h_w_sizes);\n\n  // Scale x by 2, keeping y the same.\n  std::vector<int> values_x2 = {0,  1,  4,  5,  8,  9,  12, 13, 17, 18, 22, 23, 27, 28,\n                                32, 33, 36, 37, 40, 41, 44, 45, 48, 49, 53, 54, 58, 59};\n  StrideMap test_map(stride_map);\n  test_map.ScaleXY(2, 1);\n  StrideMap::Index index(test_map);\n  int pos = 0;\n  do {\n    int expected_value = values_x2[pos++];\n    EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),\n              expected_value);\n  } while (index.Increment());\n  EXPECT_EQ(pos, values_x2.size());\n\n  test_map = stride_map;\n  // Scale y by 2, keeping x the same.\n  std::vector<int> 
values_y2 = {0,  1,  2,  3,  12, 13, 14, 15, 16, 17, 18, 19, 20, 21,\n                                32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52};\n  test_map.ScaleXY(1, 2);\n  index.InitToFirst();\n  pos = 0;\n  do {\n    int expected_value = values_y2[pos++];\n    EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),\n              expected_value);\n  } while (index.Increment());\n  EXPECT_EQ(pos, values_y2.size());\n\n  test_map = stride_map;\n  // Scale x and y by 2.\n  std::vector<int> values_xy2 = {0, 1, 12, 13, 17, 18, 32, 33, 36, 37, 48, 49};\n  test_map.ScaleXY(2, 2);\n  index.InitToFirst();\n  pos = 0;\n  do {\n    int expected_value = values_xy2[pos++];\n    EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),\n              expected_value);\n  } while (index.Increment());\n  EXPECT_EQ(pos, values_xy2.size());\n\n  test_map = stride_map;\n  // Reduce Width to 1.\n  std::vector<int> values_x_to_1 = {0, 4, 8, 12, 17, 22, 27, 32, 36, 40, 44, 48, 53, 58};\n  test_map.ReduceWidthTo1();\n  index.InitToFirst();\n  pos = 0;\n  do {\n    int expected_value = values_x_to_1[pos++];\n    EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),\n              expected_value);\n  } while (index.Increment());\n  EXPECT_EQ(pos, values_x_to_1.size());\n#else\n  LOG(INFO) << \"Skip test because of missing xla::Array2D\";\n  GTEST_SKIP();\n#endif\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/stringrenderer_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n\n#include \"boxchar.h\"\n#include \"boxread.h\"\n#include \"commandlineflags.h\"\n#include \"stringrenderer.h\"\n\n#include <allheaders.h>\n\n#include <memory>\n#include <string>\n\nBOOL_PARAM_FLAG(display, false, \"Display image for inspection\");\n\nnamespace tesseract {\n\nconst char kEngText[] = \"the quick brown fox jumps over the lazy dog\";\nconst char kHinText[] = \"पिताने विवाह की | हो गई उद्विग्न वह सोचा\";\n\nconst char kKorText[] = \"이는 것으로 다시 넣을 1234 수는 있지만 선택의 의미는\";\nconst char kArabicText[] =\n    \"والفكر والصراع ، بالتأمل والفهم والتحليل ، \"\n    \"بالعلم والفن ، وأخيرا بالضحك أوبالبكاء ، \";\nconst char kMixedText[] = \"والفكر 123 والصراع abc\";\n\nconst char kEngNonLigatureText[] = \"fidelity\";\n// Same as kEngNonLigatureText, but with \"fi\" replaced with its ligature.\nconst char kEngLigatureText[] = \"ﬁdelity\";\n\nstatic PangoFontMap *font_map;\n\nclass StringRendererTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    if (!font_map) {\n      font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);\n    }\n    pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));\n  }\n\n  static void SetUpTestCase() {\n    static std::locale system_locale(\"\");\n    std::locale::global(system_locale);\n\n    l_chooseDisplayProg(L_DISPLAY_WITH_XZGV);\n    FLAGS_fonts_dir = 
TESTING_DIR;\n    FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;\n    file::MakeTmpdir();\n    PangoFontInfo::SoftInitFontConfig(); // init early\n  }\n\n  void DisplayClusterBoxes(Image pix) {\n    if (!FLAGS_display) {\n      return;\n    }\n    const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();\n    Boxa *boxes = boxaCreate(0);\n    for (const auto &boxchar : boxchars) {\n      if (boxchar->box()) {\n        boxaAddBox(boxes, const_cast<Box *>(boxchar->box()), L_CLONE);\n      }\n    }\n    Image box_pix = pixDrawBoxaRandom(pix, boxes, 1);\n    boxaDestroy(&boxes);\n    pixDisplay(box_pix, 0, 0);\n    box_pix.destroy();\n  }\n  std::unique_ptr<StringRenderer> renderer_;\n};\n\nTEST_F(StringRendererTest, DoesRenderToImage) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n\n  renderer_ = std::make_unique<StringRenderer>(\"UnBatang 10\", 600, 600);\n  EXPECT_EQ(strlen(kKorText), renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n\n  renderer_ = std::make_unique<StringRenderer>(\"Lohit Hindi 10\", 600, 600);\n  EXPECT_EQ(strlen(kHinText), renderer_->RenderToImage(kHinText, strlen(kHinText), &pix));\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n\n  // RTL text\n  renderer_ = std::make_unique<StringRenderer>(\"Arab 10\", 600, 600);\n  EXPECT_EQ(strlen(kArabicText), renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n\n  // Mixed direction Arabic + english text\n  renderer_ = 
std::make_unique<StringRenderer>(\"Arab 10\", 600, 600);\n  EXPECT_EQ(strlen(kMixedText), renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n}\n\nTEST_F(StringRendererTest, DoesRenderToImageWithUnderline) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  // Underline all words but NOT intervening spaces.\n  renderer_->set_underline_start_prob(1.0);\n  renderer_->set_underline_continuation_prob(0);\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n  renderer_->ClearBoxes();\n\n  // Underline all words AND intervening spaces.\n  renderer_->set_underline_start_prob(1.0);\n  renderer_->set_underline_continuation_prob(1.0);\n  EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n  renderer_->ClearBoxes();\n\n  // Underline words and intervening spaces with 0.5 prob.\n  renderer_->set_underline_start_prob(0.5);\n  renderer_->set_underline_continuation_prob(0.5);\n  EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n}\n\nTEST_F(StringRendererTest, DoesHandleNewlineCharacters) {\n  const char kRawText[] = \"\\n\\n\\n A \\nB \\nC \\n\\n\\n\";\n  const char kStrippedText[] = \" A B C \"; // text with newline chars removed\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kRawText), renderer_->RenderToImage(kRawText, 
strlen(kRawText), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();\n  // 3 characters + 4 spaces => 7 boxes\n  EXPECT_EQ(7, boxchars.size());\n  if (boxchars.size() == 7) {\n    // Verify the text content of the boxchars\n    for (size_t i = 0; i < boxchars.size(); ++i) {\n      EXPECT_EQ(std::string(1, kStrippedText[i]), boxchars[i]->ch());\n    }\n  }\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n}\n\nTEST_F(StringRendererTest, DoesRenderLigatures) {\n  renderer_ = std::make_unique<StringRenderer>(\"Arab 12\", 600, 250);\n  const char kArabicLigature[] = \"لا\";\n\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kArabicLigature),\n            renderer_->RenderToImage(kArabicLigature, strlen(kArabicLigature), &pix));\n  EXPECT_TRUE(pix != nullptr);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  const std::vector<BoxChar *> &boxes = renderer_->GetBoxes();\n  EXPECT_EQ(1, boxes.size());\n  EXPECT_TRUE(boxes[0]->box() != nullptr);\n  EXPECT_STREQ(kArabicLigature, boxes[0]->ch().c_str());\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n\n  renderer_ = std::make_unique<StringRenderer>(\"Arab 12\", 600, 250);\n  const char kArabicMixedText[] = \"والفكر والصراع 1234,\\nوالفكر لا والصراع\";\n  renderer_->RenderToImage(kArabicMixedText, strlen(kArabicMixedText), &pix);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n}\n\nstatic int FindBoxCharXCoord(const std::vector<BoxChar *> &boxchars, const std::string &ch) {\n  for (const auto &boxchar : boxchars) {\n    if (boxchar->ch() == ch) {\n      return boxchar->box()->x;\n    }\n  }\n  return INT_MAX;\n}\n\nTEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) {\n  renderer_ = std::make_unique<StringRenderer>(\"Arab 10\", 600, 600);\n  Image pix = nullptr;\n  // Arabic letters should be in decreasing x-coordinates\n  const char kArabicWord[] = \"\\u0644\\u0627\\u0641\\u0643\\u0631\";\n  const std::string kRevWord = \"\\u0631\\u0643\\u0641\\u0627\\u0644\";\n  
renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix);\n  std::string boxes_str = renderer_->GetBoxesStr();\n  // Decode to get the box text strings.\n  EXPECT_FALSE(boxes_str.empty());\n  std::vector<std::string> texts;\n  EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts, nullptr, nullptr));\n  std::string ltr_str;\n  for (auto &text : texts) {\n    ltr_str += text.c_str();\n  }\n  // The string should come out perfectly reversed, despite there being a\n  // ligature.\n  EXPECT_EQ(ltr_str, kRevWord);\n  // Just to prove there was a ligature, the number of texts is less than the\n  // number of unicodes.\n  EXPECT_LT(texts.size(), 5);\n  pix.destroy();\n}\n\nTEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) {\n  renderer_ = std::make_unique<StringRenderer>(\"Arab 10\", 600, 600);\n  Image pix = nullptr;\n  // Arabic letters should be in decreasing x-coordinates\n  const char kArabicWord[] = \"والفكر\";\n  renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix);\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();\n  for (size_t i = 1; i < boxchars.size(); ++i) {\n    EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) << boxchars[i - 1]->ch();\n  }\n  pix.destroy();\n\n  // English letters should be in increasing x-coordinates\n  const char kEnglishWord[] = \"Google\";\n  renderer_->ClearBoxes();\n  renderer_->RenderToImage(kEnglishWord, strlen(kEnglishWord), &pix);\n  EXPECT_EQ(boxchars.size(), strlen(kEnglishWord));\n  for (size_t i = 1; i < boxchars.size(); ++i) {\n    EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) << boxchars[i - 1]->ch();\n  }\n  pix.destroy();\n\n  // Mixed text should satisfy both.\n  renderer_->ClearBoxes();\n  renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix);\n  EXPECT_LT(FindBoxCharXCoord(boxchars, \"a\"), FindBoxCharXCoord(boxchars, \"b\"));\n  EXPECT_LT(FindBoxCharXCoord(boxchars, 
\"1\"), FindBoxCharXCoord(boxchars, \"2\"));\n  EXPECT_GT(FindBoxCharXCoord(boxchars, \"و\"), FindBoxCharXCoord(boxchars, \"ر\"));\n  pix.destroy();\n}\n\nTEST_F(StringRendererTest, DoesRenderVerticalText) {\n  Image pix = nullptr;\n  renderer_ = std::make_unique<StringRenderer>(\"UnBatang 10\", 600, 600);\n  renderer_->set_vertical_text(true);\n  EXPECT_EQ(strlen(kKorText), renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  DisplayClusterBoxes(pix);\n  pix.destroy();\n}\n\n// Checks that we preserve charboxes across RenderToImage calls, with\n// appropriate page numbers.\nTEST_F(StringRendererTest, DoesKeepAllImageBoxes) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  Image pix = nullptr;\n  int num_boxes_per_page = 0;\n  const int kNumTrials = 2;\n  for (int i = 0; i < kNumTrials; ++i) {\n    EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n    EXPECT_TRUE(pix != nullptr);\n    pix.destroy();\n    EXPECT_GT(renderer_->GetBoxes().size(), 0);\n    if (!num_boxes_per_page) {\n      num_boxes_per_page = renderer_->GetBoxes().size();\n    } else {\n      EXPECT_EQ((i + 1) * num_boxes_per_page, renderer_->GetBoxes().size());\n    }\n    for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page; ++j) {\n      EXPECT_EQ(i, renderer_->GetBoxes()[j]->page());\n    }\n  }\n}\n\nTEST_F(StringRendererTest, DoesClearBoxes) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n  pix.destroy();\n  EXPECT_GT(renderer_->GetBoxes().size(), 0);\n  const int num_boxes_per_page = renderer_->GetBoxes().size();\n\n  renderer_->ClearBoxes();\n  EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n  pix.destroy();\n  EXPECT_EQ(num_boxes_per_page, 
renderer_->GetBoxes().size());\n}\n\nTEST_F(StringRendererTest, DoesLigatureTextForRendering) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  renderer_->set_add_ligatures(true);\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kEngNonLigatureText),\n            renderer_->RenderToImage(kEngNonLigatureText, strlen(kEngNonLigatureText), &pix));\n  pix.destroy();\n#if 0 // not with NFC normalization\n  // There should be one less box than letters due to the 'fi' ligature.\n  EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size());\n  // The output box text should be ligatured.\n  EXPECT_STREQ(\"ﬁ\", renderer_->GetBoxes()[0]->ch().c_str());\n#endif\n}\n\nTEST_F(StringRendererTest, DoesRetainInputLigatureForRendering) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kEngLigatureText),\n            renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText), &pix));\n  pix.destroy();\n  // There should be one less box than letters due to the 'fi' ligature.\n  EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size());\n  // The output box text should be ligatured.\n  EXPECT_STREQ(\"\\uFB01\", renderer_->GetBoxes()[0]->ch().c_str());\n}\n\nTEST_F(StringRendererTest, DoesStripUnrenderableWords) {\n  // Verdana should only be able to render the english letters and numbers in\n  // the mixed text.\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  std::string text(kMixedText);\n  EXPECT_GT(renderer_->StripUnrenderableWords(&text), 0);\n  EXPECT_EQ(\" 123  abc\", text);\n}\n\nTEST_F(StringRendererTest, DoesRenderWordBoxes) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  renderer_->set_output_word_boxes(true);\n  Image pix = nullptr;\n  EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));\n  pix.destroy();\n  // Verify #boxchars = #words + 
#spaces\n  std::vector<std::string> words = split(kEngText, ' ');\n  const int kNumSpaces = words.size() - 1;\n  const int kExpectedNumBoxes = words.size() + kNumSpaces;\n  const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();\n  EXPECT_EQ(kExpectedNumBoxes, boxchars.size());\n  // Verify content of words and spaces\n  for (size_t i = 0; i < boxchars.size(); i += 2) {\n    EXPECT_EQ(words[i / 2], boxchars[i]->ch());\n    if (i < boxchars.size() - 1) {\n      EXPECT_EQ(\" \", boxchars[i + 1]->ch());\n      EXPECT_TRUE(boxchars[i + 1]->box() == nullptr);\n    }\n  }\n}\n\nTEST_F(StringRendererTest, DoesRenderWordBoxesFromMultiLineText) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 600, 600);\n  renderer_->set_output_word_boxes(true);\n  Image pix = nullptr;\n  const char kMultlineText[] = \"the quick brown fox\\njumps over the lazy dog\";\n  EXPECT_EQ(strlen(kMultlineText), renderer_->RenderToImage(kMultlineText, strlen(kMultlineText), &pix));\n  pix.destroy();\n  // Verify #boxchars = #words + #spaces + #newlines\n  std::vector<std::string> words;\n  for (auto &line : split(kMultlineText, '\\n')) {\n    for (auto &word : split(line, ' ')) {\n      words.push_back(word);\n    }\n  }\n  const int kNumSeparators = words.size() - 1;\n  const int kExpectedNumBoxes = words.size() + kNumSeparators;\n  const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();\n  EXPECT_EQ(kExpectedNumBoxes, boxchars.size());\n  // Verify content of words and spaces\n  for (size_t i = 0; i < boxchars.size(); i += 2) {\n    EXPECT_EQ(words[i / 2], boxchars[i]->ch());\n    if (i + 1 < boxchars.size()) {\n      EXPECT_EQ(\" \", boxchars[i + 1]->ch());\n      EXPECT_TRUE(boxchars[i + 1]->box() == nullptr);\n    }\n  }\n}\n\nTEST_F(StringRendererTest, DoesRenderAllFontsToImage) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 1200, 1200);\n  size_t offset = 0;\n  std::string font_used;\n  do {\n    Image pix = nullptr;\n    font_used.clear();\n    
offset += renderer_->RenderAllFontsToImage(1.0, kEngText + offset, strlen(kEngText + offset),\n                                               &font_used, &pix);\n    if (offset < strlen(kEngText)) {\n      EXPECT_TRUE(pix != nullptr);\n      EXPECT_STRNE(\"\", font_used.c_str());\n    }\n    if (FLAGS_display) {\n      pixDisplay(pix, 0, 0);\n    }\n    pix.destroy();\n  } while (offset < strlen(kEngText));\n}\n\nTEST_F(StringRendererTest, DoesNotRenderWordJoiner) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 500, 200);\n  const std::string word = \"A- -B C-D A BC\";\n  const std::string joined_word = StringRenderer::InsertWordJoiners(word);\n  Image pix = nullptr;\n  renderer_->RenderToImage(joined_word.c_str(), joined_word.length(), &pix);\n  pix.destroy();\n  const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();\n  const std::string kWordJoinerUTF8 = \"\\u2060\";\n  ASSERT_EQ(word.length(), boxchars.size());\n  for (size_t i = 0; i < boxchars.size(); ++i) {\n    EXPECT_NE(kWordJoinerUTF8, boxchars[i]->ch());\n    EXPECT_EQ(word.substr(i, 1), boxchars[i]->ch());\n  }\n}\n\nTEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) {\n  renderer_ = std::make_unique<StringRenderer>(\"Verdana 10\", 500, 200);\n  renderer_->set_drop_uncovered_chars(true);\n  const std::string kWord = \"oﬀice\";\n  const std::string kCleanWord = \"oice\";\n  Image pix = nullptr;\n  EXPECT_FALSE(renderer_->font().CanRenderString(kWord.c_str(), kWord.length()));\n  EXPECT_FALSE(renderer_->font().CoversUTF8Text(kWord.c_str(), kWord.length()));\n  int offset = renderer_->RenderToImage(kWord.c_str(), kWord.length(), &pix);\n  pix.destroy();\n  const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();\n  EXPECT_EQ(kWord.length(), offset);\n  ASSERT_EQ(kCleanWord.length(), boxchars.size());\n  for (size_t i = 0; i < boxchars.size(); ++i) {\n    EXPECT_EQ(kCleanWord.substr(i, 1), boxchars[i]->ch());\n  }\n}\n\n// ------------ 
StringRenderer::ConvertBasicLatinToFullwidthLatin() ------------\n\nTEST(ConvertBasicLatinToFullwidthLatinTest, DoesConvertBasicLatin) {\n  const std::string kHalfAlpha = \"ABCD\";\n  const std::string kFullAlpha = \"ＡＢＣＤ\";\n  EXPECT_EQ(kFullAlpha, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha));\n\n  const std::string kHalfDigit = \"0123\";\n  const std::string kFullDigit = \"０１２３\";\n  EXPECT_EQ(kFullDigit, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit));\n\n  const std::string kHalfSym = \"()[]:;!?\";\n  const std::string kFullSym = \"（）［］：；！？\";\n  EXPECT_EQ(kFullSym, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym));\n}\n\nTEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertFullwidthLatin) {\n  const std::string kFullAlpha = \"ＡＢＣＤ\";\n  EXPECT_EQ(kFullAlpha, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha));\n\n  const std::string kFullDigit = \"０１２３\";\n  EXPECT_EQ(kFullDigit, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit));\n\n  const std::string kFullSym = \"（）［］：；！？\";\n  EXPECT_EQ(kFullSym, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym));\n}\n\nTEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertNonLatin) {\n  const std::string kHalfKana = \"ｱｲｳｴｵ\";\n  const std::string kFullKana = \"アイウエオ\";\n  EXPECT_EQ(kHalfKana, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana));\n  EXPECT_EQ(kFullKana, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana));\n}\n\nTEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) {\n  const std::string kHalfSpace = \" \";\n  const std::string kFullSpace = \"　\";\n  EXPECT_EQ(kHalfSpace, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace));\n  EXPECT_EQ(kFullSpace, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace));\n}\n\n// ------------ StringRenderer::ConvertFullwidthLatinToBasicLatin() ------------\n\nTEST(ConvertFullwidthLatinToBasicLatinTest, DoesConvertFullwidthLatin) {\n  const 
std::string kHalfAlpha = \"ABCD\";\n  const std::string kFullAlpha = \"ＡＢＣＤ\";\n  EXPECT_EQ(kHalfAlpha, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha));\n\n  const std::string kHalfDigit = \"0123\";\n  const std::string kFullDigit = \"０１２３\";\n  EXPECT_EQ(kHalfDigit, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit));\n\n  const std::string kHalfSym = \"()[]:;!?\";\n  const std::string kFullSym = \"（）［］：；！？\";\n  EXPECT_EQ(kHalfSym, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym));\n}\n\nTEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertBasicLatin) {\n  const std::string kHalfAlpha = \"ABCD\";\n  EXPECT_EQ(kHalfAlpha, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha));\n\n  const std::string kHalfDigit = \"0123\";\n  EXPECT_EQ(kHalfDigit, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit));\n\n  const std::string kHalfSym = \"()[]:;!?\";\n  EXPECT_EQ(kHalfSym, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym));\n}\n\nTEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertNonLatin) {\n  const std::string kHalfKana = \"ｱｲｳｴｵ\";\n  const std::string kFullKana = \"アイウエオ\";\n  EXPECT_EQ(kHalfKana, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana));\n  EXPECT_EQ(kFullKana, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana));\n}\n\nTEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertSpace) {\n  const std::string kHalfSpace = \" \";\n  const std::string kFullSpace = \"　\";\n  EXPECT_EQ(kHalfSpace, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace));\n  EXPECT_EQ(kFullSpace, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace));\n}\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/syntaxnet/base.h",
    "content": "/* Copyright 2016 Google Inc. All Rights Reserved.\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n==============================================================================*/\n\n#ifndef SYNTAXNET_BASE_H_\n#define SYNTAXNET_BASE_H_\n\n#include <map>\n#include <functional>\n#include <string>\n#include <unordered_map>\n#include <unordered_set>\n#include <vector>\n\n#ifdef INCLUDE_TENSORFLOW\n\n#include \"google/protobuf/util/message_differencer.h\"\n\n#include \"tensorflow/core/lib/core/status.h\"\n#include \"tensorflow/core/lib/strings/strcat.h\"\n#include \"tensorflow/core/lib/strings/stringprintf.h\"\n#include \"tensorflow/core/platform/default/integral_types.h\"\n#include \"tensorflow/core/platform/mutex.h\"\n#include \"tensorflow/core/platform/protobuf.h\"\n\n#endif\n\nusing std::map;\nusing std::pair;\nusing std::unordered_map;\nusing std::unordered_set;\nusing std::vector;\n#ifdef INCLUDE_TENSORFLOW\nusing tensorflow::int16;\nusing tensorflow::int32;\nusing tensorflow::int64;\nusing tensorflow::int8;\nusing tensorflow::mutex;\nusing tensorflow::mutex_lock;\nusing tensorflow::uint16;\nusing tensorflow::uint32;\nusing tensorflow::uint64;\nusing tensorflow::uint8;\nusing tensorflow::protobuf::TextFormat;\n#endif\ntypedef signed int char32;\n\nusing std::string;\n#ifdef INCLUDE_TENSORFLOW\nusing tensorflow::StringPiece;\n#endif\n\n// namespace syntaxnet\n\n#endif // SYNTAXNET_BASE_H_\n"
  },
  {
    "path": "unittest/tablefind_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <memory>\n\n#include \"colpartition.h\"\n#include \"colpartitiongrid.h\"\n#include \"tablefind.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass TestableTableFinder : public tesseract::TableFinder {\npublic:\n  using TableFinder::GapInXProjection;\n  using TableFinder::HasLeaderAdjacent;\n  using TableFinder::InsertLeaderPartition;\n  using TableFinder::InsertTextPartition;\n  using TableFinder::set_global_median_blob_width;\n  using TableFinder::set_global_median_ledding;\n  using TableFinder::set_global_median_xheight;\n  using TableFinder::SplitAndInsertFragmentedTextPartition;\n\n  void ExpectPartition(const TBOX &box) {\n    tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_);\n    gsearch.SetUniqueMode(true);\n    gsearch.StartFullSearch();\n    ColPartition *part = nullptr;\n    bool found = false;\n    while ((part = gsearch.NextFullSearch()) != nullptr) {\n      if (part->bounding_box().left() == box.left() &&\n          part->bounding_box().bottom() == box.bottom() &&\n          part->bounding_box().right() == box.right() && part->bounding_box().top() == box.top()) {\n        found = true;\n      }\n    }\n    EXPECT_TRUE(found);\n  }\n  void ExpectPartitionCount(int expected_count) {\n    tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_);\n    gsearch.SetUniqueMode(true);\n    
gsearch.StartFullSearch();\n    ColPartition *part = nullptr;\n    int count = 0;\n    while ((part = gsearch.NextFullSearch()) != nullptr) {\n      ++count;\n    }\n    EXPECT_EQ(expected_count, count);\n  }\n};\n\nclass TableFinderTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    free_boxes_it_.set_to_list(&free_boxes_);\n    finder_ = std::make_unique<TestableTableFinder>();\n    finder_->Init(1, ICOORD(0, 0), ICOORD(500, 500));\n    // gap finding\n    finder_->set_global_median_xheight(5);\n    finder_->set_global_median_blob_width(5);\n  }\n\n  void TearDown() override {\n    if (partition_.get() != nullptr) {\n      partition_->DeleteBoxes();\n    }\n    DeletePartitionListBoxes();\n    finder_.reset(nullptr);\n  }\n\n  void MakePartition(int x_min, int y_min, int x_max, int y_max) {\n    MakePartition(x_min, y_min, x_max, y_max, 0, 0);\n  }\n\n  void MakePartition(int x_min, int y_min, int x_max, int y_max, int first_column,\n                     int last_column) {\n    if (partition_.get() != nullptr) {\n      partition_->DeleteBoxes();\n    }\n    TBOX box;\n    box.set_to_given_coords(x_min, y_min, x_max, y_max);\n    partition_.reset(ColPartition::FakePartition(box, PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE));\n    partition_->set_first_column(first_column);\n    partition_->set_last_column(last_column);\n  }\n\n  void InsertTextPartition(ColPartition *part) {\n    finder_->InsertTextPartition(part);\n    free_boxes_it_.add_after_then_move(part);\n  }\n\n  void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max) {\n    InsertLeaderPartition(x_min, y_min, x_max, y_max, 0, 0);\n  }\n\n  void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max, int first_column,\n                             int last_column) {\n    TBOX box;\n    box.set_to_given_coords(x_min, y_min, x_max, y_max);\n    ColPartition *part =\n        ColPartition::FakePartition(box, PT_FLOWING_TEXT, 
BRT_UNKNOWN, BTFT_LEADER);\n    part->set_first_column(first_column);\n    part->set_last_column(last_column);\n    finder_->InsertLeaderPartition(part);\n    free_boxes_it_.add_after_then_move(part);\n  }\n\n  void DeletePartitionListBoxes() {\n    for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list(); free_boxes_it_.forward()) {\n      ColPartition *part = free_boxes_it_.data();\n      part->DeleteBoxes();\n    }\n  }\n\n  std::unique_ptr<TestableTableFinder> finder_;\n  std::unique_ptr<ColPartition> partition_;\n\nprivate:\n  tesseract::ColPartition_CLIST free_boxes_;\n  tesseract::ColPartition_C_IT free_boxes_it_;\n};\n\nTEST_F(TableFinderTest, GapInXProjectionNoGap) {\n  int data[100];\n  for (int &i : data) {\n    i = 10;\n  }\n  EXPECT_FALSE(finder_->GapInXProjection(data, 100));\n}\n\nTEST_F(TableFinderTest, GapInXProjectionEdgeGap) {\n  int data[100];\n  for (int i = 0; i < 10; ++i) {\n    data[i] = 2;\n  }\n  for (int i = 10; i < 90; ++i) {\n    data[i] = 10;\n  }\n  for (int i = 90; i < 100; ++i) {\n    data[i] = 2;\n  }\n  EXPECT_FALSE(finder_->GapInXProjection(data, 100));\n}\n\nTEST_F(TableFinderTest, GapInXProjectionExists) {\n  int data[100];\n  for (int i = 0; i < 10; ++i) {\n    data[i] = 10;\n  }\n  for (int i = 10; i < 90; ++i) {\n    data[i] = 2;\n  }\n  for (int i = 90; i < 100; ++i) {\n    data[i] = 10;\n  }\n  EXPECT_TRUE(finder_->GapInXProjection(data, 100));\n}\n\nTEST_F(TableFinderTest, HasLeaderAdjacentOverlapping) {\n  InsertLeaderPartition(90, 0, 150, 5);\n  MakePartition(0, 0, 100, 10);\n  EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(0, 25, 100, 40);\n  EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(145, 0, 200, 20);\n  EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(40, 0, 50, 4);\n  EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));\n}\n\nTEST_F(TableFinderTest, HasLeaderAdjacentNoOverlap) {\n  InsertLeaderPartition(90, 10, 150, 15);\n  
MakePartition(0, 10, 85, 20);\n  EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(0, 25, 100, 40);\n  EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(0, 0, 100, 10);\n  EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));\n  // TODO(nbeato): is this a useful metric? case fails\n  // MakePartition(160, 0, 200, 15);  // leader is primarily above it\n  // EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));\n}\n\nTEST_F(TableFinderTest, HasLeaderAdjacentPreservesColumns) {\n  InsertLeaderPartition(90, 0, 150, 5, 1, 2);\n  MakePartition(0, 0, 85, 10, 0, 0);\n  EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(0, 0, 100, 10, 0, 1);\n  EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(0, 0, 200, 10, 0, 5);\n  EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));\n  MakePartition(155, 0, 200, 10, 5, 5);\n  EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));\n}\n\n// TODO(nbeato): Only testing a splitting case. 
Add more...\n// Also test non-split cases.\nTEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicPass) {\n  finder_->set_global_median_blob_width(3);\n  finder_->set_global_median_xheight(10);\n\n  TBOX part_box(10, 5, 100, 15);\n  auto *all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));\n  all->set_type(PT_FLOWING_TEXT);\n  all->set_blob_type(BRT_TEXT);\n  all->set_flow(BTFT_CHAIN);\n  all->set_left_margin(10);\n  all->set_right_margin(100);\n  TBOX blob_box = part_box;\n  for (int i = 10; i <= 20; i += 5) {\n    blob_box.set_left(i + 1);\n    blob_box.set_right(i + 4);\n    all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));\n  }\n  for (int i = 35; i <= 55; i += 5) {\n    blob_box.set_left(i + 1);\n    blob_box.set_right(i + 4);\n    all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));\n  }\n  for (int i = 80; i <= 95; i += 5) {\n    blob_box.set_left(i + 1);\n    blob_box.set_right(i + 4);\n    all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));\n  }\n  // TODO(nbeato): Ray's newer code...\n  // all->ClaimBoxes();\n  all->ComputeLimits();     // This is to make sure median info is set.\n  InsertTextPartition(all); // This is to delete blobs\n  ColPartition *fragment_me = all->CopyButDontOwnBlobs();\n\n  finder_->SplitAndInsertFragmentedTextPartition(fragment_me);\n  finder_->ExpectPartition(TBOX(11, 5, 24, 15));\n  finder_->ExpectPartition(TBOX(36, 5, 59, 15));\n  finder_->ExpectPartition(TBOX(81, 5, 99, 15));\n  finder_->ExpectPartitionCount(3);\n}\n\nTEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicFail) {\n  finder_->set_global_median_blob_width(3);\n  finder_->set_global_median_xheight(10);\n\n  TBOX part_box(10, 5, 100, 15);\n  auto *all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));\n  all->set_type(PT_FLOWING_TEXT);\n  all->set_blob_type(BRT_TEXT);\n  all->set_flow(BTFT_CHAIN);\n  all->set_left_margin(10);\n  all->set_right_margin(100);\n  TBOX blob_box = part_box;\n  for (int i = 10; i <= 95; i += 5) {\n    
blob_box.set_left(i + 1);\n    blob_box.set_right(i + 4);\n    all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));\n  }\n  // TODO(nbeato): Ray's newer code...\n  // all->ClaimBoxes();\n  all->ComputeLimits();     // This is to make sure median info is set.\n  InsertTextPartition(all); // This is to delete blobs\n  ColPartition *fragment_me = all->CopyButDontOwnBlobs();\n\n  finder_->SplitAndInsertFragmentedTextPartition(fragment_me);\n  finder_->ExpectPartition(TBOX(11, 5, 99, 15));\n  finder_->ExpectPartitionCount(1);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/tablerecog_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <memory>\n\n#include \"colpartition.h\"\n#include \"colpartitiongrid.h\"\n#include \"tablerecog.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass TestableTableRecognizer : public tesseract::TableRecognizer {\npublic:\n  using TableRecognizer::FindLinesBoundingBox;\n  using TableRecognizer::HasSignificantLines;\n  using TableRecognizer::RecognizeLinedTable;\n  using TableRecognizer::RecognizeTable;\n  using TableRecognizer::RecognizeWhitespacedTable;\n};\n\nclass TestableStructuredTable : public tesseract::StructuredTable {\npublic:\n  using StructuredTable::CountHorizontalIntersections;\n  using StructuredTable::CountVerticalIntersections;\n  using StructuredTable::FindLinedStructure;\n  using StructuredTable::FindWhitespacedColumns;\n  using StructuredTable::FindWhitespacedStructure;\n  using StructuredTable::VerifyLinedTableCells;\n\n  void InjectCellY(int y) {\n    cell_y_.push_back(y);\n    std::sort(cell_y_.begin(), cell_y_.end());\n  }\n  void InjectCellX(int x) {\n    cell_x_.push_back(x);\n    std::sort(cell_x_.begin(), cell_x_.end());\n  }\n\n  void ExpectCellX(int x_min, int second, int add, int almost_done, int x_max) {\n    ASSERT_EQ(0, (almost_done - second) % add);\n    EXPECT_EQ(3 + (almost_done - second) / add, cell_x_.size());\n    EXPECT_EQ(x_min, cell_x_.at(0));\n    EXPECT_EQ(x_max, cell_x_.at(cell_x_.size() - 1));\n 
   for (unsigned i = 1; i < cell_x_.size() - 1; ++i) {\n      EXPECT_EQ(second + add * (i - 1), cell_x_.at(i));\n    }\n  }\n\n  void ExpectSortedX() {\n    EXPECT_GT(cell_x_.size(), 0);\n    for (unsigned i = 1; i < cell_x_.size(); ++i) {\n      EXPECT_LT(cell_x_.at(i - 1), cell_x_.at(i));\n    }\n  }\n};\n\nclass SharedTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    ICOORD bleft(0, 0);\n    ICOORD tright(1000, 1000);\n    text_grid_ = std::make_unique<ColPartitionGrid>(5, bleft, tright);\n    line_grid_ = std::make_unique<ColPartitionGrid>(5, bleft, tright);\n  }\n\n  void TearDown() override {\n    tesseract::ColPartition_IT memory(&allocated_parts_);\n    for (memory.mark_cycle_pt(); !memory.cycled_list(); memory.forward()) {\n      memory.data()->DeleteBoxes();\n    }\n  }\n\n  void InsertPartitions() {\n    for (int row = 0; row < 800; row += 20) {\n      for (int col = 0; col < 500; col += 25) {\n        InsertPartition(col + 1, row + 1, col + 24, row + 19);\n      }\n    }\n  }\n\n  void InsertPartition(int left, int bottom, int right, int top) {\n    TBOX box(left, bottom, right, top);\n    ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);\n    part->set_median_width(3);\n    part->set_median_height(3);\n    text_grid_->InsertBBox(true, true, part);\n\n    tesseract::ColPartition_IT add_it(&allocated_parts_);\n    add_it.add_after_stay_put(part);\n  }\n\n  void InsertLines() {\n    line_box_.set_to_given_coords(100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(),\n                                  450 + line_grid_->gridsize(), 50 + line_grid_->gridsize());\n    for (int i = 10; i <= 50; i += 10) {\n      InsertHorizontalLine(100, 450, i);\n    }\n    for (int i = 100; i <= 450; i += 50) {\n      InsertVerticalLine(i, 10, 50);\n    }\n\n    for (int i = 100; i <= 200; i += 20) {\n      InsertHorizontalLine(0, 100, i);\n    }\n  }\n\n  void 
InsertHorizontalLine(int left, int right, int y) {\n    TBOX box(left, y - line_grid_->gridsize(), right, y + line_grid_->gridsize());\n    ColPartition *part = ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE);\n    line_grid_->InsertBBox(true, true, part);\n\n    tesseract::ColPartition_IT add_it(&allocated_parts_);\n    add_it.add_after_stay_put(part);\n  }\n  void InsertVerticalLine(int x, int bottom, int top) {\n    TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(), top);\n    ColPartition *part = ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE);\n    line_grid_->InsertBBox(true, true, part);\n\n    tesseract::ColPartition_IT add_it(&allocated_parts_);\n    add_it.add_after_stay_put(part);\n  }\n\n  void InsertCellsInLines() {\n    for (int y = 10; y <= 50; y += 10) {\n      for (int x = 100; x <= 450; x += 50) {\n        InsertPartition(x + 1, y + 1, x + 49, y + 9);\n      }\n    }\n  }\n\n  TBOX line_box_;\n  std::unique_ptr<ColPartitionGrid> text_grid_;\n  std::unique_ptr<ColPartitionGrid> line_grid_;\n  ColPartition_LIST allocated_parts_;\n};\n\nclass TableRecognizerTest : public SharedTest {\nprotected:\n  void SetUp() override {\n    SharedTest::SetUp();\n    recognizer_ = std::make_unique<TestableTableRecognizer>();\n    recognizer_->Init();\n    recognizer_->set_text_grid(text_grid_.get());\n    recognizer_->set_line_grid(line_grid_.get());\n  }\n\n  std::unique_ptr<TestableTableRecognizer> recognizer_;\n};\n\nclass StructuredTableTest : public SharedTest {\nprotected:\n  void SetUp() override {\n    SharedTest::SetUp();\n    table_ = std::make_unique<TestableStructuredTable>();\n    table_->Init();\n    table_->set_text_grid(text_grid_.get());\n    table_->set_line_grid(line_grid_.get());\n  }\n\n  std::unique_ptr<TestableStructuredTable> table_;\n};\n\nTEST_F(TableRecognizerTest, HasSignificantLinesBasicPass) {\n  InsertLines();\n  TBOX smaller_guess(120, 15, 370, 45);\n  TBOX 
larger_guess(90, 5, 490, 70);\n  EXPECT_TRUE(recognizer_->HasSignificantLines(line_box_));\n  EXPECT_TRUE(recognizer_->HasSignificantLines(larger_guess));\n  EXPECT_TRUE(recognizer_->HasSignificantLines(smaller_guess));\n}\n\nTEST_F(TableRecognizerTest, HasSignificantLinesBasicFail) {\n  InsertLines();\n  TBOX box(370, 35, 500, 45);\n  EXPECT_FALSE(recognizer_->HasSignificantLines(box));\n}\n\nTEST_F(TableRecognizerTest, HasSignificantLinesHorizontalOnlyFails) {\n  InsertLines();\n  TBOX box(0, 100, 200, 200);\n  EXPECT_FALSE(recognizer_->HasSignificantLines(box));\n}\n\nTEST_F(TableRecognizerTest, FindLinesBoundingBoxBasic) {\n  InsertLines();\n  TBOX box(0, 0, 200, 50);\n  bool result = recognizer_->FindLinesBoundingBox(&box);\n  EXPECT_TRUE(result);\n  EXPECT_EQ(line_box_.left(), box.left());\n  EXPECT_EQ(line_box_.right(), box.right());\n  EXPECT_EQ(line_box_.bottom(), box.bottom());\n  EXPECT_EQ(line_box_.top(), box.top());\n}\n\nTEST_F(TableRecognizerTest, RecognizeLinedTableBasic) {\n  InsertLines();\n  TBOX guess(120, 15, 370, 45);\n  tesseract::StructuredTable table;\n  table.set_text_grid(text_grid_.get());\n  table.set_line_grid(line_grid_.get());\n\n  EXPECT_TRUE(recognizer_->RecognizeLinedTable(guess, &table));\n  EXPECT_EQ(line_box_.bottom(), table.bounding_box().bottom());\n  EXPECT_EQ(line_box_.top(), table.bounding_box().top());\n  EXPECT_EQ(line_box_.left(), table.bounding_box().left());\n  EXPECT_EQ(line_box_.right(), table.bounding_box().right());\n  EXPECT_EQ(line_box_.area(), table.bounding_box().area());\n  EXPECT_EQ(7, table.column_count());\n  EXPECT_EQ(4, table.row_count());\n  EXPECT_EQ(28, table.cell_count());\n  EXPECT_TRUE(table.is_lined());\n}\n\nTEST_F(TableRecognizerTest, RecognizeWhitespacedTableBasic) {\n  InsertPartitions();\n  TBOX guess(0, 0, 500, 800);\n\n  tesseract::StructuredTable table;\n  table.set_text_grid(text_grid_.get());\n  table.set_line_grid(line_grid_.get());\n  
EXPECT_TRUE(recognizer_->RecognizeWhitespacedTable(guess, &table));\n  EXPECT_EQ(1, table.bounding_box().bottom());\n  EXPECT_EQ(799, table.bounding_box().top());\n  EXPECT_EQ(1, table.bounding_box().left());\n  EXPECT_EQ(499, table.bounding_box().right());\n  EXPECT_EQ(798 * 498, table.bounding_box().area());\n  EXPECT_EQ(500 / 25, table.column_count());\n  EXPECT_EQ(800 / 20, table.row_count());\n  EXPECT_EQ(500 * 800 / 20 / 25, table.cell_count());\n  EXPECT_FALSE(table.is_lined());\n}\n\nTEST_F(StructuredTableTest, CountVerticalIntersectionsAll) {\n  table_->set_bounding_box(TBOX(0, 0, 1000, 1000));\n  InsertPartition(0, 0, 100, 10);\n  InsertPartition(1, 12, 43, 21);\n  EXPECT_EQ(2, table_->CountVerticalIntersections(4));\n  EXPECT_EQ(2, table_->CountVerticalIntersections(20));\n  EXPECT_EQ(2, table_->CountVerticalIntersections(40));\n  EXPECT_EQ(1, table_->CountVerticalIntersections(50));\n  EXPECT_EQ(1, table_->CountVerticalIntersections(60));\n  EXPECT_EQ(1, table_->CountVerticalIntersections(80));\n  EXPECT_EQ(1, table_->CountVerticalIntersections(95));\n  EXPECT_EQ(0, table_->CountVerticalIntersections(104));\n  EXPECT_EQ(0, table_->CountVerticalIntersections(150));\n}\n\nTEST_F(StructuredTableTest, CountHorizontalIntersectionsAll) {\n  table_->set_bounding_box(TBOX(0, 0, 1000, 1000));\n  InsertPartition(0, 3, 100, 10);\n  InsertPartition(110, 5, 200, 16);\n\n  EXPECT_EQ(0, table_->CountHorizontalIntersections(0));\n  EXPECT_EQ(1, table_->CountHorizontalIntersections(4));\n  EXPECT_EQ(2, table_->CountHorizontalIntersections(8));\n  EXPECT_EQ(1, table_->CountHorizontalIntersections(12));\n  EXPECT_EQ(0, table_->CountHorizontalIntersections(20));\n}\n\nTEST_F(StructuredTableTest, VerifyLinedTableBasicPass) {\n  for (int y = 10; y <= 50; y += 10) {\n    table_->InjectCellY(y);\n  }\n  for (int x = 100; x <= 450; x += 50) {\n    table_->InjectCellX(x);\n  }\n  InsertLines();\n  InsertCellsInLines();\n  table_->set_bounding_box(line_box_);\n  
EXPECT_TRUE(table_->VerifyLinedTableCells());\n}\n\nTEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) {\n  for (int y = 10; y <= 50; y += 10) {\n    table_->InjectCellY(y);\n  }\n  for (int x = 100; x <= 450; x += 50) {\n    table_->InjectCellX(x);\n  }\n  InsertLines();\n  InsertCellsInLines();\n  InsertPartition(101, 11, 299, 19);\n  table_->set_bounding_box(line_box_);\n  EXPECT_FALSE(table_->VerifyLinedTableCells());\n}\n\nTEST_F(StructuredTableTest, VerifyLinedTableVerticalFail) {\n  for (int y = 10; y <= 50; y += 10) {\n    table_->InjectCellY(y);\n  }\n  for (int x = 100; x <= 450; x += 50) {\n    table_->InjectCellX(x);\n  }\n  InsertLines();\n  InsertCellsInLines();\n  InsertPartition(151, 21, 199, 39);\n  table_->set_bounding_box(line_box_);\n  EXPECT_FALSE(table_->VerifyLinedTableCells());\n}\n\nTEST_F(StructuredTableTest, FindWhitespacedColumnsBasic) {\n  InsertPartitions();\n  TBOX guess(0, 0, 500, 800);\n  table_->set_bounding_box(guess);\n  table_->FindWhitespacedColumns();\n  table_->ExpectCellX(1, 25, 25, 475, 499);\n}\n\nTEST_F(StructuredTableTest, FindWhitespacedColumnsSorted) {\n  InsertPartitions();\n  TBOX guess(0, 0, 500, 800);\n  table_->set_bounding_box(guess);\n  table_->FindWhitespacedColumns();\n  table_->ExpectSortedX();\n}\n\n// TODO(nbeato): check failure cases\n// TODO(nbeato): check Recognize processes correctly on trivial real examples.\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/tabvector_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <memory>\n\n#include \"tabvector.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass TabVectorTest : public testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    vector_.reset();\n  }\n\n  void TearDown() override {}\n\n  void MakeSimpleTabVector(int x1, int y1, int x2, int y2) {\n    vector_ = std::make_unique<TabVector>();\n    vector_->set_startpt(ICOORD(x1, y1));\n    vector_->set_endpt(ICOORD(x2, y2));\n  }\n\n  std::unique_ptr<TabVector> vector_;\n};\n\nTEST_F(TabVectorTest, SetStartEndPointsMatch) {\n  vector_ = std::make_unique<TabVector>();\n  ICOORD start(51, 65);\n  ICOORD end(7568, 234);\n  // Test coordinates individually to avoid adding an ostream operator\n  // explicitly to the ICOORD class (Droid doesn't support it).\n  vector_->set_startpt(start);\n  EXPECT_EQ(start.x(), vector_->startpt().x());\n  EXPECT_EQ(start.y(), vector_->startpt().y());\n  vector_->set_endpt(end);\n  EXPECT_EQ(end.x(), vector_->endpt().x());\n  EXPECT_EQ(end.y(), vector_->endpt().y());\n}\n\nTEST_F(TabVectorTest, XAtY45DegreeSlopeInRangeExact) {\n  MakeSimpleTabVector(0, 0, 100, 100);\n  for (int y = 0; y <= 100; ++y) {\n    int x = vector_->XAtY(y);\n    EXPECT_EQ(y, x);\n  }\n}\n\nTEST_F(TabVectorTest, XAtYVerticalInRangeExact) {\n  const int x = 120; // Arbitrary choice\n  
MakeSimpleTabVector(x, 0, x, 100);\n  for (int y = 0; y <= 100; ++y) {\n    int result_x = vector_->XAtY(y);\n    EXPECT_EQ(x, result_x);\n  }\n}\n\nTEST_F(TabVectorTest, XAtYHorizontal) {\n  const int y = 76; // arbitrary\n  MakeSimpleTabVector(0, y, 100, y);\n  EXPECT_EQ(0, vector_->XAtY(y));\n  // TODO(nbeato): What's the failure condition?\n  // Undefined! Should not pass! Allow until resolved answer.\n  EXPECT_EQ(0, vector_->XAtY(10));\n}\n\nTEST_F(TabVectorTest, XAtYRoundingSimple) {\n  MakeSimpleTabVector(0, 0, 2, 10000);\n  int x = vector_->XAtY(1);\n  EXPECT_EQ(0, x);\n  x = vector_->XAtY(4999);\n  EXPECT_EQ(0, x);\n  x = vector_->XAtY(5001);\n  EXPECT_EQ(1, x);\n  x = vector_->XAtY(9999);\n  EXPECT_EQ(1, x);\n}\n\nTEST_F(TabVectorTest, XAtYLargeNumbers) {\n  // Assume a document is 800 DPI,\n  // the width of a page is 10 inches across (8000 pixels), and\n  // the height of the page is 15 inches (12000 pixels).\n  MakeSimpleTabVector(7804, 504, 7968, 11768); // Arbitrary for vertical line\n  int x = vector_->XAtY(6136);                 // test mid point\n  EXPECT_EQ(7886, x);\n}\n\nTEST_F(TabVectorTest, XAtYHorizontalInRangeExact) {\n  const int y = 120; // Arbitrary choice\n  MakeSimpleTabVector(50, y, 150, y);\n\n  int x = vector_->XAtY(y);\n  EXPECT_EQ(50, x);\n}\n\nTEST_F(TabVectorTest, VOverlapInRangeSimple) {\n  MakeSimpleTabVector(0, 0, 100, 100);\n  int overlap = vector_->VOverlap(90, 10);\n  EXPECT_EQ(80, overlap);\n  overlap = vector_->VOverlap(100, 0);\n  EXPECT_EQ(100, overlap);\n}\n\nTEST_F(TabVectorTest, VOverlapOutOfRange) {\n  MakeSimpleTabVector(0, 10, 100, 90);\n  int overlap = vector_->VOverlap(100, 0);\n  EXPECT_EQ(80, overlap);\n}\n\nTEST_F(TabVectorTest, XYFlip) {\n  MakeSimpleTabVector(1, 2, 3, 4);\n  vector_->XYFlip();\n  EXPECT_EQ(2, vector_->startpt().x());\n  EXPECT_EQ(1, vector_->startpt().y());\n  EXPECT_EQ(4, vector_->endpt().x());\n  EXPECT_EQ(3, vector_->endpt().y());\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/tatweel_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <filesystem>\n#include \"dawg.h\"\n#include \"include_gunit.h\"\n#include \"trie.h\"\n#include \"unicharset.h\"\n#include \"util/utf8/unicodetext.h\" // for UnicodeText\n\nnamespace tesseract {\n\nclass TatweelTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    static std::locale system_locale(\"\");\n    std::locale::global(system_locale);\n  }\n\n  TatweelTest() {\n    std::string filename = TestDataNameToPath(\"ara.wordlist\");\n    if (std::filesystem::exists(filename)) {\n      std::string wordlist(\"\\u0640\");\n      CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));\n      // Put all the unicodes in the unicharset_.\n      UnicodeText text;\n      text.PointToUTF8(wordlist.data(), wordlist.size());\n      int num_tatweel = 0;\n      for (auto it = text.begin(); it != text.end(); ++it) {\n        std::string utf8 = it.get_utf8_string();\n        if (utf8.find(\"\\u0640\") != std::string::npos)\n          ++num_tatweel;\n        unicharset_.unichar_insert(utf8.c_str());\n      }\n      LOG(INFO) << \"Num tatweels in source data=\" << num_tatweel;\n      EXPECT_GT(num_tatweel, 0);\n    }\n  }\n\n  std::string TestDataNameToPath(const std::string &name) {\n    return file::JoinPath(TESTDATA_DIR, name);\n  }\n  UNICHARSET unicharset_;\n};\n\nTEST_F(TatweelTest, UnicharsetIgnoresTatweel) {\n  // This test verifies 
that the unicharset ignores the Tatweel character.\n  for (int i = 0; i < unicharset_.size(); ++i) {\n    const char *utf8 = unicharset_.id_to_unichar(i);\n    EXPECT_EQ(strstr(utf8, reinterpret_cast<const char *>(u8\"\\u0640\")), nullptr);\n  }\n}\n\nTEST_F(TatweelTest, DictIgnoresTatweel) {\n  // This test verifies that the dictionary ignores the Tatweel character.\n  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, \"ara\", SYSTEM_DAWG_PERM, unicharset_.size(), 0);\n  std::string filename = TestDataNameToPath(\"ara.wordlist\");\n  if (!std::filesystem::exists(filename)) {\n    LOG(INFO) << \"Skip test because of missing \" << filename;\n    GTEST_SKIP();\n  } else {\n    EXPECT_TRUE(trie.read_and_add_word_list(filename.c_str(), unicharset_,\n                                            tesseract::Trie::RRP_REVERSE_IF_HAS_RTL));\n    EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));\n  }\n}\n\nTEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {\n  // This test verifies that a load of an existing unicharset keeps any\n  // existing tatweel for backwards compatibility.\n  std::string filename = TestDataNameToPath(\"ara.unicharset\");\n  if (!std::filesystem::exists(filename)) {\n    LOG(INFO) << \"Skip test because of missing \" << filename;\n    GTEST_SKIP();\n  } else {\n    EXPECT_TRUE(unicharset_.load_from_file(filename.c_str()));\n    int num_tatweel = 0;\n    for (int i = 0; i < unicharset_.size(); ++i) {\n      const char *utf8 = unicharset_.id_to_unichar(i);\n      if (strstr(utf8, reinterpret_cast<const char *>(u8\"\\u0640\")) != nullptr) {\n        ++num_tatweel;\n      }\n    }\n    LOG(INFO) << \"Num tatweels in unicharset=\" << num_tatweel;\n    EXPECT_EQ(num_tatweel, 4);\n  }\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/tesseract_leaksanitizer.supp",
    "content": "# Suppress memory leaks.\n# Use with LSAN_OPTIONS=suppressions=tesseract_lsan.supp\nleak:FcLangSetCreate\nleak:FcPatternObjectAddWithBinding\nleak:FcPatternObjectInsertElt\nleak:FcValueListAppend\nleak:FcValueListDuplicate\nleak:FcValueListPrepend\nleak:IA__FcLangSetCreate\nleak:IA__FcValueSave\nleak:libfontconfig.so\nleak:libfreetype.so\n"
  },
  {
    "path": "unittest/textlineprojection_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <allheaders.h>\n#include <string> // for std::string\n\n#include \"include_gunit.h\"\n\n#include <tesseract/baseapi.h>\n#include <tesseract/osdetect.h>\n#include \"colfind.h\"\n#include \"log.h\" // for LOG\n#include \"mutableiterator.h\"\n#include \"pageres.h\"\n#include \"tesseractclass.h\"\n#include \"textlineprojection.h\"\n\nnamespace tesseract {\n\n// Minimum score for a STRONG_CHAIN textline.\n// NOTE: Keep in sync with textlineprojection.cc.\nconst int kMinStrongTextValue = 6;\n\n// The fixture for testing Tesseract.\nclass TextlineProjectionTest : public testing::Test {\nprotected:\n  std::string OutputNameToPath(const std::string &name) {\n    file::MakeTmpdir();\n    return file::JoinPath(FLAGS_test_tmpdir, name);\n  }\n\n  TextlineProjectionTest() {\n    src_pix_ = nullptr;\n    bin_pix_ = nullptr;\n    finder_ = nullptr;\n    denorm_ = nullptr;\n    projection_ = nullptr;\n  }\n  ~TextlineProjectionTest() override {\n    src_pix_.destroy();\n    bin_pix_.destroy();\n    delete finder_;\n  }\n\n  void SetImage(const char *filename) {\n    src_pix_.destroy();\n    src_pix_ = pixRead(file::JoinPath(TESTING_DIR, filename).c_str());\n    api_.Init(TESSDATA_DIR, \"eng\", tesseract::OEM_TESSERACT_ONLY);\n    api_.SetPageSegMode(tesseract::PSM_AUTO_OSD);\n    api_.SetImage(src_pix_);\n  }\n\n  // Ugly hacked-together function sets up projection_ and 
denorm_ by setting\n  // up for auto pagelayout, setting up a ColumnFinder, running it, and\n  // using accessors to get at the internal denorm and projection.\n  // If the coordinates have been rotated, the denorm should match\n  // correctly and transform coordinates back to the projection.\n  // We throw away all the blocks, blobs etc, and test the projection with\n  // the resultiterator from a separate BaseAPI run.\n  void SetupProjection() {\n    tesseract::TessdataManager mgr;\n    auto osd_tess = std::make_unique<Tesseract>();\n    OSResults osr;\n    EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, \"\", \"osd\", tesseract::OEM_TESSERACT_ONLY,\n                                       nullptr, 0, nullptr, nullptr, false, &mgr),\n              0);\n    tesseract_ = std::make_unique<Tesseract>();\n    EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, \"\", \"eng\", tesseract::OEM_TESSERACT_ONLY,\n                                         nullptr, 0, nullptr, nullptr, false, &mgr),\n              0);\n    bin_pix_ = api_.GetThresholdedImage();\n    *tesseract_->mutable_pix_binary() = bin_pix_.clone();\n    osd_tess->set_source_resolution(api_.tesseract()->source_resolution());\n    tesseract_->set_source_resolution(api_.tesseract()->source_resolution());\n    int width = pixGetWidth(bin_pix_);\n    int height = pixGetHeight(bin_pix_);\n    // First make a single block covering the whole image.\n    auto *block = new BLOCK(\"\", true, 0, 0, 0, 0, width, height);\n    block->set_right_to_left(false);\n    BLOCK_LIST src_blocks;\n    BLOCK_IT block_it(&src_blocks);\n    block_it.add_to_end(block);\n    Image photomask_pix = nullptr;\n    // The blocks made by the ColumnFinder. 
Moved to blocks before return.\n    BLOCK_LIST found_blocks;\n    TO_BLOCK_LIST temp_blocks;\n    finder_ =\n        tesseract_->SetupPageSegAndDetectOrientation(tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess.get(),\n                                                     &osr, &temp_blocks, &photomask_pix, nullptr);\n    TO_BLOCK_IT to_block_it(&temp_blocks);\n    TO_BLOCK *to_block = to_block_it.data();\n    denorm_ = finder_->denorm();\n    TO_BLOCK_LIST to_blocks;\n    BLOBNBOX_LIST diacritic_blobs;\n    EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block, photomask_pix, nullptr,\n                                  nullptr, nullptr, &found_blocks, &diacritic_blobs, &to_blocks),\n              0);\n    projection_ = finder_->projection();\n    photomask_pix.destroy();\n  }\n\n  // Helper evaluates the given box, expects the result to be greater_than\n  // or !greater_than the target_value and provides diagnostics if not.\n  void EvaluateBox(const TBOX &box, bool greater_or_equal, int target_value, const char *text,\n                   const char *message) {\n    int value = projection_->EvaluateBox(box, denorm_, false);\n    if (greater_or_equal != (value > target_value)) {\n      LOG(INFO) << \"EvaluateBox too \" << (greater_or_equal ? 
\"low\" : \"high\")\n        << \":\" << value << \" vs \" << target_value << \" for \" << message << \" word '\" << text << \"' at:\";\n      box.print();\n      value = projection_->EvaluateBox(box, denorm_, true);\n    } else {\n      LOG(INFO) << \"EvaluateBox OK(\" << value << \") for \" << message << \" word '\" << text << \"'\";\n    }\n    if (greater_or_equal) {\n      EXPECT_GE(value, target_value);\n    } else {\n      EXPECT_LT(value, target_value);\n    }\n  }\n\n  // Helper evaluates the DistanceOfBoxFromBox function by expecting that\n  // box should be nearer to true_box than false_box.\n  void EvaluateDistance(const TBOX &box, const TBOX &true_box, const TBOX &false_box,\n                        const char *text, const char *message) {\n    int true_dist = projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false);\n    int false_dist = projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false);\n    if (false_dist <= true_dist) {\n      LOG(INFO) << \"Distance wrong:\" << false_dist << \" vs \" << true_dist\n        << \" for \" << message << \" word '\" << text << \"' at:\";\n      true_box.print();\n      projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, true);\n      projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, true);\n    } else {\n      LOG(INFO) << \"Distance OK(\" << false_dist << \" vs \" << true_dist\n        << \") for \" << message << \" word '\" << text << \"'\";\n    }\n  }\n\n  // Tests the projection on the word boxes of the given image.\n  // line_height is the cap + descender size of the text.\n  void VerifyBoxes(const char *imagefile, int line_height) {\n    SetImage(imagefile);\n    api_.Recognize(nullptr);\n    SetupProjection();\n    MutableIterator *it = api_.GetMutableIterator();\n    do {\n      char *text = it->GetUTF8Text(tesseract::RIL_WORD);\n      const PAGE_RES_IT *pr_it = it->PageResIt();\n      WERD_RES *word = pr_it->word();\n      // The word_box refers to 
the internal, possibly rotated, coords.\n      TBOX word_box = word->word->bounding_box();\n      bool small_word = word_box.height() * 1.5 < line_height;\n      bool tall_word = word_box.height() * 1.125 > line_height;\n      // We pad small and tall words differently because ascenders and\n      // descenders affect the position and size of the upper/lower boxes.\n      int padding;\n      if (small_word) {\n        padding = word_box.height();\n      } else if (tall_word) {\n        padding = word_box.height() / 3;\n      } else {\n        padding = word_box.height() / 2;\n      }\n      // Test that the word box gets a good score.\n      EvaluateBox(word_box, true, kMinStrongTextValue, text, \"Real Word\");\n\n      // Now test a displaced box, both above and below the word.\n      TBOX upper_box(word_box);\n      upper_box.set_bottom(word_box.top());\n      upper_box.set_top(word_box.top() + padding);\n      EvaluateBox(upper_box, false, kMinStrongTextValue, text, \"Upper Word\");\n      EvaluateBox(upper_box, true, -1, text, \"Upper Word not vertical\");\n      TBOX lower_box = word_box;\n      lower_box.set_top(word_box.bottom());\n      lower_box.set_bottom(word_box.bottom() - padding);\n      if (tall_word) {\n        lower_box.move(ICOORD(0, padding / 2));\n      }\n      EvaluateBox(lower_box, false, kMinStrongTextValue, text, \"Lower Word\");\n      EvaluateBox(lower_box, true, -1, text, \"Lower Word not vertical\");\n\n      // Since some words have no text below and some words have no text above\n      // check that at least one of the boxes satisfies BoxOutOfTextline.\n      bool upper_or_lower_out_of_textline =\n          projection_->BoxOutOfHTextline(upper_box, denorm_, false) ||\n          projection_->BoxOutOfHTextline(lower_box, denorm_, false);\n      if (!upper_or_lower_out_of_textline) {\n        projection_->BoxOutOfHTextline(upper_box, denorm_, true);\n        projection_->BoxOutOfHTextline(lower_box, denorm_, true);\n      }\n      
EXPECT_TRUE(upper_or_lower_out_of_textline);\n\n      // Now test DistanceOfBoxFromBox by faking a challenger word, and asking\n      // that each pad box be nearer to its true textline than the\n      // challenger. Due to the tight spacing of latin text, getting\n      // the right position and size of these test boxes is quite fiddly.\n      padding = line_height / 4;\n      upper_box.set_top(upper_box.bottom() + padding);\n      TBOX target_box(word_box);\n      if (!small_word) {\n        upper_box.move(ICOORD(0, -padding * 3 / 2));\n      }\n      target_box.set_top(upper_box.bottom());\n      TBOX upper_challenger(upper_box);\n      upper_challenger.set_bottom(upper_box.top());\n      upper_challenger.set_top(upper_box.top() + word_box.height());\n      EvaluateDistance(upper_box, target_box, upper_challenger, text, \"Upper Word\");\n      if (tall_word) {\n        lower_box.move(ICOORD(0, padding / 2));\n      }\n      lower_box.set_bottom(lower_box.top() - padding);\n      target_box = word_box;\n      target_box.set_bottom(lower_box.top());\n      TBOX lower_challenger(lower_box);\n      lower_challenger.set_top(lower_box.bottom());\n      lower_challenger.set_bottom(lower_box.bottom() - word_box.height());\n      EvaluateDistance(lower_box, target_box, lower_challenger, text, \"Lower Word\");\n\n      delete[] text;\n    } while (it->Next(tesseract::RIL_WORD));\n    delete it;\n  }\n\n  Image src_pix_;\n  Image bin_pix_;\n  BLOCK_LIST blocks_;\n  std::string ocr_text_;\n  tesseract::TessBaseAPI api_;\n  std::unique_ptr<Tesseract> tesseract_;\n  ColumnFinder *finder_;\n  const DENORM *denorm_;\n  const TextlineProjection *projection_;\n};\n\n// Tests all word boxes on an unrotated image.\nTEST_F(TextlineProjectionTest, Unrotated) {\n  VerifyBoxes(\"phototest.tif\", 31);\n}\n\n// Tests character-level applyboxes on italic Times New Roman.\nTEST_F(TextlineProjectionTest, Rotated) {\n  VerifyBoxes(\"phototestrot.tif\", 31);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/tfile_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"serialis.h\"\n\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\n// Tests TFile and std::vector serialization by serializing and\n// writing/reading.\n\nclass TfileTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n\n  TfileTest() = default;\n\n  // Some data to serialize.\n  class MathData {\n  public:\n    MathData() : num_squares_(0), num_triangles_(0) {}\n    void Setup() {\n      // Setup some data.\n      for (int s = 0; s < 42; ++s) {\n        squares_.push_back(s * s);\n      }\n      num_squares_ = squares_.size();\n      for (int t = 0; t < 52; ++t) {\n        triangles_.push_back(t * (t + 1) / 2);\n      }\n      num_triangles_ = triangles_.size();\n    }\n    void ExpectEq(const MathData &other) {\n      // Check the data.\n      EXPECT_EQ(num_squares_, other.num_squares_);\n      for (unsigned s = 0; s < squares_.size(); ++s) {\n        EXPECT_EQ(squares_[s], other.squares_[s]);\n      }\n      EXPECT_EQ(num_triangles_, other.num_triangles_);\n      for (unsigned s = 0; s < triangles_.size(); ++s) {\n        EXPECT_EQ(triangles_[s], other.triangles_[s]);\n      }\n    }\n    bool Serialize(TFile *fp) {\n      if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) {\n        return false;\n      }\n      if (!fp->Serialize(squares_)) {\n        return 
false;\n      }\n      if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) {\n        return false;\n      }\n      if (!fp->Serialize(triangles_)) {\n        return false;\n      }\n      return true;\n    }\n    bool DeSerialize(TFile *fp) {\n      if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) {\n        return false;\n      }\n      if (!fp->DeSerialize(squares_)) {\n        return false;\n      }\n      if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) {\n        return false;\n      }\n      if (!fp->DeSerialize(triangles_)) {\n        return false;\n      }\n      return true;\n    }\n    bool SerializeBigEndian(TFile *fp) {\n      ReverseN(&num_squares_, sizeof(num_squares_));\n      if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) {\n        return false;\n      }\n      // Write an additional reversed size before the vector, which will get\n      // used as its size on reading.\n      if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) {\n        return false;\n      }\n      for (int &square : squares_) {\n        ReverseN(&square, sizeof(square));\n      }\n      if (!fp->Serialize(squares_)) {\n        return false;\n      }\n      ReverseN(&num_triangles_, sizeof(num_triangles_));\n      if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) {\n        return false;\n      }\n      if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) {\n        return false;\n      }\n      for (auto &triangle : triangles_) {\n        ReverseN(&triangle, sizeof(triangles_[0]));\n      }\n      return fp->Serialize(triangles_);\n    }\n    bool DeSerializeBigEndian(TFile *fp) {\n      if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) {\n        return false;\n      }\n      if (!fp->DeSerialize(squares_)) {\n        return false;\n      }\n      // The first element is the size that was written, so we will delete it\n      // and read the last element 
separately.\n      int last_element;\n      if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1) {\n        return false;\n      }\n      squares_.erase(squares_.begin());\n      squares_.push_back(last_element);\n      if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) {\n        return false;\n      }\n      if (!fp->DeSerialize(triangles_)) {\n        return false;\n      }\n      if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1) {\n        return false;\n      }\n      triangles_.erase(triangles_.begin());\n      triangles_.push_back(last_element);\n      return true;\n    }\n\n  private:\n    std::vector<int> squares_;\n    int num_squares_;\n    std::vector<int> triangles_;\n    int num_triangles_;\n  };\n};\n\nTEST_F(TfileTest, Serialize) {\n  // This test verifies that Tfile can serialize a class.\n  MathData m1;\n  m1.Setup();\n  std::vector<char> data;\n  TFile fpw;\n  fpw.OpenWrite(&data);\n  EXPECT_TRUE(m1.Serialize(&fpw));\n  TFile fpr;\n  EXPECT_TRUE(fpr.Open(&data[0], data.size()));\n  MathData m2;\n  EXPECT_TRUE(m2.DeSerialize(&fpr));\n  m1.ExpectEq(m2);\n  MathData m3;\n  EXPECT_FALSE(m3.DeSerialize(&fpr));\n  fpr.Rewind();\n  EXPECT_TRUE(m3.DeSerialize(&fpr));\n  m1.ExpectEq(m3);\n}\n\nTEST_F(TfileTest, FGets) {\n  // This test verifies that Tfile can interleave FGets with binary data.\n  MathData m1;\n  std::string line_str = \"This is a textline with a newline\\n\";\n  m1.Setup();\n  std::vector<char> data;\n  TFile fpw;\n  fpw.OpenWrite(&data);\n  EXPECT_TRUE(m1.Serialize(&fpw));\n  EXPECT_EQ(1, fpw.FWrite(line_str.data(), line_str.size(), 1));\n  EXPECT_TRUE(m1.Serialize(&fpw));\n  // Now get back the 2 copies of m1 with the line in between.\n  TFile fpr;\n  EXPECT_TRUE(fpr.Open(&data[0], data.size()));\n  MathData m2;\n  EXPECT_TRUE(m2.DeSerialize(&fpr));\n  m1.ExpectEq(m2);\n  const int kBufsize = 1024;\n  char buffer[kBufsize + 1];\n  EXPECT_EQ(buffer, fpr.FGets(buffer, kBufsize));\n  
EXPECT_STREQ(line_str.c_str(), buffer);\n  MathData m3;\n  EXPECT_TRUE(m3.DeSerialize(&fpr));\n  m1.ExpectEq(m3);\n}\n\nTEST_F(TfileTest, BigEndian) {\n  // This test verifies that Tfile can auto-reverse big-endian data.\n  MathData m1;\n  m1.Setup();\n  std::vector<char> data;\n  TFile fpw;\n  fpw.OpenWrite(&data);\n  EXPECT_TRUE(m1.SerializeBigEndian(&fpw));\n  TFile fpr;\n  EXPECT_TRUE(fpr.Open(&data[0], data.size()));\n  fpr.set_swap(true);\n  MathData m2;\n  EXPECT_TRUE(m2.DeSerializeBigEndian(&fpr));\n  // That serialize was destructive, so test against a fresh MathData.\n  MathData m3;\n  m3.Setup();\n  m3.ExpectEq(m2);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/third_party/utf/rune.c",
    "content": "/*\n * The authors of this software are Rob Pike and Ken Thompson.\n *              Copyright (c) 2002 by Lucent Technologies.\n * Permission to use, copy, modify, and distribute this software for any\n * purpose without fee is hereby granted, provided that this entire notice\n * is included in all copies of any software which is or includes a copy\n * or modification of this software and in all copies of the supporting\n * documentation for such software.\n * THIS SOFTWARE IS BEING PROVIDED \"AS IS\", WITHOUT ANY EXPRESS OR IMPLIED\n * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY\n * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY\n * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.\n */\n#include <stdarg.h>\n#include <stdint.h>\n#include <string.h>\n#include \"third_party/utf/utf.h\"\n\nenum {\n  Bit1 = 7,\n  Bitx = 6,\n  Bit2 = 5,\n  Bit3 = 4,\n  Bit4 = 3,\n  Bit5 = 2,\n\n  T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */\n  Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */\n  T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */\n  T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */\n  T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */\n  T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */\n\n  Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0111 1111 */\n  Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0111 1111 1111 */\n  Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 1111 1111 1111 1111 */\n  Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1,\n  /* 0001 1111 1111 1111 1111 1111 */\n\n  Maskx = (1 << Bitx) - 1, /* 0011 1111 */\n  Testx = Maskx ^ 0xFF,    /* 1100 0000 */\n\n  Bad = Runeerror,\n};\n\n/*\n * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24\n * This is a slower but \"safe\" version of the old chartorune\n * that works on strings that are not necessarily null-terminated.\n *\n * If you know for sure that your string is null-terminated,\n * chartorune will be a bit 
faster.\n *\n * It is guaranteed not to attempt to access \"length\"\n * past the incoming pointer.  This is to avoid\n * possible access violations.  If the string appears to be\n * well-formed but incomplete (i.e., to get the whole Rune\n * we'd need to read past str+length) then we'll set the Rune\n * to Bad and return 0.\n *\n * Note that if we have decoding problems for other\n * reasons, we return 1 instead of 0.\n */\nint charntorune(Rune *rune, const char *str, int length) {\n  int c, c1, c2, c3;\n  long l;\n\n  /* When we're not allowed to read anything */\n  if (length <= 0) {\n    goto badlen;\n  }\n\n  /*\n   * one character sequence (7-bit value)\n   *\t00000-0007F => T1\n   */\n  c = *(uint8_t *)str;\n  if (c < Tx) {\n    *rune = c;\n    return 1;\n  }\n\n  // If we can't read more than one character we must stop\n  if (length <= 1) {\n    goto badlen;\n  }\n\n  /*\n   * two character sequence (11-bit value)\n   *\t0080-07FF => T2 Tx\n   */\n  c1 = *(uint8_t *)(str + 1) ^ Tx;\n  if (c1 & Testx)\n    goto bad;\n  if (c < T3) {\n    if (c < T2)\n      goto bad;\n    l = ((c << Bitx) | c1) & Rune2;\n    if (l <= Rune1)\n      goto bad;\n    *rune = l;\n    return 2;\n  }\n\n  // If we can't read more than two characters we must stop\n  if (length <= 2) {\n    goto badlen;\n  }\n\n  /*\n   * three character sequence (16-bit value)\n   *\t0800-FFFF => T3 Tx Tx\n   */\n  c2 = *(uint8_t *)(str + 2) ^ Tx;\n  if (c2 & Testx)\n    goto bad;\n  if (c < T4) {\n    l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;\n    if (l <= Rune2)\n      goto bad;\n    *rune = l;\n    return 3;\n  }\n\n  if (length <= 3)\n    goto badlen;\n\n  /*\n   * four character sequence (21-bit value)\n   *\t10000-1FFFFF => T4 Tx Tx Tx\n   */\n  c3 = *(uint8_t *)(str + 3) ^ Tx;\n  if (c3 & Testx)\n    goto bad;\n  if (c < T5) {\n    l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;\n    if (l <= Rune3)\n      goto bad;\n    if (l > Runemax)\n      goto bad;\n    *rune = 
l;\n    return 4;\n  }\n\n  // Support for 5-byte or longer UTF-8 would go here, but\n  // since we don't have that, we'll just fall through to bad.\n\n  /*\n   * bad decoding\n   */\nbad:\n  *rune = Bad;\n  return 1;\nbadlen:\n  *rune = Bad;\n  return 0;\n}\n\n/*\n * This is the older \"unsafe\" version, which works fine on\n * null-terminated strings.\n */\nint chartorune(Rune *rune, const char *str) {\n  int c, c1, c2, c3;\n  long l;\n\n  /*\n   * one character sequence\n   *\t00000-0007F => T1\n   */\n  c = *(uint8_t *)str;\n  if (c < Tx) {\n    *rune = c;\n    return 1;\n  }\n\n  /*\n   * two character sequence\n   *\t0080-07FF => T2 Tx\n   */\n  c1 = *(uint8_t *)(str + 1) ^ Tx;\n  if (c1 & Testx)\n    goto bad;\n  if (c < T3) {\n    if (c < T2)\n      goto bad;\n    l = ((c << Bitx) | c1) & Rune2;\n    if (l <= Rune1)\n      goto bad;\n    *rune = l;\n    return 2;\n  }\n\n  /*\n   * three character sequence\n   *\t0800-FFFF => T3 Tx Tx\n   */\n  c2 = *(uint8_t *)(str + 2) ^ Tx;\n  if (c2 & Testx)\n    goto bad;\n  if (c < T4) {\n    l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;\n    if (l <= Rune2)\n      goto bad;\n    *rune = l;\n    return 3;\n  }\n\n  /*\n   * four character sequence (21-bit value)\n   *\t10000-1FFFFF => T4 Tx Tx Tx\n   */\n  c3 = *(uint8_t *)(str + 3) ^ Tx;\n  if (c3 & Testx)\n    goto bad;\n  if (c < T5) {\n    l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;\n    if (l <= Rune3)\n      goto bad;\n    if (l > Runemax)\n      goto bad;\n    *rune = l;\n    return 4;\n  }\n\n  /*\n   * Support for 5-byte or longer UTF-8 would go here, but\n   * since we don't have that, we'll just fall through to bad.\n   */\n\n  /*\n   * bad decoding\n   */\nbad:\n  *rune = Bad;\n  return 1;\n}\n\nint isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed) {\n  *consumed = charntorune(rune, str, length);\n  return *rune != Runeerror || *consumed == 3;\n}\n\nint runetochar(char *str, const Rune *rune) {\n  /* Runes 
are signed, so convert to unsigned for range check. */\n  unsigned long c;\n\n  /*\n   * one character sequence\n   *\t00000-0007F => 00-7F\n   */\n  c = *rune;\n  if (c <= Rune1) {\n    str[0] = c;\n    return 1;\n  }\n\n  /*\n   * two character sequence\n   *\t0080-07FF => T2 Tx\n   */\n  if (c <= Rune2) {\n    str[0] = T2 | (c >> 1 * Bitx);\n    str[1] = Tx | (c & Maskx);\n    return 2;\n  }\n\n  /*\n   * If the Rune is out of range, convert it to the error rune.\n   * Do this test here because the error rune encodes to three bytes.\n   * Doing it earlier would duplicate work, since an out of range\n   * Rune wouldn't have fit in one or two bytes.\n   */\n  if (c > Runemax)\n    c = Runeerror;\n\n  /*\n   * three character sequence\n   *\t0800-FFFF => T3 Tx Tx\n   */\n  if (c <= Rune3) {\n    str[0] = T3 | (c >> 2 * Bitx);\n    str[1] = Tx | ((c >> 1 * Bitx) & Maskx);\n    str[2] = Tx | (c & Maskx);\n    return 3;\n  }\n\n  /*\n   * four character sequence (21-bit value)\n   *     10000-1FFFFF => T4 Tx Tx Tx\n   */\n  str[0] = T4 | (c >> 3 * Bitx);\n  str[1] = Tx | ((c >> 2 * Bitx) & Maskx);\n  str[2] = Tx | ((c >> 1 * Bitx) & Maskx);\n  str[3] = Tx | (c & Maskx);\n  return 4;\n}\n\nint runelen(Rune rune) {\n  char str[10];\n\n  return runetochar(str, &rune);\n}\n\nint runenlen(const Rune *r, int nrune) {\n  int nb;\n  unsigned long c; /* Rune is signed, so use unsigned for range check. 
*/\n\n  nb = 0;\n  while (nrune--) {\n    c = *r++;\n    if (c <= Rune1)\n      nb++;\n    else if (c <= Rune2)\n      nb += 2;\n    else if (c <= Rune3)\n      nb += 3;\n    else if (c <= Runemax)\n      nb += 4;\n    else\n      nb += 3; /* Runeerror = 0xFFFD, see runetochar */\n  }\n  return nb;\n}\n\nint fullrune(const char *str, int n) {\n  if (n > 0) {\n    int c = *(uint8_t *)str;\n    if (c < Tx)\n      return 1;\n    if (n > 1) {\n      if (c < T3)\n        return 1;\n      if (n > 2) {\n        if (c < T4 || n > 3)\n          return 1;\n      }\n    }\n  }\n  return 0;\n}\n"
  },
  {
    "path": "unittest/third_party/utf/utf.h",
    "content": "/*\n * The authors of this software are Rob Pike and Ken Thompson.\n *              Copyright (c) 2002 by Lucent Technologies.\n * Permission to use, copy, modify, and distribute this software for any\n * purpose without fee is hereby granted, provided that this entire notice\n * is included in all copies of any software which is or includes a copy\n * or modification of this software and in all copies of the supporting\n * documentation for such software.\n * THIS SOFTWARE IS BEING PROVIDED \"AS IS\", WITHOUT ANY EXPRESS OR IMPLIED\n * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY\n * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY\n * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.\n */\n#ifndef _UTFH_\n#define _UTFH_ 1\n\n#include <stdint.h>\n\ntypedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/\n\nenum {\n  UTFmax = 4,         /* maximum bytes per rune */\n  Runesync = 0x80,    /* cannot represent part of a UTF sequence (<) */\n  Runeself = 0x80,    /* rune and UTF sequences are the same (<) */\n  Runeerror = 0xFFFD, /* decoding error in UTF */\n  Runemax = 0x10FFFF, /* maximum rune value */\n};\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/*\n * rune routines\n */\n\n/*\n * These routines were written by Rob Pike and Ken Thompson\n * and first appeared in Plan 9.\n * SEE ALSO\n * utf (7)\n * tcs (1)\n */\n\n// runetochar copies (encodes) one rune, pointed to by r, to at most\n// UTFmax bytes starting at s and returns the number of bytes generated.\n\nint runetochar(char *s, const Rune *r);\n\n// chartorune copies (decodes) at most UTFmax bytes starting at s to\n// one rune, pointed to by r, and returns the number of bytes consumed.\n// If the input is not exactly in UTF format, chartorune will set *r\n// to Runeerror and return 1.\n//\n// Note: There is no special case for a \"null-terminated\" string. 
A\n// string whose first byte has the value 0 is the UTF8 encoding of the\n// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal\n// anywhere else in a UTF sequence.\n\nint chartorune(Rune *r, const char *s);\n\n// charntorune is like chartorune, except that it will access at most\n// n bytes of s.  If the UTF sequence is incomplete within n bytes,\n// charntorune will set *r to Runeerror and return 0. If it is complete\n// but not in UTF format, it will set *r to Runeerror and return 1.\n//\n// Added 2004-09-24 by Wei-Hwa Huang\n\nint charntorune(Rune *r, const char *s, int n);\n\n// isvalidcharntorune(str, n, r, consumed)\n// is a convenience function that calls \"*consumed = charntorune(r, str, n)\"\n// and returns an int (logically boolean) indicating whether the first\n// n bytes of str was a valid and complete UTF sequence.\n\nint isvalidcharntorune(const char *str, int n, Rune *r, int *consumed);\n\n// runelen returns the number of bytes required to convert r into UTF.\n\nint runelen(Rune r);\n\n// runenlen returns the number of bytes required to convert the n\n// runes pointed to by r into UTF.\n\nint runenlen(const Rune *r, int n);\n\n// fullrune returns 1 if the string s of length n is long enough to be\n// decoded by chartorune, and 0 otherwise. This does not guarantee\n// that the string contains a legal UTF encoding. This routine is used\n// by programs that obtain input one byte at a time and need to know\n// when a full rune has arrived.\n\nint fullrune(const char *s, int n);\n\n// The following routines are analogous to the corresponding string\n// routines with \"utf\" substituted for \"str\", and \"rune\" substituted\n// for \"chr\".\n\n// utflen returns the number of runes that are represented by the UTF\n// string s. (cf. strlen)\n\nint utflen(const char *s);\n\n// utfnlen returns the number of complete runes that are represented\n// by the first n bytes of the UTF string s. 
If the last few bytes of\n// the string contain an incompletely coded rune, utfnlen will not\n// count them; in this way, it differs from utflen, which includes\n// every byte of the string. (cf. strnlen)\n\nint utfnlen(const char *s, long n);\n\n// utfrune returns a pointer to the first occurrence of rune r in the\n// UTF string s, or 0 if r does not occur in the string.  The NULL\n// byte terminating a string is considered to be part of the string s.\n// (cf. strchr)\n\nconst char *utfrune(const char *s, Rune r);\n\n// utfrrune returns a pointer to the last occurrence of rune r in the\n// UTF string s, or 0 if r does not occur in the string.  The NULL\n// byte terminating a string is considered to be part of the string s.\n// (cf. strrchr)\n\nconst char *utfrrune(const char *s, Rune r);\n\n// utfutf returns a pointer to the first occurrence of the UTF string\n// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the\n// null string, utfutf returns s1. (cf. strstr)\n\nconst char *utfutf(const char *s1, const char *s2);\n\n// utfecpy copies UTF sequences until a null sequence has been copied,\n// but writes no sequences beyond es1.  If any sequences are copied,\n// s1 is terminated by a null sequence, and a pointer to that sequence\n// is returned.  Otherwise, the original s1 is returned. (cf. 
strecpy)\n\nchar *utfecpy(char *s1, char *es1, const char *s2);\n\n// These functions are rune-string analogues of the corresponding\n// functions in strcat (3).\n//\n// These routines first appeared in Plan 9.\n// SEE ALSO\n// memmove (3)\n// rune (3)\n// strcat (2)\n//\n// BUGS: The outcome of overlapping moves varies among implementations.\n\nRune *runestrcat(Rune *s1, const Rune *s2);\nRune *runestrncat(Rune *s1, const Rune *s2, long n);\n\nconst Rune *runestrchr(const Rune *s, Rune c);\n\nint runestrcmp(const Rune *s1, const Rune *s2);\nint runestrncmp(const Rune *s1, const Rune *s2, long n);\n\nRune *runestrcpy(Rune *s1, const Rune *s2);\nRune *runestrncpy(Rune *s1, const Rune *s2, long n);\nRune *runestrecpy(Rune *s1, Rune *es1, const Rune *s2);\n\nRune *runestrdup(const Rune *s);\n\nconst Rune *runestrrchr(const Rune *s, Rune c);\nlong runestrlen(const Rune *s);\nconst Rune *runestrstr(const Rune *s1, const Rune *s2);\n\n// The following routines test types and modify cases for Unicode\n// characters.  Unicode defines some characters as letters and\n// specifies three cases: upper, lower, and title.  Mappings among the\n// cases are also defined, although they are not exhaustive: some\n// upper case letters have no lower case mapping, and so on.  Unicode\n// also defines several character properties, a subset of which are\n// checked by these routines.  These routines are based on Unicode\n// version 3.0.0.\n//\n// NOTE: The routines are implemented in C, so the boolean functions\n// (e.g., isupperrune) return 0 for false and 1 for true.\n//\n//\n// toupperrune, tolowerrune, and totitlerune are the Unicode case\n// mappings. These routines return the character unchanged if it has\n// no defined mapping.\n\nRune toupperrune(Rune r);\nRune tolowerrune(Rune r);\nRune totitlerune(Rune r);\n\n// isupperrune tests for upper case characters, including Unicode\n// upper case letters and targets of the toupper mapping. 
islowerrune\n// and istitlerune are defined analogously.\n\nint isupperrune(Rune r);\nint islowerrune(Rune r);\nint istitlerune(Rune r);\n\n// isalpharune tests for Unicode letters; this includes ideographs in\n// addition to alphabetic characters.\n\nint isalpharune(Rune r);\n\n// isdigitrune tests for digits. Non-digit numbers, such as Roman\n// numerals, are not included.\n\nint isdigitrune(Rune r);\n\n// isideographicrune tests for ideographic characters and numbers, as\n// defined by the Unicode standard.\n\nint isideographicrune(Rune r);\n\n// isspacerune tests for whitespace characters, including \"C\" locale\n// whitespace, Unicode defined whitespace, and the \"zero-width\n// non-break space\" character.\n\nint isspacerune(Rune r);\n\n// (The comments in this file were copied from the manpage files rune.3,\n// isalpharune.3, and runestrcat.3. Some formatting changes were also made\n// to conform to Google style. /JRM 11/11/05)\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif\n"
  },
  {
    "path": "unittest/unichar_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <tesseract/unichar.h>\n#include \"gmock/gmock.h\" // for testing::ElementsAreArray\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nTEST(UnicharTest, Conversion) {\n  // This test verifies that Unichar::UTF8ToUTF32 and Unichar::UTF32ToUTF8\n  // show the required conversion properties.\n  // Test for round-trip utf8-32-8 for 1, 2, 3 and 4 byte codes.\n  const char *kUTF8Src = \"a\\u05d0\\u0ca4\\U0002a714\";\n  const std::vector<char32> kUTF32Src = {'a', 0x5d0, 0xca4, 0x2a714};\n  // Check for round-trip conversion.\n  std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src);\n  EXPECT_THAT(utf32, testing::ElementsAreArray(kUTF32Src));\n  std::string utf8 = UNICHAR::UTF32ToUTF8(utf32);\n  EXPECT_STREQ(kUTF8Src, utf8.c_str());\n}\n\nTEST(UnicharTest, InvalidText) {\n  // This test verifies that Unichar correctly deals with invalid text.\n  const char *kInvalidUTF8 = \"a b\\200d string\";\n  const std::vector<char32> kInvalidUTF32 = {'a', ' ', 0x200000, 'x'};\n  // Invalid utf8 produces an empty vector.\n  std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8);\n  EXPECT_TRUE(utf32.empty());\n  // Invalid utf32 produces an empty string.\n  std::string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32);\n  EXPECT_TRUE(utf8.empty());\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/unicharcompress_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <string>\n\n#include <allheaders.h>\n\n#include \"include_gunit.h\"\n#include \"log.h\" // for LOG\n#include \"serialis.h\"\n#include \"unicharcompress.h\"\n\nnamespace tesseract {\n\nclass UnicharcompressTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n    file::MakeTmpdir();\n  }\n\n  // Loads and compresses the given unicharset.\n  void LoadUnicharset(const std::string &unicharset_name) {\n    std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, \"radical-stroke.txt\");\n    std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);\n    std::string radical_data;\n    CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));\n    CHECK(unicharset_.load_from_file(unicharset_file.c_str()));\n    std::string radical_str(radical_data.c_str());\n    null_char_ = unicharset_.has_special_codes() ? 
UNICHAR_BROKEN : unicharset_.size();\n    compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);\n    // Get the encoding of the null char.\n    RecodedCharID code;\n    compressed_.EncodeUnichar(null_char_, &code);\n    encoded_null_char_ = code(0);\n    std::string output_name =\n        file::JoinPath(FLAGS_test_tmpdir, unicharset_name) + \".encoding.txt\";\n    std::string encoding = compressed_.GetEncodingAsString(unicharset_);\n    std::string encoding_str(&encoding[0], encoding.size());\n    CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));\n    LOG(INFO) << \"Wrote encoding to:\" << output_name;\n  }\n  // Serializes and de-serializes compressed_ over itself.\n  void SerializeAndUndo() {\n    std::vector<char> data;\n    TFile wfp;\n    wfp.OpenWrite(&data);\n    EXPECT_TRUE(compressed_.Serialize(&wfp));\n    TFile rfp;\n    rfp.Open(&data[0], data.size());\n    EXPECT_TRUE(compressed_.DeSerialize(&rfp));\n  }\n  // Returns true if the lang is in CJK.\n  bool IsCJKLang(const std::string &lang) {\n    return lang == \"chi_sim\" || lang == \"chi_tra\" || lang == \"kor\" || lang == \"jpn\";\n  }\n  // Returns true if the lang is Indic.\n  bool IsIndicLang(const std::string &lang) {\n    return lang == \"asm\" || lang == \"ben\" || lang == \"bih\" || lang == \"hin\" || lang == \"mar\" ||\n           lang == \"nep\" || lang == \"san\" || lang == \"bod\" || lang == \"dzo\" || lang == \"guj\" ||\n           lang == \"kan\" || lang == \"mal\" || lang == \"ori\" || lang == \"pan\" || lang == \"sin\" ||\n           lang == \"tam\" || lang == \"tel\";\n  }\n\n  // Expects the appropriate results from the compressed_  unicharset_.\n  void ExpectCorrect(const std::string &lang) {\n    // Count the number of times each code is used in each element of\n    // RecodedCharID.\n    RecodedCharID zeros;\n    for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {\n      zeros.Set(i, 0);\n    }\n    int code_range = 
compressed_.code_range();\n    std::vector<RecodedCharID> times_seen(code_range, zeros);\n    for (int u = 0; u <= unicharset_.size(); ++u) {\n      if (u != UNICHAR_SPACE && u != null_char_ &&\n          (u == unicharset_.size() ||\n           (unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) {\n        continue; // Not used so not encoded.\n      }\n      RecodedCharID code;\n      int len = compressed_.EncodeUnichar(u, &code);\n      // Check round-trip encoding.\n      int unichar_id;\n      std::vector<UNICHAR_ID> normed_ids;\n      if (u == null_char_ || u == unicharset_.size()) {\n        unichar_id = null_char_;\n      } else {\n        unichar_id = u;\n      }\n      EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));\n      // Check that the codes are valid.\n      for (int i = 0; i < len; ++i) {\n        int code_val = code(i);\n        EXPECT_GE(code_val, 0);\n        EXPECT_LT(code_val, code_range);\n        times_seen[code_val].Set(i, times_seen[code_val](i) + 1);\n      }\n    }\n    // Check that each code is used in at least one position.\n    for (int c = 0; c < code_range; ++c) {\n      int num_used = 0;\n      for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {\n        if (times_seen[c](i) != 0) {\n          ++num_used;\n        }\n      }\n      EXPECT_GE(num_used, 1) << \"c=\" << c << \"/\" << code_range;\n    }\n    // Check that GetNextCodes/GetFinalCodes lists match the times_seen,\n    // and create valid codes.\n    RecodedCharID code;\n    CheckCodeExtensions(code, times_seen);\n    // Finally, we achieved all that using a codebook < 10% of the size of\n    // the original unicharset, for CK or Indic, and 20% with J, but just\n    // no bigger for all others.\n    if (IsCJKLang(lang) || IsIndicLang(lang)) {\n      EXPECT_LT(code_range, unicharset_.size() / (lang == \"jpn\" ? 
5 : 10));\n    } else {\n      EXPECT_LE(code_range, unicharset_.size() + 1);\n    }\n    LOG(INFO) << \"Compressed unicharset of \" << unicharset_.size() << \" to \" << code_range;\n  }\n  // Checks for extensions of the current code that either finish a code, or\n  // extend it and checks those extensions recursively.\n  void CheckCodeExtensions(const RecodedCharID &code,\n                           const std::vector<RecodedCharID> &times_seen) {\n    RecodedCharID extended = code;\n    int length = code.length();\n    const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);\n    if (final_codes != nullptr) {\n      for (int ending : *final_codes) {\n        EXPECT_GT(times_seen[ending](length), 0);\n        extended.Set(length, ending);\n        int unichar_id = compressed_.DecodeUnichar(extended);\n        EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);\n      }\n    }\n    const std::vector<int> *next_codes = compressed_.GetNextCodes(code);\n    if (next_codes != nullptr) {\n      for (int extension : *next_codes) {\n        EXPECT_GT(times_seen[extension](length), 0);\n        extended.Set(length, extension);\n        CheckCodeExtensions(extended, times_seen);\n      }\n    }\n  }\n\n  UnicharCompress compressed_;\n  UNICHARSET unicharset_;\n  int null_char_;\n  // The encoding of the null_char_.\n  int encoded_null_char_;\n};\n\nTEST_F(UnicharcompressTest, DoesChinese) {\n  LOG(INFO) << \"Testing chi_tra\";\n  LoadUnicharset(\"chi_tra.unicharset\");\n  ExpectCorrect(\"chi_tra\");\n  LOG(INFO) << \"Testing chi_sim\";\n  LoadUnicharset(\"chi_sim.unicharset\");\n  ExpectCorrect(\"chi_sim\");\n}\n\nTEST_F(UnicharcompressTest, DoesJapanese) {\n  LOG(INFO) << \"Testing jpn\";\n  LoadUnicharset(\"jpn.unicharset\");\n  ExpectCorrect(\"jpn\");\n}\n\nTEST_F(UnicharcompressTest, DoesKorean) {\n  LOG(INFO) << \"Testing kor\";\n  LoadUnicharset(\"kor.unicharset\");\n  ExpectCorrect(\"kor\");\n}\n\nTEST_F(UnicharcompressTest, DoesKannada) {\n  LOG(INFO) << 
\"Testing kan\";\n  LoadUnicharset(\"kan.unicharset\");\n  ExpectCorrect(\"kan\");\n  SerializeAndUndo();\n  ExpectCorrect(\"kan\");\n}\n\nTEST_F(UnicharcompressTest, DoesMarathi) {\n  LOG(INFO) << \"Testing mar\";\n  LoadUnicharset(\"mar.unicharset\");\n  ExpectCorrect(\"mar\");\n}\n\nTEST_F(UnicharcompressTest, DoesEnglish) {\n  LOG(INFO) << \"Testing eng\";\n  LoadUnicharset(\"eng.unicharset\");\n  ExpectCorrect(\"eng\");\n}\n\n// Tests that a unicharset that contains double-letter ligatures (eg ff) has\n// no null char in the encoding at all.\nTEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {\n  LOG(INFO) << \"Testing por with ligatures\";\n  LoadUnicharset(\"por.unicharset\");\n  ExpectCorrect(\"por\");\n  // Check that any unichar-id that is encoded with multiple codes has the\n  // correct encoded_null_char_ in between.\n  for (int u = 0; u <= unicharset_.size(); ++u) {\n    RecodedCharID code;\n    int len = compressed_.EncodeUnichar(u, &code);\n    if (len > 1) {\n      // The should not be any null char in the code.\n      for (int i = 0; i < len; ++i) {\n        EXPECT_NE(encoded_null_char_, code(i));\n      }\n    }\n  }\n}\n\n// Tests that GetEncodingAsString returns the right result for a trivial\n// unicharset.\nTEST_F(UnicharcompressTest, GetEncodingAsString) {\n  LoadUnicharset(\"trivial.unicharset\");\n  ExpectCorrect(\"trivial\");\n  std::string encoding = compressed_.GetEncodingAsString(unicharset_);\n  std::string encoding_str(&encoding[0], encoding.length());\n  std::vector<std::string> lines = split(encoding_str, '\\n');\n  EXPECT_EQ(5, lines.size());\n  // The first line is always space.\n  EXPECT_EQ(\"0\\t \", lines[0]);\n  // Next we have i.\n  EXPECT_EQ(\"1\\ti\", lines[1]);\n  // Next we have f.\n  EXPECT_EQ(\"2\\tf\", lines[2]);\n  // Next we have the fi ligature: ﬁ. 
There are no nulls in it, as there are no\n  // repeated letter ligatures in this unicharset, unlike por.unicharset above.\n  EXPECT_EQ(\"2,1\\tﬁ\", lines[3]);\n  // Finally the null character.\n  EXPECT_EQ(\"3\\t<nul>\", lines[4]);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/unicharset_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"unicharset.h\"\n#include <string>\n#include \"gmock/gmock.h\" // for testing::ElementsAreArray\n#include \"include_gunit.h\"\n#include \"log.h\" // for LOG\n\nusing testing::ElementsAreArray;\n\nnamespace tesseract {\n\nclass UnicharsetTest : public ::testing::Test {\nprotected:\n  void SetUp() override {\n    std::locale::global(std::locale(\"\"));\n  }\n};\n\nTEST(UnicharsetTest, Basics) {\n  // This test verifies basic insertion, unichar_to_id, and encode.\n  UNICHARSET u;\n  u.unichar_insert(\"a\");\n  EXPECT_EQ(u.size(), 4);\n  u.unichar_insert(\"f\");\n  EXPECT_EQ(u.size(), 5);\n  u.unichar_insert(\"i\");\n  EXPECT_EQ(u.size(), 6);\n  // The fi ligature is NOT added because it can be encoded with a cleanup as f\n  // then i.\n  u.unichar_insert(\"\\ufb01\");\n  EXPECT_EQ(u.size(), 6);\n  u.unichar_insert(\"e\");\n  EXPECT_EQ(u.size(), 7);\n  u.unichar_insert(\"n\");\n  EXPECT_EQ(u.size(), 8);\n  EXPECT_EQ(u.unichar_to_id(\"f\"), 4);\n  EXPECT_EQ(u.unichar_to_id(\"i\"), 5);\n  // The fi ligature has no valid id.\n  EXPECT_EQ(u.unichar_to_id(\"\\ufb01\"), INVALID_UNICHAR_ID);\n  // The fi pair has no valid id.\n  EXPECT_EQ(u.unichar_to_id(\"fi\"), INVALID_UNICHAR_ID);\n  std::vector<int> labels;\n  EXPECT_TRUE(u.encode_string(\"affine\", true, &labels, nullptr, nullptr));\n  std::vector<int> v(&labels[0], &labels[0] + labels.size());\n  
EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));\n  // With the fi ligature encoding fails without a pre-cleanup.\n  std::string lig_str = \"af\\ufb01ne\";\n  EXPECT_FALSE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));\n  lig_str = u.CleanupString(lig_str.c_str());\n  EXPECT_TRUE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));\n  v = std::vector<int>(&labels[0], &labels[0] + labels.size());\n  EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));\n}\n\nTEST(UnicharsetTest, Multibyte) {\n  // This test verifies basic insertion, unichar_to_id, and encode.\n  // The difference from Basic above is that now we are testing multi-byte\n  // unicodes instead of single byte.\n  UNICHARSET u;\n  // Insert some Arabic letters.\n  u.unichar_insert(\"\\u0627\");\n  EXPECT_EQ(u.size(), 4);\n  u.unichar_insert(\"\\u062c\");\n  EXPECT_EQ(u.size(), 5);\n  u.unichar_insert(\"\\u062f\");\n  EXPECT_EQ(u.size(), 6);\n  u.unichar_insert(\"\\ufb01\"); // fi ligature is added as fi pair.\n  EXPECT_EQ(u.size(), 7);\n  u.unichar_insert(\"\\u062b\");\n  EXPECT_EQ(u.size(), 8);\n  u.unichar_insert(\"\\u0635\");\n  EXPECT_EQ(u.size(), 9);\n  EXPECT_EQ(u.unichar_to_id(\"\\u0627\"), 3);\n  EXPECT_EQ(u.unichar_to_id(\"\\u062c\"), 4);\n  // The first two bytes of this string is \\u0627, which matches id 3;\n  EXPECT_EQ(u.unichar_to_id(\"\\u0627\\u062c\", 2), 3);\n  EXPECT_EQ(u.unichar_to_id(\"\\u062f\"), 5);\n  // Individual f and i are not present, but they are there as a pair.\n  EXPECT_EQ(u.unichar_to_id(\"f\"), INVALID_UNICHAR_ID);\n  EXPECT_EQ(u.unichar_to_id(\"i\"), INVALID_UNICHAR_ID);\n  EXPECT_EQ(u.unichar_to_id(\"fi\"), 6);\n  // The fi ligature is findable.\n  EXPECT_EQ(u.unichar_to_id(\"\\ufb01\"), 6);\n  std::vector<int> labels;\n  EXPECT_TRUE(\n      u.encode_string(\"\\u0627\\u062c\\u062c\\u062f\\u0635\\u062b\", true, &labels, nullptr, nullptr));\n  std::vector<int> v(&labels[0], &labels[0] + labels.size());\n  EXPECT_THAT(v, 
ElementsAreArray({3, 4, 4, 5, 8, 7}));\n  // With the fi ligature the fi is picked out.\n  std::vector<char> lengths;\n  unsigned encoded_length;\n  std::string src_str = \"\\u0627\\u062c\\ufb01\\u0635\\u062b\";\n  // src_str has to be pre-cleaned for lengths to be correct.\n  std::string cleaned = u.CleanupString(src_str.c_str());\n  EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, &encoded_length));\n  EXPECT_EQ(encoded_length, cleaned.size());\n  std::string len_str(&lengths[0], lengths.size());\n  EXPECT_STREQ(len_str.c_str(), \"\\002\\002\\002\\002\\002\");\n  v = std::vector<int>(&labels[0], &labels[0] + labels.size());\n  EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7}));\n}\n\nTEST(UnicharsetTest, MultibyteBigrams) {\n  // This test verifies basic insertion, unichar_to_id, and encode.\n  // The difference from Basic above is that now we are testing multi-byte\n  // unicodes instead of single byte.\n  UNICHARSET u;\n  // Insert some Arabic letters.\n  u.unichar_insert(\"\\u0c9c\");\n  EXPECT_EQ(u.size(), 4);\n  u.unichar_insert(\"\\u0cad\");\n  EXPECT_EQ(u.size(), 5);\n  u.unichar_insert(\"\\u0ccd\\u0c9c\");\n  EXPECT_EQ(u.size(), 6);\n  u.unichar_insert(\"\\u0ccd\");\n  EXPECT_EQ(u.size(), 7);\n  // By default the encodable bigram is NOT added.\n  u.unichar_insert(\"\\u0ccd\\u0cad\");\n  EXPECT_EQ(u.size(), 7);\n  // It is added if we force it to be.\n  u.unichar_insert(\"\\u0ccd\\u0cad\", OldUncleanUnichars::kTrue);\n  EXPECT_EQ(u.size(), 8);\n  std::vector<char> data;\n  tesseract::TFile fp;\n  fp.OpenWrite(&data);\n  u.save_to_file(&fp);\n  fp.Open(&data[0], data.size());\n  UNICHARSET v;\n  v.load_from_file(&fp, false);\n  EXPECT_EQ(v.unichar_to_id(\"\\u0c9c\"), 3);\n  EXPECT_EQ(v.unichar_to_id(\"\\u0cad\"), 4);\n  EXPECT_EQ(v.unichar_to_id(\"\\u0ccd\\u0c9c\"), 5);\n  EXPECT_EQ(v.unichar_to_id(\"\\u0ccd\"), 6);\n  EXPECT_EQ(v.unichar_to_id(\"\\u0ccd\\u0cad\"), 7);\n}\n\nTEST(UnicharsetTest, OldStyle) {\n  // This test verifies an 
old unicharset that contains fi/fl ligatures loads\n  // and keeps all the entries.\n  std::string filename = file::JoinPath(TESTDATA_DIR, \"eng.unicharset\");\n  UNICHARSET u;\n  LOG(INFO) << \"Filename=\" << filename;\n  EXPECT_TRUE(u.load_from_file(filename.c_str()));\n  EXPECT_EQ(u.size(), 111);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/util/utf8/unicodetext.cc",
    "content": "/**\n * Copyright 2010 Google Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include \"include_gunit.h\"\n#include \"util/utf8/unicodetext.h\"\n\n#include <string.h>  // for memcpy, NULL, memcmp, etc\n#include <algorithm> // for max\n\n//#include \"base/logging.h\"               // for operator<<, CHECK, etc\n//#include \"base/stringprintf.h\"          // for StringPrintf, StringAppendF\n//#include \"strings/stringpiece.h\"        // for StringPiece, etc\n\n#include \"third_party/utf/utf.h\"         // for isvalidcharntorune, etc\n#include \"util/utf8/unilib.h\"            // for IsInterchangeValid, etc\n#include \"util/utf8/unilib_utf8_utils.h\" // for OneCharLen\n\nstatic int CodepointDistance(const char *start, const char *end) {\n  int n = 0;\n  // Increment n on every non-trail-byte.\n  for (const char *p = start; p < end; ++p) {\n    n += (*reinterpret_cast<const signed char *>(p) >= -0x40);\n  }\n  return n;\n}\n\nstatic int CodepointCount(const char *utf8, int len) {\n  return CodepointDistance(utf8, utf8 + len);\n}\n\nUnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first,\n                                                      const UnicodeText::const_iterator &last) {\n  return CodepointDistance(first.it_, last.it_);\n}\n\n// ---------- Utility ----------\n\nstatic int ConvertToInterchangeValid(char *start, int len) {\n  // This routine is called only when we've 
discovered that a UTF-8 buffer\n  // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8\n  // was not interchange valid. This indicates a bug in the caller, and\n  // a LOG(WARNING) is done in that case.\n  // This is similar to CoerceToInterchangeValid, but it replaces each\n  // structurally valid byte with a space, and each non-interchange\n  // character with a space, even when that character requires more\n  // than one byte in UTF8. E.g., \"\\xEF\\xB7\\x90\" (U+FDD0) is\n  // structurally valid UTF8, but U+FDD0 is not an interchange-valid\n  // code point. The result should contain one space, not three.\n  //\n  // Since the conversion never needs to write more data than it\n  // reads, it is safe to change the buffer in place. It returns the\n  // number of bytes written.\n  char *const in = start;\n  char *out = start;\n  char *const end = start + len;\n  while (start < end) {\n    int good = UniLib::SpanInterchangeValid(start, end - start);\n    if (good > 0) {\n      if (out != start) {\n        memmove(out, start, good);\n      }\n      out += good;\n      start += good;\n      if (start == end) {\n        break;\n      }\n    }\n    // Is the current string invalid UTF8 or just non-interchange UTF8?\n    char32 rune;\n    int n;\n    if (isvalidcharntorune(start, end - start, &rune, &n)) {\n      // structurally valid UTF8, but not interchange valid\n      start += n; // Skip over the whole character.\n    } else {      // bad UTF8\n      start += 1; // Skip over just one byte\n    }\n    *out++ = ' ';\n  }\n  return out - in;\n}\n\n// *************** Data representation **********\n\n// Note: the copy constructor is undefined.\n\n// After reserve(), resize(), or clear(), we're an owner, not an alias.\n\nvoid UnicodeText::Repr::reserve(int new_capacity) {\n  // If there's already enough capacity, and we're an owner, do nothing.\n  if (capacity_ >= new_capacity && ours_)\n    return;\n\n  // Otherwise, allocate a new buffer.\n  capacity_ = 
std::max(new_capacity, (3 * capacity_) / 2 + 20);\n  char *new_data = new char[capacity_];\n\n  // If there is an old buffer, copy it into the new buffer.\n  if (data_) {\n    memcpy(new_data, data_, size_);\n    if (ours_)\n      delete[] data_; // If we owned the old buffer, free it.\n  }\n  data_ = new_data;\n  ours_ = true; // We own the new buffer.\n  // size_ is unchanged.\n}\n\nvoid UnicodeText::Repr::resize(int new_size) {\n  if (new_size == 0) {\n    clear();\n  } else {\n    if (!ours_ || new_size > capacity_)\n      reserve(new_size);\n    // Clear the memory in the expanded part.\n    if (size_ < new_size)\n      memset(data_ + size_, 0, new_size - size_);\n    size_ = new_size;\n    ours_ = true;\n  }\n}\n\n// This implementation of clear() deallocates the buffer if we're an owner.\n// That's not strictly necessary; we could just set size_ to 0.\nvoid UnicodeText::Repr::clear() {\n  if (ours_)\n    delete[] data_;\n  data_ = nullptr;\n  size_ = capacity_ = 0;\n  ours_ = true;\n}\n\nvoid UnicodeText::Repr::Copy(const char *data, int size) {\n  resize(size);\n  memcpy(data_, data, size);\n}\n\nvoid UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) {\n  if (data == data_)\n    return; // We already own this memory. 
(Weird case.)\n  if (ours_ && data_)\n    delete[] data_; // If we owned the old buffer, free it.\n  data_ = data;\n  size_ = size;\n  capacity_ = capacity;\n  ours_ = true;\n}\n\nvoid UnicodeText::Repr::PointTo(const char *data, int size) {\n  if (ours_ && data_)\n    delete[] data_; // If we owned the old buffer, free it.\n  data_ = const_cast<char *>(data);\n  size_ = size;\n  capacity_ = size;\n  ours_ = false;\n}\n\nvoid UnicodeText::Repr::append(const char *bytes, int byte_length) {\n  reserve(size_ + byte_length);\n  memcpy(data_ + size_, bytes, byte_length);\n  size_ += byte_length;\n}\n\n#ifdef INCLUDE_TENSORFLOW\nstring UnicodeText::Repr::DebugString() const {\n  return tensorflow::strings::Printf(\"{Repr %p data=%p size=%d capacity=%d %s}\", this, data_, size_,\n                                     capacity_, ours_ ? \"Owned\" : \"Alias\");\n}\n#endif\n\n// *************** UnicodeText ******************\n\n// ----- Constructors -----\n\n// Default constructor\nUnicodeText::UnicodeText() {}\n\n// Copy constructor\nUnicodeText::UnicodeText(const UnicodeText &src) {\n  Copy(src);\n}\n\n// Substring constructor\nUnicodeText::UnicodeText(const UnicodeText::const_iterator &first,\n                         const UnicodeText::const_iterator &last) {\n  CHECK(first <= last) << \" Incompatible iterators\";\n  repr_.append(first.it_, last.it_ - first.it_);\n}\n\nstring UnicodeText::UTF8Substring(const const_iterator &first, const const_iterator &last) {\n  CHECK(first <= last) << \" Incompatible iterators\";\n  return string(first.it_, last.it_ - first.it_);\n}\n\n// ----- Copy -----\n\nUnicodeText &UnicodeText::operator=(const UnicodeText &src) {\n  if (this != &src) {\n    Copy(src);\n  }\n  return *this;\n}\n\nUnicodeText &UnicodeText::Copy(const UnicodeText &src) {\n  repr_.Copy(src.repr_.data_, src.repr_.size_);\n  return *this;\n}\n\nUnicodeText &UnicodeText::CopyUTF8(const char *buffer, int byte_length) {\n  repr_.Copy(buffer, byte_length);\n  if 
(!UniLib::IsInterchangeValid(buffer, byte_length)) {\n    LOG(WARNING) << \"UTF-8 buffer is not interchange-valid.\";\n    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);\n  }\n  return *this;\n}\n\nUnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) {\n  repr_.Copy(buffer, byte_length);\n  return *this;\n}\n\n// ----- TakeOwnershipOf  -----\n\nUnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) {\n  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);\n  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {\n    LOG(WARNING) << \"UTF-8 buffer is not interchange-valid.\";\n    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);\n  }\n  return *this;\n}\n\nUnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length,\n                                                    int byte_capacity) {\n  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);\n  return *this;\n}\n\n// ----- PointTo -----\n\nUnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {\n  if (UniLib::IsInterchangeValid(buffer, byte_length)) {\n    repr_.PointTo(buffer, byte_length);\n  } else {\n    LOG(WARNING) << \"UTF-8 buffer is not interchange-valid.\";\n    repr_.Copy(buffer, byte_length);\n    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);\n  }\n  return *this;\n}\n\nUnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) {\n  repr_.PointTo(buffer, byte_length);\n  return *this;\n}\n\nUnicodeText &UnicodeText::PointTo(const UnicodeText &src) {\n  repr_.PointTo(src.repr_.data_, src.repr_.size_);\n  return *this;\n}\n\nUnicodeText &UnicodeText::PointTo(const const_iterator &first, const const_iterator &last) {\n  CHECK(first <= last) << \" Incompatible iterators\";\n  repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());\n  return *this;\n}\n\n// ----- Append -----\n\nUnicodeText 
&UnicodeText::append(const UnicodeText &u) {\n  repr_.append(u.repr_.data_, u.repr_.size_);\n  return *this;\n}\n\nUnicodeText &UnicodeText::append(const const_iterator &first, const const_iterator &last) {\n  CHECK(first <= last) << \" Incompatible iterators\";\n  repr_.append(first.it_, last.it_ - first.it_);\n  return *this;\n}\n\nUnicodeText &UnicodeText::UnsafeAppendUTF8(const char *utf8, int len) {\n  repr_.append(utf8, len);\n  return *this;\n}\n\n// ----- substring searching -----\n\nUnicodeText::const_iterator UnicodeText::find(const UnicodeText &look,\n                                              const_iterator start_pos) const {\n  CHECK_GE(start_pos.utf8_data(), utf8_data());\n  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());\n  return UnsafeFind(look, start_pos);\n}\n\nUnicodeText::const_iterator UnicodeText::find(const UnicodeText &look) const {\n  return UnsafeFind(look, begin());\n}\n\nUnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,\n                                                    const_iterator start_pos) const {\n  // Due to the magic of the UTF8 encoding, searching for a sequence of\n  // letters is equivalent to substring search.\n#ifdef INCLUDE_TENSORFLOW\n  StringPiece searching(utf8_data(), utf8_length());\n  StringPiece look_piece(look.utf8_data(), look.utf8_length());\n#endif\n  LOG(FATAL) << \"Not implemented\";\n#ifdef INCLUDE_TENSORFLOW\n  // StringPiece::size_type found =\n  //    searching.find(look_piece, start_pos.utf8_data() - utf8_data());\n  StringPiece::size_type found = StringPiece::npos;\n  if (found == StringPiece::npos)\n    return end();\n  return const_iterator(utf8_data() + found);\n#else\n  return end();\n#endif\n}\n\n#ifdef INCLUDE_TENSORFLOW\nbool UnicodeText::HasReplacementChar() const {\n  // Equivalent to:\n  //   UnicodeText replacement_char;\n  //   replacement_char.push_back(0xFFFD);\n  //   return find(replacement_char) != end();\n  StringPiece 
searching(utf8_data(), utf8_length());\n  StringPiece looking_for(\"\\xEF\\xBF\\xBD\", 3);\n  LOG(FATAL) << \"Not implemented\";\n  // return searching.find(looking_for) != StringPiece::npos;\n  return false;\n}\n#endif\n\n// ----- other methods -----\n\n// Clear operator\nvoid UnicodeText::clear() {\n  repr_.clear();\n}\n\n// Destructor\nUnicodeText::~UnicodeText() {}\n\nvoid UnicodeText::push_back(char32 c) {\n  if (UniLib::IsValidCodepoint(c)) {\n    char buf[UTFmax];\n    int len = runetochar(buf, &c);\n    if (UniLib::IsInterchangeValid(buf, len)) {\n      repr_.append(buf, len);\n    } else {\n      LOG(WARNING) << \"Unicode value 0x\" << std::hex << c << \" is not valid for interchange\";\n      repr_.append(\" \", 1);\n    }\n  } else {\n    LOG(WARNING) << \"Illegal Unicode value: 0x\" << std::hex << c;\n    repr_.append(\" \", 1);\n  }\n}\n\nint UnicodeText::size() const {\n  return CodepointCount(repr_.data_, repr_.size_);\n}\n\nbool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {\n  if (&lhs == &rhs)\n    return true;\n  if (lhs.repr_.size_ != rhs.repr_.size_)\n    return false;\n  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;\n}\n\n#ifdef INCLUDE_TENSORFLOW\nstring UnicodeText::DebugString() const {\n  return tensorflow::strings::Printf(\"{UnicodeText %p chars=%d repr=%s}\", this, size(),\n                                     repr_.DebugString().c_str());\n}\n#endif\n\n// ******************* UnicodeText::const_iterator *********************\n\n// The implementation of const_iterator would be nicer if it\n// inherited from boost::iterator_facade\n// (http://boost.org/libs/iterator/doc/iterator_facade.html).\n\nUnicodeText::const_iterator::const_iterator() : it_(nullptr) {}\n\nUnicodeText::const_iterator::const_iterator(const const_iterator &other) : it_(other.it_) {}\n\nUnicodeText::const_iterator &UnicodeText::const_iterator::operator=(const const_iterator &other) {\n  if (&other != this)\n    it_ = other.it_;\n  
return *this;\n}\n\nUnicodeText::const_iterator UnicodeText::begin() const {\n  return const_iterator(repr_.data_);\n}\n\nUnicodeText::const_iterator UnicodeText::end() const {\n  return const_iterator(repr_.data_ + repr_.size_);\n}\n\nbool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs) {\n  return lhs.it_ < rhs.it_;\n}\n\nchar32 UnicodeText::const_iterator::operator*() const {\n  // (We could call chartorune here, but that does some\n  // error-checking, and we're guaranteed that our data is valid\n  // UTF-8. Also, we expect this routine to be called very often. So\n  // for speed, we do the calculation ourselves.)\n\n  // Convert from UTF-8\n  unsigned char byte1 = it_[0];\n  if (byte1 < 0x80)\n    return byte1;\n\n  unsigned char byte2 = it_[1];\n  if (byte1 < 0xE0)\n    return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);\n\n  unsigned char byte3 = it_[2];\n  if (byte1 < 0xF0)\n    return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);\n\n  unsigned char byte4 = it_[3];\n  return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);\n}\n\nUnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {\n  it_ += UniLib::OneCharLen(it_);\n  return *this;\n}\n\nUnicodeText::const_iterator &UnicodeText::const_iterator::operator--() {\n  while (UniLib::IsTrailByte(*--it_))\n    ;\n  return *this;\n}\n\nint UnicodeText::const_iterator::get_utf8(char *utf8_output) const {\n  utf8_output[0] = it_[0];\n  if ((it_[0] & 0xff) < 0x80)\n    return 1;\n  utf8_output[1] = it_[1];\n  if ((it_[0] & 0xff) < 0xE0)\n    return 2;\n  utf8_output[2] = it_[2];\n  if ((it_[0] & 0xff) < 0xF0)\n    return 3;\n  utf8_output[3] = it_[3];\n  return 4;\n}\n\nstring UnicodeText::const_iterator::get_utf8_string() const {\n  return string(utf8_data(), utf8_length());\n}\n\nint UnicodeText::const_iterator::utf8_length() const {\n  if ((it_[0] & 0xff) < 0x80) {\n    return 1;\n  } else if ((it_[0] 
& 0xff) < 0xE0) {\n    return 2;\n  } else if ((it_[0] & 0xff) < 0xF0) {\n    return 3;\n  } else {\n    return 4;\n  }\n}\n\nUnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {\n  CHECK(p != nullptr);\n  const char *start = utf8_data();\n  int len = utf8_length();\n  const char *end = start + len;\n  CHECK(p >= start);\n  CHECK(p <= end);\n  CHECK(p == end || !UniLib::IsTrailByte(*p));\n  return const_iterator(p);\n}\n\n#ifdef INCLUDE_TENSORFLOW\nstring UnicodeText::const_iterator::DebugString() const {\n  return tensorflow::strings::Printf(\"{iter %p}\", it_);\n}\n\n// *************************** Utilities *************************\n\nstring CodepointString(const UnicodeText &t) {\n  string s;\n  UnicodeText::const_iterator it = t.begin(), end = t.end();\n  while (it != end)\n    tensorflow::strings::Appendf(&s, \"%X \", *it++);\n  return s;\n}\n#endif\n"
  },
  {
    "path": "unittest/util/utf8/unicodetext.h",
    "content": "/**\n * Copyright 2010 Google Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_\n#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_\n\n#include <stddef.h> // for NULL, ptrdiff_t\n#include <iterator> // for bidirectional_iterator_tag, etc\n#include <string>   // for string\n#include <utility>  // for pair\n\n#include \"syntaxnet/base.h\"\n\n// ***************************** UnicodeText **************************\n//\n// A UnicodeText object is a container for a sequence of Unicode\n// codepoint values. It has default, copy, and assignment constructors.\n// Data can be appended to it from another UnicodeText, from\n// iterators, or from a single codepoint.\n//\n// The internal representation of the text is UTF-8. Since UTF-8 is a\n// variable-width format, UnicodeText does not provide random access\n// to the text, and changes to the text are permitted only at the end.\n//\n// The UnicodeText class defines a const_iterator. The dereferencing\n// operator (*) returns a codepoint (char32). The iterator is a\n// bidirectional, read-only iterator. It becomes invalid if the text\n// is changed.\n//\n// There are methods for appending and retrieving UTF-8 data directly.\n// The 'utf8_data' method returns a const char* that contains the\n// UTF-8-encoded version of the text; 'utf8_length' returns the number\n// of bytes in the UTF-8 data. 
An iterator's 'get' method stores up to\n// 4 bytes of UTF-8 data in a char array and returns the number of\n// bytes that it stored.\n//\n// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,\n// 0x10FFFF], but UnicodeText has the additional restriction that it\n// can contain only those characters that are valid for interchange on\n// the Web. This excludes all of the control codes except for carriage\n// return, line feed, and horizontal tab.  It also excludes\n// non-characters, but codepoints that are in the Private Use regions\n// are allowed, as are codepoints that are unassigned. (See the\n// Unicode reference for details.) The function UniLib::IsInterchangeValid\n// can be used as a test for this property.\n//\n// UnicodeTexts are safe. Every method that constructs or modifies a\n// UnicodeText tests for interchange-validity, and will substitute a\n// space for the invalid data. Such cases are reported via\n// LOG(WARNING).\n//\n// MEMORY MANAGEMENT: copy, take ownership, or point to\n//\n// A UnicodeText is either an \"owner\", meaning that it owns the memory\n// for the data buffer and will free it when the UnicodeText is\n// destroyed, or it is an \"alias\", meaning that it does not.\n//\n// There are three methods for storing UTF-8 data in a UnicodeText:\n//\n// CopyUTF8(buffer, len) copies buffer.\n//\n// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.\n//\n// PointToUTF8(buffer, size) creates an alias pointing to buffer.\n//\n// All three methods perform a validity check on the buffer. There are\n// private, \"unsafe\" versions of these functions that bypass the\n// validity check. 
They are used internally and by friend-functions\n// that are handling UTF-8 data that has already been validated.\n//\n// The purpose of an alias is to avoid making an unnecessary copy of a\n// UTF-8 buffer while still providing access to the Unicode values\n// within that text through iterators or the fast scanners that are\n// based on UTF-8 state tables. The lifetime of an alias must not\n// exceed the lifetime of the buffer from which it was constructed.\n//\n// The semantics of an alias might be described as \"copy on write or\n// repair.\" The source data is never modified. If push_back() or\n// append() is called on an alias, a copy of the data will be created,\n// and the UnicodeText will become an owner. If clear() is called on\n// an alias, it becomes an (empty) owner.\n//\n// The copy constructor and the assignment operator produce an owner.\n// That is, after direct initialization (\"UnicodeText x(y);\") or copy\n// initialization (\"UnicodeText x = y;\") x will be an owner, even if y\n// was an alias. The assignment operator (\"x = y;\") also produces an\n// owner unless x and y are the same object and y is an alias.\n//\n// Aliases should be used with care. If the source from which an alias\n// was created is freed, or if the contents are changed, while the\n// alias is still in use, fatal errors could result. But it can be\n// quite useful to have a UnicodeText \"window\" through which to see a\n// UTF-8 buffer without having to pay the price of making a copy.\n//\n// UTILITIES\n//\n// The interfaces in util/utf8/public/textutils.h provide higher-level\n// utilities for dealing with UnicodeTexts, including routines for\n// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or\n// strings, creating strings from UnicodeTexts, normalizing text for\n// efficient matching or display, and others.\n\nclass UnicodeText {\npublic:\n  class const_iterator;\n\n  typedef char32 value_type;\n\n  // Constructors. 
These always produce owners.\n  UnicodeText();                       // Create an empty text.\n  UnicodeText(const UnicodeText &src); // copy constructor\n  // Construct a substring (copies the data).\n  UnicodeText(const const_iterator &first, const const_iterator &last);\n\n  // Assignment operator. This copies the data and produces an owner\n  // unless this == &src, e.g., \"x = x;\", which is a no-op.\n  UnicodeText &operator=(const UnicodeText &src);\n\n  // x.Copy(y) copies the data from y into x.\n  UnicodeText &Copy(const UnicodeText &src);\n  inline UnicodeText &assign(const UnicodeText &src) {\n    return Copy(src);\n  }\n\n  // x.PointTo(y) changes x so that it points to y's data.\n  // It does not copy y or take ownership of y's data.\n  UnicodeText &PointTo(const UnicodeText &src);\n  UnicodeText &PointTo(const const_iterator &first, const const_iterator &last);\n\n  ~UnicodeText();\n\n  void clear(); // Clear text.\n  bool empty() const {\n    return repr_.size_ == 0;\n  } // Test if text is empty.\n\n  // Add a codepoint to the end of the text.\n  // If the codepoint is not interchange-valid, add a space instead\n  // and log a warning.\n  void push_back(char32 codepoint);\n\n  // Generic appending operation.\n  // iterator_traits<ForwardIterator>::value_type must be implicitly\n  // convertible to char32. 
Typical uses of this method might include:\n  //     char32 chars[] = {0x1, 0x2, ...};\n  //     vector<char32> more_chars = ...;\n  //     utext.append(chars, chars+arraysize(chars));\n  //     utext.append(more_chars.begin(), more_chars.end());\n  template <typename ForwardIterator>\n  UnicodeText &append(ForwardIterator first, const ForwardIterator last) {\n    while (first != last) {\n      push_back(*first++);\n    }\n    return *this;\n  }\n\n  // A specialization of the generic append() method.\n  UnicodeText &append(const const_iterator &first, const const_iterator &last);\n\n  // An optimization of append(source.begin(), source.end()).\n  UnicodeText &append(const UnicodeText &source);\n\n  int size() const; // the number of Unicode characters (codepoints)\n\n  friend bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);\n  friend bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs);\n\n  class const_iterator {\n    typedef const_iterator CI;\n\n  public:\n    typedef std::bidirectional_iterator_tag iterator_category;\n    typedef char32 value_type;\n    typedef ptrdiff_t difference_type;\n    typedef void pointer;           // (Not needed.)\n    typedef const char32 reference; // (Needed for const_reverse_iterator)\n\n    // Iterators are default-constructible.\n    const_iterator();\n\n    // It's safe to make multiple passes over a UnicodeText.\n    const_iterator(const const_iterator &other);\n    const_iterator &operator=(const const_iterator &other);\n\n    char32 operator*() const; // Dereference\n\n    const_iterator &operator++();    // Advance (++iter)\n    const_iterator operator++(int) { // (iter++)\n      const_iterator result(*this);\n      ++*this;\n      return result;\n    }\n\n    const_iterator &operator--();    // Retreat (--iter)\n    const_iterator operator--(int) { // (iter--)\n      const_iterator result(*this);\n      --*this;\n      return result;\n    }\n\n    // We love relational operators.\n    friend 
bool operator==(const CI &lhs, const CI &rhs) {\n      return lhs.it_ == rhs.it_;\n    }\n    friend bool operator!=(const CI &lhs, const CI &rhs) {\n      return !(lhs == rhs);\n    }\n    friend bool operator<(const CI &lhs, const CI &rhs);\n    friend bool operator>(const CI &lhs, const CI &rhs) {\n      return rhs < lhs;\n    }\n    friend bool operator<=(const CI &lhs, const CI &rhs) {\n      return !(rhs < lhs);\n    }\n    friend bool operator>=(const CI &lhs, const CI &rhs) {\n      return !(lhs < rhs);\n    }\n\n    friend difference_type distance(const CI &first, const CI &last);\n\n    // UTF-8-specific methods\n    // Store the UTF-8 encoding of the current codepoint into buf,\n    // which must be at least 4 bytes long. Return the number of\n    // bytes written.\n    int get_utf8(char *buf) const;\n    // Return the UTF-8 character that the iterator points to.\n    string get_utf8_string() const;\n    // Return the byte length of the UTF-8 character the iterator points to.\n    int utf8_length() const;\n    // Return the iterator's pointer into the UTF-8 data.\n    const char *utf8_data() const {\n      return it_;\n    }\n\n    string DebugString() const;\n\n  private:\n    friend class UnicodeText;\n    friend class UnicodeTextUtils;\n    friend class UTF8StateTableProperty;\n    explicit const_iterator(const char *it) : it_(it) {}\n\n    const char *it_;\n  };\n\n  const_iterator begin() const;\n  const_iterator end() const;\n\n  class const_reverse_iterator : public std::reverse_iterator<const_iterator> {\n  public:\n    explicit const_reverse_iterator(const_iterator it)\n        : std::reverse_iterator<const_iterator>(it) {}\n    const char *utf8_data() const {\n      const_iterator tmp_it = base();\n      return (--tmp_it).utf8_data();\n    }\n    int get_utf8(char *buf) const {\n      const_iterator tmp_it = base();\n      return (--tmp_it).get_utf8(buf);\n    }\n    string get_utf8_string() const {\n      const_iterator tmp_it = base();\n      
return (--tmp_it).get_utf8_string();\n    }\n    int utf8_length() const {\n      const_iterator tmp_it = base();\n      return (--tmp_it).utf8_length();\n    }\n  };\n  const_reverse_iterator rbegin() const {\n    return const_reverse_iterator(end());\n  }\n  const_reverse_iterator rend() const {\n    return const_reverse_iterator(begin());\n  }\n\n  // Substring searching.  Returns the beginning of the first\n  // occurrence of \"look\", or end() if not found.\n  const_iterator find(const UnicodeText &look, const_iterator start_pos) const;\n  // Equivalent to find(look, begin())\n  const_iterator find(const UnicodeText &look) const;\n\n  // Returns whether this contains the character U+FFFD.  This can\n  // occur, for example, if the input to Encodings::Decode() had byte\n  // sequences that were invalid in the source encoding.\n  bool HasReplacementChar() const;\n\n  // UTF-8-specific methods\n  //\n  // Return the data, length, and capacity of UTF-8-encoded version of\n  // the text. Length and capacity are measured in bytes.\n  const char *utf8_data() const {\n    return repr_.data_;\n  }\n  int utf8_length() const {\n    return repr_.size_;\n  }\n  int utf8_capacity() const {\n    return repr_.capacity_;\n  }\n\n  // Return the UTF-8 data as a string.\n  static string UTF8Substring(const const_iterator &first, const const_iterator &last);\n\n  // There are three methods for initializing a UnicodeText from UTF-8\n  // data. They vary in details of memory management. In all cases,\n  // the data is tested for interchange-validity. If it is not\n  // interchange-valid, a LOG(WARNING) is issued, and each\n  // structurally invalid byte and each interchange-invalid codepoint\n  // is replaced with a space.\n\n  // x.CopyUTF8(buf, len) copies buf into x.\n  UnicodeText &CopyUTF8(const char *utf8_buffer, int byte_length);\n\n  // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of\n  // buf. 
buf is not copied.\n  UnicodeText &TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);\n\n  // x.PointToUTF8(buf,len) changes x so that it points to buf\n  // (\"becomes an alias\"). It does not take ownership or copy buf.\n  // If the buffer is not valid, this has the same effect as\n  // CopyUTF8(utf8_buffer, byte_length).\n  UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length);\n\n  // Occasionally it is necessary to use functions that operate on the\n  // pointer returned by utf8_data(). MakeIterator(p) provides a way\n  // to get back to the UnicodeText level. It uses CHECK to ensure\n  // that p is a pointer within this object's UTF-8 data, and that it\n  // points to the beginning of a character.\n  const_iterator MakeIterator(const char *p) const;\n\n  string DebugString() const;\n\nprivate:\n  friend class const_iterator;\n  friend class UnicodeTextUtils;\n\n  class Repr { // A byte-string.\n  public:\n    char *data_;\n    int size_;\n    int capacity_;\n    bool ours_; // Do we own data_?\n\n    Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}\n    ~Repr() {\n      if (ours_)\n        delete[] data_;\n    }\n\n    void clear();\n    void reserve(int capacity);\n    void resize(int size);\n\n    void append(const char *bytes, int byte_length);\n    void Copy(const char *data, int size);\n    void TakeOwnershipOf(char *data, int size, int capacity);\n    void PointTo(const char *data, int size);\n\n    string DebugString() const;\n\n  private:\n    Repr &operator=(const Repr &);\n    Repr(const Repr &other);\n  };\n\n  Repr repr_;\n\n  // UTF-8-specific private methods.\n  // These routines do not perform a validity check when compiled\n  // in opt mode.\n  // It is an error to call these methods with UTF-8 data that\n  // is not interchange-valid.\n  //\n  UnicodeText &UnsafeCopyUTF8(const char *utf8_buffer, int byte_length);\n  UnicodeText &UnsafeTakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, 
int byte_capacity);\n  UnicodeText &UnsafePointToUTF8(const char *utf8_buffer, int byte_length);\n  UnicodeText &UnsafeAppendUTF8(const char *utf8_buffer, int byte_length);\n  const_iterator UnsafeFind(const UnicodeText &look, const_iterator start_pos) const;\n};\n\nbool operator==(const UnicodeText &lhs, const UnicodeText &rhs);\n\ninline bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs) {\n  return !(lhs == rhs);\n}\n\n// UnicodeTextRange is a pair of iterators, useful for specifying text\n// segments. If the iterators are ==, the segment is empty.\ntypedef pair<UnicodeText::const_iterator, UnicodeText::const_iterator> UnicodeTextRange;\n\ninline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange &r) {\n  return r.first == r.second;\n}\n\n// *************************** Utilities *************************\n\n// A factory function for creating a UnicodeText from a buffer of\n// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It\n// is an \"owner.\")\n//\n// Each byte that is structurally invalid will be replaced with a\n// space. Each codepoint that is interchange-invalid will also be\n// replaced with a space, even if the codepoint was represented with a\n// multibyte sequence in the UTF-8 data.\n//\ninline UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length,\n                                                     int byte_capacity) {\n  return UnicodeText().TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity);\n}\n\n// A factory function for creating a UnicodeText from a buffer of\n// UTF-8 data. The new UnicodeText does not take ownership of the\n// buffer. 
(It is an \"alias.\")\n//\ninline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(const char *utf8_buffer,\n                                                            int byte_length) {\n  return UnicodeText().PointToUTF8(utf8_buffer, byte_length);\n}\n\n// Create a UnicodeText from a UTF-8 string or buffer.\n//\n// If do_copy is true, then a copy of the string is made. The copy is\n// owned by the resulting UnicodeText object and will be freed when\n// the object is destroyed. This UnicodeText object is referred to\n// as an \"owner.\"\n//\n// If do_copy is false, then no copy is made. The resulting\n// UnicodeText object does NOT take ownership of the string; in this\n// case, the lifetime of the UnicodeText object must not exceed the\n// lifetime of the string. This UnicodeText object is referred to as\n// an \"alias.\" This is the same as MakeUnicodeTextWithoutAcceptingOwnership.\n//\n// If the input string does not contain valid UTF-8, then a copy is\n// made (as if do_copy were true) and coerced to valid UTF-8 by\n// replacing each invalid byte with a space.\n//\ninline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy) {\n  UnicodeText t;\n  if (do_copy) {\n    t.CopyUTF8(utf8_buf, len);\n  } else {\n    t.PointToUTF8(utf8_buf, len);\n  }\n  return t;\n}\n\ninline UnicodeText UTF8ToUnicodeText(const string &utf_string, bool do_copy) {\n  return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);\n}\n\ninline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len) {\n  return UTF8ToUnicodeText(utf8_buf, len, true);\n}\ninline UnicodeText UTF8ToUnicodeText(const string &utf8_string) {\n  return UTF8ToUnicodeText(utf8_string, true);\n}\n\n// Return a string containing the UTF-8 encoded version of all the\n// Unicode characters in t.\ninline string UnicodeTextToUTF8(const UnicodeText &t) {\n  return string(t.utf8_data(), t.utf8_length());\n}\n\n// This template function declaration is used in defining 
arraysize.\n// Note that the function doesn't need an implementation, as we only\n// use its type.\ntemplate <typename T, size_t N>\nchar (&ArraySizeHelper(T (&array)[N]))[N];\n#define arraysize(array) (sizeof(ArraySizeHelper(array)))\n\n// For debugging.  Return a string of integers, written in uppercase\n// hex (%X), corresponding to the codepoints within the text. Each\n// integer is followed by a space. E.g., \"61 62 6A 3005 \".\nstring CodepointString(const UnicodeText &t);\n\n#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_\n"
  },
  {
    "path": "unittest/util/utf8/unilib.cc",
    "content": "/**\n * Copyright 2010 Google Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n// Author: sligocki@google.com (Shawn Ligocki)\n\n#include \"util/utf8/unilib.h\"\n\n#include \"syntaxnet/base.h\"\n#include \"third_party/utf/utf.h\"\n\nnamespace UniLib {\n\n// Codepoints not allowed for interchange are:\n//   C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020),\n//       Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A),\n//       Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D)\n//   C1 controls: U+007F to U+009F\n//   Surrogates: U+D800 to U+DFFF\n//   Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx\nbool IsInterchangeValid(char32 c) {\n  return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||\n           (c >= 0x7F && c <= 0x9F) || (c >= 0xD800 && c <= 0xDFFF) ||\n           (c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE);\n}\n\nint SpanInterchangeValid(const char *begin, int byte_length) {\n  char32 rune;\n  const char *p = begin;\n  const char *end = begin + byte_length;\n  while (p < end) {\n    int bytes_consumed = charntorune(&rune, p, end - p);\n    // We want to accept Runeerror == U+FFFD as a valid char, but it is used\n    // by chartorune to indicate error. 
Luckily, the real codepoint is size 3\n    // while errors return bytes_consumed <= 1.\n    if ((rune == Runeerror && bytes_consumed <= 1) || !IsInterchangeValid(rune)) {\n      break; // Found\n    }\n    p += bytes_consumed;\n  }\n  return p - begin;\n}\n\n} // namespace UniLib\n"
  },
  {
    "path": "unittest/util/utf8/unilib.h",
    "content": "/**\n * Copyright 2010 Google Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n// Routines to do manipulation of Unicode characters or text\n//\n// The StructurallyValid routines accept buffers of arbitrary bytes.\n// For CoerceToStructurallyValid(), the input buffer and output buffers may\n// point to exactly the same memory.\n//\n// In all other cases, the UTF-8 string must be structurally valid and\n// have all codepoints in the range  U+0000 to U+D7FF or U+E000 to U+10FFFF.\n// Debug builds take a fatal error for invalid UTF-8 input.\n// The input and output buffers may not overlap at all.\n//\n// The char32 routines are here only for convenience; they convert to UTF-8\n// internally and use the UTF-8 routines.\n\n#ifndef UTIL_UTF8_UNILIB_H__\n#define UTIL_UTF8_UNILIB_H__\n\n#include <string>\n#include \"syntaxnet/base.h\"\n\n// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,\n// but they are defined in unilib_utf8_utils.h.\n//#include \"util/utf8/public/unilib_utf8_utils.h\"  // IWYU pragma: export\n\nnamespace UniLib {\n\n// Returns the length in bytes of the prefix of src that is all\n//  interchange valid UTF-8\nint SpanInterchangeValid(const char *src, int byte_length);\ninline int SpanInterchangeValid(const std::string &src) {\n  return SpanInterchangeValid(src.data(), src.size());\n}\n\n// Returns true if the source is all interchange valid UTF-8\n// \"Interchange valid\" is a stronger than 
structurally valid --\n// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.\nbool IsInterchangeValid(char32 codepoint);\ninline bool IsInterchangeValid(const char *src, int byte_length) {\n  return (byte_length == SpanInterchangeValid(src, byte_length));\n}\ninline bool IsInterchangeValid(const std::string &src) {\n  return IsInterchangeValid(src.data(), src.size());\n}\n\n} // namespace UniLib\n\n#endif // UTIL_UTF8_PUBLIC_UNILIB_H_\n"
  },
  {
    "path": "unittest/util/utf8/unilib_utf8_utils.h",
    "content": "/**\n * Copyright 2010 Google Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_\n#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_\n\n// These definitions are self-contained and have no dependencies.\n// They are also exported from unilib.h for legacy reasons.\n\n#include \"syntaxnet/base.h\"\n#include \"third_party/utf/utf.h\"\n\nnamespace UniLib {\n\n// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF]\n// (i.e., is not a surrogate codepoint). See also\n// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.\ninline bool IsValidCodepoint(char32 c) {\n  return (static_cast<uint32_t>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);\n}\n\n// Returns true if 'str' is the start of a structurally valid UTF-8\n// sequence and is not a surrogate codepoint. Returns false if str.empty()\n// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function\n// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]).\n#ifdef INCLUDE_TENSORFLOW\ninline bool IsUTF8ValidCodepoint(StringPiece str) {\n  char32 c;\n  int consumed;\n  // It's OK if str.length() > consumed.\n  return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) &&\n         IsValidCodepoint(c);\n}\n#endif\n\n// Returns the length (number of bytes) of the Unicode code point\n// starting at src, based on inspecting just that one byte. 
This\n// requires that src point to a well-formed UTF-8 string; the result\n// is undefined otherwise.\ninline int OneCharLen(const char *src) {\n  return \"\\1\\1\\1\\1\\1\\1\\1\\1\\1\\1\\1\\1\\2\\2\\3\\4\"[(*src & 0xFF) >> 4];\n}\n\n// Returns true if this byte is a trailing UTF-8 byte (10xx xxxx)\ninline bool IsTrailByte(char x) {\n  // return (x & 0xC0) == 0x80;\n  // Since trail bytes are always in [0x80, 0xBF], we can optimize:\n  return static_cast<signed char>(x) < -0x40;\n}\n\n} // namespace UniLib\n\n#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_\n"
  },
  {
    "path": "unittest/validate_grapheme_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n#include \"normstrngs.h\"\n#include \"normstrngs_test.h\"\n\nnamespace tesseract {\n\nTEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {\n  std::string str = \"\\u0c15\\u0c3f\\u0c15\\u0c0e\"; // KA - dep I - KA - ind E.\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  // It made 3 graphemes.\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[0], std::string(\"\\u0c15\\u0c3f\"));\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0c15\"));\n  EXPECT_EQ(glyphs[2], std::string(\"\\u0c0e\"));\n}\n\nTEST(ValidateGraphemeTest, SingleConsonantOK) {\n  std::string str = \"\\u0cb9\"; // HA\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 1);\n  EXPECT_EQ(glyphs[0], str);\n}\n\nTEST(ValidateGraphemeTest, SimpleCV) {\n  std::string str = \"\\u0cb9\\u0cbf\"; // HA I\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n     
                                      GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 1);\n  EXPECT_EQ(glyphs[0], str);\n}\n\nTEST(ValidateGraphemeTest, SubscriptConjunct) {\n  std::string str = \"\\u0cb9\\u0ccd\\u0c95\\u0cbf\"; // HA Virama KA I\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 1);\n  EXPECT_EQ(glyphs[0], str);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0ccd\\u0c95\"));\n}\n\nTEST(ValidateGraphemeTest, HalfFormJoiner) {\n  std::string str = \"\\u0d15\\u0d4d\\u200d\\u0d24\"; // KA Virama ZWJ Ta\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 1);\n  EXPECT_EQ(glyphs[0], str);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);\n  EXPECT_EQ(glyphs[0], std::string(\"\\u0d15\\u0d4d\\u200d\"));\n}\n\nTEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {\n  std::string str = \"\\u0d15\\u200d\\u0d4d\\u0d24\"; // KA 
ZWJ Virama Ta\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 1);\n  EXPECT_EQ(glyphs[0], str);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u200d\\u0d4d\"));\n}\n\nTEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {\n  std::string str = \"\\u0d15\\u200c\\u0d4d\\u0d24\"; // KA ZWNJ Virama Ta\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 1);\n  EXPECT_EQ(glyphs[0], str);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u200c\\u0d4d\"));\n  // Malayalam only, so not allowed in Telugu.\n  str = \"\\u0c15\\u200c\\u0c4d\\u0c24\"; // KA ZWNJ Virama Ta\n  EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                            GraphemeNormMode::kCombined, true, str.c_str(),\n                                            &glyphs))\n      << PrintString32WithUnicodes(str);\n}\n\nTEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {\n  std::string str = 
\"\\u0d15\\u0d4d\\u200c\\u0d24\"; // KA Virama ZWNJ Ta\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 2);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0d24\"));\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0d4d\\u200c\"));\n}\n\nTEST(ValidateGraphemeTest, ThaiGraphemes) {\n  // This is a single grapheme unless in glyph split mode\n  std::string str = \"\\u0e14\\u0e38\\u0e4a\";\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 1);\n  EXPECT_EQ(glyphs[0], str);\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[0], std::string(\"\\u0e14\"));\n}\n\nTEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {\n  std::string str = \"'\\u0d24\\u0d23\\u0d32\\u0d4d'\\u200d\";\n  std::vector<std::string> glyphs;\n  // Returns true, but the joiner is gone.\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))\n      << 
PrintString32WithUnicodes(str);\n  EXPECT_EQ(glyphs.size(), 5);\n  EXPECT_EQ(glyphs[0], std::string(\"'\"));\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0d24\"));\n  EXPECT_EQ(glyphs[2], std::string(\"\\u0d23\"));\n  EXPECT_EQ(glyphs[3], std::string(\"\\u0d32\\u0d4d\\u200c\"));\n  EXPECT_EQ(glyphs[4], std::string(\"'\"));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/validate_indic_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n#include \"normstrngs.h\"\n#include \"normstrngs_test.h\"\n\nnamespace tesseract {\n\n// Though the unicode example for Telugu in section 12.7:\n// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf\n// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to\n// suppress a conjugate that would otherwise occur.  If a consonant is followed\n// by a virama and then by a non-Indic character, OpenType will presume that\n// the user simply meant to suppress the inherent vowel of the consonant\n// and render it as the consonant with an explicit virama, the same as if\n// a ZWNJ had followed. 
Since this is confusing to an OCR engine, the\n// normalizer always puts a terminating ZWNJ on the end if not present,\n// and accepts the string as valid.\nTEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {\n  std::string str = \"\\u0c15\\u0c4d\";              // KA - virama\n  std::string target_str = \"\\u0c15\\u0c4d\\u200c\"; // KA - virama - ZWNJ\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);\n  // Same result if we started with the normalized string.\n  ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);\n}\n\n// Only one dependent vowel is allowed.\nTEST(ValidateIndicTest, OnlyOneDependentVowel) {\n  std::string str = \"\\u0d15\\u0d3e\\u0d42\"; // KA AA UU\n  std::string dest;\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n}\n\n//  [c26][c4d][c01]\n//     A consonant (DA) followed by the virama followed by a bindu\n//     Syllable modifiers [c01][c02][c03] all modify the pronunciation of\n//     the vowel in a syllable, as does the virama [c04].  
You can only\n//     have one of these on a syllable.\n//\n//  References:\n//    http://www.omniglot.com/writing/telugu.htm\nTEST(ValidateIndicTest, OnlyOneVowelModifier) {\n  std::string str = \"\\u0c26\\u0c4d\\u0c01\"; // DA virama candrabindu\n  std::string result;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  str.c_str(), &result));\n  // It made 1 grapheme of 4 chars, by terminating the explicit virama.\n  EXPECT_EQ(std::string(\"\\u0c26\\u0c4d\\u200c\\u0c01\"), result);\n\n  str = \"\\u0995\\u0983\\u0981\"; // KA visarga candrabindu\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &result));\n\n  // Exception: Malayalam allows multiple anusvara.\n  str = \"\\u0d15\\u0d02\\u0d02\"; // KA Anusvara Anusvara\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  str.c_str(), &result));\n  EXPECT_EQ(str, result);\n}\n\n//  [c28][c02][c3f]\n//    A consonant (NA) followed by the Anusvara/sunna and another matra (I).\n// The anusvara [c02] is a pronunciation directive\n//    for a whole syllable and only appears at the end of the syllable\n//  References:\n//    + Unicode v9, 12.1 \"Modifier Mark Rules R10,\"\n//       and the Microsoft page\n//       http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx\nTEST(ValidateIndicTest, VowelModifierMustBeLast) {\n  std::string str = \"\\u0c28\\u0c02\\u0c3f\"; // NA Sunna I\n  std::string dest;\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  // Swap c02/c3f and all is ok.\n  str = \"\\u0c28\\u0c3f\\u0c02\"; // NA I Sunna\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, 
GraphemeNorm::kNormalize,\n                                  str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(dest, str);\n}\n\n//  [c05][c47]\n//    A Vowel (A) followed by a combining vowel/matra (EE).\n//    In Telugu, matras are only put on consonants, not independent\n//    vowels.\n//  References:\n//  + Unicode v9, 12.1:\n//     Principles of the Devanagari Script: Dependent Vowel Signs (Matras).\n//  + http://varamozhi.sourceforge.net/iscii91.pdf\nTEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {\n  std::string str = \"\\u0c05\\u0c47\"; // A EE\n  std::string dest;\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  str = \"\\u0c1e\\u0c3e\"; // NYA AA\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(dest, str);\n}\n\n// Sub-graphemes are allowed if GraphemeNorm is turned off.\nTEST(ValidateIndicTest, SubGraphemes) {\n  std::string str = \"\\u0d3e\"; // AA\n  std::string dest;\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone,\n                                  str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(dest, str);\n}\n\nTEST(ValidateIndicTest, Nukta) {\n  std::string str = \"\\u0c95\\u0cbc\\u0ccd\\u0cb9\"; // KA Nukta Virama HA\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs));\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[2], std::string(\"\\u0ccd\\u0cb9\"));\n  // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.\n  std::string str2 = \"\\u0c95\\u0ccd\\u0cbc\\u0cb9\"; // KA Virama Nukta HA\n  ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str);\n}\n\n// Sinhala has some of its own specific rules. See www.macciato.com/sinhala\nTEST(ValidateIndicTest, SinhalaRakaransaya) {\n  std::string str = \"\\u0d9a\\u0dca\\u200d\\u0dbb\"; // KA Virama ZWJ Rayanna\n  std::string dest;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(dest, str);\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs));\n  EXPECT_EQ(glyphs.size(), 2);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0dca\\u200d\\u0dbb\"));\n  // Can be followed by a dependent vowel.\n  str += \"\\u0dd9\"; // E\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(dest, str);\n}\n\nTEST(ValidateIndicTest, SinhalaYansaya) {\n  std::string str = \"\\u0d9a\\u0dca\\u200d\\u0dba\"; // KA Virama ZWJ Yayanna\n  std::string dest;\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(dest, str);\n  // Can be followed by a dependent vowel.\n  str += \"\\u0ddd\"; // OO\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                  str.c_str(), &dest))\n      << PrintString32WithUnicodes(str);\n  EXPECT_EQ(dest, str);\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs));\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0dca\\u200d\\u0dba\"));\n}\n\nTEST(ValidateIndicTest, SinhalaRepaya) {\n  std::string str = \"\\u0d9a\\u0dbb\\u0dca\\u200d\\u0db8\"; // KA Rayanna Virama ZWJ MA\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kCombined, true, str.c_str(),\n                                           &glyphs));\n  EXPECT_EQ(glyphs.size(), 2);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0dbb\\u0dca\\u200d\\u0db8\"));\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs));\n  EXPECT_EQ(glyphs.size(), 3);\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0dbb\\u0dca\\u200d\"));\n}\n\nTEST(ValidateIndicTest, SinhalaSpecials) {\n  // Sinhala has some exceptions from the usual rules.\n  std::string str = \"\\u0dc0\\u0d9c\\u0dca\\u200d\\u0dbb\\u0dca\\u200d\\u0dbb\\u0dca\\u200d\";\n  std::vector<std::string> glyphs;\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs));\n  EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs);\n  EXPECT_EQ(glyphs[0], std::string(\"\\u0dc0\"));\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0d9c\"));\n  EXPECT_EQ(glyphs[2], std::string(\"\\u0dca\\u200d\\u0dbb\"));\n  EXPECT_EQ(glyphs[3], std::string(\"\\u0dca\\u200d\"));\n  EXPECT_EQ(glyphs[4], std::string(\"\\u0dbb\\u0dca\\u200d\"));\n  str = \"\\u0dc3\\u0dbb\\u0dca\\u200d\\u0dbb\\u0dca\\u200d\\u0dcf\";\n  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                           GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                           &glyphs));\n  EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs);\n  EXPECT_EQ(glyphs[0], std::string(\"\\u0dc3\"));\n  EXPECT_EQ(glyphs[1], std::string(\"\\u0dbb\\u0dca\\u200d\"));\n  EXPECT_EQ(glyphs[2], std::string(\"\\u0dbb\\u0dca\\u200d\"));\n  EXPECT_EQ(glyphs[3], std::string(\"\\u0dcf\"));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/validate_khmer_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n#include \"normstrngs.h\"\n#include \"normstrngs_test.h\"\n\nnamespace tesseract {\n\n// Test some random Khmer words.\nTEST(ValidateKhmerTest, GoodKhmerWords) {\n  std::string str = \"ព័ត៏មានប្លែកៗ\";\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 13, 12, 7, str);\n  str = \"ទំនុកច្រៀង\";\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 10, 9, 5, str);\n  str = \"កាលីហ្វូញ៉ា\";\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 4, str);\n  str = \"ចាប់ពីផ្លូវ\";\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 5, str);\n}\n\n// Test some random Khmer words with dotted circles.\nTEST(ValidateKhmerTest, BadKhmerWords) {\n  std::string result;\n  // Multiple dependent vowels not allowed\n  std::string str = \"\\u1796\\u17b6\\u17b7\";\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &result));\n  // Multiple shifters not allowed\n  str = \"\\u1798\\u17c9\\u17ca\";\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &result));\n  // Multiple signs not allowed\n  str = \"\\u1780\\u17b6\\u17cb\\u17cd\";\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &result));\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/validate_myanmar_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"include_gunit.h\"\n#include \"normstrngs.h\"\n#include \"normstrngs_test.h\"\n\nnamespace tesseract {\n\n// Test some random Myanmar words.\nTEST(ValidateMyanmarTest, GoodMyanmarWords) {\n  std::string str = \"လျှာကသိသည် \"; // No viramas in this one.\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 11, 5, str);\n  str = \"တုန္လႈပ္မႈ \";\n  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 9, 4, str);\n}\n\n// Test some random Myanmar words with dotted circles.\nTEST(ValidateMyanmarTest, BadMyanmarWords) {\n  std::string str = \"က်န္းမာေရး\";\n  std::vector<std::string> glyphs;\n  EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                            GraphemeNormMode::kCombined, true, str.c_str(),\n                                            &glyphs));\n  std::string result;\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &result));\n  // It works if the grapheme normalization is turned off.\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone,\n                                  str.c_str(), &result));\n  EXPECT_EQ(str, result);\n  str = \"ခုႏွစ္\";\n  EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,\n                                            GraphemeNormMode::kGlyphSplit, true, str.c_str(),\n                                            &glyphs));\n  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,\n                                   str.c_str(), &result));\n  // It works if the grapheme normalization is turned off.\n  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone,\n                                  str.c_str(), &result));\n  EXPECT_EQ(str, result);\n}\n\n} // namespace tesseract\n"
  },
  {
    "path": "unittest/validator_test.cc",
    "content": "// (C) Copyright 2017, Google Inc.\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n// http://www.apache.org/licenses/LICENSE-2.0\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"validator.h\"\n\n#include \"gmock/gmock.h\" // for testing::ElementsAreArray\n#include \"include_gunit.h\"\n\nnamespace tesseract {\n\nclass TestableValidator : public Validator {\npublic:\n  static ViramaScript TestableMostFrequentViramaScript(const std::vector<char32> &utf32) {\n    return MostFrequentViramaScript(utf32);\n  }\n};\n\n// The majority of Validator is tested by the script-specific tests of its\n// subclasses, but the MostFrequentViramaScript function is worth a unittest.\nTEST(ValidatorTest, MostFrequentViramaScript) {\n  // The most frequent virama script should come out correct, despite\n  // distractions from other scripts.\n  EXPECT_EQ(ViramaScript::kTelugu, TestableValidator::TestableMostFrequentViramaScript({0xc05}));\n  // It is still Telugu surrounded by Latin.\n  EXPECT_EQ(ViramaScript::kTelugu,\n            TestableValidator::TestableMostFrequentViramaScript({'a', 0xc05, 'b', 'c'}));\n  // But not still Telugu surrounded by Devanagari.\n  EXPECT_EQ(ViramaScript::kDevanagari,\n            TestableValidator::TestableMostFrequentViramaScript({0x905, 0xc05, 0x906, 0x907}));\n  EXPECT_EQ(ViramaScript::kKannada,\n            TestableValidator::TestableMostFrequentViramaScript({0xc85, 0xc05, 0xc86, 0xc87}));\n  EXPECT_EQ(ViramaScript::kBengali,\n            TestableValidator::TestableMostFrequentViramaScript({0x985, 0xc05, 0x986, 0x987}));\n  // Danda and double Danda don't count as Devanagari, as they are common.\n  EXPECT_EQ(ViramaScript::kTelugu,\n            TestableValidator::TestableMostFrequentViramaScript({0x964, 0xc05, 0x965, 0x965}));\n}\n\n// ValidateCleanAndSegment doesn't modify the input by much, but its\n// transformation should be idempotent. (Doesn't change again if re-applied.)\nTEST(ValidatorTest, Idempotency) {\n  std::vector<char32> str1({0xd24, 0xd23, 0xd32, 0xd4d, '\\'', 0x200d, 0x200c, 0x200d, 0x200c});\n  std::vector<char32> str2({0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\\''});\n  std::vector<std::vector<char32>> result1, result2, result3, result4;\n  EXPECT_TRUE(\n      Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str1, &result1));\n  EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result1[0],\n                                                 &result2));\n  EXPECT_EQ(result1.size(), result2.size());\n  EXPECT_THAT(result2[0], testing::ElementsAreArray(result1[0]));\n  EXPECT_TRUE(\n      Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str2, &result3));\n  EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result3[0],\n                                                 &result4));\n  EXPECT_EQ(result3.size(), result4.size());\n  EXPECT_THAT(result4[0], testing::ElementsAreArray(result3[0]));\n}\n\n} // namespace tesseract\n"
  }
]